"""Subset fonts in place to the characters actually used by a set of HTML files.

The script walks HTML_DIR, collects every character that appears in the text
content of the HTML files found there, and then rewrites each font file found
under FONT_DIR so that it only covers those characters.

WARNING: font files are overwritten in place; keep a backup of the originals.
"""
import os

from bs4 import BeautifulSoup
from fontTools.subset import Subsetter, Options
from fontTools.ttLib import TTFont

# Directories
HTML_DIR = "."  # Directory with .html files
FONT_DIR = "."  # Directory containing fonts to be modified
FONT_EXTENSIONS = (".ttf", ".woff", ".woff2", ".otf")  # Font file types
HTML_EXTENSIONS = (".html", ".htm")  # HTML file types scanned for characters


def extract_text_from_html(file_path: str) -> str:
    """Return the text content of a single HTML file.

    The file is read as UTF-8 and parsed with BeautifulSoup's "html.parser";
    the result is whatever ``soup.get_text()`` yields for the document.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f.read(), "html.parser")
    return soup.get_text()


def get_used_characters(directory: str) -> str:
    """Collect the unique characters used by all HTML files under *directory*.

    The directory is walked recursively. File extensions are matched
    case-insensitively so that e.g. ``INDEX.HTML`` is not silently skipped.

    Returns:
        A string containing each distinct character once, in sorted order.
    """
    char_set: set[str] = set()
    for root, _, files in os.walk(directory):
        for file in files:
            if file.lower().endswith(HTML_EXTENSIONS):
                char_set.update(extract_text_from_html(os.path.join(root, file)))
    return "".join(sorted(char_set))


def find_font_files(directory: str) -> list[str]:
    """Return the paths of all font files under *directory*, recursively.

    A file counts as a font when its extension (compared case-insensitively)
    is one of FONT_EXTENSIONS.
    """
    return [
        os.path.join(root, file)
        for root, _, files in os.walk(directory)
        for file in files
        if file.lower().endswith(FONT_EXTENSIONS)
    ]


def subset_font_in_place(font_path: str, characters: str) -> None:
    """Subset the font at *font_path* to *characters*, overwriting the file.

    Args:
        font_path: Path of the font file to rewrite.
        characters: Characters the subsetted font must keep covering.
    """
    # The subsetter works on integer code points, not on str characters.
    unicode_set = {ord(c) for c in characters}

    options = Options()
    options.drop_tables += ["DSIG", "LTSH", "VDMX", "hdmx", "gasp"]
    options.layout_features = ["*"]  # keep all OT features
    options.hinting = False
    # NOTE(review): `unicodes`, `variations` and `drop_variations` used to be
    # assigned on `options` here. They do not appear among the documented
    # fontTools.subset options (the code points are passed via `populate`
    # below), so they were removed as no-ops -- confirm against the installed
    # fontTools version.

    # Preserve original format if it was WOFF/WOFF2
    lowered_path = font_path.lower()
    if lowered_path.endswith(".woff2"):
        options.flavor = "woff2"
    elif lowered_path.endswith(".woff"):
        options.flavor = "woff"

    font = TTFont(font_path)
    try:
        subsetter = Subsetter(options)
        subsetter.populate(unicodes=unicode_set)
        subsetter.subset(font)

        # Overwrite the original font file
        font.save(font_path)
    finally:
        # Release the file handle opened by TTFont even if subsetting fails.
        font.close()

    print(f"Subsetted font in place: {font_path}")


def main() -> None:
    """Extract the characters used by the HTML files and subset every font."""
    used_chars = get_used_characters(HTML_DIR)
    print(f"Extracted {len(used_chars)} unique characters from HTML files.")

    # Subsetting is destructive: with nothing but whitespace (or nothing at
    # all) to keep, every font would be overwritten with an empty subset.
    if not used_chars.strip():
        print("No visible characters found in HTML files; fonts left untouched.")
        return

    font_files = find_font_files(FONT_DIR)
    print(f"Found {len(font_files)} font files to subset.")

    for font_file in font_files:
        subset_font_in_place(font_file, used_chars)


if __name__ == "__main__":
    main()