"""Subset fonts in place to the characters used by a set of HTML files.

The script walks HTML_DIR, collects every distinct character that
BeautifulSoup's ``get_text()`` returns for the ``.html`` files found there,
then walks FONT_DIR and rewrites each font file so that it only covers those
characters.

WARNING: font files are overwritten. Keep a backup or run on a copy.
"""
import os
import sys

from bs4 import BeautifulSoup
from fontTools.subset import Options, Subsetter
from fontTools.ttLib import TTFont

# Directories
HTML_DIR = "."  # Directory with .html files
FONT_DIR = "."  # Directory containing fonts to be modified

FONT_EXTENSIONS = (".ttf", ".woff", ".woff2", ".otf")  # Font file types


def extract_text_from_html(file_path):
    """Return the text content of a single HTML file.

    The file is decoded as UTF-8 and parsed with the stdlib-backed
    ``html.parser``; the result is whatever ``BeautifulSoup.get_text()``
    yields for the whole document.

    Raises:
        OSError: if the file cannot be opened.
        UnicodeDecodeError: if the file is not valid UTF-8.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f.read(), "html.parser")
    return soup.get_text()


def get_used_characters(directory):
    """Collect the unique characters used by all .html files under *directory*.

    The directory is searched recursively. The extension check is
    case-insensitive so that ``INDEX.HTML`` is not silently skipped.

    Returns:
        A string containing each distinct character exactly once, sorted by
        code point. Empty when no HTML file (or no text) was found.
    """
    char_set = set()
    for root, _, files in os.walk(directory):
        for file in files:
            if file.lower().endswith(".html"):
                char_set.update(extract_text_from_html(os.path.join(root, file)))
    return "".join(sorted(char_set))


def find_font_files(directory):
    """Return the paths of all font files under *directory*, recursively.

    A file counts as a font when its name ends with one of FONT_EXTENSIONS,
    compared case-insensitively.
    """
    return [
        os.path.join(root, file)
        for root, _, files in os.walk(directory)
        for file in files
        if file.lower().endswith(FONT_EXTENSIONS)
    ]


def subset_font_in_place(font_path, characters):
    """Subset the font at *font_path* to *characters* and overwrite the file.

    Args:
        font_path: Path of the font file to rewrite.
        characters: Iterable of single-character strings to keep.

    All OpenType layout features are kept, hinting is removed, and a few
    tables (DSIG, LTSH, VDMX, hdmx, gasp) are dropped.

    NOTE: an empty *characters* leaves a font with no usable coverage; the
    caller is responsible for not passing one by accident.
    """
    # Convert characters to their integer code points
    unicode_set = {ord(c) for c in characters}

    options = Options()
    options.drop_tables += ["DSIG"]
    options.drop_tables += ["LTSH", "VDMX", "hdmx", "gasp"]
    options.layout_features = ["*"]  # keep all OT features
    options.hinting = False
    # NOTE: the code points are handed to Subsetter.populate() below; Options
    # has no "unicodes", "variations" or "drop_variations" fields, so the
    # former assignments to them were silently ignored and have been removed.
    # This script does not strip or instantiate variable-font axes.

    # The context manager closes the underlying file handle once the font has
    # been written, instead of leaking one handle per processed font.
    with TTFont(font_path) as font:
        subsetter = Subsetter(options)
        subsetter.populate(unicodes=unicode_set)
        subsetter.subset(font)

        # Overwrite the original font file. The container format (plain
        # SFNT / WOFF / WOFF2) is carried by font.flavor, which TTFont takes
        # from the input file, so no explicit flavor handling is needed here.
        font.save(font_path)
    print(f"Subsetted font in place: {font_path}")


def main():
    """Subset every font under FONT_DIR to the characters used under HTML_DIR.

    Returns:
        Process exit status: 0 on success, 1 when no characters were found
        and the fonts were therefore left untouched.
    """
    used_chars = get_used_characters(HTML_DIR)
    print(f"Extracted {len(used_chars)} unique characters from HTML files.")

    # Guard against wiping every font: with an empty character set each font
    # would be overwritten by a subset that covers nothing.
    if not used_chars:
        print(
            "No characters found in HTML files; leaving fonts untouched.",
            file=sys.stderr,
        )
        return 1

    font_files = find_font_files(FONT_DIR)
    print(f"Found {len(font_files)} font files to subset.")

    for font_file in font_files:
        subset_font_in_place(font_file, used_chars)
    return 0


if __name__ == "__main__":
    sys.exit(main())