Add a (ChatGPT-provided) script to perform subsetting

Signed-off-by: Danila Fedorin <danila.fedorin@gmail.com>
2025-02-23 12:01:21 -08:00
parent ce8f8fb872
commit 816a473913
1 changed files with 75 additions and 0 deletions
--- a/chatgpt-subset-one-go.py
+++ b/chatgpt-subset-one-go.py
@@ -0,0 +1,75 @@
+import os
+from bs4 import BeautifulSoup
+from fontTools.subset import Subsetter, Options
+from fontTools.ttLib import TTFont
+
+# Directories (both are relative to the current working directory)
+HTML_DIR = "."    # Walked recursively for .html files whose text determines the kept characters
+FONT_DIR = "."          # Walked recursively for font files, which are overwritten in place
+
+FONT_EXTENSIONS = (".ttf", ".woff", ".woff2", ".otf")  # File-name suffixes treated as font files
+
+def extract_text_from_html(file_path):
+    """Return the text content of one HTML document.
+
+    The file at ``file_path`` is read as UTF-8 and parsed with BeautifulSoup's
+    ``html.parser`` backend; the value returned is whatever ``get_text()``
+    yields for the parsed tree.
+    """
+    with open(file_path, "r", encoding="utf-8") as html_file:
+        markup = html_file.read()
+        parsed = BeautifulSoup(markup, "html.parser")
+        return parsed.get_text()
+
+def get_used_characters(directory):
+    """Gather every distinct character used by the HTML files under a directory.
+
+    ``directory`` is walked recursively; each file whose name ends in
+    ``.html`` is passed to ``extract_text_from_html``. The distinct characters
+    of all extracted text are returned as a single sorted string.
+    """
+    html_paths = (
+        os.path.join(dirpath, name)
+        for dirpath, _dirs, names in os.walk(directory)
+        for name in names
+        if name.endswith(".html")
+    )
+
+    seen = set()
+    for path in html_paths:
+        seen |= set(extract_text_from_html(path))
+    return "".join(sorted(seen))
+
+def find_font_files(directory):
+    """Return the paths of all font files beneath ``directory``.
+
+    The tree is searched recursively. A file counts as a font when its name
+    ends with one of the suffixes listed in ``FONT_EXTENSIONS``. Paths appear
+    in the order ``os.walk`` produces them.
+    """
+    return [
+        os.path.join(dirpath, name)
+        for dirpath, _dirs, names in os.walk(directory)
+        for name in names
+        if name.endswith(FONT_EXTENSIONS)
+    ]
+
+def subset_font_in_place(font_path, characters):
+    """Subset a font file so it covers only the given characters.
+
+    The font at ``font_path`` is loaded, reduced to the glyphs needed for the
+    code points of ``characters``, and written back to the same path,
+    replacing the original file.
+
+    Args:
+        font_path: Path of the font file to rewrite.
+        characters: Iterable of single-character strings to keep.
+    """
+    # The subsetter works on integer code points, not on characters.
+    unicode_set = {ord(c) for c in characters}
+
+    options = Options()
+    options.drop_tables += ["DSIG"]
+    options.drop_tables += ["LTSH", "VDMX", "hdmx", "gasp"]
+    options.layout_features = ["*"]  # keep all OT features
+    options.hinting = False
+    # NOTE(review): no option is set for variable-font data; the subsetter
+    # Options object has no switch for dropping variations. If variation axes
+    # must be removed, that presumably needs fontTools.varLib.instancer --
+    # confirm before relying on it.
+
+    font = TTFont(font_path)
+    try:
+        subsetter = Subsetter(options)
+        subsetter.populate(unicodes=unicode_set)
+        subsetter.subset(font)
+
+        # Overwrite the original font file. The container format (plain
+        # SFNT, WOFF or WOFF2) is carried on the TTFont object from the file
+        # it was loaded from, so the output keeps the input's format.
+        font.save(font_path)
+    finally:
+        # Release the file handle TTFont keeps open on the input file.
+        font.close()
+    print(f"Subsetted font in place: {font_path}")
+
+if __name__ == "__main__":
+    # Step 1: collect every character that appears in the site's HTML.
+    used_chars = get_used_characters(HTML_DIR)
+    print(f"Extracted {len(used_chars)} unique characters from HTML files.")
+
+    # Fonts are overwritten in place, so subsetting against an empty
+    # character set would gut every font that is found. Stop before touching
+    # any file (e.g. when the script is run from a directory with no HTML).
+    if not used_chars:
+        raise SystemExit("No characters found in HTML files; leaving fonts untouched.")
+
+    # Step 2: locate the fonts to rewrite.
+    font_files = find_font_files(FONT_DIR)
+    print(f"Found {len(font_files)} font files to subset.")
+
+    # Step 3: subset each font down to the collected characters.
+    for font_file in font_files:
+        subset_font_in_place(font_file, used_chars)