Add a (ChatGPT-provided) script to perform subsetting

Signed-off-by: Danila Fedorin <danila.fedorin@gmail.com>
This commit is contained in:
Danila Fedorin 2025-02-23 12:01:21 -08:00
parent ce8f8fb872
commit 816a473913

75
chatgpt-subset-one-go.py Normal file
View File

@ -0,0 +1,75 @@
import os
from bs4 import BeautifulSoup
from fontTools.subset import Subsetter, Options
from fontTools.ttLib import TTFont
# --- Configuration -------------------------------------------------------
# Directory with the .html files whose text decides which glyphs are kept.
HTML_DIR = "."
# Directory containing the fonts to be modified.
FONT_DIR = "."
# File-name suffixes treated as font files.
FONT_EXTENSIONS = (".ttf", ".woff", ".woff2", ".otf")
def extract_text_from_html(file_path):
    """Return the text content of the HTML document stored at *file_path*.

    The file is read as UTF-8 and parsed with BeautifulSoup's
    ``html.parser`` backend; the result of ``get_text()`` is returned.
    """
    with open(file_path, "r", encoding="utf-8") as handle:
        markup = handle.read()
    document = BeautifulSoup(markup, "html.parser")
    return document.get_text()
def get_used_characters(directory):
    """Return every distinct character found in the .html files under *directory*.

    The directory tree is walked recursively. The text of each file whose
    name ends in ``.html`` is extracted, and the union of all characters is
    returned as a single string sorted by code point.
    """
    seen = set()
    for dirpath, _dirnames, filenames in os.walk(directory):
        for name in filenames:
            if not name.endswith(".html"):
                continue
            html_path = os.path.join(dirpath, name)
            seen |= set(extract_text_from_html(html_path))
    ordered = sorted(seen)
    return "".join(ordered)
def find_font_files(directory):
    """Return the paths of all font files below *directory*.

    The tree is walked recursively; a file counts as a font when its name
    ends with one of the suffixes in ``FONT_EXTENSIONS``.
    """
    return [
        os.path.join(dirpath, name)
        for dirpath, _dirnames, filenames in os.walk(directory)
        for name in filenames
        if name.endswith(FONT_EXTENSIONS)
    ]
def subset_font_in_place(font_path, characters):
    """Subset the font at *font_path* to the given characters, overwriting it.

    Args:
        font_path: Path of the font file. The file is replaced by the
            subsetted font, so keep a backup if the original is needed.
        characters: Iterable of single-character strings to keep.
    """
    # The subsetter works on integer code points, not on characters.
    unicode_set = {ord(c) for c in characters}

    options = Options()
    options.drop_tables += ["DSIG", "LTSH", "VDMX", "hdmx", "gasp"]
    options.layout_features = ["*"]  # keep all OT features
    options.hinting = False
    # NOTE(review): earlier revisions also assigned options.unicodes,
    # options.variations and options.drop_variations. Those do not appear to
    # be fields of fontTools.subset.Options (code points are passed through
    # Subsetter.populate below), so the assignments looked like no-ops and
    # were removed. If variable-font data must be dropped, that needs a
    # separate step - confirm against the fontTools documentation.

    # Preserve original format if it was WOFF/WOFF2
    if font_path.endswith(".woff2"):
        options.flavor = "woff2"
    elif font_path.endswith(".woff"):
        options.flavor = "woff"

    # The context manager closes the underlying font file once we are done,
    # instead of leaking one open handle per processed font.
    with TTFont(font_path) as font:
        subsetter = Subsetter(options)
        subsetter.populate(unicodes=unicode_set)
        subsetter.subset(font)
        # Overwrite the original font file
        font.save(font_path)

    print(f"Subsetted font in place: {font_path}")
if __name__ == "__main__":
    # Step 1: work out which characters the site actually uses.
    used_chars = get_used_characters(HTML_DIR)
    print(f"Extracted {len(used_chars)} unique characters from HTML files.")

    # Fonts are overwritten in place, so subsetting against an empty
    # character set (e.g. the script was started in the wrong directory)
    # would strip the glyphs from every font found. Stop before touching them.
    if not used_chars:
        raise SystemExit(
            "No characters found in HTML files; refusing to subset fonts in place."
        )

    # Step 2: locate the fonts and subset each one to those characters.
    font_files = find_font_files(FONT_DIR)
    print(f"Found {len(font_files)} font files to subset.")
    for font_file in font_files:
        subset_font_in_place(font_file, used_chars)