70 lines
2.3 KiB
Python
70 lines
2.3 KiB
Python
import os
|
|
import sys
|
|
from bs4 import BeautifulSoup
|
|
from fontTools.subset import Subsetter, Options
|
|
from fontTools.ttLib import TTFont
|
|
|
|
# File-name suffixes that identify font files. Kept as a tuple so it can be
# passed directly to str.endswith(); the match is therefore case-sensitive.
FONT_EXTENSIONS = (".ttf", ".woff", ".woff2", ".otf")
|
|
|
|
def extract_text_from_html(file_path):
    """Return the text content of the HTML file at ``file_path``.

    The file is read as UTF-8, parsed with BeautifulSoup's ``html.parser``
    backend, and flattened to a single string with ``get_text()``.
    """
    with open(file_path, "r", encoding="utf-8") as handle:
        markup = handle.read()

    document = BeautifulSoup(markup, "html.parser")
    return document.get_text()
|
|
|
|
def get_used_characters(files):
    """Return the distinct characters used across the given HTML files.

    ``files`` is an iterable of HTML file paths; each one is passed to
    ``extract_text_from_html``. The result is a single string holding every
    unique character exactly once, in sorted order. An empty ``files``
    yields an empty string.
    """
    texts = (extract_text_from_html(path) for path in files)
    # Each text is a string, i.e. an iterable of characters, so union()
    # folds them all into one set of unique characters.
    unique_chars = set().union(*texts)
    return "".join(sorted(unique_chars))
|
|
|
|
def find_font_files(directory):
    """Return paths of all font files under ``directory``, searched recursively.

    A file is treated as a font when its name ends with one of the suffixes
    in ``FONT_EXTENSIONS``. Each returned path is the walked directory joined
    with the file name, in the order ``os.walk`` produces them.
    """
    matches = []
    for dirpath, _, names in os.walk(directory):
        matches.extend(
            os.path.join(dirpath, name)
            for name in names
            if name.endswith(FONT_EXTENSIONS)
        )
    return matches
|
|
|
|
def subset_font_in_place(font_path, characters):
    """Subset the font at ``font_path`` to ``characters`` and overwrite it.

    Args:
        font_path: Path of the font file to load; the subsetted result is
            saved back to this same path.
        characters: Iterable of single-character strings to keep.
    """
    # The subsetter works on integer code points, not on characters.
    codepoints = {ord(ch) for ch in characters}

    options = Options()
    options.drop_tables += ["DSIG", "LTSH", "VDMX", "hdmx", "gasp"]
    options.layout_features = ["*"]  # keep all OT features
    options.hinting = False
    # NOTE(review): `unicodes`, `variations` and `drop_variations` do not
    # appear to be documented fontTools subset options -- confirm whether
    # these assignments have any effect. The code points are also handed to
    # the subsetter explicitly via populate() below.
    options.unicodes = codepoints
    options.variations = False
    options.drop_variations = True

    # Preserve original format if it was WOFF/WOFF2
    flavor_by_suffix = {".woff2": "woff2", ".woff": "woff"}
    for suffix, flavor in flavor_by_suffix.items():
        if font_path.endswith(suffix):
            options.flavor = flavor
            break

    font = TTFont(font_path)

    subsetter = Subsetter(options)
    subsetter.populate(unicodes=codepoints)
    subsetter.subset(font)

    # Overwrite the original font file
    font.save(font_path)
    print(f"Subsetted font in place: {font_path}")
|
|
|
|
if __name__ == "__main__":
    # Command line: <script> FONT_DIR HTML_FILE [HTML_FILE ...]
    #
    # At least one HTML file is required. With none, the collected character
    # set is empty, and every font found under FONT_DIR would be subsetted
    # against that empty set and overwritten in place.
    if len(sys.argv) < 3:
        sys.exit(f"Usage: {sys.argv[0]} FONT_DIR HTML_FILE [HTML_FILE ...]")

    font_dir = sys.argv[1]
    html_files = sys.argv[2:]

    used_chars = get_used_characters(html_files)
    print(f"Extracted {len(used_chars)} unique characters from {len(html_files)} HTML files.")

    font_files = find_font_files(font_dir)
    print(f"Found {len(font_files)} font files to subset.")

    # Each font is rewritten on disk; there is no backup copy.
    for font_file in font_files:
        subset_font_in_place(font_file, used_chars)
|