Add a (ChatGPT-provided) script to perform subsetting
Signed-off-by: Danila Fedorin <danila.fedorin@gmail.com>
This commit is contained in:
parent
ce8f8fb872
commit
816a473913
75
chatgpt-subset-one-go.py
Normal file
75
chatgpt-subset-one-go.py
Normal file
@ -0,0 +1,75 @@
|
||||
import os
|
||||
from bs4 import BeautifulSoup
|
||||
from fontTools.subset import Subsetter, Options
|
||||
from fontTools.ttLib import TTFont
|
||||
|
||||
# Directories
# Root that get_used_characters() walks (recursively) looking for .html files.
HTML_DIR = "."
# Root that find_font_files() walks (recursively); every matching font found
# there is overwritten in place by subset_font_in_place().
FONT_DIR = "."

# File-name suffixes treated as fonts. Kept as a tuple so it can be passed
# straight to str.endswith(), which accepts a tuple of alternatives.
FONT_EXTENSIONS = (".ttf", ".woff", ".woff2", ".otf")
|
||||
|
||||
def extract_text_from_html(file_path):
    """Return the text content of the HTML file at *file_path*.

    The file is read as UTF-8, parsed with BeautifulSoup's ``html.parser``
    backend, and the result of ``get_text()`` on the parsed document is
    returned.

    Args:
        file_path: Path of the HTML file to read.

    Returns:
        The document text as a single string.
    """
    with open(file_path, "r", encoding="utf-8") as handle:
        markup = handle.read()
    document = BeautifulSoup(markup, "html.parser")
    return document.get_text()
|
||||
|
||||
def get_used_characters(directory):
    """Gather every distinct character used by the HTML files under *directory*.

    The tree is walked recursively; each file whose name ends in ``.html``
    is passed to ``extract_text_from_html`` and its characters are merged
    into one set.

    Args:
        directory: Root directory to search.

    Returns:
        A string holding each distinct character exactly once, in sorted
        order. Empty when no ``.html`` file is found.
    """
    seen = set()
    for dirpath, _dirnames, filenames in os.walk(directory):
        html_names = (name for name in filenames if name.endswith(".html"))
        for name in html_names:
            page_text = extract_text_from_html(os.path.join(dirpath, name))
            # Updating a set with a string adds its individual characters.
            seen.update(page_text)
    ordered = sorted(seen)
    return "".join(ordered)
|
||||
|
||||
def find_font_files(directory):
    """Collect the paths of all font files below *directory*.

    The tree is walked recursively and a file counts as a font when its
    name ends with one of the suffixes in ``FONT_EXTENSIONS``.

    Args:
        directory: Root directory to search.

    Returns:
        A list of paths (each directory joined with the file name), in the
        order ``os.walk`` yields them. Empty when nothing matches.
    """
    matches = []
    for dirpath, _dirnames, filenames in os.walk(directory):
        matches.extend(
            os.path.join(dirpath, name)
            for name in filenames
            if name.endswith(FONT_EXTENSIONS)
        )
    return matches
|
||||
|
||||
def subset_font_in_place(font_path, characters):
    """Subset the font at *font_path* to *characters* and overwrite the file.

    The font is loaded with fontTools, run through a ``Subsetter`` populated
    with the code points of *characters*, and saved back to the same path,
    replacing the original file.

    Args:
        font_path: Path (``str``) of the font file to rewrite.
        characters: Iterable of single-character strings to keep; a plain
            ``str`` works.

    Returns:
        None. A confirmation line is printed after a successful save.

    Raises:
        Whatever fontTools raises while loading, subsetting or saving. The
        font's file handle is released in every case.
    """
    # Convert characters to their integer code points
    unicode_set = {ord(c) for c in characters}

    options = Options()
    options.drop_tables += ["DSIG"]
    options.drop_tables += ["LTSH", "VDMX", "hdmx", "gasp"]
    # NOTE(review): the next three attributes do not look like documented
    # fontTools.subset.Options fields, so they may be inert -- the code points
    # actually reach the subsetter through populate() below. Kept as-is;
    # confirm against the fontTools version in use before removing.
    options.unicodes = unicode_set
    options.variations = False
    options.drop_variations = True
    options.layout_features = ["*"]  # keep all OT features
    options.hinting = False

    # Preserve original format if it was WOFF/WOFF2. The suffix is compared
    # case-insensitively so names such as "Font.WOFF2" are recognised too.
    # NOTE(review): the font is written with font.save() rather than
    # fontTools.subset.save_font(), so confirm options.flavor is consulted at
    # all; presumably TTFont keeps the flavor it was loaded with -- verify.
    lowered_path = font_path.lower()
    if lowered_path.endswith(".woff2"):
        options.flavor = "woff2"
    elif lowered_path.endswith(".woff"):
        options.flavor = "woff"

    font = TTFont(font_path)
    try:
        subsetter = Subsetter(options)
        subsetter.populate(unicodes=unicode_set)
        subsetter.subset(font)

        # Overwrite the original font file
        font.save(font_path)
    finally:
        # Always release the handle on the source file, even when subsetting
        # or saving fails; otherwise one handle leaks per processed font.
        font.close()

    print(f"Subsetted font in place: {font_path}")
|
||||
|
||||
def main():
    """Run the whole pipeline: scan the HTML, find the fonts, subset each one.

    Reads ``HTML_DIR`` and ``FONT_DIR``, prints a progress line after each
    discovery step, and rewrites every font found in place.
    """
    # Step 1: gather every character that appears in the HTML files.
    used_chars = get_used_characters(HTML_DIR)
    print(f"Extracted {len(used_chars)} unique characters from HTML files.")

    # Step 2: locate the fonts that should be reduced to that character set.
    font_files = find_font_files(FONT_DIR)
    print(f"Found {len(font_files)} font files to subset.")

    # Step 3: subset each font, overwriting the original file.
    for path in font_files:
        subset_font_in_place(path, used_chars)


if __name__ == "__main__":
    main()
|
Loading…
Reference in New Issue
Block a user