Adjust Python script to accept HTML files as command-line arguments

Signed-off-by: Danila Fedorin <danila.fedorin@gmail.com>
This commit is contained in:
Danila Fedorin 2025-02-23 12:28:19 -08:00
parent 07408d01a9
commit d847d20666

View File

@ -1,12 +1,9 @@
import os
import sys
from bs4 import BeautifulSoup
from fontTools.subset import Subsetter, Options
from fontTools.ttLib import TTFont
# Directories
HTML_DIR = "." # Directory with .html files
FONT_DIR = "." # Directory containing fonts to be modified
FONT_EXTENSIONS = (".ttf", ".woff", ".woff2", ".otf") # Font file types
def extract_text_from_html(file_path):
@ -15,12 +12,10 @@ def extract_text_from_html(file_path):
soup = BeautifulSoup(f.read(), "html.parser")
return soup.get_text()
def get_used_characters(directory):
def get_used_characters(files):
"""Collect unique characters from the given .html files."""
char_set = set()
for root, _, files in os.walk(directory):
for file in files:
if file.endswith(".html"):
full_path = os.path.join(root, file)
text = extract_text_from_html(full_path)
char_set.update(text)
@ -65,10 +60,10 @@ def subset_font_in_place(font_path, characters):
print(f"Subsetted font in place: {font_path}")
if __name__ == "__main__":
used_chars = get_used_characters(HTML_DIR)
used_chars = get_used_characters(sys.argv[1:])
print(f"Extracted {len(used_chars)} unique characters from HTML files.")
font_files = find_font_files(FONT_DIR)
font_files = find_font_files(".")
print(f"Found {len(font_files)} font files to subset.")
for font_file in font_files: