Adjust Python script to also just accept HTML files as args

Signed-off-by: Danila Fedorin <danila.fedorin@gmail.com>
This commit is contained in:
Danila Fedorin 2025-02-23 12:28:19 -08:00
parent 07408d01a9
commit d847d20666

View File

@@ -1,12 +1,9 @@
import os import os
import sys
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from fontTools.subset import Subsetter, Options from fontTools.subset import Subsetter, Options
from fontTools.ttLib import TTFont from fontTools.ttLib import TTFont
# Directories
HTML_DIR = "." # Directory with .html files
FONT_DIR = "." # Directory containing fonts to be modified
FONT_EXTENSIONS = (".ttf", ".woff", ".woff2", ".otf") # Font file types FONT_EXTENSIONS = (".ttf", ".woff", ".woff2", ".otf") # Font file types
def extract_text_from_html(file_path): def extract_text_from_html(file_path):
@@ -15,15 +12,13 @@ def extract_text_from_html(file_path):
soup = BeautifulSoup(f.read(), "html.parser") soup = BeautifulSoup(f.read(), "html.parser")
return soup.get_text() return soup.get_text()
def get_used_characters(directory): def get_used_characters(files):
"""Collect unique characters from all .html files in the given directory.""" """Collect unique characters from all .html files in the given directory."""
char_set = set() char_set = set()
for root, _, files in os.walk(directory): for file in files:
for file in files: full_path = os.path.join(root, file)
if file.endswith(".html"): text = extract_text_from_html(full_path)
full_path = os.path.join(root, file) char_set.update(text)
text = extract_text_from_html(full_path)
char_set.update(text)
return "".join(sorted(char_set)) return "".join(sorted(char_set))
def find_font_files(directory): def find_font_files(directory):
@@ -65,10 +60,10 @@ def subset_font_in_place(font_path, characters):
print(f"Subsetted font in place: {font_path}") print(f"Subsetted font in place: {font_path}")
if __name__ == "__main__": if __name__ == "__main__":
used_chars = get_used_characters(HTML_DIR) used_chars = get_used_characters(sys.argv[1:])
print(f"Extracted {len(used_chars)} unique characters from HTML files.") print(f"Extracted {len(used_chars)} unique characters from HTML files.")
font_files = find_font_files(FONT_DIR) font_files = find_font_files(".")
print(f"Found {len(font_files)} font files to subset.") print(f"Found {len(font_files)} font files to subset.")
for font_file in font_files: for font_file in font_files: