70 lines
		
	
	
		
			2.3 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			70 lines
		
	
	
		
			2.3 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| import os
 | |
| import sys
 | |
| from bs4 import BeautifulSoup
 | |
| from fontTools.subset import Subsetter, Options
 | |
| from fontTools.ttLib import TTFont
 | |
| 
 | |
# File-name suffixes that identify a font file; find_font_files() passes this
# tuple directly to str.endswith().
FONT_EXTENSIONS = (".ttf", ".woff", ".woff2", ".otf")  # Font file types
 | |
| 
 | |
def extract_text_from_html(file_path):
    """Return the text content of a single HTML file.

    The file is read as UTF-8, parsed with BeautifulSoup's "html.parser"
    backend, and the result of ``get_text()`` on the parsed document is
    returned.
    """
    with open(file_path, "r", encoding="utf-8") as html_file:
        markup = html_file.read()
    # The whole document is already in memory, so parsing can happen after
    # the file handle has been released.
    document = BeautifulSoup(markup, "html.parser")
    return document.get_text()
 | |
| 
 | |
def get_used_characters(files):
    """Return every distinct character that occurs in the given HTML files.

    ``files`` is an iterable of HTML file paths; each one is read through
    ``extract_text_from_html``. The result is one string containing each
    character exactly once, in sorted order.
    """
    distinct = {
        char
        for path in files
        for char in extract_text_from_html(path)
    }
    ordered = list(distinct)
    ordered.sort()
    return "".join(ordered)
 | |
| 
 | |
def find_font_files(directory, extensions=None):
    """Find all font files in the given directory, recursively.

    Args:
        directory: Root directory to walk.
        extensions: Optional file-name suffix (or iterable of suffixes, e.g.
            ``(".ttf", ".otf")``) to match. Defaults to the module-level
            ``FONT_EXTENSIONS``.

    Returns:
        A sorted list of paths, each built by joining the walked directory
        with the file name, for every file whose name ends with one of the
        suffixes. The comparison ignores case, so ``Font.TTF`` matches
        ``.ttf``.
    """
    if extensions is None:
        extensions = FONT_EXTENSIONS
    elif isinstance(extensions, str):
        # A lone string would otherwise be iterated character by character.
        extensions = (extensions,)

    # str.endswith() requires a tuple; lower-case both sides of the comparison.
    suffixes = tuple(ext.lower() for ext in extensions)

    font_files = [
        os.path.join(root, name)
        for root, _, names in os.walk(directory)
        for name in names
        if name.lower().endswith(suffixes)
    ]
    # os.walk() order depends on the file system; sort for reproducible runs.
    font_files.sort()
    return font_files
 | |
| 
 | |
def subset_font_in_place(font_path, characters):
    """Subset the given font file to include only the specified characters.

    The font at ``font_path`` is loaded, reduced with fontTools' ``Subsetter``
    and then written back over the original file.

    Args:
        font_path: Path (string) of the font file; the file is overwritten.
        characters: Iterable of single-character strings to keep.
    """
    # The subsetter is driven by integer code points, not characters.
    unicode_set = {ord(c) for c in characters}

    options = Options()
    options.drop_tables += ["DSIG", "LTSH", "VDMX", "hdmx", "gasp"]
    # NOTE(review): confirm that `unicodes`, `variations` and
    # `drop_variations` are options fontTools actually reads; the code point
    # set is in any case passed to populate() below.
    options.unicodes = unicode_set
    options.variations = False
    options.drop_variations = True
    options.layout_features = ["*"]  # keep all OT features
    options.hinting = False

    # Preserve original format if it was WOFF/WOFF2. Compare in lower case so
    # "FONT.WOFF2" is treated the same as "font.woff2".
    lowered_path = font_path.lower()
    if lowered_path.endswith(".woff2"):
        options.flavor = "woff2"
    elif lowered_path.endswith(".woff"):
        options.flavor = "woff"

    font = TTFont(font_path)
    try:
        subsetter = Subsetter(options)
        subsetter.populate(unicodes=unicode_set)
        subsetter.subset(font)

        # Overwrite the original font file
        font.save(font_path)
    finally:
        # Release the file handle opened by TTFont even if subsetting or
        # saving fails; this function is called once per font in a loop.
        font.close()

    print(f"Subsetted font in place: {font_path}")
 | |
| 
 | |
def main(argv=None):
    """Subset every font under a directory to the characters used by HTML files.

    Args:
        argv: Command-line arguments without the program name, laid out as
            ``[font_directory, html_file, ...]``. Defaults to ``sys.argv[1:]``.

    Returns:
        Exit status: 0 on success, 2 on a usage error (missing arguments),
        1 when the HTML files yielded no characters (fonts are left
        untouched in that case).
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) < 2:
        prog = os.path.basename(sys.argv[0]) if sys.argv and sys.argv[0] else "script"
        print(f"usage: {prog} FONT_DIR HTML_FILE [HTML_FILE ...]", file=sys.stderr)
        return 2

    font_dir, html_files = args[0], args[1:]

    used_chars = get_used_characters(html_files)
    print(f"Extracted {len(used_chars)} unique characters from {len(html_files)} HTML files.")
    if not used_chars:
        # Continuing would overwrite every font with a subset built for no
        # text at all, so stop before touching any file.
        print("No characters found in the HTML files; fonts left unchanged.", file=sys.stderr)
        return 1

    font_files = find_font_files(font_dir)
    print(f"Found {len(font_files)} font files to subset.")

    for font_file in font_files:
        subset_font_in_place(font_file, used_chars)
    return 0


if __name__ == "__main__":
    sys.exit(main())
 |