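"""Post-process the HTML files generated for danilafe.com.

Rewrites absolute and domain-relative links into paths relative to the HTML
file that references them, removes <link rel="preload"> hints, and unwraps
<noscript> blocks. Run it from the root of the generated site; it requires
the beautifulsoup4 and lxml packages.
"""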
import os

from bs4 import BeautifulSoup

# Domains considered part of your site.
SITE_ROOT_URLS = ["https://danilafe.com/", "http://danilafe.com/"]
# The project root is the current working directory.
PROJECT_ROOT = os.getcwd()
HTML_EXTENSIONS = {".html", ".htm"}

def convert_to_relative(url, base_filepath):
    """
    Convert an absolute URL (including domain-relative URLs) to a relative path
    appropriate for the HTML file at base_filepath.
    """
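    # Illustrative example (hypothetical layout): "/css/style.css" referenced
    # from PROJECT_ROOT/blog/post/index.html becomes "../../css/style.css".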
    # If the URL is already relative, return it unchanged.
    if not (url.startswith("/") or any(url.startswith(root) for root in SITE_ROOT_URLS)):
        return url

    # If it's an absolute URL on danilafe.com, strip the domain.
    for root_url in SITE_ROOT_URLS:
        if url.startswith(root_url):
            url = url[len(root_url):]
            break

    # For domain-relative URLs (starting with "/"), remove the leading slash.
    if url.startswith("/"):
        url = url.lstrip("/")

    # Build the full filesystem path for the target resource.
    target_path = os.path.normpath(os.path.join(PROJECT_ROOT, url))
    base_dir = os.path.dirname(base_filepath)
    # Compute the relative path from the HTML file's directory to the target.
    relative_path = os.path.relpath(target_path, start=base_dir)
    return relative_path.replace(os.path.sep, "/")

def process_html_file(filepath):
    """Process a single HTML file to rewrite links, unwrap <noscript> blocks, and remove preload links."""
    with open(filepath, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f, "lxml")

    # Update tags with href/src attributes.
    for tag in soup.find_all(["a", "link", "script", "img"]):
        attr = "href" if tag.name in ["a", "link"] else "src"
        if tag.has_attr(attr):
            tag[attr] = convert_to_relative(tag[attr], filepath)

    # Process <noscript> blocks: update links inside them.
    for noscript in soup.find_all("noscript"):
        for link in noscript.find_all("link"):
            if link.has_attr("href"):
                link["href"] = convert_to_relative(link["href"], filepath)

    # Remove all <link> elements with rel="preload".
    for preload in soup.find_all("link", rel="preload"):
        preload.decompose()

    # "Partially evaluate" noscript: unwrap the <noscript> blocks.
    for noscript in soup.find_all("noscript"):
        noscript.unwrap()

    with open(filepath, "w", encoding="utf-8") as f:
        f.write(str(soup))

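# Illustration (hypothetical markup, not taken from the site): given
# blog/post/index.html containing
#   <link rel="preload" href="/css/style.css" as="style">
#   <noscript><link rel="stylesheet" href="/css/style.css"></noscript>
# process_html_file drops the preload hint, rewrites the stylesheet href, and
# unwraps the <noscript>, leaving
#   <link rel="stylesheet" href="../../css/style.css"/>
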
def process_directory(directory):
    """Recursively process all HTML files in the given directory."""
    for root, _, files in os.walk(directory):
        for file in files:
            if os.path.splitext(file)[1].lower() in HTML_EXTENSIONS:
                process_html_file(os.path.join(root, file))


if __name__ == "__main__":
    process_directory(".")