# blog-versions/chatgpt-fix-root-URLs.py
# Rewrites absolute danilafe.com URLs in generated HTML files into
# file-relative links, and partially evaluates <noscript> blocks.
import os
from bs4 import BeautifulSoup
from urllib.parse import urlparse
# Domains considered part of your site.
SITE_ROOT_URLS = ["https://danilafe.com/", "http://danilafe.com/"]
# The project root is the current working directory.
PROJECT_ROOT = os.getcwd()
HTML_EXTENSIONS = {".html", ".htm"}
def convert_to_relative(url, base_filepath):
    """
    Convert an absolute URL (including domain-relative URLs) to a relative path
    appropriate for the HTML file at base_filepath.

    Parameters:
        url: The URL from an href/src attribute. May be absolute on a
            SITE_ROOT_URLS domain, domain-relative ("/x/y"), or already
            relative/external (returned unchanged).
        base_filepath: Filesystem path of the HTML file containing the URL;
            the result is computed relative to this file's directory.

    Returns:
        A forward-slash-separated relative path from base_filepath's directory
        to the target under PROJECT_ROOT, with any query string and fragment
        preserved; or the original url when it is external or already relative.
    """
    # Protocol-relative URLs ("//other.host/...") target another host, not a
    # path under PROJECT_ROOT — leave them untouched. (Without this guard the
    # startswith("/") branch below would mangle them into local paths.)
    if url.startswith("//"):
        return url
    # If the URL is already relative (including "#frag", "mailto:", and
    # external domains), return it unchanged.
    if not (url.startswith("/") or any(url.startswith(root) for root in SITE_ROOT_URLS)):
        return url
    # If it's an absolute URL on danilafe.com, strip the domain.
    for root_url in SITE_ROOT_URLS:
        if url.startswith(root_url):
            url = url[len(root_url):]
            break
    # For domain-relative URLs (starting with "/"), remove the leading slash
    # so the join below is rooted at PROJECT_ROOT, not the filesystem root.
    if url.startswith("/"):
        url = url.lstrip("/")
    # Query string and fragment belong to the URL, not the file path: split
    # them off before touching the filesystem and re-attach them afterwards.
    parsed = urlparse(url)
    # Build the full filesystem path for the target resource.
    target_path = os.path.normpath(os.path.join(PROJECT_ROOT, parsed.path))
    base_dir = os.path.dirname(base_filepath)
    # Compute the relative path from the HTML file's directory to the target.
    relative_path = os.path.relpath(target_path, start=base_dir)
    relative_path = relative_path.replace(os.path.sep, "/")
    if parsed.query:
        relative_path += "?" + parsed.query
    if parsed.fragment:
        relative_path += "#" + parsed.fragment
    return relative_path
def process_html_file(filepath):
    """Process a single HTML file to rewrite links, unwrap <noscript> blocks, and remove preload links.

    Reads the file, rewrites href/src attributes via convert_to_relative,
    deletes rel="preload" <link> tags, unwraps <noscript> elements, and
    writes the modified markup back to the same path.
    """
    with open(filepath, "r", encoding="utf-8") as f:
        # Parse with the lxml backend; the whole document is held in memory.
        soup = BeautifulSoup(f, "lxml")
    # Update tags with href/src attributes.
    for tag in soup.find_all(["a", "link", "script", "img"]):
        # <a>/<link> carry their target in href; <script>/<img> use src.
        attr = "href" if tag.name in ["a", "link"] else "src"
        if tag.has_attr(attr):
            tag[attr] = convert_to_relative(tag[attr], filepath)
    # Process <noscript> blocks: update links inside them.
    # NOTE(review): find_all above is recursive, so this loop looks redundant
    # for parsers that expose <noscript> children as elements — but parsers
    # differ on whether <noscript> content is parsed as markup or text.
    # Kept as-is; confirm lxml's behavior before removing.
    for noscript in soup.find_all("noscript"):
        for link in noscript.find_all("link"):
            if link.has_attr("href"):
                link["href"] = convert_to_relative(link["href"], filepath)
    # Remove all <link> elements with rel="preload"
    for preload in soup.find_all("link", rel="preload"):
        preload.decompose()
    # "Partially evaluate" noscript: unwrap the <noscript> blocks, keeping
    # their children in place (the page is served without JavaScript anyway).
    for noscript in soup.find_all("noscript"):
        noscript.unwrap()
    # Overwrite the file in place with the rewritten markup.
    with open(filepath, "w", encoding="utf-8") as f:
        f.write(str(soup))
def process_directory(directory):
    """Walk *directory* recursively and rewrite every HTML file found.

    A file counts as HTML when its extension (case-insensitive) is listed
    in HTML_EXTENSIONS.
    """
    for current_dir, _subdirs, filenames in os.walk(directory):
        for filename in filenames:
            _stem, extension = os.path.splitext(filename)
            if extension.lower() not in HTML_EXTENSIONS:
                continue
            process_html_file(os.path.join(current_dir, filename))
if __name__ == "__main__":
process_directory(".")