# blog-versions/chatgpt-fix-root-URLs.py
# Rewrites absolute danilafe.com URLs in generated HTML files into
# file-relative links, and partially evaluates <noscript> blocks.
import os
from bs4 import BeautifulSoup
from urllib.parse import urlparse
# Domains considered part of your site.
SITE_ROOT_URLS = ["https://danilafe.com/", "http://danilafe.com/"]
# The project root is the current working directory.
PROJECT_ROOT = os.getcwd()
HTML_EXTENSIONS = {".html", ".htm"}
def convert_to_relative(url, base_filepath):
    """
    Convert an absolute URL (including domain-relative URLs) to a relative path
    appropriate for the HTML file at base_filepath.

    Parameters:
        url: The URL from an href/src attribute. May be absolute on a
            SITE_ROOT_URLS domain, domain-relative ("/x/y"), or already
            relative/external (returned unchanged).
        base_filepath: Filesystem path of the HTML file containing the URL;
            the result is computed relative to this file's directory.

    Returns:
        A forward-slash-separated relative path from base_filepath's directory
        to the target under PROJECT_ROOT, with any query string and fragment
        preserved; or the original url when it is external or already relative.
    """
    # Protocol-relative URLs ("//other.host/...") target another host, not a
    # path under PROJECT_ROOT — leave them untouched. (Without this guard the
    # startswith("/") branch below would mangle them into local paths.)
    if url.startswith("//"):
        return url
    # If the URL is already relative (including "#frag", "mailto:", and
    # external domains), return it unchanged.
    if not (url.startswith("/") or any(url.startswith(root) for root in SITE_ROOT_URLS)):
        return url
    # If it's an absolute URL on danilafe.com, strip the domain.
    for root_url in SITE_ROOT_URLS:
        if url.startswith(root_url):
            url = url[len(root_url):]
            break
    # For domain-relative URLs (starting with "/"), remove the leading slash
    # so the join below is rooted at PROJECT_ROOT, not the filesystem root.
    if url.startswith("/"):
        url = url.lstrip("/")
    # Query string and fragment belong to the URL, not the file path: split
    # them off before touching the filesystem and re-attach them afterwards.
    parsed = urlparse(url)
    # Build the full filesystem path for the target resource.
    target_path = os.path.normpath(os.path.join(PROJECT_ROOT, parsed.path))
    base_dir = os.path.dirname(base_filepath)
    # Compute the relative path from the HTML file's directory to the target.
    relative_path = os.path.relpath(target_path, start=base_dir)
    relative_path = relative_path.replace(os.path.sep, "/")
    if parsed.query:
        relative_path += "?" + parsed.query
    if parsed.fragment:
        relative_path += "#" + parsed.fragment
    return relative_path
def process_html_file(filepath):
    """Process a single HTML file to rewrite links, unwrap <noscript> blocks, and remove preload links.

    Reads the file, rewrites href/src attributes via convert_to_relative,
    deletes rel="preload" <link> tags, unwraps <noscript> elements, and
    writes the modified markup back to the same path.
    """
    with open(filepath, "r", encoding="utf-8") as f:
        # Parse with the lxml backend; the whole document is held in memory.
        soup = BeautifulSoup(f, "lxml")
    # Update tags with href/src attributes.
    for tag in soup.find_all(["a", "link", "script", "img"]):
        # <a>/<link> carry their target in href; <script>/<img> use src.
        attr = "href" if tag.name in ["a", "link"] else "src"
        if tag.has_attr(attr):
            tag[attr] = convert_to_relative(tag[attr], filepath)
    # Process <noscript> blocks: update links inside them.
    # NOTE(review): find_all above is recursive, so this loop looks redundant
    # for parsers that expose <noscript> children as elements — but parsers
    # differ on whether <noscript> content is parsed as markup or text.
    # Kept as-is; confirm lxml's behavior before removing.
    for noscript in soup.find_all("noscript"):
        for link in noscript.find_all("link"):
            if link.has_attr("href"):
                link["href"] = convert_to_relative(link["href"], filepath)
    # Remove all <link> elements with rel="preload"
    for preload in soup.find_all("link", rel="preload"):
        preload.decompose()
    # "Partially evaluate" noscript: unwrap the <noscript> blocks, keeping
    # their children in place (the page is served without JavaScript anyway).
    for noscript in soup.find_all("noscript"):
        noscript.unwrap()
    # Overwrite the file in place with the rewritten markup.
    with open(filepath, "w", encoding="utf-8") as f:
        f.write(str(soup))
def process_directory(directory):
    """Walk *directory* recursively and rewrite every HTML file found.

    A file counts as HTML when its extension (case-insensitive) is listed
    in HTML_EXTENSIONS.
    """
    for current_dir, _subdirs, filenames in os.walk(directory):
        for filename in filenames:
            _stem, extension = os.path.splitext(filename)
            if extension.lower() not in HTML_EXTENSIONS:
                continue
            process_html_file(os.path.join(current_dir, filename))
if __name__ == "__main__":
process_directory(".")