# Domains considered part of your site.
SITE_ROOT_URLS = ["https://danilafe.com/", "http://danilafe.com/"]

# The project root is the current working directory (captured at import time).
PROJECT_ROOT = os.getcwd()

HTML_EXTENSIONS = {".html", ".htm"}


def convert_to_relative(url, base_filepath):
    """Rewrite a site-local URL as a path relative to an HTML file.

    A URL is treated as site-local when it starts with one of
    ``SITE_ROOT_URLS`` or is domain-relative (a single leading ``/``).
    Everything else is returned unchanged: already-relative links, URLs on
    other hosts, and protocol-relative URLs (``//host/path``).

    Args:
        url: The link target as it appears in the HTML attribute.
        base_filepath: Filesystem path of the HTML file containing the link.
            A relative path is resolved against the current working
            directory by ``os.path.relpath``.

    Returns:
        The rewritten URL using ``/`` separators, with any ``?query`` and
        ``#fragment`` suffix carried over verbatim. Only the path component
        is normalized, so a trailing slash on the path is dropped.
    """
    # "//host/path" names another host; it must not be mistaken for a
    # domain-relative path just because it begins with a slash.
    if url.startswith("//"):
        return url

    site_root = next(
        (root for root in SITE_ROOT_URLS if url.startswith(root)), None
    )
    if site_root is not None:
        # Absolute URL on our own site: strip the scheme and domain.
        url = url[len(site_root):]
    elif not url.startswith("/"):
        # Already relative, or pointing at an external site.
        return url

    # Set the query string and fragment aside so that filesystem
    # normalization cannot rewrite them (e.g. collapse "/../" in a query).
    path, hash_sep, fragment = url.partition("#")
    path, query_sep, query = path.partition("?")
    suffix = query_sep + query + hash_sep + fragment

    # Build the full filesystem path for the target resource.
    target_path = os.path.normpath(
        os.path.join(PROJECT_ROOT, path.lstrip("/"))
    )
    base_dir = os.path.dirname(base_filepath)

    # Compute the relative path from the HTML file's directory to the target.
    relative_path = os.path.relpath(target_path, start=base_dir)
    return relative_path.replace(os.path.sep, "/") + suffix