From 032411fe9c40a8aa4cf4f427d92ccb573143bb09 Mon Sep 17 00:00:00 2001 From: Danila Fedorin Date: Wed, 22 May 2024 16:28:00 -0700 Subject: [PATCH] Add a script to generate links using Agda's HTML output Signed-off-by: Danila Fedorin --- agda.rb | 262 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 262 insertions(+) create mode 100644 agda.rb diff --git a/agda.rb b/agda.rb new file mode 100644 index 0000000..5a5f3f3 --- /dev/null +++ b/agda.rb @@ -0,0 +1,262 @@ +require "nokogiri" +require "pathname" + +files = ARGV[0..-1] + +class LineInfo + attr_accessor :links + + def initialize + @links = [] + end +end + +class Handler + def initialize + @file_infos = {} + end + + # Traverse the preformatted Agda block in the given Agda HTML file + # and find which textual ranges have IDs and links to other ranges. + # Store this information in a hash, line => LineInfo. + def process_agda_html_file(file) + return @file_infos[file] if @file_infos.include? file + + @file_infos[file] = line_infos = {} + unless File.exists?(file) + return line_infos + end + + document = Nokogiri::HTML.parse(File.open(file)) + pre_code = document.css("pre.Agda")[0] + + # The traversal postorder; we always visit children before their + # parents, and we visit leaves in sequence. + offset = 0 + line = 1 + pre_code.traverse do |at| + # Text nodes are always leaves; visiting a new leaf means we've advanced + # in the text by the length of that text. However, if there are newlines + # -- since this is a preformatted block -- we also advanced by a line. + # At this time, do not support links that span multiple lines, but + # Agda doesn't produce those either. + if at.text? + if at.content.include? "\n" + # This textual leaf is at least part whitespace. The logic + # assumes that links can't span multiple pages, and that links + # aren't nested, so ensure that the parent of the textual node + # is the preformatted block itself. + raise "unsupported Agda HTML output" if at.parent.name != "pre" + + # Increase the line and track the final offset. Written as a loop + # in case we eventually want to add some handling for the pieces + # sandwiched between newlines. + at.content.split("\n", -1).each_with_index do |bit, idx| + line += 1 unless idx == 0 + offset = bit.length + end + else + # It's not a newline node, so it could be anywhere. All we need to + # do is adjust the offset within the full pre block's text. + offset += at.content.length + end + elsif at.name == "a" + # We found a link. Agda emits both links and "things to link to" as + # 'a' nodes, so check for either, and record them. Even if + # the link is nested, the .content.length accessor will only + # retrieve the textual content, and thus -- assuming the link + # isn't split across lines -- will find the proper from-to range. + + line_info = line_infos.fetch(line) { line_infos[line] = LineInfo.new } + href = at.attribute("href") + id = at.attribute("id") + if href or id + new_link = { :from => offset-at.content.length, :to => offset } + new_link[:href] = href if href + new_link[:id] = id if id + + line_info.links << new_link + end + end + end + return line_infos + end + + # Given a Hugo HTML file which references potentially several Agda modules + # in code blocks, insert links into the code blocks. + # + # There are several things we need to take care of: + # 1. Finding the HTML files associated with each referenced Agda module. + # For this, we make use of the data-base-path etc. attributes that + # the vanilla theme inserts. + # 2. "zipping together" the Agda and Hugo HTML representations. Each of + # them encode the code, but they use different HTML elements and structures. + # So, given a Hugo HTML code block, traverse its textual contents + # and find any that are covered by links in the related Agda HTML file. + # 3. Fixing up links: the Agda HTML links assume each module has its own HTML + # file. This isn't true for us: multiple modules are stitched into + # one Hugo HTML file. Also, we don't include the entire Agda HTML + # file in the Hugo HTML, so some links may be broken. So, find IDs + # that are visible in the Hugo file, rename them to be globally unique, + # and re-write cross-file links that reference these IDs to point + # inside the Hugo file. + def process_source_file(file) + nodes_referencing_href = {} # Agda HTML ID -> all (Hugo HTML) nodes that link to it + seen_hrefs = {} # Agda HTML ID -> its new ID in Hugo-land + id_counter = 0 + + # Process each highlight group that's been marked as an Agda file. + document = Nokogiri::HTML.parse(File.open(file)) + document.css('div[data-agda-block]').each do |t| + first_line, last_line = nil, nil + + if first_line_attr = t.attribute("data-first-line") + first_line = first_line_attr.to_s.to_i + end + if last_line_attr = t.attribute("data-last-line") + last_line = last_line_attr.to_s.to_i + end + + # The name of an Agda module is determined from its directory + # structure: A/B/C.agda creates A.B.C.html. Assume that the + # base path is the Agda module root, and use the remaining directories + # to build the HTML file name. + base_path = t.attribute("data-base-path").to_s + base_path_dirs = Pathname(base_path).each_filename.to_a + full_path = t.attribute("data-file-path").to_s + full_path_dirs = Pathname(full_path).each_filename.to_a + + dirs_in_base = full_path_dirs[base_path_dirs.length..-1] + html_file = dirs_in_base.join(".").gsub(/\.agda$/, ".html") + html_path = File.join(["code", base_path, "html", html_file]) + + agda_info = process_agda_html_file(html_path) + + # Hugo conveniently generates a bunch of spans, each encoding a line + # of code output. We can iterate over these and match them up with + # the line numbers we got from reading the Agda HTML output. + lines = t.css("pre.chroma code[data-lang] .line") + lines.zip((first_line..last_line)).each do |line, line_no| + line_info = agda_info[line_no] + next unless line_info + + offset = 0 + line.traverse do |lt| + if lt.text? + content = lt.content + new_offset = offset + content.length + + # The span/a/etc. structure of the Agda and Hugo HTML files + # need not line up; it's possible for there to be a single link + # in the Agda file that's broken up across multiple HTML nodes + # in the Hugo output. For now, just don't link those, since inserting + # such overlapping links is relatively complicated. Instead, + # require links to fit fully within a current text node (and thus, + # not overlap the boundaries of any HTML). + matching_links = line_info.links.filter do |link| + link[:from] >= offset and link[:to] <= new_offset + end + + # A given text node can be broken into any number of sub-nodes, + # where some sub-nodes are still text, and others have been turned + # into links. Store the new pieces in replace_with. E.g., + # + # Original: + # abc + # + # New: + # abc + # + # replace_with: + # ["a", , "c"] + # + # match_offset represents how much of the original text we've + # already converted. The below iteration assumes that matching + # links are in order, and don't overlap. + replace_with = [] + replace_offset = 0 + matching_links.each do |match| + # The link's range is an offset from the beginning of the line, + # but the text piece we're splitting up might be partway into + # the line. Convert the link coordinates to piece-relative ones. + relative_from = match[:from] - offset + relative_to = match[:to] - offset + + # If the previous link ended some time before the new link + # began (or if the current link is the first one, and is not + # at the beginning), ensure that the plain text "in between" + # is kept. + replace_with << content[replace_offset...relative_from] + + tag = (match.include? :href) ? 'a' : 'span' + new_link = Nokogiri::XML::Node.new(tag, document) + if match.include? :href + # For nodes with links, note what they're referring to, so + # we can adjust their hrefs when we assign global IDs. + href = match[:href].to_s + new_link['href'] = href + ref_list = nodes_referencing_href.fetch(href) { nodes_referencing_href[href] = [] } + ref_list << new_link + end + if match.include? :id + # For nodes with IDs visible in the current Hugo file, we'll + # want to redirect links that previously go to other Agda + # module HTML files. So, note the ID that we want to redirect, + # and pick a new unique ID to replace it with. + id = match[:id].to_s + uniq_id = "agda-unique-ident-#{id_counter}" + id_counter += 1 + new_link['id'] = uniq_id + seen_hrefs["#{html_file}##{id}"] = uniq_id + end + new_link.content = content[relative_from...relative_to] + + replace_with << new_link + replace_offset = relative_to + end + replace_with << content[replace_offset..-1] + + # Finally, replace the node under consideration with the new + # pieces. + replace_with.each do |replacement| + lt.add_previous_sibling replacement + end + lt.remove + + offset = new_offset + end + end + end + end + + # Now, we have a complete list of all the IDs visible in scope. + # Redirect relevant links to these IDs. This achieves within-post + # links. + seen_hrefs.each do |agda_id, hugo_id| + nodes_to_rewrite = nodes_referencing_href[agda_id] + next unless nodes_to_rewrite + + nodes_to_rewrite.each do |node| + node['href'] = "##{hugo_id}" + end + + nodes_referencing_href.delete agda_id + end + + # The remaining nodes reference symbols not in the current Hugo document, + # so just replace them with their contents for now (we'd need some + # external hosting to make this work otherwise). + nodes_referencing_href.each do |href, nodes| + nodes.each do |node| + node.replace node.content + end + end + + File.write(file, document.to_html(encoding: 'UTF-8')) + end +end + +handler = Handler.new +files.each do |file| + handler.process_source_file file +end