require "nokogiri" require "pathname" files = ARGV[0..-1] class LineInfo attr_accessor :links def initialize @links = [] end end class AgdaContext def initialize @file_infos = {} end # Traverse the preformatted Agda block in the given Agda HTML file # and find which textual ranges have IDs and links to other ranges. # Store this information in a hash, line => LineInfo. def process_agda_html_file(file) return @file_infos[file] if @file_infos.include? file @file_infos[file] = line_infos = {} unless File.exists?(file) return line_infos end document = Nokogiri::HTML.parse(File.open(file)) pre_code = document.css("pre.Agda")[0] # The traversal postorder; we always visit children before their # parents, and we visit leaves in sequence. offset = 0 line = 1 pre_code.traverse do |at| # Text nodes are always leaves; visiting a new leaf means we've advanced # in the text by the length of that text. However, if there are newlines # -- since this is a preformatted block -- we also advanced by a line. # At this time, do not support links that span multiple lines, but # Agda doesn't produce those either. if at.text? if at.content.include? "\n" # This textual leaf is at least part whitespace. The logic # assumes that links can't span multiple pages, and that links # aren't nested, so ensure that the parent of the textual node # is the preformatted block itself. raise "unsupported Agda HTML output" if at.parent.name != "pre" # Increase the line and track the final offset. Written as a loop # in case we eventually want to add some handling for the pieces # sandwiched between newlines. at.content.split("\n", -1).each_with_index do |bit, idx| line += 1 unless idx == 0 offset = bit.length end else # It's not a newline node, so it could be anywhere. All we need to # do is adjust the offset within the full pre block's text. offset += at.content.length end elsif at.name == "a" # We found a link. Agda emits both links and "things to link to" as # 'a' nodes, so check for either, and record them. Even if # the link is nested, the .content.length accessor will only # retrieve the textual content, and thus -- assuming the link # isn't split across lines -- will find the proper from-to range. line_info = line_infos.fetch(line) { line_infos[line] = LineInfo.new } href = at.attribute("href") id = at.attribute("id") if href or id new_node = { :from => offset-at.content.length, :to => offset } new_node[:href] = href if href new_node[:id] = id if id line_info.links << new_node end end end return line_infos end end class FileGroup def initialize(agda_context) @agda_context = agda_context # Agda HTML href -> list of (file, Hugo HTML node that links to it) @nodes_referencing_href = {} # Agda HTML href -> (its new ID in Hugo-land, file in which it's defined) # This supports cross-post linking within a seires. @global_seen_hrefs = {} # file name -> Agda HTML href -> its new ID in Hugo-land # This supports linking within a particular post. @local_seen_hrefs = Hash.new { {} } # Global counter to generate fresh IDs. There's no reason for it to # be global within a series (since IDs are namespaced by the file they're in), # but it's just more convenient this way. @id_counter = 0 end def note_defined_href(file, href) file_hrefs = @local_seen_hrefs.fetch(file) do @local_seen_hrefs[file] = {} end uniq_id = file_hrefs.fetch(href) do new_id = "agda-unique-ident-#{@id_counter}" @id_counter += 1 file_hrefs[href] = new_id end unless @global_seen_hrefs.include? href @global_seen_hrefs[href] = { :file => file, :id => uniq_id } end return uniq_id end def note_used_href(file, node, href) ref_list = @nodes_referencing_href.fetch(href) { @nodes_referencing_href[href] = [] } ref_list << { :file => file, :node => node } return href end # Given a Hugo HTML file which references potentially several Agda modules # in code blocks, insert links into the code blocks. # # There are several things we need to take care of: # 1. Finding the HTML files associated with each referenced Agda module. # For this, we make use of the data-base-path etc. attributes that # the vanilla theme inserts. # 2. "zipping together" the Agda and Hugo HTML representations. Each of # them encode the code, but they use different HTML elements and structures. # So, given a Hugo HTML code block, traverse its textual contents # and find any that are covered by links in the related Agda HTML file. # 3. Fixing up links: the Agda HTML links assume each module has its own HTML # file. This isn't true for us: multiple modules are stitched into # one Hugo HTML file. Also, we don't include the entire Agda HTML # file in the Hugo HTML, so some links may be broken. So, find IDs # that are visible in the Hugo file, rename them to be globally unique, # and re-write cross-file links that reference these IDs to point # inside the Hugo file. def process_source_file(file, document) # Process each highlight group that's been marked as an Agda file. document.css('div[data-agda-block]').each do |t| first_line, last_line = nil, nil if first_line_attr = t.attribute("data-first-line") first_line = first_line_attr.to_s.to_i end if last_line_attr = t.attribute("data-last-line") last_line = last_line_attr.to_s.to_i end if first_line and last_line line_range = first_line..last_line else line_range = 1.. end # Sometimes, code is deeply nested in the source file, but we don't # want to show the leading space. In that case, the generator sets # data-source-offset with how much leading space was stripped off. initial_offset = 0 if source_offset_attr = t.attribute("data-source-offset") initial_offset = source_offset_attr.to_s.to_i end full_path = t.attribute("data-file-path").to_s full_path_dirs = Pathname(full_path).each_filename.to_a # The name of an Agda module is determined from its directory # structure: A/B/C.agda creates A.B.C.html. Depending on where # the code is included, there might be some additional folders # that precede A that we want to ignore. base_path = t.attribute("data-base-path").to_s base_dir_depth = 0 if base_path.empty? # No submodules were used. Assume code/ is the root, since # that's the code layout of the blog right now. base_dir_depth = 1 base_path = full_path_dirs[0] else # The code is in a submodule. Assume that the base path / submodule # root is the Agda module root, ignore all folders before that. base_path_dirs = Pathname(base_path).each_filename.to_a base_dir_depth = base_path_dirs.length end dirs_in_base = full_path_dirs[base_dir_depth..-1] html_file = dirs_in_base.join(".").gsub(/\.agda$/, ".html") html_path = File.join(["code", base_path, "html", html_file]) agda_info = @agda_context.process_agda_html_file(html_path) # Hugo conveniently generates a bunch of spans, each encoding a line # of code output. We can iterate over these and match them up with # the line numbers we got from reading the Agda HTML output. lines = t.css("pre.chroma code[data-lang] .line") lines.zip(line_range).each do |line, line_no| line_info = agda_info[line_no] next unless line_info offset = initial_offset line.traverse do |lt| if lt.text? content = lt.content new_offset = offset + content.length # The span/a/etc. structure of the Agda and Hugo HTML files # need not line up; it's possible for there to be a single link # in the Agda file that's broken up across multiple HTML nodes # in the Hugo output. For now, just don't link those, since inserting # such overlapping links is relatively complicated. Instead, # require links to fit fully within a current text node (and thus, # not overlap the boundaries of any HTML). matching_links = line_info.links.filter do |link| link[:from] >= offset and link[:to] <= new_offset end # A given text node can be broken into any number of sub-nodes, # where some sub-nodes are still text, and others have been turned # into links. Store the new pieces in replace_with. E.g., # # Original: # abc # # New: # abc # # replace_with: # ["a", , "c"] # # match_offset represents how much of the original text we've # already converted. The below iteration assumes that matching # links are in order, and don't overlap. replace_with = [] replace_offset = 0 matching_links.each do |match| # The link's range is an offset from the beginning of the line, # but the text piece we're splitting up might be partway into # the line. Convert the link coordinates to piece-relative ones. relative_from = match[:from] - offset relative_to = match[:to] - offset # If the previous link ended some time before the new link # began (or if the current link is the first one, and is not # at the beginning), ensure that the plain text "in between" # is kept. replace_with << content[replace_offset...relative_from] tag = (match.include? :href) ? 'a' : 'span' new_node = Nokogiri::XML::Node.new(tag, document) if match.include? :href # For nodes with links, note what they're referring to, so # we can adjust their hrefs when we assign global IDs. href = match[:href].to_s new_node['href'] = note_used_href file, new_node, href end if match.include? :id # For nodes with IDs visible in the current Hugo file, we'll # want to redirect links that previously go to other Agda # module HTML files. So, note the ID that we want to redirect, # and pick a new unique ID to replace it with. id = match[:id].to_s new_node['id'] = note_defined_href file, "#{html_file}##{id}" end new_node.content = content[relative_from...relative_to] replace_with << new_node replace_offset = relative_to end replace_with << content[replace_offset..-1] # Finally, replace the node under consideration with the new # pieces. replace_with.each do |replacement| lt.add_previous_sibling replacement end lt.remove offset = new_offset end end end end end def cross_link_files # Now, we have a complete list of all the IDs visible in scope. # Redirect relevant links to these IDs. This achieves within-post # links. @nodes_referencing_href.each do |href, references| references.each do |reference| file = reference[:file] node = reference[:node] local_targets = @local_seen_hrefs[file] if local_targets.include? href # A code block in this fine provides this href, create a local link. node['href'] = "##{local_targets[href]}" elsif @global_seen_hrefs.include? href # A code block in this series, but not in this file, defines # this href. Create a cross-file link. target = @global_seen_hrefs[href] other_file = target[:file] id = target[:id] relpath = Pathname.new(other_file).dirname.relative_path_from(Pathname.new(file).dirname) node['href'] = "#{relpath}##{id}" else # No definitions in any blog page. For now, just delete the anchor. node.replace node.content end end end end end agda_context = AgdaContext.new file_documents = {} series_groups = files.group_by do |file| file_documents[file] = document = Nokogiri::HTML.parse(File.open(file)) document.css("meta[name=blog-series]")&.attribute("content")&.to_s end # For the 'nil' group, process individually. if files_with_no_series = series_groups.delete(nil) files_with_no_series.each do |file| file_group = FileGroup.new agda_context file_group.process_source_file file, file_documents[file] file_group.cross_link_files end end # For groups, process them together to allow cross-linking series_groups.each do |_, files_in_series| file_group = FileGroup.new agda_context files_in_series.each do |file| file_group.process_source_file file, file_documents[file] end file_group.cross_link_files end # Having modified all the HTML files, save them. file_documents.each do |file, document| File.write(file, document.to_html(encoding: 'UTF-8')) end