blog-static/agda.rb

require "nokogiri"
require "pathname"

files = ARGV[0..-1]

class LineInfo
  attr_accessor :links

  def initialize
    @links = []
  end
end

class Handler
  def initialize
    @file_infos = {}
  end

  # Traverse the preformatted Agda block in the given Agda HTML file
  # and find which textual ranges have IDs and links to other ranges.
  # Store this information in a hash, line => LineInfo.
  def process_agda_html_file(file)
    return @file_infos[file] if @file_infos.include? file

    @file_infos[file] = line_infos = {}
    unless File.exists?(file)
      return line_infos
    end

    document = Nokogiri::HTML.parse(File.open(file))
    pre_code = document.css("pre.Agda")[0]

    # The traversal postorder; we always visit children before their
    # parents, and we visit leaves in sequence.
    offset = 0
    line = 1
    pre_code.traverse do |at|
      # Text nodes are always leaves; visiting a new leaf means we've advanced
      # in the text by the length of that text. However, if there are newlines
      # -- since this is a preformatted block -- we also advanced by a line.
      # At this time, do not support links that span multiple lines, but
      # Agda doesn't produce those either.
      if at.text?
        if at.content.include? "\n"
          # This textual leaf is at least part whitespace. The logic
          # assumes that links can't span multiple pages, and that links
          # aren't nested, so ensure that the parent of the textual node
          # is the preformatted block itself.
          raise "unsupported Agda HTML output" if at.parent.name != "pre"

          # Increase the line and track the final offset. Written as a loop
          # in case we eventually want to add some handling for the pieces
          # sandwiched between newlines.
          at.content.split("\n", -1).each_with_index do |bit, idx|
            line += 1 unless idx == 0
            offset = bit.length
          end
        else
          # It's not a newline node, so it could be anywhere. All we need to
          # do is adjust the offset within the full pre block's text.
          offset += at.content.length
        end
      elsif at.name == "a"
        # We found a link. Agda emits both links and "things to link to" as
        # 'a' nodes, so check for either, and record them. Even if
        # the link is nested, the .content.length accessor will only
        # retrieve the textual content, and thus -- assuming the link
        # isn't split across lines -- will find the proper from-to range.

        line_info = line_infos.fetch(line) { line_infos[line] = LineInfo.new }
        href = at.attribute("href")
        id = at.attribute("id")
        if href or id
          new_link = { :from => offset-at.content.length, :to => offset }
          new_link[:href] = href if href
          new_link[:id] = id if id

          line_info.links << new_link
        end
      end
    end
    return line_infos
  end

  # Given a Hugo HTML file which references potentially several Agda modules
  # in code blocks, insert links into the code blocks.
  #
  # There are several things we need to take care of:
  # 1. Finding the HTML files associated with each referenced Agda module.
  #    For this, we make use of the data-base-path etc. attributes that
  #    the vanilla theme inserts.
  # 2. "zipping together" the Agda and Hugo HTML representations. Each of
  #    them encode the code, but they use different HTML elements and structures.
  #    So, given a Hugo HTML code block, traverse its textual contents
  #    and find any that are covered by links in the related Agda HTML file.
  # 3. Fixing up links: the Agda HTML links assume each module has its own HTML
  #    file. This isn't true for us: multiple modules are stitched into
  #    one Hugo HTML file. Also, we don't include the entire Agda HTML
  #    file in the Hugo HTML, so some links may be broken. So, find IDs
  #    that are visible in the Hugo file, rename them to be globally unique,
  #    and re-write cross-file links that reference these IDs to point
  #    inside the Hugo file.
  def process_source_file(file)
    nodes_referencing_href = {} # Agda HTML ID -> all (Hugo HTML) nodes that link to it
    seen_hrefs = {} # Agda HTML ID -> its new ID in Hugo-land
    id_counter = 0

    # Process each highlight group that's been marked as an Agda file.
    document = Nokogiri::HTML.parse(File.open(file))
    document.css('div[data-agda-block]').each do |t|
      first_line, last_line = nil, nil

      if first_line_attr = t.attribute("data-first-line")
        first_line = first_line_attr.to_s.to_i
      end
      if last_line_attr = t.attribute("data-last-line")
        last_line = last_line_attr.to_s.to_i
      end

      if first_line and last_line
        line_range = first_line..last_line
      else
        line_range = 1..
      end

      full_path = t.attribute("data-file-path").to_s
      full_path_dirs = Pathname(full_path).each_filename.to_a

      # The name of an Agda module is determined from its directory
      # structure: A/B/C.agda creates A.B.C.html. Depending on where
      # the code is included, there might be some additional folders
      # that precede A that we want to ignore.
      base_path = t.attribute("data-base-path").to_s
      base_dir_depth = 0
      if base_path.empty?
        # No submodules were used. Assume code/<X> is the root, since
        # that's the code layout of the blog right now.
        base_dir_depth = 1
        base_path = full_path_dirs[0]
      else
        # The code is in a submodule. Assume that the base path / submodule
        # root is the Agda module root, ignore all folders before that.
        base_path_dirs = Pathname(base_path).each_filename.to_a
        base_dir_depth = base_path_dirs.length
      end

      dirs_in_base = full_path_dirs[base_dir_depth..-1]
      html_file = dirs_in_base.join(".").gsub(/\.agda$/, ".html")
      html_path = File.join(["code", base_path, "html", html_file])

      agda_info = process_agda_html_file(html_path)

      # Hugo conveniently generates a bunch of spans, each encoding a line
      # of code output. We can iterate over these and match them up with
      # the line numbers we got from reading the Agda HTML output.
      lines = t.css("pre.chroma code[data-lang] .line")
      lines.zip(line_range).each do |line, line_no|
        line_info = agda_info[line_no]
        next unless line_info

        offset = 0
        line.traverse do |lt|
          if lt.text?
            content = lt.content
            new_offset = offset + content.length

            # The span/a/etc. structure of the Agda and Hugo HTML files
            # need not line up; it's possible for there to be a single link
            # in the Agda file that's broken up across multiple HTML nodes
            # in the Hugo output. For now, just don't link those, since inserting
            # such overlapping links is relatively complicated. Instead,
            # require links to fit fully within a current text node (and thus,
            # not overlap the boundaries of any HTML).
            matching_links = line_info.links.filter do |link|
              link[:from] >= offset and link[:to] <= new_offset
            end

            # A given text node can be broken into any number of sub-nodes,
            # where some sub-nodes are still text, and others have been turned
            # into links. Store the new pieces in replace_with. E.g.,
            #
            # Original:
            #   abc
            #
            # New:
            #   a<a href="..">b</a>c
            #
            # replace_with:
            #   ["a", <Nokogiri::XML::Node...>, "c"]
            #
            # match_offset represents how much of the original text we've
            # already converted. The below iteration assumes that matching
            # links are in order, and don't overlap.
            replace_with = []
            replace_offset = 0
            matching_links.each do |match|
              # The link's range is an offset from the beginning of the line,
              # but the text piece we're splitting up might be partway into
              # the line. Convert the link coordinates to piece-relative ones.
              relative_from = match[:from] - offset
              relative_to = match[:to] - offset

              # If the previous link ended some time before the new link
              # began (or if the current link is the first one, and is not
              # at the beginning), ensure that the plain text "in between"
              # is kept.
              replace_with << content[replace_offset...relative_from]

              tag = (match.include? :href) ? 'a' : 'span'
              new_link = Nokogiri::XML::Node.new(tag, document)
              if match.include? :href
                # For nodes with links, note what they're referring to, so
                # we can adjust their hrefs when we assign global IDs.
                href = match[:href].to_s
                new_link['href'] = href
                ref_list = nodes_referencing_href.fetch(href) { nodes_referencing_href[href] = [] }
                ref_list << new_link
              end
              if match.include? :id
                # For nodes with IDs visible in the current Hugo file, we'll
                # want to redirect links that previously go to other Agda
                # module HTML files. So, note the ID that we want to redirect,
                # and pick a new unique ID to replace it with.
                id = match[:id].to_s
                local_href = "#{html_file}##{id}"

                # The same piece of Agda module code could have been included
                # twice (e.g., a snippet first, then the surrounding code)
                # In that case, don't assign it a new ID; prefer the earlier
                # occurrence.
                uniq_id = seen_hrefs.fetch(local_href) do
                  new_id = "agda-unique-ident-#{id_counter}"
                  id_counter += 1
                  seen_hrefs[local_href] = new_id
                end
                new_link['id'] = uniq_id
              end
              new_link.content = content[relative_from...relative_to]

              replace_with << new_link
              replace_offset = relative_to
            end
            replace_with << content[replace_offset..-1]

            # Finally, replace the node under consideration with the new
            # pieces.
            replace_with.each do |replacement|
              lt.add_previous_sibling replacement
            end
            lt.remove

            offset = new_offset
          end
        end
      end
    end

    # Now, we have a complete list of all the IDs visible in scope.
    # Redirect relevant links to these IDs. This achieves within-post
    # links.
    seen_hrefs.each do |agda_id, hugo_id|
      nodes_to_rewrite = nodes_referencing_href[agda_id]
      next unless nodes_to_rewrite

      nodes_to_rewrite.each do |node|
        node['href'] = "##{hugo_id}"
      end

      nodes_referencing_href.delete agda_id
    end

    # The remaining nodes reference symbols not in the current Hugo document,
    # so just replace them with their contents for now (we'd need some
    # external hosting to make this work otherwise).
    nodes_referencing_href.each do |href, nodes|
      nodes.each do |node|
        node.replace node.content
      end
    end

    File.write(file, document.to_html(encoding: 'UTF-8'))
  end
end

handler = Handler.new
files.each do |file|
  handler.process_source_file file
end
Add a script to generate links using Agda's HTML output Signed-off-by: Danila Fedorin <danila.fedorin@gmail.com> 2024-05-22 16:28:00 -07:00			`require "nokogiri"`
			`require "pathname"`

			`files = ARGV[0..-1]`

			`class LineInfo`
			`attr_accessor :links`

			`def initialize`
			`@links = []`
			`end`
			`end`

			`class Handler`
			`def initialize`
			`@file_infos = {}`
			`end`

			`# Traverse the preformatted Agda block in the given Agda HTML file`
			`# and find which textual ranges have IDs and links to other ranges.`
			`# Store this information in a hash, line => LineInfo.`
			`def process_agda_html_file(file)`
			`return @file_infos[file] if @file_infos.include? file`

			`@file_infos[file] = line_infos = {}`
			`unless File.exists?(file)`
			`return line_infos`
			`end`

			`document = Nokogiri::HTML.parse(File.open(file))`
			`pre_code = document.css("pre.Agda")[0]`

			`# The traversal postorder; we always visit children before their`
			`# parents, and we visit leaves in sequence.`
			`offset = 0`
			`line = 1`
			`pre_code.traverse do \|at\|`
			`# Text nodes are always leaves; visiting a new leaf means we've advanced`
			`# in the text by the length of that text. However, if there are newlines`
			`# -- since this is a preformatted block -- we also advanced by a line.`
			`# At this time, do not support links that span multiple lines, but`
			`# Agda doesn't produce those either.`
			`if at.text?`
			`if at.content.include? "\n"`
			`# This textual leaf is at least part whitespace. The logic`
			`# assumes that links can't span multiple pages, and that links`
			`# aren't nested, so ensure that the parent of the textual node`
			`# is the preformatted block itself.`
			`raise "unsupported Agda HTML output" if at.parent.name != "pre"`

			`# Increase the line and track the final offset. Written as a loop`
			`# in case we eventually want to add some handling for the pieces`
			`# sandwiched between newlines.`
			`at.content.split("\n", -1).each_with_index do \|bit, idx\|`
			`line += 1 unless idx == 0`
			`offset = bit.length`
			`end`
			`else`
			`# It's not a newline node, so it could be anywhere. All we need to`
			`# do is adjust the offset within the full pre block's text.`
			`offset += at.content.length`
			`end`
			`elsif at.name == "a"`
			`# We found a link. Agda emits both links and "things to link to" as`
			`# 'a' nodes, so check for either, and record them. Even if`
			`# the link is nested, the .content.length accessor will only`
			`# retrieve the textual content, and thus -- assuming the link`
			`# isn't split across lines -- will find the proper from-to range.`

			`line_info = line_infos.fetch(line) { line_infos[line] = LineInfo.new }`
			`href = at.attribute("href")`
			`id = at.attribute("id")`
			`if href or id`
			`new_link = { :from => offset-at.content.length, :to => offset }`
			`new_link[:href] = href if href`
			`new_link[:id] = id if id`

			`line_info.links << new_link`
			`end`
			`end`
			`end`
			`return line_infos`
			`end`

			`# Given a Hugo HTML file which references potentially several Agda modules`
			`# in code blocks, insert links into the code blocks.`
			`#`
			`# There are several things we need to take care of:`
			`# 1. Finding the HTML files associated with each referenced Agda module.`
			`# For this, we make use of the data-base-path etc. attributes that`
			`# the vanilla theme inserts.`
			`# 2. "zipping together" the Agda and Hugo HTML representations. Each of`
			`# them encode the code, but they use different HTML elements and structures.`
			`# So, given a Hugo HTML code block, traverse its textual contents`
			`# and find any that are covered by links in the related Agda HTML file.`
			`# 3. Fixing up links: the Agda HTML links assume each module has its own HTML`
			`# file. This isn't true for us: multiple modules are stitched into`
			`# one Hugo HTML file. Also, we don't include the entire Agda HTML`
			`# file in the Hugo HTML, so some links may be broken. So, find IDs`
			`# that are visible in the Hugo file, rename them to be globally unique,`
			`# and re-write cross-file links that reference these IDs to point`
			`# inside the Hugo file.`
			`def process_source_file(file)`
			`nodes_referencing_href = {} # Agda HTML ID -> all (Hugo HTML) nodes that link to it`
			`seen_hrefs = {} # Agda HTML ID -> its new ID in Hugo-land`
			`id_counter = 0`

			`# Process each highlight group that's been marked as an Agda file.`
			`document = Nokogiri::HTML.parse(File.open(file))`
			`document.css('div[data-agda-block]').each do \|t\|`
			`first_line, last_line = nil, nil`

			`if first_line_attr = t.attribute("data-first-line")`
			`first_line = first_line_attr.to_s.to_i`
			`end`
			`if last_line_attr = t.attribute("data-last-line")`
			`last_line = last_line_attr.to_s.to_i`
			`end`

Support blocks that are the entire file Signed-off-by: Danila Fedorin <danila.fedorin@gmail.com> 2024-05-22 16:50:45 -07:00			`if first_line and last_line`
			`line_range = first_line..last_line`
			`else`
			`line_range = 1..`
			`end`

Add a script to generate links using Agda's HTML output Signed-off-by: Danila Fedorin <danila.fedorin@gmail.com> 2024-05-22 16:28:00 -07:00			`full_path = t.attribute("data-file-path").to_s`
			`full_path_dirs = Pathname(full_path).each_filename.to_a`

Support non-submodule code Signed-off-by: Danila Fedorin <danila.fedorin@gmail.com> 2024-05-22 16:47:42 -07:00			`# The name of an Agda module is determined from its directory`
			`# structure: A/B/C.agda creates A.B.C.html. Depending on where`
			`# the code is included, there might be some additional folders`
			`# that precede A that we want to ignore.`
			`base_path = t.attribute("data-base-path").to_s`
			`base_dir_depth = 0`
			`if base_path.empty?`
			`# No submodules were used. Assume code/<X> is the root, since`
			`# that's the code layout of the blog right now.`
			`base_dir_depth = 1`
			`base_path = full_path_dirs[0]`
			`else`
			`# The code is in a submodule. Assume that the base path / submodule`
			`# root is the Agda module root, ignore all folders before that.`
			`base_path_dirs = Pathname(base_path).each_filename.to_a`
			`base_dir_depth = base_path_dirs.length`
			`end`

			`dirs_in_base = full_path_dirs[base_dir_depth..-1]`
Add a script to generate links using Agda's HTML output Signed-off-by: Danila Fedorin <danila.fedorin@gmail.com> 2024-05-22 16:28:00 -07:00			`html_file = dirs_in_base.join(".").gsub(/\.agda$/, ".html")`
			`html_path = File.join(["code", base_path, "html", html_file])`

			`agda_info = process_agda_html_file(html_path)`

			`# Hugo conveniently generates a bunch of spans, each encoding a line`
			`# of code output. We can iterate over these and match them up with`
			`# the line numbers we got from reading the Agda HTML output.`
			`lines = t.css("pre.chroma code[data-lang] .line")`
Support blocks that are the entire file Signed-off-by: Danila Fedorin <danila.fedorin@gmail.com> 2024-05-22 16:50:45 -07:00			`lines.zip(line_range).each do \|line, line_no\|`
Add a script to generate links using Agda's HTML output Signed-off-by: Danila Fedorin <danila.fedorin@gmail.com> 2024-05-22 16:28:00 -07:00			`line_info = agda_info[line_no]`
			`next unless line_info`

			`offset = 0`
			`line.traverse do \|lt\|`
			`if lt.text?`
			`content = lt.content`
			`new_offset = offset + content.length`

			`# The span/a/etc. structure of the Agda and Hugo HTML files`
			`# need not line up; it's possible for there to be a single link`
			`# in the Agda file that's broken up across multiple HTML nodes`
			`# in the Hugo output. For now, just don't link those, since inserting`
			`# such overlapping links is relatively complicated. Instead,`
			`# require links to fit fully within a current text node (and thus,`
			`# not overlap the boundaries of any HTML).`
			`matching_links = line_info.links.filter do \|link\|`
			`link[:from] >= offset and link[:to] <= new_offset`
			`end`

			`# A given text node can be broken into any number of sub-nodes,`
			`# where some sub-nodes are still text, and others have been turned`
			`# into links. Store the new pieces in replace_with. E.g.,`
			`#`
			`# Original:`
			`# abc`
			`#`
			`# New:`
			`# a<a href="..">b</a>c`
			`#`
			`# replace_with:`
			`# ["a", <Nokogiri::XML::Node...>, "c"]`
			`#`
			`# match_offset represents how much of the original text we've`
			`# already converted. The below iteration assumes that matching`
			`# links are in order, and don't overlap.`
			`replace_with = []`
			`replace_offset = 0`
			`matching_links.each do \|match\|`
			`# The link's range is an offset from the beginning of the line,`
			`# but the text piece we're splitting up might be partway into`
			`# the line. Convert the link coordinates to piece-relative ones.`
			`relative_from = match[:from] - offset`
			`relative_to = match[:to] - offset`

			`# If the previous link ended some time before the new link`
			`# began (or if the current link is the first one, and is not`
			`# at the beginning), ensure that the plain text "in between"`
			`# is kept.`
			`replace_with << content[replace_offset...relative_from]`

			`tag = (match.include? :href) ? 'a' : 'span'`
			`new_link = Nokogiri::XML::Node.new(tag, document)`
			`if match.include? :href`
			`# For nodes with links, note what they're referring to, so`
			`# we can adjust their hrefs when we assign global IDs.`
			`href = match[:href].to_s`
			`new_link['href'] = href`
			`ref_list = nodes_referencing_href.fetch(href) { nodes_referencing_href[href] = [] }`
			`ref_list << new_link`
			`end`
			`if match.include? :id`
			`# For nodes with IDs visible in the current Hugo file, we'll`
			`# want to redirect links that previously go to other Agda`
			`# module HTML files. So, note the ID that we want to redirect,`
			`# and pick a new unique ID to replace it with.`
			`id = match[:id].to_s`
Improve support when code occurrs multiple times Signed-off-by: Danila Fedorin <danila.fedorin@gmail.com> 2024-05-22 17:05:38 -07:00			`local_href = "#{html_file}##{id}"`

			`# The same piece of Agda module code could have been included`
			`# twice (e.g., a snippet first, then the surrounding code)`
			`# In that case, don't assign it a new ID; prefer the earlier`
			`# occurrence.`
			`uniq_id = seen_hrefs.fetch(local_href) do`
			`new_id = "agda-unique-ident-#{id_counter}"`
			`id_counter += 1`
			`seen_hrefs[local_href] = new_id`
			`end`
Add a script to generate links using Agda's HTML output Signed-off-by: Danila Fedorin <danila.fedorin@gmail.com> 2024-05-22 16:28:00 -07:00			`new_link['id'] = uniq_id`
			`end`
			`new_link.content = content[relative_from...relative_to]`

			`replace_with << new_link`
			`replace_offset = relative_to`
			`end`
			`replace_with << content[replace_offset..-1]`

			`# Finally, replace the node under consideration with the new`
			`# pieces.`
			`replace_with.each do \|replacement\|`
			`lt.add_previous_sibling replacement`
			`end`
			`lt.remove`

			`offset = new_offset`
			`end`
			`end`
			`end`
			`end`

			`# Now, we have a complete list of all the IDs visible in scope.`
			`# Redirect relevant links to these IDs. This achieves within-post`
			`# links.`
			`seen_hrefs.each do \|agda_id, hugo_id\|`
			`nodes_to_rewrite = nodes_referencing_href[agda_id]`
			`next unless nodes_to_rewrite`

			`nodes_to_rewrite.each do \|node\|`
			`node['href'] = "##{hugo_id}"`
			`end`

			`nodes_referencing_href.delete agda_id`
			`end`

			`# The remaining nodes reference symbols not in the current Hugo document,`
			`# so just replace them with their contents for now (we'd need some`
			`# external hosting to make this work otherwise).`
			`nodes_referencing_href.each do \|href, nodes\|`
			`nodes.each do \|node\|`
			`node.replace node.content`
			`end`
			`end`

			`File.write(file, document.to_html(encoding: 'UTF-8'))`
			`end`
			`end`

			`handler = Handler.new`
			`files.each do \|file\|`
			`handler.process_source_file file`
			`end`