agda-spa/lean/Spa/Language/Graphs.lean

import Spa.Language.Base
import Mathlib.Data.Fin.Tuple.Basic
import Mathlib.Data.List.ProdSigma
import Mathlib.Data.List.FinRange

/-!

Algebraic Control Flow Graphs.

This file defines control flow graphs and operations to naturally compose them,
making it possible to inductively covnert a program in the object language
(see `Spa.Stmt` in `Spa/Language/Base.lean`) into its corresponding graph.

Graphs are, in general, parameterized by their "payload" (the per-node data); see `GGraph`.
This is useful because other operations, such as finding the CFG node corresponding
to an AST node, are performed by embellishing a graph's basic blocks with their AST
identifiers.

The operations are deliberately a little bit sloppy here, creating empty / statement-less
CFG nodes. Additionally, the current CFG construction algorithm doesn't group
consecutive statements in a single notional basic block into one node.
This makes graph construction much easier to define, and might save us the
trouble of (when trying to find the CFG node for an AST node) doing
indexing into a list.

-/

/-- Bump the upper bound of a list of `Fin`s without changing their value. -/
def List.finCastAdd {n : ℕ} (l : List (Fin n)) (m : ℕ) : List (Fin (n + m)) :=
  l.map (Fin.castAdd m)

/-- Bump the upper bound of a list of `Fin`s by adding the amount to their value. -/
def List.finNatAdd {m : ℕ} (l : List (Fin m)) (n : ℕ) : List (Fin (n + m)) :=
  l.map (Fin.natAdd n)

/-- Bump the upper bound of a list of `Fin` pairs without changing their value. -/
def List.finCastAddProd {n : ℕ} (l : List (Fin n × Fin n)) (m : ℕ) :
    List (Fin (n + m) × Fin (n + m)) :=
  l.map (fun e => (e.1.castAdd m, e.2.castAdd m))

/-- Bump the upper bound of a list of `Fin` pairs by adding the amount to their value. -/
def List.finNatAddProd {m : ℕ} (l : List (Fin m × Fin m)) (n : ℕ) :
    List (Fin (n + m) × Fin (n + m)) :=
  l.map (fun e => (e.1.natAdd n, e.2.natAdd n))

namespace Spa

/-- Graph with general (`α`-labeled) nodes. By using a tuple `Fin size → α`
    and writing `edges` over the `Fin size`, guarantees all edges are between real nodes.

    To make graph composition via operations not force a
    [`alga`](https://hackage.haskell.org/package/algebraic-graphs)-style "connect"-based
    algebra, explicitly defines `inputs` and `outputs`, which are the only nodes that
    get connected when graphs are sequenced. This makes the graph construction
    operations more naturally fit with how CFGs are created from `Stmt`s. -/
structure GGraph (α : Type) where
  size : ℕ
  nodes : Fin size → α
  edges : List (Fin size × Fin size)
  inputs : List (Fin size)
  outputs : List (Fin size)

namespace GGraph

variable {α β : Type}

/-- An index (node) in the CFG. -/
abbrev Index (g : GGraph α) : Type := Fin g.size

/-- An edge in the CFG. -/
abbrev Edge (g : GGraph α) : Type := g.Index × g.Index

instance : Functor GGraph where
  map {α β : Type} (f : α → β) (g : GGraph α) : GGraph β :=
    { size := g.size,
      nodes := f ∘ g.nodes
      edges := g.edges,
      inputs := g.inputs,
      outputs := g.outputs }

@[simp] lemma map_size (f : α → β) (g : GGraph α) : (f <$> g).size = g.size := rfl
@[simp] lemma map_edges (f : α → β) (g : GGraph α) : (f <$> g).edges = g.edges := rfl
@[simp] lemma map_inputs (f : α → β) (g : GGraph α) : (f <$> g).inputs = g.inputs := rfl
@[simp] lemma map_outputs (f : α → β) (g : GGraph α) : (f <$> g).outputs = g.outputs := rfl

/-- Overlay two graphs: create a new graph whose nodes and edges come from two
    sub-graphs, without inserting any additional edges. Also combines the
    input and output node sets. -/
def overlay (g₁ g₂ : GGraph α) : GGraph α where
  size := g₁.size + g₂.size
  nodes := Fin.append g₁.nodes g₂.nodes
  edges := g₁.edges.finCastAddProd g₂.size ++ g₂.edges.finNatAddProd g₁.size
  inputs := g₁.inputs.finCastAdd g₂.size ++ g₂.inputs.finNatAdd g₁.size
  outputs := g₁.outputs.finCastAdd g₂.size ++ g₂.outputs.finNatAdd g₁.size

@[inherit_doc] scoped infixr:70 " ∙ " => GGraph.overlay

/-- Sequence two CFGs: create a combined graph whose nodes and edges come
    from two subgraphs, __and__ make all the outputs of the left graph have edges to
    all the inputs of the right graph. By the semantics of CFGs, this
    encodes the fact that code first traverses the basic blocks in theleft
    graph, and does the same for the right graph. -/
def sequence (g₁ g₂ : GGraph α) : GGraph α where
  size := g₁.size + g₂.size
  nodes := Fin.append g₁.nodes g₂.nodes
  edges := g₁.edges.finCastAddProd g₂.size ++ g₂.edges.finNatAddProd g₁.size ++
    (g₁.outputs.finCastAdd g₂.size).product (g₂.inputs.finNatAdd g₁.size)
  inputs := g₁.inputs.finCastAdd g₂.size
  outputs := g₂.outputs.finNatAdd g₁.size

@[inherit_doc] scoped infixr:70 " ⤳ " => GGraph.sequence

/-- When a graph `g` is wrapped in a `loop`, the index / node corresponding
    to the input of the new loop. -/
def loopIn (g : GGraph α) : Fin (2 + g.size) := (0 : Fin 2).castAdd g.size

/-- When a graph `g` is wrapped in a `loop`, the index / node corresponding
    to the output of the new loop. -/
def loopOut (g : GGraph α) : Fin (2 + g.size) := (1 : Fin 2).castAdd g.size

/-- Creates a zero-or-more loop loop in the CFG: connects all the output
    nodes of the CFG back to the graph's beginning, and also introduces a path
    to a new ending node (see `loopOut`) which bypasses the entire graph.

    Notably, both the new input (`loopIn`) and new output (`loopOut`)
    nodes are necessary for correctness: adding a path from inputs to a
    hypothetical no-op end node encodes something like "just the first statement is executed".
    Similarly, just adding a path from a a hypothetical no-op beginning node
    to the outputs encodes "just the last statement is executed".

    This is technically sloppy (see module comment), but it's simple.
-/
def loop (g : GGraph (List β)) : GGraph (List β) where
  size := 2 + g.size
  nodes := Fin.append (fun _ : Fin 2 => []) g.nodes
  edges := g.edges.finNatAddProd 2 ++
    ((g.loopIn, ·) <$> g.inputs.finNatAdd 2)  ++
    ((·, g.loopOut) <$> g.outputs.finNatAdd 2) ++
    [(g.loopOut, g.loopIn), (g.loopIn, g.loopOut)]
  inputs := [g.loopIn]
  outputs := [g.loopOut]

@[simp] lemma loop_inputs (g : GGraph (List β)) : (loop g).inputs = [g.loopIn] := rfl

@[simp] lemma loop_outputs (g : GGraph (List β)) : (loop g).outputs = [g.loopOut] := rfl

/-- Creates a single-node graph whose node contains the given value. -/
def singleton (a : α) : GGraph α where
  size := 1
  nodes := fun _ => a
  edges := []
  inputs := [0]
  outputs := [0]

/-- Creates a new graph with a single input and single output node. Useful to ensure there's
    a single point of entry and single point of exit. -/
def wrap (g : GGraph (List β)) : GGraph (List β) :=
  singleton [] ⤳ g ⤳ singleton []

@[simp] lemma map_singleton (f : α → β) (a : α) :
    f <$> singleton a = singleton (f a) := rfl

@[simp] lemma map_overlay (f : α → β) (g₁ g₂ : GGraph α) :
    f<$> (g₁ ∙ g₂) = f <$> g₁ ∙ f <$> g₂ := by
  rcases g₁ with ⟨n₁, nd₁, e₁, i₁, o₁⟩; rcases g₂ with ⟨n₂, nd₂, e₂, i₂, o₂⟩
  simp only [Functor.map, GGraph.overlay]
  congr 1
  funext i
  refine Fin.addCases ?_ ?_ i <;> intro j <;> simp [Fin.append_left, Fin.append_right]

@[simp] lemma map_sequence (f : α → β) (g₁ g₂ : GGraph α) :
    f <$> (g₁ ⤳ g₂) = (f <$> g₁) ⤳  (f <$> g₂) := by
  rcases g₁ with ⟨n₁, nd₁, e₁, i₁, o₁⟩; rcases g₂ with ⟨n₂, nd₂, e₂, i₂, o₂⟩
  simp only [Functor.map, GGraph.sequence]
  congr 1
  funext i
  refine Fin.addCases ?_ ?_ i <;> intro j <;> simp [Fin.append_left, Fin.append_right]

@[simp] lemma map_loop (h : β → γ) (g : GGraph (List β)) :
    (List.map h) <$> (loop g) = loop (List.map h <$> g) := by
  rcases g with ⟨n, nd, e, i, o⟩
  simp only [Functor.map, GGraph.loop]
  congr 1
  funext i
  refine Fin.addCases ?_ ?_ i <;> intro j <;> simp [Fin.append_left, Fin.append_right]

@[simp] lemma map_wrap (h : β → γ) (g : GGraph (List β)) :
    (List.map h) <$> wrap g = wrap (List.map h <$> g) := by
  simp [GGraph.wrap, GGraph.map_sequence, GGraph.map_singleton]

variable (g : GGraph α)

/-- All the nodes in the graph. -/
def indices : List g.Index := List.finRange g.size

/-- All of the graph's indices are listed in `indices`. -/
lemma mem_indices (idx : g.Index) : idx ∈ g.indices :=
  List.mem_finRange idx

/-- `indices` does not have duplicates. -/
lemma nodup_indices : g.indices.Nodup :=
  List.nodup_finRange g.size

/-- Predecessors of a particular node in the graph. --/
def predecessors (idx : g.Index) : List g.Index :=
  g.indices.filter (fun idx' => (idx', idx) ∈ g.edges)

/-- There's there's an edge between two nodes `idx₁` and `idx₂`,
    then `idx₁` is the predecessor of `idx₂`. -/
lemma mem_predecessors_of_edge {idx₁ idx₂ : g.Index}
    (h : (idx₁, idx₂) ∈ g.edges) : idx₁ ∈ g.predecessors idx₂ :=
  List.mem_filter.mpr ⟨g.mem_indices idx₁, by simpa using h⟩

/-- A node is a predecessor of another node only if there's an
    edge between them. -/
lemma edge_of_mem_predecessors {idx₁ idx₂ : g.Index}
    (h : idx₁ ∈ g.predecessors idx₂) : (idx₁, idx₂) ∈ g.edges := by
  simpa using (List.mem_filter.mp h).2

end GGraph

/-- "Normal" graphs, for the purposes of the analyses in this
    framework, have basic blocks in their nodes, and nothing else. -/
abbrev Graph : Type := GGraph (List BasicStmt)

namespace Graph

export GGraph (overlay sequence loop singleton wrap loop_inputs loop_outputs)

@[inherit_doc] scoped infixr:70 " ∙ " => GGraph.overlay
@[inherit_doc] scoped infixr:70 " ⤳ " => GGraph.sequence

end Graph

open Graph in
def Stmt.cfg : Stmt → Graph
  -- A basic statement goes into a single basic block
  | .basic bs => singleton [bs]
  -- Sequencing of statements corresponds naturally to CFG sequencing
  | .andThen s₁ s₂ => s₁.cfg ⤳  s₂.cfg
  -- An if can execute either one branch or the other; overlap them.
  -- Subsequent sequencing (etc.) will end up creating the forks and joins.
  | .ifElse _ s₁ s₂ => s₁.cfg ∙ s₂.cfg
  -- The `loop` construct was developed specifically for zero-or-more loops like this.
  | .whileLoop _ s => loop s.cfg

end Spa