From f42cb900cfcc9e6f24948368e01bdcfbd6277786 Mon Sep 17 00:00:00 2001
From: Danila Fedorin
Date: Sat, 3 Aug 2019 15:45:14 -0700
Subject: [PATCH] Add the drafts of the two posts

---
 code/compiler_ast.hpp                  |  76 ++++++++
 code/compiler_parser.y                 |  26 +++
 code/compiler_scanner.l                |  33 ++++
 content/blog/01_compiler_tokenizing.md | 243 +++++++++++++++++++++++++
 content/blog/02_compiler_parsing.md    | 231 +++++++++++++++++++++++
 5 files changed, 609 insertions(+)
 create mode 100644 code/compiler_ast.hpp
 create mode 100644 code/compiler_parser.y
 create mode 100644 code/compiler_scanner.l
 create mode 100644 content/blog/01_compiler_tokenizing.md
 create mode 100644 content/blog/02_compiler_parsing.md

diff --git a/code/compiler_ast.hpp b/code/compiler_ast.hpp
new file mode 100644
index 0000000..ea3497d
--- /dev/null
+++ b/code/compiler_ast.hpp
@@ -0,0 +1,76 @@
+#pragma once
+#include <memory>
+#include <string>
+#include <vector>
+
+// Base class for all expression nodes.
+struct ast {
+    virtual ~ast() = default;
+};
+
+using ast_ptr = std::unique_ptr<ast>;
+
+// Base class for patterns in case expression branches.
+struct pattern {
+    virtual ~pattern() = default;
+};
+
+// A pattern that binds the matched value to a variable.
+struct pattern_var : public pattern {
+    std::string var;
+
+    pattern_var(const char* v)
+        : var(v) {}
+};
+
+// A pattern that matches a constructor and binds its arguments to variables.
+struct pattern_constr : public pattern {
+    std::string constr;
+    std::vector<std::string> params;
+
+    pattern_constr(const char* c, std::vector<std::string>&& p)
+        : constr(c) {
+        std::swap(params, p);
+    }
+};
+
+using pattern_ptr = std::unique_ptr<pattern>;
+
+// A single case branch: a pattern and the expression evaluated when it matches.
+struct branch {
+    pattern_ptr pat;
+    ast_ptr expr;
+
+    branch(pattern_ptr&& p, ast_ptr&& a)
+        : pat(std::move(p)), expr(std::move(a)) {}
+};
+
+using branch_ptr = std::unique_ptr<branch>;
+
+enum binop {
+    PLUS,
+    MINUS,
+    TIMES,
+    DIVIDE
+};
+
+// A binary arithmetic operation.
+struct ast_binop : public ast {
+    binop op;
+    ast_ptr left;
+    ast_ptr right;
+
+    ast_binop(binop o, ast_ptr&& l, ast_ptr&& r)
+        : op(o), left(std::move(l)), right(std::move(r)) {}
+};
+
+// Function application: left applied to right.
+struct ast_app : public ast {
+    ast_ptr left;
+    ast_ptr right;
+
+    ast_app(ast_ptr&& l, ast_ptr&& r)
+        : left(std::move(l)), right(std::move(r)) {}
+};
+
+// A case expression: the expression being inspected and its branches.
+struct ast_case : public ast {
+    ast_ptr of;
+    std::vector<branch_ptr> branches;
+
+    ast_case(ast_ptr&& o, std::vector<branch_ptr>&& b)
+        : of(std::move(o)) {
+        std::swap(branches, b);
+    }
+};
diff --git a/code/compiler_parser.y b/code/compiler_parser.y
new file mode 100644
index 0000000..a72d14d
--- /dev/null
+++ b/code/compiler_parser.y
@@ -0,0 +1,26 @@
+%{
+#include <string>
+#include <iostream>
+#include "ast.hpp"
+#include "parser.hpp"
+%}
+
+%token PLUS
+%token TIMES
+%token MINUS
+%token DIVIDE
+%token INT
+%token DEFN
+%token DATA
+%token CASE
+%token OF
+%token OCURLY
+%token CCURLY
+%token OPAREN
+%token CPAREN
+%token COMMA
+%token ARROW
+%token EQUAL
+%token LID
+%token UID
+
diff --git a/code/compiler_scanner.l b/code/compiler_scanner.l
new file mode 100644
index 0000000..b0bb329
--- /dev/null
+++ b/code/compiler_scanner.l
@@ -0,0 +1,33 @@
+%option noyywrap
+
+%{
+#include <iostream>
+%}
+
+%%
+
+[ \n]+ {}
+\+ { std::cout << "PLUS" << std::endl; }
+\* { std::cout << "TIMES" << std::endl; }
+- { std::cout << "MINUS" << std::endl; }
+\/ { std::cout << "DIVIDE" << std::endl; }
+[0-9]+ { std::cout << "NUMBER: " << yytext << std::endl; }
+defn { std::cout << "KEYWORD: defn" << std::endl; }
+data { std::cout << "KEYWORD: data" << std::endl; }
+case { std::cout << "KEYWORD: case" << std::endl; }
+of { std::cout << "KEYWORD: of" << std::endl; }
+\{ { std::cout << "OPEN CURLY" << std::endl; }
+\} { std::cout << "CLOSED CURLY" << std::endl; }
+\( { std::cout << "OPEN PARENTH" << std::endl; }
+\) { std::cout << "CLOSE PARENTH" << std::endl; }
+, { std::cout << "COMMA" << std::endl; }
+-> { std::cout << "PATTERN ARROW" << std::endl; }
+= { std::cout << "EQUAL" << std::endl; }
+[a-z][a-zA-Z]* { std::cout << "LOWERCASE IDENTIFIER: " << yytext << std::endl; }
+[A-Z][a-zA-Z]* { std::cout << "UPPERCASE IDENTIFIER: " << yytext << std::endl; }
+
+%%
+
+int main() {
+    yylex();
+}
diff --git a/content/blog/01_compiler_tokenizing.md b/content/blog/01_compiler_tokenizing.md
new file mode 100644
index 0000000..c9a7616
--- /dev/null
+++ b/content/blog/01_compiler_tokenizing.md
@@ -0,0 +1,243 @@
+---
+title: Compiling a Functional Language Using C++, Part 1 - Tokenizing
+date: 2019-08-03T01:02:30-07:00
+tags: ["C and C++", "Functional Languages", "Compilers"]
+draft: true
+---
+During my last academic term, I was enrolled in a compilers course.
+We had a final project - develop a compiler for a basic Python subset,
+using LLVM. It was a little boring - virtually nothing about the compiler
+was __not__ covered in class, and it felt more like putting two puzzle
+pieces together than building a real project.
+
+Being involved in the Programming Language Theory (PLT) research group at my
+university, I decided to do something different for the final project -
+a compiler for a functional language. In a series of posts, starting with
+this one, I will explain what I did so that those interested in the subject
+are able to replicate my steps, and maybe learn something for themselves.
+
+### The "classic" stages of a compiler
+Let's take a look at the high-level overview of what a compiler does.
+Conceptually, the components of a compiler are pretty cleanly separated.
+They are as follows:
+
+1. Tokenizing / lexical analysis
+2. Parsing
+3. Analysis / optimization
+4. Code generation
+
+There are many variations on this structure. Some compilers don't optimize
+at all; some translate the program text into an intermediate representation,
+an alternative way of representing the program that isn't machine code.
+In some compilers, the stages of parsing and analysis can overlap.
+In short, just like the pirate's code, it's more of a guideline than a rule.
+
+### Tokenizing and Parsing (the "boring stuff")
+It makes sense to build a compiler bit by bit, following the stages we outlined above.
+This is because these stages are essentially a pipeline, with program text
+coming in one end, and the final program coming out of the other. So as we build
+up our pipeline, we'll be able to push program text further and further through it, until
+eventually we get something that we can run on our machine.
+
+This is how most tutorials go about building a compiler, too. The result is that
+there are a __lot__ of tutorials covering tokenizing and parsing. This is why
+I refer to this part of the process as "boring". Nonetheless, I will cover the steps
+required to tokenize and parse our little functional language. But before we do that,
+we first need to have an idea of what our language looks like.
+
+### The Grammar
+Simon Peyton Jones, in his two works regarding compiling functional languages, remarks
+that most functional languages are very similar, and vary largely in syntax. That's
+our main degree of freedom. We want to represent the following things, for sure:
+
+* Defining functions
+* Applying functions
+* Arithmetic
+* Algebraic data types (to represent lists, pairs, and the like)
+* Pattern matching (to operate on data types)
+
+We can additionally support anonymous (lambda) functions, but compiling those
+is actually a bit trickier, so we will skip those for now. Arithmetic is the simplest to
+define - let's define it as we would expect: `3` is a number, and `3+2*6` evaluates to 15.
+Function application isn't much more difficult - `f x` means "apply f to x", and
+`f x + g x` means the sum of the results of applying f to x and applying g to x. That is, function
+application has higher precedence, or __binds tighter__, than binary operators like plus.
+
+Next, let's define the syntax for declaring a function. Why not:
+```
+defn f x = { x + x }
+```
+
+As for declaring data types:
+```
+data List = { Nil, Cons Int List }
+```
+Notice that we are avoiding polymorphism here.
+
+Let's also define a syntax for pattern matching:
+```
+case l of {
+    Nil -> { 0 }
+    Cons x xs -> { x }
+}
+```
+The above means "if the list `l` is `Nil`, then return 0; otherwise, if it's
+constructed from an integer and another list (as defined in our `data` example),
+return the integer".
+
+That's it for now! Let's take a look at tokenizing.
+
+### Tokenizing
+When we first get our program text, it's in a representation difficult for us to make
+sense of. If we look at how it's represented in C++, we see that it's just an array
+of characters (potentially hundreds, thousands, or millions in length). We __could__
+jump straight to parsing the text (which involves creating a tree structure, known
+as an __abstract syntax tree__; more on that later). There's nothing wrong with this approach -
+in fact, in functional languages, tokenizing is frequently skipped. However,
+in a closer-to-metal language like C++, it happens to be more convenient to first break the
+input text into a bunch of distinct segments (tokens).
+
+For example, consider the string "320+6". If we skip tokenizing and go straight
+into parsing, we'd feed our parser the sequence of characters `['3', '2', '0', '+', '6', '\0']`.
+On the other hand, if we run a tokenizing step on the string first, we'd be feeding our
+parser three tokens, `("320", NUMBER)`, `("+", OPERATOR)`, and `("6", NUMBER)`.
+To us, this is a bit more clear - we've partitioned the string into logical segments.
+Our parser, then, won't have to care about recognizing a number - it will just know
+that a number is next in the string, and do with that information what it needs.
+
+How do we go about breaking up a string into tokens? We need to come up with a
+way to compare some characters in a string against a set of rules. But "rules"
+is a very general term - we could, for instance, define a particular
+token that is a Fibonacci number - 1, 2, 3, 5, and so on would be marked
+as a "Fibonacci number", while the other numbers would be marked as just
+regular numbers. To support that, our rules would get pretty complex. And
+checking these rules against particular strings would become equally complex.
+
+Fortunately, we're not insane. We observe that the rules for tokens in practice
+are fairly simple - one or more digits is an integer, a few letters together
+are a variable name. In order to be able to efficiently break text up into
+such tokens, we restrict ourselves to __regular languages__. A language
+is defined as a set of strings (potentially infinite), and a regular
+language is one for which we can write a __regular expression__ to check if
+a string is in the set. Regular expressions are a way of representing
+patterns that a string has to match. We define regular expressions
+as follows:
+
+* Any character is a regular expression that matches that character. Thus,
+\\(a\\) is a regular expression (from now on shortened to regex) that matches
+the character 'a', and nothing else.
+* \\(r_1r_2\\), or the concatenation of \\(r_1\\) and \\(r_2\\), is
+a regular expression that matches anything matched by \\(r_1\\), followed
+by anything that matches \\(r_2\\). For instance, \\(ab\\) matches
+the character 'a' followed by the character 'b' (thus matching "ab").
+* \\(r_1|r_2\\) matches anything that is either matched by \\(r_1\\) or
+\\(r_2\\). Thus, \\(a|b\\) matches the character 'a' or the character 'b'.
+* \\(r_1?\\) matches either an empty string, or anything matched by \\(r_1\\).
+* \\(r_1+\\) matches one or more things matched by \\(r_1\\). So,
+\\(a+\\) matches "a", "aa", "aaa", and so on.
+* \\((r_1)\\) matches anything that matches \\(r_1\\). This is mostly used
+to group things together in more complicated expressions.
+* \\(.\\) matches any character.
+
+More powerful variations of regex also include an "any of" operator, \\([c_1c_2c_3]\\),
+which is equivalent to \\(c_1|c_2|c_3\\), and a "range" operator, \\([c_1-c_n]\\), which
+matches all characters in the range between \\(c_1\\) and \\(c_n\\), inclusive.
+
+Let's see some examples. An integer, such as 326, can be represented with \\([0-9]+\\).
+This means one or more characters between 0 and 9. Some (most) regex implementations
+have a special symbol for \\([0-9]\\), written as \\(\\backslash d\\). A variable,
+starting with a lowercase letter and containing lowercase or uppercase letters after it,
+can be written as \\(\[a-z\]([a-zA-Z]+)?\\). Again, most regex implementations provide
+a special operator for \\((r_1+)?\\), written as \\(r_1*\\).
+
+#### The Theory
+So how does one go about checking if a regular expression matches a string? An efficient way is to
+first construct a [state machine](https://en.wikipedia.org/wiki/Finite-state_machine). A type of state machine can be constructed from a regular expression
+by literally translating each part of it to a series of states, one-to-one. This machine is called
+a __Nondeterministic Finite Automaton__, or NFA for short. The "Finite" means that the number of
+states in the state machine is, well, finite. For us, this means that we can store such
+a machine on disk. The "Nondeterministic" part, though, is more complex: given a particular character
+and a particular state, it's possible that an NFA has the option of transitioning into more
+than one other state. Well, which state __should__ it pick? There's no easy way to tell. Each time
+we can transition to more than one state, we exponentially increase the number of possible
+states that we can be in. This isn't good - we were going for efficiency, remember?
+
+What we can do is convert our NFA into another kind of state machine, in which for every character,
+only one state transition is possible. This machine is called a __Deterministic Finite Automaton__,
+or DFA for short. There's an algorithm to convert an NFA into a DFA, which I won't explain here.
+
+Since both the conversion of a regex into an NFA and the conversion of an NFA into a DFA are done
+by following an algorithm, we're always going to get the same DFA for the same regex we put in.
+Once we've come up with the rules for our tokens, we don't want to be building a DFA each time
+our tokenizer is run - the result will always be the same! Even worse, translating a regular
+expression all the way into a DFA is the expensive part of the whole process. The solution is to
+generate the state machine once, and convert it into code that simulates that state machine. Then, we include
+that code as part of our compiler.
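+
+To make this more concrete, here is a rough sketch of what such "hardcoded" state machine
+code could look like for the single regex `[0-9]+`. This is purely my own illustration
+(the function name `matches_integer` and the structure are made up for this example), not the
+code that flex will actually generate for us:
+
+```
+#include <string>
+
+// Simulates the two-state DFA for [0-9]+: state 0 means "nothing matched yet",
+// state 1 means "matched one or more digits", and -1 is a dead state.
+bool matches_integer(const std::string& input) {
+    int state = 0;
+    for (char c : input) {
+        if (state == -1) return false;      // once dead, always dead
+        bool is_digit = c >= '0' && c <= '9';
+        state = is_digit ? 1 : -1;          // both live states transition the same way here
+    }
+    return state == 1;                      // accept only if we end in the accepting state
+}
+```
+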
+This way, we have a state machine "hardcoded" into our tokenizer,
+and no conversion of regex to DFAs needs to be done at runtime.
+
+#### The Practice
+Creating an NFA, converting it to a DFA, and then generating C++ code are all cumbersome. If we had to
+write code to do this every time we made a compiler, it would get very repetitive, very fast.
+Fortunately, there exists a tool that does exactly this for us - it's called `flex`. Flex
+takes regular expressions, and generates code that matches a string against those regular expressions.
+It does one more thing in addition to that - for each regular expression it matches, flex
+runs a user-defined action (which we write in C++). We can use this to convert strings that
+represent numbers directly into numbers, and do other small tasks.
+
+So, what tokens do we have? From our arithmetic definition, we see that we have integers.
+Let's use the regex `[0-9]+` for those. We also have the operators `+`, `-`, `*`, and `/`.
+`-` is simple enough: the corresponding regex is `-`. We need to
+preface our `/`, `+` and `*` with a backslash, though, since they happen to be special
+characters in flex's regular expressions: `\/`, `\+`, `\*`.
+
+Let's also represent some reserved keywords. We'll say that `defn`, `data`, `case`, and `of`
+are reserved. Their regular expressions are just their names. We also want to tokenize
+`=`, `->`, `{`, `}`, `,`, `(` and `)`. Finally, we want to represent identifiers, like `f`,
+`x`, `Nil`, and `Cons`. We will actually make a distinction between lowercase identifiers
+and uppercase identifiers, as we will follow Haskell's convention of representing
+data type constructors with uppercase letters, and functions and variables with lowercase ones.
+So, our two regular expressions will be `[a-z][a-zA-Z]*` for the lowercase variables, and
+`[A-Z][a-zA-Z]*` for uppercase variables. Let's make a tokenizer in flex with all this. To do
+this, we create a new file, `scanner.l`, in which we write a mix of regular expressions
+and C++ code. Here's the whole thing:
+
+{{< rawblock "compiler_scanner.l" >}}
+
+A flex file starts with options. I set the `noyywrap` option, which disables a particular
+feature of flex that we won't use, and which would otherwise cause linker errors. Next up,
+flex allows us to put some C++ code that we want at the top of our generated code.
+I simply include `iostream`, so that we can use `cout` to print out our tokens.
+Next, `%%`, and after that, the meat of our tokenizer: regular expressions, followed by
+C++ code that should be executed when the regular expression is matched.
+
+The first rule: whitespace. This includes the space character
+and the newline character. We ignore it, so its action is empty. After that,
+we have the regular expressions for the tokens we've talked about. For each, I just
+print a description of the token that matched. This will change when we hook this up to
+a parser, but for now, this works fine. Notice that the variable `yytext` contains
+the string matched by our regular expression. This variable is set by the code flex
+generates, and we can use it to get the exact text that matched a regex. This is
+useful, for instance, to print the variable name that we matched. After
+all of our tokens, another `%%`, and more C++ code. For this simple example,
+I declare a `main` function, which just calls `yylex`, a function flex
+generates for us.
+Let's generate the C++ code, and compile it:
+
+```
+flex -o scanner.cpp scanner.l
+g++ -o scanner scanner.cpp
+```
+
+Now, let's feed it an expression:
+```
+echo "3+2*6" | ./scanner
+```
+
+We get the output:
+```
+NUMBER: 3
+PLUS
+NUMBER: 2
+TIMES
+NUMBER: 6
+```
+Hooray! We have tokenizing.
diff --git a/content/blog/02_compiler_parsing.md b/content/blog/02_compiler_parsing.md
new file mode 100644
index 0000000..473c753
--- /dev/null
+++ b/content/blog/02_compiler_parsing.md
@@ -0,0 +1,231 @@
+---
+title: Compiling a Functional Language Using C++, Part 2 - Parsing
+date: 2019-08-03T01:02:30-07:00
+tags: ["C and C++", "Functional Languages", "Compilers"]
+draft: true
+---
+In the previous post, we covered tokenizing. We learned how to convert an input string into logical segments, and even wrote up a tokenizer to do it according to the rules of our language. Now, it's time to make sense of the tokens, and parse our language.
+
+### The Theory
+The rules to parse a language are more complicated than the rules for
+recognizing tokens. For instance, consider a simple language of balanced
+parentheses, like `()` and `((()))`. You can't
+write a regular expression for it! We resort to a wider class of languages, called
+__context free languages__. These languages are ones that are matched by __context free grammars__.
+A context free grammar is a list of rules in the form of \\(S \\rightarrow \\alpha\\), where
+\\(S\\) is a __nonterminal__ (conceptually, a thing that expands into other things), and
+\\(\\alpha\\) is a sequence of nonterminals and terminals (a terminal is a thing that doesn't
+expand into other things; for us, this is a token).
+
+Let's write a context free grammar (CFG from now on) to match our parenthesis language:
+
+$$
+\\begin{align}
+S & \\rightarrow ( S ) \\\\\\
+S & \\rightarrow ()
+\\end{align}
+$$
+
+So, how does this work? We start with a "start symbol" nonterminal, which we usually denote as \\(S\\). Then, to get a desired string,
+we replace a nonterminal with the sequence of terminals and nonterminals on the right of one of its rules. For instance, to get `()`,
+we start with \\(S\\) and replace it with the body of the second one of its rules. This gives us `()` right away. To get `((()))`, we
+have to do a little more work:
+
+$$
+S \\rightarrow (S) \\rightarrow ((S)) \\rightarrow ((()))
+$$
+
+In practice, there are many ways of using a CFG to parse a programming language. Various parsing algorithms support various subsets
+of context free languages. For instance, top-down parsers follow almost exactly the derivation process we just walked through. They try to parse
+a nonterminal by trying to match each symbol in its body. In the rule \\(S \\rightarrow \\alpha \\beta \\gamma\\), a top-down parser will
+first try to match \\(\\alpha\\), then \\(\\beta\\), and so on. If one of the three is a nonterminal, it will attempt to parse
+that nonterminal following the same strategy. However, this approach has a flaw - for instance, consider the grammar
+$$
+\\begin{align}
+S & \\rightarrow Sa \\\\\\
+S & \\rightarrow a
+\\end{align}
+$$
+A top-down parser will start with \\(S\\). It will then try the first rule, which starts with \\(S\\). So, dutifully, it will
+try to parse __that__ \\(S\\). And to do that, it will once again try the first rule, and find that it starts with another \\(S\\)...
+This will never end, and the parser will get stuck.
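+
+To see the problem concretely, here's a sketch of what a naive hand-written top-down parser
+for this grammar might look like. This is purely illustrative (the function `parse_S` is
+invented for this example, and none of this code goes into our compiler):
+
+```
+#include <cstddef>
+#include <string>
+
+// A naive top-down parser for the left-recursive grammar S -> Sa | a.
+// Calling it never terminates: to match "S a" it must first match S,
+// which calls parse_S again before consuming a single character,
+// so the recursion never bottoms out (in practice, the stack overflows).
+bool parse_S(const std::string& input, std::size_t& pos) {
+    std::size_t saved = pos;
+    // First alternative: S -> S a
+    if (parse_S(input, pos) && pos < input.size() && input[pos] == 'a') {
+        pos++;
+        return true;
+    }
+    pos = saved;
+    // Second alternative: S -> a (never reached)
+    if (pos < input.size() && input[pos] == 'a') {
+        pos++;
+        return true;
+    }
+    return false;
+}
+```
+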
+A grammar in which a nonterminal can appear at the beginning of one of its own rules is called
+__left recursive__, and top-down parsers aren't able to handle such grammars.
+
+We __could__ rewrite our grammar without using left-recursion, but we don't want to. Instead, we'll use a __bottom up__ parser,
+specifically one using the LALR(1) parsing algorithm. Here's an example of how it works, using our left-recursive grammar. We start with our
+goal string, and a "dot" indicating where we are. At first, the dot is behind all the characters:
+$$
+.aaa
+$$
+We see nothing interesting on the left side of the dot, so we move (or __shift__) the dot forward by one character:
+$$
+a.aa
+$$
+Now, on the left side of the dot, we see something! In particular, we see the body of one of the rules for \\(S\\) (the second one).
+So we __reduce__ the thing on the left side of the dot, by replacing it with the left hand side of the rule (\\(S\\)):
+$$
+S.aa
+$$
+There's nothing else we can do with the left side, so we shift again:
+$$
+Sa.a
+$$
+Great, we see another rule body on the left of the dot. We reduce it:
+$$
+S.a
+$$
+Just like before, we shift over the dot, and again, we reduce. We end up with our
+start symbol, and nothing on the right of the dot, so we're done!
+
+### The Practice
+In practice, we don't want to just match a grammar. That would be like saying "yup, this is our language".
+Instead, we want to create something called an __abstract syntax tree__, or AST for short. This tree
+captures the structure of our language, and is easier to work with than its textual representation. The structure
+of the tree we build will often mimic the structure of our grammar: a rule in the form \\(S \\rightarrow A B\\)
+will result in a tree named "S", with two children corresponding to the trees built for \\(A\\) and \\(B\\). Since
+an AST captures the structure of the language, we'll be able to toss away some punctuation
+like `,` and `(`. These tokens will appear in our grammar, but we will tweak our parser to simply throw them away. Additionally,
+we will write our grammar ignoring whitespace, since our tokenizer conveniently throws that into the trash.
+
+The grammar for arithmetic actually requires more effort than it would appear at first. We want to make sure that our
+parser respects the order of operations. This way, when we have our tree, it will immediately have the structure in
+which multiplication is done before addition. We do this by creating separate "levels" in our grammar, with one
+nonterminal matching addition and subtraction, and another nonterminal matching multiplication and division.
+We want an operation of lower precedence to be __higher__ in our tree than one of higher precedence.
+For instance, for `3+2*6`, we want our tree to have `+` as the root, `3` as the left child, and the tree for `2*6` as the right child.
+Why? Because this tree represents "the addition of 3 and the result of multiplying 2 by 6". If we had `*` be the root, we'd have
+a tree representing "the multiplication of the result of adding 3 to 2 and 6", which is __not__ what our expression means.
+
+So, with this in mind, we want our rule for __addition__ (represented with the nonterminal \\(A\_{add}\\)) to be matched first, and
+for its children to be trees created by the multiplication rule, \\(A\_{mult}\\).
+So we write the following rules:
+$$
+\\begin{align}
+A\_{add} & \\rightarrow A\_{add}+A\_{mult} \\\\\\
+A\_{add} & \\rightarrow A\_{add}-A\_{mult} \\\\\\
+A\_{add} & \\rightarrow A\_{mult}
+\\end{align}
+$$
+The first rule matches an addition expression, plus a multiplication expression. We use addition in the body
+because we want to be able to parse strings like `1+2+3+4`, which we want to view as `((1+2)+3)+4` (mostly because
+subtraction is [left-associative](https://en.wikipedia.org/wiki/Operator_associativity)). So, we want the top level
+of the tree to be the rightmost `+` or `-`, since that means it will be the "last" operation. You may be asking,
+
+> You define addition in terms of addition; how will parsing end? What if there's no addition at all, like `2*6`?
+
+This is the purpose of the third rule, which serves to say "an addition expression can just be a multiplication,
+without any pluses or minuses." Our rules for multiplication are very similar:
+$$
+\\begin{align}
+A\_{mult} & \\rightarrow A\_{mult}*P \\\\\\
+A\_{mult} & \\rightarrow A\_{mult}/P \\\\\\
+A\_{mult} & \\rightarrow P
+\\end{align}
+$$
+
+\\(P\\), in this case, is an a__pp__lication (remember, application has higher precedence than any binary operator).
+Once again, if there's no `*` or `/`, we simply fall through to a \\(P\\) nonterminal, representing application.
+
+Application is refreshingly simple:
+$$
+\\begin{align}
+P & \\rightarrow P B \\\\\\
+P & \\rightarrow B
+\\end{align}
+$$
+An application is either only one "thing" (represented with \\(B\\), for __b__ase), such as a number or an identifier,
+or another application followed by a thing.
+
+We now need to define what a "thing" is. As we said before, it's a number, or an identifier. We also make a parenthesized
+arithmetic expression a "thing", allowing us to wrap right back around and recognize anything inside parentheses:
+$$
+\\begin{align}
+B & \\rightarrow \\text{num} \\\\\\
+B & \\rightarrow \\text{lowerVar} \\\\\\
+B & \\rightarrow \\text{upperVar} \\\\\\
+B & \\rightarrow ( A\_{add} ) \\\\\\
+B & \\rightarrow C
+\\end{align}
+$$
+What's the last \\(C\\)? We also want a "thing" to be a case expression. Here are the rules for that:
+$$
+\\begin{align}
+C & \\rightarrow \\text{case} \\; A\_{add} \\; \\text{of} \\; \\{ L\_B \\} \\\\\\
+L\_B & \\rightarrow R \\; , \\; L\_B \\\\\\
+L\_B & \\rightarrow R \\\\\\
+R & \\rightarrow N \\; \\text{arrow} \\; \\{ A\_{add} \\} \\\\\\
+N & \\rightarrow \\text{lowerVar} \\\\\\
+N & \\rightarrow \\text{upperVar} \\; L\_L \\\\\\
+L\_L & \\rightarrow \\text{lowerVar} \\; L\_L \\\\\\
+L\_L & \\rightarrow \\epsilon
+\\end{align}
+$$
+\\(L\_B\\) is the list of branches in our case expression. \\(R\\) is a single branch, which is in the
+form `Pattern -> Expression`. \\(N\\) is a pattern, which we will for now define to be either a variable name
+(\\(\\text{lowerVar}\\)), or a constructor with some arguments. The arguments of a constructor will be
+lowercase names, and a list of those arguments will be represented with \\(L\_L\\). One of the bodies
+of this nonterminal is just the symbol \\(\\epsilon\\), which simply means "nothing".
+We use this because a constructor can have no arguments (like `Nil`).
+
+We can use these grammar rules to represent any expression we want. For instance, let's try `3+(multiply 2 6)`,
+where `multiply` is a function that, well, multiplies.
+We start with \\(A\_{add}\\):
+$$
+\\begin{align}
+& A\_{add} \\\\\\
+& \\rightarrow A\_{add} + A\_{mult} \\\\\\
+& \\rightarrow A\_{mult} + A\_{mult} \\\\\\
+& \\rightarrow P + A\_{mult} \\\\\\
+& \\rightarrow B + A\_{mult} \\\\\\
+& \\rightarrow \\text{num(3)} + A\_{mult} \\\\\\
+& \\rightarrow \\text{num(3)} + P \\\\\\
+& \\rightarrow \\text{num(3)} + B \\\\\\
+& \\rightarrow \\text{num(3)} + (A\_{add}) \\\\\\
+& \\rightarrow \\text{num(3)} + (A\_{mult}) \\\\\\
+& \\rightarrow \\text{num(3)} + (P) \\\\\\
+& \\rightarrow \\text{num(3)} + (P \\; \\text{num(6)}) \\\\\\
+& \\rightarrow \\text{num(3)} + (P \\; \\text{num(2)} \\; \\text{num(6)}) \\\\\\
+& \\rightarrow \\text{num(3)} + (\\text{lowerVar(multiply)} \\; \\text{num(2)} \\; \\text{num(6)})
+\\end{align}
+$$
+
+We're almost there. We now want a rule for "something that can appear at the top level of a program", like
+a function or data type declaration. We make a new set of rules:
+$$
+\\begin{align}
+T & \\rightarrow \\text{defn} \\; \\text{lowerVar} \\; L\_P = \\{ A\_{add} \\} \\\\\\
+T & \\rightarrow \\text{data} \\; \\text{upperVar} = \\{ L\_D \\} \\\\\\
+L\_D & \\rightarrow D \\; , \\; L\_D \\\\\\
+L\_D & \\rightarrow D \\\\\\
+L\_P & \\rightarrow \\text{lowerVar} \\; L\_P \\\\\\
+L\_P & \\rightarrow \\epsilon \\\\\\
+D & \\rightarrow \\text{upperVar} \\; L\_U \\\\\\
+L\_U & \\rightarrow \\text{upperVar} \\; L\_U \\\\\\
+L\_U & \\rightarrow \\epsilon
+\\end{align}
+$$
+That's a lot of rules! \\(T\\) is the "top-level declaration" rule. It matches either
+a function or a data definition. A function definition consists of the keyword "defn",
+followed by a function name (starting with a lowercase letter), followed by a list of
+parameters, represented by \\(L\_P\\), and finally an equals sign and the function body
+wrapped in curly braces.
+
+A data type definition consists of the keyword "data", followed by the name of the data type
+(starting with an uppercase letter), an equals sign,
+and a list \\(L\_D\\) of data constructors \\(D\\) in curly braces. There must be at least one data constructor in this list,
+so we don't use the empty string rule here. A data constructor is simply an uppercase variable representing
+a constructor of the data type, followed by a list \\(L\_U\\) of zero or more uppercase variables (representing
+the types of the arguments of the constructor).
+
+Finally, we want one or more of these declarations in a valid program:
+$$
+\\begin{align}
+G & \\rightarrow T \\; G \\\\\\
+G & \\rightarrow T
+\\end{align}
+$$
+
+Just like with tokenizing, there exists a piece of software that will generate a bottom-up parser for us, given our grammar.
+It's called Bison, and it is frequently used with Flex. Before we get to Bison, though, we need to pay a debt we've already
+incurred - the implementation of our AST. Such a tree is language-specific, so Bison doesn't generate it for us. Here's what
+I came up with:
+{{< codeblock "C++" "compiler_ast.hpp" >}}
+
+Finally, we get to writing our Bison file, `parser.y`. Here's what I came up with:
+{{< rawblock "compiler_parser.y" >}}
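+
+As a quick sanity check of the AST design before wiring it into Bison, here is a small
+hand-written sketch that builds the tree for `3+2*6` using the structures above. Note that it
+assumes a hypothetical leaf node for integer literals (called `ast_int` here), which the draft
+header above doesn't define yet:
+
+```
+#include <iostream>
+#include <utility>
+#include "ast.hpp"
+
+// Hypothetical leaf node for integer literals, for illustration only;
+// the real compiler will need something like it.
+struct ast_int : public ast {
+    int value;
+
+    ast_int(int v) : value(v) {}
+};
+
+int main() {
+    // 2*6 sits lower in the tree, since multiplication binds tighter...
+    ast_ptr product(new ast_binop(TIMES,
+        ast_ptr(new ast_int(2)), ast_ptr(new ast_int(6))));
+    // ...and the + ends up at the root.
+    ast_ptr root(new ast_binop(PLUS,
+        ast_ptr(new ast_int(3)), std::move(product)));
+    std::cout << "built an AST for 3+2*6" << std::endl;
+}
+```
+
+Once we add grammar rules and semantic actions to `parser.y`, Bison will be building trees
+like this one for us.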