Add the drafts of the two posts
This commit is contained in:
parent
708f9bebfa
commit
f42cb900cf
76
code/compiler_ast.hpp
Normal file
76
code/compiler_ast.hpp
Normal file
@ -0,0 +1,76 @@
|
||||
#pragma once
|
||||
#include <memory>
#include <string>
#include <utility>
#include <vector>
|
||||
|
||||
/**
 * Common base class of every abstract syntax tree node.
 *
 * Nodes are held through pointers to this base (see ast_ptr), so the
 * destructor is virtual to make deletion through a base pointer destroy
 * the derived node correctly.
 */
struct ast {
    // Defaulted inline: a declaration without a definition would leave the
    // destructor unresolved at link time for every node type in this header.
    virtual ~ast() = default;
};
|
||||
|
||||
using ast_ptr = std::unique_ptr<ast>;
|
||||
|
||||
/**
 * Common base class of the patterns that can appear in a case branch.
 *
 * Patterns are held through pointers to this base (see pattern_ptr), so the
 * destructor is virtual.
 */
struct pattern {
    // Defaulted inline so the destructor always has a definition; a bare
    // declaration would be an unresolved symbol when a pattern is destroyed.
    virtual ~pattern() = default;
};
|
||||
|
||||
/**
 * A pattern that consists of a single variable name.
 */
struct pattern_var : public pattern {
    /** The variable name, copied from the constructor argument. */
    std::string var;

    /**
     * @param v NUL-terminated name; its characters are copied into `var`.
     */
    pattern_var(const char* v) {
        var.assign(v);
    }
};
|
||||
|
||||
/**
 * A pattern made of a constructor name followed by parameter names.
 */
struct pattern_constr : public pattern {
    /** Name of the constructor. */
    std::string constr;
    /** Names of the constructor's parameters, in the order given. */
    std::vector<std::string> params;

    /**
     * @param c NUL-terminated constructor name; copied into `constr`.
     * @param p Parameter names; the vector's contents are moved into
     *          `params`, leaving the caller's vector empty.
     */
    pattern_constr(const char* c, std::vector<std::string>&& p)
        // Move-construct directly instead of default-constructing `params`
        // and swapping afterwards.
        : constr(c), params(std::move(p)) {}
};
|
||||
|
||||
using pattern_ptr = std::unique_ptr<pattern>;
|
||||
|
||||
struct branch {
|
||||
pattern_ptr pat;
|
||||
ast_ptr expr;
|
||||
|
||||
branch(pattern_ptr&& p, ast_ptr&& a)
|
||||
: pat(std::move(p)), expr(std::move(a)) {}
|
||||
};
|
||||
|
||||
using branch_ptr = std::unique_ptr<branch>;
|
||||
|
||||
/**
 * The binary arithmetic operators an ast_binop node can carry.
 *
 * NOTE(review): these unscoped names are the same as the PLUS / MINUS /
 * TIMES / DIVIDE %token names in the parser draft, which also includes this
 * header - confirm the generated token names do not clash with them.
 */
enum binop {
    PLUS = 0,
    MINUS = 1,
    TIMES = 2,
    DIVIDE = 3
};
|
||||
|
||||
struct ast_binop : public ast {
|
||||
binop op;
|
||||
ast_ptr left;
|
||||
ast_ptr right;
|
||||
|
||||
ast_binop(binop o, ast_ptr&& l, ast_ptr&& r)
|
||||
: op(o), left(std::move(l)), right(std::move(r)) {}
|
||||
};
|
||||
|
||||
struct ast_app : public ast {
|
||||
ast_ptr left;
|
||||
ast_ptr right;
|
||||
|
||||
ast_app(ast* l, ast* r)
|
||||
: left(l), right(r) {}
|
||||
};
|
||||
|
||||
struct ast_case : public ast {
|
||||
ast_ptr of;
|
||||
std::vector<branch_ptr> branches;
|
||||
|
||||
ast_case(ast_ptr&& o, std::vector<branch_ptr>&& b)
|
||||
: of(std::move(o)) {
|
||||
std::swap(branches, b);
|
||||
}
|
||||
};
|
26
code/compiler_parser.y
Normal file
26
code/compiler_parser.y
Normal file
@ -0,0 +1,26 @@
|
||||
%{
/* Prologue: headers needed by the code Bison generates from this file. */
#include <string>
#include <iostream>
#include "ast.hpp"
#include "parser.hpp"
%}

/* Arithmetic operators. */
%token PLUS
%token TIMES
%token MINUS
%token DIVIDE
/* Integer literal. */
%token INT
/* Reserved keywords. */
%token DEFN
%token DATA
%token CASE
%token OF
/* Punctuation. */
%token OCURLY
%token CCURLY
%token OPAREN
%token CPAREN
%token COMMA
%token ARROW
/* Was misspelled EQUA; the scanner reports this token as "EQUAL". */
%token EQUAL
/* Lowercase and uppercase identifiers. */
%token LID
%token UID
|
||||
|
33
code/compiler_scanner.l
Normal file
33
code/compiler_scanner.l
Normal file
@ -0,0 +1,33 @@
|
||||
%option noyywrap
|
||||
|
||||
%{
|
||||
#include <iostream>
|
||||
%}
|
||||
|
||||
%%
|
||||
|
||||
[ \t\r\n]+ { /* Skip whitespace. Tabs and carriage returns are included so they are not echoed to stdout by flex's default rule. */ }
\+ { std::cout << "PLUS" << std::endl; }
\* { std::cout << "TIMES" << std::endl; }
- { std::cout << "MINUS" << std::endl; }
\/ { std::cout << "DIVIDE" << std::endl; }
[0-9]+ { std::cout << "NUMBER: " << yytext << std::endl; }
defn { std::cout << "KEYWORD: defn" << std::endl; }
data { std::cout << "KEYWORD: data" << std::endl; }
case { std::cout << "KEYWORD: case" << std::endl; }
of { std::cout << "KEYWORD: of" << std::endl; }
\{ { std::cout << "OPEN CURLY" << std::endl; }
\} { std::cout << "CLOSED CURLY" << std::endl; }
\( { std::cout << "OPEN PARENTH" << std::endl; }
\) { std::cout << "CLOSE PARENTH" << std::endl; }
, { std::cout << "COMMA" << std::endl; }
-> { std::cout << "PATTERN ARROW" << std::endl; }
= { std::cout << "EQUAL" << std::endl; }
[a-z][a-zA-Z]* { /* Keyword rules are listed earlier, so they win when both match the same text. */ std::cout << "LOWERCASE IDENTIFIER: " << yytext << std::endl; }
[A-Z][a-zA-Z]* { std::cout << "UPPERCASE IDENTIFIER: " << yytext << std::endl; }
|
||||
|
||||
%%
|
||||
|
||||
// Entry point of the standalone scanner: runs the flex-generated yylex()
// once, which executes the rule actions above on the input.
int main() {
    yylex();
    return 0;
}
|
243
content/blog/01_compiler_tokenizing.md
Normal file
243
content/blog/01_compiler_tokenizing.md
Normal file
@ -0,0 +1,243 @@
|
||||
---
|
||||
title: Compiling a Functional Language Using C++, Part 1 - Tokenizing
|
||||
date: 2019-08-03T01:02:30-07:00
|
||||
tags: ["C and C++", "Functional Languages", "Compilers"]
|
||||
draft: true
|
||||
---
|
||||
During my last academic term, I was enrolled in a compilers course.
|
||||
We had a final project - develop a compiler for a basic Python subset,
|
||||
using LLVM. It was a little boring - virtually nothing about the compiler
|
||||
was __not__ covered in class, and it felt more like putting two puzzle
|
||||
pieces together than building a real project.
|
||||
|
||||
Being involved in the Programming Language Theory (PLT) research group at my
|
||||
university, I decided to do something different for the final project -
|
||||
a compiler for a functional language. In a series of posts, starting with
|
||||
this one, I will explain what I did so that those interested in the subject
|
||||
are able to replicate my steps, and maybe learn something for themselves.
|
||||
|
||||
### The "classic" stages of a compiler
|
||||
Let's take a look at the high level overview of what a compiler does.
|
||||
Conceptually, the components of a compiler are pretty cleanly separated.
|
||||
They are as follows:
|
||||
|
||||
1. Tokenizing / lexical analysis
|
||||
2. Parsing
|
||||
3. Analysis / optimization
|
||||
4. Code Generation
|
||||
|
||||
There are many variations on this structure. Some compilers don't optimize
|
||||
at all, some translate the program text into an intermediate representation,
|
||||
an alternative way of representing the program that isn't machine code.
|
||||
In some compilers, the stages of parsing and analysis can overlap.
|
||||
In short, just like the pirate's code, it's more of a guideline than a rule.
|
||||
|
||||
### Tokenizing and Parsing (the "boring stuff")
|
||||
It makes sense to build a compiler bit by bit, following the stages we outlined above.
|
||||
This is because these stages are essentially a pipeline, with program text
|
||||
coming in one end, and the final program coming out of the other. So as we build
|
||||
up our pipeline, we'll be able to push program text further and further, until
|
||||
eventually we get something that we can run on our machine.
|
||||
|
||||
This is how most tutorials go about building a compiler, too. The result is that
|
||||
there are a __lot__ of tutorials covering tokenizing and parsing. This is why
|
||||
I refer to this part of the process as "boring". Nonetheless, I will cover the steps
|
||||
required to tokenize and parse our little functional language. But before we do that,
|
||||
we first need to have an idea of what our language looks like.
|
||||
|
||||
### The Grammar
|
||||
Simon Peyton Jones, in his two works regarding compiling functional languages, remarks
|
||||
that most functional languages are very similar, and vary largely in syntax. That's
|
||||
our main degree of freedom. We want to represent the following things, for sure:
|
||||
|
||||
* Defining functions
|
||||
* Applying functions
|
||||
* Arithmetic
|
||||
* Algebraic data types (to represent lists, pairs, and the like)
|
||||
* Pattern matching (to operate on data types)
|
||||
|
||||
We can additionally support anonymous (lambda) functions, but compiling those
|
||||
is actually a bit trickier, so we will skip those for now. Arithmetic is the simplest to
|
||||
define - let's define it as we would expect: `3` is a number, `3+2*6` evaluates to 15.
|
||||
Function application isn't much more difficult - `f x` means "apply f to x", and
|
||||
`f x + g x` means sum the result of applying f to x and g to x. That is, function
|
||||
application has higher precedence, or __binds tighter__ than binary operators like plus.
|
||||
|
||||
Next, let's define the syntax for declaring a function. Why not:
|
||||
```
|
||||
defn f x = { x + x }
|
||||
```
|
||||
|
||||
As for declaring data types:
|
||||
```
|
||||
data List = { Nil, Cons Int List }
|
||||
```
|
||||
Notice that we are avoiding polymorphism here.
|
||||
|
||||
Let's also define a syntax for pattern matching:
|
||||
```
|
||||
case l of {
|
||||
Nil -> { 0 }
|
||||
Cons x xs -> { x }
|
||||
}
|
||||
```
|
||||
The above means "if the list `l` is `Nil`, then return 0, otherwise, if it's
|
||||
constructed from an integer and another list (as defined in our `data` example),
|
||||
return the integer".
|
||||
|
||||
That's it for now! Let's take a look at tokenizing.
|
||||
|
||||
### Tokenizing
|
||||
When we first get our program text, it's in a representation difficult for us to make
|
||||
sense of. If we look at how it's represented in C++, we see that it's just an array
|
||||
of characters (potentially hundreds, thousands, or millions in length). We __could__
|
||||
jump straight to parsing the text (which involves creating a tree structure, known
|
||||
as an __abstract syntax tree__; more on that later). There's nothing wrong with this approach -
|
||||
in fact, in functional languages, tokenizing is frequently skipped. However,
|
||||
in our closer-to-metal language, it happens to be more convenient to first break the
|
||||
input text into a bunch of distinct segments (tokens).
|
||||
|
||||
For example, consider the string "320+6". If we skip tokenizing and go straight
|
||||
into parsing, we'd feed our parser the sequence of characters `['3', '2', '0', '+', '6', '\0']`.
|
||||
On the other hand, if we run a tokenizing step on the string first, we'd be feeding our
|
||||
parser three tokens, `("320", NUMBER)`, `("+", OPERATOR)`, and `("6", NUMBER)`.
|
||||
To us, this is a bit more clear - we've partitioned the string into logical segments.
|
||||
Our parser, then, won't have to care about recognizing a number - it will just know
|
||||
that a number is next in the string, and do with that information what it needs.
|
||||
|
||||
How do we go about breaking up a string into tokens? We need to come up with a
|
||||
way to compare some characters in a string against a set of rules. But "rules"
|
||||
is a very general term - we could, for instance, define a particular
|
||||
token that is a fibonacci number - 1, 2, 3, 5, and so on would be marked
|
||||
as a "fibonacci number", while the other numbers will be marked as just
|
||||
a regular number. To support that, our rules would get pretty complex. And
|
||||
equally complex will become our checking of these rules for particular strings.
|
||||
|
||||
Fortunately, we're not insane. We observe that the rules for tokens in practice
|
||||
are fairly simple - one or more digits is an integer, a few letters together
|
||||
are a variable name. In order to be able to efficiently break text up into
|
||||
such tokens, we restrict ourselves to __regular languages__. A language
|
||||
is defined as a set of strings (potentially infinite), and a regular
|
||||
language is one for which we can write a __regular expression__ to check if
|
||||
a string is in the set. Regular expressions are a way of representing
|
||||
patterns that a string has to match. We define regular expressions
|
||||
as follows:
|
||||
|
||||
* Any character is a regular expression that matches that character. Thus,
|
||||
\\(a\\) is a regular expression (from now shortened to regex) that matches
|
||||
the character 'a', and nothing else.
|
||||
* \\(r_1r_2\\), or the concatenation of \\(r_1\\) and \\(r_2\\), is
|
||||
a regular expression that matches anything matched by \\(r_1\\), followed
|
||||
by anything that matches \\(r_2\\). For instance, \\(ab\\), matches
|
||||
the character 'a' followed by the character 'b' (thus matching "ab").
|
||||
* \\(r_1|r_2\\) matches anything that is either matched by \\(r_1\\) or
|
||||
\\(r_2\\). Thus, \\(a|b\\) matches the character 'a' or the character 'b'.
|
||||
* \\(r_1?\\) matches either an empty string, or anything matched by \\(r_1\\).
|
||||
* \\(r_1+\\) matches one or more things matched by \\(r_1\\). So,
|
||||
\\(a+\\) matches "a", "aa", "aaa", and so on.
|
||||
* \\((r_1)\\) matches anything that matches \\(r_1\\). This is mostly used
|
||||
to group things together in more complicated expressions.
|
||||
* \\(.\\) matches any character.
|
||||
|
||||
More powerful variations of regex also include an "any of" operator, \\([c_1c_2c_3]\\),
|
||||
which is equivalent to \\(c_1|c_2|c_3\\), and a "range" operator, \\([c_1-c_n]\\), which
|
||||
matches all characters in the range between \\(c_1\\) and \\(c_n\\), inclusive.
|
||||
|
||||
Let's see some examples. An integer, such as 326, can be represented with \\([0-9]+\\).
|
||||
This means, one or more characters between 0 and 9. Some (most) regex implementations
|
||||
have a special symbol for \\([0-9]\\), written as \\(\\setminus d\\). A variable,
|
||||
starting with a lowercase letter and containing lowercase or uppercase letters after it,
|
||||
can be written as \\(\[a-z\]([a-zA-Z]+)?\\). Again, most regex implementations provide
|
||||
a special operator for \\((r_1+)?\\), written as \\(r_1*\\).
|
||||
|
||||
#### The Theory
|
||||
So how does one go about checking if a regular expression matches a string? An efficient way is to
|
||||
first construct a [state machine](https://en.wikipedia.org/wiki/Finite-state_machine). A type of state machine can be constructed from a regular expression
|
||||
by literally translating each part of it to a series of states, one-to-one. This machine is called
|
||||
a __Nondeterministic Finite Automaton__, or NFA for short. The "Finite" means that the number of
|
||||
states in the state machine is, well, finite. For us, this means that we can store such
|
||||
a machine on disk. The "Nondeterministic" part, though, is more complex: given a particular character
|
||||
and a particular state, it's possible that an NFA has the option of transitioning into more
|
||||
than one other state. Well, which state __should__ it pick? No easy way to tell. Each time
|
||||
we can transition to more than one state, we exponentially increase the number of possible
|
||||
states that we can be in. This isn't good - we were going for efficiency, remember?
|
||||
|
||||
What we can do is convert our NFA into another kind of state machine, in which for every character,
|
||||
only one possible state transition is possible. This machine is called a __Deterministic Finite Automaton__,
|
||||
or DFA for short. There's an algorithm to convert an NFA into a DFA, which I won't explain here.
|
||||
|
||||
Since both the conversion of a regex into an NFA and the conversion of an NFA into a DFA are done
|
||||
by following an algorithm, we're always going to get the same DFA for the same regex we put in.
|
||||
If we come up with the rules for our tokens once, we don't want to be building a DFA each time
|
||||
our tokenizer is run - the result will always be the same! Even worse, translating a regular
|
||||
expression all the way into a DFA is the inefficient part of the whole process. The solution is to
|
||||
generate a state machine, and convert it into code to simulate that state machine. Then, we include
|
||||
that code as part of our compiler. This way, we have a state machine "hardcoded" into our tokenizer,
|
||||
and no conversion of regex to DFAs needs to be done at runtime.
|
||||
|
||||
#### The Practice
|
||||
Creating an NFA, and then a DFA, and then generating C++ code are all cumbersome. If we had to
|
||||
write code to do this every time we made a compiler, it would get very repetitive, very fast.
|
||||
Fortunately, there exists a tool that does exactly this for us - it's called `flex`. Flex
|
||||
takes regular expressions, and generates code that matches a string against those regular expressions.
|
||||
It does one more thing in addition to that - for each regular expression it matches, flex
|
||||
runs a user-defined action (which we write in C++). We can use this to convert strings that
|
||||
represent numbers directly into numbers, and do other small tasks.
|
||||
|
||||
So, what tokens do we have? From our arithmetic definition, we see that we have integers.
|
||||
Let's use the regex `[0-9]+` for those. We also have the operators `+`, `-`, `*`, and `/`.
|
||||
`-` is simple enough: the corresponding regex is `-`. We need to
|
||||
preface our `/`, `+` and `*` with a backslash, though, since they happen to also be modifiers
|
||||
in flex's regular expressions: `\/`, `\+`, `\*`.
|
||||
|
||||
Let's also represent some reserved keywords. We'll say that `defn`, `data`, `case`, and `of`
|
||||
are reserved. Their regular expressions are just their names. We also want to tokenize
|
||||
`=`, `->`, `{`, `}`, `,`, `(` and `)`. Finally, we want to represent identifiers, like `f`,
|
||||
`x`, `Nil`, and `Cons`. We will actually make a distinction between lowercase identifiers
|
||||
and uppercase identifiers, as we will follow Haskell's convention of representing
|
||||
data type constructors with uppercase letters, and functions and variables with lowercase ones.
|
||||
So, our two regular expressions will be `[a-z][a-zA-Z]*` for the lowercase variables, and
|
||||
`[A-Z][a-zA-Z]*` for uppercase variables. Let's make a tokenizer in flex with all this. To do
|
||||
this, we create a new file, `scanner.l`, in which we write a mix of regular expressions
|
||||
and C++ code. Here's the whole thing:
|
||||
|
||||
{{< rawblock "compiler_scanner.l" >}}
|
||||
|
||||
A flex file starts with options. I set the `noyywrap` option, which disables a particular
|
||||
feature of flex that we won't use, and which causes linker errors. Next up,
|
||||
flex allows us to put some C++ code that we want at the top of our generated code.
|
||||
I simply include `iostream`, so that we can use `cout` to print out our tokens.
|
||||
Next, `%%`, and after that, the meat of our tokenizer: regular expressions, followed by
|
||||
C++ code that should be executed when the regular expression is matched.
|
||||
|
||||
The first token: whitespace. This includes the space character,
|
||||
and the newline character. We ignore it, so its rule is empty. After that,
|
||||
we have the regular expressions for the tokens we've talked about. For each, I just
|
||||
print a description of the token that matched. This will change when we hook this up to
|
||||
a parser, but for now, this works fine. Notice that the variable `yytext` contains
|
||||
the string matched by our regular expression. This variable is set by the code flex
|
||||
generates, and we can use it to get the exact text that matched a regex. This is
|
||||
useful, for instance, to print the variable name that we matched. After
|
||||
all of our tokens, another `%%`, and more C++ code. For this simple example,
|
||||
I declare a `main` function, which just calls `yylex`, a function flex
|
||||
generates for us. Let's generate the C++ code, and compile it:
|
||||
|
||||
```
|
||||
flex -o scanner.cpp scanner.l
|
||||
g++ -o scanner scanner.cpp
|
||||
```
|
||||
|
||||
Now, let's feed it an expression:
|
||||
```
|
||||
echo "3+2*6" | ./scanner
|
||||
```
|
||||
|
||||
We get the output:
|
||||
```
|
||||
NUMBER: 3
|
||||
PLUS
|
||||
NUMBER: 2
|
||||
TIMES
|
||||
NUMBER: 6
|
||||
```
|
||||
Hooray! We have tokenizing.
|
231
content/blog/02_compiler_parsing.md
Normal file
231
content/blog/02_compiler_parsing.md
Normal file
@ -0,0 +1,231 @@
|
||||
---
|
||||
title: Compiling a Functional Language Using C++, Part 2 - Parsing
|
||||
date: 2019-08-03T01:02:30-07:00
|
||||
tags: ["C and C++", "Functional Languages", "Compilers"]
|
||||
draft: true
|
||||
---
|
||||
In the previous post, we covered tokenizing. We learned how to convert an input string into logical segments, and even wrote up a tokenizer to do it according to the rules of our language. Now, it's time to make sense of the tokens, and parse our language.
|
||||
|
||||
### The Theory
|
||||
The rules to parse a language are more complicated than the rules for
|
||||
recognizing tokens. For instance, consider a simple language of a matching
|
||||
number of open and closed parentheses, like `()` and `((()))`. You can't
|
||||
write a regular expression for it! We resort to a wider class of languages, called
|
||||
__context free languages__. These languages are ones that are matched by __context free grammars__.
|
||||
A context free grammar is a list of rules in the form of \\(S \\rightarrow \\alpha\\), where
|
||||
\\(S\\) is a __nonterminal__ (conceptually, a thing that expands into other things), and
|
||||
\\(\\alpha\\) is a sequence of nonterminals and terminals (a terminal is a thing that doesn't
|
||||
expand into other things; for us, this is a token).
|
||||
|
||||
Let's write a context free grammar (CFG from now on) to match our parenthesis language:
|
||||
|
||||
$$
|
||||
\\begin{align}
|
||||
S & \\rightarrow ( S ) \\\\\\
|
||||
S & \\rightarrow ()
|
||||
\\end{align}
|
||||
$$
|
||||
|
||||
So, how does this work? We start with a "start symbol" nonterminal, which we usually denote as \\(S\\). Then, to get a desired string,
|
||||
we replace a nonterminal with the sequence of terminals and nonterminals on the right of one of its rules. For instance, to get `()`,
|
||||
we start with \\(S\\) and replace it with the body of the second one of its rules. This gives us `()` right away. To get `((()))`, we
|
||||
have to do a little more work:
|
||||
|
||||
$$
|
||||
S \\rightarrow (S) \\rightarrow ((S)) \\rightarrow ((()))
|
||||
$$
|
||||
|
||||
In practice, there are many ways of using a CFG to parse a programming language. Various parsing algorithms support various subsets
|
||||
of context free languages. For instance, top down parsers follow nearly exactly the structure that we had. They try to parse
|
||||
a nonterminal by trying to match each symbol in its body. In the rule \\(S \\rightarrow \\alpha \\beta \\gamma\\), it will
|
||||
first try to match \\(\\alpha\\), then \\(\\beta\\), and so on. If one of the three contains a nonterminal, it will attempt to parse
|
||||
that nonterminal following the same strategy. However, this leaves a flaw - For instance, consider the grammar
|
||||
$$
|
||||
\\begin{align}
|
||||
S & \\rightarrow Sa \\\\\\
|
||||
S & \\rightarrow a
|
||||
\\end{align}
|
||||
$$
|
||||
A top down parser will start with \\(S\\). It will then try the first rule, which starts with \\(S\\). So, dutifully, it will
|
||||
try to parse __that__ \\(S\\). And to do that, it will once again try the first rule, and find that it starts with another \\(S\\)...
|
||||
This will never end, and the parser will get stuck. A grammar in which a nonterminal can appear in the beginning of one of its rules
|
||||
is called __left recursive__, and top-down parsers aren't able to handle those grammars.
|
||||
|
||||
We __could__ rewrite our grammar without using left-recursion, but we don't want to. Instead, we'll use a __bottom up__ parser,
|
||||
using specifically the LALR(1) parsing algorithm. Here's an example of how it works, using our left-recursive grammar. We start with our
|
||||
goal string, and a "dot" indicating where we are. At first, the dot is to the left of all the characters:
|
||||
$$
|
||||
.aaa
|
||||
$$
|
||||
We see nothing interesting on the left side of the dot, so we move (or __shift__) the dot forward by one character:
|
||||
$$
|
||||
a.aa
|
||||
$$
|
||||
Now, on the left side of the dot, we see something! In particular, we see the body of one of the rules for \\(S\\) (the second one).
|
||||
So we __reduce__ the thing on the left side of the dot, by replacing it with the left hand side of the rule (\\(S\\)):
|
||||
$$
|
||||
S.aa
|
||||
$$
|
||||
There's nothing else we can do with the left side, so we shift again:
|
||||
$$
|
||||
Sa.a
|
||||
$$
|
||||
Great, we see another body on the left of the dot. We reduce it:
|
||||
$$
|
||||
S.a
|
||||
$$
|
||||
Just like before, we shift over the dot, and again, we reduce. We end up with our
|
||||
start symbol, and nothing on the right of the dot, so we're done!
|
||||
|
||||
### The Practice
|
||||
In practice, we don't want to just match a grammar. That would be like saying "yup, this is our language".
|
||||
Instead, we want to create something called an __abstract syntax tree__, or AST for short. This tree
|
||||
captures the structure of our language, and is easier to work with than its textual representation. The structure
|
||||
of the tree we build will often mimic the structure of our grammar: a rule in the form \\(S \\rightarrow A B\\)
|
||||
will result in a tree named "S", with two children corresponding to the trees built for A and B. Since
|
||||
an AST captures the structure of the language, we'll be able to toss away some punctuation
|
||||
like `,` and `(`. These tokens will appear in our grammar, but we will tweak our parser to simply throw them away. Additionally,
|
||||
we will write our grammar ignoring whitespace, since our tokenizer conveniently throws that into the trash.
|
||||
|
||||
The grammar for arithmetic actually involves more effort than it would appear at first. We want to make sure that our
|
||||
parser respects the order of operations. This way, when we have our tree, it will immediately have the structure in
|
||||
which multiplication is done before addition. We do this by creating separate "levels" in our grammar, with one
|
||||
nonterminal matching addition and subtraction, while another nonterminal matches multiplication and division.
|
||||
We want the operation that has the least precedence to be __higher__ in our tree than one of higher precedence.
|
||||
For instance, for `3+2*6`, we want our tree to have `+` as the root, `3` as the left child, and the tree for `2*6` as the right child.
|
||||
Why? Because this tree represents "the addition of 3 and the result of multiplying 2 by 6". If we had `*` be the root, we'd have
|
||||
a tree representing "the multiplication of the result of adding 3 to 2 and 6", which is __not__ what our expression means.
|
||||
|
||||
So, with this in mind, we want our rule for __addition__ (represented with the nonterminal \\(A\_{add}\\)) to be matched first, and
|
||||
for its children to be trees created by the multiplication rule, \\(A\_{mult}\\). So we write the following rules:
|
||||
$$
|
||||
\\begin{align}
|
||||
A\_{add} & \\rightarrow A\_{add}+A\_{mult} \\\\\\
|
||||
A\_{add} & \\rightarrow A\_{add}-A\_{mult} \\\\\\
|
||||
A\_{add} & \\rightarrow A\_{mult}
|
||||
\\end{align}
|
||||
$$
|
||||
The first rule matches another addition, added to the result of a multiplication. We use the addition in the body
|
||||
because we want to be able to parse strings like `1+2+3+4`, which we want to view as `((1+2)+3)+4` (mostly because
|
||||
subtraction is [left-associative](https://en.wikipedia.org/wiki/Operator_associativity)). So, we want the top level
|
||||
of the tree to be the rightmost `+` or `-`, since that means it will be the "last" operation. You may be asking,
|
||||
|
||||
> You define addition in terms of addition; how will parsing end? What if there's no addition at all, like `2*6`?
|
||||
|
||||
This is the purpose of the third rule, which serves to say "an addition expression can just be a multiplication,
|
||||
without any plusses or minuses." Our rules for multiplication are very similar:
|
||||
$$
|
||||
\\begin{align}
|
||||
A\_{mult} & \\rightarrow A\_{mult}*P \\\\\\
|
||||
A\_{mult} & \\rightarrow A\_{mult}/P \\\\\\
|
||||
A\_{mult} & \\rightarrow P
|
||||
\\end{align}
|
||||
$$
|
||||
|
||||
P, in this case, is an a__p__plication (remember, application has higher precedence than any binary operator).
|
||||
Once again, if there's no `*` or `/`, we simply fall through to a \\(P\\) nonterminal, representing application.
|
||||
|
||||
Application is refreshingly simple:
|
||||
$$
|
||||
\\begin{align}
|
||||
P & \\rightarrow P B \\\\\\
|
||||
P & \\rightarrow B
|
||||
\\end{align}
|
||||
$$
|
||||
An application is either only one "thing" (represented with \\(B\\), for __b__ase), such as a number or an identifier,
|
||||
or another application followed by a thing.
|
||||
|
||||
We now need to define what a "thing" is. As we said before, it's a number, or an identifier. We also make a parenthesized
|
||||
arithmetic expression a "thing", allowing us to wrap right back around and recognize anything inside parentheses:
|
||||
$$
|
||||
\\begin{align}
|
||||
B & \\rightarrow \text{num} \\\\\\
|
||||
B & \\rightarrow \text{lowerVar} \\\\\\
|
||||
B & \\rightarrow \text{upperVar} \\\\\\
|
||||
B & \\rightarrow ( A\_{add} ) \\\\\\
|
||||
B & \\rightarrow C
|
||||
\\end{align}
|
||||
$$
|
||||
What's the last \\(C\\)? We also want a "thing" to be a case expression. Here are the rules for that:
|
||||
$$
|
||||
\\begin{align}
|
||||
C & \\rightarrow \\text{case} \\; A\_{add} \\; \\text{of} \\; \\{ L\_B\\} \\\\\\
|
||||
L\_B & \\rightarrow R \\; , \\; L\_B \\\\\\
|
||||
L\_B & \\rightarrow R \\\\\\
|
||||
R & \\rightarrow N \\; \\text{arrow} \\; \\{ A\_{add} \\} \\\\\\
|
||||
N & \\rightarrow \\text{lowerVar} \\\\\\
|
||||
N & \\rightarrow \\text{upperVar} \\; L\_L \\\\\\
|
||||
L\_L & \\rightarrow \\text{lowerVar} \\; L\_L \\\\\\
|
||||
L\_L & \\rightarrow \\epsilon
|
||||
\\end{align}
|
||||
$$
|
||||
\\(L\_B\\) is the list of branches in our case expression. \\(R\\) is a single branch, which is in the
|
||||
form `Pattern -> Expression`. \\(N\\) is a pattern, which we will for now define to be either a variable name
|
||||
(\\(\\text{lowerVar}\\)), or a constructor with some arguments. The arguments of a constructor will be
|
||||
lowercase names, and a list of those arguments will be represented with \\(L\_L\\). One of the bodies
|
||||
of this nonterminal is just the character \\(\\epsilon\\), which just means "nothing".
|
||||
We use this because a constructor can have no arguments (like Nil).
|
||||
|
||||
We can use these grammar rules to represent any expression we want. For instance, let's try `3+(multiply 2 6)`,
|
||||
where multiply is a function that, well, multiplies. We start with \\(A\_{add}\\):
|
||||
$$
|
||||
\\begin{align}
|
||||
& A\_{add} \\\\\\
|
||||
& \\rightarrow A\_{add} + A\_{mult} \\\\\\
|
||||
& \\rightarrow A\_{mult} + A\_{mult} \\\\\\
|
||||
& \\rightarrow P + A\_{mult} \\\\\\
|
||||
& \\rightarrow B + A\_{mult} \\\\\\
|
||||
& \\rightarrow \\text{num(3)} + A\_{mult} \\\\\\
|
||||
& \\rightarrow \\text{num(3)} + P \\\\\\
|
||||
& \\rightarrow \\text{num(3)} + B \\\\\\
|
||||
& \\rightarrow \\text{num(3)} + (A\_{add}) \\\\\\
|
||||
& \\rightarrow \\text{num(3)} + (A\_{mult}) \\\\\\
|
||||
& \\rightarrow \\text{num(3)} + (P) \\\\\\
|
||||
& \\rightarrow \\text{num(3)} + (P \\; \\text{num(6)}) \\\\\\
|
||||
& \\rightarrow \\text{num(3)} + (P \\; \\text{num(2)} \\; \\text{num(6)}) \\\\\\
|
||||
& \\rightarrow \\text{num(3)} + (\\text{lowerVar(multiply)} \\; \\text{num(2)} \\; \\text{num(6)}) \\\\\\
|
||||
\\end{align}
|
||||
$$
|
||||
|
||||
We're almost there. We now want a rule for a "something that can appear at the top level of a program", like
|
||||
a function or data type declaration. We make a new set of rules:
|
||||
$$
|
||||
\\begin{align}
|
||||
T & \\rightarrow \\text{defn} \\; \\text{lowerVar} \\; L\_P =\\{ A\_{add} \\} \\\\\\
|
||||
T & \\rightarrow \\text{data} \\; \\text{upperVar} = \\{ L\_D \\} \\\\\\
|
||||
L\_D & \\rightarrow D \\; , \\; L\_D \\\\\\
|
||||
L\_D & \\rightarrow D \\\\\\
|
||||
L\_P & \\rightarrow \\text{lowerVar} \\; L\_P \\\\\\
|
||||
L\_P & \\rightarrow \\epsilon \\\\\\
|
||||
D & \\rightarrow \\text{upperVar} \\; L\_U \\\\\\
|
||||
L\_U & \\rightarrow \\text{upperVar} \\; L\_U \\\\\\
|
||||
L\_U & \\rightarrow \\epsilon
|
||||
\\end{align}
|
||||
$$
|
||||
That's a lot of rules! \\(T\\) is the "top-level declaration" rule. It matches either
|
||||
a function or a data definition. A function definition consists of the keyword "defn",
|
||||
followed by a function name (starting with a lowercase letter), followed by a list of
|
||||
parameters, represented by \\(L\_P\\).
|
||||
|
||||
A data type definition consists of the name of the data type (starting with an uppercase letter),
|
||||
and a list \\(L\_D\\) of data constructors \\(D\\). There must be at least one data constructor in this list,
|
||||
so we don't use the empty string rule here. A data constructor is simply an uppercase variable representing
|
||||
a constructor of the data type, followed by a list \\(L\_U\\) of zero or more uppercase variables (representing
|
||||
the types of the arguments of the constructor).
|
||||
|
||||
Finally, we want one or more of these declarations in a valid program:
|
||||
$$
|
||||
\\begin{align}
|
||||
G & \\rightarrow T \\; G \\\\\\
|
||||
G & \\rightarrow T
|
||||
\\end{align}
|
||||
$$
|
||||
|
||||
Just like with tokenizing, there exists a piece of software that will generate a bottom-up parser for us, given our grammar.
|
||||
It's called Bison, and it is frequently used with Flex. Before we get to bison, though, we need to pay a debt we've already
|
||||
incurred - the implementation of our AST. Such a tree is language-specific, so Bison doesn't generate it for us. Here's what
|
||||
I came up with:
|
||||
{{< codeblock "C++" "compiler_ast.hpp" >}}
|
||||
|
||||
Finally, we get to writing our Bison file, `parser.y`. Here's what I came up with:
|
||||
{{< rawblock "compiler_parser.y" >}}
|
Loading…
Reference in New Issue
Block a user