Add the drafts of the two posts
parent 708f9bebfa
commit f42cb900cf

code/compiler_ast.hpp (new file, 76 lines)
@@ -0,0 +1,76 @@
#pragma once
#include <memory>
#include <string> // for std::string, used by the pattern and constructor nodes
#include <vector>

// Base class for all expression nodes in the tree.
struct ast {
    virtual ~ast();
};

using ast_ptr = std::unique_ptr<ast>;

// Base class for patterns in case expression branches.
struct pattern {
    virtual ~pattern();
};

// A pattern that binds a variable, like the x in "x -> ...".
struct pattern_var : public pattern {
    std::string var;

    pattern_var(const char* v)
        : var(v) {}
};

// A pattern that matches a constructor and binds its parameters,
// like "Cons x xs -> ...".
struct pattern_constr : public pattern {
    std::string constr;
    std::vector<std::string> params;

    pattern_constr(const char* c, std::vector<std::string>&& p)
        : constr(c) {
        std::swap(params, p);
    }
};

using pattern_ptr = std::unique_ptr<pattern>;

// A single branch of a case expression: a pattern and its result expression.
struct branch {
    pattern_ptr pat;
    ast_ptr expr;

    branch(pattern_ptr&& p, ast_ptr&& a)
        : pat(std::move(p)), expr(std::move(a)) {}
};

using branch_ptr = std::unique_ptr<branch>;

enum binop {
    PLUS,
    MINUS,
    TIMES,
    DIVIDE
};

// A binary operation, like 3+2.
struct ast_binop : public ast {
    binop op;
    ast_ptr left;
    ast_ptr right;

    ast_binop(binop o, ast_ptr&& l, ast_ptr&& r)
        : op(o), left(std::move(l)), right(std::move(r)) {}
};

// A function application, like "f x".
struct ast_app : public ast {
    ast_ptr left;
    ast_ptr right;

    ast_app(ast_ptr&& l, ast_ptr&& r)
        : left(std::move(l)), right(std::move(r)) {}
};

// A case expression: the expression being inspected, and a list of branches.
struct ast_case : public ast {
    ast_ptr of;
    std::vector<branch_ptr> branches;

    ast_case(ast_ptr&& o, std::vector<branch_ptr>&& b)
        : of(std::move(o)) {
        std::swap(branches, b);
    }
};

code/compiler_parser.y (new file, 26 lines)
@@ -0,0 +1,26 @@
%{
#include <string>
#include <iostream>
#include "ast.hpp"
#include "parser.hpp"
%}

%token PLUS
%token TIMES
%token MINUS
%token DIVIDE
%token INT
%token DEFN
%token DATA
%token CASE
%token OF
%token OCURLY
%token CCURLY
%token OPAREN
%token CPAREN
%token COMMA
%token ARROW
%token EQUAL
%token LID
%token UID

code/compiler_scanner.l (new file, 33 lines)
@@ -0,0 +1,33 @@
%option noyywrap

%{
#include <iostream>
%}

%%

[ \n]+ {}
\+ { std::cout << "PLUS" << std::endl; }
\* { std::cout << "TIMES" << std::endl; }
- { std::cout << "MINUS" << std::endl; }
\/ { std::cout << "DIVIDE" << std::endl; }
[0-9]+ { std::cout << "NUMBER: " << yytext << std::endl; }
defn { std::cout << "KEYWORD: defn" << std::endl; }
data { std::cout << "KEYWORD: data" << std::endl; }
case { std::cout << "KEYWORD: case" << std::endl; }
of { std::cout << "KEYWORD: of" << std::endl; }
\{ { std::cout << "OPEN CURLY" << std::endl; }
\} { std::cout << "CLOSED CURLY" << std::endl; }
\( { std::cout << "OPEN PARENTH" << std::endl; }
\) { std::cout << "CLOSE PARENTH" << std::endl; }
, { std::cout << "COMMA" << std::endl; }
-> { std::cout << "PATTERN ARROW" << std::endl; }
= { std::cout << "EQUAL" << std::endl; }
[a-z][a-zA-Z]* { std::cout << "LOWERCASE IDENTIFIER: " << yytext << std::endl; }
[A-Z][a-zA-Z]* { std::cout << "UPPERCASE IDENTIFIER: " << yytext << std::endl; }

%%

int main() {
    yylex();
}

content/blog/01_compiler_tokenizing.md (new file, 243 lines)
@@ -0,0 +1,243 @@
---
title: Compiling a Functional Language Using C++, Part 1 - Tokenizing
date: 2019-08-03T01:02:30-07:00
tags: ["C and C++", "Functional Languages", "Compilers"]
draft: true
---

During my last academic term, I was enrolled in a compilers course.
We had a final project - develop a compiler for a basic Python subset,
using LLVM. It was a little boring - virtually nothing about the compiler
was __not__ covered in class, and it felt more like putting two puzzle
pieces together than building a real project.

Being involved in the Programming Language Theory (PLT) research group at my
university, I decided to do something different for the final project -
a compiler for a functional language. In a series of posts, starting with
this one, I will explain what I did so that those interested in the subject
are able to replicate my steps, and maybe learn something for themselves.

### The "classic" stages of a compiler
Let's take a look at the high level overview of what a compiler does.
Conceptually, the components of a compiler are pretty cleanly separated.
They are as follows:

1. Tokenizing / lexical analysis
2. Parsing
3. Analysis / optimization
4. Code Generation

There are many variations on this structure. Some compilers don't optimize
at all; some translate the program text into an intermediate representation,
an alternative way of representing the program that isn't machine code.
In some compilers, the stages of parsing and analysis can overlap.
In short, just like the pirate's code, it's more of a guideline than a rule.

### Tokenizing and Parsing (the "boring stuff")
It makes sense to build a compiler bit by bit, following the stages we outlined above.
This is because these stages are essentially a pipeline, with program text
coming in one end, and the final program coming out of the other. So as we build
up our pipeline, we'll be able to push program text further and further, until
eventually we get something that we can run on our machine.

This is how most tutorials go about building a compiler, too. The result is that
there are a __lot__ of tutorials covering tokenizing and parsing. This is why
I refer to this part of the process as "boring". Nonetheless, I will cover the steps
required to tokenize and parse our little functional language. But before we do that,
we first need to have an idea of what our language looks like.

### The Grammar
Simon Peyton Jones, in his two works regarding compiling functional languages, remarks
that most functional languages are very similar, and vary largely in syntax. That's
our main degree of freedom. We want to represent the following things, for sure:

* Defining functions
* Applying functions
* Arithmetic
* Algebraic data types (to represent lists, pairs, and the like)
* Pattern matching (to operate on data types)

We can additionally support anonymous (lambda) functions, but compiling those
is actually a bit trickier, so we will skip them for now. Arithmetic is the simplest to
define - let's define it as we would expect: `3` is a number, `3+2*6` evaluates to 15.
Function application isn't much more difficult - `f x` means "apply f to x", and
`f x + g x` means summing the result of applying f to x and the result of applying g to x. That is, function
application has higher precedence, or __binds tighter__, than binary operators like plus.

Next, let's define the syntax for declaring a function. Why not:
```
defn f x = { x + x }
```

As for declaring data types:
```
data List = { Nil, Cons Int List }
```
Notice that we are avoiding polymorphism here.

Let's also define a syntax for pattern matching:
```
case l of {
    Nil -> { 0 }
    Cons x xs -> { x }
}
```
The above means "if the list `l` is `Nil`, then return 0; otherwise, if it's
constructed from an integer and another list (as defined in our `data` example),
return the integer".

That's it for now! Let's take a look at tokenizing.

### Tokenizing
When we first get our program text, it's in a representation difficult for us to make
sense of. If we look at how it's represented in C++, we see that it's just an array
of characters (potentially hundreds, thousands, or millions in length). We __could__
jump straight to parsing the text (which involves creating a tree structure, known
as an __abstract syntax tree__; more on that later). There's nothing wrong with this approach -
in fact, in functional languages, tokenizing is frequently skipped. However,
in our closer-to-metal language, it happens to be more convenient to first break the
input text into a bunch of distinct segments (tokens).

For example, consider the string "320+6". If we skip tokenizing and go straight
into parsing, we'd feed our parser the sequence of characters `['3', '2', '0', '+', '6', '\0']`.
On the other hand, if we run a tokenizing step on the string first, we'd be feeding our
parser three tokens, `("320", NUMBER)`, `("+", OPERATOR)`, and `("6", NUMBER)`.
To us, this is a bit more clear - we've partitioned the string into logical segments.
Our parser, then, won't have to care about recognizing a number - it will just know
that a number is next in the string, and do with that information what it needs.

How do we go about breaking up a string into tokens? We need to come up with a
way to compare some characters in a string against a set of rules. But "rules"
is a very general term - we could, for instance, define a particular
token that is a Fibonacci number - 1, 2, 3, 5, and so on would be marked
as a "Fibonacci number", while the other numbers would be marked as just
regular numbers. To support that, our rules would get pretty complex, and
our checking of these rules for particular strings would become equally complex.

Fortunately, we're not insane. We observe that the rules for tokens in practice
are fairly simple - one or more digits is an integer, a few letters together
are a variable name. In order to be able to efficiently break text up into
such tokens, we restrict ourselves to __regular languages__. A language
is defined as a set of strings (potentially infinite), and a regular
language is one for which we can write a __regular expression__ to check if
a string is in the set. Regular expressions are a way of representing
patterns that a string has to match. We define regular expressions
as follows:

* Any character is a regular expression that matches that character. Thus,
\\(a\\) is a regular expression (from now shortened to regex) that matches
the character 'a', and nothing else.
* \\(r_1r_2\\), or the concatenation of \\(r_1\\) and \\(r_2\\), is
a regular expression that matches anything matched by \\(r_1\\), followed
by anything that matches \\(r_2\\). For instance, \\(ab\\) matches
the character 'a' followed by the character 'b' (thus matching "ab").
* \\(r_1|r_2\\) matches anything that is either matched by \\(r_1\\) or
\\(r_2\\). Thus, \\(a|b\\) matches the character 'a' or the character 'b'.
* \\(r_1?\\) matches either an empty string, or anything matched by \\(r_1\\).
* \\(r_1+\\) matches one or more things matched by \\(r_1\\). So,
\\(a+\\) matches "a", "aa", "aaa", and so on.
* \\((r_1)\\) matches anything that matches \\(r_1\\). This is mostly used
to group things together in more complicated expressions.
* \\(.\\) matches any character.

More powerful variations of regex also include an "any of" operator, \\([c_1c_2c_3]\\),
which is equivalent to \\(c_1|c_2|c_3\\), and a "range" operator, \\([c_1-c_n]\\), which
matches all characters in the range between \\(c_1\\) and \\(c_n\\), inclusive.

Let's see some examples. An integer, such as 326, can be represented with \\([0-9]+\\).
This means: one or more characters between 0 and 9. Some (most) regex implementations
have a special symbol for \\([0-9]\\), written as \\(\\setminus d\\). A variable,
starting with a lowercase letter and containing lowercase or uppercase letters after it,
can be written as \\(\[a-z\]([a-zA-Z]+)?\\). Again, most regex implementations provide
a special operator for \\((r_1+)?\\), written as \\(r_1*\\).
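
If you'd like to experiment with these patterns before we build anything, C++'s own `<regex>`
library understands this notation. Here's a quick sketch - just a playground, not part of our
compiler - that checks a few strings against the integer and variable regexes above:

```
#include <iostream>
#include <regex>

int main() {
    std::regex integer("[0-9]+");           // one or more digits
    std::regex variable("[a-z][a-zA-Z]*");  // equivalent to [a-z]([a-zA-Z]+)?

    std::cout << std::regex_match("326", integer) << std::endl;     // 1: an integer
    std::cout << std::regex_match("fooBar", variable) << std::endl; // 1: a variable
    std::cout << std::regex_match("3a", integer) << std::endl;      // 0: not an integer
}
```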

#### The Theory
So how does one go about checking if a regular expression matches a string? An efficient way is to
first construct a [state machine](https://en.wikipedia.org/wiki/Finite-state_machine). A type of state machine can be constructed from a regular expression
by literally translating each part of it to a series of states, one-to-one. This machine is called
a __Nondeterministic Finite Automaton__, or NFA for short. The "Finite" means that the number of
states in the state machine is, well, finite. For us, this means that we can store such
a machine on disk. The "Nondeterministic" part, though, is more complex: given a particular character
and a particular state, it's possible that an NFA has the option of transitioning into more
than one other state. Well, which state __should__ it pick? No easy way to tell. Each time
we can transition to more than one state, we exponentially increase the number of possible
states that we can be in. This isn't good - we were going for efficiency, remember?

What we can do is convert our NFA into another kind of state machine, in which for every character,
only one state transition is possible. This machine is called a __Deterministic Finite Automaton__,
or DFA for short. There's an algorithm to convert an NFA into a DFA, which I won't explain here.

Since both the conversion of a regex into an NFA and the conversion of an NFA into a DFA are done
by following an algorithm, we're always going to get the same DFA for the same regex we put in.
Once we've come up with the rules for our tokens, we don't want to be building a DFA each time
our tokenizer is run - the result will always be the same! Even worse, translating a regular
expression all the way into a DFA is the inefficient part of the whole process. The solution is to
generate a state machine once, and convert it into code that simulates that state machine. Then, we include
that code as part of our compiler. This way, we have a state machine "hardcoded" into our tokenizer,
and no conversion of regexes to DFAs needs to be done at runtime.
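
To get a feel for what "hardcoded" means here, below is a tiny hand-written sketch - not
what flex actually generates, which is table-driven - of a state machine for the integer
regex \\([0-9]+\\):

```
#include <cctype>
#include <string>

// States: 0 is the start state, 1 is the accepting state,
// and -1 is a "dead" state that we can never leave.
bool matches_integer(const std::string& s) {
    int state = 0;
    for (char c : s) {
        bool digit = std::isdigit(static_cast<unsigned char>(c)) != 0;
        switch (state) {
            case 0: state = digit ? 1 : -1; break; // need at least one digit
            case 1: state = digit ? 1 : -1; break; // stay while digits continue
            default: break;                        // dead state: stay dead
        }
    }
    return state == 1; // accept only if we ended in the accepting state
}
```

No regex machinery survives to runtime - only states and transitions.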

#### The Practice
Creating an NFA, and then a DFA, and then generating C++ code are all cumbersome. If we had to
write code to do this every time we made a compiler, it would get very repetitive, very fast.
Fortunately, there exists a tool that does exactly this for us - it's called `flex`. Flex
takes regular expressions, and generates code that matches a string against those regular expressions.
It does one more thing in addition to that - for each regular expression it matches, flex
runs a user-defined action (which we write in C++). We can use this to convert strings that
represent numbers directly into numbers, and do other small tasks.

So, what tokens do we have? From our arithmetic definition, we see that we have integers.
Let's use the regex `[0-9]+` for those. We also have the operators `+`, `-`, `*`, and `/`.
`-` is simple enough: the corresponding regex is `-`. We need to
preface our `/`, `+` and `*` with a backslash, though, since they happen to also be operators
in flex's regular expressions: `\/`, `\+`, `\*`.

Let's also represent some reserved keywords. We'll say that `defn`, `data`, `case`, and `of`
are reserved. Their regular expressions are just their names. We also want to tokenize
`=`, `->`, `{`, `}`, `,`, `(` and `)`. Finally, we want to represent identifiers, like `f`,
`x`, `Nil`, and `Cons`. We will actually make a distinction between lowercase identifiers
and uppercase identifiers, as we will follow Haskell's convention of representing
data type constructors with uppercase letters, and functions and variables with lowercase ones.
So, our two regular expressions will be `[a-z][a-zA-Z]*` for the lowercase variables, and
`[A-Z][a-zA-Z]*` for the uppercase variables. Let's make a tokenizer in flex with all this. To do
this, we create a new file, `scanner.l`, in which we write a mix of regular expressions
and C++ code. Here's the whole thing:

{{< rawblock "compiler_scanner.l" >}}

A flex file starts with options. I set the `noyywrap` option, which disables a particular
feature of flex that we won't use, and which would otherwise cause linker errors. Next up,
flex allows us to put some C++ code that we want at the top of our generated code.
I simply include `iostream`, so that we can use `cout` to print out our tokens.
Next, `%%`, and after that, the meat of our tokenizer: regular expressions, followed by
C++ code that should be executed when the regular expression is matched.

The first token: whitespace. This includes the space character
and the newline character. We ignore it, so its rule is empty. After that,
we have the regular expressions for the tokens we've talked about. For each, I just
print a description of the token that matched. This will change when we hook this up to
a parser, but for now, this works fine. Notice that the variable `yytext` contains
the string matched by our regular expression. This variable is set by the code flex
generates, and we can use it to get the exact text that matched a regex. This is
useful, for instance, to print the variable name that we matched. After
all of our tokens, another `%%`, and more C++ code. For this simple example,
I declare a `main` function, which just calls `yylex`, a function flex
generates for us. Let's generate the C++ code, and compile it:

```
flex -o scanner.cpp scanner.l
g++ -o scanner scanner.cpp
```

Now, let's feed it an expression:
```
echo "3+2*6" | ./scanner
```

We get the output:
```
NUMBER: 3
PLUS
NUMBER: 2
TIMES
NUMBER: 6
```
Hooray! We have tokenizing.

content/blog/02_compiler_parsing.md (new file, 231 lines)
@@ -0,0 +1,231 @@
---
title: Compiling a Functional Language Using C++, Part 2 - Parsing
date: 2019-08-03T01:02:30-07:00
tags: ["C and C++", "Functional Languages", "Compilers"]
draft: true
---

In the previous post, we covered tokenizing. We learned how to convert an input string into logical segments, and even wrote up a tokenizer to do it according to the rules of our language. Now, it's time to make sense of the tokens, and parse our language.

### The Theory
The rules to parse a language are more complicated than the rules for
recognizing tokens. For instance, consider a simple language of a matching
number of open and closed parentheses, like `()` and `((()))`. You can't
write a regular expression for it! We resort to a wider class of languages, called
__context free languages__. These languages are ones that are matched by __context free grammars__.
A context free grammar is a list of rules in the form of \\(S \\rightarrow \\alpha\\), where
\\(S\\) is a __nonterminal__ (conceptually, a thing that expands into other things), and
\\(\\alpha\\) is a sequence of nonterminals and terminals (a terminal is a thing that doesn't
expand into other things; for us, this is a token).

Let's write a context free grammar (CFG from now on) to match our parenthesis language:

$$
\\begin{align}
S & \\rightarrow ( S ) \\\\\\
S & \\rightarrow ()
\\end{align}
$$

So, how does this work? We start with a "start symbol" nonterminal, which we usually denote as \\(S\\). Then, to get a desired string,
we replace a nonterminal with the sequence of terminals and nonterminals on the right of one of its rules. For instance, to get `()`,
we start with \\(S\\) and replace it with the body of the second one of its rules. This gives us `()` right away. To get `((()))`, we
have to do a little more work:

$$
S \\rightarrow (S) \\rightarrow ((S)) \\rightarrow ((()))
$$

In practice, there are many ways of using a CFG to parse a programming language. Various parsing algorithms support various subsets
of context free languages. For instance, top down parsers follow nearly exactly the structure that we had. They try to parse
a nonterminal by trying to match each symbol in its body. For the rule \\(S \\rightarrow \\alpha \\beta \\gamma\\), the parser will
first try to match \\(\\alpha\\), then \\(\\beta\\), and so on. If one of the three contains a nonterminal, it will attempt to parse
that nonterminal following the same strategy. However, this approach has a flaw. For instance, consider the grammar:

$$
\\begin{align}
S & \\rightarrow Sa \\\\\\
S & \\rightarrow a
\\end{align}
$$

A top down parser will start with \\(S\\). It will then try the first rule, which starts with \\(S\\). So, dutifully, it will
try to parse __that__ \\(S\\). And to do that, it will once again try the first rule, and find that it starts with another \\(S\\)...
This will never end, and the parser will get stuck. A grammar in which a nonterminal can appear at the beginning of one of its rules
is called __left recursive__, and top-down parsers aren't able to handle those grammars.
We __could__ rewrite our grammar without using left-recursion, but we don't want to. Instead, we'll use a __bottom up__ parser,
specifically one using the LALR(1) parsing algorithm. Here's an example of how it works, using our left-recursive grammar. We start with our
goal string, and a "dot" indicating where we are. At first, the dot is behind all the characters:

$$
.aaa
$$

We see nothing interesting on the left side of the dot, so we move (or __shift__) the dot forward by one character:

$$
a.aa
$$

Now, on the left side of the dot, we see something! In particular, we see the body of one of the rules for \\(S\\) (the second one).
So we __reduce__ the thing on the left side of the dot, by replacing it with the left hand side of the rule (\\(S\\)):

$$
S.aa
$$

There's nothing else we can do with the left side, so we shift again:

$$
Sa.a
$$

Great, we see another rule body on the left of the dot (\\(Sa\\), the body of the first rule). We reduce it:

$$
S.a
$$

Just like before, we shift over the dot, and again, we reduce. We end up with our
start symbol, and nothing on the right of the dot, so we're done!

### The Practice
In practice, we don't want to just match a grammar. That would be like saying "yup, this is our language".
Instead, we want to create something called an __abstract syntax tree__, or AST for short. This tree
captures the structure of our language, and is easier to work with than its textual representation. The structure
of the tree we build will often mimic the structure of our grammar: a rule in the form \\(S \\rightarrow A B\\)
will result in a tree named "S", with two children corresponding to the trees built for A and B. Since
an AST captures the structure of the language, we'll be able to toss away some punctuation
like `,` and `(`. These tokens will appear in our grammar, but we will tweak our parser to simply throw them away. Additionally,
we will write our grammar ignoring whitespace, since our tokenizer conveniently throws that into the trash.

The grammar for arithmetic actually involves more effort than it would appear at first. We want to make sure that our
parser respects the order of operations. This way, when we have our tree, it will immediately have the structure in
which multiplication is done before addition. We do this by creating separate "levels" in our grammar, with one
nonterminal matching addition and subtraction, while another nonterminal matches multiplication and division.
We want the operation that has the least precedence to be __higher__ in our tree than one of higher precedence.
For instance, for `3+2*6`, we want our tree to have `+` as the root, `3` as the left child, and the tree for `2*6` as the right child.
Why? Because this tree represents "the addition of 3 and the result of multiplying 2 by 6". If we had `*` be the root, we'd have
a tree representing "the multiplication of the result of adding 3 to 2 and 6", which is __not__ what our expression means.
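
To make that tree concrete, here's a sketch of building it by hand, using the AST structs
we'll define later in this post. The `ast_int` node for number literals is a hypothetical
stand-in, since we haven't defined a literal node yet:

```
// Hypothetical literal node; our AST header doesn't define one yet.
struct ast_int : public ast {
    int value;
    ast_int(int v) : value(v) {}
};

// The tree for 3+2*6: + at the root, with the tree for 2*6 as its right child.
ast_ptr tree(new ast_binop(PLUS,
    ast_ptr(new ast_int(3)),
    ast_ptr(new ast_binop(TIMES,
        ast_ptr(new ast_int(2)),
        ast_ptr(new ast_int(6))))));
```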

So, with this in mind, we want our rule for __addition__ (represented with the nonterminal \\(A\_{add}\\)) to be matched first, and
for its children to be trees created by the multiplication rule, \\(A\_{mult}\\). So we write the following rules:

$$
\\begin{align}
A\_{add} & \\rightarrow A\_{add}+A\_{mult} \\\\\\
A\_{add} & \\rightarrow A\_{add}-A\_{mult} \\\\\\
A\_{add} & \\rightarrow A\_{mult}
\\end{align}
$$

The first rule matches the result of another addition, plus the result of a multiplication. We put the addition in the body
because we want to be able to parse strings like `1+2+3+4`, which we want to view as `((1+2)+3)+4` (mostly because
subtraction is [left-associative](https://en.wikipedia.org/wiki/Operator_associativity)). So, we want the top level
of the tree to be the rightmost `+` or `-`, since that means it will be the "last" operation. You may be asking,

> You define addition in terms of addition; how will parsing end? What if there's no addition at all, like `2*6`?

This is the purpose of the third rule, which serves to say "an addition expression can just be a multiplication,
without any plusses or minuses." Our rules for multiplication are very similar:

$$
\\begin{align}
A\_{mult} & \\rightarrow A\_{mult}*P \\\\\\
A\_{mult} & \\rightarrow A\_{mult}/P \\\\\\
A\_{mult} & \\rightarrow P
\\end{align}
$$

\\(P\\), in this case, is an a__p__plication (remember, application has higher precedence than any binary operator).
Once again, if there's no `*` or `/`, we simply fall through to a \\(P\\) nonterminal, representing application.

Application is refreshingly simple:

$$
\\begin{align}
P & \\rightarrow P B \\\\\\
P & \\rightarrow B
\\end{align}
$$

An application is either only one "thing" (represented with \\(B\\), for __b__ase), such as a number or an identifier,
or another application followed by a thing. For instance, `f x y` is parsed as \\((f \\; x) \\; y\\).

We now need to define what a "thing" is. As we said before, it's a number, or an identifier. We also make a parenthesized
arithmetic expression a "thing", allowing us to wrap right back around and recognize anything inside parentheses:

$$
\\begin{align}
B & \\rightarrow \\text{num} \\\\\\
B & \\rightarrow \\text{lowerVar} \\\\\\
B & \\rightarrow \\text{upperVar} \\\\\\
B & \\rightarrow ( A\_{add} ) \\\\\\
B & \\rightarrow C
\\end{align}
$$

What's the last \\(C\\)? We also want a "thing" to be a case expression. Here are the rules for that:

$$
\\begin{align}
C & \\rightarrow \\text{case} \\; A\_{add} \\; \\text{of} \\; \\{ L\_B \\} \\\\\\
L\_B & \\rightarrow R \\; , \\; L\_B \\\\\\
L\_B & \\rightarrow R \\\\\\
R & \\rightarrow N \\; \\text{arrow} \\; \\{ A\_{add} \\} \\\\\\
N & \\rightarrow \\text{lowerVar} \\\\\\
N & \\rightarrow \\text{upperVar} \\; L\_L \\\\\\
L\_L & \\rightarrow \\text{lowerVar} \\; L\_L \\\\\\
L\_L & \\rightarrow \\epsilon
\\end{align}
$$

\\(L\_B\\) is the list of branches in our case expression. \\(R\\) is a single branch, which is in the
form `Pattern -> Expression`. \\(N\\) is a pattern, which we will for now define to be either a variable name
(\\(\\text{lowerVar}\\)), or a constructor with some arguments. The arguments of a constructor will be
lowercase names, and a list of those arguments will be represented with \\(L\_L\\). One of the bodies
of this nonterminal is just the character \\(\\epsilon\\), which just means "nothing".
We use this because a constructor can have no arguments (like `Nil`).

We can use these grammar rules to represent any expression we want. For instance, let's try `3+(multiply 2 6)`,
where `multiply` is a function that, well, multiplies. We start with \\(A\_{add}\\):

$$
\\begin{align}
& A\_{add} \\\\\\
& \\rightarrow A\_{add} + A\_{mult} \\\\\\
& \\rightarrow A\_{mult} + A\_{mult} \\\\\\
& \\rightarrow P + A\_{mult} \\\\\\
& \\rightarrow B + A\_{mult} \\\\\\
& \\rightarrow \\text{num(3)} + A\_{mult} \\\\\\
& \\rightarrow \\text{num(3)} + P \\\\\\
& \\rightarrow \\text{num(3)} + B \\\\\\
& \\rightarrow \\text{num(3)} + (A\_{add}) \\\\\\
& \\rightarrow \\text{num(3)} + (A\_{mult}) \\\\\\
& \\rightarrow \\text{num(3)} + (P) \\\\\\
& \\rightarrow \\text{num(3)} + (P \\; \\text{num(6)}) \\\\\\
& \\rightarrow \\text{num(3)} + (P \\; \\text{num(2)} \\; \\text{num(6)}) \\\\\\
& \\rightarrow \\text{num(3)} + (\\text{lowerVar(multiply)} \\; \\text{num(2)} \\; \\text{num(6)})
\\end{align}
$$

We're almost there. We now want a rule for "something that can appear at the top level of a program", like
a function or data type declaration. We make a new set of rules:

$$
\\begin{align}
T & \\rightarrow \\text{defn} \\; \\text{lowerVar} \\; L\_P = \\{ A\_{add} \\} \\\\\\
T & \\rightarrow \\text{data} \\; \\text{upperVar} = \\{ L\_D \\} \\\\\\
L\_D & \\rightarrow D \\; , \\; L\_D \\\\\\
L\_D & \\rightarrow D \\\\\\
L\_P & \\rightarrow \\text{lowerVar} \\; L\_P \\\\\\
L\_P & \\rightarrow \\epsilon \\\\\\
D & \\rightarrow \\text{upperVar} \\; L\_U \\\\\\
L\_U & \\rightarrow \\text{upperVar} \\; L\_U \\\\\\
L\_U & \\rightarrow \\epsilon
\\end{align}
$$

That's a lot of rules! \\(T\\) is the "top-level declaration" rule. It matches either
a function or a data definition. A function definition consists of the keyword "defn",
followed by a function name (starting with a lowercase letter), followed by a list of
parameters, represented by \\(L\_P\\).

A data type definition consists of the name of the data type (starting with an uppercase letter),
and a list \\(L\_D\\) of data constructors \\(D\\). There must be at least one data constructor in this list,
so we don't use the empty string rule here. A data constructor is simply an uppercase variable representing
a constructor of the data type, followed by a list \\(L\_U\\) of zero or more uppercase variables (representing
the types of the arguments of the constructor).
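
As a sanity check, here's a small program that these rules should accept, written in the syntax
we settled on in part 1 (note that, per the \\(L\_B\\) rules, case branches are separated by commas):

```
data List = { Nil, Cons Int List }
defn length l = {
    case l of {
        Nil -> { 0 },
        Cons x xs -> { 1 + length xs }
    }
}
```

The first declaration is matched by the \\(\\text{data}\\) rule for \\(T\\), and the second by the \\(\\text{defn}\\) rule.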

Finally, we want one or more of these declarations in a valid program:

$$
\\begin{align}
G & \\rightarrow T \\; G \\\\\\
G & \\rightarrow T
\\end{align}
$$

Just like with tokenizing, there exists a piece of software that will generate a bottom-up parser for us, given our grammar.
It's called Bison, and it is frequently used with Flex. Before we get to Bison, though, we need to pay a debt we've already
incurred - the implementation of our AST. Such a tree is language-specific, so Bison doesn't generate it for us. Here's what
I came up with:

{{< codeblock "C++" "compiler_ast.hpp" >}}

Finally, we get to writing our Bison file, `parser.y`. Here's what I came up with:

{{< rawblock "compiler_parser.y" >}}
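
The file so far only declares our tokens - the grammar rules and the semantic actions that build
the AST are still to come. As a rough sketch of where this is headed (the rule names here are
placeholders, and the actions are left as comments since we haven't wired up Bison's value types
yet), the addition level of our grammar might look like:

```
add
    : add PLUS mul { /* build an ast_binop(PLUS, $1, $3) */ }
    | add MINUS mul { /* build an ast_binop(MINUS, $1, $3) */ }
    | mul
    ;
```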