Compare commits

No commits in common. "1f6b4bef74ea00a5b6052688a571c4e62ff3072f" and "1f3c42fc4442e8ef94272e96b878a3b590b151c9" have entirely different histories.

1f6b4bef74 ... 1f3c42fc44
CMakeLists.txt
@@ -37,7 +37,6 @@ add_executable(compiler
     instruction.cpp instruction.hpp
     graph.cpp graph.hpp
     global_scope.cpp global_scope.hpp
-    parse_driver.cpp parse_driver.hpp
     ${BISON_parser_OUTPUTS}
     ${FLEX_scanner_OUTPUTS}
     main.cpp
parse_driver.cpp (file removed)
@@ -1,55 +0,0 @@
#include "parse_driver.hpp"
#include "scanner.hpp"
#include <sstream>

bool parse_driver::run_parse() {
    FILE* stream = fopen(file_name.c_str(), "r");
    if(!stream) return false;
    string_stream = std::ostringstream();
    file_offset = 0;
    line_offsets.push_back(0);
    yyscan_t scanner;
    yylex_init(&scanner);
    yyset_in(stream, scanner);
    yy::parser parser(scanner, *this);
    parser();
    yylex_destroy(scanner);
    fclose(stream);
    file_contents = string_stream.str();
    return true;
}

void parse_driver::write(const char* buf, size_t len) {
    string_stream.write(buf, len);
    file_offset += len;
}

void parse_driver::mark_line() {
    line_offsets.push_back(file_offset);
}

size_t parse_driver::get_index(int line, int column) {
    assert(line > 0);
    assert(line <= line_offsets.size());
    size_t file_offset = line_offsets[line-1];
    file_offset += column - 1;
    return file_offset;
}

size_t parse_driver::get_line_end(int line) {
    if(line > line_offsets.size()) return file_contents.size();
    return get_index(line+1, 1);
}

void parse_driver::print_highlighted_location(std::ostream& stream, const yy::location& loc) {
    size_t print_start = get_index(loc.begin.line, 1);
    size_t highlight_start = get_index(loc.begin.line, loc.begin.column);
    size_t highlight_end = get_index(loc.end.line, loc.end.column);
    size_t print_end = get_line_end(loc.end.line);
    const char* content = file_contents.c_str();
    stream.write(content + print_start, highlight_start - print_start);
    stream << "\033[4;31m";
    stream.write(content + highlight_start, highlight_end - highlight_start);
    stream << "\033[0m";
    stream.write(content + highlight_end, print_end - highlight_end);
}
parse_driver.hpp
@@ -13,6 +13,7 @@ void scanner_destroy(yyscan_t* scanner);
 
 struct parse_driver {
     std::string file_name;
+    std::ifstream file_stream;
     std::ostringstream string_stream;
 
     yy::location location;
@@ -20,17 +21,58 @@ struct parse_driver {
 
     std::vector<size_t> line_offsets;
     definition_group global_defs;
-    std::string file_contents;
+    std::string read_file;
 
     parse_driver(const std::string& file)
         : file_name(file), file_offset(0) {}
 
-    bool run_parse();
-    void write(const char* buff, size_t len);
-    void mark_line();
-    size_t get_index(int line, int column);
-    size_t get_line_end(int line);
-    void print_highlighted_location(std::ostream& stream, const yy::location& loc);
+    bool run_parse() {
+        file_stream.open(file_name);
+        if(!file_stream.good()) return false;
+        line_offsets.push_back(0);
+        yyscan_t scanner;
+        scanner_init(this, &scanner);
+        yy::parser parser(scanner, *this);
+        parser();
+        scanner_destroy(&scanner);
+        read_file = string_stream.str();
+        return true;
+    }
+
+    int get() {
+        int new_char = file_stream.get();
+        if(new_char == EOF) return EOF;
+        file_offset++;
+        if(new_char == '\n') line_offsets.push_back(file_offset);
+        string_stream.put(new_char);
+        return new_char;
+    }
+
+    size_t get_index(int line, int column) {
+        assert(line > 0);
+        assert(line <= line_offsets.size());
+        size_t file_offset = line_offsets[line-1];
+        file_offset += column - 1;
+        return file_offset;
+    }
+
+    size_t get_line_end(int line) {
+        if(line > line_offsets.size()) return read_file.size();
+        return get_index(line+1, 1);
+    }
+
+    void print_highlighted_location(std::ostream& stream, const yy::location& loc) {
+        size_t print_start = get_index(loc.begin.line, 1);
+        size_t highlight_start = get_index(loc.begin.line, loc.begin.column);
+        size_t highlight_end = get_index(loc.end.line, loc.end.column);
+        size_t print_end = get_line_end(loc.end.line);
+        const char* content = read_file.c_str();
+        stream.write(content + print_start, highlight_start - print_start);
+        stream << "\033[4;31m";
+        stream.write(content + highlight_start, highlight_end - highlight_start);
+        stream << "\033[0m";
+        stream.write(content + highlight_end, print_end - highlight_end);
+    }
 };
 
 #define YY_DECL yy::parser::symbol_type yylex(yyscan_t yyscanner, parse_driver& drv)
scanner.l
@@ -1,6 +1,5 @@
 %option noyywrap
 %option reentrant
-%option header-file="scanner.hpp"
 
 %{
 #include <iostream>
@@ -9,12 +8,18 @@
 #include "parse_driver.hpp"
 #include "parser.hpp"
 
-#define YY_USER_ACTION drv.write(yytext, yyleng); drv.location.step(); drv.location.columns(yyleng);
+#define YY_EXTRA_TYPE parse_driver*
+#define YY_USER_ACTION drv.location.step(); drv.location.columns(yyleng);
+#define YY_INPUT(buf,result,max_size) \
+    { \
+        int c = yyextra->get(); \
+        result = (c == EOF) ? YY_NULL : (buf[0] = c, 1); \
+    }
 %}
 
 %%
 
-\n { drv.location.lines(); drv.mark_line(); }
+\n { drv.location.lines(); }
 [ ]+ {}
 \\ { return yy::parser::make_BACKSLASH(drv.location); }
 \+ { return yy::parser::make_PLUS(drv.location); }
@@ -44,3 +49,10 @@ in { return yy::parser::make_IN(drv.location); }
 <<EOF>> { return yy::parser::make_YYEOF(drv.location); }
 
 %%
+
+void scanner_init(parse_driver* d, yyscan_t* scanner) {
+    yylex_init_extra(d, scanner);
+}
+void scanner_destroy(yyscan_t* scanner) {
+    yylex_destroy(*scanner);
+}
@@ -1,214 +0,0 @@
---
title: Compiling a Functional Language Using C++, Part 13 - More Improvements
date: 2020-09-10T18:50:02-07:00
tags: ["C and C++", "Functional Languages", "Compilers"]
description: "In this post, we clean up our compiler and add some basic optimizations."
---

In [part 12]({{< relref "12_compiler_let_in_lambda" >}}), we added `let/in` and lambda expressions to our compiler. At the end of that post, I mentioned that before we move on to bigger and better things, I wanted to take a step back and clean up the compiler.

Recently, I got around to doing that. Unfortunately, I also got around to doing a lot more. Furthermore, I managed to make the changes in such a way that I can't cleanly separate the 'cleanup' and 'optimization' portions of my work. This is partially due to the way in which I organize code, where each post is associated with a version of the compiler with the necessary changes. Because of all this, instead of making this post about the cleanup, and the next post about the optimizations, I have to merge them into one.

So, this post is split into two major portions: cleanup, which deals mostly with touching up exceptions and improving the 'name mangling' logic, and optimizations, which deals with adding special treatment to booleans, unboxing integers, and implementing more binary operators.
### Section 1: Cleanup

The previous post was
{{< sidenote "right" "long-note" "rather long," >}}
Probably not as long as this one, though! I really need to get the size of my posts under control.
{{< /sidenote >}} which led me to omit a rather important aspect of the compiler: proper error reporting. Once again our compiler has instances of `throw 0`, which is a cheap way of avoiding properly handling a runtime error. Before we move on, it's best to get rid of such blatantly lazy code.

Our existing exceptions (mostly type errors) can use some work, too. Even the most descriptive issues our compiler reports -- unification errors -- don't include the crucial information of _where_ the error is. For large programs, this means having to painstakingly read through the entire file to try to figure out which subexpression could possibly have an incorrect type. This is far from the ideal debugging experience.

Addressing all this is a multi-step change in itself. We want to:

* Replace all `throw 0` code with actual exceptions.
* Replace some exceptions that shouldn't be possible for a user to trigger with assertions.
* Keep track of source locations of each subexpression, so that we may be able to print it if it causes an error.
* Be able to print out said source locations at will. This isn't a _necessity_, but virtually all "big" compilers do this. Instead of reporting that an error occurs on a particular line, we will actually print the line.

Let's start with gathering the actual location data.

#### Bison's Locations
Bison actually has some rather nice support for location tracking. It can automatically assemble the "from" and "to" locations of a nonterminal from the locations of children, which would be very tedious to write by hand. We enable this feature using the following option:

{{< codelines "text" "compiler/13/parser.y" 50 50 >}}

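The option in question is presumably Bison's `%locations` directive. Once it's enabled, every rule's action can use `@$` -- the location Bison assembles for the whole nonterminal -- as well as `@1`, `@2`, and so on for the children. A rough sketch of what that looks like (the rule and helper names below are purely illustrative, not this compiler's actual grammar):

```text
/* Illustrative fragment only -- `expr` and `make_binop` are stand-ins. */
%locations

%%

expr
    : expr PLUS expr
      /* @$ spans from the start of the first `expr` to the end of the second;
         it can be handed straight to whatever constructs the AST node. */
      { $$ = make_binop($1, $3, @$); }
    ;
```
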
There's just one hitch, though. Sure, Bison can compute bigger locations from smaller ones, but it must get the smaller ones from somewhere. Since Bison operates on _tokens_, rather than _characters_, it effectively doesn't interact with the source text at all, and can't determine from which line or column a token originated. The task of determining the locations of input tokens is delegated to the tokenizer -- Flex, in our case. Flex, on the other hand, doesn't have a built-in mechanism for tracking locations. Fortunately, Bison provides a `yy::location` class that includes most of the needed functionality.

A `yy::location` consists of `begin` and `end` source positions, which themselves are represented using lines and columns. It also has the following methods:

* `yy::location::columns(int)` advances the `end` position by the given number of columns, while `begin` stays the same. If `begin` and `end` both point to the beginning of a token, then `columns(token_length)` will move `end` to the token's end, and thus make the whole `location` contain the token.
* `yy::location::lines(int)` behaves similarly to `columns`, except that it advances `end` by the given number of lines, rather than columns.
* `yy::location::step()` moves `begin` to where `end` is. This is useful for when we've finished processing a token, and want to move on to the next one.

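As a standalone illustration (this snippet is not part of the compiler), here is how those three calls move a location across two adjacent tokens:

```C++
#include "location.hh" // the location/position classes generated by Bison

void location_demo() {
    yy::location loc;   // begin and end both start at line 1, column 1
    loc.columns(3);     // a 3-character token: end moves to column 4, begin stays put
    // ... hand `loc` to the parser together with this token ...
    loc.step();         // begin catches up to end, ready for the next token
    loc.columns(1);     // a 1-character token: loc now covers exactly that character
}
```
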
For Flex specifically, `yyleng` has the length of the token currently being processed. Rather than adding the calls to `columns` and `step` to every rule, we can define the `YY_USER_ACTION` macro, which is run before each token is processed.

{{< codelines "C++" "compiler/13/scanner.l" 12 12 >}}

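The line being referenced is presumably the `YY_USER_ACTION` definition, which appears on the older (left-hand) side of the scanner.l hunk earlier in this compare:

```C++
// scanner.l: run before every rule's action.
#define YY_USER_ACTION drv.write(yytext, yyleng); drv.location.step(); drv.location.columns(yyleng);
```
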
We'll see why we are using `drv` soon; for now, you can treat `location` as if it were a global variable declared in the tokenizer. Before processing each token, we ensure that `location` has its `begin` and `end` at the same position, and then advance `end` by `yyleng` columns. This is sufficient to make `location` represent our token's source position.

So now we have a "global" variable `location` that gives us the source position of the current token. To get it to Bison, we have to pass it as an argument to each of the `make_TOKEN` calls. Here are a few sample lines that should give you the general idea:

{{< codelines "C++" "compiler/13/scanner.l" 41 44 >}}

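Judging by the scanner.l hunk earlier in this compare, the referenced rules look like these, each constructing its token with the current `drv.location`:

```C++
\\      { return yy::parser::make_BACKSLASH(drv.location); }
\+      { return yy::parser::make_PLUS(drv.location); }
<<EOF>> { return yy::parser::make_YYEOF(drv.location); }
```
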
That last line is actually new. Previously, we somehow got away without explicitly sending the EOF token to Bison. I suspect that this was due to some kind of implicit conversion of the Flex macro `YY_NULL` into a token; now that we have to pass a position to every token constructor, such an implicit conversion is probably impossible.

Now we have Bison computing source locations for each nonterminal. However, at the moment, we still aren't using them. To change that, we need to add a `yy::location` argument to each of our `ast` nodes, as well as to the `pattern` subclasses, `definition_defn` and `definition_data`. To avoid breaking all the code that creates AST nodes and definitions outside of the parser, we'll make this argument optional. Inside of `ast.hpp`, we define it as follows:

{{< codelines "C++" "compiler/13/ast.hpp" 16 16 >}}

Then, we add a constructor to `ast` as follows:

{{< codelines "C++" "compiler/13/ast.hpp" 18 18 >}}

Note that the location argument is not given a default value here, since `ast` itself is an abstract class, and thus will never be constructed directly. It is in the subclasses of `ast` that we provide a default value. The change is rather mechanical, but here's an example from `ast_binop`:

{{< codelines "C++" "compiler/13/ast.hpp" 98 99 >}}

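Since `ast.hpp` itself is not part of this compare, the following is only a rough sketch of the shape being described; the names (`loc`, `ast_ptr`, the `ast_binop` members) are guesses for illustration, not the actual code:

```C++
#include <memory>
#include "location.hh"  // yy::location, generated by Bison

struct ast;
using ast_ptr = std::unique_ptr<ast>;  // placeholder alias for illustration

struct ast {
    yy::location loc;

    // No default argument on the abstract base class...
    ast(yy::location l) : loc(std::move(l)) {}
    virtual ~ast() = default;
};

struct ast_binop : ast {
    ast_ptr left, right;

    // ...the default value lives on the concrete subclasses, so code that
    // builds nodes outside the parser keeps compiling unchanged.
    ast_binop(ast_ptr l, ast_ptr r, yy::location loc = yy::location())
        : ast(std::move(loc)), left(std::move(l)), right(std::move(r)) {}
};
```
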
#### Line Offsets, File Input, and the Parse Driver
There are three more challenges with printing out the line of code where an error occurred. First of all, to print out a line of code, we need to have that line of code available to us. We do not currently meet this requirement: our compiler reads code from `stdin` (as is default for Flex), and `stdin` doesn't always support rewinding. This, in turn, means that once Flex has read a character from the input, it may not be possible to go back and retrieve that character again.

Second, even if we do have the entire stream or buffer available to us, to retrieve an offset and length within that buffer from just a line and column number would be a lot of work. A naive approach would be to iterate through the input again, once more keeping track of lines and columns, and print the desired line once we reach it. However, this would lead us to redo a lot of work that our tokenizer is already doing.

Third, Flex's input mechanism, even if it's configured not to read from `stdin`, uses a global file descriptor called `yyin`. However, we're better off minimizing global state (especially if we want to read, parse, and compile multiple files in the future). While we're configuring Flex's input mechanism, we may as well fix this, too.

There are several approaches to fixing the first issue. One possible way is to store the content of `stdin` into a temporary file. Then, it's possible to read from the file multiple times by using the C functions `fseek` and `rewind`. However, since we're working with files, why not just work directly with the files created by the user? Instead of reading from `stdin`, we may as well take in a path to a file via `argv`, and read from there. Also, instead of `fseek` and `rewind`, we can just read the file into memory, and access it like a normal character buffer.

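The older parse_driver.cpp in this compare takes exactly this route: it opens the file the user named and, by the end of the parse, holds the whole text in memory:

```C++
// Excerpt from the (old-side) parse_driver::run_parse() shown earlier.
FILE* stream = fopen(file_name.c_str(), "r");
if(!stream) return false;
// ... the tokenizer copies each matched token's text into string_stream
//     via drv.write(yytext, yyleng) ...
file_contents = string_stream.str();
```
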
To address the second issue, we can keep a mapping of line numbers to their locations in the source buffer. This is rather easy to maintain using an array: the first element of the array is 0, since the first line of any source file begins at offset 0. From there, every time we encounter the character `\n`, we can push the current source offset onto the array, marking it as the beginning of another line. Where exactly we store this array is as yet unclear, since we're trying to avoid global variables.

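This is exactly the mechanism visible on the older side of this compare: the scanner's newline rule calls `mark_line()`, which records the current offset as the start of the next line:

```C++
// scanner.l (old side): bump the location and record a new line start.
\n { drv.location.lines(); drv.mark_line(); }

// parse_driver.cpp (old side): the offset just past the newline begins the next line.
void parse_driver::mark_line() {
    line_offsets.push_back(file_offset);
}
```
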
Finally, to begin addressing the third issue, we can use Flex's `reentrant` option, which makes it so that all of the tokenizer's state is stored in an opaque `yyscan_t` structure, rather than in global variables. This way, we can configure `yyin` without setting a global variable, which is a step in the right direction. We'll work on this momentarily.

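With `%option reentrant` (visible at the top of the scanner.l hunk above), every scanner function takes the `yyscan_t` handle explicitly. The older `run_parse()` in this compare shows the resulting setup and teardown:

```C++
// Excerpt from the old-side parse_driver::run_parse().
yyscan_t scanner;
yylex_init(&scanner);        // all tokenizer state lives in `scanner`, not in globals
yyset_in(stream, scanner);   // point the scanner at our FILE* without touching a global yyin
yy::parser parser(scanner, *this);
parser();
yylex_destroy(scanner);
```
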
Our tokenizing and parsing stack has more global variables than just those specific to Flex. Among these variables is `global_defs`, which receives all the top-level function and data type definitions. We will also need some way of accessing the `yy::location` instance, and a way of storing our file input in memory. Fortunately, we're not the only ones to have ever come across the issue of creating non-global state: the Bison documentation has a [section in its C++ guide](https://www.gnu.org/software/bison/manual/html_node/Calc_002b_002b-Parsing-Driver.html) that describes a technique for manipulating state -- "parsing context", in their words. This technique involves the creation of a _parsing driver_.

The parsing driver is a class (or struct) that holds all the parse-related state. We can arrange for this class to be available to our tokenizing and parsing functions, which will allow us to use it pretty much like we'd use a global variable. We can define it as follows:

{{< codelines "C++" "compiler/13/parse_driver.hpp" 14 34 >}}
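Those lines correspond to the `parse_driver` struct visible in the parse_driver.hpp hunk above. As a purely hypothetical usage sketch (the compiler's actual `main.cpp` is not shown here):

```C++
#include <iostream>
#include "parse_driver.hpp"

int main(int argc, char** argv) {
    if(argc < 2) {
        std::cerr << "usage: " << argv[0] << " FILE" << std::endl;
        return 1;
    }
    // All parse state -- locations, line offsets, global_defs -- lives in the driver.
    parse_driver driver(argv[1]);
    if(!driver.run_parse()) {
        std::cerr << "failed to open " << argv[1] << std::endl;
        return 1;
    }
    // driver.global_defs now holds the parsed top-level definitions.
    return 0;
}
```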