Browse Source

Refactor errors and update post draft.

search
Danila Fedorin 1 year ago
parent
commit
6b8d3b0f8a
  1. 12
      code/compiler/13/ast.cpp
  2. 4
      code/compiler/13/definition.cpp
  3. 31
      code/compiler/13/error.cpp
  4. 18
      code/compiler/13/error.hpp
  5. 13
      code/compiler/13/main.cpp
  6. 20
      code/compiler/13/parse_driver.cpp
  7. 9
      code/compiler/13/parse_driver.hpp
  8. 366
      content/blog/13_compiler_cleanup_optimization/index.md

12
code/compiler/13/ast.cpp

@ -235,13 +235,13 @@ struct case_mappings {
std::vector<instruction_ptr>& make_case_for(tag_type tag) {
if(default_case)
throw type_error("attempted pattern match after catch-all");
throw compiler_error("attempted pattern match after catch-all");
return defined_cases[tag];
}
std::vector<instruction_ptr>& make_default_case() {
if(default_case)
throw type_error("attempted repeated use of catch-all");
throw compiler_error("attempted repeated use of catch-all");
default_case.emplace(std::vector<instruction_ptr>());
return *default_case;
}
@ -285,7 +285,7 @@ struct case_strategy_bool {
if(!(cpat = dynamic_cast<pattern_constr*>(pt.get())) ||
(cpat->constr != "True" && cpat->constr != "False") ||
cpat->params.size() != 0)
throw type_error(
throw compiler_error(
"pattern cannot be converted to a boolean",
pt->loc);
return cpat->constr == "True";
@ -335,7 +335,7 @@ struct case_strategy_data {
repr_type repr_from_pattern(const pattern_ptr& pt) {
pattern_constr* cpat;
if(!(cpat = dynamic_cast<pattern_constr*>(pt.get())))
throw type_error(
throw compiler_error(
"pattern cannot be interpreted as constructor.",
pt->loc);
return std::make_pair(
@ -398,7 +398,7 @@ void compile_case(const ast_case& node, const env_ptr& env, const type* type, st
pattern_var* vpat;
if((vpat = dynamic_cast<pattern_var*>(branch->pat.get()))) {
if(cases.defined_cases_count() == strategy.case_count())
throw type_error("redundant catch-all pattern", branch->pat->loc);
throw compiler_error("redundant catch-all pattern", branch->pat->loc);
auto& branch_into = cases.make_default_case();
env_ptr new_env(new env_var(vpat->var, env));
branch->expr->compile(new_env, branch_into);
@ -412,7 +412,7 @@ void compile_case(const ast_case& node, const env_ptr& env, const type* type, st
if(!(cases.defined_cases_count() == strategy.case_count() ||
cases.default_case_defined()))
throw type_error("incomplete patterns", node.loc);
throw compiler_error("incomplete patterns", node.loc);
strategy.into_instructions(cases, into);
}

4
code/compiler/13/definition.cpp

@ -64,9 +64,9 @@ void definition_data::insert_constructors() const {
type_ptr return_type(return_app);
for(auto& var : vars) {
if(var_set.find(var) != var_set.end())
throw std::runtime_error(
throw compiler_error(
std::string("type variable ") +
var + std::string(" used twice in data type definition."));
var + std::string(" used twice in data type definition."), loc);
var_set.insert(var);
return_app->arguments.push_back(type_ptr(new type_var(var)));
}

31
code/compiler/13/error.cpp

@ -1,19 +1,32 @@
#include "error.hpp"
// Returns the fixed, generic message for a compiler_error. The text specific
// to a particular error is kept in `description` and is not part of this string.
const char* compiler_error::what() const noexcept {
return "an error occured while compiling the program";
}
// Writes the one-line summary of this error to `to`: the what() message,
// a ": " separator, then `description`, terminated with std::endl.
// what() is an overridable member, so a subclass that overrides it
// (e.g. type_error) gets its own message printed here.
void compiler_error::print_about(std::ostream& to) {
to << what() << ": ";
to << description << std::endl;
}
// Writes the source location associated with this error to `to`.
// Does nothing when no location was recorded. Otherwise prints the line
// number where the location begins and delegates the printing of the source
// text to drv.print_location, passing `highlight` through unchanged.
void compiler_error::print_location(std::ostream& to, parse_driver& drv, bool highlight) {
if(!loc) return; // this error carries no location; print nothing
to << "occuring on line " << loc->begin.line << ":" << std::endl;
drv.print_location(to, *loc, highlight);
}
// Prints the full report for a generic compiler error: the summary from
// print_about, followed by the location (if any) from print_location.
// No `highlight` argument is passed, so print_location's default applies.
void compiler_error::pretty_print(std::ostream& to, parse_driver& drv) {
print_about(to);
print_location(to, drv);
}
// Returns the fixed, generic message used for errors raised during type checking.
const char* type_error::what() const noexcept {
return "an error occured while checking the types of the program";
}
void type_error::pretty_print(std::ostream& to, parse_driver& drv) {
to << "encountered error while typechecking program: ";
to << description << std::endl;
if(loc) {
to << "occuring on line " << loc->begin.line << ":" << std::endl;
to << std::endl << "```" << std::endl;
drv.print_highlighted_location(to, *loc);
to << "```" << std::endl << std::endl;
}
print_about(to);
print_location(to, drv, true);
}
void unification_error::pretty_print(std::ostream& to, parse_driver& drv, type_mgr& mgr) {

18
code/compiler/13/error.hpp

@ -7,12 +7,26 @@
using maybe_location = std::optional<yy::location>;
struct type_error : std::exception {
struct compiler_error : std::exception {
std::string description;
maybe_location loc;
compiler_error(std::string d, maybe_location l = std::nullopt)
: description(std::move(d)), loc(std::move(l)) {}
const char* what() const noexcept override;
void print_about(std::ostream& to);
void print_location(std::ostream& to, parse_driver& drv, bool highlight = false);
void pretty_print(std::ostream& to, parse_driver& drv);
};
struct type_error : compiler_error {
std::optional<yy::location> loc;
type_error(std::string d, maybe_location l = std::nullopt)
: description(std::move(d)), loc(std::move(l)) {}
: compiler_error(std::move(d), std::move(l)) {}
const char* what() const noexcept override;
void pretty_print(std::ostream& to, parse_driver& drv);

13
code/compiler/13/main.cpp

@ -20,7 +20,7 @@
#include "llvm/Target/TargetMachine.h"
void yy::parser::error(const yy::location& loc, const std::string& msg) {
std::cout << "An error occured: " << msg << std::endl;
std::cerr << "An error occured: " << msg << std::endl;
}
void prelude_types(definition_group& defs, type_env_ptr env) {
@ -110,12 +110,12 @@ void output_llvm(llvm_context& ctx, const std::string& filename) {
std::error_code ec;
llvm::raw_fd_ostream file(filename, ec, llvm::sys::fs::F_None);
if (ec) {
throw std::runtime_error("failed to open object file for writing");
throw compiler_error("failed to open object file for writing");
} else {
llvm::CodeGenFileType type = llvm::CGFT_ObjectFile;
llvm::legacy::PassManager pm;
if (targetMachine->addPassesToEmitFile(pm, file, NULL, type)) {
throw std::runtime_error("failed to add passes to pass manager");
throw compiler_error("failed to add passes to pass manager");
} else {
pm.run(ctx.module);
file.close();
@ -177,10 +177,11 @@ void gen_llvm(global_scope& scope) {
int main(int argc, char** argv) {
if(argc != 2) {
std::cerr << "please enter a file to compile." << std::endl;
exit(1);
}
parse_driver driver(argv[1]);
if(!driver.run_parse()) {
std::cerr << "failed to open file " << argv[1] << std::endl;
std::cerr << "failed to parse file " << argv[1] << std::endl;
exit(1);
}
@ -207,7 +208,7 @@ int main(int argc, char** argv) {
err.pretty_print(std::cerr, driver, mgr);
} catch(type_error& err) {
err.pretty_print(std::cerr, driver);
} catch(std::runtime_error& err) {
std::cerr << err.what() << std::endl;
} catch (compiler_error& err) {
err.pretty_print(std::cerr, driver);
}
}

20
code/compiler/13/parse_driver.cpp

@ -5,8 +5,6 @@
bool parse_driver::run_parse() {
FILE* stream = fopen(file_name.c_str(), "r");
if(!stream) return false;
string_stream = std::ostringstream();
file_offset = 0;
line_offsets.push_back(0);
yyscan_t scanner;
yylex_init(&scanner);
@ -29,27 +27,27 @@ void parse_driver::mark_line() {
}
size_t parse_driver::get_index(int line, int column) {
assert(line > 0);
assert(line <= line_offsets.size());
size_t file_offset = line_offsets[line-1];
file_offset += column - 1;
return file_offset;
assert(line > 0 && line <= line_offsets.size());
return line_offsets[line-1] + column - 1;
}
size_t parse_driver::get_line_end(int line) {
if(line > line_offsets.size()) return file_contents.size();
if(line == line_offsets.size()) return file_contents.size();
return get_index(line+1, 1);
}
void parse_driver::print_highlighted_location(std::ostream& stream, const yy::location& loc) {
void parse_driver::print_location(
std::ostream& stream,
const yy::location& loc,
bool highlight) {
size_t print_start = get_index(loc.begin.line, 1);
size_t highlight_start = get_index(loc.begin.line, loc.begin.column);
size_t highlight_end = get_index(loc.end.line, loc.end.column);
size_t print_end = get_line_end(loc.end.line);
const char* content = file_contents.c_str();
stream.write(content + print_start, highlight_start - print_start);
stream << "\033[4;31m";
if(highlight) stream << "\033[4;31m";
stream.write(content + highlight_start, highlight_end - highlight_start);
stream << "\033[0m";
if(highlight) stream << "\033[0m";
stream.write(content + highlight_end, print_end - highlight_end);
}

9
code/compiler/13/parse_driver.hpp

@ -14,13 +14,13 @@ void scanner_destroy(yyscan_t* scanner);
struct parse_driver {
std::string file_name;
std::ostringstream string_stream;
std::string file_contents;
yy::location location;
size_t file_offset;
std::vector<size_t> line_offsets;
definition_group global_defs;
std::string file_contents;
parse_driver(const std::string& file)
: file_name(file), file_offset(0) {}
@ -30,7 +30,10 @@ struct parse_driver {
void mark_line();
size_t get_index(int line, int column);
size_t get_line_end(int line);
void print_highlighted_location(std::ostream& stream, const yy::location& loc);
void print_location(
std::ostream& stream,
const yy::location& loc,
bool highlight = true);
};
#define YY_DECL yy::parser::symbol_type yylex(yyscan_t yyscanner, parse_driver& drv)

366
content/blog/13_compiler_cleanup_optimization/index.md

@ -62,7 +62,7 @@ automatically assemble the "from" and "to" locations of a nonterminal
from the locations of children, which would be very tedious to write
by hand. We enable this feature using the following option:
{{< codelines "text" "compiler/13/parser.y" 50 50 >}}
{{< codelines "C++" "compiler/13/parser.y" 50 50 >}}
There's just one hitch, though. Sure, Bison can compute bigger
locations from smaller ones, but it must get the smaller ones
@ -143,6 +143,17 @@ from `ast_binop`:
{{< codelines "C++" "compiler/13/ast.hpp" 98 99 >}}
Finally, we tell Bison to pass the computed location
data as an argument when constructing our data structures.
This too is a mechanical change, and I think the following
couple of lines demonstrate the general idea in sufficient
detail:
{{< codelines "C++" "compiler/13/parser.y" 107 110 >}}
Here, the `@$` character is used to reference the current
nonterminal's location data.
#### Line Offsets, File Input, and the Parse Driver
There are three more challenges with printing out the line
of code where an error occurred. First of all, to
@ -202,7 +213,8 @@ will also need some way of accessing the `yy::location` instance, and
a way of storing our file input in memory. Fortunately, we're not
the only ones to have ever come across the issue of creating non-global
state: the Bison documentation has a
[section in its C++ guide](https://www.gnu.org/software/bison/manual/html_node/Calc_002b_002b-Parsing-Driver.html) that describes a technique for manipulating
[section in its C++ guide](https://www.gnu.org/software/bison/manual/html_node/Calc_002b_002b-Parsing-Driver.html)
that describes a technique for manipulating
state -- "parsing context", in their words. This technique involves the
creation of a _parsing driver_.
@ -211,4 +223,352 @@ state. We can arrange for this class to be available to our tokenizing
and parsing functions, which will allow us to use it pretty much like we'd
use a global variable. We can define it as follows:
{{< codelines "C++" "compiler/13/parse_driver.hpp" 14 34 >}}
{{< codelines "C++" "compiler/13/parse_driver.hpp" 14 37 >}}
There are quite a few fields here. The `file_name` string represents
the file that we'll be reading code from. The `string_stream` will
be used to back up the contents of the source file as Flex reads them;
once Flex is done, the content of the `string_stream` will be
saved into the `file_contents` string.
The next three fields deal with tracking source code
locations. The `location` field will be accessed by Flex
via `drv.location` (where `drv` is a reference to our driver class).
The `file_offset` and `line_offsets` fields will be used to
keep track of where each line begins, as we have discussed above.
Finally, `global_defs` will be the new home of our top-level
definitions.
The methods on `parse_driver` are rather simple, too:
* `run_parse` handles the initialization of the tokenizer
and parser, which includes obtaining the `FILE*` and configuring
Flex to use it. It also handles invoking the parsing code.
We'll make this method return `true` if parsing succeeded,
and `false` otherwise (if, say, the file we tried to read doesn't exist).
* `write` will be called from Flex, and will allow us to
record the content of the file we're processing to the `string_stream`.
We've already seen it used in the `YY_USER_ACTION` macro.
* `mark_line` will also be called from Flex, and will mark the current
`file_offset` as the beginning of a line by pushing it into `line_offsets`.
* `get_index` and `get_line_end` will be used for converting
`yy::location` instances to offsets within the source code buffer.
* `print_location` will be used for printing errors.
It will print the lines spanned by the given location, with the
location itself colored and underlined if the last argument is `true`.
This will make our errors easier on the eyes.
Let's take a look at their implementations. First, `run_parse`:
{{< codelines "C++" "compiler/13/parse_driver.cpp" 5 18 >}}
We try to open the user-specified file, and return `false` if we can't.
We then initialize `line_offsets` as we discussed above. After
this, we start doing the setup specific to a reentrant
Flex scanner. We declare a `yyscan_t` variable, which
will contain all of Flex's state. Then, we initialize
it using `yylex_init`. Finally, since we can no longer
touch the `yyin` global variable (it doesn't exist),
we have to resort to using a setter function provided by Flex
to configure the tokenizer's input stream.
Next, we construct our Bison-generated parser. Note that
unlike before, we have to pass in two arguments:
`scanner` and `*this`, the latter being of type `parse_driver&`.
We'll come back to how this works in a moment. With
the scanner and parser initialized, we invoke `parser::operator()`,
which actually runs the Flex- and Bison-generated code.
To clean up, we run `yylex_destroy` and `fclose`. Finally,
we extract the contents of our file into the `file_contents`
string, and return.
Next, the `write` method. For the most part, this method
is a proxy for the `write` method of our `string_stream`:
{{< codelines "C++" "compiler/13/parse_driver.cpp" 20 23 >}}
We do, however, also keep track of the `file_offset` variable
here, which ensures we have up-to-date information
regarding our position in the source file. The implementation
of `mark_line` uses this information:
{{< codelines "C++" "compiler/13/parse_driver.cpp" 25 27 >}}
Once we have the line offsets, `get_index` becomes very simple:
{{< codelines "C++" "compiler/13/parse_driver.cpp" 29 32 >}}
Here, we use an assertion for the first time. Calling
`get_index` with a negative or zero line doesn't make
any sense, since Bison starts tracking line numbers
at 1. Similarly, asking for a line for which we don't
have a recorded offset is invalid. Neither
of these nonsensical calls to `get_index` can
be caused by the user under normal circumstances;
they indicate the method's misuse by the author of
the compiler (us!). Thus, we terminate the program.
Finally, the implementation of `get_line_end` just finds the
beginning of the next line. We stick to the C convention
of marking 'end' indices exclusive (pointing just past
the end of the array):
{{< codelines "C++" "compiler/13/parse_driver.cpp" 34 37 >}}
Since `line_offsets` has as many elements as there are lines,
the last line number would be equal to the vector's size.
When looking up the end of the last line, we can't look for
the beginning of the next line, so instead we return the end of the file.
Next, the `print_location` method prints three sections
of the source file. These are the text "before" the error,
the error itself, and, finally, the text "after" the error.
For example, if an error began on the fifth column of the third
line, and ended on the eighth column of the fourth line, the
"before" section would include the first four columns of the third
line, and the "after" section would be the ninth column onward
on the fourth line. Before and after the error itself,
if the `highlight` argument is true,
we sprinkle the ANSI escape codes to enable and disable
special formatting, respectively. For now, the special
formatting involves underlining the text and making it red.
{{< codelines "C++" "compiler/13/parse_driver.cpp" 39 53 >}}
Finally, to get the forward declarations for the `yy*` functions
and types, we set the `header-file` option in Flex:
{{< codelines "C++" "compiler/13/scanner.l" 3 3 >}}
We also include this `scanner.hpp` file in our `parse_driver.cpp`:
{{< codelines "C++" "compiler/13/parse_driver.cpp" 2 2 >}}
#### Adding the Driver to Flex and Bison
Bison's C++ language template generates a class called
`yy::parser`. We don't really want to modify this class
in any way: not only is it generated code, but it's
also rather complex. Instead, Bison provides us
with a mechanism to pass more data in to the parser.
This data is made available to all the actions
that the parser runs. Better yet, Bison also attempts
to pass this data on to the tokenizer, which in our
case would mean that whatever data we provide Bison
will also be available to Flex. This is how we'll
allow the two components to access our new `parse_driver`
class. This is also how we'll pass in the `yyscan_t`
that Flex now needs to run its tokenizing code. To
do all this, we use Bison's `%param` option. I'm
going to include a few more lines from `parser.y`,
since they contain the necessary `#include` directives
and a required type definition:
{{< codelines "C++" "compiler/13/parser.y" 1 18 >}}
The `%param` option effectively adds the parameter listed
between the curly braces to the constructor of the generated
`yy::parser`. We've already seen this in the implementation
of our driver, where we passed `scanner` and `*this` as
arguments when creating the parser. The parameters we declare are also passed to the
`yylex` function, which is expected to accept them in the same order.
Since we're adding `parse_driver` as an argument we have to
declare it. However, we can't include the `parse_driver` header
right away because `parse_driver` itself includes the `parser` header:
we'd end up with a circular dependency. Instead, we resort to
forward-declaring the driver class, as well as the `yyscan_t`
structure containing Flex's state.
Adding a parameter to Bison doesn't automatically affect
Flex. To let Flex know that its `yylex` function must now accept
the state and the parse driver, we have to define the
`YY_DECL` macro. We do this in `parse_driver.hpp`, since
this forward declaration will be used by both Flex
and Bison:
{{< codelines "C++" "compiler/13/parse_driver.hpp" 39 41 >}}
Finally, we can change our `main.cpp` file to use the
`parse_driver`:
{{< codelines "C++" "compiler/13/main.cpp" 178 186 >}}
#### Improving Exceptions
Now, it's time to add location data (and a little bit more) to our
exceptions. We want to make it possible for exceptions to include
data about where the error occurred, and to print this data to the user.
However, it's also possible for us to have exceptions that simply
do not have that location data. Furthermore, we want to know
whether or not an exception has an associated location; we'd
rather not print an invalid or "default" location when an error
occurs.
In the old days of programming, we could represent the absence
of location data with a `nullptr`, or `NULL`. But not only
does this approach expose us to all kinds of `NULL`-safety
bugs, but it also requires heap allocation! This doesn't
make it sound all that appealing; instead, I think we should
opt for using `std::optional`.
Though `std::optional` is standard (as may be obvious from its
namespace), it's a rather recent addition to the C++ STL.
In order to gain access to it, we need to ensure that our
project is compiled using C++17. To this end, we add
the following two lines to our CMakeLists.txt:
{{< codelines "CMake" "compiler/13/CMakeLists.txt" 5 6 >}}
Now, let's add a new base class for all of our compiler errors,
unsurprisingly called `compiler_error`:
{{< codelines "C++" "compiler/13/error.hpp" 8 23 >}}
We'll put some 'common' exception functionality
into the `print_location` and `print_about` methods. If the error
has an associated location, the former method will print that
location to the screen. We don't always want to highlight
the part of the code that caused the error: for instance,
an invalid data type definition may span several lines,
and coloring that whole section of text red would be
too much. To address this, we add the `highlight`
boolean argument, which can be used to switch the
colors on and off. The `print_about` method
will simply print the `what()` message of the exception,
in addition to the "specific" error that occurred (stored
in `description`). Here are the implementations of the
functions:
{{< codelines "C++" "compiler/13/error.cpp" 3 16 >}}
We will also add a `pretty_print` method to all of
our exceptions. This method will handle
all the exception-specific printing logic.
For the generic compiler error, this means
simply printing out the error text and the location:
{{< codelines "C++" "compiler/13/error.cpp" 18 21 >}}
For `type_error`, this logic slightly changes,
enabling colors when printing the location:
{{< codelines "C++" "compiler/13/error.cpp" 27 30 >}}
Finally, for `unification_error`, we also include
the code to print out the two types that our
compiler could not unify:
{{< codelines "C++" "compiler/13/error.cpp" 32 41 >}}
There's a subtle change here. Compared to the previous
type-printing code (which we had in `main`), what
we wrote here deals with "expected" and "actual" types.
The `left` type passed to the exception is printed
first, and is treated like the "correct" type. The
`right` type, on the other hand, is treated
like the "wrong" type that should have been
unifiable with `left`. This will affect the
calling conventions of our unification code. In
`main`, we remove all our old exception printing code
in favor of calls to `pretty_print`:
{{< codelines "C++" "compiler/13/main.cpp" 207 213 >}}
Now, we can go through and find all the places where
we `throw 0`. One such place was in the data type
definition code, where declaring the same type parameter
twice is invalid. We replace the `0` with a
`compiler_error`:
{{< codelines "C++" "compiler/13/definition.cpp" 66 69 >}}
Not all `throw 0` statements should become exceptions.
For example, here's code from the previous version of
the compiler:
{{< codelines "C++" "compiler/12/definition.cpp" 123 127 >}}
If a definition `def_defn` has a dependency on a "nearby" (declared
in the same group) definition called `dependency`, and if
`dependency` does not exist within the same definition group,
we throw an exception. But this error is impossible
for a user to trigger: the only reason for a variable to appear
in the `nearby_variables` vector is that it was previously
found in the definition group. Here's the code that proves this
(from the current version of the compiler):
{{< codelines "C++" "compiler/13/definition.cpp" 102 106 >}}
Not being able to find the variable in the definition group
is a compiler bug, and should never occur. So, instead
of throwing an exception, we'll use an assertion:
{{< codelines "C++" "compiler/13/definition.cpp" 128 128 >}}
For more complicated error messages, we can use a `stringstream`.
Here's an example from `parsed_type`:
{{< codelines "C++" "compiler/13/parsed_type.cpp" 16 23 >}}
In general, this change is also rather mechanical, but, to
maintain a balance between exceptions and assertions, here
are a couple more assertions from `type_env`:
{{< codelines "C++" "compiler/13/type_env.cpp" 77 78 >}}
Once again, it should not be possible for the compiler
to try to generalize the type of a variable that doesn't
exist, nor should generalization occur twice.
While we're on the topic of types, let's talk about
`type_mgr::unify`. In practice, I suspect that a lot of
errors in our compiler will originate from this method.
However, at present, this method does not in any way
track the locations of where a unification error occurred.
To fix this, we add a new `loc` parameter to `unify`,
which we make optional to allow for unification without
a known location. Here's the declaration:
{{< codelines "C++" "compiler/13/type.hpp" 101 101 >}}
The change to the implementation is mechanical and repetitive,
so instead of showing you the whole method, I'll settle for
a couple of lines:
{{< codelines "C++" "compiler/13/type.cpp" 119 121 >}}
We want to make sure that a location provided to the
top-level call to `unify` is also forwarded to the
recursive calls, so we have to explicitly add it
to the call.
With all of that done, we can finally stand back and
marvel at the results of our hard work. Here is what a
basic unification error looks like now:
{{< figure src="unification_error.png" caption="The result of a unification error." >}}
I used an image to show colors, but here is the content of the error in textual form:
```
an error occured while checking the types of the program: failed to unify types
occuring on line 2:
3 + False
the expected type was:
!Int
while the actual type was:
!Bool
```
The exclamation marks in front of the two types are due to some
changes from section 2. Here's an error that was previously
a `throw 0` statement in our code:
```
an error occured while compiling the program: type variable a used twice in data type definition.
occuring on line 1:
data Pair a a = { MkPair a a }
```
Now, not only have we eliminated the lazy uses of `throw 0` in our
code, but we've also improved the presentation of the errors
to the user!

Loading…
Cancel
Save