Compare commits

No commits in common. "1f6b4bef74ea00a5b6052688a571c4e62ff3072f" and "1f3c42fc4442e8ef94272e96b878a3b590b151c9" have entirely different histories.

1f6b4bef74 ... 1f3c42fc44
CMakeLists.txt
@@ -37,7 +37,6 @@ add_executable(compiler
     instruction.cpp instruction.hpp
     graph.cpp graph.hpp
     global_scope.cpp global_scope.hpp
-    parse_driver.cpp parse_driver.hpp
     ${BISON_parser_OUTPUTS}
     ${FLEX_scanner_OUTPUTS}
     main.cpp
parse_driver.cpp (file removed)
@@ -1,55 +0,0 @@
#include "parse_driver.hpp"
#include "scanner.hpp"
#include <sstream>

bool parse_driver::run_parse() {
    FILE* stream = fopen(file_name.c_str(), "r");
    if(!stream) return false;
    string_stream = std::ostringstream();
    file_offset = 0;
    line_offsets.push_back(0);
    yyscan_t scanner;
    yylex_init(&scanner);
    yyset_in(stream, scanner);
    yy::parser parser(scanner, *this);
    parser();
    yylex_destroy(scanner);
    fclose(stream);
    file_contents = string_stream.str();
    return true;
}

void parse_driver::write(const char* buf, size_t len) {
    string_stream.write(buf, len);
    file_offset += len;
}

void parse_driver::mark_line() {
    line_offsets.push_back(file_offset);
}

size_t parse_driver::get_index(int line, int column) {
    assert(line > 0);
    assert(line <= line_offsets.size());
    size_t file_offset = line_offsets[line-1];
    file_offset += column - 1;
    return file_offset;
}

size_t parse_driver::get_line_end(int line) {
    if(line > line_offsets.size()) return file_contents.size();
    return get_index(line+1, 1);
}

void parse_driver::print_highlighted_location(std::ostream& stream, const yy::location& loc) {
    size_t print_start = get_index(loc.begin.line, 1);
    size_t highlight_start = get_index(loc.begin.line, loc.begin.column);
    size_t highlight_end = get_index(loc.end.line, loc.end.column);
    size_t print_end = get_line_end(loc.end.line);
    const char* content = file_contents.c_str();
    stream.write(content + print_start, highlight_start - print_start);
    stream << "\033[4;31m";
    stream.write(content + highlight_start, highlight_end - highlight_start);
    stream << "\033[0m";
    stream.write(content + highlight_end, print_end - highlight_end);
}
parse_driver.hpp
@@ -13,6 +13,7 @@ void scanner_destroy(yyscan_t* scanner);
 
 struct parse_driver {
     std::string file_name;
+    std::ifstream file_stream;
     std::ostringstream string_stream;
 
     yy::location location;
@@ -20,17 +21,58 @@ struct parse_driver {
 
     std::vector<size_t> line_offsets;
     definition_group global_defs;
-    std::string file_contents;
+    std::string read_file;
 
     parse_driver(const std::string& file)
         : file_name(file), file_offset(0) {}
 
-    bool run_parse();
-    void write(const char* buff, size_t len);
-    void mark_line();
-    size_t get_index(int line, int column);
-    size_t get_line_end(int line);
-    void print_highlighted_location(std::ostream& stream, const yy::location& loc);
+    bool run_parse() {
+        file_stream.open(file_name);
+        if(!file_stream.good()) return false;
+        line_offsets.push_back(0);
+        yyscan_t scanner;
+        scanner_init(this, &scanner);
+        yy::parser parser(scanner, *this);
+        parser();
+        scanner_destroy(&scanner);
+        read_file = string_stream.str();
+        return true;
+    }
+
+    int get() {
+        int new_char = file_stream.get();
+        if(new_char == EOF) return EOF;
+        file_offset++;
+        if(new_char == '\n') line_offsets.push_back(file_offset);
+        string_stream.put(new_char);
+        return new_char;
+    }
+
+    size_t get_index(int line, int column) {
+        assert(line > 0);
+        assert(line <= line_offsets.size());
+        size_t file_offset = line_offsets[line-1];
+        file_offset += column - 1;
+        return file_offset;
+    }
+
+    size_t get_line_end(int line) {
+        if(line > line_offsets.size()) return read_file.size();
+        return get_index(line+1, 1);
+    }
+
+    void print_highlighted_location(std::ostream& stream, const yy::location& loc) {
+        size_t print_start = get_index(loc.begin.line, 1);
+        size_t highlight_start = get_index(loc.begin.line, loc.begin.column);
+        size_t highlight_end = get_index(loc.end.line, loc.end.column);
+        size_t print_end = get_line_end(loc.end.line);
+        const char* content = read_file.c_str();
+        stream.write(content + print_start, highlight_start - print_start);
+        stream << "\033[4;31m";
+        stream.write(content + highlight_start, highlight_end - highlight_start);
+        stream << "\033[0m";
+        stream.write(content + highlight_end, print_end - highlight_end);
+    }
 };
 
 #define YY_DECL yy::parser::symbol_type yylex(yyscan_t yyscanner, parse_driver& drv)
scanner.l
@@ -1,6 +1,5 @@
 %option noyywrap
 %option reentrant
-%option header-file="scanner.hpp"
 
 %{
 #include <iostream>
@@ -9,12 +8,18 @@
 #include "parse_driver.hpp"
 #include "parser.hpp"
 
-#define YY_USER_ACTION drv.write(yytext, yyleng); drv.location.step(); drv.location.columns(yyleng);
+#define YY_EXTRA_TYPE parse_driver*
+#define YY_USER_ACTION drv.location.step(); drv.location.columns(yyleng);
+#define YY_INPUT(buf,result,max_size) \
+    { \
+        int c = yyextra->get(); \
+        result = (c == EOF) ? YY_NULL : (buf[0] = c, 1); \
+    }
 %}
 
 %%
 
-\n { drv.location.lines(); drv.mark_line(); }
+\n { drv.location.lines(); }
 [ ]+ {}
 \\ { return yy::parser::make_BACKSLASH(drv.location); }
 \+ { return yy::parser::make_PLUS(drv.location); }
@@ -44,3 +49,10 @@ in { return yy::parser::make_IN(drv.location); }
 <<EOF>> { return yy::parser::make_YYEOF(drv.location); }
 
 %%
+
+void scanner_init(parse_driver* d, yyscan_t* scanner) {
+    yylex_init_extra(d, scanner);
+}
+void scanner_destroy(yyscan_t* scanner) {
+    yylex_destroy(*scanner);
+}
@@ -1,214 +0,0 @@
---
title: Compiling a Functional Language Using C++, Part 13 - More Improvements
date: 2020-09-10T18:50:02-07:00
tags: ["C and C++", "Functional Languages", "Compilers"]
description: "In this post, we clean up our compiler and add some basic optimizations."
---

In [part 12]({{< relref "12_compiler_let_in_lambda" >}}), we added `let/in` and lambda expressions to our compiler. At the end of that post, I mentioned that before we move on to bigger and better things, I wanted to take a step back and clean up the compiler.

Recently, I got around to doing that. Unfortunately, I also got around to doing a lot more. Furthermore, I managed to make the changes in such a way that I can't cleanly separate the 'cleanup' and 'optimization' portions of my work. This is partially due to the way in which I organize code, where each post is associated with a version of the compiler with the necessary changes. Because of all this, instead of making this post about the cleanup, and the next post about the optimizations, I have to merge them into one.

So, this post is split into two major portions: cleanup, which deals mostly with touching up exceptions and improving the 'name mangling' logic, and optimizations, which deals with adding special treatment to booleans, unboxing integers, and implementing more binary operators.
### Section 1: Cleanup

The previous post was
{{< sidenote "right" "long-note" "rather long," >}}
Probably not as long as this one, though! I really need to get the size of my posts under control.
{{< /sidenote >}} which led me to omit a rather important aspect of the compiler: proper error reporting. Once again our compiler has instances of `throw 0`, which is a cheap way of avoiding properly handling a runtime error. Before we move on, it's best to get rid of such blatantly lazy code.

Our existing exceptions (mostly type errors) can use some work, too. Even the most descriptive issues our compiler reports -- unification errors -- don't include the crucial information of _where_ the error is. For large programs, this means having to painstakingly read through the entire file to try to figure out which subexpression could possibly have an incorrect type. This is far from the ideal debugging experience.

Addressing all this is a multi-step change in itself. We want to:

* Replace all `throw 0` code with actual exceptions.
* Replace some exceptions that shouldn't be possible for a user to trigger with assertions.
* Keep track of source locations of each subexpression, so that we may be able to print it if it causes an error.
* Be able to print out said source locations at will. This isn't a _necessity_, but virtually all "big" compilers do this. Instead of reporting that an error occurs on a particular line, we will actually print the line.

Let's start with gathering the actual location data.

#### Bison's Locations
Bison actually has some rather nice support for location tracking. It can automatically assemble the "from" and "to" locations of a nonterminal from the locations of children, which would be very tedious to write by hand. We enable this feature using the following option:

{{< codelines "text" "compiler/13/parser.y" 50 50 >}}

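The option in question is presumably Bison's `%locations` directive. Once it's enabled, every rule's action can use `@$` -- the location Bison assembles for the whole nonterminal -- as well as `@1`, `@2`, and so on for the children. A rough sketch of what that looks like (the rule and helper names below are purely illustrative, not this compiler's actual grammar):

```text
/* Illustrative fragment only -- `expr` and `make_binop` are stand-ins. */
%locations

%%

expr
    : expr PLUS expr
      /* @$ spans from the start of the first `expr` to the end of the second;
         it can be handed straight to whatever constructs the AST node. */
      { $$ = make_binop($1, $3, @$); }
    ;
```
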
There's just one hitch, though. Sure, Bison can compute bigger locations from smaller ones, but it must get the smaller ones from somewhere. Since Bison operates on _tokens_, rather than _characters_, it effectively doesn't interact with the source text at all, and can't determine from which line or column a token originated. The task of determining the locations of input tokens is delegated to the tokenizer -- Flex, in our case. Flex, on the other hand, doesn't have a built-in mechanism for tracking locations. Fortunately, Bison provides a `yy::location` class that includes most of the needed functionality.

A `yy::location` consists of `begin` and `end` source positions, which themselves are represented using lines and columns. It also has the following methods:

* `yy::location::columns(int)` advances the `end` position by the given number of columns, while `begin` stays the same. If `begin` and `end` both point to the beginning of a token, then `columns(token_length)` will move `end` to the token's end, and thus make the whole `location` contain the token.
* `yy::location::lines(int)` behaves similarly to `columns`, except that it advances `end` by the given number of lines, rather than columns.
* `yy::location::step()` moves `begin` to where `end` is. This is useful for when we've finished processing a token, and want to move on to the next one.

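As a standalone illustration (this snippet is not part of the compiler), here is how those three calls move a location across two adjacent tokens:

```C++
#include "location.hh" // the location/position classes generated by Bison

void location_demo() {
    yy::location loc;   // begin and end both start at line 1, column 1
    loc.columns(3);     // a 3-character token: end moves to column 4, begin stays put
    // ... hand `loc` to the parser together with this token ...
    loc.step();         // begin catches up to end, ready for the next token
    loc.columns(1);     // a 1-character token: loc now covers exactly that character
}
```
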
For Flex specifically, `yyleng` has the length of the token currently being processed. Rather than adding the calls to `columns` and `step` to every rule, we can define the `YY_USER_ACTION` macro, which is run before each token is processed.

{{< codelines "C++" "compiler/13/scanner.l" 12 12 >}}

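The line being referenced is presumably the `YY_USER_ACTION` definition, which appears on the older (left-hand) side of the scanner.l hunk earlier in this compare:

```C++
// scanner.l: run before every rule's action.
#define YY_USER_ACTION drv.write(yytext, yyleng); drv.location.step(); drv.location.columns(yyleng);
```
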
We'll see why we are using `drv` soon; for now, you can treat `location` as if it were a global variable declared in the tokenizer. Before processing each token, we ensure that `location` has its `begin` and `end` at the same position, and then advance `end` by `yyleng` columns. This is sufficient to make `location` represent our token's source position.

So now we have a "global" variable `location` that gives us the source position of the current token. To get it to Bison, we have to pass it as an argument to each of the `make_TOKEN` calls. Here are a few sample lines that should give you the general idea:

{{< codelines "C++" "compiler/13/scanner.l" 41 44 >}}

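Judging by the scanner.l hunk earlier in this compare, the referenced rules look like these, each constructing its token with the current `drv.location`:

```C++
\\      { return yy::parser::make_BACKSLASH(drv.location); }
\+      { return yy::parser::make_PLUS(drv.location); }
<<EOF>> { return yy::parser::make_YYEOF(drv.location); }
```
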
That last line is actually new. Previously, we somehow got away without explicitly sending the EOF token to Bison. I suspect that this was due to some kind of implicit conversion of the Flex macro `YY_NULL` into a token; now that we have to pass a position to every token constructor, such an implicit conversion is probably impossible.

Now we have Bison computing source locations for each nonterminal. However, at the moment, we still aren't using them. To change that, we need to add a `yy::location` argument to each of our `ast` nodes, as well as to the `pattern` subclasses, `definition_defn` and `definition_data`. To avoid breaking all the code that creates AST nodes and definitions outside of the parser, we'll make this argument optional. Inside of `ast.hpp`, we define it as follows:

{{< codelines "C++" "compiler/13/ast.hpp" 16 16 >}}

Then, we add a constructor to `ast` as follows:

{{< codelines "C++" "compiler/13/ast.hpp" 18 18 >}}

Note that the location argument is not given a default value here, since `ast` itself is an abstract class, and thus will never be constructed directly. It is in the subclasses of `ast` that we provide a default value. The change is rather mechanical, but here's an example from `ast_binop`:

{{< codelines "C++" "compiler/13/ast.hpp" 98 99 >}}

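Since `ast.hpp` itself is not part of this compare, the following is only a rough sketch of the shape being described; the names (`loc`, `ast_ptr`, the `ast_binop` members) are guesses for illustration, not the actual code:

```C++
#include <memory>
#include "location.hh"  // yy::location, generated by Bison

struct ast;
using ast_ptr = std::unique_ptr<ast>;  // placeholder alias for illustration

struct ast {
    yy::location loc;

    // No default argument on the abstract base class...
    ast(yy::location l) : loc(std::move(l)) {}
    virtual ~ast() = default;
};

struct ast_binop : ast {
    ast_ptr left, right;

    // ...the default value lives on the concrete subclasses, so code that
    // builds nodes outside the parser keeps compiling unchanged.
    ast_binop(ast_ptr l, ast_ptr r, yy::location loc = yy::location())
        : ast(std::move(loc)), left(std::move(l)), right(std::move(r)) {}
};
```
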
#### Line Offsets, File Input, and the Parse Driver
There are three more challenges with printing out the line of code where an error occurred. First of all, to print out a line of code, we need to have that line of code available to us. We do not currently meet this requirement: our compiler reads code from `stdin` (as is default for Flex), and `stdin` doesn't always support rewinding. This, in turn, means that once Flex has read a character from the input, it may not be possible to go back and retrieve that character again.

Second, even if we do have the entire stream or buffer available to us, to retrieve an offset and length within that buffer from just a line and column number would be a lot of work. A naive approach would be to iterate through the input again, once more keeping track of lines and columns, and print the desired line once we reach it. However, this would lead us to redo a lot of work that our tokenizer is already doing.

Third, Flex's input mechanism, even if it's configured not to read from `stdin`, uses a global file descriptor called `yyin`. However, we're better off minimizing global state (especially if we want to read, parse, and compile multiple files in the future). While we're configuring Flex's input mechanism, we may as well fix this, too.

There are several approaches to fixing the first issue. One possible way is to store the content of `stdin` into a temporary file. Then, it's possible to read from the file multiple times by using the C functions `fseek` and `rewind`. However, since we're working with files, why not just work directly with the files created by the user? Instead of reading from `stdin`, we may as well take in a path to a file via `argv`, and read from there. Also, instead of `fseek` and `rewind`, we can just read the file into memory, and access it like a normal character buffer.

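The older parse_driver.cpp in this compare takes exactly this route: it opens the file the user named and, by the end of the parse, holds the whole text in memory:

```C++
// Excerpt from the (old-side) parse_driver::run_parse() shown earlier.
FILE* stream = fopen(file_name.c_str(), "r");
if(!stream) return false;
// ... the tokenizer copies each matched token's text into string_stream
//     via drv.write(yytext, yyleng) ...
file_contents = string_stream.str();
```
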
To address the second issue, we can keep a mapping of line numbers to their locations in the source buffer. This is rather easy to maintain using an array: the first element of the array is 0, since the first line of any source file begins at offset 0. From there, every time we encounter the character `\n`, we can push the current source offset onto the array, marking it as the beginning of another line. Where exactly we store this array is as yet unclear, since we're trying to avoid global variables.

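This is exactly the mechanism visible on the older side of this compare: the scanner's newline rule calls `mark_line()`, which records the current offset as the start of the next line:

```C++
// scanner.l (old side): bump the location and record a new line start.
\n { drv.location.lines(); drv.mark_line(); }

// parse_driver.cpp (old side): the offset just past the newline begins the next line.
void parse_driver::mark_line() {
    line_offsets.push_back(file_offset);
}
```
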
Finally, to begin addressing the third issue, we can use Flex's `reentrant` option, which makes it so that all of the tokenizer's state is stored in an opaque `yyscan_t` structure, rather than in global variables. This way, we can configure `yyin` without setting a global variable, which is a step in the right direction. We'll work on this momentarily.

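With `%option reentrant` (visible at the top of the scanner.l hunk above), every scanner function takes the `yyscan_t` handle explicitly. The older `run_parse()` in this compare shows the resulting setup and teardown:

```C++
// Excerpt from the old-side parse_driver::run_parse().
yyscan_t scanner;
yylex_init(&scanner);        // all tokenizer state lives in `scanner`, not in globals
yyset_in(stream, scanner);   // point the scanner at our FILE* without touching a global yyin
yy::parser parser(scanner, *this);
parser();
yylex_destroy(scanner);
```
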
Our tokenizing and parsing stack has more global variables than just those specific to Flex. Among these variables is `global_defs`, which receives all the top-level function and data type definitions. We will also need some way of accessing the `yy::location` instance, and a way of storing our file input in memory. Fortunately, we're not the only ones to have ever come across the issue of creating non-global state: the Bison documentation has a [section in its C++ guide](https://www.gnu.org/software/bison/manual/html_node/Calc_002b_002b-Parsing-Driver.html) that describes a technique for manipulating state -- "parsing context", in their words. This technique involves the creation of a _parsing driver_.

The parsing driver is a class (or struct) that holds all the parse-related state. We can arrange for this class to be available to our tokenizing and parsing functions, which will allow us to use it pretty much like we'd use a global variable. We can define it as follows:

{{< codelines "C++" "compiler/13/parse_driver.hpp" 14 34 >}}
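Those lines correspond to the `parse_driver` struct visible in the parse_driver.hpp hunk above. As a purely hypothetical usage sketch (the compiler's actual `main.cpp` is not shown here):

```C++
#include <iostream>
#include "parse_driver.hpp"

int main(int argc, char** argv) {
    if(argc < 2) {
        std::cerr << "usage: " << argv[0] << " FILE" << std::endl;
        return 1;
    }
    // All parse state -- locations, line offsets, global_defs -- lives in the driver.
    parse_driver driver(argv[1]);
    if(!driver.run_parse()) {
        std::cerr << "failed to open " << argv[1] << std::endl;
        return 1;
    }
    // driver.global_defs now holds the parsed top-level definitions.
    return 0;
}
```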