---
title: Compiling a Functional Language Using C++, Part 13 - More Improvements
date: 2020-09-10T18:50:02-07:00
tags: ["C and C++", "Functional Languages", "Compilers"]
description: "In this post, we clean up our compiler and add some basic optimizations."
---

In [part 12]({{< relref "12_compiler_let_in_lambda" >}}), we added `let/in`
and lambda expressions to our compiler. At the end of that post, I mentioned
that before we move on to bigger and better things, I wanted to take a
step back and clean up the compiler.

Recently, I got around to doing that. Unfortunately, I also got around to doing
a lot more. Furthermore, I managed to make the changes in such a way that I
can't cleanly separate the 'cleanup' and 'optimization' portions of my work.
This is partially due to the way in which I organize code, where each post
is associated with a version of the compiler with the necessary changes.
Because of all this, instead of making this post about the cleanup, and the
next post about the optimizations, I have to merge them into one.

So, this post is split into two major portions: cleanup, which deals mostly
with touching up exceptions and improving the 'name mangling' logic, and
optimizations, which deals with adding special treatment to booleans,
unboxing integers, and implementing more binary operators.

### Section 1: Cleanup

The previous post was
{{< sidenote "right" "long-note" "rather long," >}}
Probably not as long as this one, though! I really need to get the
size of my posts under control.
{{< /sidenote >}} which led me to omit
a rather important aspect of the compiler: proper error reporting.
Once again our compiler has instances of `throw 0`, which is a cheap way
of avoiding properly handling a runtime error. Before we move on,
it's best to get rid of such blatantly lazy code.

Our existing exceptions (mostly type errors) can use some work, too.
Even the most descriptive issues our compiler reports -- unification errors --
don't include the crucial information of _where_ the error is. For large
programs, this means having to painstakingly read through the entire file
to try to figure out which subexpression could possibly have an incorrect type.
This is far from the ideal debugging experience.

Addressing all this is a multi-step change in itself. We want to:

* Replace all `throw 0` code with actual exceptions (one possible shape
  is sketched just after this list).
* Replace some exceptions that shouldn't be possible for a user to trigger
  with assertions.
* Keep track of the source location of each subexpression, so that we may
  be able to print it if it causes an error.
* Be able to print out said source locations at will. This isn't
  a _necessity_, but virtually all "big" compilers do this. Instead
  of reporting that an error occurs on a particular line, we will
  actually print the line.

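To make the first item concrete, here is a minimal sketch of the kind of
exception that could replace `throw 0`; the class name and message are
placeholders rather than the compiler's actual error hierarchy:

```cpp
// Hypothetical example only: a named exception type beats `throw 0`,
// because the catch site can report *what* went wrong.
#include <stdexcept>
#include <string>

struct compiler_error : std::runtime_error {
    explicit compiler_error(const std::string& message)
        : std::runtime_error("compiler error: " + message) {}
};

// Instead of `throw 0;` at an invalid state, we can then write:
//     throw compiler_error("unexpected number of constructor arguments");
```
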
Let's start with gathering the actual location data.

#### Bison's Locations

Bison actually has some rather nice support for location tracking. It can
automatically assemble the "from" and "to" locations of a nonterminal
from the locations of its children, which would be very tedious to write
by hand. We enable this feature using the following option:

{{< codelines "text" "compiler/13/parser.y" 50 50 >}}

There's just one hitch, though. Sure, Bison can compute bigger
locations from smaller ones, but it must get the smaller ones
from somewhere. Since Bison operates on _tokens_, rather
than _characters_, it effectively doesn't interact with the source
text at all, and can't determine from which line or column a token
originated. The task of determining the locations of input tokens
is delegated to the tokenizer -- Flex, in our case. Flex, on the
other hand, doesn't have a built-in mechanism for tracking
locations. Fortunately, Bison provides a `yy::location` class that
includes most of the needed functionality.

A `yy::location` consists of `begin` and `end` source positions,
which themselves are represented using lines and columns. It
also has the following methods:

* `yy::location::columns(int)` advances the `end` position by
  the given number of columns, while `begin` stays the same.
  If `begin` and `end` both point to the beginning of a token,
  then `columns(token_length)` will move `end` to the token's end,
  and thus make the whole `location` contain the token.
* `yy::location::lines(int)` behaves similarly to `columns`,
  except that it advances `end` by the given number of lines,
  rather than columns.
* `yy::location::step()` moves `begin` to where `end` is. This
  is useful for when we've finished processing a token and want
  to move on to the next one.

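To get a feel for how these methods interact, here is a small standalone
walkthrough; it assumes the `location.hh` header that Bison generates when
locations are enabled, and the specific token lengths are made up:

```cpp
#include <iostream>
// Generated by Bison alongside the parser; the exact name can vary
// with configuration.
#include "location.hh"

int main() {
    yy::location loc;           // begin = end = line 1, column 1
    loc.columns(3);             // a 3-character token: end moves to column 4
    std::cout << loc << "\n";   // prints something like "1.1-3", the token's span
    loc.step();                 // done with the token: begin catches up to end
    loc.lines(1);               // a newline: end moves to line 2, column 1
    loc.step();                 // ready for the next token
    std::cout << loc << "\n";   // now a single point, something like "2.1"
    return 0;
}
```
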
For Flex specifically, `yyleng` has the length of the token
currently being processed. Rather than adding the calls
to `columns` and `step` to every rule, we can define the
`YY_USER_ACTION` macro, which is run before each token
is processed.

{{< codelines "C++" "compiler/13/scanner.l" 12 12 >}}

We'll see why we are using `drv` soon; for now, you can treat
`location` as if it were a global variable declared in the
tokenizer. Before processing each token, we ensure that
`location` has its `begin` and `end` at the same position,
and then advance `end` by `yyleng` columns. This is sufficient
to make `location` represent our token's source position.

So now we have a "global" variable `location` that gives
us the source position of the current token. To get it
to Bison, we have to pass it as an argument to each
of the `make_TOKEN` calls. Here are a few sample lines
that should give you the general idea:

{{< codelines "C++" "compiler/13/scanner.l" 41 44 >}}

That last line is actually new. Previously, we somehow
got away without explicitly sending the EOF token to Bison.
I suspect that this was due to some kind of implicit conversion
of the Flex macro `YY_NULL` into a token; now that we have
to pass a position to every token constructor, such an implicit
conversion is probably impossible.

Now we have Bison computing source locations for each nonterminal.
However, at the moment, we still aren't using them. To change that,
we need to add a `yy::location` argument to each of our `ast` nodes,
as well as to the `pattern` subclasses, `definition_defn` and
`definition_data`. To avoid breaking all the code that creates
AST nodes and definitions outside of the parser, we'll make this
argument optional. Inside of `ast.hpp`, we define it as follows:

{{< codelines "C++" "compiler/13/ast.hpp" 16 16 >}}

Then, we add a constructor to `ast` as follows:

{{< codelines "C++" "compiler/13/ast.hpp" 18 18 >}}

Note that the location argument doesn't have a default value here, since
`ast` itself is an abstract class, and thus will never be constructed
directly. It is in the subclasses of `ast` that we provide a default
value. The change is rather mechanical, but here's an example
from `ast_binop`:

{{< codelines "C++" "compiler/13/ast.hpp" 98 99 >}}

#### Line Offsets, File Input, and the Parse Driver

There are three more challenges with printing out the line
of code where an error occurred. First of all, to
print out a line of code, we need to have that line of code
available to us. We do not currently meet this requirement:
our compiler reads code from `stdin` (as is default for Flex),
and `stdin` doesn't always support rewinding. This, in turn,
means that once Flex has read a character from the input,
it may not be possible to go back and retrieve that character
again.

Second, even if we do have the entire stream or buffer
available to us, to retrieve an offset and length within
that buffer from just a line and column number would be a lot
of work. A naive approach would be to iterate through
the input again, once more keeping track of lines and columns,
and print the desired line once we reach it. However, this
would lead us to redo a lot of work that our tokenizer
is already doing.

Third, Flex's input mechanism, even if it's configured
not to read from `stdin`, uses a global file descriptor called
`yyin`. However, we're better off minimizing global state (especially
if we want to read, parse, and compile multiple files in
the future). While we're configuring Flex's input mechanism,
we may as well fix this, too.

There are several approaches to fixing the first issue. One possible
way is to store the content of `stdin` into a temporary file. Then,
it's possible to read from the file multiple times by using
the C functions `fseek` and `rewind`. However, since we're
working with files, why not just work directly with the files
created by the user? Instead of reading from `stdin`, we may
as well take in a path to a file via `argv`, and read from there.
Also, instead of `fseek` and `rewind`, we can just read the file
into memory, and access it like a normal character buffer.

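As a rough sketch of that last idea (not the driver's final code;
`read_file` is just an illustrative helper), reading a whole file into a
string buffer can look like this:

```cpp
// Minimal sketch: slurp a source file into memory so we can revisit
// any line later. Error handling is omitted.
#include <fstream>
#include <sstream>
#include <string>

std::string read_file(const std::string& path) {
    std::ifstream input(path);
    std::stringstream buffer;
    buffer << input.rdbuf();   // copy the entire file into the buffer
    return buffer.str();       // the whole program text, newlines included
}
```
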
To address the second issue, we can keep a mapping of line numbers
to their locations in the source buffer. This is rather easy to
maintain using an array: the first element of the array is 0,
which is the beginning of the first line of any source file. From there,
every time we encounter the character `\n`, we can push
the current source position onto the array, marking it as
the beginning of another line. Where exactly we store this
array is as yet unclear, since we're trying to avoid global variables.

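In code, that bookkeeping can be as small as the sketch below. The
`line_index` name and its methods are hypothetical helpers for
illustration; in the compiler itself this state presumably ends up inside
the parse driver introduced later in this section.

```cpp
#include <cstddef>
#include <string>
#include <vector>

// Hypothetical holder for the offsets at which each line begins.
struct line_index {
    std::vector<size_t> offsets{0};   // line 1 always starts at offset 0

    // Called whenever the tokenizer sees a '\n' at byte `pos` of the source.
    void mark_newline(size_t pos) { offsets.push_back(pos + 1); }

    // Retrieve the (1-indexed) line for error reporting.
    std::string line(const std::string& source, size_t number) const {
        size_t begin = offsets[number - 1];
        size_t end = (number < offsets.size()) ? offsets[number] : source.size();
        return source.substr(begin, end - begin);
    }
};
```
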
Finally, to begin addressing the third issue, we can use Flex's `reentrant`
option, which makes it so that all of the tokenizer's state is stored in an
opaque `yyscan_t` structure, rather than in global variables. This way,
we can configure `yyin` without setting a global variable, which is a step
in the right direction. We'll work on this momentarily.

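In the meantime, here is roughly what the reentrant lifecycle looks like in
isolation. This is a sketch assuming Flex's default `yy` prefix; the header
name is a placeholder, and how the scanner ultimately gets wired up to the
parser and to its input is glossed over here:

```cpp
#include <cstdio>
// Header generated by Flex when `%option reentrant` is enabled;
// the actual name depends on the build configuration.
#include "scanner.hh"

void tokenize_file(const char* path) {
    yyscan_t scanner;            // all tokenizer state lives in here
    yylex_init(&scanner);        // instead of relying on globals
    FILE* input = std::fopen(path, "r");
    yyset_in(input, scanner);    // configure input without touching a global yyin
    // ... run the tokenizer/parser with `scanner` here ...
    std::fclose(input);
    yylex_destroy(scanner);      // release the tokenizer state
}
```
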
Our tokenizing and parsing stack has more global variables
than just those specific to Flex. Among these variables is `global_defs`,
which receives all the top-level function and data type definitions. We
will also need some way of accessing the `yy::location` instance, and
a way of storing our file input in memory. Fortunately, we're not
the only ones to have ever come across the issue of creating non-global
state: the Bison documentation has a
[section in its C++ guide](https://www.gnu.org/software/bison/manual/html_node/Calc_002b_002b-Parsing-Driver.html) that describes a technique for manipulating
state -- "parsing context", in their words. This technique involves the
creation of a _parsing driver_.

The parsing driver is a class (or struct) that holds all the parse-related
state. We can arrange for this class to be available to our tokenizing
and parsing functions, which will allow us to use it pretty much like we'd
use a global variable. We can define it as follows:

{{< codelines "C++" "compiler/13/parse_driver.hpp" 14 34 >}}