From d90993a93c9b31f500b6d87b65011117c2eab9b4 Mon Sep 17 00:00:00 2001 From: Danila Fedorin Date: Tue, 8 Oct 2019 23:46:35 -0700 Subject: [PATCH] Implement ast_case::compile for compiler series and reference code --- code/compiler/06/ast.cpp | 44 ++++++++++++ code/compiler/06/instruction.hpp | 7 ++ content/blog/06_compiler_semantics.md | 98 ++++++++++++++++++++------- 3 files changed, 124 insertions(+), 25 deletions(-) diff --git a/code/compiler/06/ast.cpp b/code/compiler/06/ast.cpp index 3954a8a..ad85a09 100644 --- a/code/compiler/06/ast.cpp +++ b/code/compiler/06/ast.cpp @@ -176,7 +176,51 @@ void ast_case::resolve(const type_mgr& mgr) const { } void ast_case::compile(const env_ptr& env, std::vector<instruction_ptr>& into) const { + /* Emits the code for `of`, an eval, then one jump whose branches hold each arm's instructions; throws type_error on a duplicate or non-total pattern. */ type_data* type = dynamic_cast<type_data*>(node_type.get()); /* NOTE(review): null if node_type is not a type_data; dereferenced below unchecked */ + of->compile(env, into); + into.push_back(instruction_ptr(new instruction_eval())); + + instruction_jump* jump_instruction = new instruction_jump(); + into.push_back(instruction_ptr(jump_instruction)); + for(auto& branch : branches) { + std::vector<instruction_ptr> branch_instructions; + pattern_var* vpat; + pattern_constr* cpat; + + if((vpat = dynamic_cast<pattern_var*>(branch->pat.get()))) { + branch->expr->compile(env_ptr(new env_offset(1, env)), branch_instructions); + + for(auto& constr_pair : type->constructors) { + if(jump_instruction->tag_mappings.find(constr_pair.second.tag) != + jump_instruction->tag_mappings.end()) + continue; /* tag already claimed by an earlier branch: skip it, but keep scanning so every remaining constructor is mapped to this catch-all arm */ + + jump_instruction->tag_mappings[constr_pair.second.tag] = + jump_instruction->branches.size(); + } + jump_instruction->branches.push_back(std::move(branch_instructions)); + } else if((cpat = dynamic_cast<pattern_constr*>(branch->pat.get()))) { + branch_instructions.push_back(instruction_ptr(new instruction_split())); + branch->expr->compile(env_ptr(new env_offset(cpat->params.size(), env)), + branch_instructions); + + int new_tag = type->constructors[cpat->constr].tag; + if(jump_instruction->tag_mappings.find(new_tag) != + jump_instruction->tag_mappings.end())
+ throw type_error("technically not a type error: duplicate pattern"); + + jump_instruction->tag_mappings[new_tag] = + jump_instruction->branches.size(); + jump_instruction->branches.push_back(std::move(branch_instructions)); + } + } + + for(auto& constr_pair : type->constructors) { + if(jump_instruction->tag_mappings.find(constr_pair.second.tag) == + jump_instruction->tag_mappings.end()) + throw type_error("non-total pattern"); + } } void pattern_var::print(std::ostream& to) const { diff --git a/code/compiler/06/instruction.hpp b/code/compiler/06/instruction.hpp index 8d846e0..40196e4 100644 --- a/code/compiler/06/instruction.hpp +++ b/code/compiler/06/instruction.hpp @@ -2,6 +2,8 @@ #include #include #include "binop.hpp" +#include +#include struct instruction { virtual ~instruction() = default; @@ -53,6 +55,11 @@ struct instruction_split : public instruction { }; +struct instruction_jump : public instruction { + std::vector> branches; + std::map tag_mappings; +}; + struct instruction_slide : public instruction { int offset; diff --git a/content/blog/06_compiler_semantics.md b/content/blog/06_compiler_semantics.md index 5409853..516c85e 100644 --- a/content/blog/06_compiler_semantics.md +++ b/content/blog/06_compiler_semantics.md @@ -241,16 +241,16 @@ And now, we begin our implementation. Let's start with the easy ones: `ast_int`, `ast_lid` and `ast_uid`. The code for `ast_int` involves just pushing the integer into the stack: -{{< codelines "C++" "compiler/06/ast.cpp" 18 20 >}} +{{< codelines "C++" "compiler/06/ast.cpp" 36 38 >}} The code for `ast_lid` needs to check if the variable is global or local, just like we discussed: -{{< codelines "C++" "compiler/06/ast.cpp" 31 36 >}} +{{< codelines "C++" "compiler/06/ast.cpp" 53 58 >}} We do not have to do this for `ast_uid`: -{{< codelines "C++" "compiler/06/ast.cpp" 47 49 >}} +{{< codelines "C++" "compiler/06/ast.cpp" 73 75 >}} On to `ast_binop`! This is the first time we have to change our environment.
As we said earlier, once we build the right operand on the stack, every offset that we counted @@ -259,14 +259,14 @@ in our compilation scheme for function application). So, we create a new environment with `env_offset`, and use that when we compile the left child: -{{< codelines "C++" "compiler/06/ast.cpp" 72 79 >}} +{{< codelines "C++" "compiler/06/ast.cpp" 103 110 >}} `ast_binop` performs two applications: `(+) lhs rhs`. We push `rhs`, then `lhs`, then `(+)`, and then use MkApp twice. In `ast_app`, we only need to perform one application, `lhs rhs`: -{{< codelines "C++" "compiler/06/ast.cpp" 98 102 >}} +{{< codelines "C++" "compiler/06/ast.cpp" 134 138 >}} Note that we also extend our environment in this one, for the exact same reason as before. @@ -278,14 +278,15 @@ We need to adjust our code to keep track of the tags of the various constructors of a type. To do this, we add a subclass for the `type_base` struct, called `type_data`: -{{< todo >}}Link code{{< /todo >}} +{{< codelines "C++" "compiler/06/type.hpp" 33 42 >}} When we create types from `definition_data`, we tag the corresponding constructors: -{{< todo >}}Link code{{< /todo >}} +{{< codelines "C++" "compiler/06/definition.cpp" 35 51 >}} -Ah, but that doesn't solve the problem. Once we performed type checking, we don't keep -the types that we computed for an AST node in the node. And obviously, we don't want +Ah, but adding constructor info to the type doesn't solve the problem. +Once we performed type checking, we don't keep +the types that we computed for an AST node, in the node. And obviously, we don't want to go looking for them again. Furthermore, we can't just look up a constructor in the environment, since we can well have patterns that don't have __any__ constructors: @@ -296,11 +297,8 @@ match l { ``` So, we want each `ast` node to store its type (well, in practice we only need this for -`ast_case`, but we might as well store it for all nodes). 
We can add it, no problem: - -{{< todo >}}Link code{{< /todo >}} - -Now, we can add another, non-virtual `typecheck` method (let's call it `typecheck_common`, +`ast_case`, but we might as well store it for all nodes). We can add it, no problem. +To add to that, we can add another, non-virtual `typecheck` method (let's call it `typecheck_common`, since naming is hard). This method will call `typecheck`, and store the output into the `node_type` field. @@ -311,7 +309,7 @@ type_ptr typecheck_common(type_mgr& mgr, const type_env& env); And the implementation is as simple as you think: -{{< todo >}}Link code{{< /todo >}} +{{< codelines "C++" "compiler/06/ast.cpp" 9 12 >}} In client code (`definition_defn::typecheck_first` for instance), we should now use `typecheck_common` instead of `typecheck`. With that done, we're almost there. @@ -329,26 +327,76 @@ virtual void resolve(const type_mgr& mgr) const = 0; ``` We also add the `resolve` method to `definition`, so that we can call it -without having to run `dynamic_cast`. The implementation for `resolve_common` +without having to run `dynamic_cast`. The implementation for `ast::resolve_common` just resolves the type: -{{< todo >}}Link code{{< /todo >}} +{{< codelines "C++" "compiler/06/ast.cpp" 14 21 >}} -The virtual `resolve` just calls `resolve_common` on an all `ast` children +The virtual `ast::resolve` just calls `ast::resolve_common` on all `ast` children of a node.
Here's a sample implementation from `ast_binop`: -{{< todo >}}Link code{{< /todo >}} +{{< codelines "C++" "compiler/06/ast.cpp" 98 101 >}} -And here's the implementation of `resolve` on `definition_defn`: +And here's the implementation of `definition::resolve` on `definition_defn`: -{{< todo >}}Link code{{< /todo >}} +{{< codelines "C++" "compiler/06/definition.cpp" 31 33 >}} -Finally, we call `resolve` from inside `typecheck_program` in `main.cpp`: +Finally, we call `resolve` at the end of `typecheck_program` in `main.cpp`: -{{< todo >}}Link code{{< /todo >}} +{{< codelines "C++" "compiler/06/main.cpp" 40 42 >}} -Finally, we're ready to implement the code for compiling `ast_case`. +At last, we're ready to implement the code for compiling `ast_case`. +Here it is, in all its glory: -{{< todo >}}Figure out how to keep all trees not requiring a type manager. {{< /todo >}} +{{< codelines "C++" "compiler/06/ast.cpp" 178 224 >}} + +There's a lot to unpack here. First of all, just like we said in the compilation +scheme, we want to build and evaluate the expression that's being analyzed. +Once that's done, however, things get trickier. We know that each +branch of a case expression will correspond to a vector of instructions - +in fact, our jump instruction contains a mapping from tags to instructions. +As we also discussed above, each list of instructions can be mapped to +by multiple tags. We don't want to recompile the same sequence of instructions +multiple times (or indeed, generate machine code for it). So, we keep +a mapping of tags to their corresponding sequences of instructions. We implement +this by having a vector of vectors of instructions (in which each inner vector +represents the code for a branch), and a map of tag number to index +in the vector containing all the branches. This way, multiple tags +can point to the same instruction sequence without duplicating information. + +We also don't allow a tag to be mapped to more than one sequence of instructions.
+This is handled differently depending on whether a variable pattern or a +constructor pattern is encountered. Variable patterns map all +tags that haven't been mapped yet, so no error can occur. Constructor patterns, +though, can explicitly try to map the same tag twice, and we don't want that. + +I implied in the previous paragraph the implementation of our case expression +compilation algorithm, but let's go through it. Once we've compiled +the expression to be analyzed, and evaluated it (just like in our definitions +above), we proceed to look at all the branches specified in the case expression. + +If a branch has a variable pattern, we must map to the result of the compilation +all the remaining, unmapped tags. We also aren't going to be taking apart +our value, so we don't need to use Split, but we do need to add 1 to the +environment offset to account for the presence of that value. So, +we compile the branch body with that offset, and iterate through +all the constructors of our data type. We skip a constructor +if it's been mapped, and if it hasn't been, we map it to the index +that this branch body will have in our list. Finally, +we push the newly compiled instruction sequence into the list of branch +bodies. + +If a branch has a constructor pattern, on the other hand, we lead our compilation +output with a Split. This takes the value off the stack, but pushes on +all the parameters of the constructor. We account for this by incrementing the +environment with the offset given by the number of arguments (just like we did +in our definitions of our compilation scheme). Before we map the tag, +we ensure that it hasn't already been mapped (and throw an exception, currently +in the form of a type error due to the growing length of this post), +and finally map it and insert the new branch code into the list of branches. + +After we're done with all the branches, we also check for non-exhaustive patterns, +since otherwise we could run into runtime errors.
With this, the case expression, +and the last of the AST nodes, can be compiled. {{< todo >}}Backport bugfix in case's typecheck{{< /todo >}}