Finish compiler series
This commit is contained in:
		
							parent
							
								
									e5d01a4e19
								
							
						
					
					
						commit
						e7f0ccfa16
					
				| @ -20,7 +20,9 @@ void llvm_context::create_types() { | ||||
| 
 | ||||
|     gmachine_type->setBody( | ||||
|             stack_ptr_type, | ||||
|             node_ptr_type | ||||
|             node_ptr_type, | ||||
|             IntegerType::getInt64Ty(ctx), | ||||
|             IntegerType::getInt64Ty(ctx) | ||||
|     ); | ||||
|     struct_types.at("node_base")->setBody( | ||||
|             IntegerType::getInt32Ty(ctx), | ||||
| @ -212,11 +214,6 @@ Value* llvm_context::create_track(Function* f, Value* v) { | ||||
|     return builder.CreateCall(track_f, { f->arg_begin(), v }); | ||||
| } | ||||
| 
 | ||||
| Value* llvm_context::create_eval(Value* e) { | ||||
|     auto eval_f = functions.at("eval"); | ||||
|     return builder.CreateCall(eval_f, { e }); | ||||
| } | ||||
| 
 | ||||
| void llvm_context::create_unwind(Function* f) { | ||||
|     auto unwind_f = functions.at("unwind"); | ||||
|     builder.CreateCall(unwind_f, { f->args().begin() }); | ||||
|  | ||||
| @ -55,7 +55,6 @@ struct llvm_context { | ||||
|     void create_alloc(llvm::Function*, llvm::Value*); | ||||
|     llvm::Value* create_track(llvm::Function*, llvm::Value*); | ||||
| 
 | ||||
|     llvm::Value* create_eval(llvm::Value*); | ||||
|     void create_unwind(llvm::Function*); | ||||
| 
 | ||||
|     llvm::Value* unwrap_gmachine_stack_ptr(llvm::Value*); | ||||
|  | ||||
| @ -170,14 +170,16 @@ void gmachine_split(struct gmachine* g, size_t n) { | ||||
| 
 | ||||
| struct node_base* gmachine_track(struct gmachine* g, struct node_base* b) { | ||||
|     g->gc_node_count++; | ||||
|     if(g->gc_node_count >= g->gc_node_threshold) { | ||||
|         gc_visit_node(b); | ||||
|         gmachine_gc(g); | ||||
|         g->gc_node_threshold *= 2; | ||||
|     } | ||||
| 
 | ||||
|     b->gc_next = g->gc_nodes; | ||||
|     g->gc_nodes = b; | ||||
| 
 | ||||
|     if(g->gc_node_count >= g->gc_node_threshold) { | ||||
|         uint64_t nodes_before = g->gc_node_count; | ||||
|         gc_visit_node(b); | ||||
|         gmachine_gc(g); | ||||
|         g->gc_node_threshold = g->gc_node_count * 2; | ||||
|     } | ||||
| 
 | ||||
|     return b; | ||||
| } | ||||
| 
 | ||||
|  | ||||
| @ -574,5 +574,6 @@ In the next several posts, we will improve | ||||
| our compiler to properly free unused memory | ||||
| usign a __garbage collector__, implement | ||||
| lambda functions using __lambda lifting__, | ||||
| and use our Alloc instruction to implement `let/in` expressions. See | ||||
| you there! | ||||
| and use our Alloc instruction to implement `let/in` expressions. | ||||
| We get started on the first of these tasks in | ||||
| [Part 9 - Garbage Collection]({{< relref "09_compiler_garbage_collection.md" >}}). | ||||
|  | ||||
							
								
								
									
										559
									
								
								content/blog/09_compiler_garbage_collection.md
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										559
									
								
								content/blog/09_compiler_garbage_collection.md
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,559 @@ | ||||
| --- | ||||
| title: Compiling a Functional Language Using C++, Part 9 - Garbage Collection | ||||
| date: 2020-01-27T20:35:15-08:00 | ||||
| tags: ["C and C++", "Functional Languages", "Compilers"] | ||||
| draft: true | ||||
| --- | ||||
| 
 | ||||
| > "When will you learn? When will you learn that __your actions have consequences?__" | ||||
| 
 | ||||
| So far, we've entirely ignored the problem of memory management. Every time | ||||
| that we need a new node for our growing graph, we simply ask for more memory | ||||
| from the runtime with `malloc`. But selfishly, even when we no longer require | ||||
| the memory allocated for a particular node, when that node is no longer in use, | ||||
| we do not `free` it. In fact, our runtime currently has no idea about | ||||
| which nodes are needed and which ones are ready to be discarded. | ||||
| 
 | ||||
| To convince ourselves that this is a problem, let's first assess the extent of the damage. | ||||
| Consider the program from `works3.txt`: | ||||
| 
 | ||||
| {{< rawblock "compiler/09/examples/works3.txt" >}} | ||||
| 
 | ||||
| Compiling and running this program through `valgrind`, we get the following output: | ||||
| 
 | ||||
| ``` | ||||
| ==XXXX== LEAK SUMMARY: | ||||
| ==XXXX==    definitely lost: 288 bytes in 12 blocks | ||||
| ==XXXX==    indirectly lost: 768 bytes in 34 blocks | ||||
| ==XXXX==      possibly lost: 0 bytes in 0 blocks | ||||
| ==XXXX==    still reachable: 0 bytes in 0 blocks | ||||
| ==XXXX==         suppressed: 0 bytes in 0 blocks | ||||
| ``` | ||||
| 
 | ||||
| We lost 1056 bytes of memory, just to return the length of a list | ||||
| with 3 elements. The problem of leaking memory is very real.  | ||||
| 
 | ||||
| How do we solve this issue? We can't embed memory management into our language; | ||||
| We want to keep it pure, and managing memory is typically pretty far from | ||||
| that goal. Instead, we will make our runtime do the work of freeing memory. | ||||
| Even then, this is a nontrivial goal: our runtime manipulates graphs, each | ||||
| of which can be combined with others in arbitrary ways. In general, there | ||||
| will not always be a _single_ node that, when freed, will guarantee that | ||||
| another node can be freed as well. Instead, it's very possible in our | ||||
| graphs that two parent nodes both refer to a third, and only when both | ||||
| parents are freed can we free that third node itself. Consider, | ||||
| for instance, the function `square` as follows: | ||||
| 
 | ||||
| ``` | ||||
| defn square x = { | ||||
|     x * x | ||||
| } | ||||
| ``` | ||||
| 
 | ||||
| This function will receive, on top of the stack, a single graph representing `x`. | ||||
| It will then create two applications of a global `(+)` function, each time | ||||
| to the graph of `x`. Thus, it will construct a tree with two `App` nodes, both | ||||
| of which | ||||
| {{< sidenote "right" "lazy-note" "must keep track of a reference to x.">}} | ||||
| We later take advantage of this, by replacing the graph of <code>x</code> with the | ||||
| result of evaluating it. Since both <code>App</code> nodes point to the same | ||||
| graph, when we evaluate it once, each node observes this update, and is not | ||||
| required to evaluate <code>x</code> again. With this, we achieve lazy evaluation. | ||||
| {{< /sidenote >}} The runtime will have to wait until both `App` nodes | ||||
| are freed before it can free the graph of `x`. | ||||
| 
 | ||||
| This seems simple enough! If there are multiple things that may reference a node | ||||
| in the graph, why don't we just keep track of how many there are? Once we know | ||||
| that no more things are still referencing a node, we can free it. This is | ||||
| called [reference counting](https://en.wikipedia.org/wiki/Reference_counting). | ||||
| Reference counting is a valid technique, but unfortunately, it will not suit us. | ||||
| The reason for this is that our language may produce | ||||
| [cyclic graphs](https://en.wikipedia.org/wiki/Cycle_(graph_theory)). Consider, | ||||
| for example, this definition of an infinite list of the number 1: | ||||
| 
 | ||||
| ``` | ||||
| defn ones = { Cons 1 ones } | ||||
| ``` | ||||
| 
 | ||||
| Envisioning the graph of the tree, we can see `ones` as an application | ||||
| of the constructor `Cons` to two arguments, one of which is `ones` again. | ||||
| {{< sidenote "right" "recursive-note" "It refers to itself!" >}} | ||||
| Things are actually more complicated than this. In our current language, | ||||
| recursive definitions are only possible in function definitions (like | ||||
| <code>ones</code>). In our runtime, each time there is a reference | ||||
| to a function, this is done through a <em>new node</em>, which | ||||
| means that functions with recursive definitions are <em>not</em> represented cyclically. | ||||
| Therefore, reference counting would work. However, in the future, | ||||
| our language will have more ways of creating circular definitions, | ||||
| some of which will indeed create cycles in our graphs. So, to | ||||
| prepare for this, we will avoid the use of reference counting. | ||||
| {{< /sidenote >}} In this case, when we compute the number of nodes | ||||
| that require `ones`, we will always find the number to be at least 1: `ones` | ||||
| needs `ones`, which needs `ones`, and so on. It will not be possible for | ||||
| us to free `ones`, then, by simply counting the number of references to it. | ||||
| 
 | ||||
| There's a more powerful technique than reference counting for freeing | ||||
| unused memory: __mark-and-sweep garbage collection__. This technique | ||||
| is conceptually pretty simple to grasp, yet will allow us to handle | ||||
| cycles in our graphs. Unsurprisingly, we implement this type | ||||
| of garbage collection in two stages: | ||||
| 
 | ||||
| 1. __Mark__: We go through every node that is still needed by | ||||
| the runtime, and recursively mark is, its children, and so on as "to keep". | ||||
| 2. __Sweep__: We go through every node we haven't yet freed, and, | ||||
| if it hasn't been marked as "to keep", we free it. | ||||
| 
 | ||||
| This also seems simple enough. There are two main things for us | ||||
| to figure out: | ||||
| 
 | ||||
| 1. For __Mark__, what are the "nodes still needed by the runtime"? | ||||
| These are just the nodes on the various G-machine stacks. If | ||||
| a node is not on the stack, nor is it a child of a node | ||||
| that is on the stack, why should we keep it around? | ||||
| 2. For __Sweep__, how do we keep track of all the nodes we haven't | ||||
| yet freed? In our case, the solution is a global list of allocated | ||||
| nodes, which is updated every time that a node is allocated. | ||||
| 
 | ||||
| Wait a minute, though. Inside of `unwind` in C, we only have | ||||
| a reference to the most recent stack. Our execution model allows | ||||
| for an arbitrary number of stacks: we can keep using `Eval`, | ||||
| placing the current stack on the dump, and starting a new stack | ||||
| from scratch to evaluate a node. How can we traverse these stacks | ||||
| from inside unwind? One solution could be to have each stack | ||||
| point to the "parent" stack. To find all the nodes on the | ||||
| stack, then, we'd start with the current stack, mark all the | ||||
| nodes on it as "required", then move on to the parent stack, | ||||
| rinse and repeat. This is plausible and pretty simple, but | ||||
| there's another way. | ||||
| 
 | ||||
| We clean up after ourselves. | ||||
| 
 | ||||
| ### Towards a Cleaner Stack | ||||
| Simon Peyton Jones wrote his G-machine semantics in a particular way. Every time | ||||
| that a function is called, all it leaves behind on the stack is the graph node | ||||
| that represents the function's output. Our own internal functions, however, have been less | ||||
| careful. Consider, for instance, the "binary operator" function I showed you. | ||||
| Its body is given by the following G-machine instructions: | ||||
| 
 | ||||
| ```C++ | ||||
| instructions.push_back(instruction_ptr(new instruction_push(1))); | ||||
| instructions.push_back(instruction_ptr(new instruction_eval())); | ||||
| instructions.push_back(instruction_ptr(new instruction_push(1))); | ||||
| instructions.push_back(instruction_ptr(new instruction_eval())); | ||||
| instructions.push_back(instruction_ptr(new instruction_binop(op))); | ||||
| ``` | ||||
| 
 | ||||
| When the function is called, there are at least 3 things on the stack: | ||||
| 
 | ||||
| 1. The "outermost" application node, to be replaced with an indirection (to enable laziness). | ||||
| 2. The second argument to the binary operator. | ||||
| 3. The first argument to the binary operator. | ||||
| 
 | ||||
| Then, __Push__ adds another node to the stack, an __Eval__ forces | ||||
| its evaluation (and leaves it on the stack). This happens again with the second argument. | ||||
| Finally, we call __BinOp__, popping two values off the stack and combining them | ||||
| according to the binary operator. This leaves the stack with 4 things: the 3 I described | ||||
| above, and thew newly computed value. This is fine as far as `eval` is concerned: its | ||||
| implementation only asks for the top value on the stack after `unwind` finishes. But | ||||
| for anything more complicated, this is a very bad side effect. We want to leave the | ||||
| stack as clean as we found it - with one node and no garbage. | ||||
| 
 | ||||
| Fortunately, the way we compile functions is a good guide for how we should | ||||
| compile internal operators and constructors. The idea is captured | ||||
| by the two instructions we insert at the end of a user-defined | ||||
| function: | ||||
| 
 | ||||
| {{< codelines "C++" "compiler/09/definition.cpp" 56 57 >}} | ||||
| 
 | ||||
| Once a result is computed, we turn the node that represented the application | ||||
| into an indirection, and point it to the computed result (as I said before, | ||||
| this enables lazy evaluation). We also pop the arguments given to the function | ||||
| off the stack. Let's add these two things to the `gen_llvm_internal_op` function: | ||||
| 
 | ||||
| {{< codelines "C++" "compiler/09/main.cpp" 70 85 >}} | ||||
| 
 | ||||
| Notice, in particular, the `instruction_update(2)` and `instruction_pop(2)` | ||||
| instructions that were recently added. A similar thing has to be done for data | ||||
| type constructors. The difference, though, is that __Pack__ removes the data | ||||
| it packs from the stack, and thus, __Pop__ is not needed: | ||||
| 
 | ||||
| {{< codelines "C++" "compiler/09/definition.cpp" 102 117 >}} | ||||
| 
 | ||||
| With this done, let's run a quick test: let's print the number of things | ||||
| on the stack at the end of an `eval` call (before the stack is freed, | ||||
| of course). We can compare the output of runtime without the fix (`old`) | ||||
| and with the fix (`current`): | ||||
| 
 | ||||
| ``` | ||||
|          current                    old           | ||||
| 
 | ||||
| Current stack size is 0  |  Current stack size: 1 | ||||
| Current stack size is 0  |  Current stack size: 1 | ||||
| Current stack size is 0  |  Current stack size: 1 | ||||
| Current stack size is 0  |  Current stack size: 1 | ||||
| Current stack size is 0  |  Current stack size: 0 | ||||
| Current stack size is 0  |  Current stack size: 0 | ||||
| Current stack size is 0  |  Current stack size: 3 | ||||
| Current stack size is 0  |  Current stack size: 0 | ||||
| Current stack size is 0  |  Current stack size: 3 | ||||
| Current stack size is 0  |  Current stack size: 0 | ||||
| Current stack size is 0  |  Current stack size: 3 | ||||
| Result: 3                |  Result: 3 | ||||
| ``` | ||||
| 
 | ||||
| The stack is now much cleaner! Every time `eval` is called, it starts | ||||
| with one node, and ends with one node (which is then popped). | ||||
| 
 | ||||
| ### One Stack to Rule Them All | ||||
| 
 | ||||
| Wait a minute. If the stack is really always empty at the end, do we really need to construct | ||||
| a new stack every time? | ||||
| {{< sidenote "right" "arity-note" "I think not" >}} | ||||
| There's some nuance to this. While it is true that for the most | ||||
| part, we can get rid of the new stacks in favor of a single | ||||
| one, our runtime will experience a change. The change lies | ||||
| in the Unwind-Global rule, which <em>requires that the | ||||
| stack has as many children as the function needs | ||||
| arguments</em>. Until now, there was no way | ||||
| for this condition to be accidentally satisfied: the function | ||||
| we were unwinding was the only thing on the stack. Now, | ||||
| though, things are different: the function being | ||||
| unwound may share a stack with something else, | ||||
| and just checking the stack size will not be sufficient. | ||||
| <em>I believe</em> that this is not a problem for us, | ||||
| since the compiler will only emit <strong>Eval</strong> | ||||
| instructions for things it knows are data types or numbers, | ||||
| meaning their type is not a partially applied function | ||||
| that is missing arguments. However, this is a nontrivial | ||||
| observation. | ||||
| {{< /sidenote >}}, and Simon Peyton Jones seems to | ||||
| agree. In _Implementing Functional Languages: a tutorial_, he mentions | ||||
| that the dump does not need to be implemented as a real stack of stacks. | ||||
| So let's try this out: instead of starting a new stack using `eval`, | ||||
| let's use an existing one, by just calling `unwind` again. To do so, | ||||
| all we have to do is change our `instruction_eval` instruction. When | ||||
| the G-machine wants something evaluated now, it should just call | ||||
| `unwind` directly! | ||||
| 
 | ||||
| To make this change, we have to make `unwind` available to the | ||||
| compiler. We thus declare it in the `llvm_context.cpp` file: | ||||
| 
 | ||||
| {{< codelines "C++" "compiler/09/llvm_context.cpp" 158 163 >}} | ||||
| 
 | ||||
| And even create a function to construct a call to `unwind` | ||||
| with the following signature: | ||||
| 
 | ||||
| {{< codelines "C++" "compiler/09/llvm_context.hpp" 58 58 >}} | ||||
| 
 | ||||
| We implement it like so: | ||||
| 
 | ||||
| {{< codelines "C++" "compiler/09/llvm_context.cpp" 217 220 >}} | ||||
| 
 | ||||
| Finally, the `instruction_eval::gen_llvm` method simply calls | ||||
| `unwind`: | ||||
| 
 | ||||
| {{< codelines "C++" "compiler/09/instruction.cpp" 157 159 >}} | ||||
| 
 | ||||
| After this change, we only call `eval` from `main`. Furthermore, | ||||
| since `eval` releases all the resources it allocates before | ||||
| returning, we won't be able to | ||||
| {{< sidenote "right" "retrieve-note" "easily retrieve" >}} | ||||
| We were able to do this before, but that's because our | ||||
| runtime didn't free the nodes, <em>ever</em>. Now that | ||||
| it does, returning a node violates that node's lifetime. | ||||
| {{< /sidenote >}}the result of the evaluation from it. | ||||
| Thus, we simply merge `eval` with `main` - combining | ||||
| the printing and the initialization / freeing | ||||
| code. | ||||
| 
 | ||||
| With this, only one stack will be allocated for the entirety of | ||||
| program execution. This doesn't just help us save on memory | ||||
| allocations, but also __solves the problem of marking | ||||
| valid nodes during garbage collection__! Instead of traversing | ||||
| a dump of stacks, we can now simply traverse a single stack; | ||||
| all that we need is in one place. | ||||
| 
 | ||||
| So this takes care, more or less, of the "mark" portion of mark-and-sweep. | ||||
| Using the stack, we can recursively mark the nodes that we need. But | ||||
| what about "sweeping"? How can we possibly know of every node that | ||||
| we've allocated? There's some more bookkeping for us to do. | ||||
| 
 | ||||
| ### It's All Connected | ||||
| There exists a simple technique I've previously seen (and used) | ||||
| for keeping track of all the allocated memory. The technique is | ||||
| to __turn all the allocated nodes into elements of a linked list__. | ||||
| The general process of implementing this proceeds as follows: | ||||
| 
 | ||||
| 1. To each node, add a "next" pointer. | ||||
| 2. Keep a handle to the whole node chain somewhere. | ||||
| 3. Add each newly allocated node to the front of the whole chain. | ||||
| 
 | ||||
| This "somewhere" could be a global variable. However, | ||||
| since we already pass a stack to almost all of our | ||||
| functions, it makes more sense to make the list handle | ||||
| a part of some data structure that will also contain the stack, | ||||
| and pass that around, instead. This keeps all of the G-machine | ||||
| data in one place, and in principle could allow for concurrent | ||||
| execution of more than one G-machine in a single program. Let's | ||||
| call our new data structure `gmachine`: | ||||
| 
 | ||||
| {{< codelines "C++" "compiler/09/runtime.h" 69 74 >}} | ||||
| 
 | ||||
| Here, the `stack` field holds the G-machine stack, | ||||
| and the `gc_nodes` is the handle to the list of all the nodes | ||||
| we've allocated and not yet freed. Don't worry about the `gc_node_count` | ||||
| and `gc_threshold` fields - we'll get to them a little later. | ||||
| 
 | ||||
| This is going to be a significant change. First of all, since | ||||
| the handle won't be global, it can't be accessed from inside the | ||||
| `alloc_*` functions. Instead, we have to make sure to add | ||||
| nodes allocated through `alloc_*` to a G-machine somewhere | ||||
| wherever we call the allocators. To make it easier to add nodes to a G-machine | ||||
| GC handle, let's make a new function, `track`: | ||||
| 
 | ||||
| ```C | ||||
| struct node_base* gmachine_track(struct gmachine*, struct node_base*); | ||||
| ``` | ||||
| 
 | ||||
| This function will add the given node to the G-machine's handle, | ||||
| and return that same node. This way, we can wrap nodes in | ||||
| a call to `gmachine_track`. We will talk about this | ||||
| function's implementation later in the post. | ||||
| 
 | ||||
| This doesn't get us all the way to a working runtime, though: | ||||
| right now, we still pass around `struct stack*` instead of | ||||
| `struct gmachine*` everywhere. However, the whole point | ||||
| of adding the `gmachine` struct was to store more data in it! | ||||
| Surely we need that new data somewhere, and thus, we need to | ||||
| use the `gmachine` struct for _some_ functions. What functions | ||||
| _do_ need a whole `gmachine*`, and which ones only need | ||||
| a `stack*`? | ||||
| 
 | ||||
| 1. {{< sidenote "right" "ownership-note" "Clearly," >}} | ||||
| This might not be clear. Maybe <em>pushing</em> onto a stack will | ||||
| add a node to our GC handle, and so, we need to have access | ||||
| to the handle in <code>stack_push</code>. The underlying | ||||
| question is that of <em>ownership</em>: when we allocate | ||||
| a node, which part of the program does it "belong" to? | ||||
| The "owner" of the node should do the work of managing | ||||
| when to free it or keep it. Since we already agreed to | ||||
| create a <code>gmachine</code> struct to house the GC | ||||
| handle, it makes sense that nodes are owned by the | ||||
| G-machine. Thus, the assumption in functions like | ||||
| <code>stack_push</code> is that the "owner" of the node | ||||
| already took care of allocating and tracking it, and | ||||
| <code>stack_push</code> itself shouldn't bother. | ||||
| {{< /sidenote >}} `stack_push`, `stack_pop`, and similar functions | ||||
| do not require a G-machine. | ||||
| 2. `stack_alloc` and `stack_pack` __do__ need a G-machine, | ||||
| because they must allocate new nodes. Where the nodes | ||||
| are allocated, we should add them to the GC handle. | ||||
| 3. Since they use `stack_alloc` and `stack_pack`, | ||||
| generated functions also need a G-machine. | ||||
| 4. Since `unwind` calls the generated functions, | ||||
| it must also receive a G-machine. | ||||
| 
 | ||||
| As far as stack functions go, we only _need_ to update | ||||
| `stack_alloc` and `stack_pack`. Everything else | ||||
| doesn't require new node allocations, and thus, | ||||
| does not require the GC handle. However, this makes | ||||
| our code rather ugly: we have a set of mostly `stack_*` | ||||
| functions, followed suddenly by two `gmachine_*` functions. | ||||
| In the interest of cleanliness, let's instead do the following: | ||||
| 
 | ||||
| 1. Make all functions associated with G-machine rules (like | ||||
| __Alloc__, __Update__, and so on) require a `gmachine*`. This | ||||
| way, theres a correspondence between our code and the theory. | ||||
| 2. Leave the rest of the functions (`stack_push`, `stack_pop`, | ||||
| etc.) as is. They are not G-machine specific, and don't | ||||
| require a GC handle, so there's no need to touch them. | ||||
| 
 | ||||
| Let's make this change. We end up with the following | ||||
| functions: | ||||
| 
 | ||||
| {{< codelines "C" "compiler/09/runtime.h" 56 84 >}} | ||||
| 
 | ||||
| For the majority of the changed functions, the | ||||
| updates are | ||||
| {{< sidenote "right" "cosmetic-note" "cosmetic." >}} | ||||
| We must also update the LLVM/C++ declarations of | ||||
| the affected functions: many of them now take a | ||||
| <code>gmachine_ptr_type</code> instead of <code>stack_ptr_type</code>. | ||||
| This change is not shown explicitly here (it is hard to do with our | ||||
| growing code base), but it is nonetheless significant. | ||||
| {{< /sidenote >}} The functions | ||||
| that require more significant modifications are `gmachine_alloc` | ||||
| and `gmachine_pack`. In both, we must now make a call to `gmachine_track` | ||||
| to ensure that a newly allocated node will be garbage collected in the future. | ||||
| The updated code for `gmachine_alloc` is: | ||||
| 
 | ||||
| {{< codelines "C" "compiler/09/runtime.c" 140 145 >}} | ||||
| 
 | ||||
| Correspondingly, the updated code for `gmachine_pack` is: | ||||
| 
 | ||||
| {{< codelines "C" "compiler/09/runtime.c" 147 162 >}} | ||||
| 
 | ||||
| Note that we've secretly made one more change. Instead of | ||||
| allocating `sizeof(*data) * n` bytes of memory for | ||||
| the array of packed nodes, we allocate `sizeof(*data) * (n + 1)`, | ||||
| and set the last element to `NULL`. This will allow other | ||||
| functions (which we will soon write) to know how many elements are packed inside | ||||
| a `node_data` (effectively, we've added a `NULL` terminator). | ||||
| 
 | ||||
| We must change our compiler to keep it up to date with this change. Importantly, | ||||
| it must know that a G-machine struct exists. To give it | ||||
| this information, we add a new | ||||
| `llvm::StructType*` called `gmachine_type` to the `llvm_context` class, | ||||
| initialize it in the constructor, and set its body as follows: | ||||
| 
 | ||||
| {{< codelines "C++" "compiler/09/llvm_context.cpp" 21 26 >}} | ||||
| 
 | ||||
| The compiler must also know that generated functions now use the G-machine | ||||
| struct rather than a stack struct: | ||||
| 
 | ||||
| {{< codelines "C++" "compiler/09/llvm_context.cpp" 19 19 >}} | ||||
| 
 | ||||
| Since we still use some functions that require a stack and not a G-machine, | ||||
| we must have a way to get the stack from a G-machine. To do this, | ||||
| we create a new `unwrap` function, which uses LLVM's GEP instruction | ||||
| to get a pointer to the G-machine's stack field: | ||||
| 
 | ||||
| {{< codelines "C++" "compiler/09/llvm_context.cpp" 222 225 >}} | ||||
| 
 | ||||
| We use this function elsewhere, such `llvm_context::create_pop`: | ||||
| 
 | ||||
| {{< codelines "C++" "compiler/09/llvm_context.cpp" 176 179 >}} | ||||
| 
 | ||||
| Finally, we want to make sure our generated functions don't allocate | ||||
| nodes without tracking them with the G-machine. To do so, we modify | ||||
| all the `create_*` methods to require the G-machine function argument, | ||||
| and update the functions themselves to call `gmachine_track`. For | ||||
| example, here's `llvm_context::create_num`: | ||||
| 
 | ||||
| {{< codelines "C++" "compiler/09/llvm_context.cpp" 235 239 >}} | ||||
| 
 | ||||
| Of course, this requires us to add a new `create_track` method | ||||
| to the `llvm_context`: | ||||
| 
 | ||||
| {{< codelines "C++" "compiler/09/llvm_context.cpp" 212 215 >}} | ||||
| 
 | ||||
| This is good. Let's now implement the actual mark-and-sweep algorithm | ||||
| in `gmachine_gc`: | ||||
| 
 | ||||
| {{< codelines "C" "compiler/09/runtime.c" 186 204 >}} | ||||
| 
 | ||||
| In the code above, we first iterate through the stack, | ||||
| calling `gc_visit_node` on every node that we encounter. The | ||||
| assumption is that once `gc_visit_node` is done, every node | ||||
| that _can_ be reached has its `gc_reachable` field set to 1, | ||||
| and all the others have it set to 0.  | ||||
| 
 | ||||
| Once we reach the end of the stack, we continue to the "sweep" phase, | ||||
| iterating through the linked list of nodes (held in the G-machine | ||||
| GC handle `gc_nodes`). For each node, if its `gc_reachable` flag | ||||
| is not set, we remove it from the linked list, and call `free_node_direct` | ||||
| on it. Otherwise (that is, if the flag __is__ set), we clear it, | ||||
| so that the node can potentially be garbage collected in a future | ||||
| invocation of `gmachine_gc`.  | ||||
| 
 | ||||
| `gc_visit_node` recursively marks a node and its children as reachable: | ||||
| 
 | ||||
| {{< codelines "C" "compiler/09/runtime.c" 51 70 >}} | ||||
| 
 | ||||
| This is possible with the `node_data` nodes because of the change we | ||||
| made to the `gmachine_pack` instruction earlier: now, the last element | ||||
| of the "packed" array is `NULL`, telling `gc_visit_node` that it has | ||||
| reached the end of the list of children. | ||||
| 
 | ||||
| `free_node_direct` performs a non-recursive deallocation of all | ||||
| the resources held by a particular node. So far, this is only | ||||
| needed for `node_data` nodes, since the arrays holding their children | ||||
| are dynamically allocated. Thus, the code for the function is | ||||
| pretty simple: | ||||
| 
 | ||||
| {{< codelines "C" "compiler/09/runtime.c" 45 49 >}} | ||||
| 
 | ||||
| ### When to Collect | ||||
| When should we run garbage collection? Initially, I tried | ||||
| running it after every call to `unwind`. However, this | ||||
| quickly proved impractical: the performance of all | ||||
| the programs in the language decreased by a spectacular | ||||
| amount. Programs like `works1.txt` and `works2.txt` | ||||
| would take tens of seconds to complete. | ||||
| 
 | ||||
| Instead of this madness, let's settle for an approach | ||||
| common to many garbage collectors. Let's __perform | ||||
| garbage collection every time the amount of | ||||
| memory we've allocated doubles__. Tracking when the | ||||
| amount of allocated memory doubles is the purpose of | ||||
| the `gc_node_count` and `gc_threshold` fields in the | ||||
| `gmachine` struct. The former field tracks how many | ||||
| nodes have been tracked by the garbage collector, and the | ||||
| latter holds the number of nodes the G-machine must | ||||
| reach before triggering garbage collection. | ||||
| 
 | ||||
| Since the G-machine is made aware of allocations | ||||
| by a call to the `gmachine_track` function, this | ||||
| is where we will attempt to perform garbage collection. | ||||
| We end up with the following code: | ||||
| 
 | ||||
| {{< codelines "C++" "compiler/09/runtime.c" 171 184 >}} | ||||
| 
 | ||||
| When a node is added to the GC handle, we increment the `gc_node_count` | ||||
| field. If the new value of this field exceeds the threshold, | ||||
| we perform garbage collection. There are cases in which | ||||
| this is fairly dangerous: for instance, `gmachine_pack` first | ||||
| moves all packed nodes into an array, then allocates a `node_data` | ||||
| node. This means that for a brief moment, the nodes stored | ||||
| into the new data node are inaccessible from the stack, | ||||
| and thus susceptible to garbage collection. To prevent | ||||
| situations like this, we run `gc_visit_node` on the node | ||||
| being tracked, marking it and its children as "reachable". | ||||
| Finally, we set the next "free" threshold to double | ||||
| the number of currently allocated nodes.  | ||||
| 
 | ||||
| This is about as much as we need to do. The change in this | ||||
| post was a major one, and required updating multiple files. | ||||
| As always, you're welcome to check out [the compiler source | ||||
| code for this post](https://dev.danilafe.com/Web-Projects/blog-static/src/branch/master/code/compiler/09). | ||||
| To wrap up, let's evaluate our change. | ||||
| 
 | ||||
| To especially stress the compiler, I came up with a prime number | ||||
| generator. Since booleans are not in the standard library, and | ||||
| since it isn't possible to pattern match on numbers, my | ||||
| only option was the use Peano encoding. This effectively | ||||
| means that numbers are represented as linked lists, | ||||
| which makes garbage collection all the more | ||||
| important. The program is quite long, but you can | ||||
| [find the entire code here](https://dev.danilafe.com/Web-Projects/blog-static/src/branch/master/code/compiler/09/examples/primes.txt). | ||||
| 
 | ||||
| When I ran the `primes` program compiled using the | ||||
| previous version of the compiler using `time`, I | ||||
| got the following output: | ||||
| 
 | ||||
| ``` | ||||
| Maximum resident set size (kbytes): 935764 | ||||
| Minor (reclaiming a frame) page faults: 233642 | ||||
| ``` | ||||
| 
 | ||||
| In contrast, here is the output of `time` when running | ||||
| the same program compiled with the new version of | ||||
| the compiler: | ||||
| 
 | ||||
| ``` | ||||
| Maximum resident set size (kbytes): 7448 | ||||
| Minor (reclaiming a frame) page faults: 1577 | ||||
| ``` | ||||
| 
 | ||||
| We have reduced maximum memory usage by a factor of | ||||
| 125, and the number of page faults by a factor of 148. | ||||
| That seems pretty good! | ||||
| 
 | ||||
| With this success, we end today's post. As I mentioned | ||||
| before, we're not done. The language is still clunky to use, | ||||
| and can benefit from `let/in` expressions and __lambda functions__. | ||||
| Furthermore, our language is currently monomorphic, and would | ||||
| be much better with __polymorphism__. Finally, to make our language | ||||
| capable of more-than-trivial work, we may want to implement | ||||
| __Input/Output__ and __strings__. I hope to see you in future posts, | ||||
| where we will implement these features! | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user