From 39ebd845aa6259f1ddd3fc16e61205838a4b1341 Mon Sep 17 00:00:00 2001 From: Danila Fedorin Date: Sat, 5 Dec 2020 19:34:25 -0800 Subject: [PATCH] Add changed source files (one seems buggy) --- bpred.c | 1039 +++++++++++ bpred.h | 289 +++ sim-outorder.c | 4637 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 5965 insertions(+) create mode 100644 bpred.c create mode 100644 bpred.h create mode 100644 sim-outorder.c diff --git a/bpred.c b/bpred.c new file mode 100644 index 0000000..d41f827 --- /dev/null +++ b/bpred.c @@ -0,0 +1,1039 @@ +/* bpred.c - branch predictor routines */ + +/* SimpleScalar(TM) Tool Suite + * Copyright (C) 1994-2003 by Todd M. Austin, Ph.D. and SimpleScalar, LLC. + * All Rights Reserved. + * + * THIS IS A LEGAL DOCUMENT, BY USING SIMPLESCALAR, + * YOU ARE AGREEING TO THESE TERMS AND CONDITIONS. + * + * No portion of this work may be used by any commercial entity, or for any + * commercial purpose, without the prior, written permission of SimpleScalar, + * LLC (info@simplescalar.com). Nonprofit and noncommercial use is permitted + * as described below. + * + * 1. SimpleScalar is provided AS IS, with no warranty of any kind, express + * or implied. The user of the program accepts full responsibility for the + * application of the program and the use of any results. + * + * 2. Nonprofit and noncommercial use is encouraged. SimpleScalar may be + * downloaded, compiled, executed, copied, and modified solely for nonprofit, + * educational, noncommercial research, and noncommercial scholarship + * purposes provided that this notice in its entirety accompanies all copies. + * Copies of the modified software can be delivered to persons who use it + * solely for nonprofit, educational, noncommercial research, and + * noncommercial scholarship purposes provided that this notice in its + * entirety accompanies all copies. + * + * 3. ALL COMMERCIAL USE, AND ALL USE BY FOR PROFIT ENTITIES, IS EXPRESSLY + * PROHIBITED WITHOUT A LICENSE FROM SIMPLESCALAR, LLC (info@simplescalar.com). + * + * 4. No nonprofit user may place any restrictions on the use of this software, + * including as modified by the user, by any other authorized user. + * + * 5. Noncommercial and nonprofit users may distribute copies of SimpleScalar + * in compiled or executable form as set forth in Section 2, provided that + * either: (A) it is accompanied by the corresponding machine-readable source + * code, or (B) it is accompanied by a written offer, with no time limit, to + * give anyone a machine-readable copy of the corresponding source code in + * return for reimbursement of the cost of distribution. This written offer + * must permit verbatim duplication by anyone, or (C) it is distributed by + * someone who received only the executable form, and is accompanied by a + * copy of the written offer of source code. + * + * 6. SimpleScalar was developed by Todd M. Austin, Ph.D. The tool suite is + * currently maintained by SimpleScalar LLC (info@simplescalar.com). US Mail: + * 2395 Timbercrest Court, Ann Arbor, MI 48105. + * + * Copyright (C) 1994-2003 by Todd M. Austin, Ph.D. and SimpleScalar, LLC. 
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <math.h>
+
+#include "host.h"
+#include "misc.h"
+#include "machine.h"
+#include "bpred.h"
+
+/* turn this on to enable the SimpleScalar 2.0 RAS bug */
+/* #define RAS_BUG_COMPATIBLE */
+
+/* create a branch predictor */
+struct bpred_t *                      /* branch predictor instance */
+bpred_create(enum bpred_class class,  /* type of predictor to create */
+             unsigned int bimod_size, /* bimod table size */
+             unsigned int l1size,     /* 2lev l1 table size */
+             unsigned int l2size,     /* 2lev l2 table size */
+             unsigned int meta_size,  /* meta table size */
+             unsigned int shift_width, /* history register width */
+             unsigned int xor,        /* history xor address flag */
+             unsigned int btb_sets,   /* number of sets in BTB */
+             unsigned int btb_assoc,  /* BTB associativity */
+             unsigned int retstack_size) /* num entries in ret-addr stack */
+{
+  struct bpred_t *pred;
+
+  if (!(pred = calloc(1, sizeof(struct bpred_t))))
+    fatal("out of virtual memory");
+
+  pred->class = class;
+
+  switch (class) {
+  case BPredComb:
+    /* bimodal component */
+    pred->dirpred.bimod =
+      bpred_dir_create(BPred2bit, bimod_size, 0, 0, 0);
+
+    /* 2-level component */
+    pred->dirpred.twolev =
+      bpred_dir_create(BPred2Level, l1size, l2size, shift_width, xor);
+
+    /* metapredictor component */
+    pred->dirpred.meta =
+      bpred_dir_create(BPred2bit, meta_size, 0, 0, 0);
+
+    break;
+
+  case BPred2Level:
+    pred->dirpred.twolev =
+      bpred_dir_create(class, l1size, l2size, shift_width, xor);
+
+    break;
+
+  case BPred2bit:
+    pred->dirpred.bimod =
+      bpred_dir_create(class, bimod_size, 0, 0, 0);
+
+    break;
+
+  case BPred3bit:
+    pred->dirpred.bimod =
+      /* Instead of adding another argument to this function,
+       * just re-use the bimodal size. */
+      bpred_dir_create(class, bimod_size, 0, 0, 0);
+
+    break;
+
+  case BPredTaken:
+  case BPredNotTaken:
+    /* no other state */
+    break;
+
+  default:
+    panic("bogus predictor class");
+  }
+
+  /* allocate ret-addr stack */
+  switch (class) {
+  case BPredComb:
+  case BPred2Level:
+  case BPred2bit:
+  case BPred3bit:
+    {
+      int i;
+
+      /* allocate BTB */
+      if (!btb_sets || (btb_sets & (btb_sets-1)) != 0)
+        fatal("number of BTB sets must be non-zero and a power of two");
+      if (!btb_assoc || (btb_assoc & (btb_assoc-1)) != 0)
+        fatal("BTB associativity must be non-zero and a power of two");
+
+      if (!(pred->btb.btb_data = calloc(btb_sets * btb_assoc,
+                                        sizeof(struct bpred_btb_ent_t))))
+        fatal("cannot allocate BTB");
+
+      pred->btb.sets = btb_sets;
+      pred->btb.assoc = btb_assoc;
+
+      if (pred->btb.assoc > 1)
+        for (i=0; i < (pred->btb.assoc*pred->btb.sets); i++)
+          {
+            if (i % pred->btb.assoc != pred->btb.assoc - 1)
+              pred->btb.btb_data[i].next = &pred->btb.btb_data[i+1];
+            else
+              pred->btb.btb_data[i].next = NULL;
+
+            if (i % pred->btb.assoc != pred->btb.assoc - 1)
+              pred->btb.btb_data[i+1].prev = &pred->btb.btb_data[i];
+          }
+
+      /* allocate retstack */
+      if ((retstack_size & (retstack_size-1)) != 0)
+        fatal("Return-address-stack size must be zero or a power of two");
+
+      pred->retstack.size = retstack_size;
+      if (retstack_size)
+        if (!(pred->retstack.stack = calloc(retstack_size,
+                                            sizeof(struct bpred_btb_ent_t))))
+          fatal("cannot allocate return-address-stack");
+      pred->retstack.tos = retstack_size - 1;
+
+      break;
+    }
+
+  case BPredTaken:
+  case BPredNotTaken:
+    /* no other state */
+    break;
+
+  default:
+    panic("bogus predictor class");
+  }
+
+  return pred;
+}
+
+/* create a branch direction predictor */
+struct bpred_dir_t *          /* branch direction predictor instance */
+bpred_dir_create (
+  enum bpred_class class,    /* type of predictor to create */
+  unsigned int l1size,       /* level-1 table size */
+  unsigned int l2size,       /* level-2 table size (if relevant) */
+  unsigned int shift_width,  /* history register width */
+  unsigned int xor)          /* history xor address flag */
+{
+  struct bpred_dir_t *pred_dir;
+  unsigned int cnt;
+  int flipflop;
+
+  if (!(pred_dir = calloc(1, sizeof(struct bpred_dir_t))))
+    fatal("out of virtual memory");
+
+  pred_dir->class = class;
+
+  cnt = -1;
+  switch (class) {
+  case BPred2Level:
+    {
+      if (!l1size || (l1size & (l1size-1)) != 0)
+        fatal("level-1 size, `%d', must be non-zero and a power of two",
+              l1size);
+      pred_dir->config.two.l1size = l1size;
+
+      if (!l2size || (l2size & (l2size-1)) != 0)
+        fatal("level-2 size, `%d', must be non-zero and a power of two",
+              l2size);
+      pred_dir->config.two.l2size = l2size;
+
+      if (!shift_width || shift_width > 30)
+        fatal("shift register width, `%d', must be between 1 and 30",
+              shift_width);
+      pred_dir->config.two.shift_width = shift_width;
+
+      pred_dir->config.two.xor = xor;
+      pred_dir->config.two.shiftregs = calloc(l1size, sizeof(int));
+      if (!pred_dir->config.two.shiftregs)
+        fatal("cannot allocate shift register table");
+
+      pred_dir->config.two.l2table = calloc(l2size, sizeof(unsigned char));
+      if (!pred_dir->config.two.l2table)
+        fatal("cannot allocate second level table");
+
+      /* initialize counters to weakly this-or-that */
+      flipflop = 1;
+      for (cnt = 0; cnt < l2size; cnt++)
+        {
+          pred_dir->config.two.l2table[cnt] = flipflop;
+          flipflop = 3 - flipflop;
+        }
+
+      break;
+    }
+
+  case BPred2bit:
+    if (!l1size || (l1size & (l1size-1)) != 0)
+      fatal("2bit table size, `%d', must be non-zero and a power of two",
+            l1size);
+    pred_dir->config.bimod.size = l1size;
+    if (!(pred_dir->config.bimod.table =
+          calloc(l1size, sizeof(unsigned char))))
+      fatal("cannot allocate 2bit storage");
+    /* initialize counters to weakly this-or-that */
+    flipflop = 1;
+    for (cnt = 0; cnt < l1size; cnt++)
+      {
+        pred_dir->config.bimod.table[cnt] = flipflop;
+        flipflop = 3 - flipflop;
+      }
+
+    break;
+
+  case BPred3bit:
+    if (!l1size || (l1size & (l1size-1)) != 0)
+      fatal("3bit table size, `%d', must be non-zero and a power of two",
+            l1size);
+    pred_dir->config.threebit.size = l1size;
+    if (!(pred_dir->config.threebit.table =
+          calloc(l1size, sizeof(unsigned char))))
+      fatal("cannot allocate 3bit storage");
+    /* initialize counters to weakly this-or-that */
+    flipflop = 3;
+    for (cnt = 0; cnt < l1size; cnt++)
+      {
+        pred_dir->config.threebit.table[cnt] = flipflop;
+        flipflop = 7 - flipflop;
+      }
+
+    break;
+
+  case BPredTaken:
+  case BPredNotTaken:
+    /* no other state */
+    break;
+
+  default:
+    panic("bogus branch direction predictor class");
+  }
+
+  return pred_dir;
+}
+
+/* print branch direction predictor configuration */
+void
+bpred_dir_config(
+  struct bpred_dir_t *pred_dir, /* branch direction predictor instance */
+  char name[],                  /* predictor name */
+  FILE *stream)                 /* output stream */
+{
+  switch (pred_dir->class) {
+  case BPred2Level:
+    fprintf(stream,
+      "pred_dir: %s: 2-lvl: %d l1-sz, %d bits/ent, %s xor, %d l2-sz, direct-mapped\n",
+      name, pred_dir->config.two.l1size, pred_dir->config.two.shift_width,
+      pred_dir->config.two.xor ?
"" : "no", pred_dir->config.two.l2size); + break; + + case BPred2bit: + fprintf(stream, "pred_dir: %s: 2-bit: %d entries, direct-mapped\n", + name, pred_dir->config.bimod.size); + break; + + case BPred3bit: + fprintf(stream, "pred_dir: %s: 3-bit: %d entries, direct-mapped\n", + name, pred_dir->config.threebit.size); + break; + + case BPredTaken: + fprintf(stream, "pred_dir: %s: predict taken\n", name); + break; + + case BPredNotTaken: + fprintf(stream, "pred_dir: %s: predict not taken\n", name); + break; + + default: + panic("bogus branch direction predictor class"); + } +} + +/* print branch predictor configuration */ +void +bpred_config(struct bpred_t *pred, /* branch predictor instance */ + FILE *stream) /* output stream */ +{ + switch (pred->class) { + case BPredComb: + bpred_dir_config (pred->dirpred.bimod, "bimod", stream); + bpred_dir_config (pred->dirpred.twolev, "2lev", stream); + bpred_dir_config (pred->dirpred.meta, "meta", stream); + fprintf(stream, "btb: %d sets x %d associativity", + pred->btb.sets, pred->btb.assoc); + fprintf(stream, "ret_stack: %d entries", pred->retstack.size); + break; + + case BPred2Level: + bpred_dir_config (pred->dirpred.twolev, "2lev", stream); + fprintf(stream, "btb: %d sets x %d associativity", + pred->btb.sets, pred->btb.assoc); + fprintf(stream, "ret_stack: %d entries", pred->retstack.size); + break; + + case BPred2bit: + bpred_dir_config (pred->dirpred.bimod, "bimod", stream); + fprintf(stream, "btb: %d sets x %d associativity", + pred->btb.sets, pred->btb.assoc); + fprintf(stream, "ret_stack: %d entries", pred->retstack.size); + break; + + case BPred3bit: + bpred_dir_config (pred->dirpred.bimod, "threebit", stream); + fprintf(stream, "btb: %d sets x %d associativity", + pred->btb.sets, pred->btb.assoc); + fprintf(stream, "ret_stack: %d entries", pred->retstack.size); + break; + + case BPredTaken: + bpred_dir_config (pred->dirpred.bimod, "taken", stream); + break; + case BPredNotTaken: + bpred_dir_config (pred->dirpred.bimod, "nottaken", stream); + break; + + default: + panic("bogus branch predictor class"); + } +} + +/* print predictor stats */ +void +bpred_stats(struct bpred_t *pred, /* branch predictor instance */ + FILE *stream) /* output stream */ +{ + fprintf(stream, "pred: addr-prediction rate = %f\n", + (double)pred->addr_hits/(double)(pred->addr_hits+pred->misses)); + fprintf(stream, "pred: dir-prediction rate = %f\n", + (double)pred->dir_hits/(double)(pred->dir_hits+pred->misses)); +} + +/* register branch predictor stats */ +void +bpred_reg_stats(struct bpred_t *pred, /* branch predictor instance */ + struct stat_sdb_t *sdb) /* stats database */ +{ + char buf[512], buf1[512], *name; + + /* get a name for this predictor */ + switch (pred->class) + { + case BPredComb: + name = "bpred_comb"; + break; + case BPred2Level: + name = "bpred_2lev"; + break; + case BPred2bit: + name = "bpred_bimod"; + break; + case BPred3bit: + name = "bpred_threebit"; + break; + case BPredTaken: + name = "bpred_taken"; + break; + case BPredNotTaken: + name = "bpred_nottaken"; + break; + default: + panic("bogus branch predictor class"); + } + + sprintf(buf, "%s.lookups", name); + stat_reg_counter(sdb, buf, "total number of bpred lookups", + &pred->lookups, 0, NULL); + sprintf(buf, "%s.updates", name); + sprintf(buf1, "%s.dir_hits + %s.misses", name, name); + stat_reg_formula(sdb, buf, "total number of updates", buf1, "%12.0f"); + sprintf(buf, "%s.addr_hits", name); + stat_reg_counter(sdb, buf, "total number of address-predicted hits", + &pred->addr_hits, 0, 
NULL); + sprintf(buf, "%s.dir_hits", name); + stat_reg_counter(sdb, buf, + "total number of direction-predicted hits " + "(includes addr-hits)", + &pred->dir_hits, 0, NULL); + if (pred->class == BPredComb) + { + sprintf(buf, "%s.used_bimod", name); + stat_reg_counter(sdb, buf, + "total number of bimodal predictions used", + &pred->used_bimod, 0, NULL); + sprintf(buf, "%s.used_2lev", name); + stat_reg_counter(sdb, buf, + "total number of 2-level predictions used", + &pred->used_2lev, 0, NULL); + } + sprintf(buf, "%s.misses", name); + stat_reg_counter(sdb, buf, "total number of misses", &pred->misses, 0, NULL); + sprintf(buf, "%s.jr_hits", name); + stat_reg_counter(sdb, buf, + "total number of address-predicted hits for JR's", + &pred->jr_hits, 0, NULL); + sprintf(buf, "%s.jr_seen", name); + stat_reg_counter(sdb, buf, + "total number of JR's seen", + &pred->jr_seen, 0, NULL); + sprintf(buf, "%s.jr_non_ras_hits.PP", name); + stat_reg_counter(sdb, buf, + "total number of address-predicted hits for non-RAS JR's", + &pred->jr_non_ras_hits, 0, NULL); + sprintf(buf, "%s.jr_non_ras_seen.PP", name); + stat_reg_counter(sdb, buf, + "total number of non-RAS JR's seen", + &pred->jr_non_ras_seen, 0, NULL); + sprintf(buf, "%s.bpred_addr_rate", name); + sprintf(buf1, "%s.addr_hits / %s.updates", name, name); + stat_reg_formula(sdb, buf, + "branch address-prediction rate (i.e., addr-hits/updates)", + buf1, "%9.4f"); + sprintf(buf, "%s.bpred_dir_rate", name); + sprintf(buf1, "%s.dir_hits / %s.updates", name, name); + stat_reg_formula(sdb, buf, + "branch direction-prediction rate (i.e., all-hits/updates)", + buf1, "%9.4f"); + sprintf(buf, "%s.bpred_jr_rate", name); + sprintf(buf1, "%s.jr_hits / %s.jr_seen", name, name); + stat_reg_formula(sdb, buf, + "JR address-prediction rate (i.e., JR addr-hits/JRs seen)", + buf1, "%9.4f"); + sprintf(buf, "%s.bpred_jr_non_ras_rate.PP", name); + sprintf(buf1, "%s.jr_non_ras_hits.PP / %s.jr_non_ras_seen.PP", name, name); + stat_reg_formula(sdb, buf, + "non-RAS JR addr-pred rate (ie, non-RAS JR hits/JRs seen)", + buf1, "%9.4f"); + sprintf(buf, "%s.retstack_pushes", name); + stat_reg_counter(sdb, buf, + "total number of address pushed onto ret-addr stack", + &pred->retstack_pushes, 0, NULL); + sprintf(buf, "%s.retstack_pops", name); + stat_reg_counter(sdb, buf, + "total number of address popped off of ret-addr stack", + &pred->retstack_pops, 0, NULL); + sprintf(buf, "%s.used_ras.PP", name); + stat_reg_counter(sdb, buf, + "total number of RAS predictions used", + &pred->used_ras, 0, NULL); + sprintf(buf, "%s.ras_hits.PP", name); + stat_reg_counter(sdb, buf, + "total number of RAS hits", + &pred->ras_hits, 0, NULL); + sprintf(buf, "%s.ras_rate.PP", name); + sprintf(buf1, "%s.ras_hits.PP / %s.used_ras.PP", name, name); + stat_reg_formula(sdb, buf, + "RAS prediction rate (i.e., RAS hits/used RAS)", + buf1, "%9.4f"); +} + +void +bpred_after_priming(struct bpred_t *bpred) +{ + if (bpred == NULL) + return; + + bpred->lookups = 0; + bpred->addr_hits = 0; + bpred->dir_hits = 0; + bpred->used_ras = 0; + bpred->used_bimod = 0; + bpred->used_2lev = 0; + bpred->jr_hits = 0; + bpred->jr_seen = 0; + bpred->misses = 0; + bpred->retstack_pops = 0; + bpred->retstack_pushes = 0; + bpred->ras_hits = 0; +} + +#define BIMOD_HASH(PRED, ADDR) \ + ((((ADDR) >> 19) ^ ((ADDR) >> MD_BR_SHIFT)) & ((PRED)->config.bimod.size-1)) + /* was: ((baddr >> 16) ^ baddr) & (pred->dirpred.bimod.size-1) */ + +#define THREEBIT_HASH(PRED, ADDR) \ + ((((ADDR) >> 19) ^ ((ADDR) >> MD_BR_SHIFT)) & 
((PRED)->config.threebit.size-1)) + +/* predicts a branch direction */ +char * /* pointer to counter */ +bpred_dir_lookup(struct bpred_dir_t *pred_dir, /* branch dir predictor inst */ + md_addr_t baddr) /* branch address */ +{ + unsigned char *p = NULL; + + /* Except for jumps, get a pointer to direction-prediction bits */ + switch (pred_dir->class) { + case BPred2Level: + { + int l1index, l2index; + + /* traverse 2-level tables */ + l1index = (baddr >> MD_BR_SHIFT) & (pred_dir->config.two.l1size - 1); + l2index = pred_dir->config.two.shiftregs[l1index]; + if (pred_dir->config.two.xor) + { +#if 1 + /* this L2 index computation is more "compatible" to McFarling's + verison of it, i.e., if the PC xor address component is only + part of the index, take the lower order address bits for the + other part of the index, rather than the higher order ones */ + l2index = (((l2index ^ (baddr >> MD_BR_SHIFT)) + & ((1 << pred_dir->config.two.shift_width) - 1)) + | ((baddr >> MD_BR_SHIFT) + << pred_dir->config.two.shift_width)); +#else + l2index = l2index ^ (baddr >> MD_BR_SHIFT); +#endif + } + else + { + l2index = + l2index + | ((baddr >> MD_BR_SHIFT) << pred_dir->config.two.shift_width); + } + l2index = l2index & (pred_dir->config.two.l2size - 1); + + /* get a pointer to prediction state information */ + p = &pred_dir->config.two.l2table[l2index]; + } + break; + case BPred2bit: + p = &pred_dir->config.bimod.table[BIMOD_HASH(pred_dir, baddr)]; + break; + case BPred3bit: + p = &pred_dir->config.threebit.table[THREEBIT_HASH(pred_dir, baddr)]; + break; + case BPredTaken: + case BPredNotTaken: + break; + default: + panic("bogus branch direction predictor class"); + } + + return (char *)p; +} + +/* probe a predictor for a next fetch address, the predictor is probed + with branch address BADDR, the branch target is BTARGET (used for + static predictors), and OP is the instruction opcode (used to simulate + predecode bits; a pointer to the predictor state entry (or null for jumps) + is returned in *DIR_UPDATE_PTR (used for updating predictor state), + and the non-speculative top-of-stack is returned in stack_recover_idx + (used for recovering ret-addr stack after mis-predict). 
*/ +md_addr_t /* predicted branch target addr */ +bpred_lookup(struct bpred_t *pred, /* branch predictor instance */ + md_addr_t baddr, /* branch address */ + md_addr_t btarget, /* branch target if taken */ + enum md_opcode op, /* opcode of instruction */ + int is_call, /* non-zero if inst is fn call */ + int is_return, /* non-zero if inst is fn return */ + struct bpred_update_t *dir_update_ptr, /* pred state pointer */ + int *stack_recover_idx) /* Non-speculative top-of-stack; + * used on mispredict recovery */ +{ + struct bpred_btb_ent_t *pbtb = NULL; + int index, i; + + if (!dir_update_ptr) + panic("no bpred update record"); + + /* if this is not a branch, return not-taken */ + if (!(MD_OP_FLAGS(op) & F_CTRL)) + return 0; + + pred->lookups++; + + dir_update_ptr->dir.ras = FALSE; + dir_update_ptr->pdir1 = NULL; + dir_update_ptr->pdir2 = NULL; + dir_update_ptr->pmeta = NULL; + /* Except for jumps, get a pointer to direction-prediction bits */ + switch (pred->class) { + case BPredComb: + if ((MD_OP_FLAGS(op) & (F_CTRL|F_UNCOND)) != (F_CTRL|F_UNCOND)) + { + char *bimod, *twolev, *meta; + bimod = bpred_dir_lookup (pred->dirpred.bimod, baddr); + twolev = bpred_dir_lookup (pred->dirpred.twolev, baddr); + meta = bpred_dir_lookup (pred->dirpred.meta, baddr); + dir_update_ptr->pmeta = meta; + dir_update_ptr->dir.meta = (*meta >= 2); + dir_update_ptr->dir.bimod = (*bimod >= 2); + dir_update_ptr->dir.twolev = (*twolev >= 2); + if (*meta >= 2) + { + dir_update_ptr->pdir1 = twolev; + dir_update_ptr->pdir2 = bimod; + } + else + { + dir_update_ptr->pdir1 = bimod; + dir_update_ptr->pdir2 = twolev; + } + } + break; + case BPred2Level: + if ((MD_OP_FLAGS(op) & (F_CTRL|F_UNCOND)) != (F_CTRL|F_UNCOND)) + { + dir_update_ptr->pdir1 = + bpred_dir_lookup (pred->dirpred.twolev, baddr); + } + break; + case BPred2bit: + case BPred3bit: + if ((MD_OP_FLAGS(op) & (F_CTRL|F_UNCOND)) != (F_CTRL|F_UNCOND)) + { + dir_update_ptr->pdir1 = + bpred_dir_lookup (pred->dirpred.bimod, baddr); + } + break; + case BPredTaken: + return btarget; + case BPredNotTaken: + if ((MD_OP_FLAGS(op) & (F_CTRL|F_UNCOND)) != (F_CTRL|F_UNCOND)) + { + return baddr + sizeof(md_inst_t); + } + else + { + return btarget; + } + default: + panic("bogus predictor class"); + } + + /* + * We have a stateful predictor, and have gotten a pointer into the + * direction predictor (except for jumps, for which the ptr is null) + */ + + /* record pre-pop TOS; if this branch is executed speculatively + * and is squashed, we'll restore the TOS and hope the data + * wasn't corrupted in the meantime. */ + if (pred->retstack.size) + *stack_recover_idx = pred->retstack.tos; + else + *stack_recover_idx = 0; + + /* if this is a return, pop return-address stack */ + if (is_return && pred->retstack.size) + { + md_addr_t target = pred->retstack.stack[pred->retstack.tos].target; + pred->retstack.tos = (pred->retstack.tos + pred->retstack.size - 1) + % pred->retstack.size; + pred->retstack_pops++; + dir_update_ptr->dir.ras = TRUE; /* using RAS here */ + return target; + } + +#ifndef RAS_BUG_COMPATIBLE + /* if function call, push return-address onto return-address stack */ + if (is_call && pred->retstack.size) + { + pred->retstack.tos = (pred->retstack.tos + 1)% pred->retstack.size; + pred->retstack.stack[pred->retstack.tos].target = + baddr + sizeof(md_inst_t); + pred->retstack_pushes++; + } +#endif /* !RAS_BUG_COMPATIBLE */ + + /* not a return. 
Get a pointer into the BTB */ + index = (baddr >> MD_BR_SHIFT) & (pred->btb.sets - 1); + + if (pred->btb.assoc > 1) + { + index *= pred->btb.assoc; + + /* Now we know the set; look for a PC match */ + for (i = index; i < (index+pred->btb.assoc) ; i++) + if (pred->btb.btb_data[i].addr == baddr) + { + /* match */ + pbtb = &pred->btb.btb_data[i]; + break; + } + } + else + { + pbtb = &pred->btb.btb_data[index]; + if (pbtb->addr != baddr) + pbtb = NULL; + } + + /* + * We now also have a pointer into the BTB for a hit, or NULL otherwise + */ + + /* if this is a jump, ignore predicted direction; we know it's taken. */ + if ((MD_OP_FLAGS(op) & (F_CTRL|F_UNCOND)) == (F_CTRL|F_UNCOND)) + { + return (pbtb ? pbtb->target : 1); + } + + char pred_cutoff = (pred->class == BPred3bit) ? 4 : 2; + /* otherwise we have a conditional branch */ + if (pbtb == NULL) + { + /* BTB miss -- just return a predicted direction */ + return ((*(dir_update_ptr->pdir1) >= pred_cutoff) + ? /* taken */ 1 + : /* not taken */ 0); + } + else + { + /* BTB hit, so return target if it's a predicted-taken branch */ + return ((*(dir_update_ptr->pdir1) >= pred_cutoff) + ? /* taken */ pbtb->target + : /* not taken */ 0); + } +} + +/* Speculative execution can corrupt the ret-addr stack. So for each + * lookup we return the top-of-stack (TOS) at that point; a mispredicted + * branch, as part of its recovery, restores the TOS using this value -- + * hopefully this uncorrupts the stack. */ +void +bpred_recover(struct bpred_t *pred, /* branch predictor instance */ + md_addr_t baddr, /* branch address */ + int stack_recover_idx) /* Non-speculative top-of-stack; + * used on mispredict recovery */ +{ + if (pred == NULL) + return; + + pred->retstack.tos = stack_recover_idx; +} + +/* update the branch predictor, only useful for stateful predictors; updates + entry for instruction type OP at address BADDR. BTB only gets updated + for branches which are taken. Inst was determined to jump to + address BTARGET and was taken if TAKEN is non-zero. Predictor + statistics are updated with result of prediction, indicated by CORRECT and + PRED_TAKEN, predictor state to be updated is indicated by *DIR_UPDATE_PTR + (may be NULL for jumps, which shouldn't modify state bits). Note if + bpred_update is done speculatively, branch-prediction may get polluted. */ +void +bpred_update(struct bpred_t *pred, /* branch predictor instance */ + md_addr_t baddr, /* branch address */ + md_addr_t btarget, /* resolved branch target */ + int taken, /* non-zero if branch was taken */ + int pred_taken, /* non-zero if branch was pred taken */ + int correct, /* was earlier addr prediction ok? 
*/ + enum md_opcode op, /* opcode of instruction */ + struct bpred_update_t *dir_update_ptr)/* pred state pointer */ +{ + struct bpred_btb_ent_t *pbtb = NULL; + struct bpred_btb_ent_t *lruhead = NULL, *lruitem = NULL; + int index, i; + + /* don't change bpred state for non-branch instructions or if this + * is a stateless predictor*/ + if (!(MD_OP_FLAGS(op) & F_CTRL)) + return; + + /* Have a branch here */ + + if (correct) + pred->addr_hits++; + + if (!!pred_taken == !!taken) + pred->dir_hits++; + else + pred->misses++; + + if (dir_update_ptr->dir.ras) + { + pred->used_ras++; + if (correct) + pred->ras_hits++; + } + else if ((MD_OP_FLAGS(op) & (F_CTRL|F_COND)) == (F_CTRL|F_COND)) + { + if (dir_update_ptr->dir.meta) + pred->used_2lev++; + else + pred->used_bimod++; + } + + /* keep stats about JR's; also, but don't change any bpred state for JR's + * which are returns unless there's no retstack */ + if (MD_IS_INDIR(op)) + { + pred->jr_seen++; + if (correct) + pred->jr_hits++; + + if (!dir_update_ptr->dir.ras) + { + pred->jr_non_ras_seen++; + if (correct) + pred->jr_non_ras_hits++; + } + else + { + /* return that used the ret-addr stack; no further work to do */ + return; + } + } + + /* Can exit now if this is a stateless predictor */ + if (pred->class == BPredNotTaken || pred->class == BPredTaken) + return; + + /* + * Now we know the branch didn't use the ret-addr stack, and that this + * is a stateful predictor + */ + +#ifdef RAS_BUG_COMPATIBLE + /* if function call, push return-address onto return-address stack */ + if (MD_IS_CALL(op) && pred->retstack.size) + { + pred->retstack.tos = (pred->retstack.tos + 1)% pred->retstack.size; + pred->retstack.stack[pred->retstack.tos].target = + baddr + sizeof(md_inst_t); + pred->retstack_pushes++; + } +#endif /* RAS_BUG_COMPATIBLE */ + + /* update L1 table if appropriate */ + /* L1 table is updated unconditionally for combining predictor too */ + if ((MD_OP_FLAGS(op) & (F_CTRL|F_UNCOND)) != (F_CTRL|F_UNCOND) && + (pred->class == BPred2Level || pred->class == BPredComb)) + { + int l1index, shift_reg; + + /* also update appropriate L1 history register */ + l1index = + (baddr >> MD_BR_SHIFT) & (pred->dirpred.twolev->config.two.l1size - 1); + shift_reg = + (pred->dirpred.twolev->config.two.shiftregs[l1index] << 1) | (!!taken); + pred->dirpred.twolev->config.two.shiftregs[l1index] = + shift_reg & ((1 << pred->dirpred.twolev->config.two.shift_width) - 1); + } + + /* find BTB entry if it's a taken branch (don't allocate for non-taken) */ + if (taken) + { + index = (baddr >> MD_BR_SHIFT) & (pred->btb.sets - 1); + + if (pred->btb.assoc > 1) + { + index *= pred->btb.assoc; + + /* Now we know the set; look for a PC match; also identify + * MRU and LRU items */ + for (i = index; i < (index+pred->btb.assoc) ; i++) + { + if (pred->btb.btb_data[i].addr == baddr) + { + /* match */ + assert(!pbtb); + pbtb = &pred->btb.btb_data[i]; + } + + dassert(pred->btb.btb_data[i].prev + != pred->btb.btb_data[i].next); + if (pred->btb.btb_data[i].prev == NULL) + { + /* this is the head of the lru list, ie current MRU item */ + dassert(lruhead == NULL); + lruhead = &pred->btb.btb_data[i]; + } + if (pred->btb.btb_data[i].next == NULL) + { + /* this is the tail of the lru list, ie the LRU item */ + dassert(lruitem == NULL); + lruitem = &pred->btb.btb_data[i]; + } + } + dassert(lruhead && lruitem); + + if (!pbtb) + /* missed in BTB; choose the LRU item in this set as the victim */ + pbtb = lruitem; + /* else hit, and pbtb points to matching BTB entry */ + + /* Update LRU state: 
selected item, whether selected because it
+           * matched or because it was LRU and selected as a victim, becomes
+           * MRU */
+          if (pbtb != lruhead)
+            {
+              /* this splices out the matched entry... */
+              if (pbtb->prev)
+                pbtb->prev->next = pbtb->next;
+              if (pbtb->next)
+                pbtb->next->prev = pbtb->prev;
+              /* ...and this puts the matched entry at the head of the list */
+              pbtb->next = lruhead;
+              pbtb->prev = NULL;
+              lruhead->prev = pbtb;
+              dassert(pbtb->prev || pbtb->next);
+              dassert(pbtb->prev != pbtb->next);
+            }
+          /* else pbtb is already MRU item; do nothing */
+        }
+      else
+        pbtb = &pred->btb.btb_data[index];
+    }
+
+  /*
+   * Now 'p' is a possibly null pointer into the direction prediction table,
+   * and 'pbtb' is a possibly null pointer into the BTB (either to a
+   * matched-on entry or a victim which was LRU in its set)
+   */
+
+  /* update state (but not for jumps) */
+  if (dir_update_ptr->pdir1)
+    {
+      /* the 3-bit counters live in the bimod slot, but dirpred.bimod is
+       * NULL for a pure 2-level predictor, so key the saturation limit off
+       * the predictor class instead of dereferencing dirpred.bimod */
+      char pred_cutoff = (pred->class == BPred3bit) ? 7 : 3;
+      if (taken)
+        {
+          if (*dir_update_ptr->pdir1 < pred_cutoff)
+            ++*dir_update_ptr->pdir1;
+        }
+      else
+        { /* not taken */
+          if (*dir_update_ptr->pdir1 > 0)
+            --*dir_update_ptr->pdir1;
+        }
+    }
+
+  /* combining predictor also updates second predictor and meta predictor */
+  /* second direction predictor */
+  if (dir_update_ptr->pdir2)
+    {
+      char pred_cutoff = (pred->class == BPred3bit) ? 7 : 3;
+      if (taken)
+        {
+          if (*dir_update_ptr->pdir2 < pred_cutoff)
+            ++*dir_update_ptr->pdir2;
+        }
+      else
+        { /* not taken */
+          if (*dir_update_ptr->pdir2 > 0)
+            --*dir_update_ptr->pdir2;
+        }
+    }
+
+  /* meta predictor */
+  if (dir_update_ptr->pmeta)
+    {
+      if (dir_update_ptr->dir.bimod != dir_update_ptr->dir.twolev)
+        {
+          /* we only update meta predictor if directions were different */
+          if (dir_update_ptr->dir.twolev == (unsigned int)taken)
+            {
+              /* 2-level predictor was correct */
+              if (*dir_update_ptr->pmeta < 3)
+                ++*dir_update_ptr->pmeta;
+            }
+          else
+            {
+              /* bimodal predictor was correct */
+              if (*dir_update_ptr->pmeta > 0)
+                --*dir_update_ptr->pmeta;
+            }
+        }
+    }
+
+  /* update BTB (but only for taken branches) */
+  if (pbtb)
+    {
+      /* update current information */
+      dassert(taken);
+
+      if (pbtb->addr == baddr)
+        {
+          if (!correct)
+            pbtb->target = btarget;
+        }
+      else
+        {
+          /* enter a new branch in the table */
+          pbtb->addr = baddr;
+          pbtb->op = op;
+          pbtb->target = btarget;
+        }
+    }
+}
diff --git a/bpred.h b/bpred.h
new file mode 100644
index 0000000..2b6f6c9
--- /dev/null
+++ b/bpred.h
@@ -0,0 +1,289 @@
+/* bpred.h - branch predictor interfaces */
+
+/* SimpleScalar(TM) Tool Suite
+ * Copyright (C) 1994-2003 by Todd M. Austin, Ph.D. and SimpleScalar, LLC.
+ * All Rights Reserved.
+ *
+ * THIS IS A LEGAL DOCUMENT, BY USING SIMPLESCALAR,
+ * YOU ARE AGREEING TO THESE TERMS AND CONDITIONS.
+ *
+ * No portion of this work may be used by any commercial entity, or for any
+ * commercial purpose, without the prior, written permission of SimpleScalar,
+ * LLC (info@simplescalar.com). Nonprofit and noncommercial use is permitted
+ * as described below.
+ *
+ * 1. SimpleScalar is provided AS IS, with no warranty of any kind, express
+ * or implied. The user of the program accepts full responsibility for the
+ * application of the program and the use of any results.
+ *
+ * 2. Nonprofit and noncommercial use is encouraged.
SimpleScalar may be + * downloaded, compiled, executed, copied, and modified solely for nonprofit, + * educational, noncommercial research, and noncommercial scholarship + * purposes provided that this notice in its entirety accompanies all copies. + * Copies of the modified software can be delivered to persons who use it + * solely for nonprofit, educational, noncommercial research, and + * noncommercial scholarship purposes provided that this notice in its + * entirety accompanies all copies. + * + * 3. ALL COMMERCIAL USE, AND ALL USE BY FOR PROFIT ENTITIES, IS EXPRESSLY + * PROHIBITED WITHOUT A LICENSE FROM SIMPLESCALAR, LLC (info@simplescalar.com). + * + * 4. No nonprofit user may place any restrictions on the use of this software, + * including as modified by the user, by any other authorized user. + * + * 5. Noncommercial and nonprofit users may distribute copies of SimpleScalar + * in compiled or executable form as set forth in Section 2, provided that + * either: (A) it is accompanied by the corresponding machine-readable source + * code, or (B) it is accompanied by a written offer, with no time limit, to + * give anyone a machine-readable copy of the corresponding source code in + * return for reimbursement of the cost of distribution. This written offer + * must permit verbatim duplication by anyone, or (C) it is distributed by + * someone who received only the executable form, and is accompanied by a + * copy of the written offer of source code. + * + * 6. SimpleScalar was developed by Todd M. Austin, Ph.D. The tool suite is + * currently maintained by SimpleScalar LLC (info@simplescalar.com). US Mail: + * 2395 Timbercrest Court, Ann Arbor, MI 48105. + * + * Copyright (C) 1994-2003 by Todd M. Austin, Ph.D. and SimpleScalar, LLC. + */ + + +#ifndef BPRED_H +#define BPRED_H + +#define dassert(a) assert(a) + +#include + +#include "host.h" +#include "misc.h" +#include "machine.h" +#include "stats.h" + +/* + * This module implements a number of branch predictor mechanisms. The + * following predictors are supported: + * + * BPred2Level: two level adaptive branch predictor + * + * It can simulate many prediction mechanisms that have up to + * two levels of tables. Parameters are: + * N # entries in first level (# of shift register(s)) + * W width of shift register(s) + * M # entries in 2nd level (# of counters, or other FSM) + * One BTB entry per level-2 counter. + * + * Configurations: N, W, M + * + * counter based: 1, 0, M + * + * GAg : 1, W, 2^W + * GAp : 1, W, M (M > 2^W) + * PAg : N, W, 2^W + * PAp : N, W, M (M == 2^(N+W)) + * + * BPred2bit: a simple direct mapped bimodal predictor + * + * This predictor has a table of two bit saturating counters. + * Where counter states 0 & 1 are predict not taken and + * counter states 2 & 3 are predict taken, the per-branch counters + * are incremented on taken branches and decremented on + * no taken branches. One BTB entry per counter. 
+ * + * BPredTaken: static predict branch taken + * + * BPredNotTaken: static predict branch not taken + * + */ + +/* branch predictor types */ +enum bpred_class { + BPredComb, /* combined predictor (McFarling) */ + BPred2Level, /* 2-level correlating pred w/2-bit counters */ + BPred2bit, /* 2-bit saturating cntr pred (dir mapped) */ + BPred3bit, /* 2-bit saturating cntr pred (dir mapped) */ + BPredTaken, /* static predict taken */ + BPredNotTaken, /* static predict not taken */ + BPred_NUM +}; + +/* an entry in a BTB */ +struct bpred_btb_ent_t { + md_addr_t addr; /* address of branch being tracked */ + enum md_opcode op; /* opcode of branch corresp. to addr */ + md_addr_t target; /* last destination of branch when taken */ + struct bpred_btb_ent_t *prev, *next; /* lru chaining pointers */ +}; + +/* direction predictor def */ +struct bpred_dir_t { + enum bpred_class class; /* type of predictor */ + union { + struct { + unsigned int size; /* number of entries in direct-mapped table */ + unsigned char *table; /* prediction state table */ + } bimod; + struct { + unsigned int size; + unsigned char *table; + } threebit; + struct { + int l1size; /* level-1 size, number of history regs */ + int l2size; /* level-2 size, number of pred states */ + int shift_width; /* amount of history in level-1 shift regs */ + int xor; /* history xor address flag */ + int *shiftregs; /* level-1 history table */ + unsigned char *l2table; /* level-2 prediction state table */ + } two; + } config; +}; + +/* branch predictor def */ +struct bpred_t { + enum bpred_class class; /* type of predictor */ + struct { + struct bpred_dir_t *bimod; /* first direction predictor */ + struct bpred_dir_t *twolev; /* second direction predictor */ + struct bpred_dir_t *meta; /* meta predictor */ + } dirpred; + + struct { + int sets; /* num BTB sets */ + int assoc; /* BTB associativity */ + struct bpred_btb_ent_t *btb_data; /* BTB addr-prediction table */ + } btb; + + struct { + int size; /* return-address stack size */ + int tos; /* top-of-stack */ + struct bpred_btb_ent_t *stack; /* return-address stack */ + } retstack; + + /* stats */ + counter_t addr_hits; /* num correct addr-predictions */ + counter_t dir_hits; /* num correct dir-predictions (incl addr) */ + counter_t used_ras; /* num RAS predictions used */ + counter_t used_bimod; /* num bimodal predictions used (BPredComb) */ + counter_t used_2lev; /* num 2-level predictions used (BPredComb) */ + counter_t jr_hits; /* num correct addr-predictions for JR's */ + counter_t jr_seen; /* num JR's seen */ + counter_t jr_non_ras_hits; /* num correct addr-preds for non-RAS JR's */ + counter_t jr_non_ras_seen; /* num non-RAS JR's seen */ + counter_t misses; /* num incorrect predictions */ + + counter_t lookups; /* num lookups */ + counter_t retstack_pops; /* number of times a value was popped */ + counter_t retstack_pushes; /* number of times a value was pushed */ + counter_t ras_hits; /* num correct return-address predictions */ +}; + +/* branch predictor update information */ +struct bpred_update_t { + char *pdir1; /* direction-1 predictor counter */ + char *pdir2; /* direction-2 predictor counter */ + char *pmeta; /* meta predictor counter */ + struct { /* predicted directions */ + unsigned int ras : 1; /* RAS used */ + unsigned int bimod : 1; /* bimodal predictor */ + unsigned int twolev : 1; /* 2-level predictor */ + unsigned int meta : 1; /* meta predictor (0..bimod / 1..2lev) */ + } dir; +}; + +/* create a branch predictor */ +struct bpred_t * /* branch predictory instance */ 
+bpred_create(enum bpred_class class, /* type of predictor to create */ + unsigned int bimod_size, /* bimod table size */ + unsigned int l1size, /* level-1 table size */ + unsigned int l2size, /* level-2 table size */ + unsigned int meta_size, /* meta predictor table size */ + unsigned int shift_width, /* history register width */ + unsigned int xor, /* history xor address flag */ + unsigned int btb_sets, /* number of sets in BTB */ + unsigned int btb_assoc, /* BTB associativity */ + unsigned int retstack_size);/* num entries in ret-addr stack */ + +/* create a branch direction predictor */ +struct bpred_dir_t * /* branch direction predictor instance */ +bpred_dir_create ( + enum bpred_class class, /* type of predictor to create */ + unsigned int l1size, /* level-1 table size */ + unsigned int l2size, /* level-2 table size (if relevant) */ + unsigned int shift_width, /* history register width */ + unsigned int xor); /* history xor address flag */ + +/* print branch predictor configuration */ +void +bpred_config(struct bpred_t *pred, /* branch predictor instance */ + FILE *stream); /* output stream */ + +/* print predictor stats */ +void +bpred_stats(struct bpred_t *pred, /* branch predictor instance */ + FILE *stream); /* output stream */ + +/* register branch predictor stats */ +void +bpred_reg_stats(struct bpred_t *pred, /* branch predictor instance */ + struct stat_sdb_t *sdb);/* stats database */ + +/* reset stats after priming, if appropriate */ +void bpred_after_priming(struct bpred_t *bpred); + +/* probe a predictor for a next fetch address, the predictor is probed + with branch address BADDR, the branch target is BTARGET (used for + static predictors), and OP is the instruction opcode (used to simulate + predecode bits; a pointer to the predictor state entry (or null for jumps) + is returned in *DIR_UPDATE_PTR (used for updating predictor state), + and the non-speculative top-of-stack is returned in stack_recover_idx + (used for recovering ret-addr stack after mis-predict). */ +md_addr_t /* predicted branch target addr */ +bpred_lookup(struct bpred_t *pred, /* branch predictor instance */ + md_addr_t baddr, /* branch address */ + md_addr_t btarget, /* branch target if taken */ + enum md_opcode op, /* opcode of instruction */ + int is_call, /* non-zero if inst is fn call */ + int is_return, /* non-zero if inst is fn return */ + struct bpred_update_t *dir_update_ptr, /* pred state pointer */ + int *stack_recover_idx); /* Non-speculative top-of-stack; + * used on mispredict recovery */ + +/* Speculative execution can corrupt the ret-addr stack. So for each + * lookup we return the top-of-stack (TOS) at that point; a mispredicted + * branch, as part of its recovery, restores the TOS using this value -- + * hopefully this uncorrupts the stack. */ +void +bpred_recover(struct bpred_t *pred, /* branch predictor instance */ + md_addr_t baddr, /* branch address */ + int stack_recover_idx); /* Non-speculative top-of-stack; + * used on mispredict recovery */ + +/* update the branch predictor, only useful for stateful predictors; updates + entry for instruction type OP at address BADDR. BTB only gets updated + for branches which are taken. Inst was determined to jump to + address BTARGET and was taken if TAKEN is non-zero. Predictor + statistics are updated with result of prediction, indicated by CORRECT and + PRED_TAKEN, predictor state to be updated is indicated by *DIR_UPDATE_PTR + (may be NULL for jumps, which shouldn't modify state bits). 
Note if + bpred_update is done speculatively, branch-prediction may get polluted. */ +void +bpred_update(struct bpred_t *pred, /* branch predictor instance */ + md_addr_t baddr, /* branch address */ + md_addr_t btarget, /* resolved branch target */ + int taken, /* non-zero if branch was taken */ + int pred_taken, /* non-zero if branch was pred taken */ + int correct, /* was earlier prediction correct? */ + enum md_opcode op, /* opcode of instruction */ + struct bpred_update_t *dir_update_ptr); /* pred state pointer */ + + +#ifdef foo0 +/* OBSOLETE */ +/* dump branch predictor state (for debug) */ +void +bpred_dump(struct bpred_t *pred, /* branch predictor instance */ + FILE *stream); /* output stream */ +#endif + +#endif /* BPRED_H */ diff --git a/sim-outorder.c b/sim-outorder.c new file mode 100644 index 0000000..0245b58 --- /dev/null +++ b/sim-outorder.c @@ -0,0 +1,4637 @@ +/* sim-outorder.c - sample out-of-order issue perf simulator implementation */ + +/* SimpleScalar(TM) Tool Suite + * Copyright (C) 1994-2003 by Todd M. Austin, Ph.D. and SimpleScalar, LLC. + * All Rights Reserved. + * + * THIS IS A LEGAL DOCUMENT, BY USING SIMPLESCALAR, + * YOU ARE AGREEING TO THESE TERMS AND CONDITIONS. + * + * No portion of this work may be used by any commercial entity, or for any + * commercial purpose, without the prior, written permission of SimpleScalar, + * LLC (info@simplescalar.com). Nonprofit and noncommercial use is permitted + * as described below. + * + * 1. SimpleScalar is provided AS IS, with no warranty of any kind, express + * or implied. The user of the program accepts full responsibility for the + * application of the program and the use of any results. + * + * 2. Nonprofit and noncommercial use is encouraged. SimpleScalar may be + * downloaded, compiled, executed, copied, and modified solely for nonprofit, + * educational, noncommercial research, and noncommercial scholarship + * purposes provided that this notice in its entirety accompanies all copies. + * Copies of the modified software can be delivered to persons who use it + * solely for nonprofit, educational, noncommercial research, and + * noncommercial scholarship purposes provided that this notice in its + * entirety accompanies all copies. + * + * 3. ALL COMMERCIAL USE, AND ALL USE BY FOR PROFIT ENTITIES, IS EXPRESSLY + * PROHIBITED WITHOUT A LICENSE FROM SIMPLESCALAR, LLC (info@simplescalar.com). + * + * 4. No nonprofit user may place any restrictions on the use of this software, + * including as modified by the user, by any other authorized user. + * + * 5. Noncommercial and nonprofit users may distribute copies of SimpleScalar + * in compiled or executable form as set forth in Section 2, provided that + * either: (A) it is accompanied by the corresponding machine-readable source + * code, or (B) it is accompanied by a written offer, with no time limit, to + * give anyone a machine-readable copy of the corresponding source code in + * return for reimbursement of the cost of distribution. This written offer + * must permit verbatim duplication by anyone, or (C) it is distributed by + * someone who received only the executable form, and is accompanied by a + * copy of the written offer of source code. + * + * 6. SimpleScalar was developed by Todd M. Austin, Ph.D. The tool suite is + * currently maintained by SimpleScalar LLC (info@simplescalar.com). US Mail: + * 2395 Timbercrest Court, Ann Arbor, MI 48105. + * + * Copyright (C) 1994-2003 by Todd M. Austin, Ph.D. and SimpleScalar, LLC. 
+ */ + + +#include +#include +#include +#include +#include + +#include "host.h" +#include "misc.h" +#include "machine.h" +#include "regs.h" +#include "memory.h" +#include "cache.h" +#include "loader.h" +#include "syscall.h" +#include "bpred.h" +#include "resource.h" +#include "bitmap.h" +#include "options.h" +#include "eval.h" +#include "stats.h" +#include "ptrace.h" +#include "dlite.h" +#include "sim.h" + +/* + * This file implements a very detailed out-of-order issue superscalar + * processor with a two-level memory system and speculative execution support. + * This simulator is a performance simulator, tracking the latency of all + * pipeline operations. + */ + +/* simulated registers */ +static struct regs_t regs; + +/* simulated memory */ +static struct mem_t *mem = NULL; + + +/* + * simulator options + */ + +/* maximum number of inst's to execute */ +static unsigned int max_insts; + +/* number of insts skipped before timing starts */ +static int fastfwd_count; + +/* pipeline trace range and output filename */ +static int ptrace_nelt = 0; +static char *ptrace_opts[2]; + +/* instruction fetch queue size (in insts) */ +static int ruu_ifq_size; + +/* extra branch mis-prediction latency */ +static int ruu_branch_penalty; + +/* speed of front-end of machine relative to execution core */ +static int fetch_speed; + +/* branch predictor type {nottaken|taken|perfect|bimod|2lev} */ +static char *pred_type; + +/* bimodal predictor config () */ +static int bimod_nelt = 1; +static int bimod_config[1] = + { /* bimod tbl size */2048 }; + +/* threebit predictor config () */ +static int threebit_nelt = 1; +static int threebit_config[1] = + { /* threebit tbl size */2048 }; + +/* 2-level predictor config ( ) */ +static int twolev_nelt = 4; +static int twolev_config[4] = + { /* l1size */1, /* l2size */1024, /* hist */8, /* xor */FALSE}; + +/* combining predictor config ( */ +static int comb_nelt = 1; +static int comb_config[1] = + { /* meta_table_size */1024 }; + +/* return address stack (RAS) size */ +static int ras_size = 8; + +/* BTB predictor config ( ) */ +static int btb_nelt = 2; +static int btb_config[2] = + { /* nsets */512, /* assoc */4 }; + +/* instruction decode B/W (insts/cycle) */ +static int ruu_decode_width; + +/* instruction issue B/W (insts/cycle) */ +static int ruu_issue_width; + +/* run pipeline with in-order issue */ +static int ruu_inorder_issue; + +/* issue instructions down wrong execution paths */ +static int ruu_include_spec = TRUE; + +/* instruction commit B/W (insts/cycle) */ +static int ruu_commit_width; + +/* register update unit (RUU) size */ +static int RUU_size = 8; + +/* load/store queue (LSQ) size */ +static int LSQ_size = 4; + +/* l1 data cache config, i.e., {|none} */ +static char *cache_dl1_opt; + +/* l1 data cache hit latency (in cycles) */ +static int cache_dl1_lat; + +/* l2 data cache config, i.e., {|none} */ +static char *cache_dl2_opt; + +/* l2 data cache hit latency (in cycles) */ +static int cache_dl2_lat; + +/* l1 instruction cache config, i.e., {|dl1|dl2|none} */ +static char *cache_il1_opt; + +/* l1 instruction cache hit latency (in cycles) */ +static int cache_il1_lat; + +/* l2 instruction cache config, i.e., {|dl1|dl2|none} */ +static char *cache_il2_opt; + +/* l2 instruction cache hit latency (in cycles) */ +static int cache_il2_lat; + +/* flush caches on system calls */ +static int flush_on_syscalls; + +/* convert 64-bit inst addresses to 32-bit inst equivalents */ +static int compress_icache_addrs; + +/* memory access latency ( ) */ +static int mem_nelt 
= 2; +static int mem_lat[2] = + { /* lat to first chunk */18, /* lat between remaining chunks */2 }; + +/* memory access bus width (in bytes) */ +static int mem_bus_width; + +/* instruction TLB config, i.e., {|none} */ +static char *itlb_opt; + +/* data TLB config, i.e., {|none} */ +static char *dtlb_opt; + +/* inst/data TLB miss latency (in cycles) */ +static int tlb_miss_lat; + +/* total number of integer ALU's available */ +static int res_ialu; + +/* total number of integer multiplier/dividers available */ +static int res_imult; + +/* total number of memory system ports available (to CPU) */ +static int res_memport; + +/* total number of floating point ALU's available */ +static int res_fpalu; + +/* total number of floating point multiplier/dividers available */ +static int res_fpmult; + +/* text-based stat profiles */ +#define MAX_PCSTAT_VARS 8 +static int pcstat_nelt = 0; +static char *pcstat_vars[MAX_PCSTAT_VARS]; + +/* convert 64-bit inst text addresses to 32-bit inst equivalents */ +#ifdef TARGET_PISA +#define IACOMPRESS(A) \ + (compress_icache_addrs ? ((((A) - ld_text_base) >> 1) + ld_text_base) : (A)) +#define ISCOMPRESS(SZ) \ + (compress_icache_addrs ? ((SZ) >> 1) : (SZ)) +#else /* !TARGET_PISA */ +#define IACOMPRESS(A) (A) +#define ISCOMPRESS(SZ) (SZ) +#endif /* TARGET_PISA */ + +/* operate in backward-compatible bugs mode (for testing only) */ +static int bugcompat_mode; + +/* + * functional unit resource configuration + */ + +/* resource pool indices, NOTE: update these if you change FU_CONFIG */ +#define FU_IALU_INDEX 0 +#define FU_IMULT_INDEX 1 +#define FU_MEMPORT_INDEX 2 +#define FU_FPALU_INDEX 3 +#define FU_FPMULT_INDEX 4 + +/* resource pool definition, NOTE: update FU_*_INDEX defs if you change this */ +struct res_desc fu_config[] = { + { + "integer-ALU", + 4, + 0, + { + { IntALU, 1, 1 } + } + }, + { + "integer-MULT/DIV", + 1, + 0, + { + { IntMULT, 3, 1 }, + { IntDIV, 20, 19 } + } + }, + { + "memory-port", + 2, + 0, + { + { RdPort, 1, 1 }, + { WrPort, 1, 1 } + } + }, + { + "FP-adder", + 4, + 0, + { + { FloatADD, 2, 1 }, + { FloatCMP, 2, 1 }, + { FloatCVT, 2, 1 } + } + }, + { + "FP-MULT/DIV", + 1, + 0, + { + { FloatMULT, 4, 1 }, + { FloatDIV, 12, 12 }, + { FloatSQRT, 24, 24 } + } + }, +}; + + +/* + * simulator stats + */ +/* SLIP variable */ +static counter_t sim_slip = 0; + +/* total number of instructions executed */ +static counter_t sim_total_insn = 0; + +/* total number of memory references committed */ +static counter_t sim_num_refs = 0; + +/* total number of memory references executed */ +static counter_t sim_total_refs = 0; + +/* total number of loads committed */ +static counter_t sim_num_loads = 0; + +/* total number of loads executed */ +static counter_t sim_total_loads = 0; + +/* total number of branches committed */ +static counter_t sim_num_branches = 0; + +/* total number of branches executed */ +static counter_t sim_total_branches = 0; + +/* cycle counter */ +static tick_t sim_cycle = 0; + +/* occupancy counters */ +static counter_t IFQ_count; /* cumulative IFQ occupancy */ +static counter_t IFQ_fcount; /* cumulative IFQ full count */ +static counter_t RUU_count; /* cumulative RUU occupancy */ +static counter_t RUU_fcount; /* cumulative RUU full count */ +static counter_t LSQ_count; /* cumulative LSQ occupancy */ +static counter_t LSQ_fcount; /* cumulative LSQ full count */ + +/* total non-speculative bogus addresses seen (debug var) */ +static counter_t sim_invalid_addrs; + +/* + * simulator state variables + */ + +/* instruction sequence counter, used to 
assign unique id's to insts */ +static unsigned int inst_seq = 0; + +/* pipetrace instruction sequence counter */ +static unsigned int ptrace_seq = 0; + +/* speculation mode, non-zero when mis-speculating, i.e., executing + instructions down the wrong path, thus state recovery will eventually have + to occur that resets processor register and memory state back to the last + precise state */ +static int spec_mode = FALSE; + +/* cycles until fetch issue resumes */ +static unsigned ruu_fetch_issue_delay = 0; + +/* perfect prediction enabled */ +static int pred_perfect = FALSE; + +/* speculative bpred-update enabled */ +static char *bpred_spec_opt; +static enum { spec_ID, spec_WB, spec_CT } bpred_spec_update; + +/* level 1 instruction cache, entry level instruction cache */ +static struct cache_t *cache_il1; + +/* level 1 instruction cache */ +static struct cache_t *cache_il2; + +/* level 1 data cache, entry level data cache */ +static struct cache_t *cache_dl1; + +/* level 2 data cache */ +static struct cache_t *cache_dl2; + +/* instruction TLB */ +static struct cache_t *itlb; + +/* data TLB */ +static struct cache_t *dtlb; + +/* branch predictor */ +static struct bpred_t *pred; + +/* functional unit resource pool */ +static struct res_pool *fu_pool = NULL; + +/* text-based stat profiles */ +static struct stat_stat_t *pcstat_stats[MAX_PCSTAT_VARS]; +static counter_t pcstat_lastvals[MAX_PCSTAT_VARS]; +static struct stat_stat_t *pcstat_sdists[MAX_PCSTAT_VARS]; + +/* wedge all stat values into a counter_t */ +#define STATVAL(STAT) \ + ((STAT)->sc == sc_int \ + ? (counter_t)*((STAT)->variant.for_int.var) \ + : ((STAT)->sc == sc_uint \ + ? (counter_t)*((STAT)->variant.for_uint.var) \ + : ((STAT)->sc == sc_counter \ + ? *((STAT)->variant.for_counter.var) \ + : (panic("bad stat class"), 0)))) + + +/* memory access latency, assumed to not cross a page boundary */ +static unsigned int /* total latency of access */ +mem_access_latency(int blk_sz) /* block size accessed */ +{ + int chunks = (blk_sz + (mem_bus_width - 1)) / mem_bus_width; + + assert(chunks > 0); + + return (/* first chunk latency */mem_lat[0] + + (/* remainder chunk latency */mem_lat[1] * (chunks - 1))); +} + + +/* + * cache miss handlers + */ + +/* l1 data cache l1 block miss handler function */ +static unsigned int /* latency of block access */ +dl1_access_fn(enum mem_cmd cmd, /* access cmd, Read or Write */ + md_addr_t baddr, /* block address to access */ + int bsize, /* size of block to access */ + struct cache_blk_t *blk, /* ptr to block in upper level */ + tick_t now) /* time of access */ +{ + unsigned int lat; + + if (cache_dl2) + { + /* access next level of data cache hierarchy */ + lat = cache_access(cache_dl2, cmd, baddr, NULL, bsize, + /* now */now, /* pudata */NULL, /* repl addr */NULL); + if (cmd == Read) + return lat; + else + { + /* FIXME: unlimited write buffers */ + return 0; + } + } + else + { + /* access main memory */ + if (cmd == Read) + return mem_access_latency(bsize); + else + { + /* FIXME: unlimited write buffers */ + return 0; + } + } +} + +/* l2 data cache block miss handler function */ +static unsigned int /* latency of block access */ +dl2_access_fn(enum mem_cmd cmd, /* access cmd, Read or Write */ + md_addr_t baddr, /* block address to access */ + int bsize, /* size of block to access */ + struct cache_blk_t *blk, /* ptr to block in upper level */ + tick_t now) /* time of access */ +{ + /* this is a miss to the lowest level, so access main memory */ + if (cmd == Read) + return mem_access_latency(bsize); + 
else + { + /* FIXME: unlimited write buffers */ + return 0; + } +} + +/* l1 inst cache l1 block miss handler function */ +static unsigned int /* latency of block access */ +il1_access_fn(enum mem_cmd cmd, /* access cmd, Read or Write */ + md_addr_t baddr, /* block address to access */ + int bsize, /* size of block to access */ + struct cache_blk_t *blk, /* ptr to block in upper level */ + tick_t now) /* time of access */ +{ + unsigned int lat; + +if (cache_il2) + { + /* access next level of inst cache hierarchy */ + lat = cache_access(cache_il2, cmd, baddr, NULL, bsize, + /* now */now, /* pudata */NULL, /* repl addr */NULL); + if (cmd == Read) + return lat; + else + panic("writes to instruction memory not supported"); + } + else + { + /* access main memory */ + if (cmd == Read) + return mem_access_latency(bsize); + else + panic("writes to instruction memory not supported"); + } +} + +/* l2 inst cache block miss handler function */ +static unsigned int /* latency of block access */ +il2_access_fn(enum mem_cmd cmd, /* access cmd, Read or Write */ + md_addr_t baddr, /* block address to access */ + int bsize, /* size of block to access */ + struct cache_blk_t *blk, /* ptr to block in upper level */ + tick_t now) /* time of access */ +{ + /* this is a miss to the lowest level, so access main memory */ + if (cmd == Read) + return mem_access_latency(bsize); + else + panic("writes to instruction memory not supported"); +} + + +/* + * TLB miss handlers + */ + +/* inst cache block miss handler function */ +static unsigned int /* latency of block access */ +itlb_access_fn(enum mem_cmd cmd, /* access cmd, Read or Write */ + md_addr_t baddr, /* block address to access */ + int bsize, /* size of block to access */ + struct cache_blk_t *blk, /* ptr to block in upper level */ + tick_t now) /* time of access */ +{ + md_addr_t *phy_page_ptr = (md_addr_t *)blk->user_data; + + /* no real memory access, however, should have user data space attached */ + assert(phy_page_ptr); + + /* fake translation, for now... */ + *phy_page_ptr = 0; + + /* return tlb miss latency */ + return tlb_miss_lat; +} + +/* data cache block miss handler function */ +static unsigned int /* latency of block access */ +dtlb_access_fn(enum mem_cmd cmd, /* access cmd, Read or Write */ + md_addr_t baddr, /* block address to access */ + int bsize, /* size of block to access */ + struct cache_blk_t *blk, /* ptr to block in upper level */ + tick_t now) /* time of access */ +{ + md_addr_t *phy_page_ptr = (md_addr_t *)blk->user_data; + + /* no real memory access, however, should have user data space attached */ + assert(phy_page_ptr); + + /* fake translation, for now... */ + *phy_page_ptr = 0; + + /* return tlb miss latency */ + return tlb_miss_lat; +} + + +/* register simulator-specific options */ +void +sim_reg_options(struct opt_odb_t *odb) +{ + opt_reg_header(odb, +"sim-outorder: This simulator implements a very detailed out-of-order issue\n" +"superscalar processor with a two-level memory system and speculative\n" +"execution support. 
This simulator is a performance simulator, tracking the\n" +"latency of all pipeline operations.\n" + ); + + /* instruction limit */ + + opt_reg_uint(odb, "-max:inst", "maximum number of inst's to execute", + &max_insts, /* default */0, + /* print */TRUE, /* format */NULL); + + /* trace options */ + + opt_reg_int(odb, "-fastfwd", "number of insts skipped before timing starts", + &fastfwd_count, /* default */0, + /* print */TRUE, /* format */NULL); + opt_reg_string_list(odb, "-ptrace", + "generate pipetrace, i.e., ", + ptrace_opts, /* arr_sz */2, &ptrace_nelt, /* default */NULL, + /* !print */FALSE, /* format */NULL, /* !accrue */FALSE); + + opt_reg_note(odb, +" Pipetrace range arguments are formatted as follows:\n" +"\n" +" {{@|#}}:{{@|#|+}}\n" +"\n" +" Both ends of the range are optional, if neither are specified, the entire\n" +" execution is traced. Ranges that start with a `@' designate an address\n" +" range to be traced, those that start with an `#' designate a cycle count\n" +" range. All other range values represent an instruction count range. The\n" +" second argument, if specified with a `+', indicates a value relative\n" +" to the first argument, e.g., 1000:+100 == 1000:1100. Program symbols may\n" +" be used in all contexts.\n" +"\n" +" Examples: -ptrace FOO.trc #0:#1000\n" +" -ptrace BAR.trc @2000:\n" +" -ptrace BLAH.trc :1500\n" +" -ptrace UXXE.trc :\n" +" -ptrace FOOBAR.trc @main:+278\n" + ); + + /* ifetch options */ + + opt_reg_int(odb, "-fetch:ifqsize", "instruction fetch queue size (in insts)", + &ruu_ifq_size, /* default */4, + /* print */TRUE, /* format */NULL); + + opt_reg_int(odb, "-fetch:mplat", "extra branch mis-prediction latency", + &ruu_branch_penalty, /* default */3, + /* print */TRUE, /* format */NULL); + + opt_reg_int(odb, "-fetch:speed", + "speed of front-end of machine relative to execution core", + &fetch_speed, /* default */1, + /* print */TRUE, /* format */NULL); + + /* branch predictor options */ + + opt_reg_note(odb, +" Branch predictor configuration examples for 2-level predictor:\n" +" Configurations: N, M, W, X\n" +" N # entries in first level (# of shift register(s))\n" +" W width of shift register(s)\n" +" M # entries in 2nd level (# of counters, or other FSM)\n" +" X (yes-1/no-0) xor history and address for 2nd level index\n" +" Sample predictors:\n" +" GAg : 1, W, 2^W, 0\n" +" GAp : 1, W, M (M > 2^W), 0\n" +" PAg : N, W, 2^W, 0\n" +" PAp : N, W, M (M == 2^(N+W)), 0\n" +" gshare : 1, W, 2^W, 1\n" +" Predictor `comb' combines a bimodal and a 2-level predictor.\n" + ); + + opt_reg_string(odb, "-bpred", + "branch predictor type {nottaken|taken|perfect|bimod|threebit|2lev|comb}", + &pred_type, /* default */"bimod", + /* print */TRUE, /* format */NULL); + + opt_reg_int_list(odb, "-bpred:bimod", + "bimodal predictor config ()", + bimod_config, bimod_nelt, &bimod_nelt, + /* default */bimod_config, + /* print */TRUE, /* format */NULL, /* !accrue */FALSE); + + opt_reg_int_list(odb, "-bpred:threebit", + "3-bit predictor config (
)", + threebit_config, threebit_nelt, &threebit_nelt, + /* default */threebit_config, + /* print */TRUE, /* format */NULL, /* !accrue */FALSE); + + opt_reg_int_list(odb, "-bpred:2lev", + "2-level predictor config " + "( )", + twolev_config, twolev_nelt, &twolev_nelt, + /* default */twolev_config, + /* print */TRUE, /* format */NULL, /* !accrue */FALSE); + + opt_reg_int_list(odb, "-bpred:comb", + "combining predictor config ()", + comb_config, comb_nelt, &comb_nelt, + /* default */comb_config, + /* print */TRUE, /* format */NULL, /* !accrue */FALSE); + + opt_reg_int(odb, "-bpred:ras", + "return address stack size (0 for no return stack)", + &ras_size, /* default */ras_size, + /* print */TRUE, /* format */NULL); + + opt_reg_int_list(odb, "-bpred:btb", + "BTB config ( )", + btb_config, btb_nelt, &btb_nelt, + /* default */btb_config, + /* print */TRUE, /* format */NULL, /* !accrue */FALSE); + + opt_reg_string(odb, "-bpred:spec_update", + "speculative predictors update in {ID|WB} (default non-spec)", + &bpred_spec_opt, /* default */NULL, + /* print */TRUE, /* format */NULL); + + /* decode options */ + + opt_reg_int(odb, "-decode:width", + "instruction decode B/W (insts/cycle)", + &ruu_decode_width, /* default */4, + /* print */TRUE, /* format */NULL); + + /* issue options */ + + opt_reg_int(odb, "-issue:width", + "instruction issue B/W (insts/cycle)", + &ruu_issue_width, /* default */4, + /* print */TRUE, /* format */NULL); + + opt_reg_flag(odb, "-issue:inorder", "run pipeline with in-order issue", + &ruu_inorder_issue, /* default */FALSE, + /* print */TRUE, /* format */NULL); + + opt_reg_flag(odb, "-issue:wrongpath", + "issue instructions down wrong execution paths", + &ruu_include_spec, /* default */TRUE, + /* print */TRUE, /* format */NULL); + + /* commit options */ + + opt_reg_int(odb, "-commit:width", + "instruction commit B/W (insts/cycle)", + &ruu_commit_width, /* default */4, + /* print */TRUE, /* format */NULL); + + /* register scheduler options */ + + opt_reg_int(odb, "-ruu:size", + "register update unit (RUU) size", + &RUU_size, /* default */16, + /* print */TRUE, /* format */NULL); + + /* memory scheduler options */ + + opt_reg_int(odb, "-lsq:size", + "load/store queue (LSQ) size", + &LSQ_size, /* default */8, + /* print */TRUE, /* format */NULL); + + /* cache options */ + + opt_reg_string(odb, "-cache:dl1", + "l1 data cache config, i.e., {|none}", + &cache_dl1_opt, "dl1:128:32:4:l", + /* print */TRUE, NULL); + + opt_reg_note(odb, +" The cache config parameter has the following format:\n" +"\n" +" ::::\n" +"\n" +" - name of the cache being defined\n" +" - number of sets in the cache\n" +" - block size of the cache\n" +" - associativity of the cache\n" +" - block replacement strategy, 'l'-LRU, 'f'-FIFO, 'r'-random\n" +"\n" +" Examples: -cache:dl1 dl1:4096:32:1:l\n" +" -dtlb dtlb:128:4096:32:r\n" + ); + + opt_reg_int(odb, "-cache:dl1lat", + "l1 data cache hit latency (in cycles)", + &cache_dl1_lat, /* default */1, + /* print */TRUE, /* format */NULL); + + opt_reg_string(odb, "-cache:dl2", + "l2 data cache config, i.e., {|none}", + &cache_dl2_opt, "ul2:1024:64:4:l", + /* print */TRUE, NULL); + + opt_reg_int(odb, "-cache:dl2lat", + "l2 data cache hit latency (in cycles)", + &cache_dl2_lat, /* default */6, + /* print */TRUE, /* format */NULL); + + opt_reg_string(odb, "-cache:il1", + "l1 inst cache config, i.e., {|dl1|dl2|none}", + &cache_il1_opt, "il1:512:32:1:l", + /* print */TRUE, NULL); + + opt_reg_note(odb, +" Cache levels can be unified by pointing a level of the instruction 
cache\n" +" hierarchy at the data cache hiearchy using the \"dl1\" and \"dl2\" cache\n" +" configuration arguments. Most sensible combinations are supported, e.g.,\n" +"\n" +" A unified l2 cache (il2 is pointed at dl2):\n" +" -cache:il1 il1:128:64:1:l -cache:il2 dl2\n" +" -cache:dl1 dl1:256:32:1:l -cache:dl2 ul2:1024:64:2:l\n" +"\n" +" Or, a fully unified cache hierarchy (il1 pointed at dl1):\n" +" -cache:il1 dl1\n" +" -cache:dl1 ul1:256:32:1:l -cache:dl2 ul2:1024:64:2:l\n" + ); + + opt_reg_int(odb, "-cache:il1lat", + "l1 instruction cache hit latency (in cycles)", + &cache_il1_lat, /* default */1, + /* print */TRUE, /* format */NULL); + + opt_reg_string(odb, "-cache:il2", + "l2 instruction cache config, i.e., {|dl2|none}", + &cache_il2_opt, "dl2", + /* print */TRUE, NULL); + + opt_reg_int(odb, "-cache:il2lat", + "l2 instruction cache hit latency (in cycles)", + &cache_il2_lat, /* default */6, + /* print */TRUE, /* format */NULL); + + opt_reg_flag(odb, "-cache:flush", "flush caches on system calls", + &flush_on_syscalls, /* default */FALSE, /* print */TRUE, NULL); + + opt_reg_flag(odb, "-cache:icompress", + "convert 64-bit inst addresses to 32-bit inst equivalents", + &compress_icache_addrs, /* default */FALSE, + /* print */TRUE, NULL); + + /* mem options */ + opt_reg_int_list(odb, "-mem:lat", + "memory access latency ( )", + mem_lat, mem_nelt, &mem_nelt, mem_lat, + /* print */TRUE, /* format */NULL, /* !accrue */FALSE); + + opt_reg_int(odb, "-mem:width", "memory access bus width (in bytes)", + &mem_bus_width, /* default */8, + /* print */TRUE, /* format */NULL); + + /* TLB options */ + + opt_reg_string(odb, "-tlb:itlb", + "instruction TLB config, i.e., {|none}", + &itlb_opt, "itlb:16:4096:4:l", /* print */TRUE, NULL); + + opt_reg_string(odb, "-tlb:dtlb", + "data TLB config, i.e., {|none}", + &dtlb_opt, "dtlb:32:4096:4:l", /* print */TRUE, NULL); + + opt_reg_int(odb, "-tlb:lat", + "inst/data TLB miss latency (in cycles)", + &tlb_miss_lat, /* default */30, + /* print */TRUE, /* format */NULL); + + /* resource configuration */ + + opt_reg_int(odb, "-res:ialu", + "total number of integer ALU's available", + &res_ialu, /* default */fu_config[FU_IALU_INDEX].quantity, + /* print */TRUE, /* format */NULL); + + opt_reg_int(odb, "-res:imult", + "total number of integer multiplier/dividers available", + &res_imult, /* default */fu_config[FU_IMULT_INDEX].quantity, + /* print */TRUE, /* format */NULL); + + opt_reg_int(odb, "-res:memport", + "total number of memory system ports available (to CPU)", + &res_memport, /* default */fu_config[FU_MEMPORT_INDEX].quantity, + /* print */TRUE, /* format */NULL); + + opt_reg_int(odb, "-res:fpalu", + "total number of floating point ALU's available", + &res_fpalu, /* default */fu_config[FU_FPALU_INDEX].quantity, + /* print */TRUE, /* format */NULL); + + opt_reg_int(odb, "-res:fpmult", + "total number of floating point multiplier/dividers available", + &res_fpmult, /* default */fu_config[FU_FPMULT_INDEX].quantity, + /* print */TRUE, /* format */NULL); + + opt_reg_string_list(odb, "-pcstat", + "profile stat(s) against text addr's (mult uses ok)", + pcstat_vars, MAX_PCSTAT_VARS, &pcstat_nelt, NULL, + /* !print */FALSE, /* format */NULL, /* accrue */TRUE); + + opt_reg_flag(odb, "-bugcompat", + "operate in backward-compatible bugs mode (for testing only)", + &bugcompat_mode, /* default */FALSE, /* print */TRUE, NULL); +} + +/* check simulator-specific option values */ +void +sim_check_options(struct opt_odb_t *odb, /* options database */ + int argc, char **argv) /* 
command line arguments */ +{ + char name[128], c; + int nsets, bsize, assoc; + + if (fastfwd_count < 0 || fastfwd_count >= 2147483647) + fatal("bad fast forward count: %d", fastfwd_count); + + if (ruu_ifq_size < 1 || (ruu_ifq_size & (ruu_ifq_size - 1)) != 0) + fatal("inst fetch queue size must be positive > 0 and a power of two"); + + if (ruu_branch_penalty < 1) + fatal("mis-prediction penalty must be at least 1 cycle"); + + if (fetch_speed < 1) + fatal("front-end speed must be positive and non-zero"); + + if (!mystricmp(pred_type, "perfect")) + { + /* perfect predictor */ + pred = NULL; + pred_perfect = TRUE; + } + else if (!mystricmp(pred_type, "taken")) + { + /* static predictor, not taken */ + pred = bpred_create(BPredTaken, 0, 0, 0, 0, 0, 0, 0, 0, 0); + } + else if (!mystricmp(pred_type, "nottaken")) + { + /* static predictor, taken */ + pred = bpred_create(BPredNotTaken, 0, 0, 0, 0, 0, 0, 0, 0, 0); + } + else if (!mystricmp(pred_type, "bimod")) + { + /* bimodal predictor, bpred_create() checks BTB_SIZE */ + if (bimod_nelt != 1) + fatal("bad bimod predictor config ()"); + if (btb_nelt != 2) + fatal("bad btb config ( )"); + + /* bimodal predictor, bpred_create() checks BTB_SIZE */ + pred = bpred_create(BPred2bit, + /* bimod table size */bimod_config[0], + /* 2lev l1 size */0, + /* 2lev l2 size */0, + /* meta table size */0, + /* history reg size */0, + /* history xor address */0, + /* btb sets */btb_config[0], + /* btb assoc */btb_config[1], + /* ret-addr stack size */ras_size); + } + else if (!mystricmp(pred_type, "threebit")) + { + /* 3-bit predictor, bpred_create() checks BTB_SIZE */ + if (threebit_nelt != 1) + fatal("bad 3-bit predictor config ()"); + if (btb_nelt != 2) + fatal("bad btb config ( )"); + + /* 3-bit predictor, bpred_create() checks BTB_SIZE */ + pred = bpred_create(BPred3bit, + /* threebit table size */threebit_config[0], + /* 2lev l1 size */0, + /* 2lev l2 size */0, + /* meta table size */0, + /* history reg size */0, + /* history xor address */0, + /* btb sets */btb_config[0], + /* btb assoc */btb_config[1], + /* ret-addr stack size */ras_size); + } + else if (!mystricmp(pred_type, "2lev")) + { + /* 2-level adaptive predictor, bpred_create() checks args */ + if (twolev_nelt != 4) + fatal("bad 2-level pred config ( )"); + if (btb_nelt != 2) + fatal("bad btb config ( )"); + + pred = bpred_create(BPred2Level, + /* bimod table size */0, + /* 2lev l1 size */twolev_config[0], + /* 2lev l2 size */twolev_config[1], + /* meta table size */0, + /* history reg size */twolev_config[2], + /* history xor address */twolev_config[3], + /* btb sets */btb_config[0], + /* btb assoc */btb_config[1], + /* ret-addr stack size */ras_size); + } + else if (!mystricmp(pred_type, "comb")) + { + /* combining predictor, bpred_create() checks args */ + if (twolev_nelt != 4) + fatal("bad 2-level pred config ( )"); + if (bimod_nelt != 1) + fatal("bad bimod predictor config ()"); + if (comb_nelt != 1) + fatal("bad combining predictor config ()"); + if (btb_nelt != 2) + fatal("bad btb config ( )"); + + pred = bpred_create(BPredComb, + /* bimod table size */bimod_config[0], + /* l1 size */twolev_config[0], + /* l2 size */twolev_config[1], + /* meta table size */comb_config[0], + /* history reg size */twolev_config[2], + /* history xor address */twolev_config[3], + /* btb sets */btb_config[0], + /* btb assoc */btb_config[1], + /* ret-addr stack size */ras_size); + } + else + fatal("cannot parse predictor type `%s'", pred_type); + + if (!bpred_spec_opt) + bpred_spec_update = spec_CT; + else if 
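/*
 * Standalone sketch of the (x & (x-1)) idiom used repeatedly by the option
 * checks in sim_check_options(): a non-zero value is a power of two exactly
 * when that expression is zero.  Toy demonstration only, not simulator code.
 */
#include <stdio.h>

static int is_nonzero_pow2(unsigned int x)
{
  return x != 0 && (x & (x - 1)) == 0;
}

int main(void)
{
  /* prints: 1 0 0 */
  printf("%d %d %d\n", is_nonzero_pow2(16), is_nonzero_pow2(12), is_nonzero_pow2(0));
  return 0;
}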
(!mystricmp(bpred_spec_opt, "ID")) + bpred_spec_update = spec_ID; + else if (!mystricmp(bpred_spec_opt, "WB")) + bpred_spec_update = spec_WB; + else + fatal("bad speculative update stage specifier, use {ID|WB}"); + + if (ruu_decode_width < 1 || (ruu_decode_width & (ruu_decode_width-1)) != 0) + fatal("issue width must be positive non-zero and a power of two"); + + if (ruu_issue_width < 1 || (ruu_issue_width & (ruu_issue_width-1)) != 0) + fatal("issue width must be positive non-zero and a power of two"); + + if (ruu_commit_width < 1) + fatal("commit width must be positive non-zero"); + + if (RUU_size < 2 || (RUU_size & (RUU_size-1)) != 0) + fatal("RUU size must be a positive number > 1 and a power of two"); + + if (LSQ_size < 2 || (LSQ_size & (LSQ_size-1)) != 0) + fatal("LSQ size must be a positive number > 1 and a power of two"); + + /* use a level 1 D-cache? */ + if (!mystricmp(cache_dl1_opt, "none")) + { + cache_dl1 = NULL; + + /* the level 2 D-cache cannot be defined */ + if (strcmp(cache_dl2_opt, "none")) + fatal("the l1 data cache must defined if the l2 cache is defined"); + cache_dl2 = NULL; + } + else /* dl1 is defined */ + { + if (sscanf(cache_dl1_opt, "%[^:]:%d:%d:%d:%c", + name, &nsets, &bsize, &assoc, &c) != 5) + fatal("bad l1 D-cache parms: ::::"); + cache_dl1 = cache_create(name, nsets, bsize, /* balloc */FALSE, + /* usize */0, assoc, cache_char2policy(c), + dl1_access_fn, /* hit lat */cache_dl1_lat); + + /* is the level 2 D-cache defined? */ + if (!mystricmp(cache_dl2_opt, "none")) + cache_dl2 = NULL; + else + { + if (sscanf(cache_dl2_opt, "%[^:]:%d:%d:%d:%c", + name, &nsets, &bsize, &assoc, &c) != 5) + fatal("bad l2 D-cache parms: " + "::::"); + cache_dl2 = cache_create(name, nsets, bsize, /* balloc */FALSE, + /* usize */0, assoc, cache_char2policy(c), + dl2_access_fn, /* hit lat */cache_dl2_lat); + } + } + + /* use a level 1 I-cache? */ + if (!mystricmp(cache_il1_opt, "none")) + { + cache_il1 = NULL; + + /* the level 2 I-cache cannot be defined */ + if (strcmp(cache_il2_opt, "none")) + fatal("the l1 inst cache must defined if the l2 cache is defined"); + cache_il2 = NULL; + } + else if (!mystricmp(cache_il1_opt, "dl1")) + { + if (!cache_dl1) + fatal("I-cache l1 cannot access D-cache l1 as it's undefined"); + cache_il1 = cache_dl1; + + /* the level 2 I-cache cannot be defined */ + if (strcmp(cache_il2_opt, "none")) + fatal("the l1 inst cache must defined if the l2 cache is defined"); + cache_il2 = NULL; + } + else if (!mystricmp(cache_il1_opt, "dl2")) + { + if (!cache_dl2) + fatal("I-cache l1 cannot access D-cache l2 as it's undefined"); + cache_il1 = cache_dl2; + + /* the level 2 I-cache cannot be defined */ + if (strcmp(cache_il2_opt, "none")) + fatal("the l1 inst cache must defined if the l2 cache is defined"); + cache_il2 = NULL; + } + else /* il1 is defined */ + { + if (sscanf(cache_il1_opt, "%[^:]:%d:%d:%d:%c", + name, &nsets, &bsize, &assoc, &c) != 5) + fatal("bad l1 I-cache parms: ::::"); + cache_il1 = cache_create(name, nsets, bsize, /* balloc */FALSE, + /* usize */0, assoc, cache_char2policy(c), + il1_access_fn, /* hit lat */cache_il1_lat); + + /* is the level 2 D-cache defined? 
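/*
 * Standalone sketch of parsing the <name>:<nsets>:<bsize>:<assoc>:<repl>
 * cache configuration string with sscanf(), mirroring the parsing done in
 * sim_check_options() above; error handling is simplified and the option
 * string is an illustrative example, not a recommended configuration.
 */
#include <stdio.h>

int main(void)
{
  const char *opt = "dl1:128:32:4:l";
  char name[128], repl;
  int nsets, bsize, assoc;

  if (sscanf(opt, "%[^:]:%d:%d:%d:%c", name, &nsets, &bsize, &assoc, &repl) != 5)
    {
      fprintf(stderr, "bad cache parms\n");
      return 1;
    }
  printf("%s: %d sets x %d-way, %d-byte blocks, repl `%c'\n",
         name, nsets, assoc, bsize, repl);
  return 0;
}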
*/ + if (!mystricmp(cache_il2_opt, "none")) + cache_il2 = NULL; + else if (!mystricmp(cache_il2_opt, "dl2")) + { + if (!cache_dl2) + fatal("I-cache l2 cannot access D-cache l2 as it's undefined"); + cache_il2 = cache_dl2; + } + else + { + if (sscanf(cache_il2_opt, "%[^:]:%d:%d:%d:%c", + name, &nsets, &bsize, &assoc, &c) != 5) + fatal("bad l2 I-cache parms: " + "::::"); + cache_il2 = cache_create(name, nsets, bsize, /* balloc */FALSE, + /* usize */0, assoc, cache_char2policy(c), + il2_access_fn, /* hit lat */cache_il2_lat); + } + } + + /* use an I-TLB? */ + if (!mystricmp(itlb_opt, "none")) + itlb = NULL; + else + { + if (sscanf(itlb_opt, "%[^:]:%d:%d:%d:%c", + name, &nsets, &bsize, &assoc, &c) != 5) + fatal("bad TLB parms: ::::"); + itlb = cache_create(name, nsets, bsize, /* balloc */FALSE, + /* usize */sizeof(md_addr_t), assoc, + cache_char2policy(c), itlb_access_fn, + /* hit latency */1); + } + + /* use a D-TLB? */ + if (!mystricmp(dtlb_opt, "none")) + dtlb = NULL; + else + { + if (sscanf(dtlb_opt, "%[^:]:%d:%d:%d:%c", + name, &nsets, &bsize, &assoc, &c) != 5) + fatal("bad TLB parms: ::::"); + dtlb = cache_create(name, nsets, bsize, /* balloc */FALSE, + /* usize */sizeof(md_addr_t), assoc, + cache_char2policy(c), dtlb_access_fn, + /* hit latency */1); + } + + if (cache_dl1_lat < 1) + fatal("l1 data cache latency must be greater than zero"); + + if (cache_dl2_lat < 1) + fatal("l2 data cache latency must be greater than zero"); + + if (cache_il1_lat < 1) + fatal("l1 instruction cache latency must be greater than zero"); + + if (cache_il2_lat < 1) + fatal("l2 instruction cache latency must be greater than zero"); + + if (mem_nelt != 2) + fatal("bad memory access latency ( )"); + + if (mem_lat[0] < 1 || mem_lat[1] < 1) + fatal("all memory access latencies must be greater than zero"); + + if (mem_bus_width < 1 || (mem_bus_width & (mem_bus_width-1)) != 0) + fatal("memory bus width must be positive non-zero and a power of two"); + + if (tlb_miss_lat < 1) + fatal("TLB miss latency must be greater than zero"); + + if (res_ialu < 1) + fatal("number of integer ALU's must be greater than zero"); + if (res_ialu > MAX_INSTS_PER_CLASS) + fatal("number of integer ALU's must be <= MAX_INSTS_PER_CLASS"); + fu_config[FU_IALU_INDEX].quantity = res_ialu; + + if (res_imult < 1) + fatal("number of integer multiplier/dividers must be greater than zero"); + if (res_imult > MAX_INSTS_PER_CLASS) + fatal("number of integer mult/div's must be <= MAX_INSTS_PER_CLASS"); + fu_config[FU_IMULT_INDEX].quantity = res_imult; + + if (res_memport < 1) + fatal("number of memory system ports must be greater than zero"); + if (res_memport > MAX_INSTS_PER_CLASS) + fatal("number of memory system ports must be <= MAX_INSTS_PER_CLASS"); + fu_config[FU_MEMPORT_INDEX].quantity = res_memport; + + if (res_fpalu < 1) + fatal("number of floating point ALU's must be greater than zero"); + if (res_fpalu > MAX_INSTS_PER_CLASS) + fatal("number of floating point ALU's must be <= MAX_INSTS_PER_CLASS"); + fu_config[FU_FPALU_INDEX].quantity = res_fpalu; + + if (res_fpmult < 1) + fatal("number of floating point multiplier/dividers must be > zero"); + if (res_fpmult > MAX_INSTS_PER_CLASS) + fatal("number of FP mult/div's must be <= MAX_INSTS_PER_CLASS"); + fu_config[FU_FPMULT_INDEX].quantity = res_fpmult; +} + +/* print simulator-specific configuration information */ +void +sim_aux_config(FILE *stream) /* output stream */ +{ + /* nada */ +} + +/* register simulator-specific statistics */ +void +sim_reg_stats(struct stat_sdb_t *sdb) /* stats 
database */ +{ + int i; + stat_reg_counter(sdb, "sim_num_insn", + "total number of instructions committed", + &sim_num_insn, sim_num_insn, NULL); + stat_reg_counter(sdb, "sim_num_refs", + "total number of loads and stores committed", + &sim_num_refs, 0, NULL); + stat_reg_counter(sdb, "sim_num_loads", + "total number of loads committed", + &sim_num_loads, 0, NULL); + stat_reg_formula(sdb, "sim_num_stores", + "total number of stores committed", + "sim_num_refs - sim_num_loads", NULL); + stat_reg_counter(sdb, "sim_num_branches", + "total number of branches committed", + &sim_num_branches, /* initial value */0, /* format */NULL); + stat_reg_int(sdb, "sim_elapsed_time", + "total simulation time in seconds", + &sim_elapsed_time, 0, NULL); + stat_reg_formula(sdb, "sim_inst_rate", + "simulation speed (in insts/sec)", + "sim_num_insn / sim_elapsed_time", NULL); + + stat_reg_counter(sdb, "sim_total_insn", + "total number of instructions executed", + &sim_total_insn, 0, NULL); + stat_reg_counter(sdb, "sim_total_refs", + "total number of loads and stores executed", + &sim_total_refs, 0, NULL); + stat_reg_counter(sdb, "sim_total_loads", + "total number of loads executed", + &sim_total_loads, 0, NULL); + stat_reg_formula(sdb, "sim_total_stores", + "total number of stores executed", + "sim_total_refs - sim_total_loads", NULL); + stat_reg_counter(sdb, "sim_total_branches", + "total number of branches executed", + &sim_total_branches, /* initial value */0, /* format */NULL); + + /* register performance stats */ + stat_reg_counter(sdb, "sim_cycle", + "total simulation time in cycles", + &sim_cycle, /* initial value */0, /* format */NULL); + stat_reg_formula(sdb, "sim_IPC", + "instructions per cycle", + "sim_num_insn / sim_cycle", /* format */NULL); + stat_reg_formula(sdb, "sim_CPI", + "cycles per instruction", + "sim_cycle / sim_num_insn", /* format */NULL); + stat_reg_formula(sdb, "sim_exec_BW", + "total instructions (mis-spec + committed) per cycle", + "sim_total_insn / sim_cycle", /* format */NULL); + stat_reg_formula(sdb, "sim_IPB", + "instruction per branch", + "sim_num_insn / sim_num_branches", /* format */NULL); + + /* occupancy stats */ + stat_reg_counter(sdb, "IFQ_count", "cumulative IFQ occupancy", + &IFQ_count, /* initial value */0, /* format */NULL); + stat_reg_counter(sdb, "IFQ_fcount", "cumulative IFQ full count", + &IFQ_fcount, /* initial value */0, /* format */NULL); + stat_reg_formula(sdb, "ifq_occupancy", "avg IFQ occupancy (insn's)", + "IFQ_count / sim_cycle", /* format */NULL); + stat_reg_formula(sdb, "ifq_rate", "avg IFQ dispatch rate (insn/cycle)", + "sim_total_insn / sim_cycle", /* format */NULL); + stat_reg_formula(sdb, "ifq_latency", "avg IFQ occupant latency (cycle's)", + "ifq_occupancy / ifq_rate", /* format */NULL); + stat_reg_formula(sdb, "ifq_full", "fraction of time (cycle's) IFQ was full", + "IFQ_fcount / sim_cycle", /* format */NULL); + + stat_reg_counter(sdb, "RUU_count", "cumulative RUU occupancy", + &RUU_count, /* initial value */0, /* format */NULL); + stat_reg_counter(sdb, "RUU_fcount", "cumulative RUU full count", + &RUU_fcount, /* initial value */0, /* format */NULL); + stat_reg_formula(sdb, "ruu_occupancy", "avg RUU occupancy (insn's)", + "RUU_count / sim_cycle", /* format */NULL); + stat_reg_formula(sdb, "ruu_rate", "avg RUU dispatch rate (insn/cycle)", + "sim_total_insn / sim_cycle", /* format */NULL); + stat_reg_formula(sdb, "ruu_latency", "avg RUU occupant latency (cycle's)", + "ruu_occupancy / ruu_rate", /* format */NULL); + stat_reg_formula(sdb, "ruu_full", 
"fraction of time (cycle's) RUU was full", + "RUU_fcount / sim_cycle", /* format */NULL); + + stat_reg_counter(sdb, "LSQ_count", "cumulative LSQ occupancy", + &LSQ_count, /* initial value */0, /* format */NULL); + stat_reg_counter(sdb, "LSQ_fcount", "cumulative LSQ full count", + &LSQ_fcount, /* initial value */0, /* format */NULL); + stat_reg_formula(sdb, "lsq_occupancy", "avg LSQ occupancy (insn's)", + "LSQ_count / sim_cycle", /* format */NULL); + stat_reg_formula(sdb, "lsq_rate", "avg LSQ dispatch rate (insn/cycle)", + "sim_total_insn / sim_cycle", /* format */NULL); + stat_reg_formula(sdb, "lsq_latency", "avg LSQ occupant latency (cycle's)", + "lsq_occupancy / lsq_rate", /* format */NULL); + stat_reg_formula(sdb, "lsq_full", "fraction of time (cycle's) LSQ was full", + "LSQ_fcount / sim_cycle", /* format */NULL); + + stat_reg_counter(sdb, "sim_slip", + "total number of slip cycles", + &sim_slip, 0, NULL); + /* register baseline stats */ + stat_reg_formula(sdb, "avg_sim_slip", + "the average slip between issue and retirement", + "sim_slip / sim_num_insn", NULL); + + /* register predictor stats */ + if (pred) + bpred_reg_stats(pred, sdb); + + /* register cache stats */ + if (cache_il1 + && (cache_il1 != cache_dl1 && cache_il1 != cache_dl2)) + cache_reg_stats(cache_il1, sdb); + if (cache_il2 + && (cache_il2 != cache_dl1 && cache_il2 != cache_dl2)) + cache_reg_stats(cache_il2, sdb); + if (cache_dl1) + cache_reg_stats(cache_dl1, sdb); + if (cache_dl2) + cache_reg_stats(cache_dl2, sdb); + if (itlb) + cache_reg_stats(itlb, sdb); + if (dtlb) + cache_reg_stats(dtlb, sdb); + + /* debug variable(s) */ + stat_reg_counter(sdb, "sim_invalid_addrs", + "total non-speculative bogus addresses seen (debug var)", + &sim_invalid_addrs, /* initial value */0, /* format */NULL); + + for (i=0; isc != sc_int && stat->sc != sc_uint && stat->sc != sc_counter) + fatal("`-pcstat' statistical variable `%s' is not an integral type", + stat->name); + + /* register this stat */ + pcstat_stats[i] = stat; + pcstat_lastvals[i] = STATVAL(stat); + + /* declare the sparce text distribution */ + sprintf(buf, "%s_by_pc", stat->name); + sprintf(buf1, "%s (by text address)", stat->desc); + pcstat_sdists[i] = stat_reg_sdist(sdb, buf, buf1, + /* initial value */0, + /* print format */(PF_COUNT|PF_PDF), + /* format */"0x%lx %lu %.2f", + /* print fn */NULL); + } + ld_reg_stats(sdb); + mem_reg_stats(mem, sdb); +} + +/* forward declarations */ +static void ruu_init(void); +static void lsq_init(void); +static void rslink_init(int nlinks); +static void eventq_init(void); +static void readyq_init(void); +static void cv_init(void); +static void tracer_init(void); +static void fetch_init(void); + +/* initialize the simulator */ +void +sim_init(void) +{ + sim_num_refs = 0; + + /* allocate and initialize register file */ + regs_init(®s); + + /* allocate and initialize memory space */ + mem = mem_create("mem"); + mem_init(mem); +} + +/* default register state accessor, used by DLite */ +static char * /* err str, NULL for no err */ +simoo_reg_obj(struct regs_t *regs, /* registers to access */ + int is_write, /* access type */ + enum md_reg_type rt, /* reg bank to probe */ + int reg, /* register number */ + struct eval_value_t *val); /* input, output */ + +/* default memory state accessor, used by DLite */ +static char * /* err str, NULL for no err */ +simoo_mem_obj(struct mem_t *mem, /* memory space to access */ + int is_write, /* access type */ + md_addr_t addr, /* address to access */ + char *p, /* input/output buffer */ + int nbytes); /* 
size of access */ + +/* default machine state accessor, used by DLite */ +static char * /* err str, NULL for no err */ +simoo_mstate_obj(FILE *stream, /* output stream */ + char *cmd, /* optional command string */ + struct regs_t *regs, /* registers to access */ + struct mem_t *mem); /* memory space to access */ + +/* total RS links allocated at program start */ +#define MAX_RS_LINKS 4096 + +/* load program into simulated state */ +void +sim_load_prog(char *fname, /* program to load */ + int argc, char **argv, /* program arguments */ + char **envp) /* program environment */ +{ + /* load program text and data, set up environment, memory, and regs */ + ld_load_prog(fname, argc, argv, envp, ®s, mem, TRUE); + + /* initialize here, so symbols can be loaded */ + if (ptrace_nelt == 2) + { + /* generate a pipeline trace */ + ptrace_open(/* fname */ptrace_opts[0], /* range */ptrace_opts[1]); + } + else if (ptrace_nelt == 0) + { + /* no pipetracing */; + } + else + fatal("bad pipetrace args, use: "); + + /* finish initialization of the simulation engine */ + fu_pool = res_create_pool("fu-pool", fu_config, N_ELT(fu_config)); + rslink_init(MAX_RS_LINKS); + tracer_init(); + fetch_init(); + cv_init(); + eventq_init(); + readyq_init(); + ruu_init(); + lsq_init(); + + /* initialize the DLite debugger */ + dlite_init(simoo_reg_obj, simoo_mem_obj, simoo_mstate_obj); +} + +/* dump simulator-specific auxiliary simulator statistics */ +void +sim_aux_stats(FILE *stream) /* output stream */ +{ + /* nada */ +} + +/* un-initialize the simulator */ +void +sim_uninit(void) +{ + if (ptrace_nelt > 0) + ptrace_close(); +} + + +/* + * processor core definitions and declarations + */ + +/* inst tag type, used to tag an operation instance in the RUU */ +typedef unsigned int INST_TAG_TYPE; + +/* inst sequence type, used to order instructions in the ready list, if + this rolls over the ready list order temporarily will get messed up, + but execution will continue and complete correctly */ +typedef unsigned int INST_SEQ_TYPE; + + +/* total input dependencies possible */ +#define MAX_IDEPS 3 + +/* total output dependencies possible */ +#define MAX_ODEPS 2 + +/* a register update unit (RUU) station, this record is contained in the + processors RUU, which serves as a collection of ordered reservations + stations. The reservation stations capture register results and await + the time when all operands are ready, at which time the instruction is + issued to the functional units; the RUU is an order circular queue, in which + instructions are inserted in fetch (program) order, results are stored in + the RUU buffers, and later when an RUU entry is the oldest entry in the + machines, it and its instruction's value is retired to the architectural + register file in program order, NOTE: the RUU and LSQ share the same + structure, this is useful because loads and stores are split into two + operations: an effective address add and a load/store, the add is inserted + into the RUU and the load/store inserted into the LSQ, allowing the add + to wake up the load/store when effective address computation has finished */ +struct RUU_station { + /* inst info */ + md_inst_t IR; /* instruction bits */ + enum md_opcode op; /* decoded instruction opcode */ + md_addr_t PC, next_PC, pred_PC; /* inst PC, next PC, predicted PC */ + int in_LSQ; /* non-zero if op is in LSQ */ + int ea_comp; /* non-zero if op is an addr comp */ + int recover_inst; /* start of mis-speculation? 
*/ + int stack_recover_idx; /* non-speculative TOS for RSB pred */ + struct bpred_update_t dir_update; /* bpred direction update info */ + int spec_mode; /* non-zero if issued in spec_mode */ + md_addr_t addr; /* effective address for ld/st's */ + INST_TAG_TYPE tag; /* RUU slot tag, increment to + squash operation */ + INST_SEQ_TYPE seq; /* instruction sequence, used to + sort the ready list and tag inst */ + unsigned int ptrace_seq; /* pipetrace sequence number */ + int slip; + /* instruction status */ + int queued; /* operands ready and queued */ + int issued; /* operation is/was executing */ + int completed; /* operation has completed execution */ + /* output operand dependency list, these lists are used to + limit the number of associative searches into the RUU when + instructions complete and need to wake up dependent insts */ + int onames[MAX_ODEPS]; /* output logical names (NA=unused) */ + struct RS_link *odep_list[MAX_ODEPS]; /* chains to consuming operations */ + + /* input dependent links, the output chains rooted above use these + fields to mark input operands as ready, when all these fields have + been set non-zero, the RUU operation has all of its register + operands, it may commence execution as soon as all of its memory + operands are known to be read (see lsq_refresh() for details on + enforcing memory dependencies) */ + int idep_ready[MAX_IDEPS]; /* input operand ready? */ +}; + +/* non-zero if all register operands are ready, update with MAX_IDEPS */ +#define OPERANDS_READY(RS) \ + ((RS)->idep_ready[0] && (RS)->idep_ready[1] && (RS)->idep_ready[2]) + +/* register update unit, combination of reservation stations and reorder + buffer device, organized as a circular queue */ +static struct RUU_station *RUU; /* register update unit */ +static int RUU_head, RUU_tail; /* RUU head and tail pointers */ +static int RUU_num; /* num entries currently in RUU */ + +/* allocate and initialize register update unit (RUU) */ +static void +ruu_init(void) +{ + RUU = calloc(RUU_size, sizeof(struct RUU_station)); + if (!RUU) + fatal("out of virtual memory"); + + RUU_num = 0; + RUU_head = RUU_tail = 0; + RUU_count = 0; + RUU_fcount = 0; +} + +/* dump the contents of the RUU */ +static void +ruu_dumpent(struct RUU_station *rs, /* ptr to RUU station */ + int index, /* entry index */ + FILE *stream, /* output stream */ + int header) /* print header? */ +{ + if (!stream) + stream = stderr; + + if (header) + fprintf(stream, "idx: %2d: opcode: %s, inst: `", + index, MD_OP_NAME(rs->op)); + else + fprintf(stream, " opcode: %s, inst: `", + MD_OP_NAME(rs->op)); + md_print_insn(rs->IR, rs->PC, stream); + fprintf(stream, "'\n"); + myfprintf(stream, " PC: 0x%08p, NPC: 0x%08p (pred_PC: 0x%08p)\n", + rs->PC, rs->next_PC, rs->pred_PC); + fprintf(stream, " in_LSQ: %s, ea_comp: %s, recover_inst: %s\n", + rs->in_LSQ ? "t" : "f", + rs->ea_comp ? "t" : "f", + rs->recover_inst ? "t" : "f"); + myfprintf(stream, " spec_mode: %s, addr: 0x%08p, tag: 0x%08x\n", + rs->spec_mode ? "t" : "f", rs->addr, rs->tag); + fprintf(stream, " seq: 0x%08x, ptrace_seq: 0x%08x\n", + rs->seq, rs->ptrace_seq); + fprintf(stream, " queued: %s, issued: %s, completed: %s\n", + rs->queued ? "t" : "f", + rs->issued ? "t" : "f", + rs->completed ? "t" : "f"); + fprintf(stream, " operands ready: %s\n", + OPERANDS_READY(rs) ? 
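/*
 * Minimal sketch of the circular-queue arithmetic used for the RUU (and the
 * LSQ, which shares the same structure): entries are allocated at the tail
 * and retired from the head, both advancing modulo the queue size.  This is
 * a standalone toy, not the RUU_station bookkeeping itself.
 */
#include <stdio.h>

#define SK_QSIZE 8

int main(void)
{
  int head = 0, tail = 0, num = 0;

  /* dispatch three entries at the tail */
  for (int i = 0; i < 3; i++) { tail = (tail + 1) % SK_QSIZE; num++; }

  /* commit one entry from the head */
  head = (head + 1) % SK_QSIZE; num--;

  printf("head=%d tail=%d num=%d\n", head, tail, num);  /* head=1 tail=3 num=2 */
  return 0;
}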
"t" : "f"); +} + +/* dump the contents of the RUU */ +static void +ruu_dump(FILE *stream) /* output stream */ +{ + int num, head; + struct RUU_station *rs; + + if (!stream) + stream = stderr; + + fprintf(stream, "** RUU state **\n"); + fprintf(stream, "RUU_head: %d, RUU_tail: %d\n", RUU_head, RUU_tail); + fprintf(stream, "RUU_num: %d\n", RUU_num); + + num = RUU_num; + head = RUU_head; + while (num) + { + rs = &RUU[head]; + ruu_dumpent(rs, rs - RUU, stream, /* header */TRUE); + head = (head + 1) % RUU_size; + num--; + } +} + +/* + * load/store queue (LSQ): holds loads and stores in program order, indicating + * status of load/store access: + * + * - issued: address computation complete, memory access in progress + * - completed: memory access has completed, stored value available + * - squashed: memory access was squashed, ignore this entry + * + * loads may execute when: + * 1) register operands are ready, and + * 2) memory operands are ready (no earlier unresolved store) + * + * loads are serviced by: + * 1) previous store at same address in LSQ (hit latency), or + * 2) data cache (hit latency + miss latency) + * + * stores may execute when: + * 1) register operands are ready + * + * stores are serviced by: + * 1) depositing store value into the load/store queue + * 2) writing store value to the store buffer (plus tag check) at commit + * 3) writing store buffer entry to data cache when cache is free + * + * NOTE: the load/store queue can bypass a store value to a load in the same + * cycle the store executes (using a bypass network), thus stores complete + * in effective zero time after their effective address is known + */ +static struct RUU_station *LSQ; /* load/store queue */ +static int LSQ_head, LSQ_tail; /* LSQ head and tail pointers */ +static int LSQ_num; /* num entries currently in LSQ */ + +/* + * input dependencies for stores in the LSQ: + * idep #0 - operand input (value that is store'd) + * idep #1 - effective address input (address of store operation) + */ +#define STORE_OP_INDEX 0 +#define STORE_ADDR_INDEX 1 + +#define STORE_OP_READY(RS) ((RS)->idep_ready[STORE_OP_INDEX]) +#define STORE_ADDR_READY(RS) ((RS)->idep_ready[STORE_ADDR_INDEX]) + +/* allocate and initialize the load/store queue (LSQ) */ +static void +lsq_init(void) +{ + LSQ = calloc(LSQ_size, sizeof(struct RUU_station)); + if (!LSQ) + fatal("out of virtual memory"); + + LSQ_num = 0; + LSQ_head = LSQ_tail = 0; + LSQ_count = 0; + LSQ_fcount = 0; +} + +/* dump the contents of the RUU */ +static void +lsq_dump(FILE *stream) /* output stream */ +{ + int num, head; + struct RUU_station *rs; + + if (!stream) + stream = stderr; + + fprintf(stream, "** LSQ state **\n"); + fprintf(stream, "LSQ_head: %d, LSQ_tail: %d\n", LSQ_head, LSQ_tail); + fprintf(stream, "LSQ_num: %d\n", LSQ_num); + + num = LSQ_num; + head = LSQ_head; + while (num) + { + rs = &LSQ[head]; + ruu_dumpent(rs, rs - LSQ, stream, /* header */TRUE); + head = (head + 1) % LSQ_size; + num--; + } +} + + +/* + * RS_LINK defs and decls + */ + +/* a reservation station link: this structure links elements of a RUU + reservation station list; used for ready instruction queue, event queue, and + output dependency lists; each RS_LINK node contains a pointer to the RUU + entry it references along with an instance tag, the RS_LINK is only valid if + the instruction instance tag matches the instruction RUU entry instance tag; + this strategy allows entries in the RUU can be squashed and reused without + updating the lists that point to it, which significantly improves the 
+   performance of (all too frequent) squash events */
+struct RS_link {
+  struct RS_link *next;                 /* next entry in list */
+  struct RUU_station *rs;               /* referenced RUU resv station */
+  INST_TAG_TYPE tag;                    /* inst instance sequence number */
+  union {
+    tick_t when;                        /* time stamp of entry (for eventq) */
+    INST_SEQ_TYPE seq;                  /* inst sequence */
+    int opnum;                          /* input/output operand number */
+  } x;
+};
+
+/* RS link free list, grab RS_LINKs from here, when needed */
+static struct RS_link *rslink_free_list;
+
+/* NULL value for an RS link */
+#define RSLINK_NULL_DATA                { NULL, NULL, 0 }
+static struct RS_link RSLINK_NULL = RSLINK_NULL_DATA;
+
+/* create and initialize an RS link */
+#define RSLINK_INIT(RSL, RS)                                            \
+  ((RSL).next = NULL, (RSL).rs = (RS), (RSL).tag = (RS)->tag)
+
+/* non-zero if RS link is NULL */
+#define RSLINK_IS_NULL(LINK)            ((LINK)->rs == NULL)
+
+/* non-zero if RS link is to a valid (non-squashed) entry */
+#define RSLINK_VALID(LINK)              ((LINK)->tag == (LINK)->rs->tag)
+
+/* extra RUU reservation station pointer */
+#define RSLINK_RS(LINK)                 ((LINK)->rs)
+
+/* get a new RS link record */
+#define RSLINK_NEW(DST, RS)                                             \
+  { struct RS_link *n_link;                                             \
+    if (!rslink_free_list)                                              \
+      panic("out of rs links");                                         \
+    n_link = rslink_free_list;                                          \
+    rslink_free_list = rslink_free_list->next;                          \
+    n_link->next = NULL;                                                \
+    n_link->rs = (RS); n_link->tag = n_link->rs->tag;                   \
+    (DST) = n_link;                                                     \
+  }
+
+/* free an RS link record */
+#define RSLINK_FREE(LINK)                                               \
+  { struct RS_link *f_link = (LINK);                                    \
+    f_link->rs = NULL; f_link->tag = 0;                                 \
+    f_link->next = rslink_free_list;                                    \
+    rslink_free_list = f_link;                                          \
+  }
+
+/* FIXME: could this be faster!!! */
+/* free an RS link list */
+#define RSLINK_FREE_LIST(LINK)                                          \
+  { struct RS_link *fl_link, *fl_link_next;                             \
+    for (fl_link=(LINK); fl_link; fl_link=fl_link_next)                 \
+      {                                                                 \
+        fl_link_next = fl_link->next;                                   \
+        RSLINK_FREE(fl_link);                                           \
+      }                                                                 \
+  }
+
+/* initialize the free RS_LINK pool */
+static void
+rslink_init(int nlinks)                 /* total number of RS_LINK available */
+{
+  int i;
+  struct RS_link *link;
+
+  rslink_free_list = NULL;
+  for (i=0; i<nlinks; i++)
+    {
+      link = calloc(1, sizeof(struct RS_link));
+      if (!link)
+        fatal("out of virtual memory");
+      link->next = rslink_free_list;
+      rslink_free_list = link;
+    }
+}
+
+/* service all functional unit release events, this function is called
+   once per cycle, and it is used to step the BUSY timers attached to each
+   functional unit in the function unit resource pool, as long as a functional
+   unit's BUSY count is > 0, it cannot be issued an operation */
+static void
+ruu_release_fu(void)
+{
+  int i;
+
+  /* walk all resource units, decrement busy counts by one */
+  for (i=0; i<fu_pool->num_resources; i++)
+    {
+      /* resource is released when BUSY hits zero */
+      if (fu_pool->resources[i].busy > 0)
+        fu_pool->resources[i].busy--;
+    }
+}
+
+
+/*
+ * the execution unit event queue implementation follows, the event queue
+ * indicates which instruction will complete next, the writeback handler
+ * drains this queue
+ */
+
+/* pending event queue, sorted from soonest to latest event (in time), NOTE:
+   RS_LINK nodes are used for the event queue list so that it need not be
+   updated during squash events */
+static struct RS_link *event_queue;
+
+/* initialize the event queue structures */
+static void
+eventq_init(void)
+{
+  event_queue = NULL;
+}
+
+/* dump the contents of the event queue */
+static void
+eventq_dump(FILE *stream)               /* output stream */
+{
+  struct RS_link *ev;
+
+  if (!stream)
+    stream = stderr;
+
+  fprintf(stream, "** event queue state **\n");
+
+  for (ev = event_queue; ev != NULL; ev = ev->next)
+    {
+      /* is event still valid? 
*/ + if (RSLINK_VALID(ev)) + { + struct RUU_station *rs = RSLINK_RS(ev); + + fprintf(stream, "idx: %2d: @ %.0f\n", + (int)(rs - (rs->in_LSQ ? LSQ : RUU)), (double)ev->x.when); + ruu_dumpent(rs, rs - (rs->in_LSQ ? LSQ : RUU), + stream, /* !header */FALSE); + } + } +} + +/* insert an event for RS into the event queue, event queue is sorted from + earliest to latest event, event and associated side-effects will be + apparent at the start of cycle WHEN */ +static void +eventq_queue_event(struct RUU_station *rs, tick_t when) +{ + struct RS_link *prev, *ev, *new_ev; + + if (rs->completed) + panic("event completed"); + + if (when <= sim_cycle) + panic("event occurred in the past"); + + /* get a free event record */ + RSLINK_NEW(new_ev, rs); + new_ev->x.when = when; + + /* locate insertion point */ + for (prev=NULL, ev=event_queue; + ev && ev->x.when < when; + prev=ev, ev=ev->next); + + if (prev) + { + /* insert middle or end */ + new_ev->next = prev->next; + prev->next = new_ev; + } + else + { + /* insert at beginning */ + new_ev->next = event_queue; + event_queue = new_ev; + } +} + +/* return the next event that has already occurred, returns NULL when no + remaining events or all remaining events are in the future */ +static struct RUU_station * +eventq_next_event(void) +{ + struct RS_link *ev; + + if (event_queue && event_queue->x.when <= sim_cycle) + { + /* unlink and return first event on priority list */ + ev = event_queue; + event_queue = event_queue->next; + + /* event still valid? */ + if (RSLINK_VALID(ev)) + { + struct RUU_station *rs = RSLINK_RS(ev); + + /* reclaim event record */ + RSLINK_FREE(ev); + + /* event is valid, return resv station */ + return rs; + } + else + { + /* reclaim event record */ + RSLINK_FREE(ev); + + /* receiving inst was squashed, return next event */ + return eventq_next_event(); + } + } + else + { + /* no event or no event is ready */ + return NULL; + } +} + + +/* + * the ready instruction queue implementation follows, the ready instruction + * queue indicates which instruction have all of there *register* dependencies + * satisfied, instruction will issue when 1) all memory dependencies for + * the instruction have been satisfied (see lsq_refresh() for details on how + * this is accomplished) and 2) resources are available; ready queue is fully + * constructed each cycle before any operation is issued from it -- this + * ensures that instruction issue priorities are properly observed; NOTE: + * RS_LINK nodes are used for the event queue list so that it need not be + * updated during squash events + */ + +/* the ready instruction queue */ +static struct RS_link *ready_queue; + +/* initialize the event queue structures */ +static void +readyq_init(void) +{ + ready_queue = NULL; +} + +/* dump the contents of the ready queue */ +static void +readyq_dump(FILE *stream) /* output stream */ +{ + struct RS_link *link; + + if (!stream) + stream = stderr; + + fprintf(stream, "** ready queue state **\n"); + + for (link = ready_queue; link != NULL; link = link->next) + { + /* is entry still valid? */ + if (RSLINK_VALID(link)) + { + struct RUU_station *rs = RSLINK_RS(link); + + ruu_dumpent(rs, rs - (rs->in_LSQ ? 
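/*
 * Standalone sketch of the sorted insertion performed by eventq_queue_event()
 * above: walk the list until the first entry with a later timestamp and link
 * the new event in front of it.  Simplified node type, no free-list reuse.
 */
#include <stdio.h>
#include <stdlib.h>

struct sk_ev { unsigned long when; struct sk_ev *next; };

static void sk_insert_sorted(struct sk_ev **q, unsigned long when)
{
  struct sk_ev *n = malloc(sizeof *n), *prev = NULL, *cur = *q;
  if (!n)
    exit(1);
  n->when = when;
  while (cur && cur->when < when) { prev = cur; cur = cur->next; }
  n->next = cur;
  if (prev) prev->next = n; else *q = n;
}

int main(void)
{
  struct sk_ev *q = NULL;
  sk_insert_sorted(&q, 40); sk_insert_sorted(&q, 10); sk_insert_sorted(&q, 25);
  for (struct sk_ev *e = q; e; e = e->next)
    printf("%lu ", e->when);            /* prints: 10 25 40 */
  printf("\n");
  return 0;
}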
LSQ : RUU), + stream, /* header */TRUE); + } + } +} + +/* insert ready node into the ready list using ready instruction scheduling + policy; currently the following scheduling policy is enforced: + + memory and long latency operands, and branch instructions first + + then + + all other instructions, oldest instructions first + + this policy works well because branches pass through the machine quicker + which works to reduce branch misprediction latencies, and very long latency + instructions (such loads and multiplies) get priority since they are very + likely on the program's critical path */ +static void +readyq_enqueue(struct RUU_station *rs) /* RS to enqueue */ +{ + struct RS_link *prev, *node, *new_node; + + /* node is now queued */ + if (rs->queued) + panic("node is already queued"); + rs->queued = TRUE; + + /* get a free ready list node */ + RSLINK_NEW(new_node, rs); + new_node->x.seq = rs->seq; + + /* locate insertion point */ + if (rs->in_LSQ || MD_OP_FLAGS(rs->op) & (F_LONGLAT|F_CTRL)) + { + /* insert loads/stores and long latency ops at the head of the queue */ + prev = NULL; + node = ready_queue; + } + else + { + /* otherwise insert in program order (earliest seq first) */ + for (prev=NULL, node=ready_queue; + node && node->x.seq < rs->seq; + prev=node, node=node->next); + } + + if (prev) + { + /* insert middle or end */ + new_node->next = prev->next; + prev->next = new_node; + } + else + { + /* insert at beginning */ + new_node->next = ready_queue; + ready_queue = new_node; + } +} + + +/* + * the create vector maps a logical register to a creator in the RUU (and + * specific output operand) or the architected register file (if RS_link + * is NULL) + */ + +/* an entry in the create vector */ +struct CV_link { + struct RUU_station *rs; /* creator's reservation station */ + int odep_num; /* specific output operand */ +}; + +/* a NULL create vector entry */ +static struct CV_link CVLINK_NULL = { NULL, 0 }; + +/* get a new create vector link */ +#define CVLINK_INIT(CV, RS,ONUM) ((CV).rs = (RS), (CV).odep_num = (ONUM)) + +/* size of the create vector (one entry per architected register) */ +#define CV_BMAP_SZ (BITMAP_SIZE(MD_TOTAL_REGS)) + +/* the create vector, NOTE: speculative copy on write storage provided + for fast recovery during wrong path execute (see tracer_recover() for + details on this process */ +static BITMAP_TYPE(MD_TOTAL_REGS, use_spec_cv); +static struct CV_link create_vector[MD_TOTAL_REGS]; +static struct CV_link spec_create_vector[MD_TOTAL_REGS]; + +/* these arrays shadow the create vector an indicate when a register was + last created */ +static tick_t create_vector_rt[MD_TOTAL_REGS]; +static tick_t spec_create_vector_rt[MD_TOTAL_REGS]; + +/* read a create vector entry */ +#define CREATE_VECTOR(N) (BITMAP_SET_P(use_spec_cv, CV_BMAP_SZ, (N))\ + ? spec_create_vector[N] \ + : create_vector[N]) + +/* read a create vector timestamp entry */ +#define CREATE_VECTOR_RT(N) (BITMAP_SET_P(use_spec_cv, CV_BMAP_SZ, (N))\ + ? spec_create_vector_rt[N] \ + : create_vector_rt[N]) + +/* set a create vector entry */ +#define SET_CREATE_VECTOR(N, L) (spec_mode \ + ? 
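/*
 * Sketch of the readyq_enqueue() policy above: loads/stores, long-latency
 * operations, and branches are pushed to the front of the ready list, while
 * everything else is inserted in program (sequence) order.  The flag values
 * below are toy stand-ins for MD_OP_FLAGS, not the machine definition.
 */
#include <stdio.h>

enum { SK_LONGLAT = 1, SK_CTRL = 2 };

static int sk_goes_to_head(int in_lsq, int flags)
{
  return in_lsq || (flags & (SK_LONGLAT | SK_CTRL));
}

int main(void)
{
  printf("%d %d %d\n",
         sk_goes_to_head(1, 0),         /* load/store entry: 1 (front of queue) */
         sk_goes_to_head(0, SK_CTRL),   /* branch: 1 (front of queue) */
         sk_goes_to_head(0, 0));        /* plain ALU op: 0 (ordered by seq) */
  return 0;
}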
(BITMAP_SET(use_spec_cv, CV_BMAP_SZ, (N)),\ + spec_create_vector[N] = (L)) \ + : (create_vector[N] = (L))) + +/* initialize the create vector */ +static void +cv_init(void) +{ + int i; + + /* initially all registers are valid in the architected register file, + i.e., the create vector entry is CVLINK_NULL */ + for (i=0; i < MD_TOTAL_REGS; i++) + { + create_vector[i] = CVLINK_NULL; + create_vector_rt[i] = 0; + spec_create_vector[i] = CVLINK_NULL; + spec_create_vector_rt[i] = 0; + } + + /* all create vector entries are non-speculative */ + BITMAP_CLEAR_MAP(use_spec_cv, CV_BMAP_SZ); +} + +/* dump the contents of the create vector */ +static void +cv_dump(FILE *stream) /* output stream */ +{ + int i; + struct CV_link ent; + + if (!stream) + stream = stderr; + + fprintf(stream, "** create vector state **\n"); + + for (i=0; i < MD_TOTAL_REGS; i++) + { + ent = CREATE_VECTOR(i); + if (!ent.rs) + fprintf(stream, "[cv%02d]: from architected reg file\n", i); + else + fprintf(stream, "[cv%02d]: from %s, idx: %d\n", + i, (ent.rs->in_LSQ ? "LSQ" : "RUU"), + (int)(ent.rs - (ent.rs->in_LSQ ? LSQ : RUU))); + } +} + + +/* + * RUU_COMMIT() - instruction retirement pipeline stage + */ + +/* this function commits the results of the oldest completed entries from the + RUU and LSQ to the architected reg file, stores in the LSQ will commit + their store data to the data cache at this point as well */ +static void +ruu_commit(void) +{ + int i, lat, events, committed = 0; + static counter_t sim_ret_insn = 0; + + /* all values must be retired to the architected reg file in program order */ + while (RUU_num > 0 && committed < ruu_commit_width) + { + struct RUU_station *rs = &(RUU[RUU_head]); + + if (!rs->completed) + { + /* at least RUU entry must be complete */ + break; + } + + /* default commit events */ + events = 0; + + /* load/stores must retire load/store queue entry as well */ + if (RUU[RUU_head].ea_comp) + { + /* load/store, retire head of LSQ as well */ + if (LSQ_num <= 0 || !LSQ[LSQ_head].in_LSQ) + panic("RUU out of sync with LSQ"); + + /* load/store operation must be complete */ + if (!LSQ[LSQ_head].completed) + { + /* load/store operation is not yet complete */ + break; + } + + if ((MD_OP_FLAGS(LSQ[LSQ_head].op) & (F_MEM|F_STORE)) + == (F_MEM|F_STORE)) + { + struct res_template *fu; + + + /* stores must retire their store value to the cache at commit, + try to get a store port (functional unit allocation) */ + fu = res_get(fu_pool, MD_OP_FUCLASS(LSQ[LSQ_head].op)); + if (fu) + { + /* reserve the functional unit */ + if (fu->master->busy) + panic("functional unit already in use"); + + /* schedule functional unit release event */ + fu->master->busy = fu->issuelat; + + /* go to the data cache */ + if (cache_dl1) + { + /* commit store value to D-cache */ + lat = + cache_access(cache_dl1, Write, (LSQ[LSQ_head].addr&~3), + NULL, 4, sim_cycle, NULL, NULL); + if (lat > cache_dl1_lat) + events |= PEV_CACHEMISS; + } + + /* all loads and stores must to access D-TLB */ + if (dtlb) + { + /* access the D-TLB */ + lat = + cache_access(dtlb, Read, (LSQ[LSQ_head].addr & ~3), + NULL, 4, sim_cycle, NULL, NULL); + if (lat > 1) + events |= PEV_TLBMISS; + } + } + else + { + /* no store ports left, cannot continue to commit insts */ + break; + } + } + + /* invalidate load/store operation instance */ + LSQ[LSQ_head].tag++; + sim_slip += (sim_cycle - LSQ[LSQ_head].slip); + + /* indicate to pipeline trace that this instruction retired */ + ptrace_newstage(LSQ[LSQ_head].ptrace_seq, PST_COMMIT, events); + 
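/*
 * Sketch of the in-order retirement rule used by ruu_commit() above: walk
 * from the head of the circular RUU and stop at the first entry that has not
 * completed, so no younger instruction retires past an older one.  Toy
 * completion flags only; no LSQ, store-port, or predictor-update handling.
 */
#include <stdio.h>

#define SK_RUU_SIZE 8

int main(void)
{
  int completed[SK_RUU_SIZE] = { 1, 1, 0, 1 };  /* entry 2 is still executing */
  int head = 0, num = 4, width = 4, done = 0;

  while (num > 0 && done < width && completed[head])
    {
      printf("commit entry %d\n", head);
      head = (head + 1) % SK_RUU_SIZE;
      num--; done++;
    }
  /* commits entries 0 and 1, then stalls on the incomplete entry 2 */
  return 0;
}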
ptrace_endinst(LSQ[LSQ_head].ptrace_seq); + + /* commit head of LSQ as well */ + LSQ_head = (LSQ_head + 1) % LSQ_size; + LSQ_num--; + } + + if (pred + && bpred_spec_update == spec_CT + && (MD_OP_FLAGS(rs->op) & F_CTRL)) + { + bpred_update(pred, + /* branch address */rs->PC, + /* actual target address */rs->next_PC, + /* taken? */rs->next_PC != (rs->PC + + sizeof(md_inst_t)), + /* pred taken? */rs->pred_PC != (rs->PC + + sizeof(md_inst_t)), + /* correct pred? */rs->pred_PC == rs->next_PC, + /* opcode */rs->op, + /* dir predictor update pointer */&rs->dir_update); + } + + /* invalidate RUU operation instance */ + RUU[RUU_head].tag++; + sim_slip += (sim_cycle - RUU[RUU_head].slip); + /* print retirement trace if in verbose mode */ + if (verbose) + { + sim_ret_insn++; + myfprintf(stderr, "%10n @ 0x%08p: ", sim_ret_insn, RUU[RUU_head].PC); + md_print_insn(RUU[RUU_head].IR, RUU[RUU_head].PC, stderr); + if (MD_OP_FLAGS(RUU[RUU_head].op) & F_MEM) + myfprintf(stderr, " mem: 0x%08p", RUU[RUU_head].addr); + fprintf(stderr, "\n"); + /* fflush(stderr); */ + } + + /* indicate to pipeline trace that this instruction retired */ + ptrace_newstage(RUU[RUU_head].ptrace_seq, PST_COMMIT, events); + ptrace_endinst(RUU[RUU_head].ptrace_seq); + + /* commit head entry of RUU */ + RUU_head = (RUU_head + 1) % RUU_size; + RUU_num--; + + /* one more instruction committed to architected state */ + committed++; + + for (i=0; iodep_list[i]) + panic ("retired instruction has odeps\n"); + } + } +} + + +/* + * RUU_RECOVER() - squash mispredicted microarchitecture state + */ + +/* recover processor microarchitecture state back to point of the + mis-predicted branch at RUU[BRANCH_INDEX] */ +static void +ruu_recover(int branch_index) /* index of mis-pred branch */ +{ + int i, RUU_index = RUU_tail, LSQ_index = LSQ_tail; + int RUU_prev_tail = RUU_tail, LSQ_prev_tail = LSQ_tail; + + /* recover from the tail of the RUU towards the head until the branch index + is reached, this direction ensures that the LSQ can be synchronized with + the RUU */ + + /* go to first element to squash */ + RUU_index = (RUU_index + (RUU_size-1)) % RUU_size; + LSQ_index = (LSQ_index + (LSQ_size-1)) % LSQ_size; + + /* traverse to older insts until the mispredicted branch is encountered */ + while (RUU_index != branch_index) + { + /* the RUU should not drain since the mispredicted branch will remain */ + if (!RUU_num) + panic("empty RUU"); + + /* should meet up with the tail first */ + if (RUU_index == RUU_head) + panic("RUU head and tail broken"); + + /* is this operation an effective addr calc for a load or store? */ + if (RUU[RUU_index].ea_comp) + { + /* should be at least one load or store in the LSQ */ + if (!LSQ_num) + panic("RUU and LSQ out of sync"); + + /* recover any resources consumed by the load or store operation */ + for (i=0; iqueued || !rs->issued || rs->completed) + panic("inst completed and !ready, !issued, or completed"); + + /* operation has completed */ + rs->completed = TRUE; + + /* does this operation reveal a mis-predicted branch? 
*/ + if (rs->recover_inst) + { + if (rs->in_LSQ) + panic("mis-predicted load or store?!?!?"); + + /* recover processor state and reinit fetch to correct path */ + ruu_recover(rs - RUU); + tracer_recover(); + bpred_recover(pred, rs->PC, rs->stack_recover_idx); + + /* stall fetch until I-fetch and I-decode recover */ + ruu_fetch_issue_delay = ruu_branch_penalty; + + /* continue writeback of the branch/control instruction */ + } + + /* if we speculatively update branch-predictor, do it here */ + if (pred + && bpred_spec_update == spec_WB + && !rs->in_LSQ + && (MD_OP_FLAGS(rs->op) & F_CTRL)) + { + bpred_update(pred, + /* branch address */rs->PC, + /* actual target address */rs->next_PC, + /* taken? */rs->next_PC != (rs->PC + + sizeof(md_inst_t)), + /* pred taken? */rs->pred_PC != (rs->PC + + sizeof(md_inst_t)), + /* correct pred? */rs->pred_PC == rs->next_PC, + /* opcode */rs->op, + /* dir predictor update pointer */&rs->dir_update); + } + + /* entered writeback stage, indicate in pipe trace */ + ptrace_newstage(rs->ptrace_seq, PST_WRITEBACK, + rs->recover_inst ? PEV_MPDETECT : 0); + + /* broadcast results to consuming operations, this is more efficiently + accomplished by walking the output dependency chains of the + completed instruction */ + for (i=0; ionames[i] != NA) + { + struct CV_link link; + struct RS_link *olink, *olink_next; + + if (rs->spec_mode) + { + /* update the speculative create vector, future operations + get value from later creator or architected reg file */ + link = spec_create_vector[rs->onames[i]]; + if (/* !NULL */link.rs + && /* refs RS */(link.rs == rs && link.odep_num == i)) + { + /* the result can now be read from a physical register, + indicate this as so */ + spec_create_vector[rs->onames[i]] = CVLINK_NULL; + spec_create_vector_rt[rs->onames[i]] = sim_cycle; + } + /* else, creator invalidated or there is another creator */ + } + else + { + /* update the non-speculative create vector, future + operations get value from later creator or architected + reg file */ + link = create_vector[rs->onames[i]]; + if (/* !NULL */link.rs + && /* refs RS */(link.rs == rs && link.odep_num == i)) + { + /* the result can now be read from a physical register, + indicate this as so */ + create_vector[rs->onames[i]] = CVLINK_NULL; + create_vector_rt[rs->onames[i]] = sim_cycle; + } + /* else, creator invalidated or there is another creator */ + } + + /* walk output list, queue up ready operations */ + for (olink=rs->odep_list[i]; olink; olink=olink_next) + { + if (RSLINK_VALID(olink)) + { + if (olink->rs->idep_ready[olink->x.opnum]) + panic("output dependence already satisfied"); + + /* input is now ready */ + olink->rs->idep_ready[olink->x.opnum] = TRUE; + + /* are all the register operands of target ready? */ + if (OPERANDS_READY(olink->rs)) + { + /* yes! 
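/*
 * Sketch of the wakeup step in ruu_writeback() above: each consumer on a
 * completing instruction's output-dependence chain has one input operand
 * marked ready, and becomes ready to issue once all of its inputs are ready.
 * Toy structures only, not the RS_link chains or create vector.
 */
#include <stdio.h>

#define SK_NIDEPS 3

struct sk_inst { int idep_ready[SK_NIDEPS]; };

static int sk_operands_ready(struct sk_inst *in)
{
  return in->idep_ready[0] && in->idep_ready[1] && in->idep_ready[2];
}

int main(void)
{
  struct sk_inst consumer = { { 1, 0, 1 } };    /* still waiting on operand 1 */

  /* producer completes and satisfies input operand #1 */
  consumer.idep_ready[1] = 1;

  if (sk_operands_ready(&consumer))
    printf("consumer is ready to issue\n");
  return 0;
}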
enqueue instruction as ready, NOTE: stores + complete at dispatch, so no need to enqueue + them */ + if (!olink->rs->in_LSQ + || ((MD_OP_FLAGS(olink->rs->op)&(F_MEM|F_STORE)) + == (F_MEM|F_STORE))) + readyq_enqueue(olink->rs); + /* else, ld op, issued when no mem conflict */ + } + } + + /* grab link to next element prior to free */ + olink_next = olink->next; + + /* free dependence link element */ + RSLINK_FREE(olink); + } + /* blow away the consuming op list */ + rs->odep_list[i] = NULL; + + } /* if not NA output */ + + } /* for all outputs */ + + } /* for all writeback events */ + +} + + +/* + * LSQ_REFRESH() - memory access dependence checker/scheduler + */ + +/* this function locates ready instructions whose memory dependencies have + been satisfied, this is accomplished by walking the LSQ for loads, looking + for blocking memory dependency condition (e.g., earlier store with an + unknown address) */ +#define MAX_STD_UNKNOWNS 64 +static void +lsq_refresh(void) +{ + int i, j, index, n_std_unknowns; + md_addr_t std_unknowns[MAX_STD_UNKNOWNS]; + + /* scan entire queue for ready loads: scan from oldest instruction + (head) until we reach the tail or an unresolved store, after which no + other instruction will become ready */ + for (i=0, index=LSQ_head, n_std_unknowns=0; + i < LSQ_num; + i++, index=(index + 1) % LSQ_size) + { + /* terminate search for ready loads after first unresolved store, + as no later load could be resolved in its presence */ + if (/* store? */ + (MD_OP_FLAGS(LSQ[index].op) & (F_MEM|F_STORE)) == (F_MEM|F_STORE)) + { + if (!STORE_ADDR_READY(&LSQ[index])) + { + /* FIXME: a later STD + STD known could hide the STA unknown */ + /* sta unknown, blocks all later loads, stop search */ + break; + } + else if (!OPERANDS_READY(&LSQ[index])) + { + /* sta known, but std unknown, may block a later store, record + this address for later referral, we use an array here because + for most simulations the number of entries to search will be + very small */ + if (n_std_unknowns == MAX_STD_UNKNOWNS) + fatal("STD unknown array overflow, increase MAX_STD_UNKNOWNS"); + std_unknowns[n_std_unknowns++] = LSQ[index].addr; + } + else /* STORE_ADDR_READY() && OPERANDS_READY() */ + { + /* a later STD known hides an earlier STD unknown */ + for (j=0; jnext; + + /* still valid? */ + if (RSLINK_VALID(node)) + { + struct RUU_station *rs = RSLINK_RS(node); + + /* issue operation, both reg and mem deps have been satisfied */ + if (!OPERANDS_READY(rs) || !rs->queued + || rs->issued || rs->completed) + panic("issued inst !ready, issued, or completed"); + + /* node is now un-queued */ + rs->queued = FALSE; + + if (rs->in_LSQ + && ((MD_OP_FLAGS(rs->op) & (F_MEM|F_STORE)) == (F_MEM|F_STORE))) + { + /* stores complete in effectively zero time, result is + written into the load/store queue, the actual store into + the memory system occurs when the instruction is retired + (see ruu_commit()) */ + rs->issued = TRUE; + rs->completed = TRUE; + if (rs->onames[0] || rs->onames[1]) + panic("store creates result"); + + if (rs->recover_inst) + panic("mis-predicted store"); + + /* entered execute stage, indicate in pipe trace */ + ptrace_newstage(rs->ptrace_seq, PST_WRITEBACK, 0); + + /* one more inst issued */ + n_issued++; + } + else + { + /* issue the instruction to a functional unit */ + if (MD_OP_FUCLASS(rs->op) != NA) + { + fu = res_get(fu_pool, MD_OP_FUCLASS(rs->op)); + if (fu) + { + /* got one! 
issue inst to functional unit */ + rs->issued = TRUE; + /* reserve the functional unit */ + if (fu->master->busy) + panic("functional unit already in use"); + + /* schedule functional unit release event */ + fu->master->busy = fu->issuelat; + + /* schedule a result writeback event */ + if (rs->in_LSQ + && ((MD_OP_FLAGS(rs->op) & (F_MEM|F_LOAD)) + == (F_MEM|F_LOAD))) + { + int events = 0; + + /* for loads, determine cache access latency: + first scan LSQ to see if a store forward is + possible, if not, access the data cache */ + load_lat = 0; + i = (rs - LSQ); + if (i != LSQ_head) + { + for (;;) + { + /* go to next earlier LSQ entry */ + i = (i + (LSQ_size-1)) % LSQ_size; + + /* FIXME: not dealing with partials! */ + if ((MD_OP_FLAGS(LSQ[i].op) & F_STORE) + && (LSQ[i].addr == rs->addr)) + { + /* hit in the LSQ */ + load_lat = 1; + break; + } + + /* scan finished? */ + if (i == LSQ_head) + break; + } + } + + /* was the value store forwared from the LSQ? */ + if (!load_lat) + { + int valid_addr = MD_VALID_ADDR(rs->addr); + + if (!spec_mode && !valid_addr) + sim_invalid_addrs++; + + /* no! go to the data cache if addr is valid */ + if (cache_dl1 && valid_addr) + { + /* access the cache if non-faulting */ + load_lat = + cache_access(cache_dl1, Read, + (rs->addr & ~3), NULL, 4, + sim_cycle, NULL, NULL); + if (load_lat > cache_dl1_lat) + events |= PEV_CACHEMISS; + } + else + { + /* no caches defined, just use op latency */ + load_lat = fu->oplat; + } + } + + /* all loads and stores must to access D-TLB */ + if (dtlb && MD_VALID_ADDR(rs->addr)) + { + /* access the D-DLB, NOTE: this code will + initiate speculative TLB misses */ + tlb_lat = + cache_access(dtlb, Read, (rs->addr & ~3), + NULL, 4, sim_cycle, NULL, NULL); + if (tlb_lat > 1) + events |= PEV_TLBMISS; + + /* D-cache/D-TLB accesses occur in parallel */ + load_lat = MAX(tlb_lat, load_lat); + } + + /* use computed cache access latency */ + eventq_queue_event(rs, sim_cycle + load_lat); + + /* entered execute stage, indicate in pipe trace */ + ptrace_newstage(rs->ptrace_seq, PST_EXECUTE, + ((rs->ea_comp ? PEV_AGEN : 0) + | events)); + } + else /* !load && !store */ + { + /* use deterministic functional unit latency */ + eventq_queue_event(rs, sim_cycle + fu->oplat); + + /* entered execute stage, indicate in pipe trace */ + ptrace_newstage(rs->ptrace_seq, PST_EXECUTE, + rs->ea_comp ? PEV_AGEN : 0); + } + + /* one more inst issued */ + n_issued++; + } + else /* no functional unit */ + { + /* insufficient functional unit resources, put operation + back onto the ready list, we'll try to issue it + again next cycle */ + readyq_enqueue(rs); + } + } + else /* does not require a functional unit! */ + { + /* FIXME: need better solution for these */ + /* the instruction does not need a functional unit */ + rs->issued = TRUE; + + /* schedule a result event */ + eventq_queue_event(rs, sim_cycle + 1); + + /* entered execute stage, indicate in pipe trace */ + ptrace_newstage(rs->ptrace_seq, PST_EXECUTE, + rs->ea_comp ? 
PEV_AGEN : 0); + + /* one more inst issued */ + n_issued++; + } + } /* !store */ + + } + /* else, RUU entry was squashed */ + + /* reclaim ready list entry, NOTE: this is done whether or not the + instruction issued, since the instruction was once again reinserted + into the ready queue if it did not issue, this ensures that the ready + queue is always properly sorted */ + RSLINK_FREE(node); + } + + /* put any instruction not issued back into the ready queue, go through + normal channels to ensure instruction stay ordered correctly */ + for (; node; node = next_node) + { + next_node = node->next; + + /* still valid? */ + if (RSLINK_VALID(node)) + { + struct RUU_station *rs = RSLINK_RS(node); + + /* node is now un-queued */ + rs->queued = FALSE; + + /* not issued, put operation back onto the ready list, we'll try to + issue it again next cycle */ + readyq_enqueue(rs); + } + /* else, RUU entry was squashed */ + + /* reclaim ready list entry, NOTE: this is done whether or not the + instruction issued, since the instruction was once again reinserted + into the ready queue if it did not issue, this ensures that the ready + queue is always properly sorted */ + RSLINK_FREE(node); + } +} + + +/* + * routines for generating on-the-fly instruction traces with support + * for control and data misspeculation modeling + */ + +/* integer register file */ +#define R_BMAP_SZ (BITMAP_SIZE(MD_NUM_IREGS)) +static BITMAP_TYPE(MD_NUM_IREGS, use_spec_R); +static md_gpr_t spec_regs_R; + +/* floating point register file */ +#define F_BMAP_SZ (BITMAP_SIZE(MD_NUM_FREGS)) +static BITMAP_TYPE(MD_NUM_FREGS, use_spec_F); +static md_fpr_t spec_regs_F; + +/* miscellaneous registers */ +#define C_BMAP_SZ (BITMAP_SIZE(MD_NUM_CREGS)) +static BITMAP_TYPE(MD_NUM_FREGS, use_spec_C); +static md_ctrl_t spec_regs_C; + +/* dump speculative register state */ +static void +rspec_dump(FILE *stream) /* output stream */ +{ + int i; + + if (!stream) + stream = stderr; + + fprintf(stream, "** speculative register contents **\n"); + + fprintf(stream, "spec_mode: %s\n", spec_mode ? 
"t" : "f"); + + /* dump speculative integer regs */ + for (i=0; i < MD_NUM_IREGS; i++) + { + if (BITMAP_SET_P(use_spec_R, R_BMAP_SZ, i)) + { + md_print_ireg(spec_regs_R, i, stream); + fprintf(stream, "\n"); + } + } + + /* dump speculative FP regs */ + for (i=0; i < MD_NUM_FREGS; i++) + { + if (BITMAP_SET_P(use_spec_F, F_BMAP_SZ, i)) + { + md_print_fpreg(spec_regs_F, i, stream); + fprintf(stream, "\n"); + } + } + + /* dump speculative CTRL regs */ + for (i=0; i < MD_NUM_CREGS; i++) + { + if (BITMAP_SET_P(use_spec_C, C_BMAP_SZ, i)) + { + md_print_creg(spec_regs_C, i, stream); + fprintf(stream, "\n"); + } + } +} + + +/* speculative memory hash table size, NOTE: this must be a power-of-two */ +#define STORE_HASH_SIZE 32 + +/* speculative memory hash table definition, accesses go through this hash + table when accessing memory in speculative mode, the hash table flush the + table when recovering from mispredicted branches */ +struct spec_mem_ent { + struct spec_mem_ent *next; /* ptr to next hash table bucket */ + md_addr_t addr; /* virtual address of spec state */ + unsigned int data[2]; /* spec buffer, up to 8 bytes */ +}; + +/* speculative memory hash table */ +static struct spec_mem_ent *store_htable[STORE_HASH_SIZE]; + +/* speculative memory hash table bucket free list */ +static struct spec_mem_ent *bucket_free_list = NULL; + + +/* program counter */ +static md_addr_t pred_PC; +static md_addr_t recover_PC; + +/* fetch unit next fetch address */ +static md_addr_t fetch_regs_PC; +static md_addr_t fetch_pred_PC; + +/* IFETCH -> DISPATCH instruction queue definition */ +struct fetch_rec { + md_inst_t IR; /* inst register */ + md_addr_t regs_PC, pred_PC; /* current PC, predicted next PC */ + struct bpred_update_t dir_update; /* bpred direction update info */ + int stack_recover_idx; /* branch predictor RSB index */ + unsigned int ptrace_seq; /* print trace sequence id */ +}; +static struct fetch_rec *fetch_data; /* IFETCH -> DISPATCH inst queue */ +static int fetch_num; /* num entries in IF -> DIS queue */ +static int fetch_tail, fetch_head; /* head and tail pointers of queue */ + +/* recover instruction trace generator state to precise state state immediately + before the first mis-predicted branch; this is accomplished by resetting + all register value copied-on-write bitmasks are reset, and the speculative + memory hash table is cleared */ +static void +tracer_recover(void) +{ + int i; + struct spec_mem_ent *ent, *ent_next; + + /* better be in mis-speculative trace generation mode */ + if (!spec_mode) + panic("cannot recover unless in speculative mode"); + + /* reset to non-speculative trace generation mode */ + spec_mode = FALSE; + + /* reset copied-on-write register bitmasks back to non-speculative state */ + BITMAP_CLEAR_MAP(use_spec_R, R_BMAP_SZ); + BITMAP_CLEAR_MAP(use_spec_F, F_BMAP_SZ); + BITMAP_CLEAR_MAP(use_spec_C, C_BMAP_SZ); + + /* reset memory state back to non-speculative state */ + /* FIXME: could version stamps be used here?!?!? 
*/
+  for (i=0; i<STORE_HASH_SIZE; i++)
+    {
+      /* release all hash table buckets back onto the free list */
+      for (ent=store_htable[i]; ent; ent=ent_next)
+        {
+          ent_next = ent->next;
+          ent->next = bucket_free_list;
+          bucket_free_list = ent;
+        }
+      store_htable[i] = NULL;
+    }
+
+  /* if pipetracing, indicate squash of instructions in the inst fetch queue */
+  if (ptrace_active)
+    {
+      while (fetch_num != 0)
+        {
+          /* squash the next instruction from the IFETCH -> DISPATCH queue */
+          ptrace_endinst(fetch_data[fetch_head].ptrace_seq);
+
+          /* consume instruction from IFETCH -> DISPATCH queue */
+          fetch_head = (fetch_head+1) & (ruu_ifq_size - 1);
+          fetch_num--;
+        }
+    }
+
+  /* reset IFETCH state */
+  fetch_num = 0;
+  fetch_tail = fetch_head = 0;
+  fetch_pred_PC = fetch_regs_PC = recover_PC;
+}
+
+/* initialize the speculative instruction state generator state */
+static void
+tracer_init(void)
+{
+  int i;
+
+  /* initially in non-speculative mode */
+  spec_mode = FALSE;
+
+  /* register state is from non-speculative state buffers */
+  BITMAP_CLEAR_MAP(use_spec_R, R_BMAP_SZ);
+  BITMAP_CLEAR_MAP(use_spec_F, F_BMAP_SZ);
+  BITMAP_CLEAR_MAP(use_spec_C, C_BMAP_SZ);
+
+  /* memory state is from non-speculative memory pages */
+  for (i=0; i<STORE_HASH_SIZE; i++)
+    store_htable[i] = NULL;
+}
+
+/* speculative memory hash table address hash function */
+#define HASH_ADDR(ADDR) \
+  ((((ADDR) >> 24)^((ADDR) >> 16)^((ADDR) >> 8)^(ADDR)) & (STORE_HASH_SIZE-1))
+
+/* this function provides a layer of mis-speculated state over the
+   non-speculative memory state, when in mis-speculation trace generation mode,
+   the simulator will call this function to access memory, instead of the
+   non-speculative memory access interfaces defined in memory.h; when storage
+   is written, an entry is allocated in the speculative memory hash table,
+   future reads and writes while in mis-speculative trace generation mode will
+   access this buffer instead of non-speculative memory state; when the trace
+   generator transitions back to non-speculative trace generation mode,
+   tracer_recover() clears this table, returns any access fault */
+static enum md_fault_type
+spec_mem_access(struct mem_t *mem,              /* memory space to access */
+                enum mem_cmd cmd,               /* Read or Write access cmd */
+                md_addr_t addr,                 /* virtual address of access */
+                void *p,                        /* input/output buffer */
+                int nbytes)                     /* number of bytes to access */
+{
+  int i, index;
+  struct spec_mem_ent *ent, *prev;
+
+  /* FIXME: partially overlapping writes are not combined... */
+  /* FIXME: partially overlapping reads are not handled correctly... */
+
+  /* check alignments, even speculative this test should always pass */
+  if ((nbytes & (nbytes-1)) != 0 || (addr & (nbytes-1)) != 0)
+    {
+      /* no can do, return zero result */
+      for (i=0; i < nbytes; i++)
+        ((char *)p)[i] = 0;
+
+      return md_fault_none;
+    }
+
+  /* check permissions */
+  if (!((addr >= ld_text_base && addr < (ld_text_base+ld_text_size)
+         && cmd == Read)
+        || MD_VALID_ADDR(addr)))
+    {
+      /* no can do, return zero result */
+      for (i=0; i < nbytes; i++)
+        ((char *)p)[i] = 0;
+
+      return md_fault_none;
+    }
+
+  /* has this memory state been copied on mis-speculative write? 
*/ + index = HASH_ADDR(addr); + for (prev=NULL,ent=store_htable[index]; ent; prev=ent,ent=ent->next) + { + if (ent->addr == addr) + { + /* reorder chains to speed access into hash table */ + if (prev != NULL) + { + /* not at head of list, relink the hash table entry at front */ + prev->next = ent->next; + ent->next = store_htable[index]; + store_htable[index] = ent; + } + break; + } + } + + /* no, if it is a write, allocate a hash table entry to hold the data */ + if (!ent && cmd == Write) + { + /* try to get an entry from the free list, if available */ + if (!bucket_free_list) + { + /* otherwise, call calloc() to get the needed storage */ + bucket_free_list = calloc(1, sizeof(struct spec_mem_ent)); + if (!bucket_free_list) + fatal("out of virtual memory"); + } + ent = bucket_free_list; + bucket_free_list = bucket_free_list->next; + + if (!bugcompat_mode) + { + /* insert into hash table */ + ent->next = store_htable[index]; + store_htable[index] = ent; + ent->addr = addr; + ent->data[0] = 0; ent->data[1] = 0; + } + } + + /* handle the read or write to speculative or non-speculative storage */ + switch (nbytes) + { + case 1: + if (cmd == Read) + { + if (ent) + { + /* read from mis-speculated state buffer */ + *((byte_t *)p) = *((byte_t *)(&ent->data[0])); + } + else + { + /* read from non-speculative memory state, don't allocate + memory pages with speculative loads */ + *((byte_t *)p) = MEM_READ_BYTE(mem, addr); + } + } + else + { + /* always write into mis-speculated state buffer */ + *((byte_t *)(&ent->data[0])) = *((byte_t *)p); + } + break; + case 2: + if (cmd == Read) + { + if (ent) + { + /* read from mis-speculated state buffer */ + *((half_t *)p) = *((half_t *)(&ent->data[0])); + } + else + { + /* read from non-speculative memory state, don't allocate + memory pages with speculative loads */ + *((half_t *)p) = MEM_READ_HALF(mem, addr); + } + } + else + { + /* always write into mis-speculated state buffer */ + *((half_t *)&ent->data[0]) = *((half_t *)p); + } + break; + case 4: + if (cmd == Read) + { + if (ent) + { + /* read from mis-speculated state buffer */ + *((word_t *)p) = *((word_t *)&ent->data[0]); + } + else + { + /* read from non-speculative memory state, don't allocate + memory pages with speculative loads */ + *((word_t *)p) = MEM_READ_WORD(mem, addr); + } + } + else + { + /* always write into mis-speculated state buffer */ + *((word_t *)&ent->data[0]) = *((word_t *)p); + } + break; + case 8: + if (cmd == Read) + { + if (ent) + { + /* read from mis-speculated state buffer */ + *((word_t *)p) = *((word_t *)&ent->data[0]); + *(((word_t *)p)+1) = *((word_t *)&ent->data[1]); + } + else + { + /* read from non-speculative memory state, don't allocate + memory pages with speculative loads */ + *((word_t *)p) = MEM_READ_WORD(mem, addr); + *(((word_t *)p)+1) = + MEM_READ_WORD(mem, addr + sizeof(word_t)); + } + } + else + { + /* always write into mis-speculated state buffer */ + *((word_t *)&ent->data[0]) = *((word_t *)p); + *((word_t *)&ent->data[1]) = *(((word_t *)p)+1); + } + break; + default: + panic("access size not supported in mis-speculative mode"); + } + + return md_fault_none; +} + +/* dump speculative memory state */ +static void +mspec_dump(FILE *stream) /* output stream */ +{ + int i; + struct spec_mem_ent *ent; + + if (!stream) + stream = stderr; + + fprintf(stream, "** speculative memory contents **\n"); + + fprintf(stream, "spec_mode: %s\n", spec_mode ? 
"t" : "f"); + + for (i=0; inext) + { + myfprintf(stream, "[0x%08p]: %12.0f/0x%08x:%08x\n", + ent->addr, (double)(*((double *)ent->data)), + *((unsigned int *)&ent->data[0]), + *(((unsigned int *)&ent->data[0]) + 1)); + } + } +} + +/* default memory state accessor, used by DLite */ +static char * /* err str, NULL for no err */ +simoo_mem_obj(struct mem_t *mem, /* memory space to access */ + int is_write, /* access type */ + md_addr_t addr, /* address to access */ + char *p, /* input/output buffer */ + int nbytes) /* size of access */ +{ + enum mem_cmd cmd; + + if (!is_write) + cmd = Read; + else + cmd = Write; + +#if 0 + char *errstr; + + errstr = mem_valid(cmd, addr, nbytes, /* declare */FALSE); + if (errstr) + return errstr; +#endif + + /* else, no error, access memory */ + if (spec_mode) + spec_mem_access(mem, cmd, addr, p, nbytes); + else + mem_access(mem, cmd, addr, p, nbytes); + + /* no error */ + return NULL; +} + + +/* + * RUU_DISPATCH() - decode instructions and allocate RUU and LSQ resources + */ + +/* link RS onto the output chain number of whichever operation will next + create the architected register value IDEP_NAME */ +static INLINE void +ruu_link_idep(struct RUU_station *rs, /* rs station to link */ + int idep_num, /* input dependence number */ + int idep_name) /* input register name */ +{ + struct CV_link head; + struct RS_link *link; + + /* any dependence? */ + if (idep_name == NA) + { + /* no input dependence for this input slot, mark operand as ready */ + rs->idep_ready[idep_num] = TRUE; + return; + } + + /* locate creator of operand */ + head = CREATE_VECTOR(idep_name); + + /* any creator? */ + if (!head.rs) + { + /* no active creator, use value available in architected reg file, + indicate the operand is ready for use */ + rs->idep_ready[idep_num] = TRUE; + return; + } + /* else, creator operation will make this value sometime in the future */ + + /* indicate value will be created sometime in the future, i.e., operand + is not yet ready for use */ + rs->idep_ready[idep_num] = FALSE; + + /* link onto creator's output list of dependant operand */ + RSLINK_NEW(link, rs); link->x.opnum = idep_num; + link->next = head.rs->odep_list[head.odep_num]; + head.rs->odep_list[head.odep_num] = link; +} + +/* make RS the creator of architected register ODEP_NAME */ +static INLINE void +ruu_install_odep(struct RUU_station *rs, /* creating RUU station */ + int odep_num, /* output operand number */ + int odep_name) /* output register name */ +{ + struct CV_link cv; + + /* any dependence? 
*/ + if (odep_name == NA) + { + /* no value created */ + rs->onames[odep_num] = NA; + return; + } + /* else, create a RS_NULL terminated output chain in create vector */ + + /* record output name, used to update create vector at completion */ + rs->onames[odep_num] = odep_name; + + /* initialize output chain to empty list */ + rs->odep_list[odep_num] = NULL; + + /* indicate this operation is latest creator of ODEP_NAME */ + CVLINK_INIT(cv, rs, odep_num); + SET_CREATE_VECTOR(odep_name, cv); +} + + +/* + * configure the instruction decode engine + */ + +#define DNA (0) + +#if defined(TARGET_PISA) + +/* general register dependence decoders */ +#define DGPR(N) (N) +#define DGPR_D(N) ((N) &~1) + +/* floating point register dependence decoders */ +#define DFPR_L(N) (((N)+32)&~1) +#define DFPR_F(N) (((N)+32)&~1) +#define DFPR_D(N) (((N)+32)&~1) + +/* miscellaneous register dependence decoders */ +#define DHI (0+32+32) +#define DLO (1+32+32) +#define DFCC (2+32+32) +#define DTMP (3+32+32) + +#elif defined(TARGET_ALPHA) + +/* general register dependence decoders, $r31 maps to DNA (0) */ +#define DGPR(N) (31 - (N)) /* was: (((N) == 31) ? DNA : (N)) */ + +/* floating point register dependence decoders */ +#define DFPR(N) (((N) == 31) ? DNA : ((N)+32)) + +/* miscellaneous register dependence decoders */ +#define DFPCR (0+32+32) +#define DUNIQ (1+32+32) +#define DTMP (2+32+32) + +#else +#error No ISA target defined... +#endif + + +/* + * configure the execution engine + */ + +/* next program counter */ +#define SET_NPC(EXPR) (regs.regs_NPC = (EXPR)) + +/* target program counter */ +#undef SET_TPC +#define SET_TPC(EXPR) (target_PC = (EXPR)) + +/* current program counter */ +#define CPC (regs.regs_PC) +#define SET_CPC(EXPR) (regs.regs_PC = (EXPR)) + +/* general purpose register accessors, NOTE: speculative copy on write storage + provided for fast recovery during wrong path execute (see tracer_recover() + for details on this process */ +#define GPR(N) (BITMAP_SET_P(use_spec_R, R_BMAP_SZ, (N))\ + ? spec_regs_R[N] \ + : regs.regs_R[N]) +#define SET_GPR(N,EXPR) (spec_mode \ + ? ((spec_regs_R[N] = (EXPR)), \ + BITMAP_SET(use_spec_R, R_BMAP_SZ, (N)),\ + spec_regs_R[N]) \ + : (regs.regs_R[N] = (EXPR))) + +#if defined(TARGET_PISA) + +/* floating point register accessors, NOTE: speculative copy on write storage + provided for fast recovery during wrong path execute (see tracer_recover() + for details on this process */ +#define FPR_L(N) (BITMAP_SET_P(use_spec_F, F_BMAP_SZ, ((N)&~1))\ + ? spec_regs_F.l[(N)] \ + : regs.regs_F.l[(N)]) +#define SET_FPR_L(N,EXPR) (spec_mode \ + ? ((spec_regs_F.l[(N)] = (EXPR)), \ + BITMAP_SET(use_spec_F,F_BMAP_SZ,((N)&~1)),\ + spec_regs_F.l[(N)]) \ + : (regs.regs_F.l[(N)] = (EXPR))) +#define FPR_F(N) (BITMAP_SET_P(use_spec_F, F_BMAP_SZ, ((N)&~1))\ + ? spec_regs_F.f[(N)] \ + : regs.regs_F.f[(N)]) +#define SET_FPR_F(N,EXPR) (spec_mode \ + ? ((spec_regs_F.f[(N)] = (EXPR)), \ + BITMAP_SET(use_spec_F,F_BMAP_SZ,((N)&~1)),\ + spec_regs_F.f[(N)]) \ + : (regs.regs_F.f[(N)] = (EXPR))) +#define FPR_D(N) (BITMAP_SET_P(use_spec_F, F_BMAP_SZ, ((N)&~1))\ + ? spec_regs_F.d[(N) >> 1] \ + : regs.regs_F.d[(N) >> 1]) +#define SET_FPR_D(N,EXPR) (spec_mode \ + ? 
((spec_regs_F.d[(N) >> 1] = (EXPR)), \ + BITMAP_SET(use_spec_F,F_BMAP_SZ,((N)&~1)),\ + spec_regs_F.d[(N) >> 1]) \ + : (regs.regs_F.d[(N) >> 1] = (EXPR))) + +/* miscellanous register accessors, NOTE: speculative copy on write storage + provided for fast recovery during wrong path execute (see tracer_recover() + for details on this process */ +#define HI (BITMAP_SET_P(use_spec_C, C_BMAP_SZ, /*hi*/0)\ + ? spec_regs_C.hi \ + : regs.regs_C.hi) +#define SET_HI(EXPR) (spec_mode \ + ? ((spec_regs_C.hi = (EXPR)), \ + BITMAP_SET(use_spec_C, C_BMAP_SZ,/*hi*/0),\ + spec_regs_C.hi) \ + : (regs.regs_C.hi = (EXPR))) +#define LO (BITMAP_SET_P(use_spec_C, C_BMAP_SZ, /*lo*/1)\ + ? spec_regs_C.lo \ + : regs.regs_C.lo) +#define SET_LO(EXPR) (spec_mode \ + ? ((spec_regs_C.lo = (EXPR)), \ + BITMAP_SET(use_spec_C, C_BMAP_SZ,/*lo*/1),\ + spec_regs_C.lo) \ + : (regs.regs_C.lo = (EXPR))) +#define FCC (BITMAP_SET_P(use_spec_C, C_BMAP_SZ,/*fcc*/2)\ + ? spec_regs_C.fcc \ + : regs.regs_C.fcc) +#define SET_FCC(EXPR) (spec_mode \ + ? ((spec_regs_C.fcc = (EXPR)), \ + BITMAP_SET(use_spec_C,C_BMAP_SZ,/*fcc*/2),\ + spec_regs_C.fcc) \ + : (regs.regs_C.fcc = (EXPR))) + +#elif defined(TARGET_ALPHA) + +/* floating point register accessors, NOTE: speculative copy on write storage + provided for fast recovery during wrong path execute (see tracer_recover() + for details on this process */ +#define FPR_Q(N) (BITMAP_SET_P(use_spec_F, F_BMAP_SZ, (N))\ + ? spec_regs_F.q[(N)] \ + : regs.regs_F.q[(N)]) +#define SET_FPR_Q(N,EXPR) (spec_mode \ + ? ((spec_regs_F.q[(N)] = (EXPR)), \ + BITMAP_SET(use_spec_F,F_BMAP_SZ, (N)),\ + spec_regs_F.q[(N)]) \ + : (regs.regs_F.q[(N)] = (EXPR))) +#define FPR(N) (BITMAP_SET_P(use_spec_F, F_BMAP_SZ, (N))\ + ? spec_regs_F.d[(N)] \ + : regs.regs_F.d[(N)]) +#define SET_FPR(N,EXPR) (spec_mode \ + ? ((spec_regs_F.d[(N)] = (EXPR)), \ + BITMAP_SET(use_spec_F,F_BMAP_SZ, (N)),\ + spec_regs_F.d[(N)]) \ + : (regs.regs_F.d[(N)] = (EXPR))) + +/* miscellanous register accessors, NOTE: speculative copy on write storage + provided for fast recovery during wrong path execute (see tracer_recover() + for details on this process */ +#define FPCR (BITMAP_SET_P(use_spec_C, C_BMAP_SZ,/*fpcr*/0)\ + ? spec_regs_C.fpcr \ + : regs.regs_C.fpcr) +#define SET_FPCR(EXPR) (spec_mode \ + ? ((spec_regs_C.fpcr = (EXPR)), \ + BITMAP_SET(use_spec_C,C_BMAP_SZ,/*fpcr*/0),\ + spec_regs_C.fpcr) \ + : (regs.regs_C.fpcr = (EXPR))) +#define UNIQ (BITMAP_SET_P(use_spec_C, C_BMAP_SZ,/*uniq*/1)\ + ? spec_regs_C.uniq \ + : regs.regs_C.uniq) +#define SET_UNIQ(EXPR) (spec_mode \ + ? ((spec_regs_C.uniq = (EXPR)), \ + BITMAP_SET(use_spec_C,C_BMAP_SZ,/*uniq*/1),\ + spec_regs_C.uniq) \ + : (regs.regs_C.uniq = (EXPR))) +#define FCC (BITMAP_SET_P(use_spec_C, C_BMAP_SZ,/*fcc*/2)\ + ? spec_regs_C.fcc \ + : regs.regs_C.fcc) +#define SET_FCC(EXPR) (spec_mode \ + ? ((spec_regs_C.fcc = (EXPR)), \ + BITMAP_SET(use_spec_C,C_BMAP_SZ,/*fcc*/1),\ + spec_regs_C.fcc) \ + : (regs.regs_C.fcc = (EXPR))) + +#else +#error No ISA target defined... +#endif + +/* precise architected memory state accessor macros, NOTE: speculative copy on + write storage provided for fast recovery during wrong path execute (see + tracer_recover() for details on this process */ +#define __READ_SPECMEM(SRC, SRC_V, FAULT) \ + (addr = (SRC), \ + (spec_mode \ + ? 
((FAULT) = spec_mem_access(mem, Read, addr, &SRC_V, sizeof(SRC_V)))\ + : ((FAULT) = mem_access(mem, Read, addr, &SRC_V, sizeof(SRC_V)))), \ + SRC_V) + +#define READ_BYTE(SRC, FAULT) \ + __READ_SPECMEM((SRC), temp_byte, (FAULT)) +#define READ_HALF(SRC, FAULT) \ + MD_SWAPH(__READ_SPECMEM((SRC), temp_half, (FAULT))) +#define READ_WORD(SRC, FAULT) \ + MD_SWAPW(__READ_SPECMEM((SRC), temp_word, (FAULT))) +#ifdef HOST_HAS_QWORD +#define READ_QWORD(SRC, FAULT) \ + MD_SWAPQ(__READ_SPECMEM((SRC), temp_qword, (FAULT))) +#endif /* HOST_HAS_QWORD */ + + +#define __WRITE_SPECMEM(SRC, DST, DST_V, FAULT) \ + (DST_V = (SRC), addr = (DST), \ + (spec_mode \ + ? ((FAULT) = spec_mem_access(mem, Write, addr, &DST_V, sizeof(DST_V)))\ + : ((FAULT) = mem_access(mem, Write, addr, &DST_V, sizeof(DST_V))))) + +#define WRITE_BYTE(SRC, DST, FAULT) \ + __WRITE_SPECMEM((SRC), (DST), temp_byte, (FAULT)) +#define WRITE_HALF(SRC, DST, FAULT) \ + __WRITE_SPECMEM(MD_SWAPH(SRC), (DST), temp_half, (FAULT)) +#define WRITE_WORD(SRC, DST, FAULT) \ + __WRITE_SPECMEM(MD_SWAPW(SRC), (DST), temp_word, (FAULT)) +#ifdef HOST_HAS_QWORD +#define WRITE_QWORD(SRC, DST, FAULT) \ + __WRITE_SPECMEM(MD_SWAPQ(SRC), (DST), temp_qword, (FAULT)) +#endif /* HOST_HAS_QWORD */ + +/* system call handler macro */ +#define SYSCALL(INST) \ + (/* only execute system calls in non-speculative mode */ \ + (spec_mode ? panic("speculative syscall") : (void) 0), \ + sys_syscall(®s, mem_access, mem, INST, TRUE)) + +/* default register state accessor, used by DLite */ +static char * /* err str, NULL for no err */ +simoo_reg_obj(struct regs_t *xregs, /* registers to access */ + int is_write, /* access type */ + enum md_reg_type rt, /* reg bank to probe */ + int reg, /* register number */ + struct eval_value_t *val) /* input, output */ +{ + switch (rt) + { + case rt_gpr: + if (reg < 0 || reg >= MD_NUM_IREGS) + return "register number out of range"; + + if (!is_write) + { + val->type = et_uint; + val->value.as_uint = GPR(reg); + } + else + SET_GPR(reg, eval_as_uint(*val)); + break; + + case rt_lpr: + if (reg < 0 || reg >= MD_NUM_FREGS) + return "register number out of range"; + + /* FIXME: this is not portable... */ + abort(); +#if 0 + if (!is_write) + { + val->type = et_uint; + val->value.as_uint = FPR_L(reg); + } + else + SET_FPR_L(reg, eval_as_uint(*val)); +#endif + break; + + case rt_fpr: + /* FIXME: this is not portable... */ + abort(); +#if 0 + if (!is_write) + val->value.as_float = FPR_F(reg); + else + SET_FPR_F(reg, val->value.as_float); +#endif + break; + + case rt_dpr: + /* FIXME: this is not portable... */ + abort(); +#if 0 + /* 1/2 as many regs in this mode */ + if (reg < 0 || reg >= MD_NUM_REGS/2) + return "register number out of range"; + + if (at == at_read) + val->as_double = FPR_D(reg * 2); + else + SET_FPR_D(reg * 2, val->as_double); +#endif + break; + + /* FIXME: this is not portable... 
*/ +#if 0 + abort(); + case rt_hi: + if (at == at_read) + val->as_word = HI; + else + SET_HI(val->as_word); + break; + case rt_lo: + if (at == at_read) + val->as_word = LO; + else + SET_LO(val->as_word); + break; + case rt_FCC: + if (at == at_read) + val->as_condition = FCC; + else + SET_FCC(val->as_condition); + break; +#endif + + case rt_PC: + if (!is_write) + { + val->type = et_addr; + val->value.as_addr = regs.regs_PC; + } + else + regs.regs_PC = eval_as_addr(*val); + break; + + case rt_NPC: + if (!is_write) + { + val->type = et_addr; + val->value.as_addr = regs.regs_NPC; + } + else + regs.regs_NPC = eval_as_addr(*val); + break; + + default: + panic("bogus register bank"); + } + + /* no error */ + return NULL; +} + +/* the last operation that ruu_dispatch() attempted to dispatch, for + implementing in-order issue */ +static struct RS_link last_op = RSLINK_NULL_DATA; + +/* dispatch instructions from the IFETCH -> DISPATCH queue: instructions are + first decoded, then they allocated RUU (and LSQ for load/stores) resources + and input and output dependence chains are updated accordingly */ +static void +ruu_dispatch(void) +{ + int i; + int n_dispatched; /* total insts dispatched */ + md_inst_t inst; /* actual instruction bits */ + enum md_opcode op; /* decoded opcode enum */ + int out1, out2, in1, in2, in3; /* output/input register names */ + md_addr_t target_PC; /* actual next/target PC address */ + md_addr_t addr; /* effective address, if load/store */ + struct RUU_station *rs; /* RUU station being allocated */ + struct RUU_station *lsq; /* LSQ station for ld/st's */ + struct bpred_update_t *dir_update_ptr;/* branch predictor dir update ptr */ + int stack_recover_idx; /* bpred retstack recovery index */ + unsigned int pseq; /* pipetrace sequence number */ + int is_write; /* store? */ + int made_check; /* used to ensure DLite entry */ + int br_taken, br_pred_taken; /* if br, taken? predicted taken? */ + int fetch_redirected = FALSE; + byte_t temp_byte = 0; /* temp variable for spec mem access */ + half_t temp_half = 0; /* " ditto " */ + word_t temp_word = 0; /* " ditto " */ +#ifdef HOST_HAS_QWORD + qword_t temp_qword = 0; /* " ditto " */ +#endif /* HOST_HAS_QWORD */ + enum md_fault_type fault; + + made_check = FALSE; + n_dispatched = 0; + while (/* instruction decode B/W left? */ + n_dispatched < (ruu_decode_width * fetch_speed) + /* RUU and LSQ not full? */ + && RUU_num < RUU_size && LSQ_num < LSQ_size + /* insts still available from fetch unit? 
*/ + && fetch_num != 0 + /* on an acceptable trace path */ + && (ruu_include_spec || !spec_mode)) + { + /* if issuing in-order, block until last op issues if inorder issue */ + if (ruu_inorder_issue + && (last_op.rs && RSLINK_VALID(&last_op) + && !OPERANDS_READY(last_op.rs))) + { + /* stall until last operation is ready to issue */ + break; + } + + /* get the next instruction from the IFETCH -> DISPATCH queue */ + inst = fetch_data[fetch_head].IR; + regs.regs_PC = fetch_data[fetch_head].regs_PC; + pred_PC = fetch_data[fetch_head].pred_PC; + dir_update_ptr = &(fetch_data[fetch_head].dir_update); + stack_recover_idx = fetch_data[fetch_head].stack_recover_idx; + pseq = fetch_data[fetch_head].ptrace_seq; + + /* decode the inst */ + MD_SET_OPCODE(op, inst); + + /* compute default next PC */ + regs.regs_NPC = regs.regs_PC + sizeof(md_inst_t); + + /* drain RUU for TRAPs and system calls */ + if (MD_OP_FLAGS(op) & F_TRAP) + { + if (RUU_num != 0) + break; + + /* else, syscall is only instruction in the machine, at this + point we should not be in (mis-)speculative mode */ + if (spec_mode) + panic("drained and speculative"); + } + + /* maintain $r0 semantics (in spec and non-spec space) */ + regs.regs_R[MD_REG_ZERO] = 0; spec_regs_R[MD_REG_ZERO] = 0; +#ifdef TARGET_ALPHA + regs.regs_F.d[MD_REG_ZERO] = 0.0; spec_regs_F.d[MD_REG_ZERO] = 0.0; +#endif /* TARGET_ALPHA */ + + if (!spec_mode) + { + /* one more non-speculative instruction executed */ + sim_num_insn++; + } + + /* default effective address (none) and access */ + addr = 0; is_write = FALSE; + + /* set default fault - none */ + fault = md_fault_none; + + /* more decoding and execution */ + switch (op) + { +#define DEFINST(OP,MSK,NAME,OPFORM,RES,CLASS,O1,O2,I1,I2,I3) \ + case OP: \ + /* compute output/input dependencies to out1-2 and in1-3 */ \ + out1 = O1; out2 = O2; \ + in1 = I1; in2 = I2; in3 = I3; \ + /* execute the instruction */ \ + SYMCAT(OP,_IMPL); \ + break; +#define DEFLINK(OP,MSK,NAME,MASK,SHIFT) \ + case OP: \ + /* could speculatively decode a bogus inst, convert to NOP */ \ + op = MD_NOP_OP; \ + /* compute output/input dependencies to out1-2 and in1-3 */ \ + out1 = NA; out2 = NA; \ + in1 = NA; in2 = NA; in3 = NA; \ + /* no EXPR */ \ + break; +#define CONNECT(OP) /* nada... */ + /* the following macro wraps the instruction fault declaration macro + with a test to see if the trace generator is in non-speculative + mode, if so the instruction fault is declared, otherwise, the + error is shunted because instruction faults need to be masked on + the mis-speculated instruction paths */ +#define DECLARE_FAULT(FAULT) \ + { \ + if (!spec_mode) \ + fault = (FAULT); \ + /* else, spec fault, ignore it, always terminate exec... 
*/ \ + break; \ + } +#include "machine.def" + default: + /* can speculatively decode a bogus inst, convert to a NOP */ + op = MD_NOP_OP; + /* compute output/input dependencies to out1-2 and in1-3 */ \ + out1 = NA; out2 = NA; + in1 = NA; in2 = NA; in3 = NA; + /* no EXPR */ + } + /* operation sets next PC */ + + /* print retirement trace if in verbose mode */ + if (!spec_mode && verbose) + { + myfprintf(stderr, "++ %10n [xor: 0x%08x] {%d} @ 0x%08p: ", + sim_num_insn, md_xor_regs(®s), + inst_seq+1, regs.regs_PC); + md_print_insn(inst, regs.regs_PC, stderr); + fprintf(stderr, "\n"); + /* fflush(stderr); */ + } + + if (fault != md_fault_none) + fatal("non-speculative fault (%d) detected @ 0x%08p", + fault, regs.regs_PC); + + /* update memory access stats */ + if (MD_OP_FLAGS(op) & F_MEM) + { + sim_total_refs++; + if (!spec_mode) + sim_num_refs++; + + if (MD_OP_FLAGS(op) & F_STORE) + is_write = TRUE; + else + { + sim_total_loads++; + if (!spec_mode) + sim_num_loads++; + } + } + + br_taken = (regs.regs_NPC != (regs.regs_PC + sizeof(md_inst_t))); + br_pred_taken = (pred_PC != (regs.regs_PC + sizeof(md_inst_t))); + + if ((pred_PC != regs.regs_NPC && pred_perfect) + || ((MD_OP_FLAGS(op) & (F_CTRL|F_DIRJMP)) == (F_CTRL|F_DIRJMP) + && target_PC != pred_PC && br_pred_taken)) + { + /* Either 1) we're simulating perfect prediction and are in a + mis-predict state and need to patch up, or 2) We're not simulating + perfect prediction, we've predicted the branch taken, but our + predicted target doesn't match the computed target (i.e., + mis-fetch). Just update the PC values and do a fetch squash. + This is just like calling fetch_squash() except we pre-anticipate + the updates to the fetch values at the end of this function. If + case #2, also charge a mispredict penalty for redirecting fetch */ + fetch_pred_PC = fetch_regs_PC = regs.regs_NPC; + /* was: if (pred_perfect) */ + if (pred_perfect) + pred_PC = regs.regs_NPC; + + fetch_head = (ruu_ifq_size-1); + fetch_num = 1; + fetch_tail = 0; + + if (!pred_perfect) + ruu_fetch_issue_delay = ruu_branch_penalty; + + fetch_redirected = TRUE; + } + + /* is this a NOP */ + if (op != MD_NOP_OP) + { + /* for load/stores: + idep #0 - store operand (value that is store'ed) + idep #1, #2 - eff addr computation inputs (addr of access) + + resulting RUU/LSQ operation pair: + RUU (effective address computation operation): + idep #0, #1 - eff addr computation inputs (addr of access) + LSQ (memory access operation): + idep #0 - operand input (value that is store'd) + idep #1 - eff addr computation result (from RUU op) + + effective address computation is transfered via the reserved + name DTMP + */ + + /* fill in RUU reservation station */ + rs = &RUU[RUU_tail]; + rs->slip = sim_cycle - 1; + rs->IR = inst; + rs->op = op; + rs->PC = regs.regs_PC; + rs->next_PC = regs.regs_NPC; rs->pred_PC = pred_PC; + rs->in_LSQ = FALSE; + rs->ea_comp = FALSE; + rs->recover_inst = FALSE; + rs->dir_update = *dir_update_ptr; + rs->stack_recover_idx = stack_recover_idx; + rs->spec_mode = spec_mode; + rs->addr = 0; + /* rs->tag is already set */ + rs->seq = ++inst_seq; + rs->queued = rs->issued = rs->completed = FALSE; + rs->ptrace_seq = pseq; + + /* split ld/st's into two operations: eff addr comp + mem access */ + if (MD_OP_FLAGS(op) & F_MEM) + { + /* convert RUU operation from ld/st to an add (eff addr comp) */ + rs->op = MD_AGEN_OP; + rs->ea_comp = TRUE; + + /* fill in LSQ reservation station */ + lsq = &LSQ[LSQ_tail]; + lsq->slip = sim_cycle - 1; + lsq->IR = inst; + lsq->op = op; + lsq->PC = 
regs.regs_PC; + lsq->next_PC = regs.regs_NPC; lsq->pred_PC = pred_PC; + lsq->in_LSQ = TRUE; + lsq->ea_comp = FALSE; + lsq->recover_inst = FALSE; + lsq->dir_update.pdir1 = lsq->dir_update.pdir2 = NULL; + lsq->dir_update.pmeta = NULL; + lsq->stack_recover_idx = 0; + lsq->spec_mode = spec_mode; + lsq->addr = addr; + /* lsq->tag is already set */ + lsq->seq = ++inst_seq; + lsq->queued = lsq->issued = lsq->completed = FALSE; + lsq->ptrace_seq = ptrace_seq++; + + /* pipetrace this uop */ + ptrace_newuop(lsq->ptrace_seq, "internal ld/st", lsq->PC, 0); + ptrace_newstage(lsq->ptrace_seq, PST_DISPATCH, 0); + + /* link eff addr computation onto operand's output chains */ + ruu_link_idep(rs, /* idep_ready[] index */0, NA); + ruu_link_idep(rs, /* idep_ready[] index */1, in2); + ruu_link_idep(rs, /* idep_ready[] index */2, in3); + + /* install output after inputs to prevent self reference */ + ruu_install_odep(rs, /* odep_list[] index */0, DTMP); + ruu_install_odep(rs, /* odep_list[] index */1, NA); + + /* link memory access onto output chain of eff addr operation */ + ruu_link_idep(lsq, + /* idep_ready[] index */STORE_OP_INDEX/* 0 */, + in1); + ruu_link_idep(lsq, + /* idep_ready[] index */STORE_ADDR_INDEX/* 1 */, + DTMP); + ruu_link_idep(lsq, /* idep_ready[] index */2, NA); + + /* install output after inputs to prevent self reference */ + ruu_install_odep(lsq, /* odep_list[] index */0, out1); + ruu_install_odep(lsq, /* odep_list[] index */1, out2); + + /* install operation in the RUU and LSQ */ + n_dispatched++; + RUU_tail = (RUU_tail + 1) % RUU_size; + RUU_num++; + LSQ_tail = (LSQ_tail + 1) % LSQ_size; + LSQ_num++; + + if (OPERANDS_READY(rs)) + { + /* eff addr computation ready, queue it on ready list */ + readyq_enqueue(rs); + } + /* issue may continue when the load/store is issued */ + RSLINK_INIT(last_op, lsq); + + /* issue stores only, loads are issued by lsq_refresh() */ + if (((MD_OP_FLAGS(op) & (F_MEM|F_STORE)) == (F_MEM|F_STORE)) + && OPERANDS_READY(lsq)) + { + /* panic("store immediately ready"); */ + /* put operation on ready list, ruu_issue() issue it later */ + readyq_enqueue(lsq); + } + } + else /* !(MD_OP_FLAGS(op) & F_MEM) */ + { + /* link onto producing operation */ + ruu_link_idep(rs, /* idep_ready[] index */0, in1); + ruu_link_idep(rs, /* idep_ready[] index */1, in2); + ruu_link_idep(rs, /* idep_ready[] index */2, in3); + + /* install output after inputs to prevent self reference */ + ruu_install_odep(rs, /* odep_list[] index */0, out1); + ruu_install_odep(rs, /* odep_list[] index */1, out2); + + /* install operation in the RUU */ + n_dispatched++; + RUU_tail = (RUU_tail + 1) % RUU_size; + RUU_num++; + + /* issue op if all its reg operands are ready (no mem input) */ + if (OPERANDS_READY(rs)) + { + /* put operation on ready list, ruu_issue() issue it later */ + readyq_enqueue(rs); + /* issue may continue */ + last_op = RSLINK_NULL; + } + else + { + /* could not issue this inst, stall issue until we can */ + RSLINK_INIT(last_op, rs); + } + } + } + else + { + /* this is a NOP, no need to update RUU/LSQ state */ + rs = NULL; + } + + /* one more instruction executed, speculative or otherwise */ + sim_total_insn++; + if (MD_OP_FLAGS(op) & F_CTRL) + sim_total_branches++; + + if (!spec_mode) + { +#if 0 /* moved above for EIO trace file support */ + /* one more non-speculative instruction executed */ + sim_num_insn++; +#endif + + /* if this is a branching instruction update BTB, i.e., only + non-speculative state is committed into the BTB */ + if (MD_OP_FLAGS(op) & F_CTRL) + { + 
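+              /* count a non-speculative (correct-path) branch; when the
+                 predictor update policy is spec_ID the branch predictor is
+                 trained here at decode/dispatch time, whereas the spec_WB
+                 and spec_CT policies train it via the bpred_update() calls
+                 in ruu_writeback() and ruu_commit() respectively */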
sim_num_branches++;
+              if (pred && bpred_spec_update == spec_ID)
+                {
+                  bpred_update(pred,
+                               /* branch address */regs.regs_PC,
+                               /* actual target address */regs.regs_NPC,
+                               /* taken? */regs.regs_NPC != (regs.regs_PC +
+                                                             sizeof(md_inst_t)),
+                               /* pred taken? */pred_PC != (regs.regs_PC +
+                                                            sizeof(md_inst_t)),
+                               /* correct pred? */pred_PC == regs.regs_NPC,
+                               /* opcode */op,
+                               /* predictor update ptr */&rs->dir_update);
+                }
+            }
+
+          /* is the trace generator transitioning into mis-speculation mode? */
+          if (pred_PC != regs.regs_NPC && !fetch_redirected)
+            {
+              /* entering mis-speculation mode, indicate this and save PC */
+              spec_mode = TRUE;
+              rs->recover_inst = TRUE;
+              recover_PC = regs.regs_NPC;
+            }
+        }
+
+      /* entered decode/allocate stage, indicate in pipe trace */
+      ptrace_newstage(pseq, PST_DISPATCH,
+                      (pred_PC != regs.regs_NPC) ? PEV_MPOCCURED : 0);
+      if (op == MD_NOP_OP)
+        {
+          /* end of the line */
+          ptrace_endinst(pseq);
+        }
+
+      /* update any stats tracked by PC */
+      for (i=0; i<pcstat_nelt; i++)
+        {
+          counter_t newval;
+          int delta;
+
+          /* check if any tracked stats changed */
+          newval = STATVAL(pcstat_stats[i]);
+          delta = newval - pcstat_lastvals[i];
+          if (delta != 0)
+            {
+              stat_add_samples(pcstat_sdists[i], regs.regs_PC, delta);
+              pcstat_lastvals[i] = newval;
+            }
+        }
+
+      /* consume instruction from IFETCH -> DISPATCH queue */
+      fetch_head = (fetch_head+1) & (ruu_ifq_size - 1);
+      fetch_num--;
+
+      /* check for DLite debugger entry condition */
+      made_check = TRUE;
+      if (dlite_check_break(pred_PC,
+                            is_write ? ACCESS_WRITE : ACCESS_READ,
+                            addr, sim_num_insn, sim_cycle))
+        dlite_main(regs.regs_PC, pred_PC, sim_cycle, &regs, mem);
+    }
+
+  /* need to enter DLite at least once per cycle */
+  if (!made_check)
+    {
+      if (dlite_check_break(/* no next PC */0,
+                            is_write ? ACCESS_WRITE : ACCESS_READ,
+                            addr, sim_num_insn, sim_cycle))
+        dlite_main(regs.regs_PC, /* no next PC */0, sim_cycle, &regs, mem);
+    }
+}
+
+
+/*
+ *  RUU_FETCH() - instruction fetch pipeline stage(s)
+ */
+
+/* initialize the instruction fetch pipeline stage */
+static void
+fetch_init(void)
+{
+  /* allocate the IFETCH -> DISPATCH instruction queue */
+  fetch_data =
+    (struct fetch_rec *)calloc(ruu_ifq_size, sizeof(struct fetch_rec));
+  if (!fetch_data)
+    fatal("out of virtual memory");
+
+  fetch_num = 0;
+  fetch_tail = fetch_head = 0;
+  IFQ_count = 0;
+  IFQ_fcount = 0;
+}
+
+/* dump contents of fetch stage registers and fetch queue */
+void
+fetch_dump(FILE *stream)                        /* output stream */
+{
+  int num, head;
+
+  if (!stream)
+    stream = stderr;
+
+  fprintf(stream, "** fetch stage state **\n");
+
+  fprintf(stream, "spec_mode: %s\n", spec_mode ? 
"t" : "f"); + myfprintf(stream, "pred_PC: 0x%08p, recover_PC: 0x%08p\n", + pred_PC, recover_PC); + myfprintf(stream, "fetch_regs_PC: 0x%08p, fetch_pred_PC: 0x%08p\n", + fetch_regs_PC, fetch_pred_PC); + fprintf(stream, "\n"); + + fprintf(stream, "** fetch queue contents **\n"); + fprintf(stream, "fetch_num: %d\n", fetch_num); + fprintf(stream, "fetch_head: %d, fetch_tail: %d\n", + fetch_head, fetch_tail); + + num = fetch_num; + head = fetch_head; + while (num) + { + fprintf(stream, "idx: %2d: inst: `", head); + md_print_insn(fetch_data[head].IR, fetch_data[head].regs_PC, stream); + fprintf(stream, "'\n"); + myfprintf(stream, " regs_PC: 0x%08p, pred_PC: 0x%08p\n", + fetch_data[head].regs_PC, fetch_data[head].pred_PC); + head = (head + 1) & (ruu_ifq_size - 1); + num--; + } +} + +static int last_inst_missed = FALSE; +static int last_inst_tmissed = FALSE; + +/* fetch up as many instruction as one branch prediction and one cache line + acess will support without overflowing the IFETCH -> DISPATCH QUEUE */ +static void +ruu_fetch(void) +{ + int i, lat, tlb_lat, done = FALSE; + md_inst_t inst; + int stack_recover_idx; + int branch_cnt; + + for (i=0, branch_cnt=0; + /* fetch up to as many instruction as the DISPATCH stage can decode */ + i < (ruu_decode_width * fetch_speed) + /* fetch until IFETCH -> DISPATCH queue fills */ + && fetch_num < ruu_ifq_size + /* and no IFETCH blocking condition encountered */ + && !done; + i++) + { + /* fetch an instruction at the next predicted fetch address */ + fetch_regs_PC = fetch_pred_PC; + + /* is this a bogus text address? (can happen on mis-spec path) */ + if (ld_text_base <= fetch_regs_PC + && fetch_regs_PC < (ld_text_base+ld_text_size) + && !(fetch_regs_PC & (sizeof(md_inst_t)-1))) + { + /* read instruction from memory */ + MD_FETCH_INST(inst, mem, fetch_regs_PC); + + /* address is within program text, read instruction from memory */ + lat = cache_il1_lat; + if (cache_il1) + { + /* access the I-cache */ + lat = + cache_access(cache_il1, Read, IACOMPRESS(fetch_regs_PC), + NULL, ISCOMPRESS(sizeof(md_inst_t)), sim_cycle, + NULL, NULL); + if (lat > cache_il1_lat) + last_inst_missed = TRUE; + } + + if (itlb) + { + /* access the I-TLB, NOTE: this code will initiate + speculative TLB misses */ + tlb_lat = + cache_access(itlb, Read, IACOMPRESS(fetch_regs_PC), + NULL, ISCOMPRESS(sizeof(md_inst_t)), sim_cycle, + NULL, NULL); + if (tlb_lat > 1) + last_inst_tmissed = TRUE; + + /* I-cache/I-TLB accesses occur in parallel */ + lat = MAX(tlb_lat, lat); + } + + /* I-cache/I-TLB miss? assumes I-cache hit >= I-TLB hit */ + if (lat != cache_il1_lat) + { + /* I-cache miss, block fetch until it is resolved */ + ruu_fetch_issue_delay += lat - 1; + break; + } + /* else, I-cache/I-TLB hit */ + } + else + { + /* fetch PC is bogus, send a NOP down the pipeline */ + inst = MD_NOP_INST; + } + + /* have a valid inst, here */ + + /* possibly use the BTB target */ + if (pred) + { + enum md_opcode op; + + /* pre-decode instruction, used for bpred stats recording */ + MD_SET_OPCODE(op, inst); + + /* get the next predicted fetch address; only use branch predictor + result for branches (assumes pre-decode bits); NOTE: returned + value may be 1 if bpred can only predict a direction */ + if (MD_OP_FLAGS(op) & F_CTRL) + fetch_pred_PC = + bpred_lookup(pred, + /* branch address */fetch_regs_PC, + /* target address *//* FIXME: not computed */0, + /* opcode */op, + /* call? */MD_IS_CALL(op), + /* return? 
*/MD_IS_RETURN(op), + /* updt */&(fetch_data[fetch_tail].dir_update), + /* RSB index */&stack_recover_idx); + else + fetch_pred_PC = 0; + + /* valid address returned from branch predictor? */ + if (!fetch_pred_PC) + { + /* no predicted taken target, attempt not taken target */ + fetch_pred_PC = fetch_regs_PC + sizeof(md_inst_t); + } + else + { + /* go with target, NOTE: discontinuous fetch, so terminate */ + branch_cnt++; + if (branch_cnt >= fetch_speed) + done = TRUE; + } + } + else + { + /* no predictor, just default to predict not taken, and + continue fetching instructions linearly */ + fetch_pred_PC = fetch_regs_PC + sizeof(md_inst_t); + } + + /* commit this instruction to the IFETCH -> DISPATCH queue */ + fetch_data[fetch_tail].IR = inst; + fetch_data[fetch_tail].regs_PC = fetch_regs_PC; + fetch_data[fetch_tail].pred_PC = fetch_pred_PC; + fetch_data[fetch_tail].stack_recover_idx = stack_recover_idx; + fetch_data[fetch_tail].ptrace_seq = ptrace_seq++; + + /* for pipe trace */ + ptrace_newinst(fetch_data[fetch_tail].ptrace_seq, + inst, fetch_data[fetch_tail].regs_PC, + 0); + ptrace_newstage(fetch_data[fetch_tail].ptrace_seq, + PST_IFETCH, + ((last_inst_missed ? PEV_CACHEMISS : 0) + | (last_inst_tmissed ? PEV_TLBMISS : 0))); + last_inst_missed = FALSE; + last_inst_tmissed = FALSE; + + /* adjust instruction fetch queue */ + fetch_tail = (fetch_tail + 1) & (ruu_ifq_size - 1); + fetch_num++; + } +} + +/* default machine state accessor, used by DLite */ +static char * /* err str, NULL for no err */ +simoo_mstate_obj(FILE *stream, /* output stream */ + char *cmd, /* optional command string */ + struct regs_t *regs, /* registers to access */ + struct mem_t *mem) /* memory space to access */ +{ + if (!cmd || !strcmp(cmd, "help")) + fprintf(stream, +"mstate commands:\n" +"\n" +" mstate help - show all machine-specific commands (this list)\n" +" mstate stats - dump all statistical variables\n" +" mstate res - dump current functional unit resource states\n" +" mstate ruu - dump contents of the register update unit\n" +" mstate lsq - dump contents of the load/store queue\n" +" mstate eventq - dump contents of event queue\n" +" mstate readyq - dump contents of ready instruction queue\n" +" mstate cv - dump contents of the register create vector\n" +" mstate rspec - dump contents of speculative regs\n" +" mstate mspec - dump contents of speculative memory\n" +" mstate fetch - dump contents of fetch stage registers and fetch queue\n" +"\n" + ); + else if (!strcmp(cmd, "stats")) + { + /* just dump intermediate stats */ + sim_print_stats(stream); + } + else if (!strcmp(cmd, "res")) + { + /* dump resource state */ + res_dump(fu_pool, stream); + } + else if (!strcmp(cmd, "ruu")) + { + /* dump RUU contents */ + ruu_dump(stream); + } + else if (!strcmp(cmd, "lsq")) + { + /* dump LSQ contents */ + lsq_dump(stream); + } + else if (!strcmp(cmd, "eventq")) + { + /* dump event queue contents */ + eventq_dump(stream); + } + else if (!strcmp(cmd, "readyq")) + { + /* dump event queue contents */ + readyq_dump(stream); + } + else if (!strcmp(cmd, "cv")) + { + /* dump event queue contents */ + cv_dump(stream); + } + else if (!strcmp(cmd, "rspec")) + { + /* dump event queue contents */ + rspec_dump(stream); + } + else if (!strcmp(cmd, "mspec")) + { + /* dump event queue contents */ + mspec_dump(stream); + } + else if (!strcmp(cmd, "fetch")) + { + /* dump event queue contents */ + fetch_dump(stream); + } + else + return "unknown mstate command"; + + /* no error */ + return NULL; +} + + +/* start simulation, program 
loaded, processor precise state initialized */ +void +sim_main(void) +{ + /* ignore any floating point exceptions, they may occur on mis-speculated + execution paths */ + signal(SIGFPE, SIG_IGN); + + /* set up program entry state */ + regs.regs_PC = ld_prog_entry; + regs.regs_NPC = regs.regs_PC + sizeof(md_inst_t); + + /* check for DLite debugger entry condition */ + if (dlite_check_break(regs.regs_PC, /* no access */0, /* addr */0, 0, 0)) + dlite_main(regs.regs_PC, regs.regs_PC + sizeof(md_inst_t), + sim_cycle, ®s, mem); + + /* fast forward simulator loop, performs functional simulation for + FASTFWD_COUNT insts, then turns on performance (timing) simulation */ + if (fastfwd_count > 0) + { + int icount; + md_inst_t inst; /* actual instruction bits */ + enum md_opcode op; /* decoded opcode enum */ + md_addr_t target_PC; /* actual next/target PC address */ + md_addr_t addr; /* effective address, if load/store */ + int is_write; /* store? */ + byte_t temp_byte = 0; /* temp variable for spec mem access */ + half_t temp_half = 0; /* " ditto " */ + word_t temp_word = 0; /* " ditto " */ +#ifdef HOST_HAS_QWORD + qword_t temp_qword = 0; /* " ditto " */ +#endif /* HOST_HAS_QWORD */ + enum md_fault_type fault; + + fprintf(stderr, "sim: ** fast forwarding %d insts **\n", fastfwd_count); + + for (icount=0; icount < fastfwd_count; icount++) + { + /* maintain $r0 semantics */ + regs.regs_R[MD_REG_ZERO] = 0; +#ifdef TARGET_ALPHA + regs.regs_F.d[MD_REG_ZERO] = 0.0; +#endif /* TARGET_ALPHA */ + + /* get the next instruction to execute */ + MD_FETCH_INST(inst, mem, regs.regs_PC); + + /* set default reference address */ + addr = 0; is_write = FALSE; + + /* set default fault - none */ + fault = md_fault_none; + + /* decode the instruction */ + MD_SET_OPCODE(op, inst); + + /* execute the instruction */ + switch (op) + { +#define DEFINST(OP,MSK,NAME,OPFORM,RES,FLAGS,O1,O2,I1,I2,I3) \ + case OP: \ + SYMCAT(OP,_IMPL); \ + break; +#define DEFLINK(OP,MSK,NAME,MASK,SHIFT) \ + case OP: \ + panic("attempted to execute a linking opcode"); +#define CONNECT(OP) +#undef DECLARE_FAULT +#define DECLARE_FAULT(FAULT) \ + { fault = (FAULT); break; } +#include "machine.def" + default: + panic("attempted to execute a bogus opcode"); + } + + if (fault != md_fault_none) + fatal("fault (%d) detected @ 0x%08p", fault, regs.regs_PC); + + /* update memory access stats */ + if (MD_OP_FLAGS(op) & F_MEM) + { + if (MD_OP_FLAGS(op) & F_STORE) + is_write = TRUE; + } + + /* check for DLite debugger entry condition */ + if (dlite_check_break(regs.regs_NPC, + is_write ? 
ACCESS_WRITE : ACCESS_READ, + addr, sim_num_insn, sim_num_insn)) + dlite_main(regs.regs_PC, regs.regs_NPC, sim_num_insn, ®s, mem); + + /* go to the next instruction */ + regs.regs_PC = regs.regs_NPC; + regs.regs_NPC += sizeof(md_inst_t); + } + } + + fprintf(stderr, "sim: ** starting performance simulation **\n"); + + /* set up timing simulation entry state */ + fetch_regs_PC = regs.regs_PC - sizeof(md_inst_t); + fetch_pred_PC = regs.regs_PC; + regs.regs_PC = regs.regs_PC - sizeof(md_inst_t); + + /* main simulator loop, NOTE: the pipe stages are traverse in reverse order + to eliminate this/next state synchronization and relaxation problems */ + for (;;) + { + /* RUU/LSQ sanity checks */ + if (RUU_num < LSQ_num) + panic("RUU_num < LSQ_num"); + if (((RUU_head + RUU_num) % RUU_size) != RUU_tail) + panic("RUU_head/RUU_tail wedged"); + if (((LSQ_head + LSQ_num) % LSQ_size) != LSQ_tail) + panic("LSQ_head/LSQ_tail wedged"); + + /* check if pipetracing is still active */ + ptrace_check_active(regs.regs_PC, sim_num_insn, sim_cycle); + + /* indicate new cycle in pipetrace */ + ptrace_newcycle(sim_cycle); + + /* commit entries from RUU/LSQ to architected register file */ + ruu_commit(); + + /* service function unit release events */ + ruu_release_fu(); + + /* ==> may have ready queue entries carried over from previous cycles */ + + /* service result completions, also readies dependent operations */ + /* ==> inserts operations into ready queue --> register deps resolved */ + ruu_writeback(); + + if (!bugcompat_mode) + { + /* try to locate memory operations that are ready to execute */ + /* ==> inserts operations into ready queue --> mem deps resolved */ + lsq_refresh(); + + /* issue operations ready to execute from a previous cycle */ + /* <== drains ready queue <-- ready operations commence execution */ + ruu_issue(); + } + + /* decode and dispatch new operations */ + /* ==> insert ops w/ no deps or all regs ready --> reg deps resolved */ + ruu_dispatch(); + + if (bugcompat_mode) + { + /* try to locate memory operations that are ready to execute */ + /* ==> inserts operations into ready queue --> mem deps resolved */ + lsq_refresh(); + + /* issue operations ready to execute from a previous cycle */ + /* <== drains ready queue <-- ready operations commence execution */ + ruu_issue(); + } + + /* call instruction fetch unit if it is not blocked */ + if (!ruu_fetch_issue_delay) + ruu_fetch(); + else + ruu_fetch_issue_delay--; + + /* update buffer occupancy stats */ + IFQ_count += fetch_num; + IFQ_fcount += ((fetch_num == ruu_ifq_size) ? 1 : 0); + RUU_count += RUU_num; + RUU_fcount += ((RUU_num == RUU_size) ? 1 : 0); + LSQ_count += LSQ_num; + LSQ_fcount += ((LSQ_num == LSQ_size) ? 1 : 0); + + /* go to next cycle */ + sim_cycle++; + + /* finish early? */ + if (max_insts && sim_num_insn >= max_insts) + return; + } +}
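
For reference, a minimal standalone sketch (not part of the patch above) of the circular-queue bookkeeping that sim_main()'s per-cycle sanity checks rely on: entries occupy slots head..(head+num-1) modulo the queue size, so (head + num) % size must always equal tail. The queue size and push/pop counts below are arbitrary illustration values; RUU_size, LSQ_size, and ruu_ifq_size play the Q_SIZE role in the simulator.

    #include <assert.h>
    #include <stdio.h>

    #define Q_SIZE 8   /* illustrative size only */

    int main(void)
    {
      int head = 0, tail = 0, num = 0, i;

      /* push five entries at the tail, then pop two from the head */
      for (i = 0; i < 5; i++) { tail = (tail + 1) % Q_SIZE; num++; }
      for (i = 0; i < 2; i++) { head = (head + 1) % Q_SIZE; num--; }

      /* same invariant as the "RUU_head/RUU_tail wedged" check in sim_main() */
      assert((head + num) % Q_SIZE == tail);
      printf("head=%d tail=%d num=%d\n", head, tail, num);
      return 0;
    }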