Report/sim-outorder.c

/* sim-outorder.c - sample out-of-order issue perf simulator implementation */
/* SimpleScalar(TM) Tool Suite
* Copyright (C) 1994-2003 by Todd M. Austin, Ph.D. and SimpleScalar, LLC.
* All Rights Reserved.
*
* THIS IS A LEGAL DOCUMENT, BY USING SIMPLESCALAR,
* YOU ARE AGREEING TO THESE TERMS AND CONDITIONS.
*
* No portion of this work may be used by any commercial entity, or for any
* commercial purpose, without the prior, written permission of SimpleScalar,
* LLC (info@simplescalar.com). Nonprofit and noncommercial use is permitted
* as described below.
*
* 1. SimpleScalar is provided AS IS, with no warranty of any kind, express
* or implied. The user of the program accepts full responsibility for the
* application of the program and the use of any results.
*
* 2. Nonprofit and noncommercial use is encouraged. SimpleScalar may be
* downloaded, compiled, executed, copied, and modified solely for nonprofit,
* educational, noncommercial research, and noncommercial scholarship
* purposes provided that this notice in its entirety accompanies all copies.
* Copies of the modified software can be delivered to persons who use it
* solely for nonprofit, educational, noncommercial research, and
* noncommercial scholarship purposes provided that this notice in its
* entirety accompanies all copies.
*
* 3. ALL COMMERCIAL USE, AND ALL USE BY FOR PROFIT ENTITIES, IS EXPRESSLY
* PROHIBITED WITHOUT A LICENSE FROM SIMPLESCALAR, LLC (info@simplescalar.com).
*
* 4. No nonprofit user may place any restrictions on the use of this software,
* including as modified by the user, by any other authorized user.
*
* 5. Noncommercial and nonprofit users may distribute copies of SimpleScalar
* in compiled or executable form as set forth in Section 2, provided that
* either: (A) it is accompanied by the corresponding machine-readable source
* code, or (B) it is accompanied by a written offer, with no time limit, to
* give anyone a machine-readable copy of the corresponding source code in
* return for reimbursement of the cost of distribution. This written offer
* must permit verbatim duplication by anyone, or (C) it is distributed by
* someone who received only the executable form, and is accompanied by a
* copy of the written offer of source code.
*
* 6. SimpleScalar was developed by Todd M. Austin, Ph.D. The tool suite is
* currently maintained by SimpleScalar LLC (info@simplescalar.com). US Mail:
* 2395 Timbercrest Court, Ann Arbor, MI 48105.
*
* Copyright (C) 1994-2003 by Todd M. Austin, Ph.D. and SimpleScalar, LLC.
*/
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <assert.h>
#include <signal.h>
#include "host.h"
#include "misc.h"
#include "machine.h"
#include "regs.h"
#include "memory.h"
#include "cache.h"
#include "loader.h"
#include "syscall.h"
#include "bpred.h"
#include "resource.h"
#include "bitmap.h"
#include "options.h"
#include "eval.h"
#include "stats.h"
#include "ptrace.h"
#include "dlite.h"
#include "sim.h"
/*
* This file implements a very detailed out-of-order issue superscalar
* processor with a two-level memory system and speculative execution support.
* This simulator is a performance simulator, tracking the latency of all
* pipeline operations.
*/
/* simulated registers */
static struct regs_t regs;
/* simulated memory */
static struct mem_t *mem = NULL;
/*
* simulator options
*/
/* maximum number of inst's to execute */
static unsigned int max_insts;
/* number of insts skipped before timing starts */
static int fastfwd_count;
/* pipeline trace range and output filename */
static int ptrace_nelt = 0;
static char *ptrace_opts[2];
/* instruction fetch queue size (in insts) */
static int ruu_ifq_size;
/* extra branch mis-prediction latency */
static int ruu_branch_penalty;
/* speed of front-end of machine relative to execution core */
static int fetch_speed;
/* branch predictor type {nottaken|taken|perfect|bimod|2lev} */
static char *pred_type;
/* bimodal predictor config (<table_size>) */
static int bimod_nelt = 1;
static int bimod_config[1] =
{ /* bimod tbl size */2048 };
/* threebit predictor config (<table_size>) */
static int threebit_nelt = 1;
static int threebit_config[1] =
{ /* threebit tbl size */2048 };
/* 2-level predictor config (<l1size> <l2size> <hist_size> <xor>) */
static int twolev_nelt = 4;
static int twolev_config[4] =
{ /* l1size */1, /* l2size */1024, /* hist */8, /* xor */FALSE};
/* combining predictor config (<meta_table_size>) */
static int comb_nelt = 1;
static int comb_config[1] =
{ /* meta_table_size */1024 };
/* return address stack (RAS) size */
static int ras_size = 8;
/* BTB predictor config (<num_sets> <associativity>) */
static int btb_nelt = 2;
static int btb_config[2] =
{ /* nsets */512, /* assoc */4 };
/* instruction decode B/W (insts/cycle) */
static int ruu_decode_width;
/* instruction issue B/W (insts/cycle) */
static int ruu_issue_width;
/* run pipeline with in-order issue */
static int ruu_inorder_issue;
/* issue instructions down wrong execution paths */
static int ruu_include_spec = TRUE;
/* instruction commit B/W (insts/cycle) */
static int ruu_commit_width;
/* register update unit (RUU) size */
static int RUU_size = 8;
/* load/store queue (LSQ) size */
static int LSQ_size = 4;
/* l1 data cache config, i.e., {<config>|none} */
static char *cache_dl1_opt;
/* l1 data cache hit latency (in cycles) */
static int cache_dl1_lat;
/* l2 data cache config, i.e., {<config>|none} */
static char *cache_dl2_opt;
/* l2 data cache hit latency (in cycles) */
static int cache_dl2_lat;
/* l1 instruction cache config, i.e., {<config>|dl1|dl2|none} */
static char *cache_il1_opt;
/* l1 instruction cache hit latency (in cycles) */
static int cache_il1_lat;
/* l2 instruction cache config, i.e., {<config>|dl2|none} */
static char *cache_il2_opt;
/* l2 instruction cache hit latency (in cycles) */
static int cache_il2_lat;
/* flush caches on system calls */
static int flush_on_syscalls;
/* convert 64-bit inst addresses to 32-bit inst equivalents */
static int compress_icache_addrs;
/* memory access latency (<first_chunk> <inter_chunk>) */
static int mem_nelt = 2;
static int mem_lat[2] =
{ /* lat to first chunk */18, /* lat between remaining chunks */2 };
/* memory access bus width (in bytes) */
static int mem_bus_width;
/* instruction TLB config, i.e., {<config>|none} */
static char *itlb_opt;
/* data TLB config, i.e., {<config>|none} */
static char *dtlb_opt;
/* inst/data TLB miss latency (in cycles) */
static int tlb_miss_lat;
/* total number of integer ALU's available */
static int res_ialu;
/* total number of integer multiplier/dividers available */
static int res_imult;
/* total number of memory system ports available (to CPU) */
static int res_memport;
/* total number of floating point ALU's available */
static int res_fpalu;
/* total number of floating point multiplier/dividers available */
static int res_fpmult;
/* text-based stat profiles */
#define MAX_PCSTAT_VARS 8
static int pcstat_nelt = 0;
static char *pcstat_vars[MAX_PCSTAT_VARS];
/* convert 64-bit inst text addresses to 32-bit inst equivalents */
#ifdef TARGET_PISA
#define IACOMPRESS(A) \
(compress_icache_addrs ? ((((A) - ld_text_base) >> 1) + ld_text_base) : (A))
#define ISCOMPRESS(SZ) \
(compress_icache_addrs ? ((SZ) >> 1) : (SZ))
#else /* !TARGET_PISA */
#define IACOMPRESS(A) (A)
#define ISCOMPRESS(SZ) (SZ)
#endif /* TARGET_PISA */
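/* For illustration only (the base address below is hypothetical): with
   -cache:icompress enabled on PISA and ld_text_base = 0x00400000, a fetch to
   A = 0x00400040 maps to IACOMPRESS(A) = ((0x40 >> 1) + 0x00400000)
   = 0x00400020, and ISCOMPRESS(8) = 4, so the 64-bit PISA instruction
   encodings occupy I-cache space as if they were 32 bits wide */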
/* operate in backward-compatible bugs mode (for testing only) */
static int bugcompat_mode;
/*
* functional unit resource configuration
*/
/* resource pool indices, NOTE: update these if you change FU_CONFIG */
#define FU_IALU_INDEX 0
#define FU_IMULT_INDEX 1
#define FU_MEMPORT_INDEX 2
#define FU_FPALU_INDEX 3
#define FU_FPMULT_INDEX 4
/* resource pool definition, NOTE: update FU_*_INDEX defs if you change this */
struct res_desc fu_config[] = {
{
"integer-ALU",
4,
0,
{
{ IntALU, 1, 1 }
}
},
{
"integer-MULT/DIV",
1,
0,
{
{ IntMULT, 3, 1 },
{ IntDIV, 20, 19 }
}
},
{
"memory-port",
2,
0,
{
{ RdPort, 1, 1 },
{ WrPort, 1, 1 }
}
},
{
"FP-adder",
4,
0,
{
{ FloatADD, 2, 1 },
{ FloatCMP, 2, 1 },
{ FloatCVT, 2, 1 }
}
},
{
"FP-MULT/DIV",
1,
0,
{
{ FloatMULT, 4, 1 },
{ FloatDIV, 12, 12 },
{ FloatSQRT, 24, 24 }
}
},
};
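/* A sketch of how to read the entries above: each triple is
   { class, operation latency, issue latency }.  { IntALU, 1, 1 } is fully
   pipelined (1-cycle result, one new op per cycle per unit), while
   { IntDIV, 20, 19 } produces its result after 20 cycles and holds the unit
   for 19 of them, so divides are effectively unpipelined.  { FloatMULT, 4, 1 }
   has a 4-cycle latency but a 1-cycle initiation interval. */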
/*
* simulator stats
*/
/* SLIP variable */
static counter_t sim_slip = 0;
/* total number of instructions executed */
static counter_t sim_total_insn = 0;
/* total number of memory references committed */
static counter_t sim_num_refs = 0;
/* total number of memory references executed */
static counter_t sim_total_refs = 0;
/* total number of loads committed */
static counter_t sim_num_loads = 0;
/* total number of loads executed */
static counter_t sim_total_loads = 0;
/* total number of branches committed */
static counter_t sim_num_branches = 0;
/* total number of branches executed */
static counter_t sim_total_branches = 0;
/* cycle counter */
static tick_t sim_cycle = 0;
/* occupancy counters */
static counter_t IFQ_count; /* cumulative IFQ occupancy */
static counter_t IFQ_fcount; /* cumulative IFQ full count */
static counter_t RUU_count; /* cumulative RUU occupancy */
static counter_t RUU_fcount; /* cumulative RUU full count */
static counter_t LSQ_count; /* cumulative LSQ occupancy */
static counter_t LSQ_fcount; /* cumulative LSQ full count */
/* total non-speculative bogus addresses seen (debug var) */
static counter_t sim_invalid_addrs;
/*
* simulator state variables
*/
/* instruction sequence counter, used to assign unique id's to insts */
static unsigned int inst_seq = 0;
/* pipetrace instruction sequence counter */
static unsigned int ptrace_seq = 0;
/* speculation mode, non-zero when mis-speculating, i.e., executing
instructions down the wrong path, thus state recovery will eventually have
to occur that resets processor register and memory state back to the last
precise state */
static int spec_mode = FALSE;
/* cycles until fetch issue resumes */
static unsigned ruu_fetch_issue_delay = 0;
/* perfect prediction enabled */
static int pred_perfect = FALSE;
/* speculative bpred-update enabled */
static char *bpred_spec_opt;
static enum { spec_ID, spec_WB, spec_CT } bpred_spec_update;
/* level 1 instruction cache, entry level instruction cache */
static struct cache_t *cache_il1;
/* level 2 instruction cache */
static struct cache_t *cache_il2;
/* level 1 data cache, entry level data cache */
static struct cache_t *cache_dl1;
/* level 2 data cache */
static struct cache_t *cache_dl2;
/* instruction TLB */
static struct cache_t *itlb;
/* data TLB */
static struct cache_t *dtlb;
/* branch predictor */
static struct bpred_t *pred;
/* functional unit resource pool */
static struct res_pool *fu_pool = NULL;
/* text-based stat profiles */
static struct stat_stat_t *pcstat_stats[MAX_PCSTAT_VARS];
static counter_t pcstat_lastvals[MAX_PCSTAT_VARS];
static struct stat_stat_t *pcstat_sdists[MAX_PCSTAT_VARS];
/* wedge all stat values into a counter_t */
#define STATVAL(STAT) \
((STAT)->sc == sc_int \
? (counter_t)*((STAT)->variant.for_int.var) \
: ((STAT)->sc == sc_uint \
? (counter_t)*((STAT)->variant.for_uint.var) \
: ((STAT)->sc == sc_counter \
? *((STAT)->variant.for_counter.var) \
: (panic("bad stat class"), 0))))
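/* Example use of STATVAL() (see the -pcstat registration and the periodic
   sampling later in this file): the macro collapses an int, uint, or counter
   stat to a counter_t so deltas can be computed uniformly, e.g.,
     counter_t delta = STATVAL(pcstat_stats[i]) - pcstat_lastvals[i];
*/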
/* memory access latency, assumed to not cross a page boundary */
static unsigned int /* total latency of access */
mem_access_latency(int blk_sz) /* block size accessed */
{
int chunks = (blk_sz + (mem_bus_width - 1)) / mem_bus_width;
assert(chunks > 0);
return (/* first chunk latency */mem_lat[0] +
(/* remainder chunk latency */mem_lat[1] * (chunks - 1)));
}
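/* Worked example with the defaults registered below (-mem:lat 18 2,
   -mem:width 8): a 64-byte block transfers in 64/8 = 8 chunks, so the access
   costs 18 + 2*(8-1) = 32 cycles; a 1-byte access still rounds up to one
   full chunk and costs the 18-cycle first-chunk latency */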
/*
* cache miss handlers
*/
/* l1 data cache block miss handler function */
static unsigned int /* latency of block access */
dl1_access_fn(enum mem_cmd cmd, /* access cmd, Read or Write */
md_addr_t baddr, /* block address to access */
int bsize, /* size of block to access */
struct cache_blk_t *blk, /* ptr to block in upper level */
tick_t now) /* time of access */
{
unsigned int lat;
if (cache_dl2)
{
/* access next level of data cache hierarchy */
lat = cache_access(cache_dl2, cmd, baddr, NULL, bsize,
/* now */now, /* pudata */NULL, /* repl addr */NULL);
if (cmd == Read)
return lat;
else
{
/* FIXME: unlimited write buffers */
return 0;
}
}
else
{
/* access main memory */
if (cmd == Read)
return mem_access_latency(bsize);
else
{
/* FIXME: unlimited write buffers */
return 0;
}
}
}
/* l2 data cache block miss handler function */
static unsigned int /* latency of block access */
dl2_access_fn(enum mem_cmd cmd, /* access cmd, Read or Write */
md_addr_t baddr, /* block address to access */
int bsize, /* size of block to access */
struct cache_blk_t *blk, /* ptr to block in upper level */
tick_t now) /* time of access */
{
/* this is a miss to the lowest level, so access main memory */
if (cmd == Read)
return mem_access_latency(bsize);
else
{
/* FIXME: unlimited write buffers */
return 0;
}
}
/* l1 inst cache block miss handler function */
static unsigned int /* latency of block access */
il1_access_fn(enum mem_cmd cmd, /* access cmd, Read or Write */
md_addr_t baddr, /* block address to access */
int bsize, /* size of block to access */
struct cache_blk_t *blk, /* ptr to block in upper level */
tick_t now) /* time of access */
{
unsigned int lat;
if (cache_il2)
{
/* access next level of inst cache hierarchy */
lat = cache_access(cache_il2, cmd, baddr, NULL, bsize,
/* now */now, /* pudata */NULL, /* repl addr */NULL);
if (cmd == Read)
return lat;
else
panic("writes to instruction memory not supported");
}
else
{
/* access main memory */
if (cmd == Read)
return mem_access_latency(bsize);
else
panic("writes to instruction memory not supported");
}
}
/* l2 inst cache block miss handler function */
static unsigned int /* latency of block access */
il2_access_fn(enum mem_cmd cmd, /* access cmd, Read or Write */
md_addr_t baddr, /* block address to access */
int bsize, /* size of block to access */
struct cache_blk_t *blk, /* ptr to block in upper level */
tick_t now) /* time of access */
{
/* this is a miss to the lowest level, so access main memory */
if (cmd == Read)
return mem_access_latency(bsize);
else
panic("writes to instruction memory not supported");
}
/*
* TLB miss handlers
*/
/* inst TLB miss handler function */
static unsigned int /* latency of block access */
itlb_access_fn(enum mem_cmd cmd, /* access cmd, Read or Write */
md_addr_t baddr, /* block address to access */
int bsize, /* size of block to access */
struct cache_blk_t *blk, /* ptr to block in upper level */
tick_t now) /* time of access */
{
md_addr_t *phy_page_ptr = (md_addr_t *)blk->user_data;
/* no real memory access, however, should have user data space attached */
assert(phy_page_ptr);
/* fake translation, for now... */
*phy_page_ptr = 0;
/* return tlb miss latency */
return tlb_miss_lat;
}
/* data TLB miss handler function */
static unsigned int /* latency of block access */
dtlb_access_fn(enum mem_cmd cmd, /* access cmd, Read or Write */
md_addr_t baddr, /* block address to access */
int bsize, /* size of block to access */
struct cache_blk_t *blk, /* ptr to block in upper level */
tick_t now) /* time of access */
{
md_addr_t *phy_page_ptr = (md_addr_t *)blk->user_data;
/* no real memory access, however, should have user data space attached */
assert(phy_page_ptr);
/* fake translation, for now... */
*phy_page_ptr = 0;
/* return tlb miss latency */
return tlb_miss_lat;
}
/* register simulator-specific options */
void
sim_reg_options(struct opt_odb_t *odb)
{
opt_reg_header(odb,
"sim-outorder: This simulator implements a very detailed out-of-order issue\n"
"superscalar processor with a two-level memory system and speculative\n"
"execution support. This simulator is a performance simulator, tracking the\n"
"latency of all pipeline operations.\n"
);
/* instruction limit */
opt_reg_uint(odb, "-max:inst", "maximum number of inst's to execute",
&max_insts, /* default */0,
/* print */TRUE, /* format */NULL);
/* trace options */
opt_reg_int(odb, "-fastfwd", "number of insts skipped before timing starts",
&fastfwd_count, /* default */0,
/* print */TRUE, /* format */NULL);
opt_reg_string_list(odb, "-ptrace",
"generate pipetrace, i.e., <fname|stdout|stderr> <range>",
ptrace_opts, /* arr_sz */2, &ptrace_nelt, /* default */NULL,
/* !print */FALSE, /* format */NULL, /* !accrue */FALSE);
opt_reg_note(odb,
" Pipetrace range arguments are formatted as follows:\n"
"\n"
" {{@|#}<start>}:{{@|#|+}<end>}\n"
"\n"
" Both ends of the range are optional, if neither are specified, the entire\n"
" execution is traced. Ranges that start with a `@' designate an address\n"
" range to be traced, those that start with an `#' designate a cycle count\n"
" range. All other range values represent an instruction count range. The\n"
" second argument, if specified with a `+', indicates a value relative\n"
" to the first argument, e.g., 1000:+100 == 1000:1100. Program symbols may\n"
" be used in all contexts.\n"
"\n"
" Examples: -ptrace FOO.trc #0:#1000\n"
" -ptrace BAR.trc @2000:\n"
" -ptrace BLAH.trc :1500\n"
" -ptrace UXXE.trc :\n"
" -ptrace FOOBAR.trc @main:+278\n"
);
/* ifetch options */
opt_reg_int(odb, "-fetch:ifqsize", "instruction fetch queue size (in insts)",
&ruu_ifq_size, /* default */4,
/* print */TRUE, /* format */NULL);
opt_reg_int(odb, "-fetch:mplat", "extra branch mis-prediction latency",
&ruu_branch_penalty, /* default */3,
/* print */TRUE, /* format */NULL);
opt_reg_int(odb, "-fetch:speed",
"speed of front-end of machine relative to execution core",
&fetch_speed, /* default */1,
/* print */TRUE, /* format */NULL);
/* branch predictor options */
opt_reg_note(odb,
" Branch predictor configuration examples for 2-level predictor:\n"
" Configurations: N, M, W, X\n"
" N # entries in first level (# of shift register(s))\n"
" W width of shift register(s)\n"
" M # entries in 2nd level (# of counters, or other FSM)\n"
" X (yes-1/no-0) xor history and address for 2nd level index\n"
" Sample predictors:\n"
" GAg : 1, W, 2^W, 0\n"
" GAp : 1, W, M (M > 2^W), 0\n"
" PAg : N, W, 2^W, 0\n"
" PAp : N, W, M (M == 2^(N+W)), 0\n"
" gshare : 1, W, 2^W, 1\n"
" Predictor `comb' combines a bimodal and a 2-level predictor.\n"
);
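/* For example (values illustrative, not a recommendation), a gshare
   predictor with a single 14-bit global history register xor'ed into a
   2^14-entry second-level table follows the recipe above as:
     -bpred 2lev -bpred:2lev 1 16384 14 1 */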
opt_reg_string(odb, "-bpred",
"branch predictor type {nottaken|taken|perfect|bimod|threebit|2lev|comb}",
&pred_type, /* default */"bimod",
/* print */TRUE, /* format */NULL);
opt_reg_int_list(odb, "-bpred:bimod",
"bimodal predictor config (<table size>)",
bimod_config, bimod_nelt, &bimod_nelt,
/* default */bimod_config,
/* print */TRUE, /* format */NULL, /* !accrue */FALSE);
opt_reg_int_list(odb, "-bpred:threebit",
"3-bit predictor config (<table size>)",
threebit_config, threebit_nelt, &threebit_nelt,
/* default */threebit_config,
/* print */TRUE, /* format */NULL, /* !accrue */FALSE);
opt_reg_int_list(odb, "-bpred:2lev",
"2-level predictor config "
"(<l1size> <l2size> <hist_size> <xor>)",
twolev_config, twolev_nelt, &twolev_nelt,
/* default */twolev_config,
/* print */TRUE, /* format */NULL, /* !accrue */FALSE);
opt_reg_int_list(odb, "-bpred:comb",
"combining predictor config (<meta_table_size>)",
comb_config, comb_nelt, &comb_nelt,
/* default */comb_config,
/* print */TRUE, /* format */NULL, /* !accrue */FALSE);
opt_reg_int(odb, "-bpred:ras",
"return address stack size (0 for no return stack)",
&ras_size, /* default */ras_size,
/* print */TRUE, /* format */NULL);
opt_reg_int_list(odb, "-bpred:btb",
"BTB config (<num_sets> <associativity>)",
btb_config, btb_nelt, &btb_nelt,
/* default */btb_config,
/* print */TRUE, /* format */NULL, /* !accrue */FALSE);
opt_reg_string(odb, "-bpred:spec_update",
"speculative predictors update in {ID|WB} (default non-spec)",
&bpred_spec_opt, /* default */NULL,
/* print */TRUE, /* format */NULL);
/* decode options */
opt_reg_int(odb, "-decode:width",
"instruction decode B/W (insts/cycle)",
&ruu_decode_width, /* default */4,
/* print */TRUE, /* format */NULL);
/* issue options */
opt_reg_int(odb, "-issue:width",
"instruction issue B/W (insts/cycle)",
&ruu_issue_width, /* default */4,
/* print */TRUE, /* format */NULL);
opt_reg_flag(odb, "-issue:inorder", "run pipeline with in-order issue",
&ruu_inorder_issue, /* default */FALSE,
/* print */TRUE, /* format */NULL);
opt_reg_flag(odb, "-issue:wrongpath",
"issue instructions down wrong execution paths",
&ruu_include_spec, /* default */TRUE,
/* print */TRUE, /* format */NULL);
/* commit options */
opt_reg_int(odb, "-commit:width",
"instruction commit B/W (insts/cycle)",
&ruu_commit_width, /* default */4,
/* print */TRUE, /* format */NULL);
/* register scheduler options */
opt_reg_int(odb, "-ruu:size",
"register update unit (RUU) size",
&RUU_size, /* default */16,
/* print */TRUE, /* format */NULL);
/* memory scheduler options */
opt_reg_int(odb, "-lsq:size",
"load/store queue (LSQ) size",
&LSQ_size, /* default */8,
/* print */TRUE, /* format */NULL);
/* cache options */
opt_reg_string(odb, "-cache:dl1",
"l1 data cache config, i.e., {<config>|none}",
&cache_dl1_opt, "dl1:128:32:4:l",
/* print */TRUE, NULL);
opt_reg_note(odb,
" The cache config parameter <config> has the following format:\n"
"\n"
" <name>:<nsets>:<bsize>:<assoc>:<repl>\n"
"\n"
" <name> - name of the cache being defined\n"
" <nsets> - number of sets in the cache\n"
" <bsize> - block size of the cache\n"
" <assoc> - associativity of the cache\n"
" <repl> - block replacement strategy, 'l'-LRU, 'f'-FIFO, 'r'-random\n"
"\n"
" Examples: -cache:dl1 dl1:4096:32:1:l\n"
" -dtlb dtlb:128:4096:32:r\n"
);
opt_reg_int(odb, "-cache:dl1lat",
"l1 data cache hit latency (in cycles)",
&cache_dl1_lat, /* default */1,
/* print */TRUE, /* format */NULL);
opt_reg_string(odb, "-cache:dl2",
"l2 data cache config, i.e., {<config>|none}",
&cache_dl2_opt, "ul2:1024:64:4:l",
/* print */TRUE, NULL);
opt_reg_int(odb, "-cache:dl2lat",
"l2 data cache hit latency (in cycles)",
&cache_dl2_lat, /* default */6,
/* print */TRUE, /* format */NULL);
opt_reg_string(odb, "-cache:il1",
"l1 inst cache config, i.e., {<config>|dl1|dl2|none}",
&cache_il1_opt, "il1:512:32:1:l",
/* print */TRUE, NULL);
opt_reg_note(odb,
" Cache levels can be unified by pointing a level of the instruction cache\n"
" hierarchy at the data cache hiearchy using the \"dl1\" and \"dl2\" cache\n"
" configuration arguments. Most sensible combinations are supported, e.g.,\n"
"\n"
" A unified l2 cache (il2 is pointed at dl2):\n"
" -cache:il1 il1:128:64:1:l -cache:il2 dl2\n"
" -cache:dl1 dl1:256:32:1:l -cache:dl2 ul2:1024:64:2:l\n"
"\n"
" Or, a fully unified cache hierarchy (il1 pointed at dl1):\n"
" -cache:il1 dl1\n"
" -cache:dl1 ul1:256:32:1:l -cache:dl2 ul2:1024:64:2:l\n"
);
opt_reg_int(odb, "-cache:il1lat",
"l1 instruction cache hit latency (in cycles)",
&cache_il1_lat, /* default */1,
/* print */TRUE, /* format */NULL);
opt_reg_string(odb, "-cache:il2",
"l2 instruction cache config, i.e., {<config>|dl2|none}",
&cache_il2_opt, "dl2",
/* print */TRUE, NULL);
opt_reg_int(odb, "-cache:il2lat",
"l2 instruction cache hit latency (in cycles)",
&cache_il2_lat, /* default */6,
/* print */TRUE, /* format */NULL);
opt_reg_flag(odb, "-cache:flush", "flush caches on system calls",
&flush_on_syscalls, /* default */FALSE, /* print */TRUE, NULL);
opt_reg_flag(odb, "-cache:icompress",
"convert 64-bit inst addresses to 32-bit inst equivalents",
&compress_icache_addrs, /* default */FALSE,
/* print */TRUE, NULL);
/* mem options */
opt_reg_int_list(odb, "-mem:lat",
"memory access latency (<first_chunk> <inter_chunk>)",
mem_lat, mem_nelt, &mem_nelt, mem_lat,
/* print */TRUE, /* format */NULL, /* !accrue */FALSE);
opt_reg_int(odb, "-mem:width", "memory access bus width (in bytes)",
&mem_bus_width, /* default */8,
/* print */TRUE, /* format */NULL);
/* TLB options */
opt_reg_string(odb, "-tlb:itlb",
"instruction TLB config, i.e., {<config>|none}",
&itlb_opt, "itlb:16:4096:4:l", /* print */TRUE, NULL);
opt_reg_string(odb, "-tlb:dtlb",
"data TLB config, i.e., {<config>|none}",
&dtlb_opt, "dtlb:32:4096:4:l", /* print */TRUE, NULL);
opt_reg_int(odb, "-tlb:lat",
"inst/data TLB miss latency (in cycles)",
&tlb_miss_lat, /* default */30,
/* print */TRUE, /* format */NULL);
/* resource configuration */
opt_reg_int(odb, "-res:ialu",
"total number of integer ALU's available",
&res_ialu, /* default */fu_config[FU_IALU_INDEX].quantity,
/* print */TRUE, /* format */NULL);
opt_reg_int(odb, "-res:imult",
"total number of integer multiplier/dividers available",
&res_imult, /* default */fu_config[FU_IMULT_INDEX].quantity,
/* print */TRUE, /* format */NULL);
opt_reg_int(odb, "-res:memport",
"total number of memory system ports available (to CPU)",
&res_memport, /* default */fu_config[FU_MEMPORT_INDEX].quantity,
/* print */TRUE, /* format */NULL);
opt_reg_int(odb, "-res:fpalu",
"total number of floating point ALU's available",
&res_fpalu, /* default */fu_config[FU_FPALU_INDEX].quantity,
/* print */TRUE, /* format */NULL);
opt_reg_int(odb, "-res:fpmult",
"total number of floating point multiplier/dividers available",
&res_fpmult, /* default */fu_config[FU_FPMULT_INDEX].quantity,
/* print */TRUE, /* format */NULL);
opt_reg_string_list(odb, "-pcstat",
"profile stat(s) against text addr's (mult uses ok)",
pcstat_vars, MAX_PCSTAT_VARS, &pcstat_nelt, NULL,
/* !print */FALSE, /* format */NULL, /* accrue */TRUE);
opt_reg_flag(odb, "-bugcompat",
"operate in backward-compatible bugs mode (for testing only)",
&bugcompat_mode, /* default */FALSE, /* print */TRUE, NULL);
}
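/* Example invocation exercising a few of the options registered above (the
   benchmark name and arguments are placeholders):
     sim-outorder -fastfwd 1000000 -max:inst 50000000 \
                  -ruu:size 32 -lsq:size 16 \
                  -cache:dl1 dl1:256:32:2:l -bpred 2lev my_benchmark args...
   any option left unspecified keeps the default shown in its opt_reg_* call */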
/* check simulator-specific option values */
void
sim_check_options(struct opt_odb_t *odb, /* options database */
int argc, char **argv) /* command line arguments */
{
char name[128], c;
int nsets, bsize, assoc;
if (fastfwd_count < 0 || fastfwd_count >= 2147483647)
fatal("bad fast forward count: %d", fastfwd_count);
if (ruu_ifq_size < 1 || (ruu_ifq_size & (ruu_ifq_size - 1)) != 0)
fatal("inst fetch queue size must be positive > 0 and a power of two");
if (ruu_branch_penalty < 1)
fatal("mis-prediction penalty must be at least 1 cycle");
if (fetch_speed < 1)
fatal("front-end speed must be positive and non-zero");
if (!mystricmp(pred_type, "perfect"))
{
/* perfect predictor */
pred = NULL;
pred_perfect = TRUE;
}
else if (!mystricmp(pred_type, "taken"))
{
/* static predictor, taken */
pred = bpred_create(BPredTaken, 0, 0, 0, 0, 0, 0, 0, 0, 0);
}
else if (!mystricmp(pred_type, "nottaken"))
{
/* static predictor, not taken */
pred = bpred_create(BPredNotTaken, 0, 0, 0, 0, 0, 0, 0, 0, 0);
}
else if (!mystricmp(pred_type, "bimod"))
{
/* bimodal predictor, bpred_create() checks BTB_SIZE */
if (bimod_nelt != 1)
fatal("bad bimod predictor config (<table_size>)");
if (btb_nelt != 2)
fatal("bad btb config (<num_sets> <associativity>)");
/* bimodal predictor, bpred_create() checks BTB_SIZE */
pred = bpred_create(BPred2bit,
/* bimod table size */bimod_config[0],
/* 2lev l1 size */0,
/* 2lev l2 size */0,
/* meta table size */0,
/* history reg size */0,
/* history xor address */0,
/* btb sets */btb_config[0],
/* btb assoc */btb_config[1],
/* ret-addr stack size */ras_size);
}
else if (!mystricmp(pred_type, "threebit"))
{
/* 3-bit predictor, bpred_create() checks BTB_SIZE */
if (threebit_nelt != 1)
fatal("bad 3-bit predictor config (<table_size>)");
if (btb_nelt != 2)
fatal("bad btb config (<num_sets> <associativity>)");
/* 3-bit predictor, bpred_create() checks BTB_SIZE */
pred = bpred_create(BPred3bit,
/* threebit table size */threebit_config[0],
/* 2lev l1 size */0,
/* 2lev l2 size */0,
/* meta table size */0,
/* history reg size */0,
/* history xor address */0,
/* btb sets */btb_config[0],
/* btb assoc */btb_config[1],
/* ret-addr stack size */ras_size);
}
else if (!mystricmp(pred_type, "2lev"))
{
/* 2-level adaptive predictor, bpred_create() checks args */
if (twolev_nelt != 4)
fatal("bad 2-level pred config (<l1size> <l2size> <hist_size> <xor>)");
if (btb_nelt != 2)
fatal("bad btb config (<num_sets> <associativity>)");
pred = bpred_create(BPred2Level,
/* bimod table size */0,
/* 2lev l1 size */twolev_config[0],
/* 2lev l2 size */twolev_config[1],
/* meta table size */0,
/* history reg size */twolev_config[2],
/* history xor address */twolev_config[3],
/* btb sets */btb_config[0],
/* btb assoc */btb_config[1],
/* ret-addr stack size */ras_size);
}
else if (!mystricmp(pred_type, "comb"))
{
/* combining predictor, bpred_create() checks args */
if (twolev_nelt != 4)
fatal("bad 2-level pred config (<l1size> <l2size> <hist_size> <xor>)");
if (bimod_nelt != 1)
fatal("bad bimod predictor config (<table_size>)");
if (comb_nelt != 1)
fatal("bad combining predictor config (<meta_table_size>)");
if (btb_nelt != 2)
fatal("bad btb config (<num_sets> <associativity>)");
pred = bpred_create(BPredComb,
/* bimod table size */bimod_config[0],
/* l1 size */twolev_config[0],
/* l2 size */twolev_config[1],
/* meta table size */comb_config[0],
/* history reg size */twolev_config[2],
/* history xor address */twolev_config[3],
/* btb sets */btb_config[0],
/* btb assoc */btb_config[1],
/* ret-addr stack size */ras_size);
}
else
fatal("cannot parse predictor type `%s'", pred_type);
if (!bpred_spec_opt)
bpred_spec_update = spec_CT;
else if (!mystricmp(bpred_spec_opt, "ID"))
bpred_spec_update = spec_ID;
else if (!mystricmp(bpred_spec_opt, "WB"))
bpred_spec_update = spec_WB;
else
fatal("bad speculative update stage specifier, use {ID|WB}");
if (ruu_decode_width < 1 || (ruu_decode_width & (ruu_decode_width-1)) != 0)
fatal("issue width must be positive non-zero and a power of two");
if (ruu_issue_width < 1 || (ruu_issue_width & (ruu_issue_width-1)) != 0)
fatal("issue width must be positive non-zero and a power of two");
if (ruu_commit_width < 1)
fatal("commit width must be positive non-zero");
if (RUU_size < 2 || (RUU_size & (RUU_size-1)) != 0)
fatal("RUU size must be a positive number > 1 and a power of two");
if (LSQ_size < 2 || (LSQ_size & (LSQ_size-1)) != 0)
fatal("LSQ size must be a positive number > 1 and a power of two");
/* use a level 1 D-cache? */
if (!mystricmp(cache_dl1_opt, "none"))
{
cache_dl1 = NULL;
/* the level 2 D-cache cannot be defined */
if (strcmp(cache_dl2_opt, "none"))
fatal("the l1 data cache must defined if the l2 cache is defined");
cache_dl2 = NULL;
}
else /* dl1 is defined */
{
if (sscanf(cache_dl1_opt, "%[^:]:%d:%d:%d:%c",
name, &nsets, &bsize, &assoc, &c) != 5)
fatal("bad l1 D-cache parms: <name>:<nsets>:<bsize>:<assoc>:<repl>");
cache_dl1 = cache_create(name, nsets, bsize, /* balloc */FALSE,
/* usize */0, assoc, cache_char2policy(c),
dl1_access_fn, /* hit lat */cache_dl1_lat);
/* is the level 2 D-cache defined? */
if (!mystricmp(cache_dl2_opt, "none"))
cache_dl2 = NULL;
else
{
if (sscanf(cache_dl2_opt, "%[^:]:%d:%d:%d:%c",
name, &nsets, &bsize, &assoc, &c) != 5)
fatal("bad l2 D-cache parms: "
"<name>:<nsets>:<bsize>:<assoc>:<repl>");
cache_dl2 = cache_create(name, nsets, bsize, /* balloc */FALSE,
/* usize */0, assoc, cache_char2policy(c),
dl2_access_fn, /* hit lat */cache_dl2_lat);
}
}
/* use a level 1 I-cache? */
if (!mystricmp(cache_il1_opt, "none"))
{
cache_il1 = NULL;
/* the level 2 I-cache cannot be defined */
if (strcmp(cache_il2_opt, "none"))
fatal("the l1 inst cache must defined if the l2 cache is defined");
cache_il2 = NULL;
}
else if (!mystricmp(cache_il1_opt, "dl1"))
{
if (!cache_dl1)
fatal("I-cache l1 cannot access D-cache l1 as it's undefined");
cache_il1 = cache_dl1;
/* the level 2 I-cache cannot be defined */
if (strcmp(cache_il2_opt, "none"))
fatal("the l1 inst cache must defined if the l2 cache is defined");
cache_il2 = NULL;
}
else if (!mystricmp(cache_il1_opt, "dl2"))
{
if (!cache_dl2)
fatal("I-cache l1 cannot access D-cache l2 as it's undefined");
cache_il1 = cache_dl2;
/* the level 2 I-cache cannot be defined */
if (strcmp(cache_il2_opt, "none"))
fatal("the l1 inst cache must defined if the l2 cache is defined");
cache_il2 = NULL;
}
else /* il1 is defined */
{
if (sscanf(cache_il1_opt, "%[^:]:%d:%d:%d:%c",
name, &nsets, &bsize, &assoc, &c) != 5)
fatal("bad l1 I-cache parms: <name>:<nsets>:<bsize>:<assoc>:<repl>");
cache_il1 = cache_create(name, nsets, bsize, /* balloc */FALSE,
/* usize */0, assoc, cache_char2policy(c),
il1_access_fn, /* hit lat */cache_il1_lat);
/* is the level 2 I-cache defined? */
if (!mystricmp(cache_il2_opt, "none"))
cache_il2 = NULL;
else if (!mystricmp(cache_il2_opt, "dl2"))
{
if (!cache_dl2)
fatal("I-cache l2 cannot access D-cache l2 as it's undefined");
cache_il2 = cache_dl2;
}
else
{
if (sscanf(cache_il2_opt, "%[^:]:%d:%d:%d:%c",
name, &nsets, &bsize, &assoc, &c) != 5)
fatal("bad l2 I-cache parms: "
"<name>:<nsets>:<bsize>:<assoc>:<repl>");
cache_il2 = cache_create(name, nsets, bsize, /* balloc */FALSE,
/* usize */0, assoc, cache_char2policy(c),
il2_access_fn, /* hit lat */cache_il2_lat);
}
}
/* use an I-TLB? */
if (!mystricmp(itlb_opt, "none"))
itlb = NULL;
else
{
if (sscanf(itlb_opt, "%[^:]:%d:%d:%d:%c",
name, &nsets, &bsize, &assoc, &c) != 5)
fatal("bad TLB parms: <name>:<nsets>:<page_size>:<assoc>:<repl>");
itlb = cache_create(name, nsets, bsize, /* balloc */FALSE,
/* usize */sizeof(md_addr_t), assoc,
cache_char2policy(c), itlb_access_fn,
/* hit latency */1);
}
/* use a D-TLB? */
if (!mystricmp(dtlb_opt, "none"))
dtlb = NULL;
else
{
if (sscanf(dtlb_opt, "%[^:]:%d:%d:%d:%c",
name, &nsets, &bsize, &assoc, &c) != 5)
fatal("bad TLB parms: <name>:<nsets>:<page_size>:<assoc>:<repl>");
dtlb = cache_create(name, nsets, bsize, /* balloc */FALSE,
/* usize */sizeof(md_addr_t), assoc,
cache_char2policy(c), dtlb_access_fn,
/* hit latency */1);
}
if (cache_dl1_lat < 1)
fatal("l1 data cache latency must be greater than zero");
if (cache_dl2_lat < 1)
fatal("l2 data cache latency must be greater than zero");
if (cache_il1_lat < 1)
fatal("l1 instruction cache latency must be greater than zero");
if (cache_il2_lat < 1)
fatal("l2 instruction cache latency must be greater than zero");
if (mem_nelt != 2)
fatal("bad memory access latency (<first_chunk> <inter_chunk>)");
if (mem_lat[0] < 1 || mem_lat[1] < 1)
fatal("all memory access latencies must be greater than zero");
if (mem_bus_width < 1 || (mem_bus_width & (mem_bus_width-1)) != 0)
fatal("memory bus width must be positive non-zero and a power of two");
if (tlb_miss_lat < 1)
fatal("TLB miss latency must be greater than zero");
if (res_ialu < 1)
fatal("number of integer ALU's must be greater than zero");
if (res_ialu > MAX_INSTS_PER_CLASS)
fatal("number of integer ALU's must be <= MAX_INSTS_PER_CLASS");
fu_config[FU_IALU_INDEX].quantity = res_ialu;
if (res_imult < 1)
fatal("number of integer multiplier/dividers must be greater than zero");
if (res_imult > MAX_INSTS_PER_CLASS)
fatal("number of integer mult/div's must be <= MAX_INSTS_PER_CLASS");
fu_config[FU_IMULT_INDEX].quantity = res_imult;
if (res_memport < 1)
fatal("number of memory system ports must be greater than zero");
if (res_memport > MAX_INSTS_PER_CLASS)
fatal("number of memory system ports must be <= MAX_INSTS_PER_CLASS");
fu_config[FU_MEMPORT_INDEX].quantity = res_memport;
if (res_fpalu < 1)
fatal("number of floating point ALU's must be greater than zero");
if (res_fpalu > MAX_INSTS_PER_CLASS)
fatal("number of floating point ALU's must be <= MAX_INSTS_PER_CLASS");
fu_config[FU_FPALU_INDEX].quantity = res_fpalu;
if (res_fpmult < 1)
fatal("number of floating point multiplier/dividers must be > zero");
if (res_fpmult > MAX_INSTS_PER_CLASS)
fatal("number of FP mult/div's must be <= MAX_INSTS_PER_CLASS");
fu_config[FU_FPMULT_INDEX].quantity = res_fpmult;
}
/* print simulator-specific configuration information */
void
sim_aux_config(FILE *stream) /* output stream */
{
/* nada */
}
/* register simulator-specific statistics */
void
sim_reg_stats(struct stat_sdb_t *sdb) /* stats database */
{
int i;
stat_reg_counter(sdb, "sim_num_insn",
"total number of instructions committed",
&sim_num_insn, sim_num_insn, NULL);
stat_reg_counter(sdb, "sim_num_refs",
"total number of loads and stores committed",
&sim_num_refs, 0, NULL);
stat_reg_counter(sdb, "sim_num_loads",
"total number of loads committed",
&sim_num_loads, 0, NULL);
stat_reg_formula(sdb, "sim_num_stores",
"total number of stores committed",
"sim_num_refs - sim_num_loads", NULL);
stat_reg_counter(sdb, "sim_num_branches",
"total number of branches committed",
&sim_num_branches, /* initial value */0, /* format */NULL);
stat_reg_int(sdb, "sim_elapsed_time",
"total simulation time in seconds",
&sim_elapsed_time, 0, NULL);
stat_reg_formula(sdb, "sim_inst_rate",
"simulation speed (in insts/sec)",
"sim_num_insn / sim_elapsed_time", NULL);
stat_reg_counter(sdb, "sim_total_insn",
"total number of instructions executed",
&sim_total_insn, 0, NULL);
stat_reg_counter(sdb, "sim_total_refs",
"total number of loads and stores executed",
&sim_total_refs, 0, NULL);
stat_reg_counter(sdb, "sim_total_loads",
"total number of loads executed",
&sim_total_loads, 0, NULL);
stat_reg_formula(sdb, "sim_total_stores",
"total number of stores executed",
"sim_total_refs - sim_total_loads", NULL);
stat_reg_counter(sdb, "sim_total_branches",
"total number of branches executed",
&sim_total_branches, /* initial value */0, /* format */NULL);
/* register performance stats */
stat_reg_counter(sdb, "sim_cycle",
"total simulation time in cycles",
&sim_cycle, /* initial value */0, /* format */NULL);
stat_reg_formula(sdb, "sim_IPC",
"instructions per cycle",
"sim_num_insn / sim_cycle", /* format */NULL);
stat_reg_formula(sdb, "sim_CPI",
"cycles per instruction",
"sim_cycle / sim_num_insn", /* format */NULL);
stat_reg_formula(sdb, "sim_exec_BW",
"total instructions (mis-spec + committed) per cycle",
"sim_total_insn / sim_cycle", /* format */NULL);
stat_reg_formula(sdb, "sim_IPB",
"instruction per branch",
"sim_num_insn / sim_num_branches", /* format */NULL);
/* occupancy stats */
stat_reg_counter(sdb, "IFQ_count", "cumulative IFQ occupancy",
&IFQ_count, /* initial value */0, /* format */NULL);
stat_reg_counter(sdb, "IFQ_fcount", "cumulative IFQ full count",
&IFQ_fcount, /* initial value */0, /* format */NULL);
stat_reg_formula(sdb, "ifq_occupancy", "avg IFQ occupancy (insn's)",
"IFQ_count / sim_cycle", /* format */NULL);
stat_reg_formula(sdb, "ifq_rate", "avg IFQ dispatch rate (insn/cycle)",
"sim_total_insn / sim_cycle", /* format */NULL);
stat_reg_formula(sdb, "ifq_latency", "avg IFQ occupant latency (cycle's)",
"ifq_occupancy / ifq_rate", /* format */NULL);
stat_reg_formula(sdb, "ifq_full", "fraction of time (cycle's) IFQ was full",
"IFQ_fcount / sim_cycle", /* format */NULL);
stat_reg_counter(sdb, "RUU_count", "cumulative RUU occupancy",
&RUU_count, /* initial value */0, /* format */NULL);
stat_reg_counter(sdb, "RUU_fcount", "cumulative RUU full count",
&RUU_fcount, /* initial value */0, /* format */NULL);
stat_reg_formula(sdb, "ruu_occupancy", "avg RUU occupancy (insn's)",
"RUU_count / sim_cycle", /* format */NULL);
stat_reg_formula(sdb, "ruu_rate", "avg RUU dispatch rate (insn/cycle)",
"sim_total_insn / sim_cycle", /* format */NULL);
stat_reg_formula(sdb, "ruu_latency", "avg RUU occupant latency (cycle's)",
"ruu_occupancy / ruu_rate", /* format */NULL);
stat_reg_formula(sdb, "ruu_full", "fraction of time (cycle's) RUU was full",
"RUU_fcount / sim_cycle", /* format */NULL);
stat_reg_counter(sdb, "LSQ_count", "cumulative LSQ occupancy",
&LSQ_count, /* initial value */0, /* format */NULL);
stat_reg_counter(sdb, "LSQ_fcount", "cumulative LSQ full count",
&LSQ_fcount, /* initial value */0, /* format */NULL);
stat_reg_formula(sdb, "lsq_occupancy", "avg LSQ occupancy (insn's)",
"LSQ_count / sim_cycle", /* format */NULL);
stat_reg_formula(sdb, "lsq_rate", "avg LSQ dispatch rate (insn/cycle)",
"sim_total_insn / sim_cycle", /* format */NULL);
stat_reg_formula(sdb, "lsq_latency", "avg LSQ occupant latency (cycle's)",
"lsq_occupancy / lsq_rate", /* format */NULL);
stat_reg_formula(sdb, "lsq_full", "fraction of time (cycle's) LSQ was full",
"LSQ_fcount / sim_cycle", /* format */NULL);
stat_reg_counter(sdb, "sim_slip",
"total number of slip cycles",
&sim_slip, 0, NULL);
/* register baseline stats */
stat_reg_formula(sdb, "avg_sim_slip",
"the average slip between issue and retirement",
"sim_slip / sim_num_insn", NULL);
/* register predictor stats */
if (pred)
bpred_reg_stats(pred, sdb);
/* register cache stats */
if (cache_il1
&& (cache_il1 != cache_dl1 && cache_il1 != cache_dl2))
cache_reg_stats(cache_il1, sdb);
if (cache_il2
&& (cache_il2 != cache_dl1 && cache_il2 != cache_dl2))
cache_reg_stats(cache_il2, sdb);
if (cache_dl1)
cache_reg_stats(cache_dl1, sdb);
if (cache_dl2)
cache_reg_stats(cache_dl2, sdb);
if (itlb)
cache_reg_stats(itlb, sdb);
if (dtlb)
cache_reg_stats(dtlb, sdb);
/* debug variable(s) */
stat_reg_counter(sdb, "sim_invalid_addrs",
"total non-speculative bogus addresses seen (debug var)",
&sim_invalid_addrs, /* initial value */0, /* format */NULL);
for (i=0; i<pcstat_nelt; i++)
{
char buf[512], buf1[512];
struct stat_stat_t *stat;
/* track the named statistical variable by text address */
/* find it... */
stat = stat_find_stat(sdb, pcstat_vars[i]);
if (!stat)
fatal("cannot locate any statistic named `%s'", pcstat_vars[i]);
/* stat must be an integral type */
if (stat->sc != sc_int && stat->sc != sc_uint && stat->sc != sc_counter)
fatal("`-pcstat' statistical variable `%s' is not an integral type",
stat->name);
/* register this stat */
pcstat_stats[i] = stat;
pcstat_lastvals[i] = STATVAL(stat);
/* declare the sparse text distribution */
sprintf(buf, "%s_by_pc", stat->name);
sprintf(buf1, "%s (by text address)", stat->desc);
pcstat_sdists[i] = stat_reg_sdist(sdb, buf, buf1,
/* initial value */0,
/* print format */(PF_COUNT|PF_PDF),
/* format */"0x%lx %lu %.2f",
/* print fn */NULL);
}
ld_reg_stats(sdb);
mem_reg_stats(mem, sdb);
}
/* forward declarations */
static void ruu_init(void);
static void lsq_init(void);
static void rslink_init(int nlinks);
static void eventq_init(void);
static void readyq_init(void);
static void cv_init(void);
static void tracer_init(void);
static void fetch_init(void);
/* initialize the simulator */
void
sim_init(void)
{
sim_num_refs = 0;
/* allocate and initialize register file */
regs_init(&regs);
/* allocate and initialize memory space */
mem = mem_create("mem");
mem_init(mem);
}
/* default register state accessor, used by DLite */
static char * /* err str, NULL for no err */
simoo_reg_obj(struct regs_t *regs, /* registers to access */
int is_write, /* access type */
enum md_reg_type rt, /* reg bank to probe */
int reg, /* register number */
struct eval_value_t *val); /* input, output */
/* default memory state accessor, used by DLite */
static char * /* err str, NULL for no err */
simoo_mem_obj(struct mem_t *mem, /* memory space to access */
int is_write, /* access type */
md_addr_t addr, /* address to access */
char *p, /* input/output buffer */
int nbytes); /* size of access */
/* default machine state accessor, used by DLite */
static char * /* err str, NULL for no err */
simoo_mstate_obj(FILE *stream, /* output stream */
char *cmd, /* optional command string */
struct regs_t *regs, /* registers to access */
struct mem_t *mem); /* memory space to access */
/* total RS links allocated at program start */
#define MAX_RS_LINKS 4096
/* load program into simulated state */
void
sim_load_prog(char *fname, /* program to load */
int argc, char **argv, /* program arguments */
char **envp) /* program environment */
{
/* load program text and data, set up environment, memory, and regs */
ld_load_prog(fname, argc, argv, envp, &regs, mem, TRUE);
/* initialize here, so symbols can be loaded */
if (ptrace_nelt == 2)
{
/* generate a pipeline trace */
ptrace_open(/* fname */ptrace_opts[0], /* range */ptrace_opts[1]);
}
else if (ptrace_nelt == 0)
{
/* no pipetracing */;
}
else
fatal("bad pipetrace args, use: <fname|stdout|stderr> <range>");
/* finish initialization of the simulation engine */
fu_pool = res_create_pool("fu-pool", fu_config, N_ELT(fu_config));
rslink_init(MAX_RS_LINKS);
tracer_init();
fetch_init();
cv_init();
eventq_init();
readyq_init();
ruu_init();
lsq_init();
/* initialize the DLite debugger */
dlite_init(simoo_reg_obj, simoo_mem_obj, simoo_mstate_obj);
}
/* dump simulator-specific auxiliary simulator statistics */
void
sim_aux_stats(FILE *stream) /* output stream */
{
/* nada */
}
/* un-initialize the simulator */
void
sim_uninit(void)
{
if (ptrace_nelt > 0)
ptrace_close();
}
/*
* processor core definitions and declarations
*/
/* inst tag type, used to tag an operation instance in the RUU */
typedef unsigned int INST_TAG_TYPE;
/* inst sequence type, used to order instructions in the ready list, if
this rolls over, the ready list order will temporarily get messed up,
but execution will continue and complete correctly */
typedef unsigned int INST_SEQ_TYPE;
/* total input dependencies possible */
#define MAX_IDEPS 3
/* total output dependencies possible */
#define MAX_ODEPS 2
/* a register update unit (RUU) station, this record is contained in the
processors RUU, which serves as a collection of ordered reservations
stations. The reservation stations capture register results and await
the time when all operands are ready, at which time the instruction is
issued to the functional units; the RUU is an ordered circular queue, in which
instructions are inserted in fetch (program) order, results are stored in
the RUU buffers, and later, when an RUU entry is the oldest entry in the
machine, its instruction's result is retired to the architectural
register file in program order, NOTE: the RUU and LSQ share the same
structure, this is useful because loads and stores are split into two
operations: an effective address add and a load/store, the add is inserted
into the RUU and the load/store inserted into the LSQ, allowing the add
to wake up the load/store when effective address computation has finished */
struct RUU_station {
/* inst info */
md_inst_t IR; /* instruction bits */
enum md_opcode op; /* decoded instruction opcode */
md_addr_t PC, next_PC, pred_PC; /* inst PC, next PC, predicted PC */
int in_LSQ; /* non-zero if op is in LSQ */
int ea_comp; /* non-zero if op is an addr comp */
int recover_inst; /* start of mis-speculation? */
int stack_recover_idx; /* non-speculative TOS for RSB pred */
struct bpred_update_t dir_update; /* bpred direction update info */
int spec_mode; /* non-zero if issued in spec_mode */
md_addr_t addr; /* effective address for ld/st's */
INST_TAG_TYPE tag; /* RUU slot tag, increment to
squash operation */
INST_SEQ_TYPE seq; /* instruction sequence, used to
sort the ready list and tag inst */
unsigned int ptrace_seq; /* pipetrace sequence number */
int slip;
/* instruction status */
int queued; /* operands ready and queued */
int issued; /* operation is/was executing */
int completed; /* operation has completed execution */
/* output operand dependency list, these lists are used to
limit the number of associative searches into the RUU when
instructions complete and need to wake up dependent insts */
int onames[MAX_ODEPS]; /* output logical names (NA=unused) */
struct RS_link *odep_list[MAX_ODEPS]; /* chains to consuming operations */
/* input dependent links, the output chains rooted above use these
fields to mark input operands as ready, when all these fields have
been set non-zero, the RUU operation has all of its register
operands, it may commence execution as soon as all of its memory
operands are known to be read (see lsq_refresh() for details on
enforcing memory dependencies) */
int idep_ready[MAX_IDEPS]; /* input operand ready? */
};
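/* Example of the RUU/LSQ split described above: a load instruction is
   dispatched as two linked operations, an effective-address add placed in
   the RUU (with ea_comp set) and the memory access itself placed in the LSQ;
   the add's output dependence chain wakes the LSQ entry once the address is
   known.  This is only a sketch of the mechanism; dispatch is implemented
   later in this file. */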
/* non-zero if all register operands are ready, update with MAX_IDEPS */
#define OPERANDS_READY(RS) \
((RS)->idep_ready[0] && (RS)->idep_ready[1] && (RS)->idep_ready[2])
/* register update unit, combination of reservation stations and reorder
buffer device, organized as a circular queue */
static struct RUU_station *RUU; /* register update unit */
static int RUU_head, RUU_tail; /* RUU head and tail pointers */
static int RUU_num; /* num entries currently in RUU */
/* allocate and initialize register update unit (RUU) */
static void
ruu_init(void)
{
RUU = calloc(RUU_size, sizeof(struct RUU_station));
if (!RUU)
fatal("out of virtual memory");
RUU_num = 0;
RUU_head = RUU_tail = 0;
RUU_count = 0;
RUU_fcount = 0;
}
/* dump the contents of one RUU station */
static void
ruu_dumpent(struct RUU_station *rs, /* ptr to RUU station */
int index, /* entry index */
FILE *stream, /* output stream */
int header) /* print header? */
{
if (!stream)
stream = stderr;
if (header)
fprintf(stream, "idx: %2d: opcode: %s, inst: `",
index, MD_OP_NAME(rs->op));
else
fprintf(stream, " opcode: %s, inst: `",
MD_OP_NAME(rs->op));
md_print_insn(rs->IR, rs->PC, stream);
fprintf(stream, "'\n");
myfprintf(stream, " PC: 0x%08p, NPC: 0x%08p (pred_PC: 0x%08p)\n",
rs->PC, rs->next_PC, rs->pred_PC);
fprintf(stream, " in_LSQ: %s, ea_comp: %s, recover_inst: %s\n",
rs->in_LSQ ? "t" : "f",
rs->ea_comp ? "t" : "f",
rs->recover_inst ? "t" : "f");
myfprintf(stream, " spec_mode: %s, addr: 0x%08p, tag: 0x%08x\n",
rs->spec_mode ? "t" : "f", rs->addr, rs->tag);
fprintf(stream, " seq: 0x%08x, ptrace_seq: 0x%08x\n",
rs->seq, rs->ptrace_seq);
fprintf(stream, " queued: %s, issued: %s, completed: %s\n",
rs->queued ? "t" : "f",
rs->issued ? "t" : "f",
rs->completed ? "t" : "f");
fprintf(stream, " operands ready: %s\n",
OPERANDS_READY(rs) ? "t" : "f");
}
/* dump the contents of the RUU */
static void
ruu_dump(FILE *stream) /* output stream */
{
int num, head;
struct RUU_station *rs;
if (!stream)
stream = stderr;
fprintf(stream, "** RUU state **\n");
fprintf(stream, "RUU_head: %d, RUU_tail: %d\n", RUU_head, RUU_tail);
fprintf(stream, "RUU_num: %d\n", RUU_num);
num = RUU_num;
head = RUU_head;
while (num)
{
rs = &RUU[head];
ruu_dumpent(rs, rs - RUU, stream, /* header */TRUE);
head = (head + 1) % RUU_size;
num--;
}
}
/*
* load/store queue (LSQ): holds loads and stores in program order, indicating
* status of load/store access:
*
* - issued: address computation complete, memory access in progress
* - completed: memory access has completed, stored value available
* - squashed: memory access was squashed, ignore this entry
*
* loads may execute when:
* 1) register operands are ready, and
* 2) memory operands are ready (no earlier unresolved store)
*
* loads are serviced by:
* 1) previous store at same address in LSQ (hit latency), or
* 2) data cache (hit latency + miss latency)
*
* stores may execute when:
* 1) register operands are ready
*
* stores are serviced by:
* 1) depositing store value into the load/store queue
* 2) writing store value to the store buffer (plus tag check) at commit
* 3) writing store buffer entry to data cache when cache is free
*
* NOTE: the load/store queue can bypass a store value to a load in the same
* cycle the store executes (using a bypass network), thus stores complete
* in effective zero time after their effective address is known
*/
static struct RUU_station *LSQ; /* load/store queue */
static int LSQ_head, LSQ_tail; /* LSQ head and tail pointers */
static int LSQ_num; /* num entries currently in LSQ */
/*
* input dependencies for stores in the LSQ:
* idep #0 - operand input (value that is store'd)
* idep #1 - effective address input (address of store operation)
*/
#define STORE_OP_INDEX 0
#define STORE_ADDR_INDEX 1
#define STORE_OP_READY(RS) ((RS)->idep_ready[STORE_OP_INDEX])
#define STORE_ADDR_READY(RS) ((RS)->idep_ready[STORE_ADDR_INDEX])
/* allocate and initialize the load/store queue (LSQ) */
static void
lsq_init(void)
{
LSQ = calloc(LSQ_size, sizeof(struct RUU_station));
if (!LSQ)
fatal("out of virtual memory");
LSQ_num = 0;
LSQ_head = LSQ_tail = 0;
LSQ_count = 0;
LSQ_fcount = 0;
}
/* dump the contents of the LSQ */
static void
lsq_dump(FILE *stream) /* output stream */
{
int num, head;
struct RUU_station *rs;
if (!stream)
stream = stderr;
fprintf(stream, "** LSQ state **\n");
fprintf(stream, "LSQ_head: %d, LSQ_tail: %d\n", LSQ_head, LSQ_tail);
fprintf(stream, "LSQ_num: %d\n", LSQ_num);
num = LSQ_num;
head = LSQ_head;
while (num)
{
rs = &LSQ[head];
ruu_dumpent(rs, rs - LSQ, stream, /* header */TRUE);
head = (head + 1) % LSQ_size;
num--;
}
}
/*
* RS_LINK defs and decls
*/
/* a reservation station link: this structure links elements of a RUU
reservation station list; used for ready instruction queue, event queue, and
output dependency lists; each RS_LINK node contains a pointer to the RUU
entry it references along with an instance tag, the RS_LINK is only valid if
the instruction instance tag matches the instruction RUU entry instance tag;
this strategy allows entries in the RUU to be squashed and reused without
updating the lists that point to them, which significantly improves the
performance of (all too frequent) squash events */
struct RS_link {
struct RS_link *next; /* next entry in list */
struct RUU_station *rs; /* referenced RUU resv station */
INST_TAG_TYPE tag; /* inst instance sequence number */
union {
tick_t when; /* time stamp of entry (for eventq) */
INST_SEQ_TYPE seq; /* inst sequence */
int opnum; /* input/output operand number */
} x;
};
/* RS link free list, grab RS_LINKs from here, when needed */
static struct RS_link *rslink_free_list;
/* NULL value for an RS link */
#define RSLINK_NULL_DATA { NULL, NULL, 0 }
static struct RS_link RSLINK_NULL = RSLINK_NULL_DATA;
/* create and initialize an RS link */
#define RSLINK_INIT(RSL, RS) \
((RSL).next = NULL, (RSL).rs = (RS), (RSL).tag = (RS)->tag)
/* non-zero if RS link is NULL */
#define RSLINK_IS_NULL(LINK) ((LINK)->rs == NULL)
/* non-zero if RS link is to a valid (non-squashed) entry */
#define RSLINK_VALID(LINK) ((LINK)->tag == (LINK)->rs->tag)
/* extract the RUU reservation station pointer from an RS link */
#define RSLINK_RS(LINK) ((LINK)->rs)
/* get a new RS link record */
#define RSLINK_NEW(DST, RS) \
{ struct RS_link *n_link; \
if (!rslink_free_list) \
panic("out of rs links"); \
n_link = rslink_free_list; \
rslink_free_list = rslink_free_list->next; \
n_link->next = NULL; \
n_link->rs = (RS); n_link->tag = n_link->rs->tag; \
(DST) = n_link; \
}
/* free an RS link record */
#define RSLINK_FREE(LINK) \
{ struct RS_link *f_link = (LINK); \
f_link->rs = NULL; f_link->tag = 0; \
f_link->next = rslink_free_list; \
rslink_free_list = f_link; \
}
/* FIXME: could this be faster!!! */
/* free an RS link list */
#define RSLINK_FREE_LIST(LINK) \
{ struct RS_link *fl_link, *fl_link_next; \
for (fl_link=(LINK); fl_link; fl_link=fl_link_next) \
{ \
fl_link_next = fl_link->next; \
RSLINK_FREE(fl_link); \
} \
}
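/* Typical use of the link macros above (a sketch; the real call sites are
   the event queue and ready queue code below):

     struct RS_link *link;
     RSLINK_NEW(link, rs);              // allocate from free pool, tag it
     ...
     if (RSLINK_VALID(link))            // tag still matches => not squashed
       do_something(RSLINK_RS(link));   // recover the RUU station pointer
     RSLINK_FREE(link);                 // return the node to the free pool
*/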
/* initialize the free RS_LINK pool */
static void
rslink_init(int nlinks) /* total number of RS_LINK available */
{
int i;
struct RS_link *link;
rslink_free_list = NULL;
for (i=0; i<nlinks; i++)
{
link = calloc(1, sizeof(struct RS_link));
if (!link)
fatal("out of virtual memory");
link->next = rslink_free_list;
rslink_free_list = link;
}
}
/* service all functional unit release events, this function is called
once per cycle, and it is used to step the BUSY timers attached to each
functional unit in the functional unit resource pool, as long as a functional
unit's BUSY count is > 0, it cannot be issued an operation */
static void
ruu_release_fu(void)
{
int i;
/* walk all resource units, decrement busy counts by one */
for (i=0; i<fu_pool->num_resources; i++)
{
/* resource is released when BUSY hits zero */
if (fu_pool->resources[i].busy > 0)
fu_pool->resources[i].busy--;
}
}
/*
* the execution unit event queue implementation follows, the event queue
* indicates which instruction will complete next, the writeback handler
* drains this queue
*/
/* pending event queue, sorted from soonest to latest event (in time), NOTE:
RS_LINK nodes are used for the event queue list so that it need not be
updated during squash events */
static struct RS_link *event_queue;
/* initialize the event queue structures */
static void
eventq_init(void)
{
event_queue = NULL;
}
/* dump the contents of the event queue */
static void
eventq_dump(FILE *stream) /* output stream */
{
struct RS_link *ev;
if (!stream)
stream = stderr;
fprintf(stream, "** event queue state **\n");
for (ev = event_queue; ev != NULL; ev = ev->next)
{
/* is event still valid? */
if (RSLINK_VALID(ev))
{
struct RUU_station *rs = RSLINK_RS(ev);
fprintf(stream, "idx: %2d: @ %.0f\n",
(int)(rs - (rs->in_LSQ ? LSQ : RUU)), (double)ev->x.when);
ruu_dumpent(rs, rs - (rs->in_LSQ ? LSQ : RUU),
stream, /* !header */FALSE);
}
}
}
/* insert an event for RS into the event queue, event queue is sorted from
earliest to latest event, event and associated side-effects will be
apparent at the start of cycle WHEN */
static void
eventq_queue_event(struct RUU_station *rs, tick_t when)
{
struct RS_link *prev, *ev, *new_ev;
if (rs->completed)
panic("event completed");
if (when <= sim_cycle)
panic("event occurred in the past");
/* get a free event record */
RSLINK_NEW(new_ev, rs);
new_ev->x.when = when;
/* locate insertion point */
for (prev=NULL, ev=event_queue;
ev && ev->x.when < when;
prev=ev, ev=ev->next);
if (prev)
{
/* insert middle or end */
new_ev->next = prev->next;
prev->next = new_ev;
}
else
{
/* insert at beginning */
new_ev->next = event_queue;
event_queue = new_ev;
}
}
/* return the next event that has already occurred, returns NULL when no
remaining events or all remaining events are in the future */
static struct RUU_station *
eventq_next_event(void)
{
struct RS_link *ev;
if (event_queue && event_queue->x.when <= sim_cycle)
{
/* unlink and return first event on priority list */
ev = event_queue;
event_queue = event_queue->next;
/* event still valid? */
if (RSLINK_VALID(ev))
{
struct RUU_station *rs = RSLINK_RS(ev);
/* reclaim event record */
RSLINK_FREE(ev);
/* event is valid, return resv station */
return rs;
}
else
{
/* reclaim event record */
RSLINK_FREE(ev);
/* receiving inst was squashed, return next event */
return eventq_next_event();
}
}
else
{
/* no event or no event is ready */
return NULL;
}
}
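/* usage sketch (see ruu_issue() and ruu_writeback() below): the issue stage
   schedules an instruction's completion with

       eventq_queue_event(rs, sim_cycle + latency);

   and the writeback stage drains all events due this cycle with

       while ((rs = eventq_next_event()) != NULL)
         { ... }

   squashed instructions are filtered out lazily by the RSLINK_VALID() tag
   check, so ruu_recover() never has to edit the event queue itself */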
/*
 * the ready instruction queue implementation follows, the ready instruction
 * queue indicates which instructions have all of their *register* dependencies
 * satisfied, an instruction will issue when 1) all memory dependencies for
 * the instruction have been satisfied (see lsq_refresh() for details on how
 * this is accomplished) and 2) resources are available; the ready queue is
 * fully constructed each cycle before any operation is issued from it -- this
 * ensures that instruction issue priorities are properly observed; NOTE:
 * RS_LINK nodes are used for the ready queue list so that it need not be
 * updated during squash events
 */
/* the ready instruction queue */
static struct RS_link *ready_queue;
/* initialize the ready queue structures */
static void
readyq_init(void)
{
ready_queue = NULL;
}
/* dump the contents of the ready queue */
static void
readyq_dump(FILE *stream) /* output stream */
{
struct RS_link *link;
if (!stream)
stream = stderr;
fprintf(stream, "** ready queue state **\n");
for (link = ready_queue; link != NULL; link = link->next)
{
/* is entry still valid? */
if (RSLINK_VALID(link))
{
struct RUU_station *rs = RSLINK_RS(link);
ruu_dumpent(rs, rs - (rs->in_LSQ ? LSQ : RUU),
stream, /* header */TRUE);
}
}
}
/* insert ready node into the ready list using the ready instruction scheduling
   policy; currently the following scheduling policy is enforced:
     memory and long latency operations, and branch instructions first
   then
     all other instructions, oldest instructions first
   this policy works well because branches pass through the machine more
   quickly, which works to reduce branch misprediction latencies, and very
   long latency instructions (such as loads and multiplies) get priority
   since they are very likely on the program's critical path */
static void
readyq_enqueue(struct RUU_station *rs) /* RS to enqueue */
{
struct RS_link *prev, *node, *new_node;
/* node is now queued */
if (rs->queued)
panic("node is already queued");
rs->queued = TRUE;
/* get a free ready list node */
RSLINK_NEW(new_node, rs);
new_node->x.seq = rs->seq;
/* locate insertion point */
if (rs->in_LSQ || MD_OP_FLAGS(rs->op) & (F_LONGLAT|F_CTRL))
{
/* insert loads/stores, branches, and long latency ops at the head of the queue */
prev = NULL;
node = ready_queue;
}
else
{
/* otherwise insert in program order (earliest seq first) */
for (prev=NULL, node=ready_queue;
node && node->x.seq < rs->seq;
prev=node, node=node->next);
}
if (prev)
{
/* insert middle or end */
new_node->next = prev->next;
prev->next = new_node;
}
else
{
/* insert at beginning */
new_node->next = ready_queue;
ready_queue = new_node;
}
}
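/* example (illustrative): with only "ordinary" ops queued, say adds with
   seq 5 and seq 9, enqueueing an add with seq 7 places it between them
   (oldest first); enqueueing a load, store, branch, or other long-latency
   op always places it at the head of the queue, ahead of everything already
   queued */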
/*
* the create vector maps a logical register to a creator in the RUU (and
* specific output operand) or the architected register file (if RS_link
* is NULL)
*/
/* an entry in the create vector */
struct CV_link {
struct RUU_station *rs; /* creator's reservation station */
int odep_num; /* specific output operand */
};
/* a NULL create vector entry */
static struct CV_link CVLINK_NULL = { NULL, 0 };
/* get a new create vector link */
#define CVLINK_INIT(CV, RS,ONUM) ((CV).rs = (RS), (CV).odep_num = (ONUM))
/* size of the create vector (one entry per architected register) */
#define CV_BMAP_SZ (BITMAP_SIZE(MD_TOTAL_REGS))
/* the create vector, NOTE: speculative copy-on-write storage is provided
   for fast recovery during wrong-path execution (see tracer_recover() for
   details on this process) */
static BITMAP_TYPE(MD_TOTAL_REGS, use_spec_cv);
static struct CV_link create_vector[MD_TOTAL_REGS];
static struct CV_link spec_create_vector[MD_TOTAL_REGS];
/* these arrays shadow the create vector and indicate when a register was
   last created */
static tick_t create_vector_rt[MD_TOTAL_REGS];
static tick_t spec_create_vector_rt[MD_TOTAL_REGS];
/* read a create vector entry */
#define CREATE_VECTOR(N) (BITMAP_SET_P(use_spec_cv, CV_BMAP_SZ, (N))\
? spec_create_vector[N] \
: create_vector[N])
/* read a create vector timestamp entry */
#define CREATE_VECTOR_RT(N) (BITMAP_SET_P(use_spec_cv, CV_BMAP_SZ, (N))\
? spec_create_vector_rt[N] \
: create_vector_rt[N])
/* set a create vector entry */
#define SET_CREATE_VECTOR(N, L) (spec_mode \
? (BITMAP_SET(use_spec_cv, CV_BMAP_SZ, (N)),\
spec_create_vector[N] = (L)) \
: (create_vector[N] = (L)))
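/* usage sketch (illustrative; the actual consumers are the dispatch stage,
   ruu_dispatch() later in this file, and ruu_writeback() below): when a
   dispatched instruction reads register N, CREATE_VECTOR(N) names the
   in-flight producer, or is CVLINK_NULL if the value already resides in the
   architected register file; when a dispatched instruction writes register N,
   it records itself with something like

       struct CV_link cv;
       CVLINK_INIT(cv, rs, 0);            0 = output operand index
       SET_CREATE_VECTOR(N, cv);

   on the wrong path SET_CREATE_VECTOR() writes the speculative copy and sets
   the register's bit in use_spec_cv, so recovery can discard all wrong-path
   mappings simply by clearing that bitmap (see ruu_recover() below) */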
/* initialize the create vector */
static void
cv_init(void)
{
int i;
/* initially all registers are valid in the architected register file,
i.e., the create vector entry is CVLINK_NULL */
for (i=0; i < MD_TOTAL_REGS; i++)
{
create_vector[i] = CVLINK_NULL;
create_vector_rt[i] = 0;
spec_create_vector[i] = CVLINK_NULL;
spec_create_vector_rt[i] = 0;
}
/* all create vector entries are non-speculative */
BITMAP_CLEAR_MAP(use_spec_cv, CV_BMAP_SZ);
}
/* dump the contents of the create vector */
static void
cv_dump(FILE *stream) /* output stream */
{
int i;
struct CV_link ent;
if (!stream)
stream = stderr;
fprintf(stream, "** create vector state **\n");
for (i=0; i < MD_TOTAL_REGS; i++)
{
ent = CREATE_VECTOR(i);
if (!ent.rs)
fprintf(stream, "[cv%02d]: from architected reg file\n", i);
else
fprintf(stream, "[cv%02d]: from %s, idx: %d\n",
i, (ent.rs->in_LSQ ? "LSQ" : "RUU"),
(int)(ent.rs - (ent.rs->in_LSQ ? LSQ : RUU)));
}
}
/*
* RUU_COMMIT() - instruction retirement pipeline stage
*/
/* this function commits the results of the oldest completed entries from the
RUU and LSQ to the architected reg file, stores in the LSQ will commit
their store data to the data cache at this point as well */
static void
ruu_commit(void)
{
int i, lat, events, committed = 0;
static counter_t sim_ret_insn = 0;
/* all values must be retired to the architected reg file in program order */
while (RUU_num > 0 && committed < ruu_commit_width)
{
struct RUU_station *rs = &(RUU[RUU_head]);
if (!rs->completed)
{
/* the RUU head entry must be complete before it can commit */
break;
}
/* default commit events */
events = 0;
/* load/stores must retire load/store queue entry as well */
if (RUU[RUU_head].ea_comp)
{
/* load/store, retire head of LSQ as well */
if (LSQ_num <= 0 || !LSQ[LSQ_head].in_LSQ)
panic("RUU out of sync with LSQ");
/* load/store operation must be complete */
if (!LSQ[LSQ_head].completed)
{
/* load/store operation is not yet complete */
break;
}
if ((MD_OP_FLAGS(LSQ[LSQ_head].op) & (F_MEM|F_STORE))
== (F_MEM|F_STORE))
{
struct res_template *fu;
/* stores must retire their store value to the cache at commit,
try to get a store port (functional unit allocation) */
fu = res_get(fu_pool, MD_OP_FUCLASS(LSQ[LSQ_head].op));
if (fu)
{
/* reserve the functional unit */
if (fu->master->busy)
panic("functional unit already in use");
/* schedule functional unit release event */
fu->master->busy = fu->issuelat;
/* go to the data cache */
if (cache_dl1)
{
/* commit store value to D-cache */
lat =
cache_access(cache_dl1, Write, (LSQ[LSQ_head].addr&~3),
NULL, 4, sim_cycle, NULL, NULL);
if (lat > cache_dl1_lat)
events |= PEV_CACHEMISS;
}
/* all loads and stores must access the D-TLB */
if (dtlb)
{
/* access the D-TLB */
lat =
cache_access(dtlb, Read, (LSQ[LSQ_head].addr & ~3),
NULL, 4, sim_cycle, NULL, NULL);
if (lat > 1)
events |= PEV_TLBMISS;
}
}
else
{
/* no store ports left, cannot continue to commit insts */
break;
}
}
/* invalidate load/store operation instance */
LSQ[LSQ_head].tag++;
sim_slip += (sim_cycle - LSQ[LSQ_head].slip);
/* indicate to pipeline trace that this instruction retired */
ptrace_newstage(LSQ[LSQ_head].ptrace_seq, PST_COMMIT, events);
ptrace_endinst(LSQ[LSQ_head].ptrace_seq);
/* commit head of LSQ as well */
LSQ_head = (LSQ_head + 1) % LSQ_size;
LSQ_num--;
}
if (pred
&& bpred_spec_update == spec_CT
&& (MD_OP_FLAGS(rs->op) & F_CTRL))
{
bpred_update(pred,
/* branch address */rs->PC,
/* actual target address */rs->next_PC,
/* taken? */rs->next_PC != (rs->PC +
sizeof(md_inst_t)),
/* pred taken? */rs->pred_PC != (rs->PC +
sizeof(md_inst_t)),
/* correct pred? */rs->pred_PC == rs->next_PC,
/* opcode */rs->op,
/* dir predictor update pointer */&rs->dir_update);
}
/* invalidate RUU operation instance */
RUU[RUU_head].tag++;
sim_slip += (sim_cycle - RUU[RUU_head].slip);
/* print retirement trace if in verbose mode */
if (verbose)
{
sim_ret_insn++;
myfprintf(stderr, "%10n @ 0x%08p: ", sim_ret_insn, RUU[RUU_head].PC);
md_print_insn(RUU[RUU_head].IR, RUU[RUU_head].PC, stderr);
if (MD_OP_FLAGS(RUU[RUU_head].op) & F_MEM)
myfprintf(stderr, " mem: 0x%08p", RUU[RUU_head].addr);
fprintf(stderr, "\n");
/* fflush(stderr); */
}
/* indicate to pipeline trace that this instruction retired */
ptrace_newstage(RUU[RUU_head].ptrace_seq, PST_COMMIT, events);
ptrace_endinst(RUU[RUU_head].ptrace_seq);
/* commit head entry of RUU */
RUU_head = (RUU_head + 1) % RUU_size;
RUU_num--;
/* one more instruction committed to architected state */
committed++;
for (i=0; i<MAX_ODEPS; i++)
{
if (rs->odep_list[i])
panic ("retired instruction has odeps\n");
}
}
}
/*
* RUU_RECOVER() - squash mispredicted microarchitecture state
*/
/* recover processor microarchitecture state back to point of the
mis-predicted branch at RUU[BRANCH_INDEX] */
static void
ruu_recover(int branch_index) /* index of mis-pred branch */
{
int i, RUU_index = RUU_tail, LSQ_index = LSQ_tail;
int RUU_prev_tail = RUU_tail, LSQ_prev_tail = LSQ_tail;
/* recover from the tail of the RUU towards the head until the branch index
is reached, this direction ensures that the LSQ can be synchronized with
the RUU */
/* go to first element to squash */
RUU_index = (RUU_index + (RUU_size-1)) % RUU_size;
LSQ_index = (LSQ_index + (LSQ_size-1)) % LSQ_size;
/* traverse to older insts until the mispredicted branch is encountered */
while (RUU_index != branch_index)
{
/* the RUU should not drain since the mispredicted branch will remain */
if (!RUU_num)
panic("empty RUU");
/* we should reach the mispredicted branch before wrapping past the head */
if (RUU_index == RUU_head)
panic("RUU head and tail broken");
/* is this operation an effective addr calc for a load or store? */
if (RUU[RUU_index].ea_comp)
{
/* should be at least one load or store in the LSQ */
if (!LSQ_num)
panic("RUU and LSQ out of sync");
/* recover any resources consumed by the load or store operation */
for (i=0; i<MAX_ODEPS; i++)
{
RSLINK_FREE_LIST(LSQ[LSQ_index].odep_list[i]);
/* blow away the consuming op list */
LSQ[LSQ_index].odep_list[i] = NULL;
}
/* squash this LSQ entry */
LSQ[LSQ_index].tag++;
/* indicate in pipetrace that this instruction was squashed */
ptrace_endinst(LSQ[LSQ_index].ptrace_seq);
/* go to next earlier LSQ slot */
LSQ_prev_tail = LSQ_index;
LSQ_index = (LSQ_index + (LSQ_size-1)) % LSQ_size;
LSQ_num--;
}
/* recover any resources used by this RUU operation */
for (i=0; i<MAX_ODEPS; i++)
{
RSLINK_FREE_LIST(RUU[RUU_index].odep_list[i]);
/* blow away the consuming op list */
RUU[RUU_index].odep_list[i] = NULL;
}
/* squash this RUU entry */
RUU[RUU_index].tag++;
/* indicate in pipetrace that this instruction was squashed */
ptrace_endinst(RUU[RUU_index].ptrace_seq);
/* go to next earlier slot in the RUU */
RUU_prev_tail = RUU_index;
RUU_index = (RUU_index + (RUU_size-1)) % RUU_size;
RUU_num--;
}
/* reset head/tail pointers to point to the mis-predicted branch */
RUU_tail = RUU_prev_tail;
LSQ_tail = LSQ_prev_tail;
/* revert create vector back to last precise create vector state, NOTE:
this is accomplished by resetting all the copied-on-write bits in the
USE_SPEC_CV bit vector */
BITMAP_CLEAR_MAP(use_spec_cv, CV_BMAP_SZ);
/* FIXME: could reset functional units at squash time */
}
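/* example (illustrative): with RUU_size 8, RUU_head 2, RUU_tail 6 and the
   mispredicted branch at index 3, the walk above visits indices 5 and 4,
   squashing each (tag++ invalidates any event, ready-queue, or odep links
   that still name the entry), and leaves RUU_tail = 4, the slot just past
   the branch */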
/*
* RUU_WRITEBACK() - instruction result writeback pipeline stage
*/
/* forward declarations */
static void tracer_recover(void);
/* writeback completed operation results from the functional units to the RUU,
   at this point, the output dependency chains of completing instructions
   are also walked to determine if any dependent instruction now has all
   of its register operands ready; if so, the (nearly) ready instruction is
   inserted into the ready instruction queue */
static void
ruu_writeback(void)
{
int i;
struct RUU_station *rs;
/* service all completed events */
while ((rs = eventq_next_event()))
{
/* RS has completed execution and (possibly) produced a result */
if (!OPERANDS_READY(rs) || rs->queued || !rs->issued || rs->completed)
panic("inst completed and !ready, !issued, or completed");
/* operation has completed */
rs->completed = TRUE;
/* does this operation reveal a mis-predicted branch? */
if (rs->recover_inst)
{
if (rs->in_LSQ)
panic("mis-predicted load or store?!?!?");
/* recover processor state and reinit fetch to correct path */
ruu_recover(rs - RUU);
tracer_recover();
bpred_recover(pred, rs->PC, rs->stack_recover_idx);
/* stall fetch until I-fetch and I-decode recover */
ruu_fetch_issue_delay = ruu_branch_penalty;
/* continue writeback of the branch/control instruction */
}
/* if we speculatively update branch-predictor, do it here */
if (pred
&& bpred_spec_update == spec_WB
&& !rs->in_LSQ
&& (MD_OP_FLAGS(rs->op) & F_CTRL))
{
bpred_update(pred,
/* branch address */rs->PC,
/* actual target address */rs->next_PC,
/* taken? */rs->next_PC != (rs->PC +
sizeof(md_inst_t)),
/* pred taken? */rs->pred_PC != (rs->PC +
sizeof(md_inst_t)),
/* correct pred? */rs->pred_PC == rs->next_PC,
/* opcode */rs->op,
/* dir predictor update pointer */&rs->dir_update);
}
/* entered writeback stage, indicate in pipe trace */
ptrace_newstage(rs->ptrace_seq, PST_WRITEBACK,
rs->recover_inst ? PEV_MPDETECT : 0);
/* broadcast results to consuming operations, this is more efficiently
accomplished by walking the output dependency chains of the
completed instruction */
for (i=0; i<MAX_ODEPS; i++)
{
if (rs->onames[i] != NA)
{
struct CV_link link;
struct RS_link *olink, *olink_next;
if (rs->spec_mode)
{
/* update the speculative create vector, future operations
get value from later creator or architected reg file */
link = spec_create_vector[rs->onames[i]];
if (/* !NULL */link.rs
&& /* refs RS */(link.rs == rs && link.odep_num == i))
{
/* the result can now be read from a physical register,
indicate this as so */
spec_create_vector[rs->onames[i]] = CVLINK_NULL;
spec_create_vector_rt[rs->onames[i]] = sim_cycle;
}
/* else, creator invalidated or there is another creator */
}
else
{
/* update the non-speculative create vector, future
operations get value from later creator or architected
reg file */
link = create_vector[rs->onames[i]];
if (/* !NULL */link.rs
&& /* refs RS */(link.rs == rs && link.odep_num == i))
{
/* the result can now be read from a physical register,
indicate this as so */
create_vector[rs->onames[i]] = CVLINK_NULL;
create_vector_rt[rs->onames[i]] = sim_cycle;
}
/* else, creator invalidated or there is another creator */
}
/* walk output list, queue up ready operations */
for (olink=rs->odep_list[i]; olink; olink=olink_next)
{
if (RSLINK_VALID(olink))
{
if (olink->rs->idep_ready[olink->x.opnum])
panic("output dependence already satisfied");
/* input is now ready */
olink->rs->idep_ready[olink->x.opnum] = TRUE;
/* are all the register operands of target ready? */
if (OPERANDS_READY(olink->rs))
{
/* yes! enqueue instruction as ready, NOTE: stores
complete at dispatch, so no need to enqueue
them */
if (!olink->rs->in_LSQ
|| ((MD_OP_FLAGS(olink->rs->op)&(F_MEM|F_STORE))
== (F_MEM|F_STORE)))
readyq_enqueue(olink->rs);
/* else, ld op, issued when no mem conflict */
}
}
/* grab link to next element prior to free */
olink_next = olink->next;
/* free dependence link element */
RSLINK_FREE(olink);
}
/* blow away the consuming op list */
rs->odep_list[i] = NULL;
} /* if not NA output */
} /* for all outputs */
} /* for all writeback events */
}
/*
* LSQ_REFRESH() - memory access dependence checker/scheduler
*/
/* this function locates ready instructions whose memory dependencies have
   been satisfied, this is accomplished by walking the LSQ for loads, looking
   for blocking memory dependency conditions (e.g., an earlier store with an
   unknown address) */
#define MAX_STD_UNKNOWNS 64
static void
lsq_refresh(void)
{
int i, j, index, n_std_unknowns;
md_addr_t std_unknowns[MAX_STD_UNKNOWNS];
/* scan entire queue for ready loads: scan from oldest instruction
(head) until we reach the tail or an unresolved store, after which no
other instruction will become ready */
for (i=0, index=LSQ_head, n_std_unknowns=0;
i < LSQ_num;
i++, index=(index + 1) % LSQ_size)
{
/* terminate search for ready loads after first unresolved store,
as no later load could be resolved in its presence */
if (/* store? */
(MD_OP_FLAGS(LSQ[index].op) & (F_MEM|F_STORE)) == (F_MEM|F_STORE))
{
if (!STORE_ADDR_READY(&LSQ[index]))
{
/* FIXME: a later STD + STD known could hide the STA unknown */
/* sta unknown, blocks all later loads, stop search */
break;
}
else if (!OPERANDS_READY(&LSQ[index]))
{
/* sta known, but std unknown, may block a later store, record
this address for later referral, we use an array here because
for most simulations the number of entries to search will be
very small */
if (n_std_unknowns == MAX_STD_UNKNOWNS)
fatal("STD unknown array overflow, increase MAX_STD_UNKNOWNS");
std_unknowns[n_std_unknowns++] = LSQ[index].addr;
}
else /* STORE_ADDR_READY() && OPERANDS_READY() */
{
/* a later STD known hides an earlier STD unknown */
for (j=0; j<n_std_unknowns; j++)
{
if (std_unknowns[j] == /* STA/STD known */LSQ[index].addr)
std_unknowns[j] = /* bogus addr */0;
}
}
}
if (/* load? */
((MD_OP_FLAGS(LSQ[index].op) & (F_MEM|F_LOAD)) == (F_MEM|F_LOAD))
&& /* queued? */!LSQ[index].queued
&& /* waiting? */!LSQ[index].issued
&& /* completed? */!LSQ[index].completed
&& /* regs ready? */OPERANDS_READY(&LSQ[index]))
{
/* no STA unknown conflict (because we got to this check), check for
a STD unknown conflict */
for (j=0; j<n_std_unknowns; j++)
{
/* found a relevant STD unknown? */
if (std_unknowns[j] == LSQ[index].addr)
break;
}
if (j == n_std_unknowns)
{
/* no STA or STD unknown conflicts, put load on ready queue */
readyq_enqueue(&LSQ[index]);
}
}
}
}
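/* example (illustrative): given the LSQ order  st (addr A known, data
   unknown), ld (addr B), ld (addr A)  -- the load from B can be enqueued as
   ready (its address matches no outstanding STD unknown), while the load
   from A must wait for the store data; if instead the store's *address* were
   still unknown, the scan would stop at the store and neither load would be
   considered this cycle */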
/*
* RUU_ISSUE() - issue instructions to functional units
*/
/* attempt to issue all operations in the ready queue; insts in the ready
   instruction queue have all register dependencies satisfied, this function
   must then 1) ensure the instruction's memory dependencies have been
   satisfied (see lsq_refresh() for details on this process) and 2) ensure a
   functional unit is available in this cycle to commence execution of the
   operation; if all goes well, the functional unit is allocated, a writeback
   event is scheduled, and the instruction begins execution */
static void
ruu_issue(void)
{
int i, load_lat, tlb_lat, n_issued;
struct RS_link *node, *next_node;
struct res_template *fu;
/* FIXME: could be a little more efficient when scanning the ready queue */
/* copy and then blow away the ready list, NOTE: the ready list is
   always totally reclaimed each cycle, and instructions that do not
   issue are explicitly reinserted into the ready instruction queue,
   this management strategy ensures that the ready instruction queue
   is always properly sorted */
node = ready_queue;
ready_queue = NULL;
/* visit all ready instructions (i.e., insts whose register input
   dependencies have been satisfied), stop issuing when no more instructions
   are available or issue bandwidth is exhausted */
for (n_issued=0;
node && n_issued < ruu_issue_width;
node = next_node)
{
next_node = node->next;
/* still valid? */
if (RSLINK_VALID(node))
{
struct RUU_station *rs = RSLINK_RS(node);
/* issue operation, both reg and mem deps have been satisfied */
if (!OPERANDS_READY(rs) || !rs->queued
|| rs->issued || rs->completed)
panic("issued inst !ready, issued, or completed");
/* node is now un-queued */
rs->queued = FALSE;
if (rs->in_LSQ
&& ((MD_OP_FLAGS(rs->op) & (F_MEM|F_STORE)) == (F_MEM|F_STORE)))
{
/* stores complete in effectively zero time, result is
written into the load/store queue, the actual store into
the memory system occurs when the instruction is retired
(see ruu_commit()) */
rs->issued = TRUE;
rs->completed = TRUE;
if (rs->onames[0] || rs->onames[1])
panic("store creates result");
if (rs->recover_inst)
panic("mis-predicted store");
/* entered execute stage, indicate in pipe trace */
ptrace_newstage(rs->ptrace_seq, PST_WRITEBACK, 0);
/* one more inst issued */
n_issued++;
}
else
{
/* issue the instruction to a functional unit */
if (MD_OP_FUCLASS(rs->op) != NA)
{
fu = res_get(fu_pool, MD_OP_FUCLASS(rs->op));
if (fu)
{
/* got one! issue inst to functional unit */
rs->issued = TRUE;
/* reserve the functional unit */
if (fu->master->busy)
panic("functional unit already in use");
/* schedule functional unit release event */
fu->master->busy = fu->issuelat;
/* schedule a result writeback event */
if (rs->in_LSQ
&& ((MD_OP_FLAGS(rs->op) & (F_MEM|F_LOAD))
== (F_MEM|F_LOAD)))
{
int events = 0;
/* for loads, determine cache access latency:
first scan LSQ to see if a store forward is
possible, if not, access the data cache */
load_lat = 0;
i = (rs - LSQ);
if (i != LSQ_head)
{
for (;;)
{
/* go to next earlier LSQ entry */
i = (i + (LSQ_size-1)) % LSQ_size;
/* FIXME: not dealing with partials! */
if ((MD_OP_FLAGS(LSQ[i].op) & F_STORE)
&& (LSQ[i].addr == rs->addr))
{
/* hit in the LSQ */
load_lat = 1;
break;
}
/* scan finished? */
if (i == LSQ_head)
break;
}
}
/* was the value store-forwarded from the LSQ? */
if (!load_lat)
{
int valid_addr = MD_VALID_ADDR(rs->addr);
if (!spec_mode && !valid_addr)
sim_invalid_addrs++;
/* no! go to the data cache if addr is valid */
if (cache_dl1 && valid_addr)
{
/* access the cache if non-faulting */
load_lat =
cache_access(cache_dl1, Read,
(rs->addr & ~3), NULL, 4,
sim_cycle, NULL, NULL);
if (load_lat > cache_dl1_lat)
events |= PEV_CACHEMISS;
}
else
{
/* no caches defined, just use op latency */
load_lat = fu->oplat;
}
}
/* all loads and stores must access the D-TLB */
if (dtlb && MD_VALID_ADDR(rs->addr))
{
/* access the D-TLB, NOTE: this code will
   initiate speculative TLB misses */
tlb_lat =
cache_access(dtlb, Read, (rs->addr & ~3),
NULL, 4, sim_cycle, NULL, NULL);
if (tlb_lat > 1)
events |= PEV_TLBMISS;
/* D-cache/D-TLB accesses occur in parallel */
load_lat = MAX(tlb_lat, load_lat);
}
/* use computed cache access latency */
eventq_queue_event(rs, sim_cycle + load_lat);
/* entered execute stage, indicate in pipe trace */
ptrace_newstage(rs->ptrace_seq, PST_EXECUTE,
((rs->ea_comp ? PEV_AGEN : 0)
| events));
}
else /* !load && !store */
{
/* use deterministic functional unit latency */
eventq_queue_event(rs, sim_cycle + fu->oplat);
/* entered execute stage, indicate in pipe trace */
ptrace_newstage(rs->ptrace_seq, PST_EXECUTE,
rs->ea_comp ? PEV_AGEN : 0);
}
/* one more inst issued */
n_issued++;
}
else /* no functional unit */
{
/* insufficient functional unit resources, put operation
back onto the ready list, we'll try to issue it
again next cycle */
readyq_enqueue(rs);
}
}
else /* does not require a functional unit! */
{
/* FIXME: need better solution for these */
/* the instruction does not need a functional unit */
rs->issued = TRUE;
/* schedule a result event */
eventq_queue_event(rs, sim_cycle + 1);
/* entered execute stage, indicate in pipe trace */
ptrace_newstage(rs->ptrace_seq, PST_EXECUTE,
rs->ea_comp ? PEV_AGEN : 0);
/* one more inst issued */
n_issued++;
}
} /* !store */
}
/* else, RUU entry was squashed */
/* reclaim ready list entry, NOTE: this is done whether or not the
instruction issued, since the instruction was once again reinserted
into the ready queue if it did not issue, this ensures that the ready
queue is always properly sorted */
RSLINK_FREE(node);
}
/* put any instructions not issued back into the ready queue, go through
   normal channels to ensure instructions stay ordered correctly */
for (; node; node = next_node)
{
next_node = node->next;
/* still valid? */
if (RSLINK_VALID(node))
{
struct RUU_station *rs = RSLINK_RS(node);
/* node is now un-queued */
rs->queued = FALSE;
/* not issued, put operation back onto the ready list, we'll try to
issue it again next cycle */
readyq_enqueue(rs);
}
/* else, RUU entry was squashed */
/* reclaim ready list entry, NOTE: the instruction (if still valid) was
   reinserted into the ready queue above, so the link itself can always be
   returned to the free pool here, keeping the ready queue properly
   sorted */
RSLINK_FREE(node);
}
}
/*
* routines for generating on-the-fly instruction traces with support
* for control and data misspeculation modeling
*/
/* integer register file */
#define R_BMAP_SZ (BITMAP_SIZE(MD_NUM_IREGS))
static BITMAP_TYPE(MD_NUM_IREGS, use_spec_R);
static md_gpr_t spec_regs_R;
/* floating point register file */
#define F_BMAP_SZ (BITMAP_SIZE(MD_NUM_FREGS))
static BITMAP_TYPE(MD_NUM_FREGS, use_spec_F);
static md_fpr_t spec_regs_F;
/* miscellaneous registers */
#define C_BMAP_SZ (BITMAP_SIZE(MD_NUM_CREGS))
static BITMAP_TYPE(MD_NUM_CREGS, use_spec_C);
static md_ctrl_t spec_regs_C;
/* dump speculative register state */
static void
rspec_dump(FILE *stream) /* output stream */
{
int i;
if (!stream)
stream = stderr;