/* sim-outorder.c - sample out-of-order issue perf simulator implementation */

/* SimpleScalar(TM) Tool Suite
 * Copyright (C) 1994-2003 by Todd M. Austin, Ph.D. and SimpleScalar, LLC.
 * All Rights Reserved.
 *
 * THIS IS A LEGAL DOCUMENT, BY USING SIMPLESCALAR,
 * YOU ARE AGREEING TO THESE TERMS AND CONDITIONS.
 *
 * No portion of this work may be used by any commercial entity, or for any
 * commercial purpose, without the prior, written permission of SimpleScalar,
 * LLC (info@simplescalar.com). Nonprofit and noncommercial use is permitted
 * as described below.
 *
 * 1. SimpleScalar is provided AS IS, with no warranty of any kind, express
 * or implied. The user of the program accepts full responsibility for the
 * application of the program and the use of any results.
 *
 * 2. Nonprofit and noncommercial use is encouraged. SimpleScalar may be
 * downloaded, compiled, executed, copied, and modified solely for nonprofit,
 * educational, noncommercial research, and noncommercial scholarship
 * purposes provided that this notice in its entirety accompanies all copies.
 * Copies of the modified software can be delivered to persons who use it
 * solely for nonprofit, educational, noncommercial research, and
 * noncommercial scholarship purposes provided that this notice in its
 * entirety accompanies all copies.
 *
 * 3. ALL COMMERCIAL USE, AND ALL USE BY FOR PROFIT ENTITIES, IS EXPRESSLY
 * PROHIBITED WITHOUT A LICENSE FROM SIMPLESCALAR, LLC (info@simplescalar.com).
 *
 * 4. No nonprofit user may place any restrictions on the use of this software,
 * including as modified by the user, by any other authorized user.
 *
 * 5. Noncommercial and nonprofit users may distribute copies of SimpleScalar
 * in compiled or executable form as set forth in Section 2, provided that
 * either: (A) it is accompanied by the corresponding machine-readable source
 * code, or (B) it is accompanied by a written offer, with no time limit, to
 * give anyone a machine-readable copy of the corresponding source code in
 * return for reimbursement of the cost of distribution. This written offer
 * must permit verbatim duplication by anyone, or (C) it is distributed by
 * someone who received only the executable form, and is accompanied by a
 * copy of the written offer of source code.
 *
 * 6. SimpleScalar was developed by Todd M. Austin, Ph.D. The tool suite is
 * currently maintained by SimpleScalar LLC (info@simplescalar.com). US Mail:
 * 2395 Timbercrest Court, Ann Arbor, MI 48105.
 *
 * Copyright (C) 1994-2003 by Todd M. Austin, Ph.D. and SimpleScalar, LLC.
 */

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <assert.h>
#include <signal.h>

#include "host.h"
#include "misc.h"
#include "machine.h"
#include "regs.h"
#include "memory.h"
#include "cache.h"
#include "loader.h"
#include "syscall.h"
#include "bpred.h"
#include "resource.h"
#include "bitmap.h"
#include "options.h"
#include "eval.h"
#include "stats.h"
#include "ptrace.h"
#include "dlite.h"
#include "sim.h"

/*
 * This file implements a very detailed out-of-order issue superscalar
 * processor with a two-level memory system and speculative execution support.
 * This simulator is a performance simulator, tracking the latency of all
 * pipeline operations.
 */

/* simulated registers */
static struct regs_t regs;

/* simulated memory */
static struct mem_t *mem = NULL;

/*
 * simulator options
 */

/* maximum number of inst's to execute */
static unsigned int max_insts;

/* number of insts skipped before timing starts */
static int fastfwd_count;

/* pipeline trace range and output filename */
static int ptrace_nelt = 0;
static char *ptrace_opts[2];

/* instruction fetch queue size (in insts) */
static int ruu_ifq_size;

/* extra branch mis-prediction latency */
static int ruu_branch_penalty;

/* speed of front-end of machine relative to execution core */
static int fetch_speed;

/* branch predictor type {nottaken|taken|perfect|bimod|threebit|2lev|comb} */
static char *pred_type;

/* bimodal predictor config (<table_size>) */
static int bimod_nelt = 1;
static int bimod_config[1] =
  { /* bimod tbl size */2048 };

/* threebit predictor config (<table_size>) */
static int threebit_nelt = 1;
static int threebit_config[1] =
  { /* threebit tbl size */2048 };

/* 2-level predictor config (<l1size> <l2size> <hist_size> <xor>) */
static int twolev_nelt = 4;
static int twolev_config[4] =
  { /* l1size */1, /* l2size */1024, /* hist */8, /* xor */FALSE};

/* combining predictor config (<meta_table_size>) */
static int comb_nelt = 1;
static int comb_config[1] =
  { /* meta_table_size */1024 };

/* return address stack (RAS) size */
static int ras_size = 8;

/* BTB predictor config (<num_sets> <associativity>) */
static int btb_nelt = 2;
static int btb_config[2] =
  { /* nsets */512, /* assoc */4 };

/* instruction decode B/W (insts/cycle) */
static int ruu_decode_width;

/* instruction issue B/W (insts/cycle) */
static int ruu_issue_width;

/* run pipeline with in-order issue */
static int ruu_inorder_issue;

/* issue instructions down wrong execution paths */
static int ruu_include_spec = TRUE;

/* instruction commit B/W (insts/cycle) */
static int ruu_commit_width;

/* register update unit (RUU) size */
static int RUU_size = 8;

/* load/store queue (LSQ) size */
static int LSQ_size = 4;

/* l1 data cache config, i.e., {<config>|none} */
static char *cache_dl1_opt;

/* l1 data cache hit latency (in cycles) */
static int cache_dl1_lat;

/* l2 data cache config, i.e., {<config>|none} */
static char *cache_dl2_opt;

/* l2 data cache hit latency (in cycles) */
static int cache_dl2_lat;

/* l1 instruction cache config, i.e., {<config>|dl1|dl2|none} */
static char *cache_il1_opt;

/* l1 instruction cache hit latency (in cycles) */
static int cache_il1_lat;

/* l2 instruction cache config, i.e., {<config>|dl2|none} */
static char *cache_il2_opt;

/* l2 instruction cache hit latency (in cycles) */
static int cache_il2_lat;

/* flush caches on system calls */
static int flush_on_syscalls;

/* convert 64-bit inst addresses to 32-bit inst equivalents */
static int compress_icache_addrs;

/* memory access latency (<first_chunk> <inter_chunk>) */
static int mem_nelt = 2;
static int mem_lat[2] =
  { /* lat to first chunk */18, /* lat between remaining chunks */2 };

/* memory access bus width (in bytes) */
static int mem_bus_width;

/* instruction TLB config, i.e., {<config>|none} */
static char *itlb_opt;

/* data TLB config, i.e., {<config>|none} */
static char *dtlb_opt;

/* inst/data TLB miss latency (in cycles) */
static int tlb_miss_lat;

/* total number of integer ALU's available */
static int res_ialu;

/* total number of integer multiplier/dividers available */
static int res_imult;

/* total number of memory system ports available (to CPU) */
static int res_memport;

/* total number of floating point ALU's available */
static int res_fpalu;

/* total number of floating point multiplier/dividers available */
static int res_fpmult;

/* text-based stat profiles */
#define MAX_PCSTAT_VARS 8
static int pcstat_nelt = 0;
static char *pcstat_vars[MAX_PCSTAT_VARS];

/* convert 64-bit inst text addresses to 32-bit inst equivalents */
#ifdef TARGET_PISA
#define IACOMPRESS(A)                                                   \
  (compress_icache_addrs ? ((((A) - ld_text_base) >> 1) + ld_text_base) : (A))
#define ISCOMPRESS(SZ)                                                  \
  (compress_icache_addrs ? ((SZ) >> 1) : (SZ))
#else /* !TARGET_PISA */
#define IACOMPRESS(A)           (A)
#define ISCOMPRESS(SZ)          (SZ)
#endif /* TARGET_PISA */
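
/* Worked example (explanatory note, not in the original source): PISA
   instructions are 64 bits in memory, so with -cache:icompress the macros
   above model a 32-bit ISA by halving instruction offsets and fetch sizes.
   E.g., the instruction at ld_text_base + 0x100 is presented to the I-cache
   at ld_text_base + 0x80, and its 8-byte fetch is accounted as 4 bytes. */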

/* operate in backward-compatible bugs mode (for testing only) */
static int bugcompat_mode;

/*
 * functional unit resource configuration
 */

/* resource pool indices, NOTE: update these if you change FU_CONFIG */
#define FU_IALU_INDEX           0
#define FU_IMULT_INDEX          1
#define FU_MEMPORT_INDEX        2
#define FU_FPALU_INDEX          3
#define FU_FPMULT_INDEX         4

/* resource pool definition, NOTE: update FU_*_INDEX defs if you change this */
struct res_desc fu_config[] = {
  {
    "integer-ALU",
    4,
    0,
    {
      { IntALU, 1, 1 }
    }
  },
  {
    "integer-MULT/DIV",
    1,
    0,
    {
      { IntMULT, 3, 1 },
      { IntDIV, 20, 19 }
    }
  },
  {
    "memory-port",
    2,
    0,
    {
      { RdPort, 1, 1 },
      { WrPort, 1, 1 }
    }
  },
  {
    "FP-adder",
    4,
    0,
    {
      { FloatADD, 2, 1 },
      { FloatCMP, 2, 1 },
      { FloatCVT, 2, 1 }
    }
  },
  {
    "FP-MULT/DIV",
    1,
    0,
    {
      { FloatMULT, 4, 1 },
      { FloatDIV, 12, 12 },
      { FloatSQRT, 24, 24 }
    }
  },
};
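
/* Reading the table above (explanatory note, not in the original source):
   each unit class entry is { operation class, operation latency, issue
   latency }.  Operation latency is the number of cycles until the result
   is ready; issue latency is the number of cycles before the unit can
   accept another operation.  So { IntMULT, 3, 1 } is a fully pipelined
   3-cycle multiplier, while { IntDIV, 20, 19 } is an effectively
   unpipelined 20-cycle divider. */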

/*
 * simulator stats
 */

/* SLIP variable */
static counter_t sim_slip = 0;

/* total number of instructions executed */
static counter_t sim_total_insn = 0;

/* total number of memory references committed */
static counter_t sim_num_refs = 0;

/* total number of memory references executed */
static counter_t sim_total_refs = 0;

/* total number of loads committed */
static counter_t sim_num_loads = 0;

/* total number of loads executed */
static counter_t sim_total_loads = 0;

/* total number of branches committed */
static counter_t sim_num_branches = 0;

/* total number of branches executed */
static counter_t sim_total_branches = 0;

/* cycle counter */
static tick_t sim_cycle = 0;

/* occupancy counters */
static counter_t IFQ_count;             /* cumulative IFQ occupancy */
static counter_t IFQ_fcount;            /* cumulative IFQ full count */
static counter_t RUU_count;             /* cumulative RUU occupancy */
static counter_t RUU_fcount;            /* cumulative RUU full count */
static counter_t LSQ_count;             /* cumulative LSQ occupancy */
static counter_t LSQ_fcount;            /* cumulative LSQ full count */

/* total non-speculative bogus addresses seen (debug var) */
static counter_t sim_invalid_addrs;

/*
 * simulator state variables
 */

/* instruction sequence counter, used to assign unique id's to insts */
static unsigned int inst_seq = 0;

/* pipetrace instruction sequence counter */
static unsigned int ptrace_seq = 0;

/* speculation mode, non-zero when mis-speculating, i.e., executing
   instructions down the wrong path; state recovery will eventually have
   to occur to reset processor register and memory state back to the last
   precise state */
static int spec_mode = FALSE;

/* cycles until fetch issue resumes */
static unsigned ruu_fetch_issue_delay = 0;

/* perfect prediction enabled */
static int pred_perfect = FALSE;

/* speculative bpred-update enabled */
static char *bpred_spec_opt;
static enum { spec_ID, spec_WB, spec_CT } bpred_spec_update;

/* level 1 instruction cache, entry level instruction cache */
static struct cache_t *cache_il1;

/* level 2 instruction cache */
static struct cache_t *cache_il2;

/* level 1 data cache, entry level data cache */
static struct cache_t *cache_dl1;

/* level 2 data cache */
static struct cache_t *cache_dl2;

/* instruction TLB */
static struct cache_t *itlb;

/* data TLB */
static struct cache_t *dtlb;

/* branch predictor */
static struct bpred_t *pred;

/* functional unit resource pool */
static struct res_pool *fu_pool = NULL;

/* text-based stat profiles */
static struct stat_stat_t *pcstat_stats[MAX_PCSTAT_VARS];
static counter_t pcstat_lastvals[MAX_PCSTAT_VARS];
static struct stat_stat_t *pcstat_sdists[MAX_PCSTAT_VARS];

/* wedge all stat values into a counter_t */
#define STATVAL(STAT)                                                   \
  ((STAT)->sc == sc_int                                                 \
   ? (counter_t)*((STAT)->variant.for_int.var)                          \
   : ((STAT)->sc == sc_uint                                             \
      ? (counter_t)*((STAT)->variant.for_uint.var)                      \
      : ((STAT)->sc == sc_counter                                       \
         ? *((STAT)->variant.for_counter.var)                           \
         : (panic("bad stat class"), 0))))
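
/* Usage note (not in the original source): STATVAL() gives a uniform
   counter_t view of an int, uint, or counter stat; sim_reg_stats() below
   uses it to snapshot `-pcstat' variables, e.g.:

     pcstat_lastvals[i] = STATVAL(pcstat_stats[i]);
*/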

/* memory access latency, assumed to not cross a page boundary */
static unsigned int                     /* total latency of access */
mem_access_latency(int blk_sz)          /* block size accessed */
{
  int chunks = (blk_sz + (mem_bus_width - 1)) / mem_bus_width;

  assert(chunks > 0);

  return (/* first chunk latency */mem_lat[0] +
          (/* remainder chunk latency */mem_lat[1] * (chunks - 1)));
}
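
/* Worked example (not in the original source): with the defaults
   -mem:lat 18 2 and -mem:width 8, a 32-byte block transfers in
   ceil(32/8) = 4 bus chunks, so the access costs
   18 + 2 * (4 - 1) = 24 cycles. */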

/*
 * cache miss handlers
 */

/* l1 data cache l1 block miss handler function */
static unsigned int                     /* latency of block access */
dl1_access_fn(enum mem_cmd cmd,         /* access cmd, Read or Write */
              md_addr_t baddr,          /* block address to access */
              int bsize,                /* size of block to access */
              struct cache_blk_t *blk,  /* ptr to block in upper level */
              tick_t now)               /* time of access */
{
  unsigned int lat;

  if (cache_dl2)
    {
      /* access next level of data cache hierarchy */
      lat = cache_access(cache_dl2, cmd, baddr, NULL, bsize,
                         /* now */now, /* pudata */NULL, /* repl addr */NULL);
      if (cmd == Read)
        return lat;
      else
        {
          /* FIXME: unlimited write buffers */
          return 0;
        }
    }
  else
    {
      /* access main memory */
      if (cmd == Read)
        return mem_access_latency(bsize);
      else
        {
          /* FIXME: unlimited write buffers */
          return 0;
        }
    }
}

/* l2 data cache block miss handler function */
static unsigned int                     /* latency of block access */
dl2_access_fn(enum mem_cmd cmd,         /* access cmd, Read or Write */
              md_addr_t baddr,          /* block address to access */
              int bsize,                /* size of block to access */
              struct cache_blk_t *blk,  /* ptr to block in upper level */
              tick_t now)               /* time of access */
{
  /* this is a miss to the lowest level, so access main memory */
  if (cmd == Read)
    return mem_access_latency(bsize);
  else
    {
      /* FIXME: unlimited write buffers */
      return 0;
    }
}

/* l1 inst cache l1 block miss handler function */
static unsigned int                     /* latency of block access */
il1_access_fn(enum mem_cmd cmd,         /* access cmd, Read or Write */
              md_addr_t baddr,          /* block address to access */
              int bsize,                /* size of block to access */
              struct cache_blk_t *blk,  /* ptr to block in upper level */
              tick_t now)               /* time of access */
{
  unsigned int lat;

  if (cache_il2)
    {
      /* access next level of inst cache hierarchy */
      lat = cache_access(cache_il2, cmd, baddr, NULL, bsize,
                         /* now */now, /* pudata */NULL, /* repl addr */NULL);
      if (cmd == Read)
        return lat;
      else
        panic("writes to instruction memory not supported");
    }
  else
    {
      /* access main memory */
      if (cmd == Read)
        return mem_access_latency(bsize);
      else
        panic("writes to instruction memory not supported");
    }
}

/* l2 inst cache block miss handler function */
static unsigned int                     /* latency of block access */
il2_access_fn(enum mem_cmd cmd,         /* access cmd, Read or Write */
              md_addr_t baddr,          /* block address to access */
              int bsize,                /* size of block to access */
              struct cache_blk_t *blk,  /* ptr to block in upper level */
              tick_t now)               /* time of access */
{
  /* this is a miss to the lowest level, so access main memory */
  if (cmd == Read)
    return mem_access_latency(bsize);
  else
    panic("writes to instruction memory not supported");
}

/*
 * TLB miss handlers
 */

/* inst TLB miss handler function */
static unsigned int                     /* latency of block access */
itlb_access_fn(enum mem_cmd cmd,        /* access cmd, Read or Write */
               md_addr_t baddr,         /* block address to access */
               int bsize,               /* size of block to access */
               struct cache_blk_t *blk, /* ptr to block in upper level */
               tick_t now)              /* time of access */
{
  md_addr_t *phy_page_ptr = (md_addr_t *)blk->user_data;

  /* no real memory access, however, should have user data space attached */
  assert(phy_page_ptr);

  /* fake translation, for now... */
  *phy_page_ptr = 0;

  /* return tlb miss latency */
  return tlb_miss_lat;
}

/* data TLB miss handler function */
static unsigned int                     /* latency of block access */
dtlb_access_fn(enum mem_cmd cmd,        /* access cmd, Read or Write */
               md_addr_t baddr,         /* block address to access */
               int bsize,               /* size of block to access */
               struct cache_blk_t *blk, /* ptr to block in upper level */
               tick_t now)              /* time of access */
{
  md_addr_t *phy_page_ptr = (md_addr_t *)blk->user_data;

  /* no real memory access, however, should have user data space attached */
  assert(phy_page_ptr);

  /* fake translation, for now... */
  *phy_page_ptr = 0;

  /* return tlb miss latency */
  return tlb_miss_lat;
}

/* register simulator-specific options */
void
sim_reg_options(struct opt_odb_t *odb)
{
  opt_reg_header(odb,
"sim-outorder: This simulator implements a very detailed out-of-order issue\n"
"superscalar processor with a two-level memory system and speculative\n"
"execution support.  This simulator is a performance simulator, tracking the\n"
"latency of all pipeline operations.\n"
                 );

  /* instruction limit */

  opt_reg_uint(odb, "-max:inst", "maximum number of inst's to execute",
               &max_insts, /* default */0,
               /* print */TRUE, /* format */NULL);

  /* trace options */

  opt_reg_int(odb, "-fastfwd", "number of insts skipped before timing starts",
              &fastfwd_count, /* default */0,
              /* print */TRUE, /* format */NULL);

  opt_reg_string_list(odb, "-ptrace",
              "generate pipetrace, i.e., <fname|stdout|stderr> <range>",
              ptrace_opts, /* arr_sz */2, &ptrace_nelt, /* default */NULL,
              /* !print */FALSE, /* format */NULL, /* !accrue */FALSE);

  opt_reg_note(odb,
"  Pipetrace range arguments are formatted as follows:\n"
"\n"
"    {{@|#}<start>}:{{@|#|+}<end>}\n"
"\n"
"  Both ends of the range are optional; if neither is specified, the entire\n"
"  execution is traced.  Ranges that start with a `@' designate an address\n"
"  range to be traced, those that start with a `#' designate a cycle count\n"
"  range.  All other range values represent an instruction count range.  The\n"
"  second argument, if specified with a `+', indicates a value relative\n"
"  to the first argument, e.g., 1000:+100 == 1000:1100.  Program symbols may\n"
"  be used in all contexts.\n"
"\n"
"    Examples:   -ptrace FOO.trc #0:#1000\n"
"                -ptrace BAR.trc @2000:\n"
"                -ptrace BLAH.trc :1500\n"
"                -ptrace UXXE.trc :\n"
"                -ptrace FOOBAR.trc @main:+278\n"
               );

  /* ifetch options */

  opt_reg_int(odb, "-fetch:ifqsize", "instruction fetch queue size (in insts)",
              &ruu_ifq_size, /* default */4,
              /* print */TRUE, /* format */NULL);

  opt_reg_int(odb, "-fetch:mplat", "extra branch mis-prediction latency",
              &ruu_branch_penalty, /* default */3,
              /* print */TRUE, /* format */NULL);

  opt_reg_int(odb, "-fetch:speed",
              "speed of front-end of machine relative to execution core",
              &fetch_speed, /* default */1,
              /* print */TRUE, /* format */NULL);

  /* branch predictor options */

  opt_reg_note(odb,
"  Branch predictor configuration examples for 2-level predictor:\n"
"    Configurations:   N, M, W, X\n"
"      N   # entries in first level (# of shift register(s))\n"
"      W   width of shift register(s)\n"
"      M   # entries in 2nd level (# of counters, or other FSM)\n"
"      X   (yes-1/no-0) xor history and address for 2nd level index\n"
"    Sample predictors:\n"
"      GAg     : 1, W, 2^W, 0\n"
"      GAp     : 1, W, M (M > 2^W), 0\n"
"      PAg     : N, W, 2^W, 0\n"
"      PAp     : N, W, M (M == 2^(N+W)), 0\n"
"      gshare  : 1, W, 2^W, 1\n"
"    Predictor `comb' combines a bimodal and a 2-level predictor.\n"
               );

  opt_reg_string(odb, "-bpred",
          "branch predictor type {nottaken|taken|perfect|bimod|threebit|2lev|comb}",
                 &pred_type, /* default */"bimod",
                 /* print */TRUE, /* format */NULL);

  opt_reg_int_list(odb, "-bpred:bimod",
                   "bimodal predictor config (<table size>)",
                   bimod_config, bimod_nelt, &bimod_nelt,
                   /* default */bimod_config,
                   /* print */TRUE, /* format */NULL, /* !accrue */FALSE);

  opt_reg_int_list(odb, "-bpred:threebit",
                   "3-bit predictor config (<table size>)",
                   threebit_config, threebit_nelt, &threebit_nelt,
                   /* default */threebit_config,
                   /* print */TRUE, /* format */NULL, /* !accrue */FALSE);

  opt_reg_int_list(odb, "-bpred:2lev",
                   "2-level predictor config "
                   "(<l1size> <l2size> <hist_size> <xor>)",
                   twolev_config, twolev_nelt, &twolev_nelt,
                   /* default */twolev_config,
                   /* print */TRUE, /* format */NULL, /* !accrue */FALSE);

  opt_reg_int_list(odb, "-bpred:comb",
                   "combining predictor config (<meta_table_size>)",
                   comb_config, comb_nelt, &comb_nelt,
                   /* default */comb_config,
                   /* print */TRUE, /* format */NULL, /* !accrue */FALSE);

  opt_reg_int(odb, "-bpred:ras",
              "return address stack size (0 for no return stack)",
              &ras_size, /* default */ras_size,
              /* print */TRUE, /* format */NULL);

  opt_reg_int_list(odb, "-bpred:btb",
                   "BTB config (<num_sets> <associativity>)",
                   btb_config, btb_nelt, &btb_nelt,
                   /* default */btb_config,
                   /* print */TRUE, /* format */NULL, /* !accrue */FALSE);

  opt_reg_string(odb, "-bpred:spec_update",
                 "speculative predictors update in {ID|WB} (default non-spec)",
                 &bpred_spec_opt, /* default */NULL,
                 /* print */TRUE, /* format */NULL);

  /* decode options */

  opt_reg_int(odb, "-decode:width",
              "instruction decode B/W (insts/cycle)",
              &ruu_decode_width, /* default */4,
              /* print */TRUE, /* format */NULL);

  /* issue options */

  opt_reg_int(odb, "-issue:width",
              "instruction issue B/W (insts/cycle)",
              &ruu_issue_width, /* default */4,
              /* print */TRUE, /* format */NULL);

  opt_reg_flag(odb, "-issue:inorder", "run pipeline with in-order issue",
               &ruu_inorder_issue, /* default */FALSE,
               /* print */TRUE, /* format */NULL);

  opt_reg_flag(odb, "-issue:wrongpath",
               "issue instructions down wrong execution paths",
               &ruu_include_spec, /* default */TRUE,
               /* print */TRUE, /* format */NULL);

  /* commit options */

  opt_reg_int(odb, "-commit:width",
              "instruction commit B/W (insts/cycle)",
              &ruu_commit_width, /* default */4,
              /* print */TRUE, /* format */NULL);

  /* register scheduler options */

  opt_reg_int(odb, "-ruu:size",
              "register update unit (RUU) size",
              &RUU_size, /* default */16,
              /* print */TRUE, /* format */NULL);

  /* memory scheduler options */

  opt_reg_int(odb, "-lsq:size",
              "load/store queue (LSQ) size",
              &LSQ_size, /* default */8,
              /* print */TRUE, /* format */NULL);

  /* cache options */

  opt_reg_string(odb, "-cache:dl1",
                 "l1 data cache config, i.e., {<config>|none}",
                 &cache_dl1_opt, "dl1:128:32:4:l",
                 /* print */TRUE, NULL);

  opt_reg_note(odb,
"  The cache config parameter <config> has the following format:\n"
"\n"
"    <name>:<nsets>:<bsize>:<assoc>:<repl>\n"
"\n"
"    <name>   - name of the cache being defined\n"
"    <nsets>  - number of sets in the cache\n"
"    <bsize>  - block size of the cache\n"
"    <assoc>  - associativity of the cache\n"
"    <repl>   - block replacement strategy, 'l'-LRU, 'f'-FIFO, 'r'-random\n"
"\n"
"    Examples:   -cache:dl1 dl1:4096:32:1:l\n"
"                -dtlb dtlb:128:4096:32:r\n"
               );

  opt_reg_int(odb, "-cache:dl1lat",
              "l1 data cache hit latency (in cycles)",
              &cache_dl1_lat, /* default */1,
              /* print */TRUE, /* format */NULL);

  opt_reg_string(odb, "-cache:dl2",
                 "l2 data cache config, i.e., {<config>|none}",
                 &cache_dl2_opt, "ul2:1024:64:4:l",
                 /* print */TRUE, NULL);

  opt_reg_int(odb, "-cache:dl2lat",
              "l2 data cache hit latency (in cycles)",
              &cache_dl2_lat, /* default */6,
              /* print */TRUE, /* format */NULL);

  opt_reg_string(odb, "-cache:il1",
                 "l1 inst cache config, i.e., {<config>|dl1|dl2|none}",
                 &cache_il1_opt, "il1:512:32:1:l",
                 /* print */TRUE, NULL);

  opt_reg_note(odb,
"  Cache levels can be unified by pointing a level of the instruction cache\n"
"  hierarchy at the data cache hierarchy using the \"dl1\" and \"dl2\" cache\n"
"  configuration arguments.  Most sensible combinations are supported, e.g.,\n"
"\n"
"    A unified l2 cache (il2 is pointed at dl2):\n"
"      -cache:il1 il1:128:64:1:l -cache:il2 dl2\n"
"      -cache:dl1 dl1:256:32:1:l -cache:dl2 ul2:1024:64:2:l\n"
"\n"
"    Or, a fully unified cache hierarchy (il1 pointed at dl1):\n"
"      -cache:il1 dl1\n"
"      -cache:dl1 ul1:256:32:1:l -cache:dl2 ul2:1024:64:2:l\n"
               );

  opt_reg_int(odb, "-cache:il1lat",
              "l1 instruction cache hit latency (in cycles)",
              &cache_il1_lat, /* default */1,
              /* print */TRUE, /* format */NULL);

  opt_reg_string(odb, "-cache:il2",
                 "l2 instruction cache config, i.e., {<config>|dl2|none}",
                 &cache_il2_opt, "dl2",
                 /* print */TRUE, NULL);

  opt_reg_int(odb, "-cache:il2lat",
              "l2 instruction cache hit latency (in cycles)",
              &cache_il2_lat, /* default */6,
              /* print */TRUE, /* format */NULL);

  opt_reg_flag(odb, "-cache:flush", "flush caches on system calls",
               &flush_on_syscalls, /* default */FALSE, /* print */TRUE, NULL);

  opt_reg_flag(odb, "-cache:icompress",
               "convert 64-bit inst addresses to 32-bit inst equivalents",
               &compress_icache_addrs, /* default */FALSE,
               /* print */TRUE, NULL);

  /* mem options */
  opt_reg_int_list(odb, "-mem:lat",
                   "memory access latency (<first_chunk> <inter_chunk>)",
                   mem_lat, mem_nelt, &mem_nelt, mem_lat,
                   /* print */TRUE, /* format */NULL, /* !accrue */FALSE);

  opt_reg_int(odb, "-mem:width", "memory access bus width (in bytes)",
              &mem_bus_width, /* default */8,
              /* print */TRUE, /* format */NULL);

  /* TLB options */

  opt_reg_string(odb, "-tlb:itlb",
                 "instruction TLB config, i.e., {<config>|none}",
                 &itlb_opt, "itlb:16:4096:4:l", /* print */TRUE, NULL);

  opt_reg_string(odb, "-tlb:dtlb",
                 "data TLB config, i.e., {<config>|none}",
                 &dtlb_opt, "dtlb:32:4096:4:l", /* print */TRUE, NULL);

  opt_reg_int(odb, "-tlb:lat",
              "inst/data TLB miss latency (in cycles)",
              &tlb_miss_lat, /* default */30,
              /* print */TRUE, /* format */NULL);

  /* resource configuration */

  opt_reg_int(odb, "-res:ialu",
              "total number of integer ALU's available",
              &res_ialu, /* default */fu_config[FU_IALU_INDEX].quantity,
              /* print */TRUE, /* format */NULL);

  opt_reg_int(odb, "-res:imult",
              "total number of integer multiplier/dividers available",
              &res_imult, /* default */fu_config[FU_IMULT_INDEX].quantity,
              /* print */TRUE, /* format */NULL);

  opt_reg_int(odb, "-res:memport",
              "total number of memory system ports available (to CPU)",
              &res_memport, /* default */fu_config[FU_MEMPORT_INDEX].quantity,
              /* print */TRUE, /* format */NULL);

  opt_reg_int(odb, "-res:fpalu",
              "total number of floating point ALU's available",
              &res_fpalu, /* default */fu_config[FU_FPALU_INDEX].quantity,
              /* print */TRUE, /* format */NULL);

  opt_reg_int(odb, "-res:fpmult",
              "total number of floating point multiplier/dividers available",
              &res_fpmult, /* default */fu_config[FU_FPMULT_INDEX].quantity,
              /* print */TRUE, /* format */NULL);

  opt_reg_string_list(odb, "-pcstat",
                      "profile stat(s) against text addr's (mult uses ok)",
                      pcstat_vars, MAX_PCSTAT_VARS, &pcstat_nelt, NULL,
                      /* !print */FALSE, /* format */NULL, /* accrue */TRUE);

  opt_reg_flag(odb, "-bugcompat",
               "operate in backward-compatible bugs mode (for testing only)",
               &bugcompat_mode, /* default */FALSE, /* print */TRUE, NULL);
}

/* check simulator-specific option values */
void
sim_check_options(struct opt_odb_t *odb,        /* options database */
                  int argc, char **argv)        /* command line arguments */
{
  char name[128], c;
  int nsets, bsize, assoc;

  if (fastfwd_count < 0 || fastfwd_count >= 2147483647)
    fatal("bad fast forward count: %d", fastfwd_count);

  if (ruu_ifq_size < 1 || (ruu_ifq_size & (ruu_ifq_size - 1)) != 0)
    fatal("inst fetch queue size must be positive and a power of two");

  if (ruu_branch_penalty < 1)
    fatal("mis-prediction penalty must be at least 1 cycle");

  if (fetch_speed < 1)
    fatal("front-end speed must be positive and non-zero");

  if (!mystricmp(pred_type, "perfect"))
    {
      /* perfect predictor */
      pred = NULL;
      pred_perfect = TRUE;
    }
  else if (!mystricmp(pred_type, "taken"))
    {
      /* static predictor, always taken */
      pred = bpred_create(BPredTaken, 0, 0, 0, 0, 0, 0, 0, 0, 0);
    }
  else if (!mystricmp(pred_type, "nottaken"))
    {
      /* static predictor, always not taken */
      pred = bpred_create(BPredNotTaken, 0, 0, 0, 0, 0, 0, 0, 0, 0);
    }
  else if (!mystricmp(pred_type, "bimod"))
    {
      /* bimodal predictor, bpred_create() checks BTB_SIZE */
      if (bimod_nelt != 1)
        fatal("bad bimod predictor config (<table_size>)");
      if (btb_nelt != 2)
        fatal("bad btb config (<num_sets> <associativity>)");

      /* bimodal predictor, bpred_create() checks BTB_SIZE */
      pred = bpred_create(BPred2bit,
                          /* bimod table size */bimod_config[0],
                          /* 2lev l1 size */0,
                          /* 2lev l2 size */0,
                          /* meta table size */0,
                          /* history reg size */0,
                          /* history xor address */0,
                          /* btb sets */btb_config[0],
                          /* btb assoc */btb_config[1],
                          /* ret-addr stack size */ras_size);
    }
  else if (!mystricmp(pred_type, "threebit"))
    {
      /* 3-bit predictor, bpred_create() checks BTB_SIZE */
      if (threebit_nelt != 1)
        fatal("bad 3-bit predictor config (<table_size>)");
      if (btb_nelt != 2)
        fatal("bad btb config (<num_sets> <associativity>)");

      /* 3-bit predictor, bpred_create() checks BTB_SIZE */
      pred = bpred_create(BPred3bit,
                          /* threebit table size */threebit_config[0],
                          /* 2lev l1 size */0,
                          /* 2lev l2 size */0,
                          /* meta table size */0,
                          /* history reg size */0,
                          /* history xor address */0,
                          /* btb sets */btb_config[0],
                          /* btb assoc */btb_config[1],
                          /* ret-addr stack size */ras_size);
    }
  else if (!mystricmp(pred_type, "2lev"))
    {
      /* 2-level adaptive predictor, bpred_create() checks args */
      if (twolev_nelt != 4)
        fatal("bad 2-level pred config (<l1size> <l2size> <hist_size> <xor>)");
      if (btb_nelt != 2)
        fatal("bad btb config (<num_sets> <associativity>)");

      pred = bpred_create(BPred2Level,
                          /* bimod table size */0,
                          /* 2lev l1 size */twolev_config[0],
                          /* 2lev l2 size */twolev_config[1],
                          /* meta table size */0,
                          /* history reg size */twolev_config[2],
                          /* history xor address */twolev_config[3],
                          /* btb sets */btb_config[0],
                          /* btb assoc */btb_config[1],
                          /* ret-addr stack size */ras_size);
    }
  else if (!mystricmp(pred_type, "comb"))
    {
      /* combining predictor, bpred_create() checks args */
      if (twolev_nelt != 4)
        fatal("bad 2-level pred config (<l1size> <l2size> <hist_size> <xor>)");
      if (bimod_nelt != 1)
        fatal("bad bimod predictor config (<table_size>)");
      if (comb_nelt != 1)
        fatal("bad combining predictor config (<meta_table_size>)");
      if (btb_nelt != 2)
        fatal("bad btb config (<num_sets> <associativity>)");

      pred = bpred_create(BPredComb,
                          /* bimod table size */bimod_config[0],
                          /* l1 size */twolev_config[0],
                          /* l2 size */twolev_config[1],
                          /* meta table size */comb_config[0],
                          /* history reg size */twolev_config[2],
                          /* history xor address */twolev_config[3],
                          /* btb sets */btb_config[0],
                          /* btb assoc */btb_config[1],
                          /* ret-addr stack size */ras_size);
    }
  else
    fatal("cannot parse predictor type `%s'", pred_type);

  if (!bpred_spec_opt)
    bpred_spec_update = spec_CT;
  else if (!mystricmp(bpred_spec_opt, "ID"))
    bpred_spec_update = spec_ID;
  else if (!mystricmp(bpred_spec_opt, "WB"))
    bpred_spec_update = spec_WB;
  else
    fatal("bad speculative update stage specifier, use {ID|WB}");

  if (ruu_decode_width < 1 || (ruu_decode_width & (ruu_decode_width-1)) != 0)
    fatal("decode width must be positive non-zero and a power of two");

  if (ruu_issue_width < 1 || (ruu_issue_width & (ruu_issue_width-1)) != 0)
    fatal("issue width must be positive non-zero and a power of two");

  if (ruu_commit_width < 1)
    fatal("commit width must be positive non-zero");

  if (RUU_size < 2 || (RUU_size & (RUU_size-1)) != 0)
    fatal("RUU size must be a positive number > 1 and a power of two");

  if (LSQ_size < 2 || (LSQ_size & (LSQ_size-1)) != 0)
    fatal("LSQ size must be a positive number > 1 and a power of two");

  /* use a level 1 D-cache? */
  if (!mystricmp(cache_dl1_opt, "none"))
    {
      cache_dl1 = NULL;

      /* the level 2 D-cache cannot be defined */
      if (strcmp(cache_dl2_opt, "none"))
        fatal("the l1 data cache must be defined if the l2 cache is defined");
      cache_dl2 = NULL;
    }
  else /* dl1 is defined */
    {
      if (sscanf(cache_dl1_opt, "%[^:]:%d:%d:%d:%c",
                 name, &nsets, &bsize, &assoc, &c) != 5)
        fatal("bad l1 D-cache parms: <name>:<nsets>:<bsize>:<assoc>:<repl>");
      cache_dl1 = cache_create(name, nsets, bsize, /* balloc */FALSE,
                               /* usize */0, assoc, cache_char2policy(c),
                               dl1_access_fn, /* hit lat */cache_dl1_lat);

      /* is the level 2 D-cache defined? */
      if (!mystricmp(cache_dl2_opt, "none"))
        cache_dl2 = NULL;
      else
        {
          if (sscanf(cache_dl2_opt, "%[^:]:%d:%d:%d:%c",
                     name, &nsets, &bsize, &assoc, &c) != 5)
            fatal("bad l2 D-cache parms: "
                  "<name>:<nsets>:<bsize>:<assoc>:<repl>");
          cache_dl2 = cache_create(name, nsets, bsize, /* balloc */FALSE,
                                   /* usize */0, assoc, cache_char2policy(c),
                                   dl2_access_fn, /* hit lat */cache_dl2_lat);
        }
    }

  /* use a level 1 I-cache? */
  if (!mystricmp(cache_il1_opt, "none"))
    {
      cache_il1 = NULL;

      /* the level 2 I-cache cannot be defined */
      if (strcmp(cache_il2_opt, "none"))
        fatal("the l1 inst cache must be defined if the l2 cache is defined");
      cache_il2 = NULL;
    }
  else if (!mystricmp(cache_il1_opt, "dl1"))
    {
      if (!cache_dl1)
        fatal("I-cache l1 cannot access D-cache l1 as it's undefined");
      cache_il1 = cache_dl1;

      /* the level 2 I-cache cannot be defined */
      if (strcmp(cache_il2_opt, "none"))
        fatal("the l1 inst cache must be defined if the l2 cache is defined");
      cache_il2 = NULL;
    }
  else if (!mystricmp(cache_il1_opt, "dl2"))
    {
      if (!cache_dl2)
        fatal("I-cache l1 cannot access D-cache l2 as it's undefined");
      cache_il1 = cache_dl2;

      /* the level 2 I-cache cannot be defined */
      if (strcmp(cache_il2_opt, "none"))
        fatal("the l1 inst cache must be defined if the l2 cache is defined");
      cache_il2 = NULL;
    }
  else /* il1 is defined */
    {
      if (sscanf(cache_il1_opt, "%[^:]:%d:%d:%d:%c",
                 name, &nsets, &bsize, &assoc, &c) != 5)
        fatal("bad l1 I-cache parms: <name>:<nsets>:<bsize>:<assoc>:<repl>");
      cache_il1 = cache_create(name, nsets, bsize, /* balloc */FALSE,
                               /* usize */0, assoc, cache_char2policy(c),
                               il1_access_fn, /* hit lat */cache_il1_lat);

      /* is the level 2 I-cache defined? */
      if (!mystricmp(cache_il2_opt, "none"))
        cache_il2 = NULL;
      else if (!mystricmp(cache_il2_opt, "dl2"))
        {
          if (!cache_dl2)
            fatal("I-cache l2 cannot access D-cache l2 as it's undefined");
          cache_il2 = cache_dl2;
        }
      else
        {
          if (sscanf(cache_il2_opt, "%[^:]:%d:%d:%d:%c",
                     name, &nsets, &bsize, &assoc, &c) != 5)
            fatal("bad l2 I-cache parms: "
                  "<name>:<nsets>:<bsize>:<assoc>:<repl>");
          cache_il2 = cache_create(name, nsets, bsize, /* balloc */FALSE,
                                   /* usize */0, assoc, cache_char2policy(c),
                                   il2_access_fn, /* hit lat */cache_il2_lat);
        }
    }

  /* use an I-TLB? */
  if (!mystricmp(itlb_opt, "none"))
    itlb = NULL;
  else
    {
      if (sscanf(itlb_opt, "%[^:]:%d:%d:%d:%c",
                 name, &nsets, &bsize, &assoc, &c) != 5)
        fatal("bad TLB parms: <name>:<nsets>:<page_size>:<assoc>:<repl>");
      itlb = cache_create(name, nsets, bsize, /* balloc */FALSE,
                          /* usize */sizeof(md_addr_t), assoc,
                          cache_char2policy(c), itlb_access_fn,
                          /* hit latency */1);
    }

  /* use a D-TLB? */
  if (!mystricmp(dtlb_opt, "none"))
    dtlb = NULL;
  else
    {
      if (sscanf(dtlb_opt, "%[^:]:%d:%d:%d:%c",
                 name, &nsets, &bsize, &assoc, &c) != 5)
        fatal("bad TLB parms: <name>:<nsets>:<page_size>:<assoc>:<repl>");
      dtlb = cache_create(name, nsets, bsize, /* balloc */FALSE,
                          /* usize */sizeof(md_addr_t), assoc,
                          cache_char2policy(c), dtlb_access_fn,
                          /* hit latency */1);
    }

  if (cache_dl1_lat < 1)
    fatal("l1 data cache latency must be greater than zero");

  if (cache_dl2_lat < 1)
    fatal("l2 data cache latency must be greater than zero");

  if (cache_il1_lat < 1)
    fatal("l1 instruction cache latency must be greater than zero");

  if (cache_il2_lat < 1)
    fatal("l2 instruction cache latency must be greater than zero");

  if (mem_nelt != 2)
    fatal("bad memory access latency (<first_chunk> <inter_chunk>)");

  if (mem_lat[0] < 1 || mem_lat[1] < 1)
    fatal("all memory access latencies must be greater than zero");

  if (mem_bus_width < 1 || (mem_bus_width & (mem_bus_width-1)) != 0)
    fatal("memory bus width must be positive non-zero and a power of two");

  if (tlb_miss_lat < 1)
    fatal("TLB miss latency must be greater than zero");

  if (res_ialu < 1)
    fatal("number of integer ALU's must be greater than zero");
  if (res_ialu > MAX_INSTS_PER_CLASS)
    fatal("number of integer ALU's must be <= MAX_INSTS_PER_CLASS");
  fu_config[FU_IALU_INDEX].quantity = res_ialu;

  if (res_imult < 1)
    fatal("number of integer multiplier/dividers must be greater than zero");
  if (res_imult > MAX_INSTS_PER_CLASS)
    fatal("number of integer mult/div's must be <= MAX_INSTS_PER_CLASS");
  fu_config[FU_IMULT_INDEX].quantity = res_imult;

  if (res_memport < 1)
    fatal("number of memory system ports must be greater than zero");
  if (res_memport > MAX_INSTS_PER_CLASS)
    fatal("number of memory system ports must be <= MAX_INSTS_PER_CLASS");
  fu_config[FU_MEMPORT_INDEX].quantity = res_memport;

  if (res_fpalu < 1)
    fatal("number of floating point ALU's must be greater than zero");
  if (res_fpalu > MAX_INSTS_PER_CLASS)
    fatal("number of floating point ALU's must be <= MAX_INSTS_PER_CLASS");
  fu_config[FU_FPALU_INDEX].quantity = res_fpalu;

  if (res_fpmult < 1)
    fatal("number of floating point multiplier/dividers must be > zero");
  if (res_fpmult > MAX_INSTS_PER_CLASS)
    fatal("number of FP mult/div's must be <= MAX_INSTS_PER_CLASS");
  fu_config[FU_FPMULT_INDEX].quantity = res_fpmult;
}

/* print simulator-specific configuration information */
void
sim_aux_config(FILE *stream)            /* output stream */
{
  /* nada */
}

/* register simulator-specific statistics */
void
sim_reg_stats(struct stat_sdb_t *sdb)   /* stats database */
{
  int i;

  stat_reg_counter(sdb, "sim_num_insn",
                   "total number of instructions committed",
                   &sim_num_insn, sim_num_insn, NULL);
  stat_reg_counter(sdb, "sim_num_refs",
                   "total number of loads and stores committed",
                   &sim_num_refs, 0, NULL);
  stat_reg_counter(sdb, "sim_num_loads",
                   "total number of loads committed",
                   &sim_num_loads, 0, NULL);
  stat_reg_formula(sdb, "sim_num_stores",
                   "total number of stores committed",
                   "sim_num_refs - sim_num_loads", NULL);
  stat_reg_counter(sdb, "sim_num_branches",
                   "total number of branches committed",
                   &sim_num_branches, /* initial value */0, /* format */NULL);
  stat_reg_int(sdb, "sim_elapsed_time",
               "total simulation time in seconds",
               &sim_elapsed_time, 0, NULL);
  stat_reg_formula(sdb, "sim_inst_rate",
                   "simulation speed (in insts/sec)",
                   "sim_num_insn / sim_elapsed_time", NULL);

  stat_reg_counter(sdb, "sim_total_insn",
                   "total number of instructions executed",
                   &sim_total_insn, 0, NULL);
  stat_reg_counter(sdb, "sim_total_refs",
                   "total number of loads and stores executed",
                   &sim_total_refs, 0, NULL);
  stat_reg_counter(sdb, "sim_total_loads",
                   "total number of loads executed",
                   &sim_total_loads, 0, NULL);
  stat_reg_formula(sdb, "sim_total_stores",
                   "total number of stores executed",
                   "sim_total_refs - sim_total_loads", NULL);
  stat_reg_counter(sdb, "sim_total_branches",
                   "total number of branches executed",
                   &sim_total_branches, /* initial value */0, /* format */NULL);

  /* register performance stats */
  stat_reg_counter(sdb, "sim_cycle",
                   "total simulation time in cycles",
                   &sim_cycle, /* initial value */0, /* format */NULL);
  stat_reg_formula(sdb, "sim_IPC",
                   "instructions per cycle",
                   "sim_num_insn / sim_cycle", /* format */NULL);
  stat_reg_formula(sdb, "sim_CPI",
                   "cycles per instruction",
                   "sim_cycle / sim_num_insn", /* format */NULL);
  stat_reg_formula(sdb, "sim_exec_BW",
                   "total instructions (mis-spec + committed) per cycle",
                   "sim_total_insn / sim_cycle", /* format */NULL);
  stat_reg_formula(sdb, "sim_IPB",
                   "instructions per branch",
                   "sim_num_insn / sim_num_branches", /* format */NULL);

  /* occupancy stats */
  stat_reg_counter(sdb, "IFQ_count", "cumulative IFQ occupancy",
                   &IFQ_count, /* initial value */0, /* format */NULL);
  stat_reg_counter(sdb, "IFQ_fcount", "cumulative IFQ full count",
                   &IFQ_fcount, /* initial value */0, /* format */NULL);
  stat_reg_formula(sdb, "ifq_occupancy", "avg IFQ occupancy (insn's)",
                   "IFQ_count / sim_cycle", /* format */NULL);
  stat_reg_formula(sdb, "ifq_rate", "avg IFQ dispatch rate (insn/cycle)",
                   "sim_total_insn / sim_cycle", /* format */NULL);
  stat_reg_formula(sdb, "ifq_latency", "avg IFQ occupant latency (cycle's)",
                   "ifq_occupancy / ifq_rate", /* format */NULL);
  stat_reg_formula(sdb, "ifq_full", "fraction of time (cycle's) IFQ was full",
                   "IFQ_fcount / sim_cycle", /* format */NULL);

  stat_reg_counter(sdb, "RUU_count", "cumulative RUU occupancy",
                   &RUU_count, /* initial value */0, /* format */NULL);
  stat_reg_counter(sdb, "RUU_fcount", "cumulative RUU full count",
                   &RUU_fcount, /* initial value */0, /* format */NULL);
  stat_reg_formula(sdb, "ruu_occupancy", "avg RUU occupancy (insn's)",
                   "RUU_count / sim_cycle", /* format */NULL);
  stat_reg_formula(sdb, "ruu_rate", "avg RUU dispatch rate (insn/cycle)",
                   "sim_total_insn / sim_cycle", /* format */NULL);
  stat_reg_formula(sdb, "ruu_latency", "avg RUU occupant latency (cycle's)",
                   "ruu_occupancy / ruu_rate", /* format */NULL);
  stat_reg_formula(sdb, "ruu_full", "fraction of time (cycle's) RUU was full",
                   "RUU_fcount / sim_cycle", /* format */NULL);

  stat_reg_counter(sdb, "LSQ_count", "cumulative LSQ occupancy",
                   &LSQ_count, /* initial value */0, /* format */NULL);
  stat_reg_counter(sdb, "LSQ_fcount", "cumulative LSQ full count",
                   &LSQ_fcount, /* initial value */0, /* format */NULL);
  stat_reg_formula(sdb, "lsq_occupancy", "avg LSQ occupancy (insn's)",
                   "LSQ_count / sim_cycle", /* format */NULL);
  stat_reg_formula(sdb, "lsq_rate", "avg LSQ dispatch rate (insn/cycle)",
                   "sim_total_insn / sim_cycle", /* format */NULL);
  stat_reg_formula(sdb, "lsq_latency", "avg LSQ occupant latency (cycle's)",
                   "lsq_occupancy / lsq_rate", /* format */NULL);
  stat_reg_formula(sdb, "lsq_full", "fraction of time (cycle's) LSQ was full",
                   "LSQ_fcount / sim_cycle", /* format */NULL);

  stat_reg_counter(sdb, "sim_slip",
                   "total number of slip cycles",
                   &sim_slip, 0, NULL);
  /* register baseline stats */
  stat_reg_formula(sdb, "avg_sim_slip",
                   "the average slip between issue and retirement",
                   "sim_slip / sim_num_insn", NULL);

  /* register predictor stats */
  if (pred)
    bpred_reg_stats(pred, sdb);

  /* register cache stats */
  if (cache_il1
      && (cache_il1 != cache_dl1 && cache_il1 != cache_dl2))
    cache_reg_stats(cache_il1, sdb);
  if (cache_il2
      && (cache_il2 != cache_dl1 && cache_il2 != cache_dl2))
    cache_reg_stats(cache_il2, sdb);
  if (cache_dl1)
    cache_reg_stats(cache_dl1, sdb);
  if (cache_dl2)
    cache_reg_stats(cache_dl2, sdb);
  if (itlb)
    cache_reg_stats(itlb, sdb);
  if (dtlb)
    cache_reg_stats(dtlb, sdb);

  /* debug variable(s) */
  stat_reg_counter(sdb, "sim_invalid_addrs",
                   "total non-speculative bogus addresses seen (debug var)",
                   &sim_invalid_addrs, /* initial value */0, /* format */NULL);

  for (i=0; i<pcstat_nelt; i++)
    {
      char buf[512], buf1[512];
      struct stat_stat_t *stat;

      /* track the named statistical variable by text address */

      /* find it... */
      stat = stat_find_stat(sdb, pcstat_vars[i]);
      if (!stat)
        fatal("cannot locate any statistic named `%s'", pcstat_vars[i]);

      /* stat must be an integral type */
      if (stat->sc != sc_int && stat->sc != sc_uint && stat->sc != sc_counter)
        fatal("`-pcstat' statistical variable `%s' is not an integral type",
              stat->name);

      /* register this stat */
      pcstat_stats[i] = stat;
      pcstat_lastvals[i] = STATVAL(stat);

      /* declare the sparse text distribution */
      sprintf(buf, "%s_by_pc", stat->name);
      sprintf(buf1, "%s (by text address)", stat->desc);
      pcstat_sdists[i] = stat_reg_sdist(sdb, buf, buf1,
                                        /* initial value */0,
                                        /* print format */(PF_COUNT|PF_PDF),
                                        /* format */"0x%lx %lu %.2f",
                                        /* print fn */NULL);
    }
  ld_reg_stats(sdb);
  mem_reg_stats(mem, sdb);
}

/* forward declarations */
static void ruu_init(void);
static void lsq_init(void);
static void rslink_init(int nlinks);
static void eventq_init(void);
static void readyq_init(void);
static void cv_init(void);
static void tracer_init(void);
static void fetch_init(void);

/* initialize the simulator */
void
sim_init(void)
{
  sim_num_refs = 0;

  /* allocate and initialize register file */
  regs_init(&regs);

  /* allocate and initialize memory space */
  mem = mem_create("mem");
  mem_init(mem);
}

/* default register state accessor, used by DLite */
static char *                                   /* err str, NULL for no err */
simoo_reg_obj(struct regs_t *regs,              /* registers to access */
              int is_write,                     /* access type */
              enum md_reg_type rt,              /* reg bank to probe */
              int reg,                          /* register number */
              struct eval_value_t *val);        /* input, output */

/* default memory state accessor, used by DLite */
static char *                                   /* err str, NULL for no err */
simoo_mem_obj(struct mem_t *mem,                /* memory space to access */
              int is_write,                     /* access type */
              md_addr_t addr,                   /* address to access */
              char *p,                          /* input/output buffer */
              int nbytes);                      /* size of access */

/* default machine state accessor, used by DLite */
static char *                                   /* err str, NULL for no err */
simoo_mstate_obj(FILE *stream,                  /* output stream */
                 char *cmd,                     /* optional command string */
                 struct regs_t *regs,           /* registers to access */
                 struct mem_t *mem);            /* memory space to access */

/* total RS links allocated at program start */
#define MAX_RS_LINKS                    4096

/* load program into simulated state */
void
sim_load_prog(char *fname,              /* program to load */
              int argc, char **argv,    /* program arguments */
              char **envp)              /* program environment */
{
  /* load program text and data, set up environment, memory, and regs */
  ld_load_prog(fname, argc, argv, envp, &regs, mem, TRUE);

  /* initialize here, so symbols can be loaded */
  if (ptrace_nelt == 2)
    {
      /* generate a pipeline trace */
      ptrace_open(/* fname */ptrace_opts[0], /* range */ptrace_opts[1]);
    }
  else if (ptrace_nelt == 0)
    {
      /* no pipetracing */;
    }
  else
    fatal("bad pipetrace args, use: <fname|stdout|stderr> <range>");

  /* finish initialization of the simulation engine */
  fu_pool = res_create_pool("fu-pool", fu_config, N_ELT(fu_config));
  rslink_init(MAX_RS_LINKS);
  tracer_init();
  fetch_init();
  cv_init();
  eventq_init();
  readyq_init();
  ruu_init();
  lsq_init();

  /* initialize the DLite debugger */
  dlite_init(simoo_reg_obj, simoo_mem_obj, simoo_mstate_obj);
}

/* dump simulator-specific auxiliary simulator statistics */
void
sim_aux_stats(FILE *stream)             /* output stream */
{
  /* nada */
}

/* un-initialize the simulator */
void
sim_uninit(void)
{
  if (ptrace_nelt > 0)
    ptrace_close();
}

/*
 * processor core definitions and declarations
 */

/* inst tag type, used to tag an operation instance in the RUU */
typedef unsigned int INST_TAG_TYPE;

/* inst sequence type, used to order instructions in the ready list, if
   this rolls over the ready list order temporarily will get messed up,
   but execution will continue and complete correctly */
typedef unsigned int INST_SEQ_TYPE;

/* total input dependencies possible */
#define MAX_IDEPS               3

/* total output dependencies possible */
#define MAX_ODEPS               2

/* a register update unit (RUU) station, this record is contained in the
   processor's RUU, which serves as a collection of ordered reservation
   stations.  The reservation stations capture register results and await
   the time when all operands are ready, at which time the instruction is
   issued to the functional units; the RUU is an ordered circular queue, in
   which instructions are inserted in fetch (program) order, results are
   stored in the RUU buffers, and later, when an RUU entry is the oldest
   entry in the machine, it and its instruction's value are retired to the
   architectural register file in program order.  NOTE: the RUU and LSQ
   share the same structure; this is useful because loads and stores are
   split into two operations: an effective address add and a load/store,
   the add is inserted into the RUU and the load/store into the LSQ,
   allowing the add to wake up the load/store when effective address
   computation has finished */
struct RUU_station {
  /* inst info */
  md_inst_t IR;                         /* instruction bits */
  enum md_opcode op;                    /* decoded instruction opcode */
  md_addr_t PC, next_PC, pred_PC;       /* inst PC, next PC, predicted PC */
  int in_LSQ;                           /* non-zero if op is in LSQ */
  int ea_comp;                          /* non-zero if op is an addr comp */
  int recover_inst;                     /* start of mis-speculation? */
  int stack_recover_idx;                /* non-speculative TOS for RSB pred */
  struct bpred_update_t dir_update;     /* bpred direction update info */
  int spec_mode;                        /* non-zero if issued in spec_mode */
  md_addr_t addr;                       /* effective address for ld/st's */
  INST_TAG_TYPE tag;                    /* RUU slot tag, increment to
                                           squash operation */
  INST_SEQ_TYPE seq;                    /* instruction sequence, used to
                                           sort the ready list and tag inst */
  unsigned int ptrace_seq;              /* pipetrace sequence number */
  int slip;
  /* instruction status */
  int queued;                           /* operands ready and queued */
  int issued;                           /* operation is/was executing */
  int completed;                        /* operation has completed execution */

  /* output operand dependency list, these lists are used to
     limit the number of associative searches into the RUU when
     instructions complete and need to wake up dependent insts */
  int onames[MAX_ODEPS];                /* output logical names (NA=unused) */
  struct RS_link *odep_list[MAX_ODEPS]; /* chains to consuming operations */

  /* input dependent links, the output chains rooted above use these
     fields to mark input operands as ready; when all these fields have
     been set non-zero, the RUU operation has all of its register
     operands, and it may commence execution as soon as all of its memory
     operands are known to be ready (see lsq_refresh() for details on
     enforcing memory dependencies) */
  int idep_ready[MAX_IDEPS];            /* input operand ready? */
};

/* non-zero if all register operands are ready, update with MAX_IDEPS */
#define OPERANDS_READY(RS)                                              \
  ((RS)->idep_ready[0] && (RS)->idep_ready[1] && (RS)->idep_ready[2])

/* register update unit, combination of reservation stations and reorder
   buffer device, organized as a circular queue */
static struct RUU_station *RUU;         /* register update unit */
static int RUU_head, RUU_tail;          /* RUU head and tail pointers */
static int RUU_num;                     /* num entries currently in RUU */

/* allocate and initialize register update unit (RUU) */
static void
ruu_init(void)
{
  RUU = calloc(RUU_size, sizeof(struct RUU_station));
  if (!RUU)
    fatal("out of virtual memory");

  RUU_num = 0;
  RUU_head = RUU_tail = 0;
  RUU_count = 0;
  RUU_fcount = 0;
}
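
/* Illustrative sketch (not part of the original simulator): dispatch
   allocates the next free RUU entry at the tail of the circular queue,
   roughly as the dispatch stage later in this file does:

     if (RUU_num < RUU_size)
       {
         struct RUU_station *rs = &RUU[RUU_tail];
         RUU_tail = (RUU_tail + 1) % RUU_size;
         RUU_num++;
         ...fill in *rs...
       }

   and commit retires from RUU_head in the same modular fashion. */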

/* dump the contents of the RUU */
static void
ruu_dumpent(struct RUU_station *rs,     /* ptr to RUU station */
            int index,                  /* entry index */
            FILE *stream,               /* output stream */
            int header)                 /* print header? */
{
  if (!stream)
    stream = stderr;

  if (header)
    fprintf(stream, "idx: %2d: opcode: %s, inst: `",
            index, MD_OP_NAME(rs->op));
  else
    fprintf(stream, "       opcode: %s, inst: `",
            MD_OP_NAME(rs->op));
  md_print_insn(rs->IR, rs->PC, stream);
  fprintf(stream, "'\n");
  myfprintf(stream, "         PC: 0x%08p, NPC: 0x%08p (pred_PC: 0x%08p)\n",
            rs->PC, rs->next_PC, rs->pred_PC);
  fprintf(stream, "         in_LSQ: %s, ea_comp: %s, recover_inst: %s\n",
          rs->in_LSQ ? "t" : "f",
          rs->ea_comp ? "t" : "f",
          rs->recover_inst ? "t" : "f");
  myfprintf(stream, "         spec_mode: %s, addr: 0x%08p, tag: 0x%08x\n",
            rs->spec_mode ? "t" : "f", rs->addr, rs->tag);
  fprintf(stream, "         seq: 0x%08x, ptrace_seq: 0x%08x\n",
          rs->seq, rs->ptrace_seq);
  fprintf(stream, "         queued: %s, issued: %s, completed: %s\n",
          rs->queued ? "t" : "f",
          rs->issued ? "t" : "f",
          rs->completed ? "t" : "f");
  fprintf(stream, "         operands ready: %s\n",
          OPERANDS_READY(rs) ? "t" : "f");
}

/* dump the contents of the RUU */
static void
ruu_dump(FILE *stream)                  /* output stream */
{
  int num, head;
  struct RUU_station *rs;

  if (!stream)
    stream = stderr;

  fprintf(stream, "** RUU state **\n");
  fprintf(stream, "RUU_head: %d, RUU_tail: %d\n", RUU_head, RUU_tail);
  fprintf(stream, "RUU_num: %d\n", RUU_num);

  num = RUU_num;
  head = RUU_head;
  while (num)
    {
      rs = &RUU[head];
      ruu_dumpent(rs, rs - RUU, stream, /* header */TRUE);
      head = (head + 1) % RUU_size;
      num--;
    }
}

/*
 * load/store queue (LSQ): holds loads and stores in program order, indicating
 * status of load/store access:
 *
 *   - issued: address computation complete, memory access in progress
 *   - completed: memory access has completed, stored value available
 *   - squashed: memory access was squashed, ignore this entry
 *
 * loads may execute when:
 *   1) register operands are ready, and
 *   2) memory operands are ready (no earlier unresolved store)
 *
 * loads are serviced by:
 *   1) previous store at same address in LSQ (hit latency), or
 *   2) data cache (hit latency + miss latency)
 *
 * stores may execute when:
 *   1) register operands are ready
 *
 * stores are serviced by:
 *   1) depositing store value into the load/store queue
 *   2) writing store value to the store buffer (plus tag check) at commit
 *   3) writing store buffer entry to data cache when cache is free
 *
 * NOTE: the load/store queue can bypass a store value to a load in the same
 * cycle the store executes (using a bypass network), thus stores complete
 * in effectively zero time after their effective address is known
 */
static struct RUU_station *LSQ;         /* load/store queue */
static int LSQ_head, LSQ_tail;          /* LSQ head and tail pointers */
static int LSQ_num;                     /* num entries currently in LSQ */

/*
 * input dependencies for stores in the LSQ:
 *   idep #0 - operand input (value that is stored)
 *   idep #1 - effective address input (address of store operation)
 */
#define STORE_OP_INDEX                  0
#define STORE_ADDR_INDEX                1

#define STORE_OP_READY(RS)              ((RS)->idep_ready[STORE_OP_INDEX])
#define STORE_ADDR_READY(RS)            ((RS)->idep_ready[STORE_ADDR_INDEX])
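
/* Illustrative sketch (not part of the simulator): the three store states
   that lsq_refresh() below distinguishes, written as predicates over a
   hypothetical LSQ entry ST. */
#if 0
  !STORE_ADDR_READY(&ST)                        /* STA unknown: blocks all
                                                   later loads */
  STORE_ADDR_READY(&ST) && !OPERANDS_READY(&ST) /* STA known, STD unknown:
                                                   blocks later loads to
                                                   ST.addr only */
  STORE_ADDR_READY(&ST) && OPERANDS_READY(&ST)  /* STA and STD known: can
                                                   forward its value to later
                                                   loads at ST.addr */
#endif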

/* allocate and initialize the load/store queue (LSQ) */
static void
lsq_init(void)
{
  LSQ = calloc(LSQ_size, sizeof(struct RUU_station));
  if (!LSQ)
    fatal("out of virtual memory");

  LSQ_num = 0;
  LSQ_head = LSQ_tail = 0;
  LSQ_count = 0;
  LSQ_fcount = 0;
}

/* dump the contents of the LSQ */
static void
lsq_dump(FILE *stream)                  /* output stream */
{
  int num, head;
  struct RUU_station *rs;

  if (!stream)
    stream = stderr;

  fprintf(stream, "** LSQ state **\n");
  fprintf(stream, "LSQ_head: %d, LSQ_tail: %d\n", LSQ_head, LSQ_tail);
  fprintf(stream, "LSQ_num: %d\n", LSQ_num);

  num = LSQ_num;
  head = LSQ_head;
  while (num)
    {
      rs = &LSQ[head];
      ruu_dumpent(rs, rs - LSQ, stream, /* header */TRUE);
      head = (head + 1) % LSQ_size;
      num--;
    }
}


/*
 * RS_LINK defs and decls
 */

/* a reservation station link: this structure links elements of an RUU
   reservation station list; used for the ready instruction queue, event
   queue, and output dependency lists; each RS_LINK node contains a pointer
   to the RUU entry it references along with an instance tag; the RS_LINK is
   only valid if the link's instance tag matches the instance tag of the RUU
   entry it references; this strategy allows entries in the RUU to be
   squashed and reused without updating the lists that point to them, which
   significantly improves the performance of (all too frequent) squash
   events */
struct RS_link {
  struct RS_link *next;                 /* next entry in list */
  struct RUU_station *rs;               /* referenced RUU resv station */
  INST_TAG_TYPE tag;                    /* inst instance sequence number */
  union {
    tick_t when;                        /* time stamp of entry (for eventq) */
    INST_SEQ_TYPE seq;                  /* inst sequence */
    int opnum;                          /* input/output operand number */
  } x;
};

/* RS link free list, grab RS_LINKs from here, when needed */
static struct RS_link *rslink_free_list;

/* NULL value for an RS link */
#define RSLINK_NULL_DATA                { NULL, NULL, 0 }
static struct RS_link RSLINK_NULL = RSLINK_NULL_DATA;

/* create and initialize an RS link */
#define RSLINK_INIT(RSL, RS)                                            \
  ((RSL).next = NULL, (RSL).rs = (RS), (RSL).tag = (RS)->tag)

/* non-zero if RS link is NULL */
#define RSLINK_IS_NULL(LINK)            ((LINK)->rs == NULL)

/* non-zero if RS link is to a valid (non-squashed) entry */
#define RSLINK_VALID(LINK)              ((LINK)->tag == (LINK)->rs->tag)
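
/* Illustrative sketch (not part of the simulator): squashing an RUU entry
   is just a tag bump; stale links discover this lazily the next time they
   are examined, so the queues never need to be scrubbed during recovery. */
#if 0
  struct RS_link link;
  RSLINK_INIT(link, rs);        /* link.tag == rs->tag, RSLINK_VALID holds */
  rs->tag++;                    /* squash: every outstanding link to RS is
                                   now stale */
  if (!RSLINK_VALID(&link))
    { /* consumer (event queue, ready queue) silently discards the node */ }
#endif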

/* extract the RUU reservation station pointer from an RS link */
#define RSLINK_RS(LINK)                 ((LINK)->rs)

/* get a new RS link record */
#define RSLINK_NEW(DST, RS)                                             \
  { struct RS_link *n_link;                                             \
    if (!rslink_free_list)                                              \
      panic("out of rs links");                                         \
    n_link = rslink_free_list;                                          \
    rslink_free_list = rslink_free_list->next;                          \
    n_link->next = NULL;                                                \
    n_link->rs = (RS); n_link->tag = n_link->rs->tag;                   \
    (DST) = n_link;                                                     \
  }

/* free an RS link record */
#define RSLINK_FREE(LINK)                                               \
  { struct RS_link *f_link = (LINK);                                    \
    f_link->rs = NULL; f_link->tag = 0;                                 \
    f_link->next = rslink_free_list;                                    \
    rslink_free_list = f_link;                                          \
  }

/* FIXME: could this be faster!!! */
/* free an RS link list */
#define RSLINK_FREE_LIST(LINK)                                          \
  { struct RS_link *fl_link, *fl_link_next;                             \
    for (fl_link=(LINK); fl_link; fl_link=fl_link_next)                 \
      {                                                                 \
        fl_link_next = fl_link->next;                                   \
        RSLINK_FREE(fl_link);                                           \
      }                                                                 \
  }

/* initialize the free RS_LINK pool */
static void
rslink_init(int nlinks)                 /* total number of RS_LINKs available */
{
  int i;
  struct RS_link *link;

  rslink_free_list = NULL;
  for (i=0; i<nlinks; i++)
    {
      link = calloc(1, sizeof(struct RS_link));
      if (!link)
        fatal("out of virtual memory");
      link->next = rslink_free_list;
      rslink_free_list = link;
    }
}

/* service all functional unit release events; this function is called once
   per cycle, and it is used to step the BUSY timers attached to each
   functional unit in the functional unit resource pool; as long as a
   functional unit's BUSY count is > 0, it cannot be issued an operation */
static void
ruu_release_fu(void)
{
  int i;

  /* walk all resource units, decrement busy counts by one */
  for (i=0; i<fu_pool->num_resources; i++)
    {
      /* resource is released when BUSY hits zero */
      if (fu_pool->resources[i].busy > 0)
        fu_pool->resources[i].busy--;
    }
}


/*
 * the execution unit event queue implementation follows, the event queue
 * indicates which instruction will complete next, the writeback handler
 * drains this queue
 */

/* pending event queue, sorted from soonest to latest event (in time), NOTE:
   RS_LINK nodes are used for the event queue list so that it need not be
   updated during squash events */
static struct RS_link *event_queue;

/* initialize the event queue structures */
static void
eventq_init(void)
{
  event_queue = NULL;
}

/* dump the contents of the event queue */
static void
eventq_dump(FILE *stream)               /* output stream */
{
  struct RS_link *ev;

  if (!stream)
    stream = stderr;

  fprintf(stream, "** event queue state **\n");

  for (ev = event_queue; ev != NULL; ev = ev->next)
    {
      /* is event still valid? */
      if (RSLINK_VALID(ev))
        {
          struct RUU_station *rs = RSLINK_RS(ev);

          fprintf(stream, "idx: %2d: @ %.0f\n",
                  (int)(rs - (rs->in_LSQ ? LSQ : RUU)), (double)ev->x.when);
          ruu_dumpent(rs, rs - (rs->in_LSQ ? LSQ : RUU),
                      stream, /* !header */FALSE);
        }
    }
}

/* insert an event for RS into the event queue; the event queue is sorted
   from earliest to latest event; the event and its associated side-effects
   will be apparent at the start of cycle WHEN */
static void
eventq_queue_event(struct RUU_station *rs, tick_t when)
{
  struct RS_link *prev, *ev, *new_ev;

  if (rs->completed)
    panic("event completed");

  if (when <= sim_cycle)
    panic("event occurred in the past");

  /* get a free event record */
  RSLINK_NEW(new_ev, rs);
  new_ev->x.when = when;

  /* locate insertion point */
  for (prev=NULL, ev=event_queue;
       ev && ev->x.when < when;
       prev=ev, ev=ev->next);

  if (prev)
    {
      /* insert in middle or at end */
      new_ev->next = prev->next;
      prev->next = new_ev;
    }
  else
    {
      /* insert at beginning */
      new_ev->next = event_queue;
      event_queue = new_ev;
    }
}

/* return the next event that has already occurred; returns NULL when there
   are no remaining events or all remaining events are in the future */
static struct RUU_station *
eventq_next_event(void)
{
  struct RS_link *ev;

  if (event_queue && event_queue->x.when <= sim_cycle)
    {
      /* unlink and return first event on priority list */
      ev = event_queue;
      event_queue = event_queue->next;

      /* event still valid? */
      if (RSLINK_VALID(ev))
        {
          struct RUU_station *rs = RSLINK_RS(ev);

          /* reclaim event record */
          RSLINK_FREE(ev);

          /* event is valid, return resv station */
          return rs;
        }
      else
        {
          /* reclaim event record */
          RSLINK_FREE(ev);

          /* receiving inst was squashed, return next event */
          return eventq_next_event();
        }
    }
  else
    {
      /* no event or no event is ready */
      return NULL;
    }
}
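
/* Illustrative sketch (not part of the simulator): the event queue is the
   hand-off between issue and writeback; issue schedules a completion LAT
   cycles in the future, and writeback later drains every event whose time
   has arrived. */
#if 0
  eventq_queue_event(rs, sim_cycle + lat);      /* at issue time */
  /* ... later cycles ... */
  while ((rs = eventq_next_event()) != NULL)    /* in ruu_writeback() */
    { /* rs completes this cycle; wake its dependents */ }
#endif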


/*
 * the ready instruction queue implementation follows, the ready instruction
 * queue indicates which instructions have all of their *register*
 * dependencies satisfied, an instruction will issue when 1) all memory
 * dependencies for the instruction have been satisfied (see lsq_refresh()
 * for details on how this is accomplished) and 2) resources are available;
 * the ready queue is fully constructed each cycle before any operation is
 * issued from it -- this ensures that instruction issue priorities are
 * properly observed; NOTE: RS_LINK nodes are used for the ready queue list
 * so that it need not be updated during squash events
 */

/* the ready instruction queue */
static struct RS_link *ready_queue;

/* initialize the ready instruction queue structures */
static void
readyq_init(void)
{
  ready_queue = NULL;
}

/* dump the contents of the ready queue */
static void
readyq_dump(FILE *stream)               /* output stream */
{
  struct RS_link *link;

  if (!stream)
    stream = stderr;

  fprintf(stream, "** ready queue state **\n");

  for (link = ready_queue; link != NULL; link = link->next)
    {
      /* is entry still valid? */
      if (RSLINK_VALID(link))
        {
          struct RUU_station *rs = RSLINK_RS(link);

          ruu_dumpent(rs, rs - (rs->in_LSQ ? LSQ : RUU),
                      stream, /* header */TRUE);
        }
    }
}

/* insert a ready node into the ready list using the ready instruction
   scheduling policy; currently the following scheduling policy is enforced:

     memory, long latency, and branch instructions first

   then

     all other instructions, oldest instructions first

   this policy works well because branches pass through the machine more
   quickly, which reduces branch misprediction latencies, and very long
   latency instructions (such as loads and multiplies) get priority since
   they are very likely on the program's critical path */
static void
readyq_enqueue(struct RUU_station *rs)          /* RS to enqueue */
{
  struct RS_link *prev, *node, *new_node;

  /* node is now queued */
  if (rs->queued)
    panic("node is already queued");
  rs->queued = TRUE;

  /* get a free ready list node */
  RSLINK_NEW(new_node, rs);
  new_node->x.seq = rs->seq;

  /* locate insertion point */
  if (rs->in_LSQ || MD_OP_FLAGS(rs->op) & (F_LONGLAT|F_CTRL))
    {
      /* insert loads/stores and long latency ops at the head of the queue */
      prev = NULL;
      node = ready_queue;
    }
  else
    {
      /* otherwise insert in program order (earliest seq first) */
      for (prev=NULL, node=ready_queue;
           node && node->x.seq < rs->seq;
           prev=node, node=node->next);
    }

  if (prev)
    {
      /* insert in middle or at end */
      new_node->next = prev->next;
      prev->next = new_node;
    }
  else
    {
      /* insert at beginning */
      new_node->next = ready_queue;
      ready_queue = new_node;
    }
}
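
/* Illustrative sketch (not part of the simulator): the effect of the policy
   above on a hypothetical arrival order.  Suppose an add (seq 10), a load
   (seq 12), and a branch (seq 11) become ready in that order; priority-class
   instructions go to the head (most recently enqueued first), everything
   else stays in program order. */
#if 0
  readyq_enqueue(&add_rs);      /* queue: add(10) */
  readyq_enqueue(&load_rs);     /* queue: load(12), add(10) */
  readyq_enqueue(&branch_rs);   /* queue: branch(11), load(12), add(10) */
#endif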


/*
 * the create vector maps a logical register to a creator in the RUU (and
 * specific output operand) or to the architected register file (if the
 * rs pointer is NULL)
 */

/* an entry in the create vector */
struct CV_link {
  struct RUU_station *rs;               /* creator's reservation station */
  int odep_num;                         /* specific output operand */
};

/* a NULL create vector entry */
static struct CV_link CVLINK_NULL = { NULL, 0 };

/* initialize a create vector link */
#define CVLINK_INIT(CV, RS, ONUM)       ((CV).rs = (RS), (CV).odep_num = (ONUM))

/* size of the create vector (one entry per architected register) */
#define CV_BMAP_SZ              (BITMAP_SIZE(MD_TOTAL_REGS))

/* the create vector, NOTE: speculative copy-on-write storage is provided
   for fast recovery during wrong-path execution (see tracer_recover() for
   details on this process) */
static BITMAP_TYPE(MD_TOTAL_REGS, use_spec_cv);
static struct CV_link create_vector[MD_TOTAL_REGS];
static struct CV_link spec_create_vector[MD_TOTAL_REGS];

/* these arrays shadow the create vector and indicate when a register was
   last created */
static tick_t create_vector_rt[MD_TOTAL_REGS];
static tick_t spec_create_vector_rt[MD_TOTAL_REGS];

/* read a create vector entry */
#define CREATE_VECTOR(N)        (BITMAP_SET_P(use_spec_cv, CV_BMAP_SZ, (N))\
                                 ? spec_create_vector[N]                \
                                 : create_vector[N])

/* read a create vector timestamp entry */
#define CREATE_VECTOR_RT(N)     (BITMAP_SET_P(use_spec_cv, CV_BMAP_SZ, (N))\
                                 ? spec_create_vector_rt[N]             \
                                 : create_vector_rt[N])

/* set a create vector entry */
#define SET_CREATE_VECTOR(N, L) (spec_mode                              \
                                 ? (BITMAP_SET(use_spec_cv, CV_BMAP_SZ, (N)),\
                                    spec_create_vector[N] = (L))        \
                                 : (create_vector[N] = (L)))
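
/* Illustrative sketch (not part of the simulator): the create vector is
   copy-on-write.  The first speculative write to register N diverts to the
   shadow copy and marks the bit in use_spec_cv; reads then see the shadow
   until recovery clears the whole bitmap in one shot, instantly restoring
   the precise, non-speculative mapping.  The register number and link here
   are hypothetical. */
#if 0
  SET_CREATE_VECTOR(3, link);   /* spec_mode != 0: writes
                                   spec_create_vector[3] and sets bit 3 */
  ent = CREATE_VECTOR(3);       /* bit 3 set: reads the speculative copy */
  BITMAP_CLEAR_MAP(use_spec_cv, CV_BMAP_SZ);
  ent = CREATE_VECTOR(3);       /* bit 3 clear: back to create_vector[3] */
#endif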

/* initialize the create vector */
static void
cv_init(void)
{
  int i;

  /* initially all registers are valid in the architected register file,
     i.e., the create vector entry is CVLINK_NULL */
  for (i=0; i < MD_TOTAL_REGS; i++)
    {
      create_vector[i] = CVLINK_NULL;
      create_vector_rt[i] = 0;
      spec_create_vector[i] = CVLINK_NULL;
      spec_create_vector_rt[i] = 0;
    }

  /* all create vector entries are non-speculative */
  BITMAP_CLEAR_MAP(use_spec_cv, CV_BMAP_SZ);
}

/* dump the contents of the create vector */
static void
cv_dump(FILE *stream)                   /* output stream */
{
  int i;
  struct CV_link ent;

  if (!stream)
    stream = stderr;

  fprintf(stream, "** create vector state **\n");

  for (i=0; i < MD_TOTAL_REGS; i++)
    {
      ent = CREATE_VECTOR(i);
      if (!ent.rs)
        fprintf(stream, "[cv%02d]: from architected reg file\n", i);
      else
        fprintf(stream, "[cv%02d]: from %s, idx: %d\n",
                i, (ent.rs->in_LSQ ? "LSQ" : "RUU"),
                (int)(ent.rs - (ent.rs->in_LSQ ? LSQ : RUU)));
    }
}


/*
 * RUU_COMMIT() - instruction retirement pipeline stage
 */

/* this function commits the results of the oldest completed entries from the
   RUU and LSQ to the architected reg file, stores in the LSQ will commit
   their store data to the data cache at this point as well */
static void
ruu_commit(void)
{
  int i, lat, events, committed = 0;
  static counter_t sim_ret_insn = 0;

  /* all values must be retired to the architected reg file in program order */
  while (RUU_num > 0 && committed < ruu_commit_width)
    {
      struct RUU_station *rs = &(RUU[RUU_head]);

      if (!rs->completed)
        {
          /* the RUU head entry is not yet complete, stop committing */
          break;
        }

      /* default commit events */
      events = 0;

      /* load/stores must retire load/store queue entry as well */
      if (RUU[RUU_head].ea_comp)
        {
          /* load/store, retire head of LSQ as well */
          if (LSQ_num <= 0 || !LSQ[LSQ_head].in_LSQ)
            panic("RUU out of sync with LSQ");

          /* load/store operation must be complete */
          if (!LSQ[LSQ_head].completed)
            {
              /* load/store operation is not yet complete */
              break;
            }

          if ((MD_OP_FLAGS(LSQ[LSQ_head].op) & (F_MEM|F_STORE))
              == (F_MEM|F_STORE))
            {
              struct res_template *fu;

              /* stores must retire their store value to the cache at commit,
                 try to get a store port (functional unit allocation) */
              fu = res_get(fu_pool, MD_OP_FUCLASS(LSQ[LSQ_head].op));
              if (fu)
                {
                  /* reserve the functional unit */
                  if (fu->master->busy)
                    panic("functional unit already in use");

                  /* schedule functional unit release event */
                  fu->master->busy = fu->issuelat;

                  /* go to the data cache */
                  if (cache_dl1)
                    {
                      /* commit store value to D-cache */
                      lat =
                        cache_access(cache_dl1, Write, (LSQ[LSQ_head].addr&~3),
                                     NULL, 4, sim_cycle, NULL, NULL);
                      if (lat > cache_dl1_lat)
                        events |= PEV_CACHEMISS;
                    }

                  /* all loads and stores must access the D-TLB */
                  if (dtlb)
                    {
                      /* access the D-TLB */
                      lat =
                        cache_access(dtlb, Read, (LSQ[LSQ_head].addr & ~3),
                                     NULL, 4, sim_cycle, NULL, NULL);
                      if (lat > 1)
                        events |= PEV_TLBMISS;
                    }
                }
              else
                {
                  /* no store ports left, cannot continue to commit insts */
                  break;
                }
            }

          /* invalidate load/store operation instance */
          LSQ[LSQ_head].tag++;
          sim_slip += (sim_cycle - LSQ[LSQ_head].slip);

          /* indicate to pipeline trace that this instruction retired */
          ptrace_newstage(LSQ[LSQ_head].ptrace_seq, PST_COMMIT, events);
          ptrace_endinst(LSQ[LSQ_head].ptrace_seq);

          /* commit head of LSQ as well */
          LSQ_head = (LSQ_head + 1) % LSQ_size;
          LSQ_num--;
        }

      if (pred
          && bpred_spec_update == spec_CT
          && (MD_OP_FLAGS(rs->op) & F_CTRL))
        {
          bpred_update(pred,
                       /* branch address */rs->PC,
                       /* actual target address */rs->next_PC,
                       /* taken? */rs->next_PC != (rs->PC +
                                                   sizeof(md_inst_t)),
                       /* pred taken? */rs->pred_PC != (rs->PC +
                                                        sizeof(md_inst_t)),
                       /* correct pred? */rs->pred_PC == rs->next_PC,
                       /* opcode */rs->op,
                       /* dir predictor update pointer */&rs->dir_update);
        }

      /* invalidate RUU operation instance */
      RUU[RUU_head].tag++;
      sim_slip += (sim_cycle - RUU[RUU_head].slip);
      /* print retirement trace if in verbose mode */
      if (verbose)
        {
          sim_ret_insn++;
          myfprintf(stderr, "%10n @ 0x%08p: ", sim_ret_insn, RUU[RUU_head].PC);
          md_print_insn(RUU[RUU_head].IR, RUU[RUU_head].PC, stderr);
          if (MD_OP_FLAGS(RUU[RUU_head].op) & F_MEM)
            myfprintf(stderr, "  mem: 0x%08p", RUU[RUU_head].addr);
          fprintf(stderr, "\n");
          /* fflush(stderr); */
        }

      /* indicate to pipeline trace that this instruction retired */
      ptrace_newstage(RUU[RUU_head].ptrace_seq, PST_COMMIT, events);
      ptrace_endinst(RUU[RUU_head].ptrace_seq);

      /* commit head entry of RUU */
      RUU_head = (RUU_head + 1) % RUU_size;
      RUU_num--;

      /* one more instruction committed to architected state */
      committed++;

      for (i=0; i<MAX_ODEPS; i++)
        {
          if (rs->odep_list[i])
            panic("retired instruction has odeps\n");
        }
    }
}
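
/* Illustrative sketch (not part of the simulator): working through the
   taken/pred-taken arguments passed to bpred_update() above, with
   hypothetical PC values and a 4-byte md_inst_t.

     rs->PC == 0x1000, fall-through == 0x1004,
     rs->next_PC == 0x2000, rs->pred_PC == 0x1004:
       taken        = (0x2000 != 0x1004) = TRUE
       pred taken   = (0x1004 != 0x1004) = FALSE
       correct pred = (0x1004 == 0x2000) = FALSE  => mispredicted branch */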


/*
 * RUU_RECOVER() - squash mispredicted microarchitecture state
 */

/* recover processor microarchitecture state back to the point of the
   mis-predicted branch at RUU[BRANCH_INDEX] */
static void
ruu_recover(int branch_index)           /* index of mis-pred branch */
{
  int i, RUU_index = RUU_tail, LSQ_index = LSQ_tail;
  int RUU_prev_tail = RUU_tail, LSQ_prev_tail = LSQ_tail;

  /* recover from the tail of the RUU towards the head until the branch index
     is reached, this direction ensures that the LSQ can be synchronized with
     the RUU */

  /* go to first element to squash */
  RUU_index = (RUU_index + (RUU_size-1)) % RUU_size;
  LSQ_index = (LSQ_index + (LSQ_size-1)) % LSQ_size;

  /* traverse to older insts until the mispredicted branch is encountered */
  while (RUU_index != branch_index)
    {
      /* the RUU should not drain since the mispredicted branch will remain */
      if (!RUU_num)
        panic("empty RUU");

      /* should meet up with the tail first */
      if (RUU_index == RUU_head)
        panic("RUU head and tail broken");

      /* is this operation an effective addr calc for a load or store? */
      if (RUU[RUU_index].ea_comp)
        {
          /* should be at least one load or store in the LSQ */
          if (!LSQ_num)
            panic("RUU and LSQ out of sync");

          /* recover any resources consumed by the load or store operation */
          for (i=0; i<MAX_ODEPS; i++)
            {
              RSLINK_FREE_LIST(LSQ[LSQ_index].odep_list[i]);
              /* blow away the consuming op list */
              LSQ[LSQ_index].odep_list[i] = NULL;
            }

          /* squash this LSQ entry */
          LSQ[LSQ_index].tag++;

          /* indicate in pipetrace that this instruction was squashed */
          ptrace_endinst(LSQ[LSQ_index].ptrace_seq);

          /* go to next earlier LSQ slot */
          LSQ_prev_tail = LSQ_index;
          LSQ_index = (LSQ_index + (LSQ_size-1)) % LSQ_size;
          LSQ_num--;
        }

      /* recover any resources used by this RUU operation */
      for (i=0; i<MAX_ODEPS; i++)
        {
          RSLINK_FREE_LIST(RUU[RUU_index].odep_list[i]);
          /* blow away the consuming op list */
          RUU[RUU_index].odep_list[i] = NULL;
        }

      /* squash this RUU entry */
      RUU[RUU_index].tag++;

      /* indicate in pipetrace that this instruction was squashed */
      ptrace_endinst(RUU[RUU_index].ptrace_seq);

      /* go to next earlier slot in the RUU */
      RUU_prev_tail = RUU_index;
      RUU_index = (RUU_index + (RUU_size-1)) % RUU_size;
      RUU_num--;
    }

  /* reset head/tail pointers to point to the mis-predicted branch */
  RUU_tail = RUU_prev_tail;
  LSQ_tail = LSQ_prev_tail;

  /* revert create vector back to last precise create vector state, NOTE:
     this is accomplished by resetting all the copied-on-write bits in the
     USE_SPEC_CV bit vector */
  BITMAP_CLEAR_MAP(use_spec_cv, CV_BMAP_SZ);

  /* FIXME: could reset functional units at squash time */
}
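
/* Illustrative sketch (not part of the simulator): why the walk above runs
   tail-to-head.  Only ea_comp RUU entries own an LSQ slot, so stepping the
   two queues backward in lock-step keeps them synchronized; the instruction
   sequence below is hypothetical.

     RUU (head..tail): add, beq (mispredicted), ld.ea, sub
     LSQ (head..tail): ld
     squash sub (no LSQ slot), then ld.ea (also squashes ld in the LSQ),
     stop at beq; both tails now point just past the branch */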


/*
 * RUU_WRITEBACK() - instruction result writeback pipeline stage
 */

/* forward declarations */
static void tracer_recover(void);

/* writeback completed operation results from the functional units to the
   RUU; at this point, the output dependency chains of completing
   instructions are also walked to determine if any dependent instruction
   now has all of its register operands, if so the (nearly) ready
   instruction is inserted into the ready instruction queue */
static void
ruu_writeback(void)
{
  int i;
  struct RUU_station *rs;

  /* service all completed events */
  while ((rs = eventq_next_event()))
    {
      /* RS has completed execution and (possibly) produced a result */
      if (!OPERANDS_READY(rs) || rs->queued || !rs->issued || rs->completed)
        panic("inst completed and !ready, !issued, or completed");

      /* operation has completed */
      rs->completed = TRUE;

      /* does this operation reveal a mis-predicted branch? */
      if (rs->recover_inst)
        {
          if (rs->in_LSQ)
            panic("mis-predicted load or store?!?!?");

          /* recover processor state and reinit fetch to correct path */
          ruu_recover(rs - RUU);
          tracer_recover();
          bpred_recover(pred, rs->PC, rs->stack_recover_idx);

          /* stall fetch until I-fetch and I-decode recover */
          ruu_fetch_issue_delay = ruu_branch_penalty;

          /* continue writeback of the branch/control instruction */
        }

      /* if we speculatively update the branch predictor, do it here */
      if (pred
          && bpred_spec_update == spec_WB
          && !rs->in_LSQ
          && (MD_OP_FLAGS(rs->op) & F_CTRL))
        {
          bpred_update(pred,
                       /* branch address */rs->PC,
                       /* actual target address */rs->next_PC,
                       /* taken? */rs->next_PC != (rs->PC +
                                                   sizeof(md_inst_t)),
                       /* pred taken? */rs->pred_PC != (rs->PC +
                                                        sizeof(md_inst_t)),
                       /* correct pred? */rs->pred_PC == rs->next_PC,
                       /* opcode */rs->op,
                       /* dir predictor update pointer */&rs->dir_update);
        }

      /* entered writeback stage, indicate in pipe trace */
      ptrace_newstage(rs->ptrace_seq, PST_WRITEBACK,
                      rs->recover_inst ? PEV_MPDETECT : 0);

      /* broadcast results to consuming operations, this is more efficiently
         accomplished by walking the output dependency chains of the
         completed instruction */
      for (i=0; i<MAX_ODEPS; i++)
        {
          if (rs->onames[i] != NA)
            {
              struct CV_link link;
              struct RS_link *olink, *olink_next;

              if (rs->spec_mode)
                {
                  /* update the speculative create vector, future operations
                     get their value from a later creator or the architected
                     reg file */
                  link = spec_create_vector[rs->onames[i]];
                  if (/* !NULL */link.rs
                      && /* refs RS */(link.rs == rs && link.odep_num == i))
                    {
                      /* the result can now be read from a physical register,
                         record that fact */
                      spec_create_vector[rs->onames[i]] = CVLINK_NULL;
                      spec_create_vector_rt[rs->onames[i]] = sim_cycle;
                    }
                  /* else, creator invalidated or there is another creator */
                }
              else
                {
                  /* update the non-speculative create vector, future
                     operations get their value from a later creator or the
                     architected reg file */
                  link = create_vector[rs->onames[i]];
                  if (/* !NULL */link.rs
                      && /* refs RS */(link.rs == rs && link.odep_num == i))
                    {
                      /* the result can now be read from a physical register,
                         record that fact */
                      create_vector[rs->onames[i]] = CVLINK_NULL;
                      create_vector_rt[rs->onames[i]] = sim_cycle;
                    }
                  /* else, creator invalidated or there is another creator */
                }

              /* walk output list, queue up ready operations */
              for (olink=rs->odep_list[i]; olink; olink=olink_next)
                {
                  if (RSLINK_VALID(olink))
                    {
                      if (olink->rs->idep_ready[olink->x.opnum])
                        panic("output dependence already satisfied");

                      /* input is now ready */
                      olink->rs->idep_ready[olink->x.opnum] = TRUE;

                      /* are all the register operands of the target ready? */
                      if (OPERANDS_READY(olink->rs))
                        {
                          /* yes! enqueue instruction as ready, NOTE: stores
                             complete at dispatch, so no need to enqueue
                             them */
                          if (!olink->rs->in_LSQ
                              || ((MD_OP_FLAGS(olink->rs->op)&(F_MEM|F_STORE))
                                  == (F_MEM|F_STORE)))
                            readyq_enqueue(olink->rs);
                          /* else, ld op, issued when no mem conflict */
                        }
                    }

                  /* grab link to next element prior to free */
                  olink_next = olink->next;

                  /* free dependence link element */
                  RSLINK_FREE(olink);
                }
              /* blow away the consuming op list */
              rs->odep_list[i] = NULL;

            } /* if not NA output */

        } /* for all outputs */

    } /* for all writeback events */

}
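
/* Illustrative sketch (not part of the simulator): wakeup via output
   dependence chains.  A two-operand consumer is enqueued only by the
   *second* completing producer; the instruction names are hypothetical.

     mul r3 <- ..., add r4 <- ..., sub r5 <- r3, r4
     cycle t:   mul completes, sets sub's idep_ready slot for r3;
                OPERANDS_READY(sub) still FALSE (r4 missing)
     cycle t+1: add completes, sets sub's idep_ready slot for r4;
                OPERANDS_READY(sub) now TRUE => readyq_enqueue(&sub) */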


/*
 * LSQ_REFRESH() - memory access dependence checker/scheduler
 */

/* this function locates ready instructions whose memory dependencies have
   been satisfied, this is accomplished by walking the LSQ for loads, looking
   for blocking memory dependency conditions (e.g., an earlier store with an
   unknown address) */
#define MAX_STD_UNKNOWNS                64
static void
lsq_refresh(void)
{
  int i, j, index, n_std_unknowns;
  md_addr_t std_unknowns[MAX_STD_UNKNOWNS];

  /* scan the entire queue for ready loads: scan from the oldest instruction
     (head) until we reach the tail or an unresolved store, after which no
     other instruction will become ready */
  for (i=0, index=LSQ_head, n_std_unknowns=0;
       i < LSQ_num;
       i++, index=(index + 1) % LSQ_size)
    {
      /* terminate search for ready loads after first unresolved store,
         as no later load could be resolved in its presence */
      if (/* store? */
          (MD_OP_FLAGS(LSQ[index].op) & (F_MEM|F_STORE)) == (F_MEM|F_STORE))
        {
          if (!STORE_ADDR_READY(&LSQ[index]))
            {
              /* FIXME: a later STD + STD known could hide the STA unknown */
              /* STA unknown, blocks all later loads, stop search */
              break;
            }
          else if (!OPERANDS_READY(&LSQ[index]))
            {
              /* STA known, but STD unknown, may block a later load; record
                 this address for later referral, we use an array here
                 because for most simulations the number of entries to
                 search will be very small */
              if (n_std_unknowns == MAX_STD_UNKNOWNS)
                fatal("STD unknown array overflow, increase MAX_STD_UNKNOWNS");
              std_unknowns[n_std_unknowns++] = LSQ[index].addr;
            }
          else /* STORE_ADDR_READY() && OPERANDS_READY() */
            {
              /* a later STD known hides an earlier STD unknown */
              for (j=0; j<n_std_unknowns; j++)
                {
                  if (std_unknowns[j] == /* STA/STD known */LSQ[index].addr)
                    std_unknowns[j] = /* bogus addr */0;
                }
            }
        }

      if (/* load? */
          ((MD_OP_FLAGS(LSQ[index].op) & (F_MEM|F_LOAD)) == (F_MEM|F_LOAD))
          && /* queued? */!LSQ[index].queued
          && /* waiting? */!LSQ[index].issued
          && /* completed? */!LSQ[index].completed
          && /* regs ready? */OPERANDS_READY(&LSQ[index]))
        {
          /* no STA unknown conflict (because we got to this check), check
             for a STD unknown conflict */
          for (j=0; j<n_std_unknowns; j++)
            {
              /* found a relevant STD unknown? */
              if (std_unknowns[j] == LSQ[index].addr)
                break;
            }
          if (j == n_std_unknowns)
            {
              /* no STA or STD unknown conflicts, put load on ready queue */
              readyq_enqueue(&LSQ[index]);
            }
        }
    }
}
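
/* Illustrative sketch (not part of the simulator): one lsq_refresh() pass
   over a hypothetical LSQ, walked from head (oldest) to tail:

     st A  (addr ready, data unknown)   -> A recorded in std_unknowns[]
     ld B  (regs ready)                 -> B != A, enqueued as ready
     ld A  (regs ready)                 -> matches std_unknowns[], must wait
     st ?  (addr unknown)               -> STA unknown, scan stops here
     ld C  (regs ready)                 -> never examined this cycle */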


/*
 * RUU_ISSUE() - issue instructions to functional units
 */

/* attempt to issue all operations in the ready queue; insts in the ready
   instruction queue have all register dependencies satisfied, this function
   must then 1) ensure the instruction's memory dependencies have been
   satisfied (see lsq_refresh() for details on this process) and 2) ensure a
   functional unit is available in this cycle to commence execution of the
   operation; if all goes well, the functional unit is allocated, a writeback
   event is scheduled, and the instruction begins execution */
static void
ruu_issue(void)
{
  int i, load_lat, tlb_lat, n_issued;
  struct RS_link *node, *next_node;
  struct res_template *fu;

  /* FIXME: could be a little more efficient when scanning the ready queue */

  /* copy and then blow away the ready list, NOTE: the ready list is
     always totally reclaimed each cycle, and instructions that do not
     issue are explicitly reinserted into the ready instruction queue,
     this management strategy ensures that the ready instruction queue
     is always properly sorted */
  node = ready_queue;
  ready_queue = NULL;

  /* visit all ready instructions (i.e., insts whose register input
     dependencies have been satisfied), stop issue when no more instructions
     are available or issue bandwidth is exhausted */
  for (n_issued=0;
       node && n_issued < ruu_issue_width;
       node = next_node)
    {
      next_node = node->next;

      /* still valid? */
      if (RSLINK_VALID(node))
        {
          struct RUU_station *rs = RSLINK_RS(node);

          /* issue operation, both reg and mem deps have been satisfied */
          if (!OPERANDS_READY(rs) || !rs->queued
              || rs->issued || rs->completed)
            panic("issued inst !ready, issued, or completed");

          /* node is now un-queued */
          rs->queued = FALSE;

          if (rs->in_LSQ
              && ((MD_OP_FLAGS(rs->op) & (F_MEM|F_STORE)) == (F_MEM|F_STORE)))
            {
              /* stores complete in effectively zero time, the result is
                 written into the load/store queue, the actual store into
                 the memory system occurs when the instruction is retired
                 (see ruu_commit()) */
              rs->issued = TRUE;
              rs->completed = TRUE;
              if (rs->onames[0] || rs->onames[1])
                panic("store creates result");

              if (rs->recover_inst)
                panic("mis-predicted store");

              /* entered execute stage, indicate in pipe trace */
              ptrace_newstage(rs->ptrace_seq, PST_WRITEBACK, 0);

              /* one more inst issued */
              n_issued++;
            }
          else
            {
              /* issue the instruction to a functional unit */
              if (MD_OP_FUCLASS(rs->op) != NA)
                {
                  fu = res_get(fu_pool, MD_OP_FUCLASS(rs->op));
                  if (fu)
                    {
                      /* got one! issue inst to functional unit */
                      rs->issued = TRUE;
                      /* reserve the functional unit */
                      if (fu->master->busy)
                        panic("functional unit already in use");

                      /* schedule functional unit release event */
                      fu->master->busy = fu->issuelat;

                      /* schedule a result writeback event */
                      if (rs->in_LSQ
                          && ((MD_OP_FLAGS(rs->op) & (F_MEM|F_LOAD))
                              == (F_MEM|F_LOAD)))
                        {
                          int events = 0;

                          /* for loads, determine cache access latency:
                             first scan LSQ to see if a store forward is
                             possible, if not, access the data cache */
                          load_lat = 0;
                          i = (rs - LSQ);
                          if (i != LSQ_head)
                            {
                              for (;;)
                                {
                                  /* go to next earlier LSQ entry */
                                  i = (i + (LSQ_size-1)) % LSQ_size;

                                  /* FIXME: not dealing with partials! */
                                  if ((MD_OP_FLAGS(LSQ[i].op) & F_STORE)
                                      && (LSQ[i].addr == rs->addr))
                                    {
                                      /* hit in the LSQ */
                                      load_lat = 1;
                                      break;
                                    }

                                  /* scan finished? */
                                  if (i == LSQ_head)
                                    break;
                                }
                            }

                          /* was the value store-forwarded from the LSQ? */
                          if (!load_lat)
                            {
                              int valid_addr = MD_VALID_ADDR(rs->addr);

                              if (!spec_mode && !valid_addr)
                                sim_invalid_addrs++;

                              /* no! go to the data cache if addr is valid */
                              if (cache_dl1 && valid_addr)
                                {
                                  /* access the cache if non-faulting */
                                  load_lat =
                                    cache_access(cache_dl1, Read,
                                                 (rs->addr & ~3), NULL, 4,
                                                 sim_cycle, NULL, NULL);
                                  if (load_lat > cache_dl1_lat)
                                    events |= PEV_CACHEMISS;
                                }
                              else
                                {
                                  /* no caches defined, just use op latency */
                                  load_lat = fu->oplat;
                                }
                            }

                          /* all loads and stores must access the D-TLB */
                          if (dtlb && MD_VALID_ADDR(rs->addr))
                            {
                              /* access the D-TLB, NOTE: this code will
                                 initiate speculative TLB misses */
                              tlb_lat =
                                cache_access(dtlb, Read, (rs->addr & ~3),
                                             NULL, 4, sim_cycle, NULL, NULL);
                              if (tlb_lat > 1)
                                events |= PEV_TLBMISS;

                              /* D-cache/D-TLB accesses occur in parallel */
                              load_lat = MAX(tlb_lat, load_lat);
                            }

                          /* use computed cache access latency */
                          eventq_queue_event(rs, sim_cycle + load_lat);

                          /* entered execute stage, indicate in pipe trace */
                          ptrace_newstage(rs->ptrace_seq, PST_EXECUTE,
                                          ((rs->ea_comp ? PEV_AGEN : 0)
                                           | events));
                        }
                      else /* !load && !store */
                        {
                          /* use deterministic functional unit latency */
                          eventq_queue_event(rs, sim_cycle + fu->oplat);

                          /* entered execute stage, indicate in pipe trace */
                          ptrace_newstage(rs->ptrace_seq, PST_EXECUTE,
                                          rs->ea_comp ? PEV_AGEN : 0);
                        }

                      /* one more inst issued */
                      n_issued++;
                    }
                  else /* no functional unit */
                    {
                      /* insufficient functional unit resources, put the
                         operation back onto the ready list, we'll try to
                         issue it again next cycle */
                      readyq_enqueue(rs);
                    }
                }
              else /* does not require a functional unit! */
                {
                  /* FIXME: need better solution for these */
                  /* the instruction does not need a functional unit */
                  rs->issued = TRUE;

                  /* schedule a result event */
                  eventq_queue_event(rs, sim_cycle + 1);

                  /* entered execute stage, indicate in pipe trace */
                  ptrace_newstage(rs->ptrace_seq, PST_EXECUTE,
                                  rs->ea_comp ? PEV_AGEN : 0);

                  /* one more inst issued */
                  n_issued++;
                }
            } /* !store */

        }
      /* else, RUU entry was squashed */

      /* reclaim ready list entry, NOTE: this is done whether or not the
         instruction issued, since the instruction was once again reinserted
         into the ready queue if it did not issue, this ensures that the
         ready queue is always properly sorted */
      RSLINK_FREE(node);
    }

  /* put any instructions not issued back into the ready queue, go through
     normal channels to ensure instructions stay ordered correctly */
  for (; node; node = next_node)
    {
      next_node = node->next;

      /* still valid? */
      if (RSLINK_VALID(node))
        {
          struct RUU_station *rs = RSLINK_RS(node);

          /* node is now un-queued */
          rs->queued = FALSE;

          /* not issued, put the operation back onto the ready list, we'll
             try to issue it again next cycle */
          readyq_enqueue(rs);
        }
      /* else, RUU entry was squashed */

      /* reclaim ready list entry, NOTE: this is done whether or not the
         instruction issued, since the instruction was once again reinserted
         into the ready queue if it did not issue, this ensures that the
         ready queue is always properly sorted */
      RSLINK_FREE(node);
    }
}
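
/* Illustrative sketch (not part of the simulator): how a load's latency is
   chosen above, with hypothetical numbers.  A load at LSQ slot i scans
   backward toward the head; if any earlier store matches its address, the
   value is forwarded with load_lat = 1.  Otherwise the D-cache is accessed,
   and because the D-TLB is probed in parallel the final latency is
   MAX(tlb_lat, load_lat): e.g., a 1-cycle D-cache hit under a 30-cycle TLB
   miss still costs 30 cycles. */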


/*
 * routines for generating on-the-fly instruction traces with support
 * for control and data misspeculation modeling
 */

/* integer register file */
#define R_BMAP_SZ       (BITMAP_SIZE(MD_NUM_IREGS))
static BITMAP_TYPE(MD_NUM_IREGS, use_spec_R);
static md_gpr_t spec_regs_R;

/* floating point register file */
#define F_BMAP_SZ       (BITMAP_SIZE(MD_NUM_FREGS))
static BITMAP_TYPE(MD_NUM_FREGS, use_spec_F);
static md_fpr_t spec_regs_F;

/* miscellaneous registers */
#define C_BMAP_SZ       (BITMAP_SIZE(MD_NUM_CREGS))
static BITMAP_TYPE(MD_NUM_CREGS, use_spec_C);
static md_ctrl_t spec_regs_C;
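
/* Illustrative sketch (not part of the simulator): these shadow register
   files follow the same copy-on-write pattern as the create vector; a
   hypothetical speculative read of integer register N would look like
   this, with squash recovery again a single BITMAP_CLEAR_MAP() per file. */
#if 0
  md_gpr_t val = BITMAP_SET_P(use_spec_R, R_BMAP_SZ, N)
                 ? spec_regs_R[N]       /* written on the wrong path */
                 : regs.regs_R[N];      /* clean architected value */
#endif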

/* dump speculative register state */
static void
rspec_dump(FILE *stream)                /* output stream */
{
  int i;

  if (!stream)
    stream = stderr;