From 39ebd845aa6259f1ddd3fc16e61205838a4b1341 Mon Sep 17 00:00:00 2001 From: Danila Fedorin Date: Sat, 5 Dec 2020 19:34:25 -0800 Subject: [PATCH] Add changed source files (one seems buggy) --- bpred.c | 1039 +++++++++++ bpred.h | 289 +++ sim-outorder.c | 4637 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 5965 insertions(+) create mode 100644 bpred.c create mode 100644 bpred.h create mode 100644 sim-outorder.c diff --git a/bpred.c b/bpred.c new file mode 100644 index 0000000..d41f827 --- /dev/null +++ b/bpred.c @@ -0,0 +1,1039 @@ +/* bpred.c - branch predictor routines */ + +/* SimpleScalar(TM) Tool Suite + * Copyright (C) 1994-2003 by Todd M. Austin, Ph.D. and SimpleScalar, LLC. + * All Rights Reserved. + * + * THIS IS A LEGAL DOCUMENT, BY USING SIMPLESCALAR, + * YOU ARE AGREEING TO THESE TERMS AND CONDITIONS. + * + * No portion of this work may be used by any commercial entity, or for any + * commercial purpose, without the prior, written permission of SimpleScalar, + * LLC (info@simplescalar.com). Nonprofit and noncommercial use is permitted + * as described below. + * + * 1. SimpleScalar is provided AS IS, with no warranty of any kind, express + * or implied. The user of the program accepts full responsibility for the + * application of the program and the use of any results. + * + * 2. Nonprofit and noncommercial use is encouraged. SimpleScalar may be + * downloaded, compiled, executed, copied, and modified solely for nonprofit, + * educational, noncommercial research, and noncommercial scholarship + * purposes provided that this notice in its entirety accompanies all copies. + * Copies of the modified software can be delivered to persons who use it + * solely for nonprofit, educational, noncommercial research, and + * noncommercial scholarship purposes provided that this notice in its + * entirety accompanies all copies. + * + * 3. ALL COMMERCIAL USE, AND ALL USE BY FOR PROFIT ENTITIES, IS EXPRESSLY + * PROHIBITED WITHOUT A LICENSE FROM SIMPLESCALAR, LLC (info@simplescalar.com). + * + * 4. No nonprofit user may place any restrictions on the use of this software, + * including as modified by the user, by any other authorized user. + * + * 5. Noncommercial and nonprofit users may distribute copies of SimpleScalar + * in compiled or executable form as set forth in Section 2, provided that + * either: (A) it is accompanied by the corresponding machine-readable source + * code, or (B) it is accompanied by a written offer, with no time limit, to + * give anyone a machine-readable copy of the corresponding source code in + * return for reimbursement of the cost of distribution. This written offer + * must permit verbatim duplication by anyone, or (C) it is distributed by + * someone who received only the executable form, and is accompanied by a + * copy of the written offer of source code. + * + * 6. SimpleScalar was developed by Todd M. Austin, Ph.D. The tool suite is + * currently maintained by SimpleScalar LLC (info@simplescalar.com). US Mail: + * 2395 Timbercrest Court, Ann Arbor, MI 48105. + * + * Copyright (C) 1994-2003 by Todd M. Austin, Ph.D. and SimpleScalar, LLC. 
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <math.h>
+
+#include "host.h"
+#include "misc.h"
+#include "machine.h"
+#include "bpred.h"
+
+/* turn this on to enable the SimpleScalar 2.0 RAS bug */
+/* #define RAS_BUG_COMPATIBLE */
+
+/* create a branch predictor */
+struct bpred_t *                      /* branch predictor instance */
+bpred_create(enum bpred_class class,  /* type of predictor to create */
+             unsigned int bimod_size, /* bimod table size */
+             unsigned int l1size,     /* 2lev l1 table size */
+             unsigned int l2size,     /* 2lev l2 table size */
+             unsigned int meta_size,  /* meta table size */
+             unsigned int shift_width, /* history register width */
+             unsigned int xor,        /* history xor address flag */
+             unsigned int btb_sets,   /* number of sets in BTB */
+             unsigned int btb_assoc,  /* BTB associativity */
+             unsigned int retstack_size) /* num entries in ret-addr stack */
+{
+  struct bpred_t *pred;
+
+  if (!(pred = calloc(1, sizeof(struct bpred_t))))
+    fatal("out of virtual memory");
+
+  pred->class = class;
+
+  switch (class) {
+  case BPredComb:
+    /* bimodal component */
+    pred->dirpred.bimod =
+      bpred_dir_create(BPred2bit, bimod_size, 0, 0, 0);
+
+    /* 2-level component */
+    pred->dirpred.twolev =
+      bpred_dir_create(BPred2Level, l1size, l2size, shift_width, xor);
+
+    /* metapredictor component */
+    pred->dirpred.meta =
+      bpred_dir_create(BPred2bit, meta_size, 0, 0, 0);
+
+    break;
+
+  case BPred2Level:
+    pred->dirpred.twolev =
+      bpred_dir_create(class, l1size, l2size, shift_width, xor);
+
+    break;
+
+  case BPred2bit:
+    pred->dirpred.bimod =
+      bpred_dir_create(class, bimod_size, 0, 0, 0);
+
+    break;
+
+  case BPred3bit:
+    pred->dirpred.bimod =
+      /* Instead of adding another argument to this function,
+       * just re-use the bimodal size. */
+      bpred_dir_create(class, bimod_size, 0, 0, 0);
+
+    break;
+
+  case BPredTaken:
+  case BPredNotTaken:
+    /* no other state */
+    break;
+
+  default:
+    panic("bogus predictor class");
+  }
+
+  /* allocate ret-addr stack */
+  switch (class) {
+  case BPredComb:
+  case BPred2Level:
+  case BPred2bit:
+  case BPred3bit:
+    {
+      int i;
+
+      /* allocate BTB */
+      if (!btb_sets || (btb_sets & (btb_sets-1)) != 0)
+        fatal("number of BTB sets must be non-zero and a power of two");
+      if (!btb_assoc || (btb_assoc & (btb_assoc-1)) != 0)
+        fatal("BTB associativity must be non-zero and a power of two");
+
+      if (!(pred->btb.btb_data = calloc(btb_sets * btb_assoc,
+                                        sizeof(struct bpred_btb_ent_t))))
+        fatal("cannot allocate BTB");
+
+      pred->btb.sets = btb_sets;
+      pred->btb.assoc = btb_assoc;
+
+      if (pred->btb.assoc > 1)
+        for (i=0; i < (pred->btb.assoc*pred->btb.sets); i++)
+          {
+            if (i % pred->btb.assoc != pred->btb.assoc - 1)
+              pred->btb.btb_data[i].next = &pred->btb.btb_data[i+1];
+            else
+              pred->btb.btb_data[i].next = NULL;
+
+            if (i % pred->btb.assoc != pred->btb.assoc - 1)
+              pred->btb.btb_data[i+1].prev = &pred->btb.btb_data[i];
+          }
+
+      /* allocate retstack */
+      if ((retstack_size & (retstack_size-1)) != 0)
+        fatal("Return-address-stack size must be zero or a power of two");
+
+      pred->retstack.size = retstack_size;
+      if (retstack_size)
+        if (!(pred->retstack.stack = calloc(retstack_size,
+                                            sizeof(struct bpred_btb_ent_t))))
+          fatal("cannot allocate return-address-stack");
+      pred->retstack.tos = retstack_size - 1;
+
+      break;
+    }
+
+  case BPredTaken:
+  case BPredNotTaken:
+    /* no other state */
+    break;
+
+  default:
+    panic("bogus predictor class");
+  }
+
+  return pred;
+}
+
+/* create a branch direction predictor */
+struct bpred_dir_t *          /* branch direction predictor instance */
+bpred_dir_create (
+  enum bpred_class class,    /* type of predictor to create */
+  unsigned int l1size,       /* level-1 table size */
+  unsigned int l2size,       /* level-2 table size (if relevant) */
+  unsigned int shift_width,  /* history register width */
+  unsigned int xor)          /* history xor address flag */
+{
+  struct bpred_dir_t *pred_dir;
+  unsigned int cnt;
+  int flipflop;
+
+  if (!(pred_dir = calloc(1, sizeof(struct bpred_dir_t))))
+    fatal("out of virtual memory");
+
+  pred_dir->class = class;
+
+  cnt = -1;
+  switch (class) {
+  case BPred2Level:
+    {
+      if (!l1size || (l1size & (l1size-1)) != 0)
+        fatal("level-1 size, `%d', must be non-zero and a power of two",
+              l1size);
+      pred_dir->config.two.l1size = l1size;
+
+      if (!l2size || (l2size & (l2size-1)) != 0)
+        fatal("level-2 size, `%d', must be non-zero and a power of two",
+              l2size);
+      pred_dir->config.two.l2size = l2size;
+
+      if (!shift_width || shift_width > 30)
+        fatal("shift register width, `%d', must be between 1 and 30",
+              shift_width);
+      pred_dir->config.two.shift_width = shift_width;
+
+      pred_dir->config.two.xor = xor;
+      pred_dir->config.two.shiftregs = calloc(l1size, sizeof(int));
+      if (!pred_dir->config.two.shiftregs)
+        fatal("cannot allocate shift register table");
+
+      pred_dir->config.two.l2table = calloc(l2size, sizeof(unsigned char));
+      if (!pred_dir->config.two.l2table)
+        fatal("cannot allocate second level table");
+
+      /* initialize counters to weakly this-or-that */
+      flipflop = 1;
+      for (cnt = 0; cnt < l2size; cnt++)
+        {
+          pred_dir->config.two.l2table[cnt] = flipflop;
+          flipflop = 3 - flipflop;
+        }
+
+      break;
+    }
+
+  case BPred2bit:
+    if (!l1size || (l1size & (l1size-1)) != 0)
+      fatal("2bit table size, `%d', must be non-zero and a power of two",
+            l1size);
+    pred_dir->config.bimod.size = l1size;
+    if (!(pred_dir->config.bimod.table =
+          calloc(l1size, sizeof(unsigned char))))
+      fatal("cannot allocate 2bit storage");
+    /* initialize counters to weakly this-or-that */
+    flipflop = 1;
+    for (cnt = 0; cnt < l1size; cnt++)
+      {
+        pred_dir->config.bimod.table[cnt] = flipflop;
+        flipflop = 3 - flipflop;
+      }
+
+    break;
+
+  case BPred3bit:
+    if (!l1size || (l1size & (l1size-1)) != 0)
+      fatal("3bit table size, `%d', must be non-zero and a power of two",
+            l1size);
+    pred_dir->config.threebit.size = l1size;
+    if (!(pred_dir->config.threebit.table =
+          calloc(l1size, sizeof(unsigned char))))
+      fatal("cannot allocate 3bit storage");
+    /* initialize counters to weakly this-or-that */
+    flipflop = 3;
+    for (cnt = 0; cnt < l1size; cnt++)
+      {
+        pred_dir->config.threebit.table[cnt] = flipflop;
+        flipflop = 7 - flipflop;
+      }
+
+    break;
+
+  case BPredTaken:
+  case BPredNotTaken:
+    /* no other state */
+    break;
+
+  default:
+    panic("bogus branch direction predictor class");
+  }
+
+  return pred_dir;
+}
+
+/* print branch direction predictor configuration */
+void
+bpred_dir_config(
+  struct bpred_dir_t *pred_dir, /* branch direction predictor instance */
+  char name[],                  /* predictor name */
+  FILE *stream)                 /* output stream */
+{
+  switch (pred_dir->class) {
+  case BPred2Level:
+    fprintf(stream,
+      "pred_dir: %s: 2-lvl: %d l1-sz, %d bits/ent, %s xor, %d l2-sz, direct-mapped\n",
+      name, pred_dir->config.two.l1size, pred_dir->config.two.shift_width,
+      pred_dir->config.two.xor ?
"" : "no", pred_dir->config.two.l2size); + break; + + case BPred2bit: + fprintf(stream, "pred_dir: %s: 2-bit: %d entries, direct-mapped\n", + name, pred_dir->config.bimod.size); + break; + + case BPred3bit: + fprintf(stream, "pred_dir: %s: 3-bit: %d entries, direct-mapped\n", + name, pred_dir->config.threebit.size); + break; + + case BPredTaken: + fprintf(stream, "pred_dir: %s: predict taken\n", name); + break; + + case BPredNotTaken: + fprintf(stream, "pred_dir: %s: predict not taken\n", name); + break; + + default: + panic("bogus branch direction predictor class"); + } +} + +/* print branch predictor configuration */ +void +bpred_config(struct bpred_t *pred, /* branch predictor instance */ + FILE *stream) /* output stream */ +{ + switch (pred->class) { + case BPredComb: + bpred_dir_config (pred->dirpred.bimod, "bimod", stream); + bpred_dir_config (pred->dirpred.twolev, "2lev", stream); + bpred_dir_config (pred->dirpred.meta, "meta", stream); + fprintf(stream, "btb: %d sets x %d associativity", + pred->btb.sets, pred->btb.assoc); + fprintf(stream, "ret_stack: %d entries", pred->retstack.size); + break; + + case BPred2Level: + bpred_dir_config (pred->dirpred.twolev, "2lev", stream); + fprintf(stream, "btb: %d sets x %d associativity", + pred->btb.sets, pred->btb.assoc); + fprintf(stream, "ret_stack: %d entries", pred->retstack.size); + break; + + case BPred2bit: + bpred_dir_config (pred->dirpred.bimod, "bimod", stream); + fprintf(stream, "btb: %d sets x %d associativity", + pred->btb.sets, pred->btb.assoc); + fprintf(stream, "ret_stack: %d entries", pred->retstack.size); + break; + + case BPred3bit: + bpred_dir_config (pred->dirpred.bimod, "threebit", stream); + fprintf(stream, "btb: %d sets x %d associativity", + pred->btb.sets, pred->btb.assoc); + fprintf(stream, "ret_stack: %d entries", pred->retstack.size); + break; + + case BPredTaken: + bpred_dir_config (pred->dirpred.bimod, "taken", stream); + break; + case BPredNotTaken: + bpred_dir_config (pred->dirpred.bimod, "nottaken", stream); + break; + + default: + panic("bogus branch predictor class"); + } +} + +/* print predictor stats */ +void +bpred_stats(struct bpred_t *pred, /* branch predictor instance */ + FILE *stream) /* output stream */ +{ + fprintf(stream, "pred: addr-prediction rate = %f\n", + (double)pred->addr_hits/(double)(pred->addr_hits+pred->misses)); + fprintf(stream, "pred: dir-prediction rate = %f\n", + (double)pred->dir_hits/(double)(pred->dir_hits+pred->misses)); +} + +/* register branch predictor stats */ +void +bpred_reg_stats(struct bpred_t *pred, /* branch predictor instance */ + struct stat_sdb_t *sdb) /* stats database */ +{ + char buf[512], buf1[512], *name; + + /* get a name for this predictor */ + switch (pred->class) + { + case BPredComb: + name = "bpred_comb"; + break; + case BPred2Level: + name = "bpred_2lev"; + break; + case BPred2bit: + name = "bpred_bimod"; + break; + case BPred3bit: + name = "bpred_threebit"; + break; + case BPredTaken: + name = "bpred_taken"; + break; + case BPredNotTaken: + name = "bpred_nottaken"; + break; + default: + panic("bogus branch predictor class"); + } + + sprintf(buf, "%s.lookups", name); + stat_reg_counter(sdb, buf, "total number of bpred lookups", + &pred->lookups, 0, NULL); + sprintf(buf, "%s.updates", name); + sprintf(buf1, "%s.dir_hits + %s.misses", name, name); + stat_reg_formula(sdb, buf, "total number of updates", buf1, "%12.0f"); + sprintf(buf, "%s.addr_hits", name); + stat_reg_counter(sdb, buf, "total number of address-predicted hits", + &pred->addr_hits, 0, 
NULL); + sprintf(buf, "%s.dir_hits", name); + stat_reg_counter(sdb, buf, + "total number of direction-predicted hits " + "(includes addr-hits)", + &pred->dir_hits, 0, NULL); + if (pred->class == BPredComb) + { + sprintf(buf, "%s.used_bimod", name); + stat_reg_counter(sdb, buf, + "total number of bimodal predictions used", + &pred->used_bimod, 0, NULL); + sprintf(buf, "%s.used_2lev", name); + stat_reg_counter(sdb, buf, + "total number of 2-level predictions used", + &pred->used_2lev, 0, NULL); + } + sprintf(buf, "%s.misses", name); + stat_reg_counter(sdb, buf, "total number of misses", &pred->misses, 0, NULL); + sprintf(buf, "%s.jr_hits", name); + stat_reg_counter(sdb, buf, + "total number of address-predicted hits for JR's", + &pred->jr_hits, 0, NULL); + sprintf(buf, "%s.jr_seen", name); + stat_reg_counter(sdb, buf, + "total number of JR's seen", + &pred->jr_seen, 0, NULL); + sprintf(buf, "%s.jr_non_ras_hits.PP", name); + stat_reg_counter(sdb, buf, + "total number of address-predicted hits for non-RAS JR's", + &pred->jr_non_ras_hits, 0, NULL); + sprintf(buf, "%s.jr_non_ras_seen.PP", name); + stat_reg_counter(sdb, buf, + "total number of non-RAS JR's seen", + &pred->jr_non_ras_seen, 0, NULL); + sprintf(buf, "%s.bpred_addr_rate", name); + sprintf(buf1, "%s.addr_hits / %s.updates", name, name); + stat_reg_formula(sdb, buf, + "branch address-prediction rate (i.e., addr-hits/updates)", + buf1, "%9.4f"); + sprintf(buf, "%s.bpred_dir_rate", name); + sprintf(buf1, "%s.dir_hits / %s.updates", name, name); + stat_reg_formula(sdb, buf, + "branch direction-prediction rate (i.e., all-hits/updates)", + buf1, "%9.4f"); + sprintf(buf, "%s.bpred_jr_rate", name); + sprintf(buf1, "%s.jr_hits / %s.jr_seen", name, name); + stat_reg_formula(sdb, buf, + "JR address-prediction rate (i.e., JR addr-hits/JRs seen)", + buf1, "%9.4f"); + sprintf(buf, "%s.bpred_jr_non_ras_rate.PP", name); + sprintf(buf1, "%s.jr_non_ras_hits.PP / %s.jr_non_ras_seen.PP", name, name); + stat_reg_formula(sdb, buf, + "non-RAS JR addr-pred rate (ie, non-RAS JR hits/JRs seen)", + buf1, "%9.4f"); + sprintf(buf, "%s.retstack_pushes", name); + stat_reg_counter(sdb, buf, + "total number of address pushed onto ret-addr stack", + &pred->retstack_pushes, 0, NULL); + sprintf(buf, "%s.retstack_pops", name); + stat_reg_counter(sdb, buf, + "total number of address popped off of ret-addr stack", + &pred->retstack_pops, 0, NULL); + sprintf(buf, "%s.used_ras.PP", name); + stat_reg_counter(sdb, buf, + "total number of RAS predictions used", + &pred->used_ras, 0, NULL); + sprintf(buf, "%s.ras_hits.PP", name); + stat_reg_counter(sdb, buf, + "total number of RAS hits", + &pred->ras_hits, 0, NULL); + sprintf(buf, "%s.ras_rate.PP", name); + sprintf(buf1, "%s.ras_hits.PP / %s.used_ras.PP", name, name); + stat_reg_formula(sdb, buf, + "RAS prediction rate (i.e., RAS hits/used RAS)", + buf1, "%9.4f"); +} + +void +bpred_after_priming(struct bpred_t *bpred) +{ + if (bpred == NULL) + return; + + bpred->lookups = 0; + bpred->addr_hits = 0; + bpred->dir_hits = 0; + bpred->used_ras = 0; + bpred->used_bimod = 0; + bpred->used_2lev = 0; + bpred->jr_hits = 0; + bpred->jr_seen = 0; + bpred->misses = 0; + bpred->retstack_pops = 0; + bpred->retstack_pushes = 0; + bpred->ras_hits = 0; +} + +#define BIMOD_HASH(PRED, ADDR) \ + ((((ADDR) >> 19) ^ ((ADDR) >> MD_BR_SHIFT)) & ((PRED)->config.bimod.size-1)) + /* was: ((baddr >> 16) ^ baddr) & (pred->dirpred.bimod.size-1) */ + +#define THREEBIT_HASH(PRED, ADDR) \ + ((((ADDR) >> 19) ^ ((ADDR) >> MD_BR_SHIFT)) & 
((PRED)->config.threebit.size-1)) + +/* predicts a branch direction */ +char * /* pointer to counter */ +bpred_dir_lookup(struct bpred_dir_t *pred_dir, /* branch dir predictor inst */ + md_addr_t baddr) /* branch address */ +{ + unsigned char *p = NULL; + + /* Except for jumps, get a pointer to direction-prediction bits */ + switch (pred_dir->class) { + case BPred2Level: + { + int l1index, l2index; + + /* traverse 2-level tables */ + l1index = (baddr >> MD_BR_SHIFT) & (pred_dir->config.two.l1size - 1); + l2index = pred_dir->config.two.shiftregs[l1index]; + if (pred_dir->config.two.xor) + { +#if 1 + /* this L2 index computation is more "compatible" to McFarling's + verison of it, i.e., if the PC xor address component is only + part of the index, take the lower order address bits for the + other part of the index, rather than the higher order ones */ + l2index = (((l2index ^ (baddr >> MD_BR_SHIFT)) + & ((1 << pred_dir->config.two.shift_width) - 1)) + | ((baddr >> MD_BR_SHIFT) + << pred_dir->config.two.shift_width)); +#else + l2index = l2index ^ (baddr >> MD_BR_SHIFT); +#endif + } + else + { + l2index = + l2index + | ((baddr >> MD_BR_SHIFT) << pred_dir->config.two.shift_width); + } + l2index = l2index & (pred_dir->config.two.l2size - 1); + + /* get a pointer to prediction state information */ + p = &pred_dir->config.two.l2table[l2index]; + } + break; + case BPred2bit: + p = &pred_dir->config.bimod.table[BIMOD_HASH(pred_dir, baddr)]; + break; + case BPred3bit: + p = &pred_dir->config.threebit.table[THREEBIT_HASH(pred_dir, baddr)]; + break; + case BPredTaken: + case BPredNotTaken: + break; + default: + panic("bogus branch direction predictor class"); + } + + return (char *)p; +} + +/* probe a predictor for a next fetch address, the predictor is probed + with branch address BADDR, the branch target is BTARGET (used for + static predictors), and OP is the instruction opcode (used to simulate + predecode bits; a pointer to the predictor state entry (or null for jumps) + is returned in *DIR_UPDATE_PTR (used for updating predictor state), + and the non-speculative top-of-stack is returned in stack_recover_idx + (used for recovering ret-addr stack after mis-predict). 
*/ +md_addr_t /* predicted branch target addr */ +bpred_lookup(struct bpred_t *pred, /* branch predictor instance */ + md_addr_t baddr, /* branch address */ + md_addr_t btarget, /* branch target if taken */ + enum md_opcode op, /* opcode of instruction */ + int is_call, /* non-zero if inst is fn call */ + int is_return, /* non-zero if inst is fn return */ + struct bpred_update_t *dir_update_ptr, /* pred state pointer */ + int *stack_recover_idx) /* Non-speculative top-of-stack; + * used on mispredict recovery */ +{ + struct bpred_btb_ent_t *pbtb = NULL; + int index, i; + + if (!dir_update_ptr) + panic("no bpred update record"); + + /* if this is not a branch, return not-taken */ + if (!(MD_OP_FLAGS(op) & F_CTRL)) + return 0; + + pred->lookups++; + + dir_update_ptr->dir.ras = FALSE; + dir_update_ptr->pdir1 = NULL; + dir_update_ptr->pdir2 = NULL; + dir_update_ptr->pmeta = NULL; + /* Except for jumps, get a pointer to direction-prediction bits */ + switch (pred->class) { + case BPredComb: + if ((MD_OP_FLAGS(op) & (F_CTRL|F_UNCOND)) != (F_CTRL|F_UNCOND)) + { + char *bimod, *twolev, *meta; + bimod = bpred_dir_lookup (pred->dirpred.bimod, baddr); + twolev = bpred_dir_lookup (pred->dirpred.twolev, baddr); + meta = bpred_dir_lookup (pred->dirpred.meta, baddr); + dir_update_ptr->pmeta = meta; + dir_update_ptr->dir.meta = (*meta >= 2); + dir_update_ptr->dir.bimod = (*bimod >= 2); + dir_update_ptr->dir.twolev = (*twolev >= 2); + if (*meta >= 2) + { + dir_update_ptr->pdir1 = twolev; + dir_update_ptr->pdir2 = bimod; + } + else + { + dir_update_ptr->pdir1 = bimod; + dir_update_ptr->pdir2 = twolev; + } + } + break; + case BPred2Level: + if ((MD_OP_FLAGS(op) & (F_CTRL|F_UNCOND)) != (F_CTRL|F_UNCOND)) + { + dir_update_ptr->pdir1 = + bpred_dir_lookup (pred->dirpred.twolev, baddr); + } + break; + case BPred2bit: + case BPred3bit: + if ((MD_OP_FLAGS(op) & (F_CTRL|F_UNCOND)) != (F_CTRL|F_UNCOND)) + { + dir_update_ptr->pdir1 = + bpred_dir_lookup (pred->dirpred.bimod, baddr); + } + break; + case BPredTaken: + return btarget; + case BPredNotTaken: + if ((MD_OP_FLAGS(op) & (F_CTRL|F_UNCOND)) != (F_CTRL|F_UNCOND)) + { + return baddr + sizeof(md_inst_t); + } + else + { + return btarget; + } + default: + panic("bogus predictor class"); + } + + /* + * We have a stateful predictor, and have gotten a pointer into the + * direction predictor (except for jumps, for which the ptr is null) + */ + + /* record pre-pop TOS; if this branch is executed speculatively + * and is squashed, we'll restore the TOS and hope the data + * wasn't corrupted in the meantime. */ + if (pred->retstack.size) + *stack_recover_idx = pred->retstack.tos; + else + *stack_recover_idx = 0; + + /* if this is a return, pop return-address stack */ + if (is_return && pred->retstack.size) + { + md_addr_t target = pred->retstack.stack[pred->retstack.tos].target; + pred->retstack.tos = (pred->retstack.tos + pred->retstack.size - 1) + % pred->retstack.size; + pred->retstack_pops++; + dir_update_ptr->dir.ras = TRUE; /* using RAS here */ + return target; + } + +#ifndef RAS_BUG_COMPATIBLE + /* if function call, push return-address onto return-address stack */ + if (is_call && pred->retstack.size) + { + pred->retstack.tos = (pred->retstack.tos + 1)% pred->retstack.size; + pred->retstack.stack[pred->retstack.tos].target = + baddr + sizeof(md_inst_t); + pred->retstack_pushes++; + } +#endif /* !RAS_BUG_COMPATIBLE */ + + /* not a return. 
Get a pointer into the BTB */ + index = (baddr >> MD_BR_SHIFT) & (pred->btb.sets - 1); + + if (pred->btb.assoc > 1) + { + index *= pred->btb.assoc; + + /* Now we know the set; look for a PC match */ + for (i = index; i < (index+pred->btb.assoc) ; i++) + if (pred->btb.btb_data[i].addr == baddr) + { + /* match */ + pbtb = &pred->btb.btb_data[i]; + break; + } + } + else + { + pbtb = &pred->btb.btb_data[index]; + if (pbtb->addr != baddr) + pbtb = NULL; + } + + /* + * We now also have a pointer into the BTB for a hit, or NULL otherwise + */ + + /* if this is a jump, ignore predicted direction; we know it's taken. */ + if ((MD_OP_FLAGS(op) & (F_CTRL|F_UNCOND)) == (F_CTRL|F_UNCOND)) + { + return (pbtb ? pbtb->target : 1); + } + + char pred_cutoff = (pred->class == BPred3bit) ? 4 : 2; + /* otherwise we have a conditional branch */ + if (pbtb == NULL) + { + /* BTB miss -- just return a predicted direction */ + return ((*(dir_update_ptr->pdir1) >= pred_cutoff) + ? /* taken */ 1 + : /* not taken */ 0); + } + else + { + /* BTB hit, so return target if it's a predicted-taken branch */ + return ((*(dir_update_ptr->pdir1) >= pred_cutoff) + ? /* taken */ pbtb->target + : /* not taken */ 0); + } +} + +/* Speculative execution can corrupt the ret-addr stack. So for each + * lookup we return the top-of-stack (TOS) at that point; a mispredicted + * branch, as part of its recovery, restores the TOS using this value -- + * hopefully this uncorrupts the stack. */ +void +bpred_recover(struct bpred_t *pred, /* branch predictor instance */ + md_addr_t baddr, /* branch address */ + int stack_recover_idx) /* Non-speculative top-of-stack; + * used on mispredict recovery */ +{ + if (pred == NULL) + return; + + pred->retstack.tos = stack_recover_idx; +} + +/* update the branch predictor, only useful for stateful predictors; updates + entry for instruction type OP at address BADDR. BTB only gets updated + for branches which are taken. Inst was determined to jump to + address BTARGET and was taken if TAKEN is non-zero. Predictor + statistics are updated with result of prediction, indicated by CORRECT and + PRED_TAKEN, predictor state to be updated is indicated by *DIR_UPDATE_PTR + (may be NULL for jumps, which shouldn't modify state bits). Note if + bpred_update is done speculatively, branch-prediction may get polluted. */ +void +bpred_update(struct bpred_t *pred, /* branch predictor instance */ + md_addr_t baddr, /* branch address */ + md_addr_t btarget, /* resolved branch target */ + int taken, /* non-zero if branch was taken */ + int pred_taken, /* non-zero if branch was pred taken */ + int correct, /* was earlier addr prediction ok? 
*/ + enum md_opcode op, /* opcode of instruction */ + struct bpred_update_t *dir_update_ptr)/* pred state pointer */ +{ + struct bpred_btb_ent_t *pbtb = NULL; + struct bpred_btb_ent_t *lruhead = NULL, *lruitem = NULL; + int index, i; + + /* don't change bpred state for non-branch instructions or if this + * is a stateless predictor*/ + if (!(MD_OP_FLAGS(op) & F_CTRL)) + return; + + /* Have a branch here */ + + if (correct) + pred->addr_hits++; + + if (!!pred_taken == !!taken) + pred->dir_hits++; + else + pred->misses++; + + if (dir_update_ptr->dir.ras) + { + pred->used_ras++; + if (correct) + pred->ras_hits++; + } + else if ((MD_OP_FLAGS(op) & (F_CTRL|F_COND)) == (F_CTRL|F_COND)) + { + if (dir_update_ptr->dir.meta) + pred->used_2lev++; + else + pred->used_bimod++; + } + + /* keep stats about JR's; also, but don't change any bpred state for JR's + * which are returns unless there's no retstack */ + if (MD_IS_INDIR(op)) + { + pred->jr_seen++; + if (correct) + pred->jr_hits++; + + if (!dir_update_ptr->dir.ras) + { + pred->jr_non_ras_seen++; + if (correct) + pred->jr_non_ras_hits++; + } + else + { + /* return that used the ret-addr stack; no further work to do */ + return; + } + } + + /* Can exit now if this is a stateless predictor */ + if (pred->class == BPredNotTaken || pred->class == BPredTaken) + return; + + /* + * Now we know the branch didn't use the ret-addr stack, and that this + * is a stateful predictor + */ + +#ifdef RAS_BUG_COMPATIBLE + /* if function call, push return-address onto return-address stack */ + if (MD_IS_CALL(op) && pred->retstack.size) + { + pred->retstack.tos = (pred->retstack.tos + 1)% pred->retstack.size; + pred->retstack.stack[pred->retstack.tos].target = + baddr + sizeof(md_inst_t); + pred->retstack_pushes++; + } +#endif /* RAS_BUG_COMPATIBLE */ + + /* update L1 table if appropriate */ + /* L1 table is updated unconditionally for combining predictor too */ + if ((MD_OP_FLAGS(op) & (F_CTRL|F_UNCOND)) != (F_CTRL|F_UNCOND) && + (pred->class == BPred2Level || pred->class == BPredComb)) + { + int l1index, shift_reg; + + /* also update appropriate L1 history register */ + l1index = + (baddr >> MD_BR_SHIFT) & (pred->dirpred.twolev->config.two.l1size - 1); + shift_reg = + (pred->dirpred.twolev->config.two.shiftregs[l1index] << 1) | (!!taken); + pred->dirpred.twolev->config.two.shiftregs[l1index] = + shift_reg & ((1 << pred->dirpred.twolev->config.two.shift_width) - 1); + } + + /* find BTB entry if it's a taken branch (don't allocate for non-taken) */ + if (taken) + { + index = (baddr >> MD_BR_SHIFT) & (pred->btb.sets - 1); + + if (pred->btb.assoc > 1) + { + index *= pred->btb.assoc; + + /* Now we know the set; look for a PC match; also identify + * MRU and LRU items */ + for (i = index; i < (index+pred->btb.assoc) ; i++) + { + if (pred->btb.btb_data[i].addr == baddr) + { + /* match */ + assert(!pbtb); + pbtb = &pred->btb.btb_data[i]; + } + + dassert(pred->btb.btb_data[i].prev + != pred->btb.btb_data[i].next); + if (pred->btb.btb_data[i].prev == NULL) + { + /* this is the head of the lru list, ie current MRU item */ + dassert(lruhead == NULL); + lruhead = &pred->btb.btb_data[i]; + } + if (pred->btb.btb_data[i].next == NULL) + { + /* this is the tail of the lru list, ie the LRU item */ + dassert(lruitem == NULL); + lruitem = &pred->btb.btb_data[i]; + } + } + dassert(lruhead && lruitem); + + if (!pbtb) + /* missed in BTB; choose the LRU item in this set as the victim */ + pbtb = lruitem; + /* else hit, and pbtb points to matching BTB entry */ + + /* Update LRU state: 
selected item, whether selected because it
+           * matched or because it was LRU and selected as a victim, becomes
+           * MRU */
+          if (pbtb != lruhead)
+            {
+              /* this splices out the matched entry... */
+              if (pbtb->prev)
+                pbtb->prev->next = pbtb->next;
+              if (pbtb->next)
+                pbtb->next->prev = pbtb->prev;
+              /* ...and this puts the matched entry at the head of the list */
+              pbtb->next = lruhead;
+              pbtb->prev = NULL;
+              lruhead->prev = pbtb;
+              dassert(pbtb->prev || pbtb->next);
+              dassert(pbtb->prev != pbtb->next);
+            }
+          /* else pbtb is already MRU item; do nothing */
+        }
+      else
+        pbtb = &pred->btb.btb_data[index];
+    }
+
+  /*
+   * Now 'p' is a possibly null pointer into the direction prediction table,
+   * and 'pbtb' is a possibly null pointer into the BTB (either to a
+   * matched-on entry or a victim which was LRU in its set)
+   */
+
+  /* update state (but not for jumps) */
+  if (dir_update_ptr->pdir1)
+    {
+      /* the 3-bit counters live in the bimod slot, but dirpred.bimod is
+       * NULL for a pure 2-level predictor, so key the saturation limit off
+       * the predictor class instead of dereferencing dirpred.bimod */
+      char pred_cutoff = (pred->class == BPred3bit) ? 7 : 3;
+      if (taken)
+        {
+          if (*dir_update_ptr->pdir1 < pred_cutoff)
+            ++*dir_update_ptr->pdir1;
+        }
+      else
+        { /* not taken */
+          if (*dir_update_ptr->pdir1 > 0)
+            --*dir_update_ptr->pdir1;
+        }
+    }
+
+  /* combining predictor also updates second predictor and meta predictor */
+  /* second direction predictor */
+  if (dir_update_ptr->pdir2)
+    {
+      char pred_cutoff = (pred->class == BPred3bit) ? 7 : 3;
+      if (taken)
+        {
+          if (*dir_update_ptr->pdir2 < pred_cutoff)
+            ++*dir_update_ptr->pdir2;
+        }
+      else
+        { /* not taken */
+          if (*dir_update_ptr->pdir2 > 0)
+            --*dir_update_ptr->pdir2;
+        }
+    }
+
+  /* meta predictor */
+  if (dir_update_ptr->pmeta)
+    {
+      if (dir_update_ptr->dir.bimod != dir_update_ptr->dir.twolev)
+        {
+          /* we only update meta predictor if directions were different */
+          if (dir_update_ptr->dir.twolev == (unsigned int)taken)
+            {
+              /* 2-level predictor was correct */
+              if (*dir_update_ptr->pmeta < 3)
+                ++*dir_update_ptr->pmeta;
+            }
+          else
+            {
+              /* bimodal predictor was correct */
+              if (*dir_update_ptr->pmeta > 0)
+                --*dir_update_ptr->pmeta;
+            }
+        }
+    }
+
+  /* update BTB (but only for taken branches) */
+  if (pbtb)
+    {
+      /* update current information */
+      dassert(taken);
+
+      if (pbtb->addr == baddr)
+        {
+          if (!correct)
+            pbtb->target = btarget;
+        }
+      else
+        {
+          /* enter a new branch in the table */
+          pbtb->addr = baddr;
+          pbtb->op = op;
+          pbtb->target = btarget;
+        }
+    }
+}
diff --git a/bpred.h b/bpred.h
new file mode 100644
index 0000000..2b6f6c9
--- /dev/null
+++ b/bpred.h
@@ -0,0 +1,289 @@
+/* bpred.h - branch predictor interfaces */
+
+/* SimpleScalar(TM) Tool Suite
+ * Copyright (C) 1994-2003 by Todd M. Austin, Ph.D. and SimpleScalar, LLC.
+ * All Rights Reserved.
+ *
+ * THIS IS A LEGAL DOCUMENT, BY USING SIMPLESCALAR,
+ * YOU ARE AGREEING TO THESE TERMS AND CONDITIONS.
+ *
+ * No portion of this work may be used by any commercial entity, or for any
+ * commercial purpose, without the prior, written permission of SimpleScalar,
+ * LLC (info@simplescalar.com). Nonprofit and noncommercial use is permitted
+ * as described below.
+ *
+ * 1. SimpleScalar is provided AS IS, with no warranty of any kind, express
+ * or implied. The user of the program accepts full responsibility for the
+ * application of the program and the use of any results.
+ *
+ * 2. Nonprofit and noncommercial use is encouraged.
SimpleScalar may be + * downloaded, compiled, executed, copied, and modified solely for nonprofit, + * educational, noncommercial research, and noncommercial scholarship + * purposes provided that this notice in its entirety accompanies all copies. + * Copies of the modified software can be delivered to persons who use it + * solely for nonprofit, educational, noncommercial research, and + * noncommercial scholarship purposes provided that this notice in its + * entirety accompanies all copies. + * + * 3. ALL COMMERCIAL USE, AND ALL USE BY FOR PROFIT ENTITIES, IS EXPRESSLY + * PROHIBITED WITHOUT A LICENSE FROM SIMPLESCALAR, LLC (info@simplescalar.com). + * + * 4. No nonprofit user may place any restrictions on the use of this software, + * including as modified by the user, by any other authorized user. + * + * 5. Noncommercial and nonprofit users may distribute copies of SimpleScalar + * in compiled or executable form as set forth in Section 2, provided that + * either: (A) it is accompanied by the corresponding machine-readable source + * code, or (B) it is accompanied by a written offer, with no time limit, to + * give anyone a machine-readable copy of the corresponding source code in + * return for reimbursement of the cost of distribution. This written offer + * must permit verbatim duplication by anyone, or (C) it is distributed by + * someone who received only the executable form, and is accompanied by a + * copy of the written offer of source code. + * + * 6. SimpleScalar was developed by Todd M. Austin, Ph.D. The tool suite is + * currently maintained by SimpleScalar LLC (info@simplescalar.com). US Mail: + * 2395 Timbercrest Court, Ann Arbor, MI 48105. + * + * Copyright (C) 1994-2003 by Todd M. Austin, Ph.D. and SimpleScalar, LLC. + */ + + +#ifndef BPRED_H +#define BPRED_H + +#define dassert(a) assert(a) + +#include + +#include "host.h" +#include "misc.h" +#include "machine.h" +#include "stats.h" + +/* + * This module implements a number of branch predictor mechanisms. The + * following predictors are supported: + * + * BPred2Level: two level adaptive branch predictor + * + * It can simulate many prediction mechanisms that have up to + * two levels of tables. Parameters are: + * N # entries in first level (# of shift register(s)) + * W width of shift register(s) + * M # entries in 2nd level (# of counters, or other FSM) + * One BTB entry per level-2 counter. + * + * Configurations: N, W, M + * + * counter based: 1, 0, M + * + * GAg : 1, W, 2^W + * GAp : 1, W, M (M > 2^W) + * PAg : N, W, 2^W + * PAp : N, W, M (M == 2^(N+W)) + * + * BPred2bit: a simple direct mapped bimodal predictor + * + * This predictor has a table of two bit saturating counters. + * Where counter states 0 & 1 are predict not taken and + * counter states 2 & 3 are predict taken, the per-branch counters + * are incremented on taken branches and decremented on + * no taken branches. One BTB entry per counter. 
+ * + * BPredTaken: static predict branch taken + * + * BPredNotTaken: static predict branch not taken + * + */ + +/* branch predictor types */ +enum bpred_class { + BPredComb, /* combined predictor (McFarling) */ + BPred2Level, /* 2-level correlating pred w/2-bit counters */ + BPred2bit, /* 2-bit saturating cntr pred (dir mapped) */ + BPred3bit, /* 2-bit saturating cntr pred (dir mapped) */ + BPredTaken, /* static predict taken */ + BPredNotTaken, /* static predict not taken */ + BPred_NUM +}; + +/* an entry in a BTB */ +struct bpred_btb_ent_t { + md_addr_t addr; /* address of branch being tracked */ + enum md_opcode op; /* opcode of branch corresp. to addr */ + md_addr_t target; /* last destination of branch when taken */ + struct bpred_btb_ent_t *prev, *next; /* lru chaining pointers */ +}; + +/* direction predictor def */ +struct bpred_dir_t { + enum bpred_class class; /* type of predictor */ + union { + struct { + unsigned int size; /* number of entries in direct-mapped table */ + unsigned char *table; /* prediction state table */ + } bimod; + struct { + unsigned int size; + unsigned char *table; + } threebit; + struct { + int l1size; /* level-1 size, number of history regs */ + int l2size; /* level-2 size, number of pred states */ + int shift_width; /* amount of history in level-1 shift regs */ + int xor; /* history xor address flag */ + int *shiftregs; /* level-1 history table */ + unsigned char *l2table; /* level-2 prediction state table */ + } two; + } config; +}; + +/* branch predictor def */ +struct bpred_t { + enum bpred_class class; /* type of predictor */ + struct { + struct bpred_dir_t *bimod; /* first direction predictor */ + struct bpred_dir_t *twolev; /* second direction predictor */ + struct bpred_dir_t *meta; /* meta predictor */ + } dirpred; + + struct { + int sets; /* num BTB sets */ + int assoc; /* BTB associativity */ + struct bpred_btb_ent_t *btb_data; /* BTB addr-prediction table */ + } btb; + + struct { + int size; /* return-address stack size */ + int tos; /* top-of-stack */ + struct bpred_btb_ent_t *stack; /* return-address stack */ + } retstack; + + /* stats */ + counter_t addr_hits; /* num correct addr-predictions */ + counter_t dir_hits; /* num correct dir-predictions (incl addr) */ + counter_t used_ras; /* num RAS predictions used */ + counter_t used_bimod; /* num bimodal predictions used (BPredComb) */ + counter_t used_2lev; /* num 2-level predictions used (BPredComb) */ + counter_t jr_hits; /* num correct addr-predictions for JR's */ + counter_t jr_seen; /* num JR's seen */ + counter_t jr_non_ras_hits; /* num correct addr-preds for non-RAS JR's */ + counter_t jr_non_ras_seen; /* num non-RAS JR's seen */ + counter_t misses; /* num incorrect predictions */ + + counter_t lookups; /* num lookups */ + counter_t retstack_pops; /* number of times a value was popped */ + counter_t retstack_pushes; /* number of times a value was pushed */ + counter_t ras_hits; /* num correct return-address predictions */ +}; + +/* branch predictor update information */ +struct bpred_update_t { + char *pdir1; /* direction-1 predictor counter */ + char *pdir2; /* direction-2 predictor counter */ + char *pmeta; /* meta predictor counter */ + struct { /* predicted directions */ + unsigned int ras : 1; /* RAS used */ + unsigned int bimod : 1; /* bimodal predictor */ + unsigned int twolev : 1; /* 2-level predictor */ + unsigned int meta : 1; /* meta predictor (0..bimod / 1..2lev) */ + } dir; +}; + +/* create a branch predictor */ +struct bpred_t * /* branch predictory instance */ 
+bpred_create(enum bpred_class class, /* type of predictor to create */ + unsigned int bimod_size, /* bimod table size */ + unsigned int l1size, /* level-1 table size */ + unsigned int l2size, /* level-2 table size */ + unsigned int meta_size, /* meta predictor table size */ + unsigned int shift_width, /* history register width */ + unsigned int xor, /* history xor address flag */ + unsigned int btb_sets, /* number of sets in BTB */ + unsigned int btb_assoc, /* BTB associativity */ + unsigned int retstack_size);/* num entries in ret-addr stack */ + +/* create a branch direction predictor */ +struct bpred_dir_t * /* branch direction predictor instance */ +bpred_dir_create ( + enum bpred_class class, /* type of predictor to create */ + unsigned int l1size, /* level-1 table size */ + unsigned int l2size, /* level-2 table size (if relevant) */ + unsigned int shift_width, /* history register width */ + unsigned int xor); /* history xor address flag */ + +/* print branch predictor configuration */ +void +bpred_config(struct bpred_t *pred, /* branch predictor instance */ + FILE *stream); /* output stream */ + +/* print predictor stats */ +void +bpred_stats(struct bpred_t *pred, /* branch predictor instance */ + FILE *stream); /* output stream */ + +/* register branch predictor stats */ +void +bpred_reg_stats(struct bpred_t *pred, /* branch predictor instance */ + struct stat_sdb_t *sdb);/* stats database */ + +/* reset stats after priming, if appropriate */ +void bpred_after_priming(struct bpred_t *bpred); + +/* probe a predictor for a next fetch address, the predictor is probed + with branch address BADDR, the branch target is BTARGET (used for + static predictors), and OP is the instruction opcode (used to simulate + predecode bits; a pointer to the predictor state entry (or null for jumps) + is returned in *DIR_UPDATE_PTR (used for updating predictor state), + and the non-speculative top-of-stack is returned in stack_recover_idx + (used for recovering ret-addr stack after mis-predict). */ +md_addr_t /* predicted branch target addr */ +bpred_lookup(struct bpred_t *pred, /* branch predictor instance */ + md_addr_t baddr, /* branch address */ + md_addr_t btarget, /* branch target if taken */ + enum md_opcode op, /* opcode of instruction */ + int is_call, /* non-zero if inst is fn call */ + int is_return, /* non-zero if inst is fn return */ + struct bpred_update_t *dir_update_ptr, /* pred state pointer */ + int *stack_recover_idx); /* Non-speculative top-of-stack; + * used on mispredict recovery */ + +/* Speculative execution can corrupt the ret-addr stack. So for each + * lookup we return the top-of-stack (TOS) at that point; a mispredicted + * branch, as part of its recovery, restores the TOS using this value -- + * hopefully this uncorrupts the stack. */ +void +bpred_recover(struct bpred_t *pred, /* branch predictor instance */ + md_addr_t baddr, /* branch address */ + int stack_recover_idx); /* Non-speculative top-of-stack; + * used on mispredict recovery */ + +/* update the branch predictor, only useful for stateful predictors; updates + entry for instruction type OP at address BADDR. BTB only gets updated + for branches which are taken. Inst was determined to jump to + address BTARGET and was taken if TAKEN is non-zero. Predictor + statistics are updated with result of prediction, indicated by CORRECT and + PRED_TAKEN, predictor state to be updated is indicated by *DIR_UPDATE_PTR + (may be NULL for jumps, which shouldn't modify state bits). 
Note if + bpred_update is done speculatively, branch-prediction may get polluted. */ +void +bpred_update(struct bpred_t *pred, /* branch predictor instance */ + md_addr_t baddr, /* branch address */ + md_addr_t btarget, /* resolved branch target */ + int taken, /* non-zero if branch was taken */ + int pred_taken, /* non-zero if branch was pred taken */ + int correct, /* was earlier prediction correct? */ + enum md_opcode op, /* opcode of instruction */ + struct bpred_update_t *dir_update_ptr); /* pred state pointer */ + + +#ifdef foo0 +/* OBSOLETE */ +/* dump branch predictor state (for debug) */ +void +bpred_dump(struct bpred_t *pred, /* branch predictor instance */ + FILE *stream); /* output stream */ +#endif + +#endif /* BPRED_H */ diff --git a/sim-outorder.c b/sim-outorder.c new file mode 100644 index 0000000..0245b58 --- /dev/null +++ b/sim-outorder.c @@ -0,0 +1,4637 @@ +/* sim-outorder.c - sample out-of-order issue perf simulator implementation */ + +/* SimpleScalar(TM) Tool Suite + * Copyright (C) 1994-2003 by Todd M. Austin, Ph.D. and SimpleScalar, LLC. + * All Rights Reserved. + * + * THIS IS A LEGAL DOCUMENT, BY USING SIMPLESCALAR, + * YOU ARE AGREEING TO THESE TERMS AND CONDITIONS. + * + * No portion of this work may be used by any commercial entity, or for any + * commercial purpose, without the prior, written permission of SimpleScalar, + * LLC (info@simplescalar.com). Nonprofit and noncommercial use is permitted + * as described below. + * + * 1. SimpleScalar is provided AS IS, with no warranty of any kind, express + * or implied. The user of the program accepts full responsibility for the + * application of the program and the use of any results. + * + * 2. Nonprofit and noncommercial use is encouraged. SimpleScalar may be + * downloaded, compiled, executed, copied, and modified solely for nonprofit, + * educational, noncommercial research, and noncommercial scholarship + * purposes provided that this notice in its entirety accompanies all copies. + * Copies of the modified software can be delivered to persons who use it + * solely for nonprofit, educational, noncommercial research, and + * noncommercial scholarship purposes provided that this notice in its + * entirety accompanies all copies. + * + * 3. ALL COMMERCIAL USE, AND ALL USE BY FOR PROFIT ENTITIES, IS EXPRESSLY + * PROHIBITED WITHOUT A LICENSE FROM SIMPLESCALAR, LLC (info@simplescalar.com). + * + * 4. No nonprofit user may place any restrictions on the use of this software, + * including as modified by the user, by any other authorized user. + * + * 5. Noncommercial and nonprofit users may distribute copies of SimpleScalar + * in compiled or executable form as set forth in Section 2, provided that + * either: (A) it is accompanied by the corresponding machine-readable source + * code, or (B) it is accompanied by a written offer, with no time limit, to + * give anyone a machine-readable copy of the corresponding source code in + * return for reimbursement of the cost of distribution. This written offer + * must permit verbatim duplication by anyone, or (C) it is distributed by + * someone who received only the executable form, and is accompanied by a + * copy of the written offer of source code. + * + * 6. SimpleScalar was developed by Todd M. Austin, Ph.D. The tool suite is + * currently maintained by SimpleScalar LLC (info@simplescalar.com). US Mail: + * 2395 Timbercrest Court, Ann Arbor, MI 48105. + * + * Copyright (C) 1994-2003 by Todd M. Austin, Ph.D. and SimpleScalar, LLC. 
+ */ + + +#include +#include +#include +#include +#include + +#include "host.h" +#include "misc.h" +#include "machine.h" +#include "regs.h" +#include "memory.h" +#include "cache.h" +#include "loader.h" +#include "syscall.h" +#include "bpred.h" +#include "resource.h" +#include "bitmap.h" +#include "options.h" +#include "eval.h" +#include "stats.h" +#include "ptrace.h" +#include "dlite.h" +#include "sim.h" + +/* + * This file implements a very detailed out-of-order issue superscalar + * processor with a two-level memory system and speculative execution support. + * This simulator is a performance simulator, tracking the latency of all + * pipeline operations. + */ + +/* simulated registers */ +static struct regs_t regs; + +/* simulated memory */ +static struct mem_t *mem = NULL; + + +/* + * simulator options + */ + +/* maximum number of inst's to execute */ +static unsigned int max_insts; + +/* number of insts skipped before timing starts */ +static int fastfwd_count; + +/* pipeline trace range and output filename */ +static int ptrace_nelt = 0; +static char *ptrace_opts[2]; + +/* instruction fetch queue size (in insts) */ +static int ruu_ifq_size; + +/* extra branch mis-prediction latency */ +static int ruu_branch_penalty; + +/* speed of front-end of machine relative to execution core */ +static int fetch_speed; + +/* branch predictor type {nottaken|taken|perfect|bimod|2lev} */ +static char *pred_type; + +/* bimodal predictor config () */ +static int bimod_nelt = 1; +static int bimod_config[1] = + { /* bimod tbl size */2048 }; + +/* threebit predictor config () */ +static int threebit_nelt = 1; +static int threebit_config[1] = + { /* threebit tbl size */2048 }; + +/* 2-level predictor config ( ) */ +static int twolev_nelt = 4; +static int twolev_config[4] = + { /* l1size */1, /* l2size */1024, /* hist */8, /* xor */FALSE}; + +/* combining predictor config ( */ +static int comb_nelt = 1; +static int comb_config[1] = + { /* meta_table_size */1024 }; + +/* return address stack (RAS) size */ +static int ras_size = 8; + +/* BTB predictor config ( ) */ +static int btb_nelt = 2; +static int btb_config[2] = + { /* nsets */512, /* assoc */4 }; + +/* instruction decode B/W (insts/cycle) */ +static int ruu_decode_width; + +/* instruction issue B/W (insts/cycle) */ +static int ruu_issue_width; + +/* run pipeline with in-order issue */ +static int ruu_inorder_issue; + +/* issue instructions down wrong execution paths */ +static int ruu_include_spec = TRUE; + +/* instruction commit B/W (insts/cycle) */ +static int ruu_commit_width; + +/* register update unit (RUU) size */ +static int RUU_size = 8; + +/* load/store queue (LSQ) size */ +static int LSQ_size = 4; + +/* l1 data cache config, i.e., {|none} */ +static char *cache_dl1_opt; + +/* l1 data cache hit latency (in cycles) */ +static int cache_dl1_lat; + +/* l2 data cache config, i.e., {|none} */ +static char *cache_dl2_opt; + +/* l2 data cache hit latency (in cycles) */ +static int cache_dl2_lat; + +/* l1 instruction cache config, i.e., {|dl1|dl2|none} */ +static char *cache_il1_opt; + +/* l1 instruction cache hit latency (in cycles) */ +static int cache_il1_lat; + +/* l2 instruction cache config, i.e., {|dl1|dl2|none} */ +static char *cache_il2_opt; + +/* l2 instruction cache hit latency (in cycles) */ +static int cache_il2_lat; + +/* flush caches on system calls */ +static int flush_on_syscalls; + +/* convert 64-bit inst addresses to 32-bit inst equivalents */ +static int compress_icache_addrs; + +/* memory access latency ( ) */ +static int mem_nelt 
= 2; +static int mem_lat[2] = + { /* lat to first chunk */18, /* lat between remaining chunks */2 }; + +/* memory access bus width (in bytes) */ +static int mem_bus_width; + +/* instruction TLB config, i.e., {|none} */ +static char *itlb_opt; + +/* data TLB config, i.e., {|none} */ +static char *dtlb_opt; + +/* inst/data TLB miss latency (in cycles) */ +static int tlb_miss_lat; + +/* total number of integer ALU's available */ +static int res_ialu; + +/* total number of integer multiplier/dividers available */ +static int res_imult; + +/* total number of memory system ports available (to CPU) */ +static int res_memport; + +/* total number of floating point ALU's available */ +static int res_fpalu; + +/* total number of floating point multiplier/dividers available */ +static int res_fpmult; + +/* text-based stat profiles */ +#define MAX_PCSTAT_VARS 8 +static int pcstat_nelt = 0; +static char *pcstat_vars[MAX_PCSTAT_VARS]; + +/* convert 64-bit inst text addresses to 32-bit inst equivalents */ +#ifdef TARGET_PISA +#define IACOMPRESS(A) \ + (compress_icache_addrs ? ((((A) - ld_text_base) >> 1) + ld_text_base) : (A)) +#define ISCOMPRESS(SZ) \ + (compress_icache_addrs ? ((SZ) >> 1) : (SZ)) +#else /* !TARGET_PISA */ +#define IACOMPRESS(A) (A) +#define ISCOMPRESS(SZ) (SZ) +#endif /* TARGET_PISA */ + +/* operate in backward-compatible bugs mode (for testing only) */ +static int bugcompat_mode; + +/* + * functional unit resource configuration + */ + +/* resource pool indices, NOTE: update these if you change FU_CONFIG */ +#define FU_IALU_INDEX 0 +#define FU_IMULT_INDEX 1 +#define FU_MEMPORT_INDEX 2 +#define FU_FPALU_INDEX 3 +#define FU_FPMULT_INDEX 4 + +/* resource pool definition, NOTE: update FU_*_INDEX defs if you change this */ +struct res_desc fu_config[] = { + { + "integer-ALU", + 4, + 0, + { + { IntALU, 1, 1 } + } + }, + { + "integer-MULT/DIV", + 1, + 0, + { + { IntMULT, 3, 1 }, + { IntDIV, 20, 19 } + } + }, + { + "memory-port", + 2, + 0, + { + { RdPort, 1, 1 }, + { WrPort, 1, 1 } + } + }, + { + "FP-adder", + 4, + 0, + { + { FloatADD, 2, 1 }, + { FloatCMP, 2, 1 }, + { FloatCVT, 2, 1 } + } + }, + { + "FP-MULT/DIV", + 1, + 0, + { + { FloatMULT, 4, 1 }, + { FloatDIV, 12, 12 }, + { FloatSQRT, 24, 24 } + } + }, +}; + + +/* + * simulator stats + */ +/* SLIP variable */ +static counter_t sim_slip = 0; + +/* total number of instructions executed */ +static counter_t sim_total_insn = 0; + +/* total number of memory references committed */ +static counter_t sim_num_refs = 0; + +/* total number of memory references executed */ +static counter_t sim_total_refs = 0; + +/* total number of loads committed */ +static counter_t sim_num_loads = 0; + +/* total number of loads executed */ +static counter_t sim_total_loads = 0; + +/* total number of branches committed */ +static counter_t sim_num_branches = 0; + +/* total number of branches executed */ +static counter_t sim_total_branches = 0; + +/* cycle counter */ +static tick_t sim_cycle = 0; + +/* occupancy counters */ +static counter_t IFQ_count; /* cumulative IFQ occupancy */ +static counter_t IFQ_fcount; /* cumulative IFQ full count */ +static counter_t RUU_count; /* cumulative RUU occupancy */ +static counter_t RUU_fcount; /* cumulative RUU full count */ +static counter_t LSQ_count; /* cumulative LSQ occupancy */ +static counter_t LSQ_fcount; /* cumulative LSQ full count */ + +/* total non-speculative bogus addresses seen (debug var) */ +static counter_t sim_invalid_addrs; + +/* + * simulator state variables + */ + +/* instruction sequence counter, used to 
assign unique id's to insts */ +static unsigned int inst_seq = 0; + +/* pipetrace instruction sequence counter */ +static unsigned int ptrace_seq = 0; + +/* speculation mode, non-zero when mis-speculating, i.e., executing + instructions down the wrong path, thus state recovery will eventually have + to occur that resets processor register and memory state back to the last + precise state */ +static int spec_mode = FALSE; + +/* cycles until fetch issue resumes */ +static unsigned ruu_fetch_issue_delay = 0; + +/* perfect prediction enabled */ +static int pred_perfect = FALSE; + +/* speculative bpred-update enabled */ +static char *bpred_spec_opt; +static enum { spec_ID, spec_WB, spec_CT } bpred_spec_update; + +/* level 1 instruction cache, entry level instruction cache */ +static struct cache_t *cache_il1; + +/* level 1 instruction cache */ +static struct cache_t *cache_il2; + +/* level 1 data cache, entry level data cache */ +static struct cache_t *cache_dl1; + +/* level 2 data cache */ +static struct cache_t *cache_dl2; + +/* instruction TLB */ +static struct cache_t *itlb; + +/* data TLB */ +static struct cache_t *dtlb; + +/* branch predictor */ +static struct bpred_t *pred; + +/* functional unit resource pool */ +static struct res_pool *fu_pool = NULL; + +/* text-based stat profiles */ +static struct stat_stat_t *pcstat_stats[MAX_PCSTAT_VARS]; +static counter_t pcstat_lastvals[MAX_PCSTAT_VARS]; +static struct stat_stat_t *pcstat_sdists[MAX_PCSTAT_VARS]; + +/* wedge all stat values into a counter_t */ +#define STATVAL(STAT) \ + ((STAT)->sc == sc_int \ + ? (counter_t)*((STAT)->variant.for_int.var) \ + : ((STAT)->sc == sc_uint \ + ? (counter_t)*((STAT)->variant.for_uint.var) \ + : ((STAT)->sc == sc_counter \ + ? *((STAT)->variant.for_counter.var) \ + : (panic("bad stat class"), 0)))) + + +/* memory access latency, assumed to not cross a page boundary */ +static unsigned int /* total latency of access */ +mem_access_latency(int blk_sz) /* block size accessed */ +{ + int chunks = (blk_sz + (mem_bus_width - 1)) / mem_bus_width; + + assert(chunks > 0); + + return (/* first chunk latency */mem_lat[0] + + (/* remainder chunk latency */mem_lat[1] * (chunks - 1))); +} + + +/* + * cache miss handlers + */ + +/* l1 data cache l1 block miss handler function */ +static unsigned int /* latency of block access */ +dl1_access_fn(enum mem_cmd cmd, /* access cmd, Read or Write */ + md_addr_t baddr, /* block address to access */ + int bsize, /* size of block to access */ + struct cache_blk_t *blk, /* ptr to block in upper level */ + tick_t now) /* time of access */ +{ + unsigned int lat; + + if (cache_dl2) + { + /* access next level of data cache hierarchy */ + lat = cache_access(cache_dl2, cmd, baddr, NULL, bsize, + /* now */now, /* pudata */NULL, /* repl addr */NULL); + if (cmd == Read) + return lat; + else + { + /* FIXME: unlimited write buffers */ + return 0; + } + } + else + { + /* access main memory */ + if (cmd == Read) + return mem_access_latency(bsize); + else + { + /* FIXME: unlimited write buffers */ + return 0; + } + } +} + +/* l2 data cache block miss handler function */ +static unsigned int /* latency of block access */ +dl2_access_fn(enum mem_cmd cmd, /* access cmd, Read or Write */ + md_addr_t baddr, /* block address to access */ + int bsize, /* size of block to access */ + struct cache_blk_t *blk, /* ptr to block in upper level */ + tick_t now) /* time of access */ +{ + /* this is a miss to the lowest level, so access main memory */ + if (cmd == Read) + return mem_access_latency(bsize); + 
else + { + /* FIXME: unlimited write buffers */ + return 0; + } +} + +/* l1 inst cache l1 block miss handler function */ +static unsigned int /* latency of block access */ +il1_access_fn(enum mem_cmd cmd, /* access cmd, Read or Write */ + md_addr_t baddr, /* block address to access */ + int bsize, /* size of block to access */ + struct cache_blk_t *blk, /* ptr to block in upper level */ + tick_t now) /* time of access */ +{ + unsigned int lat; + +if (cache_il2) + { + /* access next level of inst cache hierarchy */ + lat = cache_access(cache_il2, cmd, baddr, NULL, bsize, + /* now */now, /* pudata */NULL, /* repl addr */NULL); + if (cmd == Read) + return lat; + else + panic("writes to instruction memory not supported"); + } + else + { + /* access main memory */ + if (cmd == Read) + return mem_access_latency(bsize); + else + panic("writes to instruction memory not supported"); + } +} + +/* l2 inst cache block miss handler function */ +static unsigned int /* latency of block access */ +il2_access_fn(enum mem_cmd cmd, /* access cmd, Read or Write */ + md_addr_t baddr, /* block address to access */ + int bsize, /* size of block to access */ + struct cache_blk_t *blk, /* ptr to block in upper level */ + tick_t now) /* time of access */ +{ + /* this is a miss to the lowest level, so access main memory */ + if (cmd == Read) + return mem_access_latency(bsize); + else + panic("writes to instruction memory not supported"); +} + + +/* + * TLB miss handlers + */ + +/* inst cache block miss handler function */ +static unsigned int /* latency of block access */ +itlb_access_fn(enum mem_cmd cmd, /* access cmd, Read or Write */ + md_addr_t baddr, /* block address to access */ + int bsize, /* size of block to access */ + struct cache_blk_t *blk, /* ptr to block in upper level */ + tick_t now) /* time of access */ +{ + md_addr_t *phy_page_ptr = (md_addr_t *)blk->user_data; + + /* no real memory access, however, should have user data space attached */ + assert(phy_page_ptr); + + /* fake translation, for now... */ + *phy_page_ptr = 0; + + /* return tlb miss latency */ + return tlb_miss_lat; +} + +/* data cache block miss handler function */ +static unsigned int /* latency of block access */ +dtlb_access_fn(enum mem_cmd cmd, /* access cmd, Read or Write */ + md_addr_t baddr, /* block address to access */ + int bsize, /* size of block to access */ + struct cache_blk_t *blk, /* ptr to block in upper level */ + tick_t now) /* time of access */ +{ + md_addr_t *phy_page_ptr = (md_addr_t *)blk->user_data; + + /* no real memory access, however, should have user data space attached */ + assert(phy_page_ptr); + + /* fake translation, for now... */ + *phy_page_ptr = 0; + + /* return tlb miss latency */ + return tlb_miss_lat; +} + + +/* register simulator-specific options */ +void +sim_reg_options(struct opt_odb_t *odb) +{ + opt_reg_header(odb, +"sim-outorder: This simulator implements a very detailed out-of-order issue\n" +"superscalar processor with a two-level memory system and speculative\n" +"execution support. 
This simulator is a performance simulator, tracking the\n" +"latency of all pipeline operations.\n" + ); + + /* instruction limit */ + + opt_reg_uint(odb, "-max:inst", "maximum number of inst's to execute", + &max_insts, /* default */0, + /* print */TRUE, /* format */NULL); + + /* trace options */ + + opt_reg_int(odb, "-fastfwd", "number of insts skipped before timing starts", + &fastfwd_count, /* default */0, + /* print */TRUE, /* format */NULL); + opt_reg_string_list(odb, "-ptrace", + "generate pipetrace, i.e., ", + ptrace_opts, /* arr_sz */2, &ptrace_nelt, /* default */NULL, + /* !print */FALSE, /* format */NULL, /* !accrue */FALSE); + + opt_reg_note(odb, +" Pipetrace range arguments are formatted as follows:\n" +"\n" +" {{@|#}}:{{@|#|+}}\n" +"\n" +" Both ends of the range are optional, if neither are specified, the entire\n" +" execution is traced. Ranges that start with a `@' designate an address\n" +" range to be traced, those that start with an `#' designate a cycle count\n" +" range. All other range values represent an instruction count range. The\n" +" second argument, if specified with a `+', indicates a value relative\n" +" to the first argument, e.g., 1000:+100 == 1000:1100. Program symbols may\n" +" be used in all contexts.\n" +"\n" +" Examples: -ptrace FOO.trc #0:#1000\n" +" -ptrace BAR.trc @2000:\n" +" -ptrace BLAH.trc :1500\n" +" -ptrace UXXE.trc :\n" +" -ptrace FOOBAR.trc @main:+278\n" + ); + + /* ifetch options */ + + opt_reg_int(odb, "-fetch:ifqsize", "instruction fetch queue size (in insts)", + &ruu_ifq_size, /* default */4, + /* print */TRUE, /* format */NULL); + + opt_reg_int(odb, "-fetch:mplat", "extra branch mis-prediction latency", + &ruu_branch_penalty, /* default */3, + /* print */TRUE, /* format */NULL); + + opt_reg_int(odb, "-fetch:speed", + "speed of front-end of machine relative to execution core", + &fetch_speed, /* default */1, + /* print */TRUE, /* format */NULL); + + /* branch predictor options */ + + opt_reg_note(odb, +" Branch predictor configuration examples for 2-level predictor:\n" +" Configurations: N, M, W, X\n" +" N # entries in first level (# of shift register(s))\n" +" W width of shift register(s)\n" +" M # entries in 2nd level (# of counters, or other FSM)\n" +" X (yes-1/no-0) xor history and address for 2nd level index\n" +" Sample predictors:\n" +" GAg : 1, W, 2^W, 0\n" +" GAp : 1, W, M (M > 2^W), 0\n" +" PAg : N, W, 2^W, 0\n" +" PAp : N, W, M (M == 2^(N+W)), 0\n" +" gshare : 1, W, 2^W, 1\n" +" Predictor `comb' combines a bimodal and a 2-level predictor.\n" + ); + + opt_reg_string(odb, "-bpred", + "branch predictor type {nottaken|taken|perfect|bimod|threebit|2lev|comb}", + &pred_type, /* default */"bimod", + /* print */TRUE, /* format */NULL); + + opt_reg_int_list(odb, "-bpred:bimod", + "bimodal predictor config ()", + bimod_config, bimod_nelt, &bimod_nelt, + /* default */bimod_config, + /* print */TRUE, /* format */NULL, /* !accrue */FALSE); + + opt_reg_int_list(odb, "-bpred:threebit", + "3-bit predictor config (
)", + threebit_config, threebit_nelt, &threebit_nelt, + /* default */threebit_config, + /* print */TRUE, /* format */NULL, /* !accrue */FALSE); + + opt_reg_int_list(odb, "-bpred:2lev", + "2-level predictor config " + "( )", + twolev_config, twolev_nelt, &twolev_nelt, + /* default */twolev_config, + /* print */TRUE, /* format */NULL, /* !accrue */FALSE); + + opt_reg_int_list(odb, "-bpred:comb", + "combining predictor config ()", + comb_config, comb_nelt, &comb_nelt, + /* default */comb_config, + /* print */TRUE, /* format */NULL, /* !accrue */FALSE); + + opt_reg_int(odb, "-bpred:ras", + "return address stack size (0 for no return stack)", + &ras_size, /* default */ras_size, + /* print */TRUE, /* format */NULL); + + opt_reg_int_list(odb, "-bpred:btb", + "BTB config ( )", + btb_config, btb_nelt, &btb_nelt, + /* default */btb_config, + /* print */TRUE, /* format */NULL, /* !accrue */FALSE); + + opt_reg_string(odb, "-bpred:spec_update", + "speculative predictors update in {ID|WB} (default non-spec)", + &bpred_spec_opt, /* default */NULL, + /* print */TRUE, /* format */NULL); + + /* decode options */ + + opt_reg_int(odb, "-decode:width", + "instruction decode B/W (insts/cycle)", + &ruu_decode_width, /* default */4, + /* print */TRUE, /* format */NULL); + + /* issue options */ + + opt_reg_int(odb, "-issue:width", + "instruction issue B/W (insts/cycle)", + &ruu_issue_width, /* default */4, + /* print */TRUE, /* format */NULL); + + opt_reg_flag(odb, "-issue:inorder", "run pipeline with in-order issue", + &ruu_inorder_issue, /* default */FALSE, + /* print */TRUE, /* format */NULL); + + opt_reg_flag(odb, "-issue:wrongpath", + "issue instructions down wrong execution paths", + &ruu_include_spec, /* default */TRUE, + /* print */TRUE, /* format */NULL); + + /* commit options */ + + opt_reg_int(odb, "-commit:width", + "instruction commit B/W (insts/cycle)", + &ruu_commit_width, /* default */4, + /* print */TRUE, /* format */NULL); + + /* register scheduler options */ + + opt_reg_int(odb, "-ruu:size", + "register update unit (RUU) size", + &RUU_size, /* default */16, + /* print */TRUE, /* format */NULL); + + /* memory scheduler options */ + + opt_reg_int(odb, "-lsq:size", + "load/store queue (LSQ) size", + &LSQ_size, /* default */8, + /* print */TRUE, /* format */NULL); + + /* cache options */ + + opt_reg_string(odb, "-cache:dl1", + "l1 data cache config, i.e., {|none}", + &cache_dl1_opt, "dl1:128:32:4:l", + /* print */TRUE, NULL); + + opt_reg_note(odb, +" The cache config parameter has the following format:\n" +"\n" +" ::::\n" +"\n" +" - name of the cache being defined\n" +" - number of sets in the cache\n" +" - block size of the cache\n" +" - associativity of the cache\n" +" - block replacement strategy, 'l'-LRU, 'f'-FIFO, 'r'-random\n" +"\n" +" Examples: -cache:dl1 dl1:4096:32:1:l\n" +" -dtlb dtlb:128:4096:32:r\n" + ); + + opt_reg_int(odb, "-cache:dl1lat", + "l1 data cache hit latency (in cycles)", + &cache_dl1_lat, /* default */1, + /* print */TRUE, /* format */NULL); + + opt_reg_string(odb, "-cache:dl2", + "l2 data cache config, i.e., {|none}", + &cache_dl2_opt, "ul2:1024:64:4:l", + /* print */TRUE, NULL); + + opt_reg_int(odb, "-cache:dl2lat", + "l2 data cache hit latency (in cycles)", + &cache_dl2_lat, /* default */6, + /* print */TRUE, /* format */NULL); + + opt_reg_string(odb, "-cache:il1", + "l1 inst cache config, i.e., {|dl1|dl2|none}", + &cache_il1_opt, "il1:512:32:1:l", + /* print */TRUE, NULL); + + opt_reg_note(odb, +" Cache levels can be unified by pointing a level of the instruction 
cache\n" +" hierarchy at the data cache hiearchy using the \"dl1\" and \"dl2\" cache\n" +" configuration arguments. Most sensible combinations are supported, e.g.,\n" +"\n" +" A unified l2 cache (il2 is pointed at dl2):\n" +" -cache:il1 il1:128:64:1:l -cache:il2 dl2\n" +" -cache:dl1 dl1:256:32:1:l -cache:dl2 ul2:1024:64:2:l\n" +"\n" +" Or, a fully unified cache hierarchy (il1 pointed at dl1):\n" +" -cache:il1 dl1\n" +" -cache:dl1 ul1:256:32:1:l -cache:dl2 ul2:1024:64:2:l\n" + ); + + opt_reg_int(odb, "-cache:il1lat", + "l1 instruction cache hit latency (in cycles)", + &cache_il1_lat, /* default */1, + /* print */TRUE, /* format */NULL); + + opt_reg_string(odb, "-cache:il2", + "l2 instruction cache config, i.e., {|dl2|none}", + &cache_il2_opt, "dl2", + /* print */TRUE, NULL); + + opt_reg_int(odb, "-cache:il2lat", + "l2 instruction cache hit latency (in cycles)", + &cache_il2_lat, /* default */6, + /* print */TRUE, /* format */NULL); + + opt_reg_flag(odb, "-cache:flush", "flush caches on system calls", + &flush_on_syscalls, /* default */FALSE, /* print */TRUE, NULL); + + opt_reg_flag(odb, "-cache:icompress", + "convert 64-bit inst addresses to 32-bit inst equivalents", + &compress_icache_addrs, /* default */FALSE, + /* print */TRUE, NULL); + + /* mem options */ + opt_reg_int_list(odb, "-mem:lat", + "memory access latency ( )", + mem_lat, mem_nelt, &mem_nelt, mem_lat, + /* print */TRUE, /* format */NULL, /* !accrue */FALSE); + + opt_reg_int(odb, "-mem:width", "memory access bus width (in bytes)", + &mem_bus_width, /* default */8, + /* print */TRUE, /* format */NULL); + + /* TLB options */ + + opt_reg_string(odb, "-tlb:itlb", + "instruction TLB config, i.e., {|none}", + &itlb_opt, "itlb:16:4096:4:l", /* print */TRUE, NULL); + + opt_reg_string(odb, "-tlb:dtlb", + "data TLB config, i.e., {|none}", + &dtlb_opt, "dtlb:32:4096:4:l", /* print */TRUE, NULL); + + opt_reg_int(odb, "-tlb:lat", + "inst/data TLB miss latency (in cycles)", + &tlb_miss_lat, /* default */30, + /* print */TRUE, /* format */NULL); + + /* resource configuration */ + + opt_reg_int(odb, "-res:ialu", + "total number of integer ALU's available", + &res_ialu, /* default */fu_config[FU_IALU_INDEX].quantity, + /* print */TRUE, /* format */NULL); + + opt_reg_int(odb, "-res:imult", + "total number of integer multiplier/dividers available", + &res_imult, /* default */fu_config[FU_IMULT_INDEX].quantity, + /* print */TRUE, /* format */NULL); + + opt_reg_int(odb, "-res:memport", + "total number of memory system ports available (to CPU)", + &res_memport, /* default */fu_config[FU_MEMPORT_INDEX].quantity, + /* print */TRUE, /* format */NULL); + + opt_reg_int(odb, "-res:fpalu", + "total number of floating point ALU's available", + &res_fpalu, /* default */fu_config[FU_FPALU_INDEX].quantity, + /* print */TRUE, /* format */NULL); + + opt_reg_int(odb, "-res:fpmult", + "total number of floating point multiplier/dividers available", + &res_fpmult, /* default */fu_config[FU_FPMULT_INDEX].quantity, + /* print */TRUE, /* format */NULL); + + opt_reg_string_list(odb, "-pcstat", + "profile stat(s) against text addr's (mult uses ok)", + pcstat_vars, MAX_PCSTAT_VARS, &pcstat_nelt, NULL, + /* !print */FALSE, /* format */NULL, /* accrue */TRUE); + + opt_reg_flag(odb, "-bugcompat", + "operate in backward-compatible bugs mode (for testing only)", + &bugcompat_mode, /* default */FALSE, /* print */TRUE, NULL); +} + +/* check simulator-specific option values */ +void +sim_check_options(struct opt_odb_t *odb, /* options database */ + int argc, char **argv) /* 
command line arguments */ +{ + char name[128], c; + int nsets, bsize, assoc; + + if (fastfwd_count < 0 || fastfwd_count >= 2147483647) + fatal("bad fast forward count: %d", fastfwd_count); + + if (ruu_ifq_size < 1 || (ruu_ifq_size & (ruu_ifq_size - 1)) != 0) + fatal("inst fetch queue size must be positive > 0 and a power of two"); + + if (ruu_branch_penalty < 1) + fatal("mis-prediction penalty must be at least 1 cycle"); + + if (fetch_speed < 1) + fatal("front-end speed must be positive and non-zero"); + + if (!mystricmp(pred_type, "perfect")) + { + /* perfect predictor */ + pred = NULL; + pred_perfect = TRUE; + } + else if (!mystricmp(pred_type, "taken")) + { + /* static predictor, not taken */ + pred = bpred_create(BPredTaken, 0, 0, 0, 0, 0, 0, 0, 0, 0); + } + else if (!mystricmp(pred_type, "nottaken")) + { + /* static predictor, taken */ + pred = bpred_create(BPredNotTaken, 0, 0, 0, 0, 0, 0, 0, 0, 0); + } + else if (!mystricmp(pred_type, "bimod")) + { + /* bimodal predictor, bpred_create() checks BTB_SIZE */ + if (bimod_nelt != 1) + fatal("bad bimod predictor config ()"); + if (btb_nelt != 2) + fatal("bad btb config ( )"); + + /* bimodal predictor, bpred_create() checks BTB_SIZE */ + pred = bpred_create(BPred2bit, + /* bimod table size */bimod_config[0], + /* 2lev l1 size */0, + /* 2lev l2 size */0, + /* meta table size */0, + /* history reg size */0, + /* history xor address */0, + /* btb sets */btb_config[0], + /* btb assoc */btb_config[1], + /* ret-addr stack size */ras_size); + } + else if (!mystricmp(pred_type, "threebit")) + { + /* 3-bit predictor, bpred_create() checks BTB_SIZE */ + if (threebit_nelt != 1) + fatal("bad 3-bit predictor config ()"); + if (btb_nelt != 2) + fatal("bad btb config ( )"); + + /* 3-bit predictor, bpred_create() checks BTB_SIZE */ + pred = bpred_create(BPred3bit, + /* threebit table size */threebit_config[0], + /* 2lev l1 size */0, + /* 2lev l2 size */0, + /* meta table size */0, + /* history reg size */0, + /* history xor address */0, + /* btb sets */btb_config[0], + /* btb assoc */btb_config[1], + /* ret-addr stack size */ras_size); + } + else if (!mystricmp(pred_type, "2lev")) + { + /* 2-level adaptive predictor, bpred_create() checks args */ + if (twolev_nelt != 4) + fatal("bad 2-level pred config ( )"); + if (btb_nelt != 2) + fatal("bad btb config ( )"); + + pred = bpred_create(BPred2Level, + /* bimod table size */0, + /* 2lev l1 size */twolev_config[0], + /* 2lev l2 size */twolev_config[1], + /* meta table size */0, + /* history reg size */twolev_config[2], + /* history xor address */twolev_config[3], + /* btb sets */btb_config[0], + /* btb assoc */btb_config[1], + /* ret-addr stack size */ras_size); + } + else if (!mystricmp(pred_type, "comb")) + { + /* combining predictor, bpred_create() checks args */ + if (twolev_nelt != 4) + fatal("bad 2-level pred config ( )"); + if (bimod_nelt != 1) + fatal("bad bimod predictor config ()"); + if (comb_nelt != 1) + fatal("bad combining predictor config ()"); + if (btb_nelt != 2) + fatal("bad btb config ( )"); + + pred = bpred_create(BPredComb, + /* bimod table size */bimod_config[0], + /* l1 size */twolev_config[0], + /* l2 size */twolev_config[1], + /* meta table size */comb_config[0], + /* history reg size */twolev_config[2], + /* history xor address */twolev_config[3], + /* btb sets */btb_config[0], + /* btb assoc */btb_config[1], + /* ret-addr stack size */ras_size); + } + else + fatal("cannot parse predictor type `%s'", pred_type); + + if (!bpred_spec_opt) + bpred_spec_update = spec_CT; + else if 
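/*
 * Standalone sketch of the (x & (x-1)) idiom used repeatedly by the option
 * checks in sim_check_options(): a non-zero value is a power of two exactly
 * when that expression is zero.  Toy demonstration only, not simulator code.
 */
#include <stdio.h>

static int is_nonzero_pow2(unsigned int x)
{
  return x != 0 && (x & (x - 1)) == 0;
}

int main(void)
{
  /* prints: 1 0 0 */
  printf("%d %d %d\n", is_nonzero_pow2(16), is_nonzero_pow2(12), is_nonzero_pow2(0));
  return 0;
}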
(!mystricmp(bpred_spec_opt, "ID")) + bpred_spec_update = spec_ID; + else if (!mystricmp(bpred_spec_opt, "WB")) + bpred_spec_update = spec_WB; + else + fatal("bad speculative update stage specifier, use {ID|WB}"); + + if (ruu_decode_width < 1 || (ruu_decode_width & (ruu_decode_width-1)) != 0) + fatal("issue width must be positive non-zero and a power of two"); + + if (ruu_issue_width < 1 || (ruu_issue_width & (ruu_issue_width-1)) != 0) + fatal("issue width must be positive non-zero and a power of two"); + + if (ruu_commit_width < 1) + fatal("commit width must be positive non-zero"); + + if (RUU_size < 2 || (RUU_size & (RUU_size-1)) != 0) + fatal("RUU size must be a positive number > 1 and a power of two"); + + if (LSQ_size < 2 || (LSQ_size & (LSQ_size-1)) != 0) + fatal("LSQ size must be a positive number > 1 and a power of two"); + + /* use a level 1 D-cache? */ + if (!mystricmp(cache_dl1_opt, "none")) + { + cache_dl1 = NULL; + + /* the level 2 D-cache cannot be defined */ + if (strcmp(cache_dl2_opt, "none")) + fatal("the l1 data cache must defined if the l2 cache is defined"); + cache_dl2 = NULL; + } + else /* dl1 is defined */ + { + if (sscanf(cache_dl1_opt, "%[^:]:%d:%d:%d:%c", + name, &nsets, &bsize, &assoc, &c) != 5) + fatal("bad l1 D-cache parms: ::::"); + cache_dl1 = cache_create(name, nsets, bsize, /* balloc */FALSE, + /* usize */0, assoc, cache_char2policy(c), + dl1_access_fn, /* hit lat */cache_dl1_lat); + + /* is the level 2 D-cache defined? */ + if (!mystricmp(cache_dl2_opt, "none")) + cache_dl2 = NULL; + else + { + if (sscanf(cache_dl2_opt, "%[^:]:%d:%d:%d:%c", + name, &nsets, &bsize, &assoc, &c) != 5) + fatal("bad l2 D-cache parms: " + "::::"); + cache_dl2 = cache_create(name, nsets, bsize, /* balloc */FALSE, + /* usize */0, assoc, cache_char2policy(c), + dl2_access_fn, /* hit lat */cache_dl2_lat); + } + } + + /* use a level 1 I-cache? */ + if (!mystricmp(cache_il1_opt, "none")) + { + cache_il1 = NULL; + + /* the level 2 I-cache cannot be defined */ + if (strcmp(cache_il2_opt, "none")) + fatal("the l1 inst cache must defined if the l2 cache is defined"); + cache_il2 = NULL; + } + else if (!mystricmp(cache_il1_opt, "dl1")) + { + if (!cache_dl1) + fatal("I-cache l1 cannot access D-cache l1 as it's undefined"); + cache_il1 = cache_dl1; + + /* the level 2 I-cache cannot be defined */ + if (strcmp(cache_il2_opt, "none")) + fatal("the l1 inst cache must defined if the l2 cache is defined"); + cache_il2 = NULL; + } + else if (!mystricmp(cache_il1_opt, "dl2")) + { + if (!cache_dl2) + fatal("I-cache l1 cannot access D-cache l2 as it's undefined"); + cache_il1 = cache_dl2; + + /* the level 2 I-cache cannot be defined */ + if (strcmp(cache_il2_opt, "none")) + fatal("the l1 inst cache must defined if the l2 cache is defined"); + cache_il2 = NULL; + } + else /* il1 is defined */ + { + if (sscanf(cache_il1_opt, "%[^:]:%d:%d:%d:%c", + name, &nsets, &bsize, &assoc, &c) != 5) + fatal("bad l1 I-cache parms: ::::"); + cache_il1 = cache_create(name, nsets, bsize, /* balloc */FALSE, + /* usize */0, assoc, cache_char2policy(c), + il1_access_fn, /* hit lat */cache_il1_lat); + + /* is the level 2 D-cache defined? 
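/*
 * Standalone sketch of parsing the <name>:<nsets>:<bsize>:<assoc>:<repl>
 * cache configuration string with sscanf(), mirroring the parsing done in
 * sim_check_options() above; error handling is simplified and the option
 * string is an illustrative example, not a recommended configuration.
 */
#include <stdio.h>

int main(void)
{
  const char *opt = "dl1:128:32:4:l";
  char name[128], repl;
  int nsets, bsize, assoc;

  if (sscanf(opt, "%[^:]:%d:%d:%d:%c", name, &nsets, &bsize, &assoc, &repl) != 5)
    {
      fprintf(stderr, "bad cache parms\n");
      return 1;
    }
  printf("%s: %d sets x %d-way, %d-byte blocks, repl `%c'\n",
         name, nsets, assoc, bsize, repl);
  return 0;
}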
*/ + if (!mystricmp(cache_il2_opt, "none")) + cache_il2 = NULL; + else if (!mystricmp(cache_il2_opt, "dl2")) + { + if (!cache_dl2) + fatal("I-cache l2 cannot access D-cache l2 as it's undefined"); + cache_il2 = cache_dl2; + } + else + { + if (sscanf(cache_il2_opt, "%[^:]:%d:%d:%d:%c", + name, &nsets, &bsize, &assoc, &c) != 5) + fatal("bad l2 I-cache parms: " + "::::"); + cache_il2 = cache_create(name, nsets, bsize, /* balloc */FALSE, + /* usize */0, assoc, cache_char2policy(c), + il2_access_fn, /* hit lat */cache_il2_lat); + } + } + + /* use an I-TLB? */ + if (!mystricmp(itlb_opt, "none")) + itlb = NULL; + else + { + if (sscanf(itlb_opt, "%[^:]:%d:%d:%d:%c", + name, &nsets, &bsize, &assoc, &c) != 5) + fatal("bad TLB parms: ::::"); + itlb = cache_create(name, nsets, bsize, /* balloc */FALSE, + /* usize */sizeof(md_addr_t), assoc, + cache_char2policy(c), itlb_access_fn, + /* hit latency */1); + } + + /* use a D-TLB? */ + if (!mystricmp(dtlb_opt, "none")) + dtlb = NULL; + else + { + if (sscanf(dtlb_opt, "%[^:]:%d:%d:%d:%c", + name, &nsets, &bsize, &assoc, &c) != 5) + fatal("bad TLB parms: ::::"); + dtlb = cache_create(name, nsets, bsize, /* balloc */FALSE, + /* usize */sizeof(md_addr_t), assoc, + cache_char2policy(c), dtlb_access_fn, + /* hit latency */1); + } + + if (cache_dl1_lat < 1) + fatal("l1 data cache latency must be greater than zero"); + + if (cache_dl2_lat < 1) + fatal("l2 data cache latency must be greater than zero"); + + if (cache_il1_lat < 1) + fatal("l1 instruction cache latency must be greater than zero"); + + if (cache_il2_lat < 1) + fatal("l2 instruction cache latency must be greater than zero"); + + if (mem_nelt != 2) + fatal("bad memory access latency ( )"); + + if (mem_lat[0] < 1 || mem_lat[1] < 1) + fatal("all memory access latencies must be greater than zero"); + + if (mem_bus_width < 1 || (mem_bus_width & (mem_bus_width-1)) != 0) + fatal("memory bus width must be positive non-zero and a power of two"); + + if (tlb_miss_lat < 1) + fatal("TLB miss latency must be greater than zero"); + + if (res_ialu < 1) + fatal("number of integer ALU's must be greater than zero"); + if (res_ialu > MAX_INSTS_PER_CLASS) + fatal("number of integer ALU's must be <= MAX_INSTS_PER_CLASS"); + fu_config[FU_IALU_INDEX].quantity = res_ialu; + + if (res_imult < 1) + fatal("number of integer multiplier/dividers must be greater than zero"); + if (res_imult > MAX_INSTS_PER_CLASS) + fatal("number of integer mult/div's must be <= MAX_INSTS_PER_CLASS"); + fu_config[FU_IMULT_INDEX].quantity = res_imult; + + if (res_memport < 1) + fatal("number of memory system ports must be greater than zero"); + if (res_memport > MAX_INSTS_PER_CLASS) + fatal("number of memory system ports must be <= MAX_INSTS_PER_CLASS"); + fu_config[FU_MEMPORT_INDEX].quantity = res_memport; + + if (res_fpalu < 1) + fatal("number of floating point ALU's must be greater than zero"); + if (res_fpalu > MAX_INSTS_PER_CLASS) + fatal("number of floating point ALU's must be <= MAX_INSTS_PER_CLASS"); + fu_config[FU_FPALU_INDEX].quantity = res_fpalu; + + if (res_fpmult < 1) + fatal("number of floating point multiplier/dividers must be > zero"); + if (res_fpmult > MAX_INSTS_PER_CLASS) + fatal("number of FP mult/div's must be <= MAX_INSTS_PER_CLASS"); + fu_config[FU_FPMULT_INDEX].quantity = res_fpmult; +} + +/* print simulator-specific configuration information */ +void +sim_aux_config(FILE *stream) /* output stream */ +{ + /* nada */ +} + +/* register simulator-specific statistics */ +void +sim_reg_stats(struct stat_sdb_t *sdb) /* stats 
database */ +{ + int i; + stat_reg_counter(sdb, "sim_num_insn", + "total number of instructions committed", + &sim_num_insn, sim_num_insn, NULL); + stat_reg_counter(sdb, "sim_num_refs", + "total number of loads and stores committed", + &sim_num_refs, 0, NULL); + stat_reg_counter(sdb, "sim_num_loads", + "total number of loads committed", + &sim_num_loads, 0, NULL); + stat_reg_formula(sdb, "sim_num_stores", + "total number of stores committed", + "sim_num_refs - sim_num_loads", NULL); + stat_reg_counter(sdb, "sim_num_branches", + "total number of branches committed", + &sim_num_branches, /* initial value */0, /* format */NULL); + stat_reg_int(sdb, "sim_elapsed_time", + "total simulation time in seconds", + &sim_elapsed_time, 0, NULL); + stat_reg_formula(sdb, "sim_inst_rate", + "simulation speed (in insts/sec)", + "sim_num_insn / sim_elapsed_time", NULL); + + stat_reg_counter(sdb, "sim_total_insn", + "total number of instructions executed", + &sim_total_insn, 0, NULL); + stat_reg_counter(sdb, "sim_total_refs", + "total number of loads and stores executed", + &sim_total_refs, 0, NULL); + stat_reg_counter(sdb, "sim_total_loads", + "total number of loads executed", + &sim_total_loads, 0, NULL); + stat_reg_formula(sdb, "sim_total_stores", + "total number of stores executed", + "sim_total_refs - sim_total_loads", NULL); + stat_reg_counter(sdb, "sim_total_branches", + "total number of branches executed", + &sim_total_branches, /* initial value */0, /* format */NULL); + + /* register performance stats */ + stat_reg_counter(sdb, "sim_cycle", + "total simulation time in cycles", + &sim_cycle, /* initial value */0, /* format */NULL); + stat_reg_formula(sdb, "sim_IPC", + "instructions per cycle", + "sim_num_insn / sim_cycle", /* format */NULL); + stat_reg_formula(sdb, "sim_CPI", + "cycles per instruction", + "sim_cycle / sim_num_insn", /* format */NULL); + stat_reg_formula(sdb, "sim_exec_BW", + "total instructions (mis-spec + committed) per cycle", + "sim_total_insn / sim_cycle", /* format */NULL); + stat_reg_formula(sdb, "sim_IPB", + "instruction per branch", + "sim_num_insn / sim_num_branches", /* format */NULL); + + /* occupancy stats */ + stat_reg_counter(sdb, "IFQ_count", "cumulative IFQ occupancy", + &IFQ_count, /* initial value */0, /* format */NULL); + stat_reg_counter(sdb, "IFQ_fcount", "cumulative IFQ full count", + &IFQ_fcount, /* initial value */0, /* format */NULL); + stat_reg_formula(sdb, "ifq_occupancy", "avg IFQ occupancy (insn's)", + "IFQ_count / sim_cycle", /* format */NULL); + stat_reg_formula(sdb, "ifq_rate", "avg IFQ dispatch rate (insn/cycle)", + "sim_total_insn / sim_cycle", /* format */NULL); + stat_reg_formula(sdb, "ifq_latency", "avg IFQ occupant latency (cycle's)", + "ifq_occupancy / ifq_rate", /* format */NULL); + stat_reg_formula(sdb, "ifq_full", "fraction of time (cycle's) IFQ was full", + "IFQ_fcount / sim_cycle", /* format */NULL); + + stat_reg_counter(sdb, "RUU_count", "cumulative RUU occupancy", + &RUU_count, /* initial value */0, /* format */NULL); + stat_reg_counter(sdb, "RUU_fcount", "cumulative RUU full count", + &RUU_fcount, /* initial value */0, /* format */NULL); + stat_reg_formula(sdb, "ruu_occupancy", "avg RUU occupancy (insn's)", + "RUU_count / sim_cycle", /* format */NULL); + stat_reg_formula(sdb, "ruu_rate", "avg RUU dispatch rate (insn/cycle)", + "sim_total_insn / sim_cycle", /* format */NULL); + stat_reg_formula(sdb, "ruu_latency", "avg RUU occupant latency (cycle's)", + "ruu_occupancy / ruu_rate", /* format */NULL); + stat_reg_formula(sdb, "ruu_full", 
"fraction of time (cycle's) RUU was full", + "RUU_fcount / sim_cycle", /* format */NULL); + + stat_reg_counter(sdb, "LSQ_count", "cumulative LSQ occupancy", + &LSQ_count, /* initial value */0, /* format */NULL); + stat_reg_counter(sdb, "LSQ_fcount", "cumulative LSQ full count", + &LSQ_fcount, /* initial value */0, /* format */NULL); + stat_reg_formula(sdb, "lsq_occupancy", "avg LSQ occupancy (insn's)", + "LSQ_count / sim_cycle", /* format */NULL); + stat_reg_formula(sdb, "lsq_rate", "avg LSQ dispatch rate (insn/cycle)", + "sim_total_insn / sim_cycle", /* format */NULL); + stat_reg_formula(sdb, "lsq_latency", "avg LSQ occupant latency (cycle's)", + "lsq_occupancy / lsq_rate", /* format */NULL); + stat_reg_formula(sdb, "lsq_full", "fraction of time (cycle's) LSQ was full", + "LSQ_fcount / sim_cycle", /* format */NULL); + + stat_reg_counter(sdb, "sim_slip", + "total number of slip cycles", + &sim_slip, 0, NULL); + /* register baseline stats */ + stat_reg_formula(sdb, "avg_sim_slip", + "the average slip between issue and retirement", + "sim_slip / sim_num_insn", NULL); + + /* register predictor stats */ + if (pred) + bpred_reg_stats(pred, sdb); + + /* register cache stats */ + if (cache_il1 + && (cache_il1 != cache_dl1 && cache_il1 != cache_dl2)) + cache_reg_stats(cache_il1, sdb); + if (cache_il2 + && (cache_il2 != cache_dl1 && cache_il2 != cache_dl2)) + cache_reg_stats(cache_il2, sdb); + if (cache_dl1) + cache_reg_stats(cache_dl1, sdb); + if (cache_dl2) + cache_reg_stats(cache_dl2, sdb); + if (itlb) + cache_reg_stats(itlb, sdb); + if (dtlb) + cache_reg_stats(dtlb, sdb); + + /* debug variable(s) */ + stat_reg_counter(sdb, "sim_invalid_addrs", + "total non-speculative bogus addresses seen (debug var)", + &sim_invalid_addrs, /* initial value */0, /* format */NULL); + + for (i=0; isc != sc_int && stat->sc != sc_uint && stat->sc != sc_counter) + fatal("`-pcstat' statistical variable `%s' is not an integral type", + stat->name); + + /* register this stat */ + pcstat_stats[i] = stat; + pcstat_lastvals[i] = STATVAL(stat); + + /* declare the sparce text distribution */ + sprintf(buf, "%s_by_pc", stat->name); + sprintf(buf1, "%s (by text address)", stat->desc); + pcstat_sdists[i] = stat_reg_sdist(sdb, buf, buf1, + /* initial value */0, + /* print format */(PF_COUNT|PF_PDF), + /* format */"0x%lx %lu %.2f", + /* print fn */NULL); + } + ld_reg_stats(sdb); + mem_reg_stats(mem, sdb); +} + +/* forward declarations */ +static void ruu_init(void); +static void lsq_init(void); +static void rslink_init(int nlinks); +static void eventq_init(void); +static void readyq_init(void); +static void cv_init(void); +static void tracer_init(void); +static void fetch_init(void); + +/* initialize the simulator */ +void +sim_init(void) +{ + sim_num_refs = 0; + + /* allocate and initialize register file */ + regs_init(®s); + + /* allocate and initialize memory space */ + mem = mem_create("mem"); + mem_init(mem); +} + +/* default register state accessor, used by DLite */ +static char * /* err str, NULL for no err */ +simoo_reg_obj(struct regs_t *regs, /* registers to access */ + int is_write, /* access type */ + enum md_reg_type rt, /* reg bank to probe */ + int reg, /* register number */ + struct eval_value_t *val); /* input, output */ + +/* default memory state accessor, used by DLite */ +static char * /* err str, NULL for no err */ +simoo_mem_obj(struct mem_t *mem, /* memory space to access */ + int is_write, /* access type */ + md_addr_t addr, /* address to access */ + char *p, /* input/output buffer */ + int nbytes); /* 
size of access */ + +/* default machine state accessor, used by DLite */ +static char * /* err str, NULL for no err */ +simoo_mstate_obj(FILE *stream, /* output stream */ + char *cmd, /* optional command string */ + struct regs_t *regs, /* registers to access */ + struct mem_t *mem); /* memory space to access */ + +/* total RS links allocated at program start */ +#define MAX_RS_LINKS 4096 + +/* load program into simulated state */ +void +sim_load_prog(char *fname, /* program to load */ + int argc, char **argv, /* program arguments */ + char **envp) /* program environment */ +{ + /* load program text and data, set up environment, memory, and regs */ + ld_load_prog(fname, argc, argv, envp, ®s, mem, TRUE); + + /* initialize here, so symbols can be loaded */ + if (ptrace_nelt == 2) + { + /* generate a pipeline trace */ + ptrace_open(/* fname */ptrace_opts[0], /* range */ptrace_opts[1]); + } + else if (ptrace_nelt == 0) + { + /* no pipetracing */; + } + else + fatal("bad pipetrace args, use: "); + + /* finish initialization of the simulation engine */ + fu_pool = res_create_pool("fu-pool", fu_config, N_ELT(fu_config)); + rslink_init(MAX_RS_LINKS); + tracer_init(); + fetch_init(); + cv_init(); + eventq_init(); + readyq_init(); + ruu_init(); + lsq_init(); + + /* initialize the DLite debugger */ + dlite_init(simoo_reg_obj, simoo_mem_obj, simoo_mstate_obj); +} + +/* dump simulator-specific auxiliary simulator statistics */ +void +sim_aux_stats(FILE *stream) /* output stream */ +{ + /* nada */ +} + +/* un-initialize the simulator */ +void +sim_uninit(void) +{ + if (ptrace_nelt > 0) + ptrace_close(); +} + + +/* + * processor core definitions and declarations + */ + +/* inst tag type, used to tag an operation instance in the RUU */ +typedef unsigned int INST_TAG_TYPE; + +/* inst sequence type, used to order instructions in the ready list, if + this rolls over the ready list order temporarily will get messed up, + but execution will continue and complete correctly */ +typedef unsigned int INST_SEQ_TYPE; + + +/* total input dependencies possible */ +#define MAX_IDEPS 3 + +/* total output dependencies possible */ +#define MAX_ODEPS 2 + +/* a register update unit (RUU) station, this record is contained in the + processors RUU, which serves as a collection of ordered reservations + stations. The reservation stations capture register results and await + the time when all operands are ready, at which time the instruction is + issued to the functional units; the RUU is an order circular queue, in which + instructions are inserted in fetch (program) order, results are stored in + the RUU buffers, and later when an RUU entry is the oldest entry in the + machines, it and its instruction's value is retired to the architectural + register file in program order, NOTE: the RUU and LSQ share the same + structure, this is useful because loads and stores are split into two + operations: an effective address add and a load/store, the add is inserted + into the RUU and the load/store inserted into the LSQ, allowing the add + to wake up the load/store when effective address computation has finished */ +struct RUU_station { + /* inst info */ + md_inst_t IR; /* instruction bits */ + enum md_opcode op; /* decoded instruction opcode */ + md_addr_t PC, next_PC, pred_PC; /* inst PC, next PC, predicted PC */ + int in_LSQ; /* non-zero if op is in LSQ */ + int ea_comp; /* non-zero if op is an addr comp */ + int recover_inst; /* start of mis-speculation? 
*/ + int stack_recover_idx; /* non-speculative TOS for RSB pred */ + struct bpred_update_t dir_update; /* bpred direction update info */ + int spec_mode; /* non-zero if issued in spec_mode */ + md_addr_t addr; /* effective address for ld/st's */ + INST_TAG_TYPE tag; /* RUU slot tag, increment to + squash operation */ + INST_SEQ_TYPE seq; /* instruction sequence, used to + sort the ready list and tag inst */ + unsigned int ptrace_seq; /* pipetrace sequence number */ + int slip; + /* instruction status */ + int queued; /* operands ready and queued */ + int issued; /* operation is/was executing */ + int completed; /* operation has completed execution */ + /* output operand dependency list, these lists are used to + limit the number of associative searches into the RUU when + instructions complete and need to wake up dependent insts */ + int onames[MAX_ODEPS]; /* output logical names (NA=unused) */ + struct RS_link *odep_list[MAX_ODEPS]; /* chains to consuming operations */ + + /* input dependent links, the output chains rooted above use these + fields to mark input operands as ready, when all these fields have + been set non-zero, the RUU operation has all of its register + operands, it may commence execution as soon as all of its memory + operands are known to be read (see lsq_refresh() for details on + enforcing memory dependencies) */ + int idep_ready[MAX_IDEPS]; /* input operand ready? */ +}; + +/* non-zero if all register operands are ready, update with MAX_IDEPS */ +#define OPERANDS_READY(RS) \ + ((RS)->idep_ready[0] && (RS)->idep_ready[1] && (RS)->idep_ready[2]) + +/* register update unit, combination of reservation stations and reorder + buffer device, organized as a circular queue */ +static struct RUU_station *RUU; /* register update unit */ +static int RUU_head, RUU_tail; /* RUU head and tail pointers */ +static int RUU_num; /* num entries currently in RUU */ + +/* allocate and initialize register update unit (RUU) */ +static void +ruu_init(void) +{ + RUU = calloc(RUU_size, sizeof(struct RUU_station)); + if (!RUU) + fatal("out of virtual memory"); + + RUU_num = 0; + RUU_head = RUU_tail = 0; + RUU_count = 0; + RUU_fcount = 0; +} + +/* dump the contents of the RUU */ +static void +ruu_dumpent(struct RUU_station *rs, /* ptr to RUU station */ + int index, /* entry index */ + FILE *stream, /* output stream */ + int header) /* print header? */ +{ + if (!stream) + stream = stderr; + + if (header) + fprintf(stream, "idx: %2d: opcode: %s, inst: `", + index, MD_OP_NAME(rs->op)); + else + fprintf(stream, " opcode: %s, inst: `", + MD_OP_NAME(rs->op)); + md_print_insn(rs->IR, rs->PC, stream); + fprintf(stream, "'\n"); + myfprintf(stream, " PC: 0x%08p, NPC: 0x%08p (pred_PC: 0x%08p)\n", + rs->PC, rs->next_PC, rs->pred_PC); + fprintf(stream, " in_LSQ: %s, ea_comp: %s, recover_inst: %s\n", + rs->in_LSQ ? "t" : "f", + rs->ea_comp ? "t" : "f", + rs->recover_inst ? "t" : "f"); + myfprintf(stream, " spec_mode: %s, addr: 0x%08p, tag: 0x%08x\n", + rs->spec_mode ? "t" : "f", rs->addr, rs->tag); + fprintf(stream, " seq: 0x%08x, ptrace_seq: 0x%08x\n", + rs->seq, rs->ptrace_seq); + fprintf(stream, " queued: %s, issued: %s, completed: %s\n", + rs->queued ? "t" : "f", + rs->issued ? "t" : "f", + rs->completed ? "t" : "f"); + fprintf(stream, " operands ready: %s\n", + OPERANDS_READY(rs) ? 
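/*
 * Minimal sketch of the circular-queue arithmetic used for the RUU (and the
 * LSQ, which shares the same structure): entries are allocated at the tail
 * and retired from the head, both advancing modulo the queue size.  This is
 * a standalone toy, not the RUU_station bookkeeping itself.
 */
#include <stdio.h>

#define SK_QSIZE 8

int main(void)
{
  int head = 0, tail = 0, num = 0;

  /* dispatch three entries at the tail */
  for (int i = 0; i < 3; i++) { tail = (tail + 1) % SK_QSIZE; num++; }

  /* commit one entry from the head */
  head = (head + 1) % SK_QSIZE; num--;

  printf("head=%d tail=%d num=%d\n", head, tail, num);  /* head=1 tail=3 num=2 */
  return 0;
}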
"t" : "f"); +} + +/* dump the contents of the RUU */ +static void +ruu_dump(FILE *stream) /* output stream */ +{ + int num, head; + struct RUU_station *rs; + + if (!stream) + stream = stderr; + + fprintf(stream, "** RUU state **\n"); + fprintf(stream, "RUU_head: %d, RUU_tail: %d\n", RUU_head, RUU_tail); + fprintf(stream, "RUU_num: %d\n", RUU_num); + + num = RUU_num; + head = RUU_head; + while (num) + { + rs = &RUU[head]; + ruu_dumpent(rs, rs - RUU, stream, /* header */TRUE); + head = (head + 1) % RUU_size; + num--; + } +} + +/* + * load/store queue (LSQ): holds loads and stores in program order, indicating + * status of load/store access: + * + * - issued: address computation complete, memory access in progress + * - completed: memory access has completed, stored value available + * - squashed: memory access was squashed, ignore this entry + * + * loads may execute when: + * 1) register operands are ready, and + * 2) memory operands are ready (no earlier unresolved store) + * + * loads are serviced by: + * 1) previous store at same address in LSQ (hit latency), or + * 2) data cache (hit latency + miss latency) + * + * stores may execute when: + * 1) register operands are ready + * + * stores are serviced by: + * 1) depositing store value into the load/store queue + * 2) writing store value to the store buffer (plus tag check) at commit + * 3) writing store buffer entry to data cache when cache is free + * + * NOTE: the load/store queue can bypass a store value to a load in the same + * cycle the store executes (using a bypass network), thus stores complete + * in effective zero time after their effective address is known + */ +static struct RUU_station *LSQ; /* load/store queue */ +static int LSQ_head, LSQ_tail; /* LSQ head and tail pointers */ +static int LSQ_num; /* num entries currently in LSQ */ + +/* + * input dependencies for stores in the LSQ: + * idep #0 - operand input (value that is store'd) + * idep #1 - effective address input (address of store operation) + */ +#define STORE_OP_INDEX 0 +#define STORE_ADDR_INDEX 1 + +#define STORE_OP_READY(RS) ((RS)->idep_ready[STORE_OP_INDEX]) +#define STORE_ADDR_READY(RS) ((RS)->idep_ready[STORE_ADDR_INDEX]) + +/* allocate and initialize the load/store queue (LSQ) */ +static void +lsq_init(void) +{ + LSQ = calloc(LSQ_size, sizeof(struct RUU_station)); + if (!LSQ) + fatal("out of virtual memory"); + + LSQ_num = 0; + LSQ_head = LSQ_tail = 0; + LSQ_count = 0; + LSQ_fcount = 0; +} + +/* dump the contents of the RUU */ +static void +lsq_dump(FILE *stream) /* output stream */ +{ + int num, head; + struct RUU_station *rs; + + if (!stream) + stream = stderr; + + fprintf(stream, "** LSQ state **\n"); + fprintf(stream, "LSQ_head: %d, LSQ_tail: %d\n", LSQ_head, LSQ_tail); + fprintf(stream, "LSQ_num: %d\n", LSQ_num); + + num = LSQ_num; + head = LSQ_head; + while (num) + { + rs = &LSQ[head]; + ruu_dumpent(rs, rs - LSQ, stream, /* header */TRUE); + head = (head + 1) % LSQ_size; + num--; + } +} + + +/* + * RS_LINK defs and decls + */ + +/* a reservation station link: this structure links elements of a RUU + reservation station list; used for ready instruction queue, event queue, and + output dependency lists; each RS_LINK node contains a pointer to the RUU + entry it references along with an instance tag, the RS_LINK is only valid if + the instruction instance tag matches the instruction RUU entry instance tag; + this strategy allows entries in the RUU can be squashed and reused without + updating the lists that point to it, which significantly improves the 
+   performance of (all too frequent) squash events */
+struct RS_link {
+  struct RS_link *next;                 /* next entry in list */
+  struct RUU_station *rs;               /* referenced RUU resv station */
+  INST_TAG_TYPE tag;                    /* inst instance sequence number */
+  union {
+    tick_t when;                        /* time stamp of entry (for eventq) */
+    INST_SEQ_TYPE seq;                  /* inst sequence */
+    int opnum;                          /* input/output operand number */
+  } x;
+};
+
+/* RS link free list, grab RS_LINKs from here, when needed */
+static struct RS_link *rslink_free_list;
+
+/* NULL value for an RS link */
+#define RSLINK_NULL_DATA                { NULL, NULL, 0 }
+static struct RS_link RSLINK_NULL = RSLINK_NULL_DATA;
+
+/* create and initialize an RS link */
+#define RSLINK_INIT(RSL, RS)                                            \
+  ((RSL).next = NULL, (RSL).rs = (RS), (RSL).tag = (RS)->tag)
+
+/* non-zero if RS link is NULL */
+#define RSLINK_IS_NULL(LINK)            ((LINK)->rs == NULL)
+
+/* non-zero if RS link is to a valid (non-squashed) entry */
+#define RSLINK_VALID(LINK)              ((LINK)->tag == (LINK)->rs->tag)
+
+/* extra RUU reservation station pointer */
+#define RSLINK_RS(LINK)                 ((LINK)->rs)
+
+/* get a new RS link record */
+#define RSLINK_NEW(DST, RS)                                             \
+  { struct RS_link *n_link;                                             \
+    if (!rslink_free_list)                                              \
+      panic("out of rs links");                                         \
+    n_link = rslink_free_list;                                          \
+    rslink_free_list = rslink_free_list->next;                          \
+    n_link->next = NULL;                                                \
+    n_link->rs = (RS); n_link->tag = n_link->rs->tag;                   \
+    (DST) = n_link;                                                     \
+  }
+
+/* free an RS link record */
+#define RSLINK_FREE(LINK)                                               \
+  { struct RS_link *f_link = (LINK);                                    \
+    f_link->rs = NULL; f_link->tag = 0;                                 \
+    f_link->next = rslink_free_list;                                    \
+    rslink_free_list = f_link;                                          \
+  }
+
+/* FIXME: could this be faster!!! */
+/* free an RS link list */
+#define RSLINK_FREE_LIST(LINK)                                          \
+  { struct RS_link *fl_link, *fl_link_next;                             \
+    for (fl_link=(LINK); fl_link; fl_link=fl_link_next)                 \
+      {                                                                 \
+        fl_link_next = fl_link->next;                                   \
+        RSLINK_FREE(fl_link);                                           \
+      }                                                                 \
+  }
+
+/* initialize the free RS_LINK pool */
+static void
+rslink_init(int nlinks)                 /* total number of RS_LINK available */
+{
+  int i;
+  struct RS_link *link;
+
+  rslink_free_list = NULL;
+  for (i=0; i<nlinks; i++)
+    {
+      link = calloc(1, sizeof(struct RS_link));
+      if (!link)
+        fatal("out of virtual memory");
+      link->next = rslink_free_list;
+      rslink_free_list = link;
+    }
+}
+
+/* service all functional unit release events, this function is called
+   once per cycle, and it is used to step the BUSY timers attached to each
+   functional unit in the function unit resource pool, as long as a functional
+   unit's BUSY count is > 0, it cannot be issued an operation */
+static void
+ruu_release_fu(void)
+{
+  int i;
+
+  /* walk all resource units, decrement busy counts by one */
+  for (i=0; i<fu_pool->num_resources; i++)
+    {
+      /* resource is released when BUSY hits zero */
+      if (fu_pool->resources[i].busy > 0)
+        fu_pool->resources[i].busy--;
+    }
+}
+
+
+/*
+ * the execution unit event queue implementation follows, the event queue
+ * indicates which instruction will complete next, the writeback handler
+ * drains this queue
+ */
+
+/* pending event queue, sorted from soonest to latest event (in time), NOTE:
+   RS_LINK nodes are used for the event queue list so that it need not be
+   updated during squash events */
+static struct RS_link *event_queue;
+
+/* initialize the event queue structures */
+static void
+eventq_init(void)
+{
+  event_queue = NULL;
+}
+
+/* dump the contents of the event queue */
+static void
+eventq_dump(FILE *stream)               /* output stream */
+{
+  struct RS_link *ev;
+
+  if (!stream)
+    stream = stderr;
+
+  fprintf(stream, "** event queue state **\n");
+
+  for (ev = event_queue; ev != NULL; ev = ev->next)
+    {
+      /* is event still valid? 
*/ + if (RSLINK_VALID(ev)) + { + struct RUU_station *rs = RSLINK_RS(ev); + + fprintf(stream, "idx: %2d: @ %.0f\n", + (int)(rs - (rs->in_LSQ ? LSQ : RUU)), (double)ev->x.when); + ruu_dumpent(rs, rs - (rs->in_LSQ ? LSQ : RUU), + stream, /* !header */FALSE); + } + } +} + +/* insert an event for RS into the event queue, event queue is sorted from + earliest to latest event, event and associated side-effects will be + apparent at the start of cycle WHEN */ +static void +eventq_queue_event(struct RUU_station *rs, tick_t when) +{ + struct RS_link *prev, *ev, *new_ev; + + if (rs->completed) + panic("event completed"); + + if (when <= sim_cycle) + panic("event occurred in the past"); + + /* get a free event record */ + RSLINK_NEW(new_ev, rs); + new_ev->x.when = when; + + /* locate insertion point */ + for (prev=NULL, ev=event_queue; + ev && ev->x.when < when; + prev=ev, ev=ev->next); + + if (prev) + { + /* insert middle or end */ + new_ev->next = prev->next; + prev->next = new_ev; + } + else + { + /* insert at beginning */ + new_ev->next = event_queue; + event_queue = new_ev; + } +} + +/* return the next event that has already occurred, returns NULL when no + remaining events or all remaining events are in the future */ +static struct RUU_station * +eventq_next_event(void) +{ + struct RS_link *ev; + + if (event_queue && event_queue->x.when <= sim_cycle) + { + /* unlink and return first event on priority list */ + ev = event_queue; + event_queue = event_queue->next; + + /* event still valid? */ + if (RSLINK_VALID(ev)) + { + struct RUU_station *rs = RSLINK_RS(ev); + + /* reclaim event record */ + RSLINK_FREE(ev); + + /* event is valid, return resv station */ + return rs; + } + else + { + /* reclaim event record */ + RSLINK_FREE(ev); + + /* receiving inst was squashed, return next event */ + return eventq_next_event(); + } + } + else + { + /* no event or no event is ready */ + return NULL; + } +} + + +/* + * the ready instruction queue implementation follows, the ready instruction + * queue indicates which instruction have all of there *register* dependencies + * satisfied, instruction will issue when 1) all memory dependencies for + * the instruction have been satisfied (see lsq_refresh() for details on how + * this is accomplished) and 2) resources are available; ready queue is fully + * constructed each cycle before any operation is issued from it -- this + * ensures that instruction issue priorities are properly observed; NOTE: + * RS_LINK nodes are used for the event queue list so that it need not be + * updated during squash events + */ + +/* the ready instruction queue */ +static struct RS_link *ready_queue; + +/* initialize the event queue structures */ +static void +readyq_init(void) +{ + ready_queue = NULL; +} + +/* dump the contents of the ready queue */ +static void +readyq_dump(FILE *stream) /* output stream */ +{ + struct RS_link *link; + + if (!stream) + stream = stderr; + + fprintf(stream, "** ready queue state **\n"); + + for (link = ready_queue; link != NULL; link = link->next) + { + /* is entry still valid? */ + if (RSLINK_VALID(link)) + { + struct RUU_station *rs = RSLINK_RS(link); + + ruu_dumpent(rs, rs - (rs->in_LSQ ? 
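/*
 * Standalone sketch of the sorted insertion performed by eventq_queue_event()
 * above: walk the list until the first entry with a later timestamp and link
 * the new event in front of it.  Simplified node type, no free-list reuse.
 */
#include <stdio.h>
#include <stdlib.h>

struct sk_ev { unsigned long when; struct sk_ev *next; };

static void sk_insert_sorted(struct sk_ev **q, unsigned long when)
{
  struct sk_ev *n = malloc(sizeof *n), *prev = NULL, *cur = *q;
  if (!n)
    exit(1);
  n->when = when;
  while (cur && cur->when < when) { prev = cur; cur = cur->next; }
  n->next = cur;
  if (prev) prev->next = n; else *q = n;
}

int main(void)
{
  struct sk_ev *q = NULL;
  sk_insert_sorted(&q, 40); sk_insert_sorted(&q, 10); sk_insert_sorted(&q, 25);
  for (struct sk_ev *e = q; e; e = e->next)
    printf("%lu ", e->when);            /* prints: 10 25 40 */
  printf("\n");
  return 0;
}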
LSQ : RUU), + stream, /* header */TRUE); + } + } +} + +/* insert ready node into the ready list using ready instruction scheduling + policy; currently the following scheduling policy is enforced: + + memory and long latency operands, and branch instructions first + + then + + all other instructions, oldest instructions first + + this policy works well because branches pass through the machine quicker + which works to reduce branch misprediction latencies, and very long latency + instructions (such loads and multiplies) get priority since they are very + likely on the program's critical path */ +static void +readyq_enqueue(struct RUU_station *rs) /* RS to enqueue */ +{ + struct RS_link *prev, *node, *new_node; + + /* node is now queued */ + if (rs->queued) + panic("node is already queued"); + rs->queued = TRUE; + + /* get a free ready list node */ + RSLINK_NEW(new_node, rs); + new_node->x.seq = rs->seq; + + /* locate insertion point */ + if (rs->in_LSQ || MD_OP_FLAGS(rs->op) & (F_LONGLAT|F_CTRL)) + { + /* insert loads/stores and long latency ops at the head of the queue */ + prev = NULL; + node = ready_queue; + } + else + { + /* otherwise insert in program order (earliest seq first) */ + for (prev=NULL, node=ready_queue; + node && node->x.seq < rs->seq; + prev=node, node=node->next); + } + + if (prev) + { + /* insert middle or end */ + new_node->next = prev->next; + prev->next = new_node; + } + else + { + /* insert at beginning */ + new_node->next = ready_queue; + ready_queue = new_node; + } +} + + +/* + * the create vector maps a logical register to a creator in the RUU (and + * specific output operand) or the architected register file (if RS_link + * is NULL) + */ + +/* an entry in the create vector */ +struct CV_link { + struct RUU_station *rs; /* creator's reservation station */ + int odep_num; /* specific output operand */ +}; + +/* a NULL create vector entry */ +static struct CV_link CVLINK_NULL = { NULL, 0 }; + +/* get a new create vector link */ +#define CVLINK_INIT(CV, RS,ONUM) ((CV).rs = (RS), (CV).odep_num = (ONUM)) + +/* size of the create vector (one entry per architected register) */ +#define CV_BMAP_SZ (BITMAP_SIZE(MD_TOTAL_REGS)) + +/* the create vector, NOTE: speculative copy on write storage provided + for fast recovery during wrong path execute (see tracer_recover() for + details on this process */ +static BITMAP_TYPE(MD_TOTAL_REGS, use_spec_cv); +static struct CV_link create_vector[MD_TOTAL_REGS]; +static struct CV_link spec_create_vector[MD_TOTAL_REGS]; + +/* these arrays shadow the create vector an indicate when a register was + last created */ +static tick_t create_vector_rt[MD_TOTAL_REGS]; +static tick_t spec_create_vector_rt[MD_TOTAL_REGS]; + +/* read a create vector entry */ +#define CREATE_VECTOR(N) (BITMAP_SET_P(use_spec_cv, CV_BMAP_SZ, (N))\ + ? spec_create_vector[N] \ + : create_vector[N]) + +/* read a create vector timestamp entry */ +#define CREATE_VECTOR_RT(N) (BITMAP_SET_P(use_spec_cv, CV_BMAP_SZ, (N))\ + ? spec_create_vector_rt[N] \ + : create_vector_rt[N]) + +/* set a create vector entry */ +#define SET_CREATE_VECTOR(N, L) (spec_mode \ + ? 
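/*
 * Sketch of the readyq_enqueue() policy above: loads/stores, long-latency
 * operations, and branches are pushed to the front of the ready list, while
 * everything else is inserted in program (sequence) order.  The flag values
 * below are toy stand-ins for MD_OP_FLAGS, not the machine definition.
 */
#include <stdio.h>

enum { SK_LONGLAT = 1, SK_CTRL = 2 };

static int sk_goes_to_head(int in_lsq, int flags)
{
  return in_lsq || (flags & (SK_LONGLAT | SK_CTRL));
}

int main(void)
{
  printf("%d %d %d\n",
         sk_goes_to_head(1, 0),         /* load/store entry: 1 (front of queue) */
         sk_goes_to_head(0, SK_CTRL),   /* branch: 1 (front of queue) */
         sk_goes_to_head(0, 0));        /* plain ALU op: 0 (ordered by seq) */
  return 0;
}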
(BITMAP_SET(use_spec_cv, CV_BMAP_SZ, (N)),\ + spec_create_vector[N] = (L)) \ + : (create_vector[N] = (L))) + +/* initialize the create vector */ +static void +cv_init(void) +{ + int i; + + /* initially all registers are valid in the architected register file, + i.e., the create vector entry is CVLINK_NULL */ + for (i=0; i < MD_TOTAL_REGS; i++) + { + create_vector[i] = CVLINK_NULL; + create_vector_rt[i] = 0; + spec_create_vector[i] = CVLINK_NULL; + spec_create_vector_rt[i] = 0; + } + + /* all create vector entries are non-speculative */ + BITMAP_CLEAR_MAP(use_spec_cv, CV_BMAP_SZ); +} + +/* dump the contents of the create vector */ +static void +cv_dump(FILE *stream) /* output stream */ +{ + int i; + struct CV_link ent; + + if (!stream) + stream = stderr; + + fprintf(stream, "** create vector state **\n"); + + for (i=0; i < MD_TOTAL_REGS; i++) + { + ent = CREATE_VECTOR(i); + if (!ent.rs) + fprintf(stream, "[cv%02d]: from architected reg file\n", i); + else + fprintf(stream, "[cv%02d]: from %s, idx: %d\n", + i, (ent.rs->in_LSQ ? "LSQ" : "RUU"), + (int)(ent.rs - (ent.rs->in_LSQ ? LSQ : RUU))); + } +} + + +/* + * RUU_COMMIT() - instruction retirement pipeline stage + */ + +/* this function commits the results of the oldest completed entries from the + RUU and LSQ to the architected reg file, stores in the LSQ will commit + their store data to the data cache at this point as well */ +static void +ruu_commit(void) +{ + int i, lat, events, committed = 0; + static counter_t sim_ret_insn = 0; + + /* all values must be retired to the architected reg file in program order */ + while (RUU_num > 0 && committed < ruu_commit_width) + { + struct RUU_station *rs = &(RUU[RUU_head]); + + if (!rs->completed) + { + /* at least RUU entry must be complete */ + break; + } + + /* default commit events */ + events = 0; + + /* load/stores must retire load/store queue entry as well */ + if (RUU[RUU_head].ea_comp) + { + /* load/store, retire head of LSQ as well */ + if (LSQ_num <= 0 || !LSQ[LSQ_head].in_LSQ) + panic("RUU out of sync with LSQ"); + + /* load/store operation must be complete */ + if (!LSQ[LSQ_head].completed) + { + /* load/store operation is not yet complete */ + break; + } + + if ((MD_OP_FLAGS(LSQ[LSQ_head].op) & (F_MEM|F_STORE)) + == (F_MEM|F_STORE)) + { + struct res_template *fu; + + + /* stores must retire their store value to the cache at commit, + try to get a store port (functional unit allocation) */ + fu = res_get(fu_pool, MD_OP_FUCLASS(LSQ[LSQ_head].op)); + if (fu) + { + /* reserve the functional unit */ + if (fu->master->busy) + panic("functional unit already in use"); + + /* schedule functional unit release event */ + fu->master->busy = fu->issuelat; + + /* go to the data cache */ + if (cache_dl1) + { + /* commit store value to D-cache */ + lat = + cache_access(cache_dl1, Write, (LSQ[LSQ_head].addr&~3), + NULL, 4, sim_cycle, NULL, NULL); + if (lat > cache_dl1_lat) + events |= PEV_CACHEMISS; + } + + /* all loads and stores must to access D-TLB */ + if (dtlb) + { + /* access the D-TLB */ + lat = + cache_access(dtlb, Read, (LSQ[LSQ_head].addr & ~3), + NULL, 4, sim_cycle, NULL, NULL); + if (lat > 1) + events |= PEV_TLBMISS; + } + } + else + { + /* no store ports left, cannot continue to commit insts */ + break; + } + } + + /* invalidate load/store operation instance */ + LSQ[LSQ_head].tag++; + sim_slip += (sim_cycle - LSQ[LSQ_head].slip); + + /* indicate to pipeline trace that this instruction retired */ + ptrace_newstage(LSQ[LSQ_head].ptrace_seq, PST_COMMIT, events); + 
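/*
 * Sketch of the in-order retirement rule used by ruu_commit() above: walk
 * from the head of the circular RUU and stop at the first entry that has not
 * completed, so no younger instruction retires past an older one.  Toy
 * completion flags only; no LSQ, store-port, or predictor-update handling.
 */
#include <stdio.h>

#define SK_RUU_SIZE 8

int main(void)
{
  int completed[SK_RUU_SIZE] = { 1, 1, 0, 1 };  /* entry 2 is still executing */
  int head = 0, num = 4, width = 4, done = 0;

  while (num > 0 && done < width && completed[head])
    {
      printf("commit entry %d\n", head);
      head = (head + 1) % SK_RUU_SIZE;
      num--; done++;
    }
  /* commits entries 0 and 1, then stalls on the incomplete entry 2 */
  return 0;
}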
ptrace_endinst(LSQ[LSQ_head].ptrace_seq); + + /* commit head of LSQ as well */ + LSQ_head = (LSQ_head + 1) % LSQ_size; + LSQ_num--; + } + + if (pred + && bpred_spec_update == spec_CT + && (MD_OP_FLAGS(rs->op) & F_CTRL)) + { + bpred_update(pred, + /* branch address */rs->PC, + /* actual target address */rs->next_PC, + /* taken? */rs->next_PC != (rs->PC + + sizeof(md_inst_t)), + /* pred taken? */rs->pred_PC != (rs->PC + + sizeof(md_inst_t)), + /* correct pred? */rs->pred_PC == rs->next_PC, + /* opcode */rs->op, + /* dir predictor update pointer */&rs->dir_update); + } + + /* invalidate RUU operation instance */ + RUU[RUU_head].tag++; + sim_slip += (sim_cycle - RUU[RUU_head].slip); + /* print retirement trace if in verbose mode */ + if (verbose) + { + sim_ret_insn++; + myfprintf(stderr, "%10n @ 0x%08p: ", sim_ret_insn, RUU[RUU_head].PC); + md_print_insn(RUU[RUU_head].IR, RUU[RUU_head].PC, stderr); + if (MD_OP_FLAGS(RUU[RUU_head].op) & F_MEM) + myfprintf(stderr, " mem: 0x%08p", RUU[RUU_head].addr); + fprintf(stderr, "\n"); + /* fflush(stderr); */ + } + + /* indicate to pipeline trace that this instruction retired */ + ptrace_newstage(RUU[RUU_head].ptrace_seq, PST_COMMIT, events); + ptrace_endinst(RUU[RUU_head].ptrace_seq); + + /* commit head entry of RUU */ + RUU_head = (RUU_head + 1) % RUU_size; + RUU_num--; + + /* one more instruction committed to architected state */ + committed++; + + for (i=0; iodep_list[i]) + panic ("retired instruction has odeps\n"); + } + } +} + + +/* + * RUU_RECOVER() - squash mispredicted microarchitecture state + */ + +/* recover processor microarchitecture state back to point of the + mis-predicted branch at RUU[BRANCH_INDEX] */ +static void +ruu_recover(int branch_index) /* index of mis-pred branch */ +{ + int i, RUU_index = RUU_tail, LSQ_index = LSQ_tail; + int RUU_prev_tail = RUU_tail, LSQ_prev_tail = LSQ_tail; + + /* recover from the tail of the RUU towards the head until the branch index + is reached, this direction ensures that the LSQ can be synchronized with + the RUU */ + + /* go to first element to squash */ + RUU_index = (RUU_index + (RUU_size-1)) % RUU_size; + LSQ_index = (LSQ_index + (LSQ_size-1)) % LSQ_size; + + /* traverse to older insts until the mispredicted branch is encountered */ + while (RUU_index != branch_index) + { + /* the RUU should not drain since the mispredicted branch will remain */ + if (!RUU_num) + panic("empty RUU"); + + /* should meet up with the tail first */ + if (RUU_index == RUU_head) + panic("RUU head and tail broken"); + + /* is this operation an effective addr calc for a load or store? */ + if (RUU[RUU_index].ea_comp) + { + /* should be at least one load or store in the LSQ */ + if (!LSQ_num) + panic("RUU and LSQ out of sync"); + + /* recover any resources consumed by the load or store operation */ + for (i=0; iqueued || !rs->issued || rs->completed) + panic("inst completed and !ready, !issued, or completed"); + + /* operation has completed */ + rs->completed = TRUE; + + /* does this operation reveal a mis-predicted branch? 
*/ + if (rs->recover_inst) + { + if (rs->in_LSQ) + panic("mis-predicted load or store?!?!?"); + + /* recover processor state and reinit fetch to correct path */ + ruu_recover(rs - RUU); + tracer_recover(); + bpred_recover(pred, rs->PC, rs->stack_recover_idx); + + /* stall fetch until I-fetch and I-decode recover */ + ruu_fetch_issue_delay = ruu_branch_penalty; + + /* continue writeback of the branch/control instruction */ + } + + /* if we speculatively update branch-predictor, do it here */ + if (pred + && bpred_spec_update == spec_WB + && !rs->in_LSQ + && (MD_OP_FLAGS(rs->op) & F_CTRL)) + { + bpred_update(pred, + /* branch address */rs->PC, + /* actual target address */rs->next_PC, + /* taken? */rs->next_PC != (rs->PC + + sizeof(md_inst_t)), + /* pred taken? */rs->pred_PC != (rs->PC + + sizeof(md_inst_t)), + /* correct pred? */rs->pred_PC == rs->next_PC, + /* opcode */rs->op, + /* dir predictor update pointer */&rs->dir_update); + } + + /* entered writeback stage, indicate in pipe trace */ + ptrace_newstage(rs->ptrace_seq, PST_WRITEBACK, + rs->recover_inst ? PEV_MPDETECT : 0); + + /* broadcast results to consuming operations, this is more efficiently + accomplished by walking the output dependency chains of the + completed instruction */ + for (i=0; ionames[i] != NA) + { + struct CV_link link; + struct RS_link *olink, *olink_next; + + if (rs->spec_mode) + { + /* update the speculative create vector, future operations + get value from later creator or architected reg file */ + link = spec_create_vector[rs->onames[i]]; + if (/* !NULL */link.rs + && /* refs RS */(link.rs == rs && link.odep_num == i)) + { + /* the result can now be read from a physical register, + indicate this as so */ + spec_create_vector[rs->onames[i]] = CVLINK_NULL; + spec_create_vector_rt[rs->onames[i]] = sim_cycle; + } + /* else, creator invalidated or there is another creator */ + } + else + { + /* update the non-speculative create vector, future + operations get value from later creator or architected + reg file */ + link = create_vector[rs->onames[i]]; + if (/* !NULL */link.rs + && /* refs RS */(link.rs == rs && link.odep_num == i)) + { + /* the result can now be read from a physical register, + indicate this as so */ + create_vector[rs->onames[i]] = CVLINK_NULL; + create_vector_rt[rs->onames[i]] = sim_cycle; + } + /* else, creator invalidated or there is another creator */ + } + + /* walk output list, queue up ready operations */ + for (olink=rs->odep_list[i]; olink; olink=olink_next) + { + if (RSLINK_VALID(olink)) + { + if (olink->rs->idep_ready[olink->x.opnum]) + panic("output dependence already satisfied"); + + /* input is now ready */ + olink->rs->idep_ready[olink->x.opnum] = TRUE; + + /* are all the register operands of target ready? */ + if (OPERANDS_READY(olink->rs)) + { + /* yes! 
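/*
 * Sketch of the wakeup step in ruu_writeback() above: each consumer on a
 * completing instruction's output-dependence chain has one input operand
 * marked ready, and becomes ready to issue once all of its inputs are ready.
 * Toy structures only, not the RS_link chains or create vector.
 */
#include <stdio.h>

#define SK_NIDEPS 3

struct sk_inst { int idep_ready[SK_NIDEPS]; };

static int sk_operands_ready(struct sk_inst *in)
{
  return in->idep_ready[0] && in->idep_ready[1] && in->idep_ready[2];
}

int main(void)
{
  struct sk_inst consumer = { { 1, 0, 1 } };    /* still waiting on operand 1 */

  /* producer completes and satisfies input operand #1 */
  consumer.idep_ready[1] = 1;

  if (sk_operands_ready(&consumer))
    printf("consumer is ready to issue\n");
  return 0;
}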
enqueue instruction as ready, NOTE: stores + complete at dispatch, so no need to enqueue + them */ + if (!olink->rs->in_LSQ + || ((MD_OP_FLAGS(olink->rs->op)&(F_MEM|F_STORE)) + == (F_MEM|F_STORE))) + readyq_enqueue(olink->rs); + /* else, ld op, issued when no mem conflict */ + } + } + + /* grab link to next element prior to free */ + olink_next = olink->next; + + /* free dependence link element */ + RSLINK_FREE(olink); + } + /* blow away the consuming op list */ + rs->odep_list[i] = NULL; + + } /* if not NA output */ + + } /* for all outputs */ + + } /* for all writeback events */ + +} + + +/* + * LSQ_REFRESH() - memory access dependence checker/scheduler + */ + +/* this function locates ready instructions whose memory dependencies have + been satisfied, this is accomplished by walking the LSQ for loads, looking + for blocking memory dependency condition (e.g., earlier store with an + unknown address) */ +#define MAX_STD_UNKNOWNS 64 +static void +lsq_refresh(void) +{ + int i, j, index, n_std_unknowns; + md_addr_t std_unknowns[MAX_STD_UNKNOWNS]; + + /* scan entire queue for ready loads: scan from oldest instruction + (head) until we reach the tail or an unresolved store, after which no + other instruction will become ready */ + for (i=0, index=LSQ_head, n_std_unknowns=0; + i < LSQ_num; + i++, index=(index + 1) % LSQ_size) + { + /* terminate search for ready loads after first unresolved store, + as no later load could be resolved in its presence */ + if (/* store? */ + (MD_OP_FLAGS(LSQ[index].op) & (F_MEM|F_STORE)) == (F_MEM|F_STORE)) + { + if (!STORE_ADDR_READY(&LSQ[index])) + { + /* FIXME: a later STD + STD known could hide the STA unknown */ + /* sta unknown, blocks all later loads, stop search */ + break; + } + else if (!OPERANDS_READY(&LSQ[index])) + { + /* sta known, but std unknown, may block a later store, record + this address for later referral, we use an array here because + for most simulations the number of entries to search will be + very small */ + if (n_std_unknowns == MAX_STD_UNKNOWNS) + fatal("STD unknown array overflow, increase MAX_STD_UNKNOWNS"); + std_unknowns[n_std_unknowns++] = LSQ[index].addr; + } + else /* STORE_ADDR_READY() && OPERANDS_READY() */ + { + /* a later STD known hides an earlier STD unknown */ + for (j=0; jnext; + + /* still valid? */ + if (RSLINK_VALID(node)) + { + struct RUU_station *rs = RSLINK_RS(node); + + /* issue operation, both reg and mem deps have been satisfied */ + if (!OPERANDS_READY(rs) || !rs->queued + || rs->issued || rs->completed) + panic("issued inst !ready, issued, or completed"); + + /* node is now un-queued */ + rs->queued = FALSE; + + if (rs->in_LSQ + && ((MD_OP_FLAGS(rs->op) & (F_MEM|F_STORE)) == (F_MEM|F_STORE))) + { + /* stores complete in effectively zero time, result is + written into the load/store queue, the actual store into + the memory system occurs when the instruction is retired + (see ruu_commit()) */ + rs->issued = TRUE; + rs->completed = TRUE; + if (rs->onames[0] || rs->onames[1]) + panic("store creates result"); + + if (rs->recover_inst) + panic("mis-predicted store"); + + /* entered execute stage, indicate in pipe trace */ + ptrace_newstage(rs->ptrace_seq, PST_WRITEBACK, 0); + + /* one more inst issued */ + n_issued++; + } + else + { + /* issue the instruction to a functional unit */ + if (MD_OP_FUCLASS(rs->op) != NA) + { + fu = res_get(fu_pool, MD_OP_FUCLASS(rs->op)); + if (fu) + { + /* got one! 
issue inst to functional unit */ + rs->issued = TRUE; + /* reserve the functional unit */ + if (fu->master->busy) + panic("functional unit already in use"); + + /* schedule functional unit release event */ + fu->master->busy = fu->issuelat; + + /* schedule a result writeback event */ + if (rs->in_LSQ + && ((MD_OP_FLAGS(rs->op) & (F_MEM|F_LOAD)) + == (F_MEM|F_LOAD))) + { + int events = 0; + + /* for loads, determine cache access latency: + first scan LSQ to see if a store forward is + possible, if not, access the data cache */ + load_lat = 0; + i = (rs - LSQ); + if (i != LSQ_head) + { + for (;;) + { + /* go to next earlier LSQ entry */ + i = (i + (LSQ_size-1)) % LSQ_size; + + /* FIXME: not dealing with partials! */ + if ((MD_OP_FLAGS(LSQ[i].op) & F_STORE) + && (LSQ[i].addr == rs->addr)) + { + /* hit in the LSQ */ + load_lat = 1; + break; + } + + /* scan finished? */ + if (i == LSQ_head) + break; + } + } + + /* was the value store forwared from the LSQ? */ + if (!load_lat) + { + int valid_addr = MD_VALID_ADDR(rs->addr); + + if (!spec_mode && !valid_addr) + sim_invalid_addrs++; + + /* no! go to the data cache if addr is valid */ + if (cache_dl1 && valid_addr) + { + /* access the cache if non-faulting */ + load_lat = + cache_access(cache_dl1, Read, + (rs->addr & ~3), NULL, 4, + sim_cycle, NULL, NULL); + if (load_lat > cache_dl1_lat) + events |= PEV_CACHEMISS; + } + else + { + /* no caches defined, just use op latency */ + load_lat = fu->oplat; + } + } + + /* all loads and stores must to access D-TLB */ + if (dtlb && MD_VALID_ADDR(rs->addr)) + { + /* access the D-DLB, NOTE: this code will + initiate speculative TLB misses */ + tlb_lat = + cache_access(dtlb, Read, (rs->addr & ~3), + NULL, 4, sim_cycle, NULL, NULL); + if (tlb_lat > 1) + events |= PEV_TLBMISS; + + /* D-cache/D-TLB accesses occur in parallel */ + load_lat = MAX(tlb_lat, load_lat); + } + + /* use computed cache access latency */ + eventq_queue_event(rs, sim_cycle + load_lat); + + /* entered execute stage, indicate in pipe trace */ + ptrace_newstage(rs->ptrace_seq, PST_EXECUTE, + ((rs->ea_comp ? PEV_AGEN : 0) + | events)); + } + else /* !load && !store */ + { + /* use deterministic functional unit latency */ + eventq_queue_event(rs, sim_cycle + fu->oplat); + + /* entered execute stage, indicate in pipe trace */ + ptrace_newstage(rs->ptrace_seq, PST_EXECUTE, + rs->ea_comp ? PEV_AGEN : 0); + } + + /* one more inst issued */ + n_issued++; + } + else /* no functional unit */ + { + /* insufficient functional unit resources, put operation + back onto the ready list, we'll try to issue it + again next cycle */ + readyq_enqueue(rs); + } + } + else /* does not require a functional unit! */ + { + /* FIXME: need better solution for these */ + /* the instruction does not need a functional unit */ + rs->issued = TRUE; + + /* schedule a result event */ + eventq_queue_event(rs, sim_cycle + 1); + + /* entered execute stage, indicate in pipe trace */ + ptrace_newstage(rs->ptrace_seq, PST_EXECUTE, + rs->ea_comp ? 
PEV_AGEN : 0); + + /* one more inst issued */ + n_issued++; + } + } /* !store */ + + } + /* else, RUU entry was squashed */ + + /* reclaim ready list entry, NOTE: this is done whether or not the + instruction issued, since the instruction was once again reinserted + into the ready queue if it did not issue, this ensures that the ready + queue is always properly sorted */ + RSLINK_FREE(node); + } + + /* put any instruction not issued back into the ready queue, go through + normal channels to ensure instruction stay ordered correctly */ + for (; node; node = next_node) + { + next_node = node->next; + + /* still valid? */ + if (RSLINK_VALID(node)) + { + struct RUU_station *rs = RSLINK_RS(node); + + /* node is now un-queued */ + rs->queued = FALSE; + + /* not issued, put operation back onto the ready list, we'll try to + issue it again next cycle */ + readyq_enqueue(rs); + } + /* else, RUU entry was squashed */ + + /* reclaim ready list entry, NOTE: this is done whether or not the + instruction issued, since the instruction was once again reinserted + into the ready queue if it did not issue, this ensures that the ready + queue is always properly sorted */ + RSLINK_FREE(node); + } +} + + +/* + * routines for generating on-the-fly instruction traces with support + * for control and data misspeculation modeling + */ + +/* integer register file */ +#define R_BMAP_SZ (BITMAP_SIZE(MD_NUM_IREGS)) +static BITMAP_TYPE(MD_NUM_IREGS, use_spec_R); +static md_gpr_t spec_regs_R; + +/* floating point register file */ +#define F_BMAP_SZ (BITMAP_SIZE(MD_NUM_FREGS)) +static BITMAP_TYPE(MD_NUM_FREGS, use_spec_F); +static md_fpr_t spec_regs_F; + +/* miscellaneous registers */ +#define C_BMAP_SZ (BITMAP_SIZE(MD_NUM_CREGS)) +static BITMAP_TYPE(MD_NUM_FREGS, use_spec_C); +static md_ctrl_t spec_regs_C; + +/* dump speculative register state */ +static void +rspec_dump(FILE *stream) /* output stream */ +{ + int i; + + if (!stream) + stream = stderr; + + fprintf(stream, "** speculative register contents **\n"); + + fprintf(stream, "spec_mode: %s\n", spec_mode ? 
"t" : "f"); + + /* dump speculative integer regs */ + for (i=0; i < MD_NUM_IREGS; i++) + { + if (BITMAP_SET_P(use_spec_R, R_BMAP_SZ, i)) + { + md_print_ireg(spec_regs_R, i, stream); + fprintf(stream, "\n"); + } + } + + /* dump speculative FP regs */ + for (i=0; i < MD_NUM_FREGS; i++) + { + if (BITMAP_SET_P(use_spec_F, F_BMAP_SZ, i)) + { + md_print_fpreg(spec_regs_F, i, stream); + fprintf(stream, "\n"); + } + } + + /* dump speculative CTRL regs */ + for (i=0; i < MD_NUM_CREGS; i++) + { + if (BITMAP_SET_P(use_spec_C, C_BMAP_SZ, i)) + { + md_print_creg(spec_regs_C, i, stream); + fprintf(stream, "\n"); + } + } +} + + +/* speculative memory hash table size, NOTE: this must be a power-of-two */ +#define STORE_HASH_SIZE 32 + +/* speculative memory hash table definition, accesses go through this hash + table when accessing memory in speculative mode, the hash table flush the + table when recovering from mispredicted branches */ +struct spec_mem_ent { + struct spec_mem_ent *next; /* ptr to next hash table bucket */ + md_addr_t addr; /* virtual address of spec state */ + unsigned int data[2]; /* spec buffer, up to 8 bytes */ +}; + +/* speculative memory hash table */ +static struct spec_mem_ent *store_htable[STORE_HASH_SIZE]; + +/* speculative memory hash table bucket free list */ +static struct spec_mem_ent *bucket_free_list = NULL; + + +/* program counter */ +static md_addr_t pred_PC; +static md_addr_t recover_PC; + +/* fetch unit next fetch address */ +static md_addr_t fetch_regs_PC; +static md_addr_t fetch_pred_PC; + +/* IFETCH -> DISPATCH instruction queue definition */ +struct fetch_rec { + md_inst_t IR; /* inst register */ + md_addr_t regs_PC, pred_PC; /* current PC, predicted next PC */ + struct bpred_update_t dir_update; /* bpred direction update info */ + int stack_recover_idx; /* branch predictor RSB index */ + unsigned int ptrace_seq; /* print trace sequence id */ +}; +static struct fetch_rec *fetch_data; /* IFETCH -> DISPATCH inst queue */ +static int fetch_num; /* num entries in IF -> DIS queue */ +static int fetch_tail, fetch_head; /* head and tail pointers of queue */ + +/* recover instruction trace generator state to precise state state immediately + before the first mis-predicted branch; this is accomplished by resetting + all register value copied-on-write bitmasks are reset, and the speculative + memory hash table is cleared */ +static void +tracer_recover(void) +{ + int i; + struct spec_mem_ent *ent, *ent_next; + + /* better be in mis-speculative trace generation mode */ + if (!spec_mode) + panic("cannot recover unless in speculative mode"); + + /* reset to non-speculative trace generation mode */ + spec_mode = FALSE; + + /* reset copied-on-write register bitmasks back to non-speculative state */ + BITMAP_CLEAR_MAP(use_spec_R, R_BMAP_SZ); + BITMAP_CLEAR_MAP(use_spec_F, F_BMAP_SZ); + BITMAP_CLEAR_MAP(use_spec_C, C_BMAP_SZ); + + /* reset memory state back to non-speculative state */ + /* FIXME: could version stamps be used here?!?!? 
*/
+  for (i=0; i<STORE_HASH_SIZE; i++)
+    {
+      /* release all hash table buckets back onto the free list */
+      for (ent=store_htable[i]; ent; ent=ent_next)
+        {
+          ent_next = ent->next;
+          ent->next = bucket_free_list;
+          bucket_free_list = ent;
+        }
+      store_htable[i] = NULL;
+    }
+
+  /* if pipetracing, indicate squash of instructions in the inst fetch queue */
+  if (ptrace_active)
+    {
+      while (fetch_num != 0)
+        {
+          /* squash the next instruction from the IFETCH -> DISPATCH queue */
+          ptrace_endinst(fetch_data[fetch_head].ptrace_seq);
+
+          /* consume instruction from IFETCH -> DISPATCH queue */
+          fetch_head = (fetch_head+1) & (ruu_ifq_size - 1);
+          fetch_num--;
+        }
+    }
+
+  /* reset IFETCH state */
+  fetch_num = 0;
+  fetch_tail = fetch_head = 0;
+  fetch_pred_PC = fetch_regs_PC = recover_PC;
+}
+
+/* initialize the speculative instruction state generator state */
+static void
+tracer_init(void)
+{
+  int i;
+
+  /* initially in non-speculative mode */
+  spec_mode = FALSE;
+
+  /* register state is from non-speculative state buffers */
+  BITMAP_CLEAR_MAP(use_spec_R, R_BMAP_SZ);
+  BITMAP_CLEAR_MAP(use_spec_F, F_BMAP_SZ);
+  BITMAP_CLEAR_MAP(use_spec_C, C_BMAP_SZ);
+
+  /* memory state is from non-speculative memory pages */
+  for (i=0; i<STORE_HASH_SIZE; i++)
+    store_htable[i] = NULL;
+}
+
+/* speculative memory hash table address hash function */
+#define HASH_ADDR(ADDR) \
+  ((((ADDR) >> 24)^((ADDR) >> 16)^((ADDR) >> 8)^(ADDR)) & (STORE_HASH_SIZE-1))
+
+/* this function provides a layer of mis-speculated state over the
+   non-speculative memory state, when in mis-speculation trace generation mode,
+   the simulator will call this function to access memory, instead of the
+   non-speculative memory access interfaces defined in memory.h; when storage
+   is written, an entry is allocated in the speculative memory hash table,
+   future reads and writes while in mis-speculative trace generation mode will
+   access this buffer instead of non-speculative memory state; when the trace
+   generator transitions back to non-speculative trace generation mode,
+   tracer_recover() clears this table, returns any access fault */
+static enum md_fault_type
+spec_mem_access(struct mem_t *mem,              /* memory space to access */
+                enum mem_cmd cmd,               /* Read or Write access cmd */
+                md_addr_t addr,                 /* virtual address of access */
+                void *p,                        /* input/output buffer */
+                int nbytes)                     /* number of bytes to access */
+{
+  int i, index;
+  struct spec_mem_ent *ent, *prev;
+
+  /* FIXME: partially overlapping writes are not combined... */
+  /* FIXME: partially overlapping reads are not handled correctly... */
+
+  /* check alignments, even speculative this test should always pass */
+  if ((nbytes & (nbytes-1)) != 0 || (addr & (nbytes-1)) != 0)
+    {
+      /* no can do, return zero result */
+      for (i=0; i < nbytes; i++)
+        ((char *)p)[i] = 0;
+
+      return md_fault_none;
+    }
+
+  /* check permissions */
+  if (!((addr >= ld_text_base && addr < (ld_text_base+ld_text_size)
+         && cmd == Read)
+        || MD_VALID_ADDR(addr)))
+    {
+      /* no can do, return zero result */
+      for (i=0; i < nbytes; i++)
+        ((char *)p)[i] = 0;
+
+      return md_fault_none;
+    }
+
+  /* has this memory state been copied on mis-speculative write? 
*/ + index = HASH_ADDR(addr); + for (prev=NULL,ent=store_htable[index]; ent; prev=ent,ent=ent->next) + { + if (ent->addr == addr) + { + /* reorder chains to speed access into hash table */ + if (prev != NULL) + { + /* not at head of list, relink the hash table entry at front */ + prev->next = ent->next; + ent->next = store_htable[index]; + store_htable[index] = ent; + } + break; + } + } + + /* no, if it is a write, allocate a hash table entry to hold the data */ + if (!ent && cmd == Write) + { + /* try to get an entry from the free list, if available */ + if (!bucket_free_list) + { + /* otherwise, call calloc() to get the needed storage */ + bucket_free_list = calloc(1, sizeof(struct spec_mem_ent)); + if (!bucket_free_list) + fatal("out of virtual memory"); + } + ent = bucket_free_list; + bucket_free_list = bucket_free_list->next; + + if (!bugcompat_mode) + { + /* insert into hash table */ + ent->next = store_htable[index]; + store_htable[index] = ent; + ent->addr = addr; + ent->data[0] = 0; ent->data[1] = 0; + } + } + + /* handle the read or write to speculative or non-speculative storage */ + switch (nbytes) + { + case 1: + if (cmd == Read) + { + if (ent) + { + /* read from mis-speculated state buffer */ + *((byte_t *)p) = *((byte_t *)(&ent->data[0])); + } + else + { + /* read from non-speculative memory state, don't allocate + memory pages with speculative loads */ + *((byte_t *)p) = MEM_READ_BYTE(mem, addr); + } + } + else + { + /* always write into mis-speculated state buffer */ + *((byte_t *)(&ent->data[0])) = *((byte_t *)p); + } + break; + case 2: + if (cmd == Read) + { + if (ent) + { + /* read from mis-speculated state buffer */ + *((half_t *)p) = *((half_t *)(&ent->data[0])); + } + else + { + /* read from non-speculative memory state, don't allocate + memory pages with speculative loads */ + *((half_t *)p) = MEM_READ_HALF(mem, addr); + } + } + else + { + /* always write into mis-speculated state buffer */ + *((half_t *)&ent->data[0]) = *((half_t *)p); + } + break; + case 4: + if (cmd == Read) + { + if (ent) + { + /* read from mis-speculated state buffer */ + *((word_t *)p) = *((word_t *)&ent->data[0]); + } + else + { + /* read from non-speculative memory state, don't allocate + memory pages with speculative loads */ + *((word_t *)p) = MEM_READ_WORD(mem, addr); + } + } + else + { + /* always write into mis-speculated state buffer */ + *((word_t *)&ent->data[0]) = *((word_t *)p); + } + break; + case 8: + if (cmd == Read) + { + if (ent) + { + /* read from mis-speculated state buffer */ + *((word_t *)p) = *((word_t *)&ent->data[0]); + *(((word_t *)p)+1) = *((word_t *)&ent->data[1]); + } + else + { + /* read from non-speculative memory state, don't allocate + memory pages with speculative loads */ + *((word_t *)p) = MEM_READ_WORD(mem, addr); + *(((word_t *)p)+1) = + MEM_READ_WORD(mem, addr + sizeof(word_t)); + } + } + else + { + /* always write into mis-speculated state buffer */ + *((word_t *)&ent->data[0]) = *((word_t *)p); + *((word_t *)&ent->data[1]) = *(((word_t *)p)+1); + } + break; + default: + panic("access size not supported in mis-speculative mode"); + } + + return md_fault_none; +} + +/* dump speculative memory state */ +static void +mspec_dump(FILE *stream) /* output stream */ +{ + int i; + struct spec_mem_ent *ent; + + if (!stream) + stream = stderr; + + fprintf(stream, "** speculative memory contents **\n"); + + fprintf(stream, "spec_mode: %s\n", spec_mode ? 
"t" : "f"); + + for (i=0; inext) + { + myfprintf(stream, "[0x%08p]: %12.0f/0x%08x:%08x\n", + ent->addr, (double)(*((double *)ent->data)), + *((unsigned int *)&ent->data[0]), + *(((unsigned int *)&ent->data[0]) + 1)); + } + } +} + +/* default memory state accessor, used by DLite */ +static char * /* err str, NULL for no err */ +simoo_mem_obj(struct mem_t *mem, /* memory space to access */ + int is_write, /* access type */ + md_addr_t addr, /* address to access */ + char *p, /* input/output buffer */ + int nbytes) /* size of access */ +{ + enum mem_cmd cmd; + + if (!is_write) + cmd = Read; + else + cmd = Write; + +#if 0 + char *errstr; + + errstr = mem_valid(cmd, addr, nbytes, /* declare */FALSE); + if (errstr) + return errstr; +#endif + + /* else, no error, access memory */ + if (spec_mode) + spec_mem_access(mem, cmd, addr, p, nbytes); + else + mem_access(mem, cmd, addr, p, nbytes); + + /* no error */ + return NULL; +} + + +/* + * RUU_DISPATCH() - decode instructions and allocate RUU and LSQ resources + */ + +/* link RS onto the output chain number of whichever operation will next + create the architected register value IDEP_NAME */ +static INLINE void +ruu_link_idep(struct RUU_station *rs, /* rs station to link */ + int idep_num, /* input dependence number */ + int idep_name) /* input register name */ +{ + struct CV_link head; + struct RS_link *link; + + /* any dependence? */ + if (idep_name == NA) + { + /* no input dependence for this input slot, mark operand as ready */ + rs->idep_ready[idep_num] = TRUE; + return; + } + + /* locate creator of operand */ + head = CREATE_VECTOR(idep_name); + + /* any creator? */ + if (!head.rs) + { + /* no active creator, use value available in architected reg file, + indicate the operand is ready for use */ + rs->idep_ready[idep_num] = TRUE; + return; + } + /* else, creator operation will make this value sometime in the future */ + + /* indicate value will be created sometime in the future, i.e., operand + is not yet ready for use */ + rs->idep_ready[idep_num] = FALSE; + + /* link onto creator's output list of dependant operand */ + RSLINK_NEW(link, rs); link->x.opnum = idep_num; + link->next = head.rs->odep_list[head.odep_num]; + head.rs->odep_list[head.odep_num] = link; +} + +/* make RS the creator of architected register ODEP_NAME */ +static INLINE void +ruu_install_odep(struct RUU_station *rs, /* creating RUU station */ + int odep_num, /* output operand number */ + int odep_name) /* output register name */ +{ + struct CV_link cv; + + /* any dependence? 
*/ + if (odep_name == NA) + { + /* no value created */ + rs->onames[odep_num] = NA; + return; + } + /* else, create a RS_NULL terminated output chain in create vector */ + + /* record output name, used to update create vector at completion */ + rs->onames[odep_num] = odep_name; + + /* initialize output chain to empty list */ + rs->odep_list[odep_num] = NULL; + + /* indicate this operation is latest creator of ODEP_NAME */ + CVLINK_INIT(cv, rs, odep_num); + SET_CREATE_VECTOR(odep_name, cv); +} + + +/* + * configure the instruction decode engine + */ + +#define DNA (0) + +#if defined(TARGET_PISA) + +/* general register dependence decoders */ +#define DGPR(N) (N) +#define DGPR_D(N) ((N) &~1) + +/* floating point register dependence decoders */ +#define DFPR_L(N) (((N)+32)&~1) +#define DFPR_F(N) (((N)+32)&~1) +#define DFPR_D(N) (((N)+32)&~1) + +/* miscellaneous register dependence decoders */ +#define DHI (0+32+32) +#define DLO (1+32+32) +#define DFCC (2+32+32) +#define DTMP (3+32+32) + +#elif defined(TARGET_ALPHA) + +/* general register dependence decoders, $r31 maps to DNA (0) */ +#define DGPR(N) (31 - (N)) /* was: (((N) == 31) ? DNA : (N)) */ + +/* floating point register dependence decoders */ +#define DFPR(N) (((N) == 31) ? DNA : ((N)+32)) + +/* miscellaneous register dependence decoders */ +#define DFPCR (0+32+32) +#define DUNIQ (1+32+32) +#define DTMP (2+32+32) + +#else +#error No ISA target defined... +#endif + + +/* + * configure the execution engine + */ + +/* next program counter */ +#define SET_NPC(EXPR) (regs.regs_NPC = (EXPR)) + +/* target program counter */ +#undef SET_TPC +#define SET_TPC(EXPR) (target_PC = (EXPR)) + +/* current program counter */ +#define CPC (regs.regs_PC) +#define SET_CPC(EXPR) (regs.regs_PC = (EXPR)) + +/* general purpose register accessors, NOTE: speculative copy on write storage + provided for fast recovery during wrong path execute (see tracer_recover() + for details on this process */ +#define GPR(N) (BITMAP_SET_P(use_spec_R, R_BMAP_SZ, (N))\ + ? spec_regs_R[N] \ + : regs.regs_R[N]) +#define SET_GPR(N,EXPR) (spec_mode \ + ? ((spec_regs_R[N] = (EXPR)), \ + BITMAP_SET(use_spec_R, R_BMAP_SZ, (N)),\ + spec_regs_R[N]) \ + : (regs.regs_R[N] = (EXPR))) + +#if defined(TARGET_PISA) + +/* floating point register accessors, NOTE: speculative copy on write storage + provided for fast recovery during wrong path execute (see tracer_recover() + for details on this process */ +#define FPR_L(N) (BITMAP_SET_P(use_spec_F, F_BMAP_SZ, ((N)&~1))\ + ? spec_regs_F.l[(N)] \ + : regs.regs_F.l[(N)]) +#define SET_FPR_L(N,EXPR) (spec_mode \ + ? ((spec_regs_F.l[(N)] = (EXPR)), \ + BITMAP_SET(use_spec_F,F_BMAP_SZ,((N)&~1)),\ + spec_regs_F.l[(N)]) \ + : (regs.regs_F.l[(N)] = (EXPR))) +#define FPR_F(N) (BITMAP_SET_P(use_spec_F, F_BMAP_SZ, ((N)&~1))\ + ? spec_regs_F.f[(N)] \ + : regs.regs_F.f[(N)]) +#define SET_FPR_F(N,EXPR) (spec_mode \ + ? ((spec_regs_F.f[(N)] = (EXPR)), \ + BITMAP_SET(use_spec_F,F_BMAP_SZ,((N)&~1)),\ + spec_regs_F.f[(N)]) \ + : (regs.regs_F.f[(N)] = (EXPR))) +#define FPR_D(N) (BITMAP_SET_P(use_spec_F, F_BMAP_SZ, ((N)&~1))\ + ? spec_regs_F.d[(N) >> 1] \ + : regs.regs_F.d[(N) >> 1]) +#define SET_FPR_D(N,EXPR) (spec_mode \ + ? 
((spec_regs_F.d[(N) >> 1] = (EXPR)), \ + BITMAP_SET(use_spec_F,F_BMAP_SZ,((N)&~1)),\ + spec_regs_F.d[(N) >> 1]) \ + : (regs.regs_F.d[(N) >> 1] = (EXPR))) + +/* miscellanous register accessors, NOTE: speculative copy on write storage + provided for fast recovery during wrong path execute (see tracer_recover() + for details on this process */ +#define HI (BITMAP_SET_P(use_spec_C, C_BMAP_SZ, /*hi*/0)\ + ? spec_regs_C.hi \ + : regs.regs_C.hi) +#define SET_HI(EXPR) (spec_mode \ + ? ((spec_regs_C.hi = (EXPR)), \ + BITMAP_SET(use_spec_C, C_BMAP_SZ,/*hi*/0),\ + spec_regs_C.hi) \ + : (regs.regs_C.hi = (EXPR))) +#define LO (BITMAP_SET_P(use_spec_C, C_BMAP_SZ, /*lo*/1)\ + ? spec_regs_C.lo \ + : regs.regs_C.lo) +#define SET_LO(EXPR) (spec_mode \ + ? ((spec_regs_C.lo = (EXPR)), \ + BITMAP_SET(use_spec_C, C_BMAP_SZ,/*lo*/1),\ + spec_regs_C.lo) \ + : (regs.regs_C.lo = (EXPR))) +#define FCC (BITMAP_SET_P(use_spec_C, C_BMAP_SZ,/*fcc*/2)\ + ? spec_regs_C.fcc \ + : regs.regs_C.fcc) +#define SET_FCC(EXPR) (spec_mode \ + ? ((spec_regs_C.fcc = (EXPR)), \ + BITMAP_SET(use_spec_C,C_BMAP_SZ,/*fcc*/2),\ + spec_regs_C.fcc) \ + : (regs.regs_C.fcc = (EXPR))) + +#elif defined(TARGET_ALPHA) + +/* floating point register accessors, NOTE: speculative copy on write storage + provided for fast recovery during wrong path execute (see tracer_recover() + for details on this process */ +#define FPR_Q(N) (BITMAP_SET_P(use_spec_F, F_BMAP_SZ, (N))\ + ? spec_regs_F.q[(N)] \ + : regs.regs_F.q[(N)]) +#define SET_FPR_Q(N,EXPR) (spec_mode \ + ? ((spec_regs_F.q[(N)] = (EXPR)), \ + BITMAP_SET(use_spec_F,F_BMAP_SZ, (N)),\ + spec_regs_F.q[(N)]) \ + : (regs.regs_F.q[(N)] = (EXPR))) +#define FPR(N) (BITMAP_SET_P(use_spec_F, F_BMAP_SZ, (N))\ + ? spec_regs_F.d[(N)] \ + : regs.regs_F.d[(N)]) +#define SET_FPR(N,EXPR) (spec_mode \ + ? ((spec_regs_F.d[(N)] = (EXPR)), \ + BITMAP_SET(use_spec_F,F_BMAP_SZ, (N)),\ + spec_regs_F.d[(N)]) \ + : (regs.regs_F.d[(N)] = (EXPR))) + +/* miscellanous register accessors, NOTE: speculative copy on write storage + provided for fast recovery during wrong path execute (see tracer_recover() + for details on this process */ +#define FPCR (BITMAP_SET_P(use_spec_C, C_BMAP_SZ,/*fpcr*/0)\ + ? spec_regs_C.fpcr \ + : regs.regs_C.fpcr) +#define SET_FPCR(EXPR) (spec_mode \ + ? ((spec_regs_C.fpcr = (EXPR)), \ + BITMAP_SET(use_spec_C,C_BMAP_SZ,/*fpcr*/0),\ + spec_regs_C.fpcr) \ + : (regs.regs_C.fpcr = (EXPR))) +#define UNIQ (BITMAP_SET_P(use_spec_C, C_BMAP_SZ,/*uniq*/1)\ + ? spec_regs_C.uniq \ + : regs.regs_C.uniq) +#define SET_UNIQ(EXPR) (spec_mode \ + ? ((spec_regs_C.uniq = (EXPR)), \ + BITMAP_SET(use_spec_C,C_BMAP_SZ,/*uniq*/1),\ + spec_regs_C.uniq) \ + : (regs.regs_C.uniq = (EXPR))) +#define FCC (BITMAP_SET_P(use_spec_C, C_BMAP_SZ,/*fcc*/2)\ + ? spec_regs_C.fcc \ + : regs.regs_C.fcc) +#define SET_FCC(EXPR) (spec_mode \ + ? ((spec_regs_C.fcc = (EXPR)), \ + BITMAP_SET(use_spec_C,C_BMAP_SZ,/*fcc*/1),\ + spec_regs_C.fcc) \ + : (regs.regs_C.fcc = (EXPR))) + +#else +#error No ISA target defined... +#endif + +/* precise architected memory state accessor macros, NOTE: speculative copy on + write storage provided for fast recovery during wrong path execute (see + tracer_recover() for details on this process */ +#define __READ_SPECMEM(SRC, SRC_V, FAULT) \ + (addr = (SRC), \ + (spec_mode \ + ? 
((FAULT) = spec_mem_access(mem, Read, addr, &SRC_V, sizeof(SRC_V)))\ + : ((FAULT) = mem_access(mem, Read, addr, &SRC_V, sizeof(SRC_V)))), \ + SRC_V) + +#define READ_BYTE(SRC, FAULT) \ + __READ_SPECMEM((SRC), temp_byte, (FAULT)) +#define READ_HALF(SRC, FAULT) \ + MD_SWAPH(__READ_SPECMEM((SRC), temp_half, (FAULT))) +#define READ_WORD(SRC, FAULT) \ + MD_SWAPW(__READ_SPECMEM((SRC), temp_word, (FAULT))) +#ifdef HOST_HAS_QWORD +#define READ_QWORD(SRC, FAULT) \ + MD_SWAPQ(__READ_SPECMEM((SRC), temp_qword, (FAULT))) +#endif /* HOST_HAS_QWORD */ + + +#define __WRITE_SPECMEM(SRC, DST, DST_V, FAULT) \ + (DST_V = (SRC), addr = (DST), \ + (spec_mode \ + ? ((FAULT) = spec_mem_access(mem, Write, addr, &DST_V, sizeof(DST_V)))\ + : ((FAULT) = mem_access(mem, Write, addr, &DST_V, sizeof(DST_V))))) + +#define WRITE_BYTE(SRC, DST, FAULT) \ + __WRITE_SPECMEM((SRC), (DST), temp_byte, (FAULT)) +#define WRITE_HALF(SRC, DST, FAULT) \ + __WRITE_SPECMEM(MD_SWAPH(SRC), (DST), temp_half, (FAULT)) +#define WRITE_WORD(SRC, DST, FAULT) \ + __WRITE_SPECMEM(MD_SWAPW(SRC), (DST), temp_word, (FAULT)) +#ifdef HOST_HAS_QWORD +#define WRITE_QWORD(SRC, DST, FAULT) \ + __WRITE_SPECMEM(MD_SWAPQ(SRC), (DST), temp_qword, (FAULT)) +#endif /* HOST_HAS_QWORD */ + +/* system call handler macro */ +#define SYSCALL(INST) \ + (/* only execute system calls in non-speculative mode */ \ + (spec_mode ? panic("speculative syscall") : (void) 0), \ + sys_syscall(®s, mem_access, mem, INST, TRUE)) + +/* default register state accessor, used by DLite */ +static char * /* err str, NULL for no err */ +simoo_reg_obj(struct regs_t *xregs, /* registers to access */ + int is_write, /* access type */ + enum md_reg_type rt, /* reg bank to probe */ + int reg, /* register number */ + struct eval_value_t *val) /* input, output */ +{ + switch (rt) + { + case rt_gpr: + if (reg < 0 || reg >= MD_NUM_IREGS) + return "register number out of range"; + + if (!is_write) + { + val->type = et_uint; + val->value.as_uint = GPR(reg); + } + else + SET_GPR(reg, eval_as_uint(*val)); + break; + + case rt_lpr: + if (reg < 0 || reg >= MD_NUM_FREGS) + return "register number out of range"; + + /* FIXME: this is not portable... */ + abort(); +#if 0 + if (!is_write) + { + val->type = et_uint; + val->value.as_uint = FPR_L(reg); + } + else + SET_FPR_L(reg, eval_as_uint(*val)); +#endif + break; + + case rt_fpr: + /* FIXME: this is not portable... */ + abort(); +#if 0 + if (!is_write) + val->value.as_float = FPR_F(reg); + else + SET_FPR_F(reg, val->value.as_float); +#endif + break; + + case rt_dpr: + /* FIXME: this is not portable... */ + abort(); +#if 0 + /* 1/2 as many regs in this mode */ + if (reg < 0 || reg >= MD_NUM_REGS/2) + return "register number out of range"; + + if (at == at_read) + val->as_double = FPR_D(reg * 2); + else + SET_FPR_D(reg * 2, val->as_double); +#endif + break; + + /* FIXME: this is not portable... 
*/ +#if 0 + abort(); + case rt_hi: + if (at == at_read) + val->as_word = HI; + else + SET_HI(val->as_word); + break; + case rt_lo: + if (at == at_read) + val->as_word = LO; + else + SET_LO(val->as_word); + break; + case rt_FCC: + if (at == at_read) + val->as_condition = FCC; + else + SET_FCC(val->as_condition); + break; +#endif + + case rt_PC: + if (!is_write) + { + val->type = et_addr; + val->value.as_addr = regs.regs_PC; + } + else + regs.regs_PC = eval_as_addr(*val); + break; + + case rt_NPC: + if (!is_write) + { + val->type = et_addr; + val->value.as_addr = regs.regs_NPC; + } + else + regs.regs_NPC = eval_as_addr(*val); + break; + + default: + panic("bogus register bank"); + } + + /* no error */ + return NULL; +} + +/* the last operation that ruu_dispatch() attempted to dispatch, for + implementing in-order issue */ +static struct RS_link last_op = RSLINK_NULL_DATA; + +/* dispatch instructions from the IFETCH -> DISPATCH queue: instructions are + first decoded, then they allocated RUU (and LSQ for load/stores) resources + and input and output dependence chains are updated accordingly */ +static void +ruu_dispatch(void) +{ + int i; + int n_dispatched; /* total insts dispatched */ + md_inst_t inst; /* actual instruction bits */ + enum md_opcode op; /* decoded opcode enum */ + int out1, out2, in1, in2, in3; /* output/input register names */ + md_addr_t target_PC; /* actual next/target PC address */ + md_addr_t addr; /* effective address, if load/store */ + struct RUU_station *rs; /* RUU station being allocated */ + struct RUU_station *lsq; /* LSQ station for ld/st's */ + struct bpred_update_t *dir_update_ptr;/* branch predictor dir update ptr */ + int stack_recover_idx; /* bpred retstack recovery index */ + unsigned int pseq; /* pipetrace sequence number */ + int is_write; /* store? */ + int made_check; /* used to ensure DLite entry */ + int br_taken, br_pred_taken; /* if br, taken? predicted taken? */ + int fetch_redirected = FALSE; + byte_t temp_byte = 0; /* temp variable for spec mem access */ + half_t temp_half = 0; /* " ditto " */ + word_t temp_word = 0; /* " ditto " */ +#ifdef HOST_HAS_QWORD + qword_t temp_qword = 0; /* " ditto " */ +#endif /* HOST_HAS_QWORD */ + enum md_fault_type fault; + + made_check = FALSE; + n_dispatched = 0; + while (/* instruction decode B/W left? */ + n_dispatched < (ruu_decode_width * fetch_speed) + /* RUU and LSQ not full? */ + && RUU_num < RUU_size && LSQ_num < LSQ_size + /* insts still available from fetch unit? 
*/ + && fetch_num != 0 + /* on an acceptable trace path */ + && (ruu_include_spec || !spec_mode)) + { + /* if issuing in-order, block until last op issues if inorder issue */ + if (ruu_inorder_issue + && (last_op.rs && RSLINK_VALID(&last_op) + && !OPERANDS_READY(last_op.rs))) + { + /* stall until last operation is ready to issue */ + break; + } + + /* get the next instruction from the IFETCH -> DISPATCH queue */ + inst = fetch_data[fetch_head].IR; + regs.regs_PC = fetch_data[fetch_head].regs_PC; + pred_PC = fetch_data[fetch_head].pred_PC; + dir_update_ptr = &(fetch_data[fetch_head].dir_update); + stack_recover_idx = fetch_data[fetch_head].stack_recover_idx; + pseq = fetch_data[fetch_head].ptrace_seq; + + /* decode the inst */ + MD_SET_OPCODE(op, inst); + + /* compute default next PC */ + regs.regs_NPC = regs.regs_PC + sizeof(md_inst_t); + + /* drain RUU for TRAPs and system calls */ + if (MD_OP_FLAGS(op) & F_TRAP) + { + if (RUU_num != 0) + break; + + /* else, syscall is only instruction in the machine, at this + point we should not be in (mis-)speculative mode */ + if (spec_mode) + panic("drained and speculative"); + } + + /* maintain $r0 semantics (in spec and non-spec space) */ + regs.regs_R[MD_REG_ZERO] = 0; spec_regs_R[MD_REG_ZERO] = 0; +#ifdef TARGET_ALPHA + regs.regs_F.d[MD_REG_ZERO] = 0.0; spec_regs_F.d[MD_REG_ZERO] = 0.0; +#endif /* TARGET_ALPHA */ + + if (!spec_mode) + { + /* one more non-speculative instruction executed */ + sim_num_insn++; + } + + /* default effective address (none) and access */ + addr = 0; is_write = FALSE; + + /* set default fault - none */ + fault = md_fault_none; + + /* more decoding and execution */ + switch (op) + { +#define DEFINST(OP,MSK,NAME,OPFORM,RES,CLASS,O1,O2,I1,I2,I3) \ + case OP: \ + /* compute output/input dependencies to out1-2 and in1-3 */ \ + out1 = O1; out2 = O2; \ + in1 = I1; in2 = I2; in3 = I3; \ + /* execute the instruction */ \ + SYMCAT(OP,_IMPL); \ + break; +#define DEFLINK(OP,MSK,NAME,MASK,SHIFT) \ + case OP: \ + /* could speculatively decode a bogus inst, convert to NOP */ \ + op = MD_NOP_OP; \ + /* compute output/input dependencies to out1-2 and in1-3 */ \ + out1 = NA; out2 = NA; \ + in1 = NA; in2 = NA; in3 = NA; \ + /* no EXPR */ \ + break; +#define CONNECT(OP) /* nada... */ + /* the following macro wraps the instruction fault declaration macro + with a test to see if the trace generator is in non-speculative + mode, if so the instruction fault is declared, otherwise, the + error is shunted because instruction faults need to be masked on + the mis-speculated instruction paths */ +#define DECLARE_FAULT(FAULT) \ + { \ + if (!spec_mode) \ + fault = (FAULT); \ + /* else, spec fault, ignore it, always terminate exec... 
*/ \ + break; \ + } +#include "machine.def" + default: + /* can speculatively decode a bogus inst, convert to a NOP */ + op = MD_NOP_OP; + /* compute output/input dependencies to out1-2 and in1-3 */ \ + out1 = NA; out2 = NA; + in1 = NA; in2 = NA; in3 = NA; + /* no EXPR */ + } + /* operation sets next PC */ + + /* print retirement trace if in verbose mode */ + if (!spec_mode && verbose) + { + myfprintf(stderr, "++ %10n [xor: 0x%08x] {%d} @ 0x%08p: ", + sim_num_insn, md_xor_regs(®s), + inst_seq+1, regs.regs_PC); + md_print_insn(inst, regs.regs_PC, stderr); + fprintf(stderr, "\n"); + /* fflush(stderr); */ + } + + if (fault != md_fault_none) + fatal("non-speculative fault (%d) detected @ 0x%08p", + fault, regs.regs_PC); + + /* update memory access stats */ + if (MD_OP_FLAGS(op) & F_MEM) + { + sim_total_refs++; + if (!spec_mode) + sim_num_refs++; + + if (MD_OP_FLAGS(op) & F_STORE) + is_write = TRUE; + else + { + sim_total_loads++; + if (!spec_mode) + sim_num_loads++; + } + } + + br_taken = (regs.regs_NPC != (regs.regs_PC + sizeof(md_inst_t))); + br_pred_taken = (pred_PC != (regs.regs_PC + sizeof(md_inst_t))); + + if ((pred_PC != regs.regs_NPC && pred_perfect) + || ((MD_OP_FLAGS(op) & (F_CTRL|F_DIRJMP)) == (F_CTRL|F_DIRJMP) + && target_PC != pred_PC && br_pred_taken)) + { + /* Either 1) we're simulating perfect prediction and are in a + mis-predict state and need to patch up, or 2) We're not simulating + perfect prediction, we've predicted the branch taken, but our + predicted target doesn't match the computed target (i.e., + mis-fetch). Just update the PC values and do a fetch squash. + This is just like calling fetch_squash() except we pre-anticipate + the updates to the fetch values at the end of this function. If + case #2, also charge a mispredict penalty for redirecting fetch */ + fetch_pred_PC = fetch_regs_PC = regs.regs_NPC; + /* was: if (pred_perfect) */ + if (pred_perfect) + pred_PC = regs.regs_NPC; + + fetch_head = (ruu_ifq_size-1); + fetch_num = 1; + fetch_tail = 0; + + if (!pred_perfect) + ruu_fetch_issue_delay = ruu_branch_penalty; + + fetch_redirected = TRUE; + } + + /* is this a NOP */ + if (op != MD_NOP_OP) + { + /* for load/stores: + idep #0 - store operand (value that is store'ed) + idep #1, #2 - eff addr computation inputs (addr of access) + + resulting RUU/LSQ operation pair: + RUU (effective address computation operation): + idep #0, #1 - eff addr computation inputs (addr of access) + LSQ (memory access operation): + idep #0 - operand input (value that is store'd) + idep #1 - eff addr computation result (from RUU op) + + effective address computation is transfered via the reserved + name DTMP + */ + + /* fill in RUU reservation station */ + rs = &RUU[RUU_tail]; + rs->slip = sim_cycle - 1; + rs->IR = inst; + rs->op = op; + rs->PC = regs.regs_PC; + rs->next_PC = regs.regs_NPC; rs->pred_PC = pred_PC; + rs->in_LSQ = FALSE; + rs->ea_comp = FALSE; + rs->recover_inst = FALSE; + rs->dir_update = *dir_update_ptr; + rs->stack_recover_idx = stack_recover_idx; + rs->spec_mode = spec_mode; + rs->addr = 0; + /* rs->tag is already set */ + rs->seq = ++inst_seq; + rs->queued = rs->issued = rs->completed = FALSE; + rs->ptrace_seq = pseq; + + /* split ld/st's into two operations: eff addr comp + mem access */ + if (MD_OP_FLAGS(op) & F_MEM) + { + /* convert RUU operation from ld/st to an add (eff addr comp) */ + rs->op = MD_AGEN_OP; + rs->ea_comp = TRUE; + + /* fill in LSQ reservation station */ + lsq = &LSQ[LSQ_tail]; + lsq->slip = sim_cycle - 1; + lsq->IR = inst; + lsq->op = op; + lsq->PC = 
regs.regs_PC; + lsq->next_PC = regs.regs_NPC; lsq->pred_PC = pred_PC; + lsq->in_LSQ = TRUE; + lsq->ea_comp = FALSE; + lsq->recover_inst = FALSE; + lsq->dir_update.pdir1 = lsq->dir_update.pdir2 = NULL; + lsq->dir_update.pmeta = NULL; + lsq->stack_recover_idx = 0; + lsq->spec_mode = spec_mode; + lsq->addr = addr; + /* lsq->tag is already set */ + lsq->seq = ++inst_seq; + lsq->queued = lsq->issued = lsq->completed = FALSE; + lsq->ptrace_seq = ptrace_seq++; + + /* pipetrace this uop */ + ptrace_newuop(lsq->ptrace_seq, "internal ld/st", lsq->PC, 0); + ptrace_newstage(lsq->ptrace_seq, PST_DISPATCH, 0); + + /* link eff addr computation onto operand's output chains */ + ruu_link_idep(rs, /* idep_ready[] index */0, NA); + ruu_link_idep(rs, /* idep_ready[] index */1, in2); + ruu_link_idep(rs, /* idep_ready[] index */2, in3); + + /* install output after inputs to prevent self reference */ + ruu_install_odep(rs, /* odep_list[] index */0, DTMP); + ruu_install_odep(rs, /* odep_list[] index */1, NA); + + /* link memory access onto output chain of eff addr operation */ + ruu_link_idep(lsq, + /* idep_ready[] index */STORE_OP_INDEX/* 0 */, + in1); + ruu_link_idep(lsq, + /* idep_ready[] index */STORE_ADDR_INDEX/* 1 */, + DTMP); + ruu_link_idep(lsq, /* idep_ready[] index */2, NA); + + /* install output after inputs to prevent self reference */ + ruu_install_odep(lsq, /* odep_list[] index */0, out1); + ruu_install_odep(lsq, /* odep_list[] index */1, out2); + + /* install operation in the RUU and LSQ */ + n_dispatched++; + RUU_tail = (RUU_tail + 1) % RUU_size; + RUU_num++; + LSQ_tail = (LSQ_tail + 1) % LSQ_size; + LSQ_num++; + + if (OPERANDS_READY(rs)) + { + /* eff addr computation ready, queue it on ready list */ + readyq_enqueue(rs); + } + /* issue may continue when the load/store is issued */ + RSLINK_INIT(last_op, lsq); + + /* issue stores only, loads are issued by lsq_refresh() */ + if (((MD_OP_FLAGS(op) & (F_MEM|F_STORE)) == (F_MEM|F_STORE)) + && OPERANDS_READY(lsq)) + { + /* panic("store immediately ready"); */ + /* put operation on ready list, ruu_issue() issue it later */ + readyq_enqueue(lsq); + } + } + else /* !(MD_OP_FLAGS(op) & F_MEM) */ + { + /* link onto producing operation */ + ruu_link_idep(rs, /* idep_ready[] index */0, in1); + ruu_link_idep(rs, /* idep_ready[] index */1, in2); + ruu_link_idep(rs, /* idep_ready[] index */2, in3); + + /* install output after inputs to prevent self reference */ + ruu_install_odep(rs, /* odep_list[] index */0, out1); + ruu_install_odep(rs, /* odep_list[] index */1, out2); + + /* install operation in the RUU */ + n_dispatched++; + RUU_tail = (RUU_tail + 1) % RUU_size; + RUU_num++; + + /* issue op if all its reg operands are ready (no mem input) */ + if (OPERANDS_READY(rs)) + { + /* put operation on ready list, ruu_issue() issue it later */ + readyq_enqueue(rs); + /* issue may continue */ + last_op = RSLINK_NULL; + } + else + { + /* could not issue this inst, stall issue until we can */ + RSLINK_INIT(last_op, rs); + } + } + } + else + { + /* this is a NOP, no need to update RUU/LSQ state */ + rs = NULL; + } + + /* one more instruction executed, speculative or otherwise */ + sim_total_insn++; + if (MD_OP_FLAGS(op) & F_CTRL) + sim_total_branches++; + + if (!spec_mode) + { +#if 0 /* moved above for EIO trace file support */ + /* one more non-speculative instruction executed */ + sim_num_insn++; +#endif + + /* if this is a branching instruction update BTB, i.e., only + non-speculative state is committed into the BTB */ + if (MD_OP_FLAGS(op) & F_CTRL) + { + 
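+              /* count a non-speculative (correct-path) branch; when the
+                 predictor update policy is spec_ID the branch predictor is
+                 trained here at decode/dispatch time, whereas the spec_WB
+                 and spec_CT policies train it via the bpred_update() calls
+                 in ruu_writeback() and ruu_commit() respectively */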
sim_num_branches++;
+              if (pred && bpred_spec_update == spec_ID)
+                {
+                  bpred_update(pred,
+                               /* branch address */regs.regs_PC,
+                               /* actual target address */regs.regs_NPC,
+                               /* taken? */regs.regs_NPC != (regs.regs_PC +
+                                                             sizeof(md_inst_t)),
+                               /* pred taken? */pred_PC != (regs.regs_PC +
+                                                            sizeof(md_inst_t)),
+                               /* correct pred? */pred_PC == regs.regs_NPC,
+                               /* opcode */op,
+                               /* predictor update ptr */&rs->dir_update);
+                }
+            }
+
+          /* is the trace generator transitioning into mis-speculation mode? */
+          if (pred_PC != regs.regs_NPC && !fetch_redirected)
+            {
+              /* entering mis-speculation mode, indicate this and save PC */
+              spec_mode = TRUE;
+              rs->recover_inst = TRUE;
+              recover_PC = regs.regs_NPC;
+            }
+        }
+
+      /* entered decode/allocate stage, indicate in pipe trace */
+      ptrace_newstage(pseq, PST_DISPATCH,
+                      (pred_PC != regs.regs_NPC) ? PEV_MPOCCURED : 0);
+      if (op == MD_NOP_OP)
+        {
+          /* end of the line */
+          ptrace_endinst(pseq);
+        }
+
+      /* update any stats tracked by PC */
+      for (i=0; i<pcstat_nelt; i++)
+        {
+          counter_t newval;
+          int delta;
+
+          /* check if any tracked stats changed */
+          newval = STATVAL(pcstat_stats[i]);
+          delta = newval - pcstat_lastvals[i];
+          if (delta != 0)
+            {
+              stat_add_samples(pcstat_sdists[i], regs.regs_PC, delta);
+              pcstat_lastvals[i] = newval;
+            }
+        }
+
+      /* consume instruction from IFETCH -> DISPATCH queue */
+      fetch_head = (fetch_head+1) & (ruu_ifq_size - 1);
+      fetch_num--;
+
+      /* check for DLite debugger entry condition */
+      made_check = TRUE;
+      if (dlite_check_break(pred_PC,
+                            is_write ? ACCESS_WRITE : ACCESS_READ,
+                            addr, sim_num_insn, sim_cycle))
+        dlite_main(regs.regs_PC, pred_PC, sim_cycle, &regs, mem);
+    }
+
+  /* need to enter DLite at least once per cycle */
+  if (!made_check)
+    {
+      if (dlite_check_break(/* no next PC */0,
+                            is_write ? ACCESS_WRITE : ACCESS_READ,
+                            addr, sim_num_insn, sim_cycle))
+        dlite_main(regs.regs_PC, /* no next PC */0, sim_cycle, &regs, mem);
+    }
+}
+
+
+/*
+ *  RUU_FETCH() - instruction fetch pipeline stage(s)
+ */
+
+/* initialize the instruction fetch pipeline stage */
+static void
+fetch_init(void)
+{
+  /* allocate the IFETCH -> DISPATCH instruction queue */
+  fetch_data =
+    (struct fetch_rec *)calloc(ruu_ifq_size, sizeof(struct fetch_rec));
+  if (!fetch_data)
+    fatal("out of virtual memory");
+
+  fetch_num = 0;
+  fetch_tail = fetch_head = 0;
+  IFQ_count = 0;
+  IFQ_fcount = 0;
+}
+
+/* dump contents of fetch stage registers and fetch queue */
+void
+fetch_dump(FILE *stream)                        /* output stream */
+{
+  int num, head;
+
+  if (!stream)
+    stream = stderr;
+
+  fprintf(stream, "** fetch stage state **\n");
+
+  fprintf(stream, "spec_mode: %s\n", spec_mode ? 
"t" : "f"); + myfprintf(stream, "pred_PC: 0x%08p, recover_PC: 0x%08p\n", + pred_PC, recover_PC); + myfprintf(stream, "fetch_regs_PC: 0x%08p, fetch_pred_PC: 0x%08p\n", + fetch_regs_PC, fetch_pred_PC); + fprintf(stream, "\n"); + + fprintf(stream, "** fetch queue contents **\n"); + fprintf(stream, "fetch_num: %d\n", fetch_num); + fprintf(stream, "fetch_head: %d, fetch_tail: %d\n", + fetch_head, fetch_tail); + + num = fetch_num; + head = fetch_head; + while (num) + { + fprintf(stream, "idx: %2d: inst: `", head); + md_print_insn(fetch_data[head].IR, fetch_data[head].regs_PC, stream); + fprintf(stream, "'\n"); + myfprintf(stream, " regs_PC: 0x%08p, pred_PC: 0x%08p\n", + fetch_data[head].regs_PC, fetch_data[head].pred_PC); + head = (head + 1) & (ruu_ifq_size - 1); + num--; + } +} + +static int last_inst_missed = FALSE; +static int last_inst_tmissed = FALSE; + +/* fetch up as many instruction as one branch prediction and one cache line + acess will support without overflowing the IFETCH -> DISPATCH QUEUE */ +static void +ruu_fetch(void) +{ + int i, lat, tlb_lat, done = FALSE; + md_inst_t inst; + int stack_recover_idx; + int branch_cnt; + + for (i=0, branch_cnt=0; + /* fetch up to as many instruction as the DISPATCH stage can decode */ + i < (ruu_decode_width * fetch_speed) + /* fetch until IFETCH -> DISPATCH queue fills */ + && fetch_num < ruu_ifq_size + /* and no IFETCH blocking condition encountered */ + && !done; + i++) + { + /* fetch an instruction at the next predicted fetch address */ + fetch_regs_PC = fetch_pred_PC; + + /* is this a bogus text address? (can happen on mis-spec path) */ + if (ld_text_base <= fetch_regs_PC + && fetch_regs_PC < (ld_text_base+ld_text_size) + && !(fetch_regs_PC & (sizeof(md_inst_t)-1))) + { + /* read instruction from memory */ + MD_FETCH_INST(inst, mem, fetch_regs_PC); + + /* address is within program text, read instruction from memory */ + lat = cache_il1_lat; + if (cache_il1) + { + /* access the I-cache */ + lat = + cache_access(cache_il1, Read, IACOMPRESS(fetch_regs_PC), + NULL, ISCOMPRESS(sizeof(md_inst_t)), sim_cycle, + NULL, NULL); + if (lat > cache_il1_lat) + last_inst_missed = TRUE; + } + + if (itlb) + { + /* access the I-TLB, NOTE: this code will initiate + speculative TLB misses */ + tlb_lat = + cache_access(itlb, Read, IACOMPRESS(fetch_regs_PC), + NULL, ISCOMPRESS(sizeof(md_inst_t)), sim_cycle, + NULL, NULL); + if (tlb_lat > 1) + last_inst_tmissed = TRUE; + + /* I-cache/I-TLB accesses occur in parallel */ + lat = MAX(tlb_lat, lat); + } + + /* I-cache/I-TLB miss? assumes I-cache hit >= I-TLB hit */ + if (lat != cache_il1_lat) + { + /* I-cache miss, block fetch until it is resolved */ + ruu_fetch_issue_delay += lat - 1; + break; + } + /* else, I-cache/I-TLB hit */ + } + else + { + /* fetch PC is bogus, send a NOP down the pipeline */ + inst = MD_NOP_INST; + } + + /* have a valid inst, here */ + + /* possibly use the BTB target */ + if (pred) + { + enum md_opcode op; + + /* pre-decode instruction, used for bpred stats recording */ + MD_SET_OPCODE(op, inst); + + /* get the next predicted fetch address; only use branch predictor + result for branches (assumes pre-decode bits); NOTE: returned + value may be 1 if bpred can only predict a direction */ + if (MD_OP_FLAGS(op) & F_CTRL) + fetch_pred_PC = + bpred_lookup(pred, + /* branch address */fetch_regs_PC, + /* target address *//* FIXME: not computed */0, + /* opcode */op, + /* call? */MD_IS_CALL(op), + /* return? 
*/MD_IS_RETURN(op), + /* updt */&(fetch_data[fetch_tail].dir_update), + /* RSB index */&stack_recover_idx); + else + fetch_pred_PC = 0; + + /* valid address returned from branch predictor? */ + if (!fetch_pred_PC) + { + /* no predicted taken target, attempt not taken target */ + fetch_pred_PC = fetch_regs_PC + sizeof(md_inst_t); + } + else + { + /* go with target, NOTE: discontinuous fetch, so terminate */ + branch_cnt++; + if (branch_cnt >= fetch_speed) + done = TRUE; + } + } + else + { + /* no predictor, just default to predict not taken, and + continue fetching instructions linearly */ + fetch_pred_PC = fetch_regs_PC + sizeof(md_inst_t); + } + + /* commit this instruction to the IFETCH -> DISPATCH queue */ + fetch_data[fetch_tail].IR = inst; + fetch_data[fetch_tail].regs_PC = fetch_regs_PC; + fetch_data[fetch_tail].pred_PC = fetch_pred_PC; + fetch_data[fetch_tail].stack_recover_idx = stack_recover_idx; + fetch_data[fetch_tail].ptrace_seq = ptrace_seq++; + + /* for pipe trace */ + ptrace_newinst(fetch_data[fetch_tail].ptrace_seq, + inst, fetch_data[fetch_tail].regs_PC, + 0); + ptrace_newstage(fetch_data[fetch_tail].ptrace_seq, + PST_IFETCH, + ((last_inst_missed ? PEV_CACHEMISS : 0) + | (last_inst_tmissed ? PEV_TLBMISS : 0))); + last_inst_missed = FALSE; + last_inst_tmissed = FALSE; + + /* adjust instruction fetch queue */ + fetch_tail = (fetch_tail + 1) & (ruu_ifq_size - 1); + fetch_num++; + } +} + +/* default machine state accessor, used by DLite */ +static char * /* err str, NULL for no err */ +simoo_mstate_obj(FILE *stream, /* output stream */ + char *cmd, /* optional command string */ + struct regs_t *regs, /* registers to access */ + struct mem_t *mem) /* memory space to access */ +{ + if (!cmd || !strcmp(cmd, "help")) + fprintf(stream, +"mstate commands:\n" +"\n" +" mstate help - show all machine-specific commands (this list)\n" +" mstate stats - dump all statistical variables\n" +" mstate res - dump current functional unit resource states\n" +" mstate ruu - dump contents of the register update unit\n" +" mstate lsq - dump contents of the load/store queue\n" +" mstate eventq - dump contents of event queue\n" +" mstate readyq - dump contents of ready instruction queue\n" +" mstate cv - dump contents of the register create vector\n" +" mstate rspec - dump contents of speculative regs\n" +" mstate mspec - dump contents of speculative memory\n" +" mstate fetch - dump contents of fetch stage registers and fetch queue\n" +"\n" + ); + else if (!strcmp(cmd, "stats")) + { + /* just dump intermediate stats */ + sim_print_stats(stream); + } + else if (!strcmp(cmd, "res")) + { + /* dump resource state */ + res_dump(fu_pool, stream); + } + else if (!strcmp(cmd, "ruu")) + { + /* dump RUU contents */ + ruu_dump(stream); + } + else if (!strcmp(cmd, "lsq")) + { + /* dump LSQ contents */ + lsq_dump(stream); + } + else if (!strcmp(cmd, "eventq")) + { + /* dump event queue contents */ + eventq_dump(stream); + } + else if (!strcmp(cmd, "readyq")) + { + /* dump event queue contents */ + readyq_dump(stream); + } + else if (!strcmp(cmd, "cv")) + { + /* dump event queue contents */ + cv_dump(stream); + } + else if (!strcmp(cmd, "rspec")) + { + /* dump event queue contents */ + rspec_dump(stream); + } + else if (!strcmp(cmd, "mspec")) + { + /* dump event queue contents */ + mspec_dump(stream); + } + else if (!strcmp(cmd, "fetch")) + { + /* dump event queue contents */ + fetch_dump(stream); + } + else + return "unknown mstate command"; + + /* no error */ + return NULL; +} + + +/* start simulation, program 
loaded, processor precise state initialized */ +void +sim_main(void) +{ + /* ignore any floating point exceptions, they may occur on mis-speculated + execution paths */ + signal(SIGFPE, SIG_IGN); + + /* set up program entry state */ + regs.regs_PC = ld_prog_entry; + regs.regs_NPC = regs.regs_PC + sizeof(md_inst_t); + + /* check for DLite debugger entry condition */ + if (dlite_check_break(regs.regs_PC, /* no access */0, /* addr */0, 0, 0)) + dlite_main(regs.regs_PC, regs.regs_PC + sizeof(md_inst_t), + sim_cycle, ®s, mem); + + /* fast forward simulator loop, performs functional simulation for + FASTFWD_COUNT insts, then turns on performance (timing) simulation */ + if (fastfwd_count > 0) + { + int icount; + md_inst_t inst; /* actual instruction bits */ + enum md_opcode op; /* decoded opcode enum */ + md_addr_t target_PC; /* actual next/target PC address */ + md_addr_t addr; /* effective address, if load/store */ + int is_write; /* store? */ + byte_t temp_byte = 0; /* temp variable for spec mem access */ + half_t temp_half = 0; /* " ditto " */ + word_t temp_word = 0; /* " ditto " */ +#ifdef HOST_HAS_QWORD + qword_t temp_qword = 0; /* " ditto " */ +#endif /* HOST_HAS_QWORD */ + enum md_fault_type fault; + + fprintf(stderr, "sim: ** fast forwarding %d insts **\n", fastfwd_count); + + for (icount=0; icount < fastfwd_count; icount++) + { + /* maintain $r0 semantics */ + regs.regs_R[MD_REG_ZERO] = 0; +#ifdef TARGET_ALPHA + regs.regs_F.d[MD_REG_ZERO] = 0.0; +#endif /* TARGET_ALPHA */ + + /* get the next instruction to execute */ + MD_FETCH_INST(inst, mem, regs.regs_PC); + + /* set default reference address */ + addr = 0; is_write = FALSE; + + /* set default fault - none */ + fault = md_fault_none; + + /* decode the instruction */ + MD_SET_OPCODE(op, inst); + + /* execute the instruction */ + switch (op) + { +#define DEFINST(OP,MSK,NAME,OPFORM,RES,FLAGS,O1,O2,I1,I2,I3) \ + case OP: \ + SYMCAT(OP,_IMPL); \ + break; +#define DEFLINK(OP,MSK,NAME,MASK,SHIFT) \ + case OP: \ + panic("attempted to execute a linking opcode"); +#define CONNECT(OP) +#undef DECLARE_FAULT +#define DECLARE_FAULT(FAULT) \ + { fault = (FAULT); break; } +#include "machine.def" + default: + panic("attempted to execute a bogus opcode"); + } + + if (fault != md_fault_none) + fatal("fault (%d) detected @ 0x%08p", fault, regs.regs_PC); + + /* update memory access stats */ + if (MD_OP_FLAGS(op) & F_MEM) + { + if (MD_OP_FLAGS(op) & F_STORE) + is_write = TRUE; + } + + /* check for DLite debugger entry condition */ + if (dlite_check_break(regs.regs_NPC, + is_write ? 
ACCESS_WRITE : ACCESS_READ, + addr, sim_num_insn, sim_num_insn)) + dlite_main(regs.regs_PC, regs.regs_NPC, sim_num_insn, ®s, mem); + + /* go to the next instruction */ + regs.regs_PC = regs.regs_NPC; + regs.regs_NPC += sizeof(md_inst_t); + } + } + + fprintf(stderr, "sim: ** starting performance simulation **\n"); + + /* set up timing simulation entry state */ + fetch_regs_PC = regs.regs_PC - sizeof(md_inst_t); + fetch_pred_PC = regs.regs_PC; + regs.regs_PC = regs.regs_PC - sizeof(md_inst_t); + + /* main simulator loop, NOTE: the pipe stages are traverse in reverse order + to eliminate this/next state synchronization and relaxation problems */ + for (;;) + { + /* RUU/LSQ sanity checks */ + if (RUU_num < LSQ_num) + panic("RUU_num < LSQ_num"); + if (((RUU_head + RUU_num) % RUU_size) != RUU_tail) + panic("RUU_head/RUU_tail wedged"); + if (((LSQ_head + LSQ_num) % LSQ_size) != LSQ_tail) + panic("LSQ_head/LSQ_tail wedged"); + + /* check if pipetracing is still active */ + ptrace_check_active(regs.regs_PC, sim_num_insn, sim_cycle); + + /* indicate new cycle in pipetrace */ + ptrace_newcycle(sim_cycle); + + /* commit entries from RUU/LSQ to architected register file */ + ruu_commit(); + + /* service function unit release events */ + ruu_release_fu(); + + /* ==> may have ready queue entries carried over from previous cycles */ + + /* service result completions, also readies dependent operations */ + /* ==> inserts operations into ready queue --> register deps resolved */ + ruu_writeback(); + + if (!bugcompat_mode) + { + /* try to locate memory operations that are ready to execute */ + /* ==> inserts operations into ready queue --> mem deps resolved */ + lsq_refresh(); + + /* issue operations ready to execute from a previous cycle */ + /* <== drains ready queue <-- ready operations commence execution */ + ruu_issue(); + } + + /* decode and dispatch new operations */ + /* ==> insert ops w/ no deps or all regs ready --> reg deps resolved */ + ruu_dispatch(); + + if (bugcompat_mode) + { + /* try to locate memory operations that are ready to execute */ + /* ==> inserts operations into ready queue --> mem deps resolved */ + lsq_refresh(); + + /* issue operations ready to execute from a previous cycle */ + /* <== drains ready queue <-- ready operations commence execution */ + ruu_issue(); + } + + /* call instruction fetch unit if it is not blocked */ + if (!ruu_fetch_issue_delay) + ruu_fetch(); + else + ruu_fetch_issue_delay--; + + /* update buffer occupancy stats */ + IFQ_count += fetch_num; + IFQ_fcount += ((fetch_num == ruu_ifq_size) ? 1 : 0); + RUU_count += RUU_num; + RUU_fcount += ((RUU_num == RUU_size) ? 1 : 0); + LSQ_count += LSQ_num; + LSQ_fcount += ((LSQ_num == LSQ_size) ? 1 : 0); + + /* go to next cycle */ + sim_cycle++; + + /* finish early? */ + if (max_insts && sim_num_insn >= max_insts) + return; + } +}
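
For reference, a minimal standalone sketch (not part of the patch above) of the circular-queue bookkeeping that sim_main()'s per-cycle sanity checks rely on: entries occupy slots head..(head+num-1) modulo the queue size, so (head + num) % size must always equal tail. The queue size and push/pop counts below are arbitrary illustration values; RUU_size, LSQ_size, and ruu_ifq_size play the Q_SIZE role in the simulator.

    #include <assert.h>
    #include <stdio.h>

    #define Q_SIZE 8   /* illustrative size only */

    int main(void)
    {
      int head = 0, tail = 0, num = 0, i;

      /* push five entries at the tail, then pop two from the head */
      for (i = 0; i < 5; i++) { tail = (tail + 1) % Q_SIZE; num++; }
      for (i = 0; i < 2; i++) { head = (head + 1) % Q_SIZE; num--; }

      /* same invariant as the "RUU_head/RUU_tail wedged" check in sim_main() */
      assert((head + num) % Q_SIZE == tail);
      printf("head=%d tail=%d num=%d\n", head, tail, num);
      return 0;
    }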