Add some minor additional sections to report.

Size sense amp back to 40
Update report with new performance characteristics.
2021-03-17 13:34:04 -07:00 · 2021-03-17 13:12:55 -07:00 · 2021-03-17 13:09:15 -07:00 · 2021-03-17 13:07:33 -07:00 · 2021-03-17 12:58:36 -07:00 · 2021-03-17 12:21:35 -07:00
19 changed files with 2640 additions and 0 deletions
--- a/final/SRAM.jelib
+++ b/final/SRAM.jelib
--- a/final/SRAM_bits.cir
+++ b/final/SRAM_bits.cir
@@ -0,0 +1,290 @@
 * This file contains all the subcircuits to be used in SRAM256.cir
 * Technology selection: each section below picks a model card, the supply
 * voltage ("supply") and the transistor channel length ("ll"). Only the
 * 50nm section is currently uncommented; the others are kept for reference.
 ***** long channel VTP = -0.9, VTN = 0.8 *****
 *.include modelcard/1um.pm
 *.param supply = 5
 *.param ll = 1u
 ****** 50nm models***
 .include ./modelcard/50nm.pm
 .param supply =1
 .param lambda=25nm
 .param ll='2*lambda'
 ****** 16nm low power models***
 *.include ./modelcard/PTM_LP/16nm.pm
 *.param supply =0.9
 *.param ll=16nm
 ****** 16nm high performance models***
 *.include ./modelcard/PTM_HP/16nm.pm
 *.param supply =0.7
 *.param ll=16nm
 * wire: lumped RC model of one wire segment between iot and iof.
 *   len, wid : geometry parameters that scale the R and C values below
 *   rt       : series resistance, rr*len*50/wid (grows with len, shrinks with wid)
 *   cf       : capacitance from iof to node 0, cc*len*wid*50/1e6
 * NOTE(review): the units of len/wid and the origin of the factor 50 are not
 * stated in this file - confirm before reusing the model.
 .subckt wire iot iof len=10 wid=10
 .param rr=0.4
 .param cc = '100e-15'
 rt iot iof 'rr*len*50/(wid)'
 cf iof  0  'cc*len*wid*50/1e6'
 .ends
 * wire_dual: a pair of identical, independent wire segments.
 *   lt -> rt : first segment  (instance Xt)
 *   lf -> rf : second segment (instance Xf)
 * Both segments receive the same len/wid geometry.
 .subckt wire_dual lt rt lf rf len=10 wid=10
 Xf   lf rf   wire   len='len'   wid='wid'
 Xt   lt rt   wire   len='len'   wid='wid'
 .ends
 * wire_precharge: two wire segments (lt->rt and lf->rf) with a PMOS pull-up
 * on the right-hand end of each.
 *   clk : gate of both pull-ups; as PMOS devices they conduct while clk is low
 *   ww  : sizing parameter; each pull-up is instantiated with ww*2
 .subckt wire_precharge lt rt lf rf clk len=10 wid=10 ww=10
 Xt lt rt wire len='len' wid='wid'
 Xf lf rf wire len='len' wid='wid'
 * pp pin order is drain, gate, source: rt/rf are pulled towards vdd
 Xpt rt clk vdd pp ww='ww*2'
 Xpf rf clk vdd pp ww='ww*2'
 .ends
 * nn: NMOS wrapper. Pins are drain, gate, source; the bulk is tied to node 0.
 * Channel length is the global "ll"; width is ww*ll, so ww is the W/L ratio.
 .subckt nn d g s  ww=100
 mnfet d g s 0 nmos L=ll w='ww*ll'
 .ends
 * pp: PMOS wrapper. Pins are drain, gate, source; the bulk is tied to vdd.
 * Channel length is the global "ll"; width is ww*ll, so ww is the W/L ratio.
 .subckt pp d g s   ww=100
 mpfet d g s vdd pmos L=ll w='ww*ll'
 .ends
 * inv: static CMOS inverter, out = NOT inn.
 *   size : total width budget, split between the two devices
 *   beta : PMOS-to-NMOS width ratio
 *          NMOS gets size/(beta+1), PMOS gets size*beta/(beta+1)
 .subckt inv out inn size=30 beta=2
 * pull-down
 XNN   out inn gnd   nn   ww='size/(beta+1)'
 * pull-up
 XPP   out inn vdd   pp   ww='size*beta/(beta+1)'
 .ends
 * nnd2: two-input static CMOS NAND, out = NOT (in1 AND in0).
 *   size, beta : sizing parameters; each series NMOS gets 2*size/(beta+2),
 *                each parallel PMOS gets beta*size/(beta+2)
 .subckt nnd2 out in1 in0 size=30 beta=2
 * series pull-down stack: out - nng - node 0 (in0 device is next to out)
 Xan0   out in0 nng   nn   ww='2*size/(beta+2)'
 Xan1   nng in1 0     nn   ww='2*size/(beta+2)'
 * parallel pull-ups
 Xap0   out in0 vdd   pp   ww='beta*size/(beta+2)'
 Xap1   out in1 vdd   pp   ww='beta*size/(beta+2)'
 .ends nnd2
 * nor2: two-input static CMOS NOR, out = NOT (in1 OR in0).
 *   size, beta : sizing parameters; each parallel NMOS gets size/(2*beta+1),
 *                each series PMOS gets 2*beta*size/(2*beta+1)
 .subckt nor2 out in1 in0 size=30 beta=2
 * parallel pull-downs
 Xan0   out in0 0     nn   ww='1*size/(2*beta+1)'
 Xan1   out in1 0     nn   ww='1*size/(2*beta+1)'
 * series pull-up stack: vdd - ppi - out (in1 device is next to out)
 Xap0   ppi in0 vdd   pp   ww='2*beta*size/(2*beta+1)'
 Xap1   out in1 ppi   pp   ww='2*beta*size/(2*beta+1)'
 .ends nor2
 * latch: pass-gate latch with an inverting storage loop.
 *   inn      : data input
 *   clk, clb : complementary pass-gate controls (NMOS gate = clk, PMOS gate = clb)
 *   out      : inverse of the internal node ggg
 *   size     : size of the forward inverter Xi; the output inverter is 3*size
 * NOTE: the "beta" parameter is declared but not referenced in the body.
 .subckt latch out inn clk clb size=15 beta=2
 * pass gate between inn and the storage node qin
 Xn inn clk qin nn ww='5'
 Xp inn clb qin pp ww='10'
 * fixed-size feedback inverter: drives qin from ggg
 Xfp qin ggg vdd pp ww='5'
 Xfn qin ggg gnd nn ww='5'
 * forward inverter (ggg = NOT qin) and output inverter (out = NOT ggg)
 Xi ggg qin     inv size='size'
 Xo out ggg     inv size='3*size'
 .ends latch
 * flop: two latches in series, clocked on opposite phases.
 *   ddd : data input, qqq : output, clk : clock
 * Xflip receives (clb, clk) on its (clk, clb) pins, so its NMOS pass device
 * is gated by the inverted clock; Xflop receives (clk, clb) and is gated by
 * the true clock.
 .subckt flop qqq ddd clk
 * local inverted clock
 Xinve clb clk inv
 Xflip int ddd clb clk latch
 Xflop qqq int clk clb latch
 .ends flop
 * reg8: eight flops sharing one clock; bit k stores ink and drives otk.
 .subckt reg8 ot7 ot6 ot5 ot4 ot3 ot2 ot1 ot0 in7 in6 in5 in4 in3 in2 in1 in0 clk
 x0   ot0 in0 clk   flop
 x1   ot1 in1 clk   flop
 x2   ot2 in2 clk   flop
 x3   ot3 in3 clk   flop
 x4   ot4 in4 clk   flop
 x5   ot5 in5 clk   flop
 x6   ot6 in6 clk   flop
 x7   ot7 in7 clk   flop
 .ends reg8
 * dat1: single pulse-train stimulus with an inverting output buffer.
 *   period, total : the pulse repeats every total*period
 *   duty          : pulse width is duty*period-10ps
 *   start         : delay before the first pulse
 *   sz            : size of the output inverter
 * V0 rests at 'supply' and pulses to 0 (10p rise/fall); x7 inverts it, so
 * "out" has the opposite polarity to the internal node j0.
 .subckt dat1 out period=1ns start=1ns sz=50 total=5 duty=3
 V0 j0  0  PULSE('supply' 0 'start' 10p 10p 'duty*period-10ps' 'total*period')
 x7 out j0 inv size='sz'
 .ends dat1
 * dat8: generates a different data stream on each of eight channels,
 * with buffered (inverting) outputs.
 *   per   : base period; V0-V3 and V4-V7 repeat every 1,2,3,4 * per
 *   start : delay before the first pulse on every channel
 *   size  : forwarded to buf8 as the inverter size
 * V0-V3 rest at 0 and pulse to 'supply'; V4-V7 rest at 'supply' and pulse
 * to 0. buf8 is built from inverters, so o<k> is the inverse of j<k>.
 .subckt dat8 o7 o6 o5 o4 o3 o2 o1 o0 per=1ns start=1ns size=50
 V0 j0  0  PULSE(0 'supply' 'start' 10p 10p '0.5*per-10ps' 'per')
 V1 j1  0  PULSE(0 'supply' 'start' 10p 10p '0.5*per-10ps' '2*per')
 V2 j2  0  PULSE(0 'supply' 'start' 10p 10p '0.5*per-10ps' '3*per')
 V3 j3  0  PULSE(0 'supply' 'start' 10p 10p '0.5*per-10ps' '4*per')
 V4 j4  0  PULSE('supply' 0 'start' 10p 10p '0.5*per-10ps' '1*per')
 V5 j5  0  PULSE('supply' 0 'start' 10p 10p '1*per-10ps' '2*per')
 V6 j6  0  PULSE('supply' 0 'start' 10p 10p '1.5*per-10ps' '3*per')
 V7 j7  0  PULSE('supply' 0 'start' 10p 10p '2*per-10ps' '4*per')
 xb o7 o6 o5 o4 o3 o2 o1 o0 j7 j6 j5 j4 j3 j2 j1 j0 buf8 sz='size'
 .ends dat8
 * buf8: eight independent inverters of size sz; otk = NOT ink.
 * Despite the name, each channel is a single inverting stage.
 .subckt buf8 ot7 ot6 ot5 ot4 ot3 ot2 ot1 ot0 in7 in6 in5 in4 in3 in2 in1 in0 sz=100
 x0   ot0 in0   inv   size='sz'
 x1   ot1 in1   inv   size='sz'
 x2   ot2 in2   inv   size='sz'
 x3   ot3 in3   inv   size='sz'
 x4   ot4 in4   inv   size='sz'
 x5   ot5 in5   inv   size='sz'
 x6   ot6 in6   inv   size='sz'
 x7   ot7 in7   inv   size='sz'
 .ends buf8
 * nnd3: three-input static CMOS NAND, out = NOT (in2 AND in1 AND in0).
 *   size, beta : sizing parameters; each series NMOS gets 3*size/(beta+3),
 *                each parallel PMOS gets beta*size/(beta+3)
 .subckt nnd3 out in2 in1 in0 size=20 beta=2
 * series pull-down stack: out - nn0 - nn1 - gnd (in0 device is next to out)
 Xn0   out in0 nn0   nn   ww='3*size/(beta+3)'
 Xn1   nn0 in1 nn1   nn   ww='3*size/(beta+3)'
 Xn2   nn1 in2 gnd   nn   ww='3*size/(beta+3)'
 * parallel pull-ups
 Xp0   out in0 vdd   pp   ww='beta*size/(beta+3)'
 Xp1   out in1 vdd   pp   ww='beta*size/(beta+3)'
 Xp2   out in2 vdd   pp   ww='beta*size/(beta+3)'
 .ends
 * senseAmp: two cross-coupled three-input NAND gates.
 *   in1, in0 : the two inputs being compared
 *   eva      : evaluate enable, an input to both NANDs (both outputs are
 *              forced high while eva is low)
 *   ot1, ot0 : outputs; each NAND takes the other NAND's output as an input
 *   size     : forwarded to both nnd3 gates
 .subckt senseAmp ot1 ot0 in1 in0 eva size=40
 Xn0 ot0 in0 ot1 eva nnd3 size ='size'
 Xn1 ot1 in1 ot0 eva nnd3 size ='size'
 .ends senseAmp
 * iSenseAmp: latch-style sense amplifier with the same port list as senseAmp.
 *   in1, in0 : inputs, gating the NMOS devices below the cross-coupled pair
 *   eva      : evaluate enable
 *   ot1, ot0 : outputs
 *   size     : every transistor in the block is instantiated with ww=size
 .subckt iSenseAmp ot1 ot0 in1 in0 eva size=40
 * Xp1/Xp3 pull both outputs to vdd while eva is low;
 * Xp2/Xp4 are the cross-coupled pull-ups
 Xp1 ot1 eva vdd pp ww='size'
 Xp2 ot1 ot0 vdd pp ww='size'
 Xp3 ot0 eva vdd pp ww='size'
 Xp4 ot0 ot1 vdd pp ww='size'
 * cross-coupled pull-downs
 Xn1 ot1 ot0 nn1 nn ww='size'
 Xn2 ot0 ot1 nn0 nn ww='size'
 * input devices connect nn1/nn0 to the shared node pd
 Xn3 nn1 in1 pd nn ww='size'
 Xn4 nn0 in0 pd nn ww='size'
 * tail device: pd is connected to gnd only while eva is high
 Xn5 pd eva gnd nn ww='size'
 .ends 
 * precharge: control logic for one bitline precharge device.
 *   charge = clk AND NOT (rwtb AND diib)
 * so charge is low whenever clk is low, and also when rwtb and diib are
 * both high. All gates use their default sizes.
 .subckt precharge charge rwtb clk diib
 * rdi = NAND(rwtb, diib)
 Xrdi rdi rwtb diib nnd2
 * chargeb = NAND(clk, rdi)
 Xnn chargeb clk rdi nnd2
 Xout charge chargeb inv
 .ends precharge
 * write1: write driver for one bitline pair.
 *   btt, bff : bitlines
 *   dii      : data in
 *   rwt      : the write path is enabled only while rwt is low
 *   clk      : clock
 * dorw = NOR(clkb, rwt) is high only while clk is high and rwt is low. While
 * dorw is high, btt is pulled down through Xwt/Xdt if dii is high, otherwise
 * bff is pulled down through Xwf/Xdf.
 .subckt write1 btt bff dii rwt clk
 * TODO: sizes
 Xclk clkb clk inv size='25'
 Xdii diib dii inv size='25'
 Xrwt rwtb rwt inv size='25'
 Xrwn dorw clkb rwt nor2 size='50'
 * data-selected pull-downs next to gnd, write-enable devices next to the bitlines
 Xdt pdt dii gnd nn  ww='100'
 Xdf pdf diib gnd nn ww='100'
 Xwt btt dorw pdt nn ww='100'
 Xwf bff dorw pdf nn ww='100'
 * NOTE: pcet/pcef are generated here but drive nothing inside this subckt;
 * the precharge devices below are gated by clk directly.
 Xpcet pcet rwtb clk diib precharge
 Xpcef pcef rwtb clk dii precharge
 * bitline precharge: both PMOS devices conduct while clk is low
 Xpct btt clk vdd pp ww='100'
 Xpcf bff clk vdd pp ww='100'
 .ends write1
 * iWrite1: write driver for one bitline pair, with gated precharge.
 *   btt, bff : bitlines
 *   dii      : data in
 *   rwt      : the write path is enabled only while rwt is low
 *   en       : declared for the caller; not referenced inside this subckt
 *   clk      : clock
 * dorw = NOR(clkb, rwt) is high only while clk is high and rwt is low. While
 * dorw is high, btt is pulled down through Xwt/Xdt if dii is high, otherwise
 * bff is pulled down through Xwf/Xdf.
 .subckt iWrite1 btt bff dii rwt en clk
 * TODO: sizes
 Xclk clkb clk inv size='40'
 Xdii diib dii inv size='40'
 Xrwt rwtb rwt inv size='40'
 Xrwn dorw clkb rwt nor2 size='110'
 * data-selected pull-downs next to gnd, write-enable devices next to the bitlines
 Xdt pdt dii gnd nn  ww='200'
 Xdf pdf diib gnd nn ww='200'
 Xwt btt dorw pdt nn ww='200'
 Xwf bff dorw pdf nn ww='200'
 * pcet = clk AND NOT (rwtb AND diib); pcef = clk AND NOT (rwtb AND dii)
 Xpcet pcet rwtb clk diib precharge
 Xpcef pcef rwtb clk dii precharge
 * each precharge PMOS conducts while its control (pcet/pcef) is low
 Xpct btt pcet vdd pp ww='100'
 Xpcf bff pcef vdd pp ww='100'
 * the closing name now matches this subckt (it previously read "write1")
 .ends iWrite1
 * read1: single-column read path using the NAND-based senseAmp plus an
 * output keeper.
 *   btt, bff : bitlines
 *   dot      : data out
 *   rwt, clk : the sense amp's eva input (triggerb) is high only while both
 *              are high
 .subckt read1 btt bff dot rwt clk
 * triggerb = rwt AND clk
 Xnd trigger rwt clk nnd2
 Xinv triggerb trigger inv
 Xamp set reset btt bff triggerb senseAmp size='40'
 * set2 follows set (two inversions); reset1 is the inverse of reset
 Xinv1 set1 set inv
 Xinv2 set2 set1 inv
 Xinv3 reset1 reset inv
 * Old setup:
 * Xp nn1 set2 vdd pp
 * Xn nn1 reset1 gnd nn
 * Xh1 dot nn1 inv
 * Xh2 nn1 dot inv
 * Xp pulls dot high while set2 is low; Xn pulls dot low while reset1 is high
 Xp dot set2 vdd pp
 Xn dot reset1 gnd nn
 * cross-coupled inverter pair that holds dot between reads
 Xh1 dot nn1 inv
 Xh2 nn1 dot inv
 .ends read1
 * readSub: per-column read front end using the NAND-based senseAmp.
 *   btt, bff     : bitlines
 *   set, rst     : sense amplifier outputs (ot1, ot0)
 *   rwt, clk, en : the sense amp's eva input (triggerb) is high only while
 *                  all three are high
 .subckt readSub btt bff set rst rwt clk en
 * trigger = NAND(rwt, en, clk); triggerb is its inverse
 Xnd trigger rwt en clk nnd3
 Xinv triggerb trigger inv size='40'
 Xamp set rst btt bff triggerb senseAmp size='200'
 * the closing name now matches this subckt (it previously read "read1")
 .ends readSub
 * iReadSub: per-column read front end using the latch-style iSenseAmp.
 *   btt, bff     : bitlines
 *   set, rst     : sense amplifier outputs (ot1, ot0)
 *   rwt, clk, en : the sense amp's eva input (triggerb) is high only while
 *                  all three are high
 .subckt iReadSub btt bff set rst rwt clk en
 * trigger = NAND(rwt, en, clk); triggerb is its inverse
 Xnd trigger rwt en clk nnd3
 Xinv triggerb trigger inv size='40'
 Xamp set rst btt bff triggerb iSenseAmp size='40'
 * the closing name now matches this subckt (it previously read "read1")
 .ends iReadSub
 * readcollect: merges the set/rst outputs of four read blocks into one
 * data output.
 *   set0..set3, rst0..rst3 : per-column sense amplifier outputs
 *   dot                    : data out, held by a cross-coupled inverter pair
 * Each pair of set (or rst) lines is combined with a NAND, so a pair signal
 * goes high when either line of the pair goes low.
 .subckt readcollect dot set0 rst0 set1 rst1 set2 rst2 set3 rst3
 Xset01 set01 set0 set1 nnd2
 Xset23 set23 set2 set3 nnd2
 Xrst01 rst01 rst0 rst1 nnd2
 Xrst23 rst23 rst2 rst3 nnd2
 Xnset01 nset01 set01 inv
 Xnset23 nset23 set23 inv
 * nn1 is pulled high while nset01 or nset23 is low,
 * and pulled low while rst01 or rst23 is high
 Xp01 nn1 nset01 vdd pp
 Xp23 nn1 nset23 vdd pp
 Xn01 nn1 rst01 gnd nn
 Xn23 nn1 rst23 gnd nn
 * keeper: dot = NOT nn1, fed back onto nn1
 Xh1 dot nn1 inv size='60'
 Xh2 nn1 dot inv size='60'
 .ends readCollect
 * Decoder placeholders: the five subcircuits below have empty bodies and
 * contain no devices. decModel is the decoder model that actually holds gates.
 .subckt decode2 o11 o10 o01 o00 di1 di0 df1 df0
 .ends
 .subckt decode_nor16
 .ends
 .subckt decode_nnd16
 .ends
 .subckt decode_16and1
 .ends decode_16and1
 * NOTE(review): the port list below looks abbreviated, and "0012" and "d4"
 * do not follow the o<nnn>/dt<n> pattern of their neighbours - confirm
 * before filling in this block.
 .subckt dmux256 o255 o223 0012 o001 dt7 dt6 dt5 d4 dt3 dt1 dt0
 .ends dmux256
 * decModel: delay/loading model of one path through the address decoder.
 *   din    : address input
 *   clk    : clock, combined with the decoded signal in the last NAND
 *   choose : output
 *   size   : base gate size; the load gates use multiples of it
 * The signal path is Xnar -> Xnrr -> Xna2r -> Xi2 -> Xnac -> Xi3. The gates
 * driving ww1, ww2 and ww3 have outputs connected to nothing else in this
 * subckt; they only add load to din, nn2 and nn3.
 .subckt decModel choose din clk size='20'
 * nn1 is not used by any other element (see the note below)
 Xi1 nn1 din inv size='size'
 * Here: stopped using i1 and just used din
 Xnal ww1 gnd din nnd2 size='size*4'
 Xnar nn2 vdd din nnd2 size='size'
 Xnrl ww2 nn2 vdd nor2 size='size*3'
 Xnrr nn3 nn2 gnd nor2 size='size'
 Xna2l ww3 gnd nn3 nnd2 size='size*15'
 Xna2r nn4 vdd nn3 nnd2 size='size'
 Xi2 nn5 nn4 inv size='size'
 * choose = nn5 AND clk
 Xnac nn6 nn5 clk nnd2 size='size'
 Xi3 choose nn6 inv size='size'
 .ends
 * mem1: six-transistor SRAM cell.
 *   bt, bf : bitlines
 *   ope    : gate of both access transistors
 * Internal storage nodes are tt and ff; every device uses ww=5.
 .subckt mem1 bt bf ope
 * inverter driving tt from ff
 Xpt tt ff vdd pp ww='5'
 Xnt tt ff gnd nn ww='5'
 * inverter driving ff from tt
 Xpf ff tt vdd pp ww='5'
 Xnf ff tt gnd nn ww='5'
 * access devices: bt <-> tt and bf <-> ff while ope is high
 Xat bt ope tt nn ww='5'
 Xaf bf ope ff nn ww='5'
 .ends
--- a/final/amp.png
+++ b/final/amp.png
--- a/final/decoder.png
+++ b/final/decoder.png
--- a/final/layout_arrayed.png
+++ b/final/layout_arrayed.png
--- a/final/layout_arrayed_closeup.png
+++ b/final/layout_arrayed_closeup.png
--- a/final/layout_single.png
+++ b/final/layout_single.png
--- a/final/read_select.png
+++ b/final/read_select.png
--- a/final/report.tex
+++ b/final/report.tex
@@ -0,0 +1,387 @@
 \documentclass{article}
 \usepackage[margin=1in]{geometry}
 \usepackage{graphicx}
 \usepackage{amsmath}
 \usepackage{hyperref}
 \usepackage{xcolor}
 \usepackage{caption}
 \usepackage{subcaption}
 \definecolor{link}{HTML}{006275}
 \hypersetup{
    colorlinks,
    citecolor=black,
    filecolor=black,
    linkcolor=link,
    urlcolor=black
 }
 \title{Final Project Report}
 \author{Danila Fedorin}
 \begin{document}
 \maketitle
 \tableofcontents
 \pagebreak
 \section{General Design and Considerations}
 The goal of this assignment was to create a 256-byte SRAM memory unit. In order
 to minimize wire delays, I chose to split each bit into \textbf{4 columns of 64 SRAM cells
 each}. This was motivated by the following factors:
 \begin{itemize}
    \item \emph{Larger} columns were eliminated due to the high cost of interconnect.
        Even large write blocks were not able to charge the ``far ends'' of the wire
        at shorter clock cycles. Increasing wire width did not help; although resistance
        decreased, the capacitance increased, leading to small net gains. Thus, I made
        the decision to shrink the columns as much as possible. However...
    \item \emph{Smaller} columns became a routing challenge. Even with a 4-column split,
        to properly connect each cell of the SRAM column, the SRAM cells themselves need
        to accommodate an additional three \textsc{Wl} lines. Due to the pitch requirements
        on metals three and four, this is the upper limit (for reasonably sized cells).
        Alternatives included splitting the decoder into pieces, but for large numbers
        of columns, this meant that the decoder signal traveled through significant amounts
        of wire, and was thus slower.
 \end{itemize}
 For each of the 4 64-bit columns, I attached separate read and write blocks. However,
 my placement of the write block was unorthodox. I observed that, although the write block
 is perfectly capable of quickly manipulating the bitlines close to it, the changes
 to the wires take too long to propagate through to the end. I addressed this with two separate
 changes:
 \begin{itemize}
    \item I added \textbf{additional precharge transistors} along the column, a total of 4.
        Each was sized at $10\lambda$, much like the SRAM transistors themselves. When the clock
        was low, these PMOS transistors became transparent, and helped precharge the bitlines faster.
         Doing so helped avoid hysteresis. However, this did not help with writing during high clock,
        so...
    \item I also \textbf{placed the write block in the middle of the column}. This increased the distance
        between my furthest SRAM cell and the read block (since the write block now contributed to wire
        length). However, this made it significantly easier to drive the entire length of the wire,
        which was my main bottleneck. This was because the maximum distance from the write
        block to any cell in the column was halved. Since my read circuit continued to work in this
        configuration, I did not place it in the middle of the column, as that would needlessly
        increase the length of the wires. 
 \end{itemize}
 %
 This led to the configuration shown in Figure \ref{fig:top-design}. To simulate this design, I \textbf{tested three configurations}:
 \begin{enumerate}
    \item A memory cell at the very top of my column, which is the furthest spot from both the read and write.
        This is the simulation in the figure.
    \item A memory cell in the middle of my column, in the same place as the write block. Since the write block
        has brief ``false starts'', this test was to ensure that the read block can still pick up data
        despite the write block's misfires.
    \item A memory cell at the very bottom of my column. This area has additional capacitance from the read block;
         it thus takes longer to charge up, and tends to be the first spot where writes fail.
 %
 \end{enumerate}
 I also split the wire into 4 equally-sized fragments, each with resistance $\frac{R}{4}$ and
 capacitance $\frac{C}{4}$. Between each fragment, I added the aforementioned $10\lambda$ precharge
 transistors, as well as 16 always-off $5\lambda$ transistors, which simulated the remaining memory cells.
 I also placed \textsc{Din}, \textsc{Ad0}, and \textsc{Rwt} behind the default-sized flip-flops
 attached to the clock to simulate something like a pipeline stage. My overall design is shown
 in Figure \ref{fig:top-design-sim}.
 \pagebreak
 \begin{figure}[h]
    \centering
    \includegraphics[width=\linewidth]{toplevel_design.png}
    \caption{Top-level design for a single bit.}
    \label{fig:top-design}
 \end{figure}
 My SRAM cell ended up being $30\lambda$ units tall when arrayed. With
 a total of 64 cells in a single column, this led to a wire length of $1920\lambda$.
 However, since my write block was now included in the column, I added another $300\lambda$
 of length to this number, to a total of roughly $2200\lambda$.
 \begin{figure}
    \centering
    \includegraphics[width=0.6\linewidth]{toplevel.png}
    \caption{Architecture of top-level simulation.}
    \label{fig:top-design-sim}
 \end{figure}
 \pagebreak
 \section{Performance Results}
 I was able to clock my design at \textbf{$1.3\textit{ns}$}.
 %
 I realize that this isn't as fast as everyone else, but I ask that you take
 into consideration the fact that \textbf{I was working with the old wire model}
 until about an hour before the final due date (since I didn't know the wire model changed).
 If I knew earlier, I'd have more time to optimize my design for the timings associated
 with the new model.
 %
 Two factors lead to this upper limit.
 %
 \begin{itemize}
    \item \textit{Write capacitance} makes it increasingly difficult to overwrite the value
        in the cell. Clocking my design any faster leads my cell to \textit{almost} flip, but not resolve correctly.
        I have found no way to work around these limits once my wire was properly sized, and my
        write block was placed in the middle of the column.
    \item \textit{Flop, decoder, and read delays} are the major limitation when both the inputs
        and the outputs of the circuit are connected to flip flops. The most significant
        instance of this issue is my write block: both \textsc{Din} and \textsc{Rwt} arrive
        around $300\textit{ps}$ into the cycle. This means two things: a) if the previous
        operation was ``read'', then the block does not start writing until halfway into
        the positive phase of the clock and b) if the data being written is different
        from the data in the previous cycle, for half the time, the write block will write
        the old data (until the flip flop switches).
 \end{itemize}
 \section{Components}
 \subsection{Decoder}
 \subsubsection{In My Own Words}
 The decoder in this design is \textit{almost} the exact same one as we were given in lecture.
 It computes all combinations of two consecutive bits using a \textsc{Nand} gate; for
 each combination, there are 4 adjacent two-bit combinations,
 leading to 4 \textsc{Nor} gates connected to each \textsc{Nand}. There are now
 16 combinations of 4 adjacent bits; each combination of the lower 4 bits
 needs to be compared with each of the 16 combinations of the upper 4 bits,
 leading to 16 \textsc{Nand} gates connected to each \textsc{Nor}. This
 results in 256 unique \textsc{Wl} wires. Finally, these need to be attached
 to the clock, so that cells aren't open randomly. This is done using an \textsc{And}
 gate (a \textsc{Nand} followed by an inverter).
 I adjusted this design to account for the address signals that need to be fed
 into the write blocks. Which of the read/write columns is triggered
 depends on the upper two bits of the address (since we have 4 columns). I modeled
 this by increasing the fanout on the first \textsc{Nand} gate from 1 to 4.
 This is pessimistic; each 2-bit combination would only feed into one write block,
 whose trigger gate is normally sized.
 \begin{figure}[h]
    \centering
    \includegraphics[width=\linewidth]{decoder.png}
    \caption{Decoder model used in project.}
    \label{fig:decoder}
 \end{figure}
 % TODO: Domino logic
 % TODO: More inverters?
 \pagebreak
 \subsection{Read Block}
 \subsubsection{In My Own Words}
 The read block uses a \emph{sense amplifier} to detect small changes on the bitlines,
 which it then translates into a zero-or-one output. The changes in the wires are below
 the threshold of what could be considered digital logic; all the sense amplifier
 designs I've come across rely on metastability, a state in which even tiny fluctuations
 can significantly alter the outcome\footnote{My favorite analogy is a pencil balanced on its tip.
 Technically, it's stable; however, even a small air current -- one you can't feel -- can knock it over.}.
 The \textsc{Trigger} signal, which depends on the clock and \textsc{Rwt}, puts the amplifier
 into a metastable state. From there, the connected bitlines cause it to resolve one way
 or another. Finally, if one of the wires resolves, a value is written into the keeper circuit
 at the end, which ensures that the value that was read continues to be expressed until
 the next read operation.
 \subsubsection{Details}
 For my read block, I used a different sense amplifier design. The design based
 on the two \textsc{Nand3} gates was easy to understand and build, but was less
 sensitive, and tended to behave strangely under pressure. This led to difficulties
 with debugging (the output would, for instance, flip completely at certain
 wire widths), and was seemingly random. Instead, I used
 an \textbf{improved latch-based sense amplifier design} from \cite{210039}. % TODO: cite
 The design I used is shown in Figure \ref{fig:latch-amp}.
 I left it sized at $40\lambda$, since larger amplifiers seem to take longer
 to trigger and exit metastability.
 The read block is not a particular bottleneck in this design. The main concern
 was to handle the \textbf{``false start'' activation of the write block}. Because the \textsc{Rwt}
 input is behind a latch, it takes nearly $300\textit{ps}$ to pull up or down after
 the initial clock. Thus, if a write occurred during a previous cycle, the write block will
 activate for a short period of time before the read block does. The memory cell
 will overpower this initial misfire\footnote{According to my additional simulations, this is true even when the memory cell is close to the write block.}, but in this case, both \textsc{Bt} and \textsc{Bf}
 will be below \textsc{Vdd}. The ``improved sense amplifier'' seems to handle this
 case better than the one based on two \textsc{Nand} gates. 
 The latch-induced delay in \textsc{Rwt} also causes a strange \textsc{Trigger} signal during write operations
 directly following read operations. The trigger signal initially activates, putting the sense
 amplifier into metastability; however, the correct \textsc{Rwt} value arrives before the
 sense amp's outputs are compromised. If this became a problem, I would add an additional,
 delayed clock signal \emph{after} the sense amplifier, and use an \textsc{And} gate
 to delay the read block's output.
 \begin{figure}[h]
 \centering
 \begin{subfigure}{.5\textwidth}
  \centering
  \includegraphics[width=.7\linewidth]{amp.png}
  \caption{The latch-based sense amplifier from \cite{210039}.}
  \label{fig:latch-amp}
 \end{subfigure}%
 \begin{subfigure}{.5\textwidth}
  \centering
  \includegraphics[width=.8\linewidth]{read_select.png}
  \caption{The block gathering signals from the four columns.}
  \label{fig:read-collect}
 \end{subfigure}
 \caption{Read block schematics}
 \label{fig:read}
 \end{figure}
 \pagebreak
 \subsection{Write Block}
 \subsubsection{In My Own Words}
 The write block converts a ``data in'', or \textsc{Din}, signal
 into a one-hot representation. It does so by pulling one of the bitlines high, and the other
 low. Once the memory cell connects to the bitlines, it takes on the charge provided by the
 write block, and is therefore overwritten. In my design, two NMOS transistors for each bitline
 are used to pull down; one of the transistors is triggered by the \textsc{Din} signal (which wire
 we pull down depends on the signal itself!), and the other by a combination of the clock
 and \textsc{Rwt} (we don't want to touch the wires when reading!).
 \subsubsection{Details}
 My write block was not significantly different from the original design. Under the assumption
 that data arrives first, I placed the transistors attached to \textsc{Din} and $\overline{\textsc{Din}}$
 close to \textsc{Gnd}, each followed by a transistor attached to the ``write'' signal.
 I also configured the write block to only precharge when the clock is low.
 I experimented with making the write block pull wires up when writing (during high clock). However,
 I did not find this to be of significant use. Since the wires are initially precharged,
 there is no more time spent on charging them up; furthermore, the memory cell being written to
 does not have enough ``strength'' to pull the wire down enough.
 A curiosity of this design is that reads didn't seem to work with long clock periods. When enough
 time is spent reading the wires, the memory cell in question is able to gradually exhaust the amount
 of charge on one of these wires. Since the original, \textsc{Nand}-based sense amplifier required
 all inputs to be high to properly function, this led to it eventually ``flipping'' and producing
 the wrong output. This was only an issue above $5\textit{ns}$, and only with the original sense amplifier
 design, though. I think that both Reed and
 Graham experienced this occurrence -- they seemed to post very similar waveforms
 to the community Discord group chat.
 One thing to note about the write block is that its \textbf{clock input is deliberately delayed} compared
 to the ``actual'' clock. This is because of an issue with \textsc{Din}. Since this
 input is behind a latch, it takes around $300\textit{ps}$ to arrive after the rising clock
 edge. If the previous value of \textsc{Din} was different than its current one, the write
 block will start writing the wrong value. This will typically mean that the block cannot properly
 perform the write. The delay on the clock input serves to mitigate this issue, by giving more
 time for \textsc{Din} to settle before starting to write. To compensate for this delay, I sized
 the write block's pull down transistors quite large ($100\lambda$), so that they can pull
 the wire down, even starting $300\textit{ps}$ into the cycle. This is why the ``clock'' input
 in my diagrams is colored black, unlike every other clocked component. The delay is achieved
 by 6 sequenced inverters, two of which are sized 10x larger than the rest.
 \begin{figure}[h]
    \centering
    \includegraphics[width=0.65\linewidth]{write.png}
    \caption{Write block used in this project.}
    \label{fig:write}
 \end{figure}
 \pagebreak
 \subsection{Memory Cell}
 \subsubsection{In My Own Words}
 The memory cell consists of two cross coupled inverters whose outputs
 are disconnected from the bitlines by two additional nMOS transistors. When disconnected,
 this cell reliably holds its value; one inverter's output turns off the other, and symmetrically,
 the ``off'' output of that other inverter keeps the first one on. However, this cell is pretty
 small; all of its transistors have size $5\lambda$, which is the smallest size that can be properly
 connected with a standard $2\lambda\times2\lambda$ via. Thus, when the ``write line'' (signal
 connected to the gates of the two outside transistors) is asserted, the charge from the
 surrounding bitlines can easily overpower the cell, causing it to switch to a different value.
 \subsubsection{Details}
 There are few notable things about my cell design. Even though it was recommended that we only
 use metals one and two for the internal wiring, I went up to metal three for cross-connecting
 the two internal inverters. This was the only way I found to keep the height of the cell to 
 a minimum. This limited my routing options somewhat; to compensate, I also used metal three for
 the vertical wires, \textsc{Bt} and \textsc{Bf}. This allowed me to use metal four for the
 \textsc{Wl} (access) signal. Since this was the only use of metal four, I had enough free
 room to route three additional \textsc{Wl} signals to the remaining three columns. 
 My general principle for designing the layout was that, in an 12-bit, 4-column design, \textbf{a single
 unit of height costs as much as 64 units of width}. Thus, I was fairly liberal with my layout's
 width, but made sure to minimize the height of the design. The most significant bottleneck
 was the gate oxide ``poking out'' of the ends of the design. In total, I was able to achieve
 a height of $30\lambda$ when arrayed.
 Other designs with smaller height were possible, but I found them undesirable. For instance,
 Reed's now-famous design used a significant amount of high-level metals to achieve its tiny,
 almost square area. This, however, makes routing \textsc{Wl} signals fairly complicated. They either
 need to go to yet another layer of metal, or the decoder needs to be split into 4 pieces. The former
 is undesirable as per the requirements for this assignment; the latter incurs the cost of additional
 decoder hardware between columns, thereby significantly increasing the wire length and signal
 delays. Since delays incurred by the flip flops and other signals are already becoming
 a significant factor in my design, I thought it would be best to avoid such delays.
 Other ideas I am aware of include putting \textit{all} the transistors in a single, horizontal line.
 While this certainly succeeds at reducing the height, it incurs all the same issues described
 above - it becomes nigh impossible to wire further \textsc{Wl} lines through each column,
 unless the decoder is split into bits, in which case the width of the entire assembly drastically increases,
 slowing down all signals.
 \begin{figure}[h]
    \centering
    \includegraphics[width=0.5\linewidth]{layout_single.png}
    \caption{Electric layout for a single cell.}
    \label{fig:layout-cell}
 \end{figure}
 \pagebreak
 My basic cell is shown in Figure \ref{fig:layout-cell}. The arrayed version (in Figure \ref{fig:layout-arrayed})
 merits additional explanation. In my earlier description of the overall design, I mentioned
 that I have precharge PMOS transistors. I have integrated these into my layout to accurately model
 my design. I also made them $10\lambda$ wide, since this is, at the time of writing,
 the size of my 4 precharge transistors. In the bird's eye view (Figure \ref{fig:layout-arrayed-far}),
 three things can be observed:
 \begin{itemize}
    \item \textit{Additional vertical line:} This line represents the clock signal,
        which must be fed to the precharge transistors. In the full design, there would
        be 5 clock lines (3 shared, and 2 on either side).
    \item \textit{``Empty'' space between nodes:} I left this space because I was not sure
        how wide I would end up making my \textsc{Bt} and \textsc{Bf} wires. I have measured
        the distance to ensure that the design will remain DRC clean with up to \textbf{$8\lambda$-wide bitlines}.
        This appears to be a sweet spot for my design, anyway.
    \item \textit{Moved well contacts:} I have moved my well contacts to the region between
        two columns. By extending the N- and P-wells to this area, I was able to
         share a single contact between two cells, leaving room for precharge transistors
        on both sides of the cell. This was partially inspired by Reed's compact cell design,
        which shared a single contact between two cells\footnote{I am operating based on your
        comment that well contacts for every cell are significantly overkill.}.
 \end{itemize}
 Figure \ref{fig:layout-arrayed-close} shows a closer view of the design. Due to the additional
 space incurred, an entire column is approximately $100\lambda$ wide.
 \begin{figure}[h]
 \centering
 \begin{subfigure}{.5\textwidth}
  \centering
  \includegraphics[width=.7\linewidth]{layout_arrayed.png}
  \caption{Bird's eye view of the arrayed SRAM cells.}
  \label{fig:layout-arrayed-far}
 \end{subfigure}%
 \begin{subfigure}{.5\textwidth}
  \centering
  \includegraphics[width=.8\linewidth]{layout_arrayed_closeup.png}
  \caption{Close up from arrayed SRAM cells.}
  \label{fig:layout-arrayed-close}
 \end{subfigure}
 \caption{Layout of the arrayed SRAM cells}
 \label{fig:layout-arrayed}
 \end{figure}
 \pagebreak
 \section{Further Design Ideas}
 I discovered -- from other people in the class -- that an 8-column design was plausible.
 Unfortunately, I was only convinced a day or so before the project was due, which did not give me
 enough time to redesign my SRAM. I have seen students successfully using
 the 8-column design by sharing \textsc{Wl} wires for each ``row'', and using
 the remaining 3 bits to enable and disable the write block. Since reading does
 not change the cell value, this is a viable approach; all 8 columns would ``read''
 (except during writing, in which 7 columns would read and 1 would write). As
 long as a proper address selection mechanism is implemented into the read collector
 circuit (which at present cannot handle concurrent reads), this would work just
 fine, albeit at the expense of added power consumption (from draining and re-charging
 7 extra wires). This design, combined with my idea of placing the write block
 in the middle of the column, can lead to very short effective wire lengths. If
 I was to approach this project again, that's what I would try.
 \section{Acknowledgements}
 Reed's aforementioned idea of sharing well contacts between adjacent cells
 played a part in my design. Also, without the other students in the class
 Discord, I would not have known to use the ``better'' wire model at all.
 \pagebreak
 \bibliographystyle{unsrt}
 \bibliography{bibliography}
 \end{document}
--- a/final/testBuffer.cir
+++ b/final/testBuffer.cir
@@ -0,0 +1,70 @@
 * File includes subcircuits and technology definitions
 .include ./SRAM_bits.cir
 * memLoad: emulates the bitline load of unselected SRAM cells.
 *   ttt, fff : the two bitlines being loaded
 *   number   : how many cells' worth of load to present (device ww = number*5)
 * Each side is one NMOS with its gate tied to gnd and its source on the
 * otherwise unconnected internal node "dead".
 .subckt memLoad ttt fff number=254
 Xnt ttt gnd dead nn ww='number*5'
 Xnf fff gnd dead nn ww='number*5'
 .ends memLoad
 *********begin: topLevel*****
 * Top-level test deck for one bit column: stimulus -> input flops ->
 * decoder model / write block -> four-segment bitline with cell loads ->
 * one real memory cell -> read block -> read collector.
 * Parameters
 .global gnd vdd
 .param gnd=0
 *********begin: topLevel*****
 * per      : clock period
 * dataLead : extra delay applied to the clock stimulus relative to the data stimuli
 * lw       : total bitline length (each wire segment gets lw/4)
 * wirew    : bitline width
 .param per = 1.3ns
 .param dataLead=per*0.1
 .param lw=2200
 .param wirew=14
 vdd vdd 0 'supply'
 * Stimulus generators
 Xclok clk               dat1 period='per' start='per+dataLead' total=1 duty=0.5 sz=300
 Xad ad               dat1 period='per' start='per' total=1 duty=0.5 sz=300
 Xrdwr rdw               dat1 period='per' start='2*per'        total=2 duty=1 sz=300
 Xdii din                dat1 period='per' start='per'          total=4 duty=2 sz=300
 * Six-inverter chain producing clkb6, a delayed copy of clk for the write block
 Xinv1 clkb1 clk inv
 Xinv2 clkb2 clkb1 inv
 Xinv3 clkb3 clkb2 inv
 Xinv4 clkb4 clkb3 inv size='300'
 Xinv5 clkb5 clkb4 inv
 Xinv6 clkb6 clkb5 inv size='300'
 * Input/output flops. The address flop was previously also named "Xad",
 * duplicating the stimulus instance above; it is renamed to follow the
 * Xdinff/Xrdwff pattern so that every top-level instance name is unique.
 Xadff adf ad clk flop
 Xdinff dinf din clk flop
 Xrdwff rdwf rdw clk flop
 Xrotff dotf dot clk flop
 Xdec choose adf clk decModel
 * Write block sits on bt3/bf3, the junction between segments 2 and 3
 Xwr bt3 bf3 dinf rdwf adf clkb6 iWrite1
 * Bitline: four precharged wire segments with cell loads at the junctions
 Xw1 bt1 bt2 bf1 bf2  clk   wire_precharge len='lw/4' wid='wirew'
 Xmd1 bt2 bf2 memLoad number=15
 Xw2 bt2 bt3 bf2 bf3  clk   wire_precharge len='lw/4' wid='wirew'
 Xmd2 bt3 bf3 memLoad number=16
 Xw3 bt3 bt4 bf3 bf4  clk   wire_precharge len='lw/4' wid='wirew'
 Xmd3 bt4 bf4 memLoad number=16
 Xw4 bt4 btt bf4 bff  clk   wire_precharge len='lw/4' wid='wirew'
 * NOTE(review): Xmd4 is attached to bt3/bf3, the same nodes as Xmd2, while
 * no memLoad sits on bt1/bf1 or btt/bff - confirm this placement is intended.
 Xmd4 bt3 bf3             memLoad number =16
 * Alternative placements of the real memory cell (enable exactly one)
 * Xla bt1 bf1 choose         mem1
 * Xla bt3 bf3 choose         mem1
 Xla btt bff choose         mem1
 Xrd btt bff set rst rdwf clk choose iReadSub
 * Only column 0 of the collector is driven; the other inputs are tied to vdd
 Xrc dot set rst vdd vdd vdd vdd vdd vdd readCollect
 .ic V(la:tt)=0 V(la:ff)=1
 .ic V(bt2)=1
 .tran 1p 'per*20'
 .meas tran dot_delay trig V(clk) val=0.8*supply rise=2 targ V(dot) val=0.8*supply rise=1
--- a/final/testDecoder.cir
+++ b/final/testDecoder.cir
@@ -0,0 +1,47 @@
 * File includes subcircuits and technology definitions
 .include ./SRAM_bits.cir
 * memLoad: emulates the load that a group of SRAM cells places on a
 * pair of nodes.
 *   ttt, fff : the two nodes being loaded
 *   number   : the number of cells whose load is modeled (default 254)
 * Each node gets one nn device whose gate is tied to gnd and whose width
 * parameter is ww = number*5. Both devices share the internal node "dead",
 * which has no other connection inside this subcircuit.
 .subckt memLoad ttt fff number=254
 Xnt ttt gnd dead nn ww='number*5'
 Xnf fff gnd dead nn ww='number*5'
 .ends memLoad
 *********begin: topLevel*****
 * Decoder test bench: global nodes and the ground alias
 .global gnd vdd
 .param gnd=0
 * Timing and wire parameters
 *   per       : period handed to the dat1 stimuli below
 *   lw, wirew : not referenced by any visible line of this bench
 .param per = 5ns
 .param lw=500
 .param wirew=3
 *DC supplies
 vdd vdd 0 'supply'
 * Stimuli (dat1 is defined in SRAM_bits.cir): a clock on clk and an address
 * bit on ad0. The address stimulus starts at 0.5*per, the clock at per.
 Xclok clk               dat1 period='per' start='per' total=1 duty=0.5 sz=120
 Xbit  ad0               dat1 period='per' start='0.5*per' total=3 duty=1
 * Device under test: decModel connected to ope, ad0 and clk.
 * NOTE(review): ope is presumably the decoder output -- confirm in SRAM_bits.cir.
 Xde ope ad0 clk decModel size=20
 .tran 1p 25n
--- a/final/testMem.cir
+++ b/final/testMem.cir
@@ -0,0 +1,61 @@
 * File includes subcircuits and technology definitions
 .include ./SRAM_bits.cir
 * memLoad: emulates the load that a group of SRAM cells places on a
 * pair of nodes.
 *   ttt, fff : the two nodes being loaded
 *   number   : the number of cells whose load is modeled (default 254)
 * Each node gets one nn device whose gate is tied to gnd and whose width
 * parameter is ww = number*5. Both devices share the internal node "dead",
 * which has no other connection inside this subcircuit.
 .subckt memLoad ttt fff number=254
 Xnt ttt gnd dead nn ww='number*5'
 Xnf fff gnd dead nn ww='number*5'
 .ends memLoad
 *********begin: topLevel*****
 * Memory-cell test bench: global nodes and the ground alias
 .global gnd vdd
 .param gnd=0
 * Timing and wire parameters
 *   per   : base period; the clock stimulus uses 0.5*per, the others per
 *   lw    : length of the single wire_dual segment Xw0
 *   wirew : width of that segment
 .param per = 5ns
 .param lw=100
 .param wirew=3
 *DC supplies
 vdd vdd 0 'supply'
 * Stimuli (dat1 is defined in SRAM_bits.cir): clock, data-in and address bit.
 * The clock instance gives no start= value, so it uses whatever default
 * dat1 declares.
 Xclok clk               dat1 period='0.5*per' total=1 duty=0.5 sz=120
 Xdii dii                dat1 period='per' start='per' total=3 duty=1
 Xbit  ad0               dat1 period='per' start='0.5*per' total=3 duty=1
 * Decoder model connected to ope, which is also wired to the cell Xla below.
 Xde   ope ad0 clk decModel size=20
 * The write block's fourth pin is hardwired to gnd here; per the original
 * note that pin is the rdw signal.
 Xwr bt0 bf0 dii gnd clk write1
 * One wire_dual segment (pins: lt rt lf rf) from bt0/bf0 to bt1/bf1.
 Xw0 bt0 bt1 bf0 bf1     wire_dual len='lw' wid='wirew'
 * Place the memory cell at the end of the wire.
 * First make sure it works with a short wire and few memory cells.
 * Signals to view on the plotter:
 *v(ope), v(dii)
 *v(la:ff) v(la:tt)
 *v(bf1) and v(bt1)
 Xla bt1 bf1 ope         mem1 m=1
 Xmd bt1 bf1             memLoad number =254
 * NOTE(review): the number below carries no explanation in this file.
 *14.462274109131130
 .tran 1p 50n
--- a/final/testRead.cir
+++ b/final/testRead.cir
@@ -0,0 +1,61 @@
 * File includes subcircuits and technology definitions
 .include ./SRAM_bits.cir
 * memLoad: emulates the load that a group of SRAM cells places on a
 * pair of nodes.
 *   ttt, fff : the two nodes being loaded
 *   number   : the number of cells whose load is modeled (default 254)
 * Each node gets one nn device whose gate is tied to gnd and whose width
 * parameter is ww = number*5. Both devices share the internal node "dead",
 * which has no other connection inside this subcircuit.
 .subckt memLoad ttt fff number=254
 Xnt ttt gnd dead nn ww='number*5'
 Xnf fff gnd dead nn ww='number*5'
 .ends memLoad
 *********begin: topLevel*****
 * Read-block test bench: global nodes and the ground alias
 .global gnd vdd
 .param gnd=0
 * Timing and wire parameters
 *   per       : period handed to the dat1 stimuli and the PULSE sources
 *   lw, wirew : not referenced by any visible line of this bench
 .param per = 3n
 .param lw=5000
 .param wirew=3
 *DC supplies
 vdd vdd 0 'supply'
 * Stimuli (dat1 is defined in SRAM_bits.cir). rdw and dii are generated but
 * not connected to the read block below, which has vdd on that pin.
 Xclok clk               dat1 period='per' start='per' total=1 duty=0.5 sz=120
 Xrdwr rdw               dat1 period='per' start='per' total=2 duty=1
 Xdii dii                dat1 period='per' start='per' total=3 duty=1
 * vary: dip is the voltage by which one bit line sits below the supply.
 .param dip=0.05
 * Complementary bit-line sources, PULSE(V1 V2 Tdelay Trise Tfall Ton Tperiod):
 *   bt2 switches between supply and supply-dip,
 *   bf2 switches between supply-dip and supply.
 * V1 and V2 are separated by a space so that the two quoted expressions
 * cannot be read as a single token.
 Vt bt2  0  PULSE('supply' 'supply-dip' 'per' 10p 10p '2*per' '4*per')
 Vf bf2  0  PULSE('supply-dip' 'supply' 'per' 10p 10p '2*per' '4*per')
 Xbit  ad0               dat1 period='per' start='0.5*per' total=3 duty=1
 Xde ope ad0 clk decModel size=20
 * Earlier variant with a single dot output, kept for reference:
 * Xrd bt2 bf2 dot vdd clk read1
 Xrd bt2 bf2 set rst vdd clk readSub
 * NOTE(review): with the read1 line above commented out, no visible element
 * in this bench connects to node dot, so this initial condition looks stale.
 .ic v(dot)=0
 .tran 1p 50n
--- a/final/testSRAM.cir
+++ b/final/testSRAM.cir
@@ -0,0 +1,58 @@
 * File includes subcircuits and technology definitions
 .include ./SRAM_bits.cir
 * memLoad: emulates the load that a group of SRAM cells places on a
 * pair of nodes.
 *   ttt, fff : the two nodes being loaded
 *   number   : the number of cells whose load is modeled (default 254)
 * Each node gets one nn device whose gate is tied to gnd and whose width
 * parameter is ww = number*5. Both devices share the internal node "dead",
 * which has no other connection inside this subcircuit.
 .subckt memLoad ttt fff number=254
 Xnt ttt gnd dead nn ww='number*5'
 Xnf fff gnd dead nn ww='number*5'
 .ends memLoad
 *********begin: topLevel*****
 * Write -> cell -> read test bench: global nodes and the ground alias
 .global gnd vdd
 .param gnd=0
 * Timing and wire parameters
 *   per   : base period; the clock stimulus uses 0.5*per, data-in uses 4*per
 *   lw    : length of each of the two wire_dual segments
 *   wirew : width of those segments
 .param per = 100ns
 .param lw=500
 .param wirew=3
 *DC supplies
 vdd vdd 0 'supply'
 * Stimuli (dat1 is defined in SRAM_bits.cir): clock, read/write, address bit,
 * data-in and acc. No element visible in this bench connects to acc other
 * than its own source.
 Xclok clk               dat1 period='0.5*per' total=1 duty=0.5 sz=120
 Xrdwr rdw               dat1 period='per' start='per' total=2 duty=1
 * Alternative: hold rdw at the supply voltage instead of pulsing it.
 *Vrdw rdw 0 'supply'
 Xbit  ad0               dat1 period='per' start='per' total=3 duty=1
 Xdii dii              dat1 period='4*per' total=1 duty=0.5 sz=120
 Xacc acc                dat1 period='per' start='per+10ps' total=2 duty=1
 * Signal path: write1 at bt0/bf0 -> wire Xw0 -> cell Xla plus one-cell load
 * at bt1/bf1 -> wire Xw1 -> read1 at bt2/bf2, which also connects to dot.
 Xwr bt0 bf0 dii rdw clk write1
 Xw0 bt0 bt1 bf0 bf1     wire_dual len='lw' wid='wirew'
 Xla bt1 bf1 ope mem1
 Xmd bt1 bf1             memLoad number =1
 Xw1 bt1 bt2 bf1 bf2     wire_dual len='lw' wid='wirew'
 Xrd bt2 bf2 dot rdw clk read1
 * Decoder model connected to ope, the node the cell Xla is also wired to.
 Xde ope ad0 clk decModel size=10
 .tran 1ps 1600ns
--- a/final/testWrite.cir
+++ b/final/testWrite.cir
@@ -0,0 +1,51 @@
 * File includes subcircuits and technology definitions
 .include ./SRAM_bits.cir
 * memLoad: emulates the load that a group of SRAM cells places on a
 * pair of nodes.
 *   ttt, fff : the two nodes being loaded
 *   number   : the number of cells whose load is modeled (default 254)
 * Each node gets one nn device whose gate is tied to gnd and whose width
 * parameter is ww = number*5. Both devices share the internal node "dead",
 * which has no other connection inside this subcircuit.
 .subckt memLoad ttt fff number=254
 Xnt ttt gnd dead nn ww='number*5'
 Xnf fff gnd dead nn ww='number*5'
 .ends memLoad
 *********begin: topLevel*****
 * Write-block test bench: global nodes and the ground alias
 .global gnd vdd
 .param gnd=0
 * Timing and wire parameters
 *   per       : period handed to the dat1 stimuli below
 *   lw, wirew : not referenced by any visible line of this bench
 .param per = 1ns
 .param lw=500
 .param wirew=3
 *DC supplies
 * Design notes:
 * - make sure the data signal is set up before the clock signal triggers a write
 * - possibly NOR rdw and clk, and then maybe delay clk?
 * - connect the PMOS transistors to the output of the NOR gate, not directly to clk
 vdd vdd 0 'supply'
 * Stimuli (dat1 is defined in SRAM_bits.cir). rdw is generated but not
 * connected to the write block, which has gnd on that pin.
 * The "+0" in the data-in start is a zero offset relative to the clock start.
 Xclok clk               dat1 period='per' start='per' total=1 duty=0.5 sz=120
 Xrdwr rdw               dat1 period='per' start='per' total=2 duty=1
 Xdii dii                dat1 period='per' start='per+0' total=3 duty=1
 * Device under test: write1 connected to bt0/bf0, with data dii and clock clk.
 Xwr bt0 bf0 dii gnd clk write1
 .tran 1p 15n
--- a/final/todo.md
+++ b/final/todo.md
@@ -0,0 +1,13 @@
 * [x] Figure out the weird opAmp behavior
 * [x] Design cell with strict metal policies
 * [x] Add precharger version of memory cell (or explain how they compose)
 * [x] Test cell in the _middle_.
 * [x] Walk through the consequences of the read/write block being in the middle.
 * [x] Figure out what to do with flopped write block.
 * [x] Test data close to write block (it pulls up past clock low!)
 * [ ] Drive wires to zero?
 * [x] Add missing well connection in layout
 * [x] Make sure width isn't too horrible
 * [ ] Model additional delay for the read/write block select during reads?
 * [x] Model worst case of decoder
 * [x] Cite [this](https://ieeexplore.ieee.org/document/210039)
--- a/final/toplevel.png
+++ b/final/toplevel.png
--- a/final/toplevel_design.png
+++ b/final/toplevel_design.png
--- a/final/write.png
+++ b/final/write.png
Author	SHA1	Message	Date
Danila Fedorin	f317a7e8da	Add some minor additional sections to report.	2021-03-17 13:34:04 -07:00
Danila Fedorin	90a5aed6ec	Size sense amp back to 40	2021-03-17 13:12:55 -07:00
Danila Fedorin	c6d7795074	Update report with new performance characteristics.	2021-03-17 13:09:15 -07:00
Danila Fedorin	d8f4a272e3	Update to use new wire model's characteristics.	2021-03-17 13:07:33 -07:00
Danila Fedorin	ff0edb93bb	Update with new wire model	2021-03-17 12:58:36 -07:00
Danila Fedorin	39ec744562	Update report.	2021-03-17 12:21:35 -07:00
Danila Fedorin	6530e7ef8c	1.9ns everywhere.	2021-03-17 12:19:33 -07:00
Danila Fedorin	71195df7c9	Add final SRAM design.	2021-03-17 12:19:17 -07:00
Danila Fedorin	0c1d8611b1	Add missing images.	2021-03-17 12:19:02 -07:00
Danila Fedorin	b99403a4ff	Update TODOs.	2021-03-17 12:16:02 -07:00
Danila Fedorin	9afa839bff	Add TODO.	2021-03-17 11:51:50 -07:00
Danila Fedorin	6f99879b8f	Update electric files.	2021-03-17 09:00:29 -07:00
Danila Fedorin	fe52f689f9	Upload files to 'final'	2021-03-17 00:32:18 -07:00
Danila Fedorin	c171b0374b	Upload files to 'final'	2021-03-17 00:02:13 -07:00
Danila Fedorin	eb8d068519	Update 'final/todo.md'	2021-03-17 00:00:34 -07:00
Danila Fedorin	6db76f4fd3	Update todo	2021-03-16 23:25:36 -07:00
Danila Fedorin	64ee80be63	Update report.	2021-03-16 23:25:30 -07:00
Danila Fedorin	6b963c967b	Merge branch 'master' of dev.danilafe.com:ECE-571/Labs	2021-03-16 19:02:58 -07:00
Danila Fedorin	4d4ceddcc6	Update todos	2021-03-16 19:02:06 -07:00
Danila Fedorin	75381749d7	Add initial designs.	2021-03-16 18:32:56 -07:00
Danila Fedorin	d2f53a9a4f	Update 'final/todo.md'	2021-03-16 18:19:45 -07:00
Danila Fedorin	0f6426958e	Update 'final/todo.md'	2021-03-16 16:47:53 -07:00
Danila Fedorin	8b9eabdec1	1.33ns with flip flops and bug mitigation	2021-03-16 16:30:28 -07:00
Danila Fedorin	e077bb9071	1.35ns with flip flops and bug mitigation	2021-03-16 16:20:35 -07:00
Danila Fedorin	76899cb8a3	2ns with flops.	2021-03-16 15:44:09 -07:00
Danila Fedorin	12856ef152	Use improved senseamp to strengthen performance.	2021-03-16 14:28:24 -07:00
Danila Fedorin	ca72f3eb3d	Rollback to 1.3ns and no write flops.	2021-03-16 12:22:50 -07:00
Danila Fedorin	e5b0166d8c	With flip flops.	2021-03-15 22:24:43 -07:00
Danila Fedorin	8ba9d02a8e	1.24ns	2021-03-15 16:02:04 -07:00
Danila Fedorin	c866f63e8c	1.3 nanoseconds	2021-03-15 15:17:04 -07:00
Danila Fedorin	f3ffb39219	Write-in-middle design.	2021-03-14 23:11:00 -07:00
Danila Fedorin	f289e84389	Current best effort.	2021-03-14 21:13:43 -07:00
Danila Fedorin	5f8a49ab9a	WIP (still buggy) 2ns design	2021-03-12 13:53:24 -08:00
Danila Fedorin	2010fcdf52	Add initial version of SRAM design.	2021-03-09 19:15:39 -08:00
Danila Fedorin	8285087e3f	Add Scott's various test files.	2021-03-09 19:15:28 -08:00
Danila Fedorin	b58f4df33e	Add my initial definitions of SRAM bits.	2021-03-09 19:15:07 -08:00