% x86.tex
\documentclass[mathserif,xcolor={dvipsnames,table}]{beamer}
\mode<presentation>{\usetheme{Warsaw}\usecolortheme{crane}}
\usepackage{centernot}
\usepackage{upgreek}
\usepackage{ragged2e}
\usepackage{graphicx}
\usepackage{fancybox}
\usepackage{transparent}
\usepackage{geometry}
\usepackage{tikz}
\usetikzlibrary{shadows}
\usepackage{hyperref}
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{lmodern}
\usepackage{amsmath}
\beamertemplatenavigationsymbolsempty
\title{\textbf{The x86 is dead. Long live the x86!}}
\date{}
\author{\textit{georgia tech $\circ$ summer 2013 $\circ$ cs4803uws $\circ$ nick black}}
\begin{document}
{
\setbeamertemplate{background canvas}{%
\includegraphics[width=\paperwidth,height=\paperheight]{images/gt2.jpeg}
}%
\begin{frame}[plain]
\textcolor{white}{
\transparent{0.5}%
\colorbox{black}{\textbf{the x86 is dead. long live the x86!}}
}
\vspace{2.7in}
\\
\hfill\includegraphics[scale=.25]{images/cc-logo.pdf}
\includegraphics[scale=.25]{images/cc-new.pdf}
\includegraphics[scale=.25]{images/cc-share.pdf}
\textcolor{white}{
\\
\hfill \tiny{CC3.0 share-alike attribution}\\
}
\textcolor{white}{
\hfill \scriptsize{copyright \copyright\ 2013 nick black}\\
}
\vspace{.6cm}
\textcolor{white}{
\hfill\tiny{with diagrams by david kanter of \href{http://realworldtech.com}{http://realworldtech.com}}
}
\end{frame}
}
{\usebackgroundtemplate{\includegraphics[scale=.27]{images/ivymain.jpeg}}
\begin{frame}{``Upon first looking into Intel's x86''}
\hfill that upon\\
\hfill which we \\
\hfill gaze is\\
\hfill mankind's\\
\hfill triumph,\\
\vspace{.8in}
\hfill and we\\
\hfill are its\\
\hfill stewards.\\
\vspace{.5in}
\hfill use it well.\\
\end{frame}
}
\begin{frame}{Why study the x86?}
\begin{itemize}
\item Used in a majority of servers, workstations, and laptops
\item Receives the most focus in the kernel/toolchain
\item Very complex processor, thus large optimization space
\item Excellent documentation and literature
\item Fascinating, revealing, lengthy history
\end{itemize}
\vfill
\vspace{.35in}
Do not think that x86 is all that's gone on over the past
30 years\footnote{Commonly expressed as ``All the world's an x86.''}.\\
\vspace{.35in}
That said, those who've chased peak on x86 can chase it anywhere.
\end{frame}
\begin{frame}{In the grim future of computing there are 10,000 ISAs}
\begin{columns}
\begin{column}{.5\textwidth}
\footnotesize{
\begin{itemize}
\item Alpha + BWX/FIX/CIX/MVI
\item AVR32 + JVM
\item CMS
\item PA-RISC + MAX-2
\item SuperH
\item i960
\item IA64 (Itanium)
\item MIPS + MDMX/MIPS-3D
\item IBMHLA (s390 + z)
\item m68k
\item VAX + VAXVA
\item z80 / MOS6502
\item MIX
\end{itemize}
}
\end{column}
\begin{column}{.5\textwidth}
\footnotesize{
\begin{itemize}
\item SPARC V9 + VIS3\footnote{\tiny{Most recently the ``Oracle SPARC Architecture 2011''.}}
\item JVM\footnote{\tiny{Most recently the Java SE 7 spec, 2013-02-28.}}
\item PTX/SASS\footnote{\tiny{Most recently the PTX ISA 3.1 spec, 2012-09-13.}}
\item TILE-Gx\footnote{\tiny{TILE-Gx ISA 1.2, 2013-02-26.}}
\item ARM + NEON\footnote{\tiny{ARMv8: A64, A32, and T32, 2011-10-27.}}
\item Blackfin
\item PowerISA + AltiVec/VSX\footnote{\tiny{PowerISA v.2.06B, 2010-11-03.}}
\item MMIX
\end{itemize}
}
\end{column}
\end{columns}
\end{frame}
\begin{frame}[t]{IA-16 CISC (MS-DOS, 640K--1M RAM)}
\begin{block}{8086/80186 (NEC, AMD, Fujitsu, OKI, Kvazar-Mikro, \ldots)}
\begin{itemize}
\item IBM PC/XT. Real mode. 16-bit regs and buses
\item 1MB RAM in 16 64K segments
\item 8086-2 ISA on 80186
{\footnotesize
(\texttt{ENTER}/\texttt{LEAVE}, \texttt{PUSHA}/\texttt{POPA}, \texttt{INS}/\texttt{OUTS})
}
\end{itemize}
\end{block}
\begin{block}{80286 (IBM, AMD, Fujitsu, \ldots)}
\begin{itemize}
\item IBM PC/AT (ISA). Protected mode w/ MMU
\item 16MB RAM with privilege levels on segments
\end{itemize}
\end{block}
\vfill
\begin{center}
\begin{tikzpicture}
\node[drop shadow,inner sep=0pt]
{\rowcolors{1}{RoyalPurple!20}{RoyalPurple!5}
{\footnotesize
\begin{tabular}{|r|r|r|c|l|}
\hline
8086 & 1978 & 29K@3$\upmu$m & 16 / 16 / 20 & 4--10MHz \\
\hline
8088 & 1979 & 29K@3$\upmu$m & 16 / 8 / 20 & 5--8MHz \\
\hline
80186 & 1982 & 55K@3$\upmu$m & 16 / 16 / 20 & 6--25MHz \\
\hline
80188 & 1982 & 55K@3$\upmu$m & 16 / 8 / 20 & 6--25MHz \\
\hline
80286 & 1982 & 134K@1.5$\upmu$m & 16 / 16 / 24 & 4--12.5MHz \\
\hline
\end{tabular}%
}
};
\end{tikzpicture}
\end{center}
\end{frame}
\begin{frame}{Intel 80286\hfill\tiny{image source: Wikimedia Commons}}
\includegraphics[width=\textwidth]{images/80286.pdf}
\end{frame}
\begin{frame}[t]{IA-32 CISC (Win 3.11, OS/2, Linux, 386BSD)}
\begin{block}{80386 (Am386, TI, Chips, IBM, \ldots)}
\begin{itemize}
\item IBM PS/2 (MCA). 32-bit segments, regs, buses
\item NatSemi 16550A UART. 80386SL: first (external) cache
\item 2\textbf{c} per simple instruction
\end{itemize}
\end{block}
\begin{block}{80486 (Am486, Cx486DLC, TI486SXL, IBM486DLC,\ldots)}
\begin{itemize}
\item EISA/VLB. On-die 8KB U\$, \texttt{XADD}, \texttt{CMPXCHG}
\item 1\textbf{c} per simple instruction, integrated FPU
\end{itemize}
\end{block}
\vfill
\begin{center}
\begin{tikzpicture}
\node[drop shadow,fill=white,inner sep=0pt]
{\rowcolors{1}{RoyalPurple!20}{RoyalPurple!5}
{\footnotesize
\begin{tabular}{|r|r|r|c|l|}
\hline
80386-DX & 1985 & 275K@1.5$\upmu$m & 32 / 32 / 32 & 12--33MHz \\
\hline
80386-SX & 1988 & 275K@1.5$\upmu$m & 32 / 16 / 24 & 16--33MHz \\
\hline
80486-DX & 1989 & 1.2M@1$\upmu$m & 32 / 32 / 32 & 20--50MHz \\
\hline
80486-SX & 1991 & 1.2M@1$\upmu$m & 32 / 32 / 32 & 16--33MHz \\
\hline
80486-DX2 & 1992 & 1.2M@.8$\upmu$m & 32 / 32 / 32 & 40--66MHz \\
\hline
Am486-DX4 & 1995 & 1.2M@.5$\upmu$m & 32 / 32 / 32 & 75--120MHz \\
\hline
\end{tabular}%
}
};
\end{tikzpicture}
\end{center}
\end{frame}
\begin{frame}{Intel 80486DX2\hfill\tiny{image source: Wikimedia Commons}}
\includegraphics[width=\textwidth]{images/80486DX2.pdf}
\end{frame}
\begin{frame}[t]{Pipelined superscalar (Win 95, BeOS, FreeBSD)}
\begin{block}{Intel P5 (Pentium, Pentium MMX)}
\begin{itemize}
\item U/V pipes, 8KB each I\$/D\$, better FPU/MUL, debug regs
\item PMMX adds MMX + perf counters (\texttt{RDPMC}), 2x16KB L1
\end{itemize}
\end{block}
\begin{block}{AMD K5 (SSA/5, 5k86)}
\begin{itemize}
\item RISC Am29000 with x86 frontend emitting $\upmu$code
\item Variable register window sizes
\end{itemize}
\end{block}
\begin{block}{Cyrix 6x86}
\begin{itemize}
\item Scratchpad (``L0'') cache, great ZOPS, crap FPU
\end{itemize}
\end{block}
\vfill
\begin{center}
\begin{tikzpicture}
\node[drop shadow,fill=white,inner sep=0pt]
{\rowcolors{1}{RoyalPurple!20}{RoyalPurple!5}
{\footnotesize
\begin{tabular}{|r|r|r|c|l|}
\hline
Pentium\footnote{\tiny{The faster .35$\upmu$m Pentiums (P54CS, P55C, Tillamook) didn't emerge until 1995 (P54CS) or 1997.}} & 1993 & 3.1M@.8$\upmu$m & 5 & 60--300MHz \\
\hline
AMD SSA/5 & 1996 & 4.3M@.5$\upmu$m & 5 & 75--100MHz \\
\hline
AMD 5k86 & 1996 & 4.3M@.35$\upmu$m & 6 & 90--133MHz \\
\hline
Cyrix 6x86 & 1996 & 3M@.65$\upmu$m & 7 & 80--150MHz \\
\hline
\end{tabular}%
}
};
\end{tikzpicture}
\end{center}
\end{frame}
\begin{frame}{Cyrix 6x86\hfill\tiny{image source: Wikimedia Commons}}
\includegraphics[width=\textwidth]{images/6x86.pdf}
\end{frame}
\begin{frame}[t]{SSE + OOO CRISC (Win 98, FreeBSD 3, Linux 2)}
\begin{block}{Intel P6 (PPro / PII / PIII)}
\begin{itemize}
\item PPro adds \texttt{SYSENTER, SYSEXIT}, big ``on-package'' L2
\item PIII adds 8 128-bit regs, SSE1 and \texttt{CMOVcc}
%SSE1 is strictly 32x4 FP + data-movement stuff, plus SFENCE
%PIII shared execution resources (save regs) between SSE/FP
\item Low-latency 8W L2 on Coppermine, HW prefetch on Tualatin
\end{itemize}
\end{block}
\begin{block}{AMD (NexGen) K6 (K6 / K6-2 / K6-III)}
\begin{itemize}
\item Adds \texttt{SYSCALL, SYSRET}
% shit for performance, but worked without OS support, but who cares?
\item K6-2 adds 3DNow! atop MMX regs
\item K6-III just sticks an L2 on K6-2
\end{itemize}
\end{block}
\vfill
\begin{center}
\begin{tikzpicture}
\node[drop shadow,fill=white,inner sep=0pt]
{\rowcolors{1}{RoyalPurple!20}{RoyalPurple!5}
{\footnotesize
\begin{tabular}{|r|r|r|c|l|}
\hline
Pentium Pro & 1995 & 5.5M@.5$\upmu$m & 14 & 150--333MHz / 60--66MHz \\
\hline
K6 & 1997 & 8.8M@.35$\upmu$m & 6 & 166--300MHz / 66MHz \\
\hline
K6-2 & 1998 & 9.3M@.25$\upmu$m & 6 & 300--550MHz / 66--100MHz \\
\hline
Pentium III & 1999 & 9.5M@.25$\upmu$m & 10 & 400--1400MHz / 100--133MHz \\
\hline
K6-III & 1999 & 21.4M@.25$\upmu$m & 10 & 350--500MHz / 66--100MHz \\
\hline
\end{tabular}%
}
};
\end{tikzpicture}
\end{center}
\end{frame}
\begin{frame}{K6\hfill\tiny{image source: Wikimedia Commons}}
\includegraphics[width=\textwidth]{images/k6.pdf}
\end{frame}
\begin{frame}{K6-2\hfill\tiny{image source: Wikimedia Commons}}
\includegraphics[width=\textwidth]{images/k62.pdf}
\end{frame}
\begin{frame}{K6-III\hfill\tiny{image source: Wikimedia Commons}}
\includegraphics[width=\textwidth]{images/Amdk6III_arch.pdf}
\end{frame}
\begin{frame}[t]{SSE2 + long pipelines (FreeBSD 4, Linux 2.4)}
\begin{block}{Intel NetBurst (P4, PD)}
\begin{itemize}
\item P4 adds SSE2, HyperThreading, quad-pumped FSB
% SSE2 implements MMX integer ops and 64-bit double FP
\end{itemize}
\end{block}
\begin{block}{AMD K7 (Athlon)}
\begin{itemize}
\item Alpha's EV6 bus, 3-way FPU fixes AMD FP
\item Enhanced 3DNow!; Palomino gets SSE
\item T-Bird adds L2 cache on-die
\end{itemize}
\end{block}
\begin{block}{VIA C3+C7 / Transmeta Crusoe+Efficeon}
\begin{itemize}
\item Who cares? x86 officially becomes a two-party system.
\end{itemize}
\end{block}
\vfill
\begin{center}
\begin{tikzpicture}
\node[drop shadow,fill=white,inner sep=0pt]
{\rowcolors{1}{RoyalPurple!20}{RoyalPurple!5}
{\footnotesize
\begin{tabular}{|r|r|r|c|l|}
\hline
Athlon T-Bird & 2000 & 37M@180nm & 10/15 & .6--1.4GHz / 333MT/s \\
\hline
Intel Northwood & 2003 & 55M@130nm & 20 & 1.6--3.4GHz / 800MT/s \\
\hline
Athlon Barton & 2003 & 54M@130nm & 10/15 & 1.833--2.333GHz / 400MT/s \\
\hline
% to give you a point of reference that's bigger than the alaskan pipeline
Intel Prescott & 2004 & 125M@90nm & 31 & 3--3.8GHz / 800MT/s \\
\hline
\end{tabular}%
}
};
\end{tikzpicture}
\end{center}
\end{frame}
\begin{frame}{Pentium 4 Northwood\hfill\tiny{image source: www.chip-architect.com}}
\includegraphics[width=\textwidth]{images/northwood.jpg}
\end{frame}
\begin{frame}[t]{x86-64 multicore (FreeBSD 7, Linux 2.6)}
\begin{block}{Intel Core}
\begin{itemize}
\item 45nm Penryn replaces $\text{SiO}_2$ with HfSiON.
\item Woodcrest Xeon adds SSSE3
\end{itemize}
\end{block}
\begin{block}{AMD K8}
\begin{itemize}
\item HyperTransport @ 1GHz on Socket 939
\item San Diego adds SSE3, IOMMU
\item Quad FX (Windsor) goes NUMA on HT2
\end{itemize}
\end{block}
\vfill
\begin{center}
\begin{tikzpicture}
\node[drop shadow,fill=white,inner sep=0pt]
{\rowcolors{1}{RoyalPurple!20}{RoyalPurple!5}
{\footnotesize
\begin{tabular}{|r|r|r|c|l|}
\hline
AMD San Diego & 2005 & 114M@90nm & 12/17 & 2.2--2.6GHz / 1GHz \\
\hline
Intel Conroe & 2006 & 291M@65nm & 14 & 1.6--2.6GHz / 533--800MHz \\
\hline
AMD Windsor & 2007 & 154M@90nm & 12/17 & 2--3.2GHz / 2GHz \\
\hline
Intel Penryn & 2008 & 410M@45nm & 14 & 1.2--3.06GHz / 800--1333MHz \\
\hline
\end{tabular}%
}
};
\end{tikzpicture}
\end{center}
\end{frame}
{
\setbeamertemplate{background canvas}{%
\includegraphics[width=\paperwidth,height=\paperheight]{images/opteron.pdf}
}%
\begin{frame}{AMD K8 Opteron\hfill\tiny{image source: Wikimedia Commons}}
\end{frame}
}
\begin{frame}[t]{AVX + scaling + heterogeneity (FreeBSD 8, Linux 3)}
\begin{block}{Recent Intel}
\begin{itemize}
\item Return of the barrel shifter (and HyperThreading)!
\item Nehalem adds SSE4.2, 2TLB, VT-d, integrates northbridge
\item Quick Path Interconnect on Nehalem-E
\item Sandy Bridge integrates GPU, GMCH, L3, 256-bit AVX
\item Ivy Bridge adds PCIe 3.0, FinFET trigates
\item Haswell adds transactional memory, 256-bit AVX2, FMA3
\end{itemize}
\end{block}
\vfill
\begin{center}
\begin{tikzpicture}
\node[drop shadow,fill=white,inner sep=0pt]
{\rowcolors{1}{RoyalPurple!20}{RoyalPurple!5}
{\footnotesize
\begin{tabular}{|r|r|r|c|l|}
\hline
Intel Nehalem & 2008 & 731M@45nm & 16 & 2--3.33GHz \\
\hline
Intel Sandy Bridge & 2011 & 1.2B@32nm & 14/19 & 2.3--4GHz \\
\hline
AMD Bulldozer & 2011 & 1.2B@32nm & 15/20 & 3--4.6GHz \\
\hline
Intel Haswell & 2013 & 1.5B@22nm & 14/19 & 2.3--3.9GHz \\
\hline
\end{tabular}%
}
};
\end{tikzpicture}
\end{center}
\end{frame}
\begin{frame}{Sandy Bridge (2011)\hfill\tiny{image source: \textit{Intel Optimization Manual}}}
\includegraphics[scale=.175]{images/sandybridge.jpg}
\end{frame}
\begin{frame}{Haswell (2013)\hfill\tiny{image source: \href{http://realworldtech.com}{Real World Technologies}}}
\begin{columns}
\begin{column}{.5\textwidth}
\includegraphics[scale=.225]{images/haswell-5.png}
\end{column}
\end{columns}
\end{frame}
\begin{frame}{Decoding\hfill\tiny{image source: \href{http://realworldtech.com}{Real World Technologies}}}
\includegraphics[scale=.49]{images/haswell-1.png}
\end{frame}
\begin{frame}{Schedulers\hfill\tiny{image source: \href{http://realworldtech.com}{Real World Technologies}}}
\includegraphics[scale=.42]{images/haswell-2.png}
\end{frame}
\begin{frame}{Execution\hfill\tiny{image source: \href{http://realworldtech.com}{Real World Technologies}}}
\begin{columns}
\begin{column}{.5\textwidth}
\hfill\includegraphics[scale=.45]{images/haswell-3.png}
\end{column}
\end{columns}
\end{frame}
\begin{frame}{Write order with store-buffer forwarding}
\scriptsize{
Very rigid memory model for all but non-temporal accesses\footnote{\tiny{Centaur Winchip implemented ``x86 oostore'', with less strongly ordered memory.}}:
\begin{itemize}
\item Atomics are never reordered
\item The pipeline and I\$ are coherent
\item Stores can be reordered after loads, but not before
\item Accesses to uncacheable memory are never reordered
\item Loads can be reordered before older stores to other locations
\item String ops, streaming stores, non-temporal moves and CLFLUSH can reorder
writes among themselves
\end{itemize}
\vfill
Among multiple (logical or physical) processors:
\begin{itemize}
\item Individual processors maintain this model
\item Writes by a single processor are seen in the same order by all processors
\item Writes among processors are not totally ordered, unless locked
\item Memory ordering obeys causality
\item Stores are externally consistent
\end{itemize}
\textbf{Implication:} concurrent stores by multiple processors will result in
each processor seeing its own write occur first!
}
\end{frame}
\begin{frame}{Task model}
Hardware-recognized task state is stored in Task State Segments:
\begin{itemize}
\item Segment selectors of segment registers and LDTR
\item GP registers, EFLAGS, EIP, CR3, TR
\item I/O map base address and I/O map
\item Stack pointers for CPL0, 1, 2 stacks
\item Link to previously executed task
\end{itemize}
\vfill
Share among tasks via three methods:
\begin{itemize}
\item Through the GDT segment descriptors\\(share among all processes)
\item Through a shared LDT\\(share among some cooperating processes)
\item Through shared descriptors in distinct LDTs mapped to a common
address in linear address space\\(share among some non-cooperating
processes)
\end{itemize}
\end{frame}
\begin{frame}{Simultaneous multithreading}
HyperThreading, when activated by BIOS and OS, results in:
\begin{description}
\item[\textbf{Replication}] Registers, renamed RSB, large page ITLB
\item[\textbf{Partition}] Load/store buffers, ROB, small page ITLB
\item[\textbf{Competition}] Reservation stations, cache\footnote{\tiny{Actually, L1 D\$ can be non-competitively shared between logical cores in \textit{adaptive mode}, so long as their CR3 registers match, their paging modes match, and bit 24 of the IA32\_MISC\_ENABLE MSR isn't set.}}, DTLB, STLB
\end{description}
\end{frame}
\begin{frame}{Immediate state}
\begin{center}
\begin{tikzpicture}
\node[drop shadow,fill=white,inner sep=0pt]
{\rowcolors{1}{BurntOrange!20}{BurntOrange!5}
{\footnotesize
\begin{tabular}{|l|l|}
\hline
\textbf{Architectural component} & \textbf{Count} \\
\hline
\hline
64-bit x86-64 GP / segment registers & 16 / 6 \\
\hline
80-bit x87/MMX registers & 8 \\
\hline
AVX/XMM registers & 16 \\
\hline
Special-purpose & 2 (EFLAGS, EIP) \\
\hline
\end{tabular}%
}
};
\end{tikzpicture}
\vspace{.25in}
\begin{tikzpicture}
\node[drop shadow,fill=white,inner sep=0pt]
{\rowcolors{1}{BurntOrange!20}{BurntOrange!5}
{\footnotesize
\begin{tabular}{|l|l|l|l|}
\hline
\textbf{$\boldsymbol\upmu$architectural component} & \textbf{Haswell} & \textbf{Sandy Bridge} & \textbf{Nehalem} \\
\hline
\hline
Load buffers & 72 & 64 & 48 \\
\hline
Store buffers & 42 & 36 & 32 \\
\hline
PRF entries (x86-64+AVX) & 168+168 & 160+144 & N/A \\
\hline
Line fill buffers & 10 & 10 & 10 \\
\hline
Reservation stations & 60 & 54 & 34 \\
\hline
Reorder buffer entries & 192 & 168 & 128 \\
\hline
\end{tabular}%
}
};
\end{tikzpicture}
\end{center}
\end{frame}
\begin{frame}{Instruction store}
\begin{itemize}
\item 32KB instruction cache, 16B/\textbf{c}$\implies$48GB/s @ 3GHz
\item 8-way on Sandy Bridge, 4-way on Nehalem
\item 128 4K ITLB entries, 8 large ITLB entries\footnote{7 on Nehalem}
\item MSROM 4$\upmu$op/\textbf{c}
\item 1.5K$\upmu$op trace cache\footnote{The Pentium 4 trace cache was a whopping 12K$\upmu$ops.}
\end{itemize}
\end{frame}
\begin{frame}{On-die data store}
The latencies below are best cases (simple address mode, no stalls, no eviction, etc.). MESI details are stored
in L1/L2, while directory details are kept in the LLC (hence the latter's
inclusivity).
\vfill
\begin{columns}
\begin{column}{0.5\textwidth}
\small{
\begin{itemize}
\item 32KB 8-way 64B/line data cache, 4\textbf{c} latency, 2x16B/\textbf{c}
\item Sandy Bridge provides 32B/\textbf{c} L1 load, Haswell 64B/\textbf{c} L1/L2 load and 32B/\textbf{c} L1 store
\item 256KB 8-way 64B/line unified non-inclusive L2, 12\textbf{c} (10\textbf{c} on Nehalem), 32B/\textbf{c}
\item Variable 64B/line unified inclusive L3, 26--31\textbf{c} (35--40\textbf{c} on Nehalem), 32B/\textbf{c}
\item 43\textbf{c}/60\textbf{c} for clean/dirty forward
\end{itemize}
}
\end{column}
\begin{column}{0.5\textwidth}
\includegraphics[scale=.4]{images/skynet.pdf}
\end{column}
\end{columns}
\hfill
\tiny{\texttt{lstopo(1)} output}
\end{frame}
\begin{frame}{TLBs and Memory interface}
\begin{itemize}
\item 4-way DTLB translates up to 2 loads/\textbf{c} and 1 store/\textbf{c}
\item DTLB: 64 4KB entries, 32 2MB/4MB entries, 4 1GB entries
\item 4-way unified STLB with 512\footnote{1024 on Haswell} 4KB entries, 7\textbf{c} latency
\item L1: IP-based stride prefetcher, DCU streaming prefetcher
\item L2: Spatial prefetcher, streaming prefetcher
\item Nehalem: 3x 8B DDR3 @ 1.333GT/s (31.992GB/s)
\item Sandy Bridge-E: 4x 8B DDR3 @ 1.6GT/s (51.2GB/s)
\item Speculative reads allowed from WB, WC, and WT memory
\end{itemize}
\end{frame}
\begin{frame}{Instruction set I---Encoding}
x86-64 instructions can be up to 15 bytes.
\vfill
\begin{itemize}
\item Up to four \textit{legacy prefixes}:
\begin{itemize}
\item{LOCK/REPNE/REPNZ/REP/REPE/REPZ}
\item{Segment overrides and branch hinting}
\item{Operand-size override}
\item{Address-size override}
\end{itemize}
\item Opcode with possible \textit{family prefix} (REX, VEX, XOP)
\item Possible 1-byte ModR/M (operand encoding, register/memory)
\item Possible 1-byte SIB (scale-index-base)
\item 0, 1, 2, 4, or 8-byte displacement
\item 0, 1, 2, 4, or 8-byte immediate
\end{itemize}
\vfill
Recall that 4 instructions per 16B/\textbf{c} fetch is necessary for peak
decoder throughput (but this is not requisite for peak issue rate).
\end{frame}
\begin{frame}{Instruction set II---Memory}
\justifying
\small{\textit{Non-temporal} accesses bypass cache and do not require RFOs; they are
write-combining, write-collapsing, and weakly ordered. They do not take
PATs into account. When
properly bunched into cacheline-length chunks, they can deliver the full bus
bandwidth. \textit{Prefetches} (as opposed to preloading) do not consume a
register, do not stall retirement, do not split cache lines, and cannot
fault\footnote{\tiny{CPUID.01H.EBX holds the stride length (D\$/U\$ only).}}.
\textit{Fence} instructions ensure coherence of data.}\\
\vfill
%MOVs to CR3/CR4 can also invalidate page cache structures.
\begin{tikzpicture}
\node[drop shadow,fill=white,inner sep=0pt]
{\rowcolors{1}{BurntOrange!20}{BurntOrange!5}
{\tiny
\begin{tabular}{|l|l|l|}
\hline
MOVNT*\footnote{\tiny{See also AMD's SSE4a's MOVNTSD/MOVNTSS.}} & \_mm\_stream\_* & Streaming store to mem\\
\hline
MOVNTDQA & \_mm\_stream\_load\_* & Streaming loads from USWC MMIO\\
\hline
PREFETCHNTA & \_mm\_prefetch(NTA) & Prefetch into second-level cache\\
\hline
PREFETCHT0 & \_mm\_prefetch(T0) & Prefetch into all cache levels\\
\hline
MASKMOVD* & \_mm\_maskmove\_* & Bytemasked store to mem\\
\hline
SFENCE/LFENCE & \_mm\_sfence/\_lfence & Order point for stores/loads\\
\hline
MFENCE & \_mm\_mfence & Order point for stores and loads\footnote{\tiny{Not equivalent to SFENCE+LFENCE, which would be mutually unordered! Only MFENCE orders CLFLUSH.}}\\
\hline
WBINVD & "wbinvd":::"memory" & Write back, and invalidate cache\\
\hline
INVD & "invd" & Screw it, just invalidate the caches\\
\hline
INVLPG & "invlpg" & Invalidate TLB entries containing address\\
\hline
INVPCID & "invpcid" & Invalidate TLB entries based on PCID\\
\hline
% All levels, broadcasted
CLFLUSH & "clflush \%0":"+m" & Cacheline invalidation\\
\hline
\end{tabular}%
}
};
\end{tikzpicture}
\end{frame}
\begin{frame}{Instruction set III---Some control instructions}
\begin{tikzpicture}
\node[drop shadow,fill=white,inner sep=0pt]
{\rowcolors{1}{BurntOrange!20}{BurntOrange!5}
{\scriptsize
\begin{tabular}{|l|l|}
\hline
HLT & Halt the processor\\
\hline
MWAIT (SSE3) & Monitor a cache line (\_mm\_mwait)\\
\hline
CMOVcc & Conditional move (control dep to data dep)\\
\hline
JMP & Unconditional branch\\
\hline
Jxx / Jx & Conditional branches\\
\hline
CALL & Task switch\\
\hline
IRET & Return from far call\\
\hline
\end{tabular}%
}
};
\end{tikzpicture}
\end{frame}
\begin{frame}{Instruction set IV---Some arithmetics}
\begin{tikzpicture}
\node[drop shadow,fill=white,inner sep=0pt]
{\rowcolors{1}{BurntOrange!20}{BurntOrange!5}
{\tiny
\begin{tabular}{|l|l|l|}
\hline
VFMADD*[PS][DS] & \_mm\_fmadd\_p* & FP fused multiply-add\\
\hline
VFNMADD*[PS][DS] & \_mm\_fnmadd\_p* & FP fused multiply-negate-add\\
\hline
VFMSUB*[PS][DS] & \_mm\_fmsub\_p* & FP fused multiply-sub\\
\hline
VFNMSUB*[PS][DS] & \_mm\_fnmsub\_p* & FP fused multiply-negate-sub\\
\hline
VFMADDSUB*[PS][DS] & \_mm\_fmaddsub\_p* & FP fused multiply-sub/add\\
\hline
VPGATHER[DQ][DQ] & \_m256i\_mm256\_i*gather\_epi* & Gather ints from memory\\
\hline
VGATHER[DQ]P[SD] & \_m256\_mm256\_i*gather\_p* & Gather FPs from memory\\
\hline
VPADD[BWDQ] & \_m256i\_mm256\_add\_epixx & Packed integer add\\
\hline
VPADDS[BW] & \_m256i\_mm256\_adds\_epixx & Packed sinteger add, signed saturation\\
\hline
VPADDUS[BW] & \_m256i\_mm256\_adds\_epuxx & Packed unsinteger add, unsigned saturation\\
\hline
VPAND/VPANDN & \_m256i\_mm256\_and[not]\_si256 & Logical AND / NOT+AND\footnote{\tiny{``Material nonimplication'' aka $\nrightarrow$.}}\\
\hline
VPAVG[BW] & \_m256i\_mm256\_avg\_epuXX & Integer average\\
\hline
VPBLEND[BW] & \_m256i\_mm256\_blend\_epiXX & Blend\\
\hline
VPERM[DQ] & \_m256i\_mm256\_permute*\_epi* & Permute integers \\
\hline
VPERMP[SD] & \_m256\_mm256\_permute*\_p* & Permute FPs \\
\hline
VPSLLV[DQ] & \_m256i\_mm256\_sllv\_epiXX & Logical shift left \\
\hline
VPSRAV[DQ] & \_m256i\_mm256\_srav\_epiXX & Arithmetic shift right \\
\hline
VPSRLV[DQ] & \_m256i\_mm256\_srlv\_epiXX & Logical shift right \\
\hline
PCMPcc[BWDQ] & \_m256i\_mm256\_cmpCC\_epiXX & Compare packed integers\\
\hline
PMULDQ/LD & \_mm\_mul*\_epi32 & Packed 32-bit integer multiply\\
\hline
\end{tabular}%
}
};
\end{tikzpicture}
\end{frame}
\begin{frame}{Recommended reading}
\footnotesize{
\begin{itemize}
\item Mel Gorman. ``Huge Pages in Linux'' LWN, in five parts (2010).
\item Owens et al. ``x86-TSO: A Rigorous and Usable Programmer's Model for x86 Multiprocessors'' (2010).
\item Bhandarkar and Clark. ``\href{http://dl.acm.org/citation.cfm?id=107003}{Performance from Architecture: Comparing a
RISC and a CISC with Similar Hardware Organization}'' (1991).
\vfill
\item Blem et al. ``\href{http://research.cs.wisc.edu/vertical/papers/2013/hpca13-isa-power-struggles.pdf}{Power Struggles: Revisiting the RISC vs. CISC Debate on
Contemporary ARM and x86 Architectures}'' (2013).
\vfill
\item Michael Thomadakis. ``The Architecture of the Nehalem Processor and
Nehalem-EP SMP Platforms'' (2011).
\vfill
\item Neil Dickson. ``Simple, Linear-Time x86 Jump Encoding'' (2011).
\vfill
\item \href{http://www.sandpile.org}{http://www.sandpile.org},
\href{http://www.realworldtech.com}{http://www.realworldtech.com}, and
\href{http://www.cpu-world.com}{http://www.cpu-world.com}
\vfill
\item \textit{Intel 64 and IA-32 Architectures Instruction Reference}.
\vfill
\item \textit{CUDA Parallel Thread Execution ISA Reference}.
\end{itemize}
}
\end{frame}
\begin{frame}{Hack on!}
``We still have judgment here, that we but teach\\
Bloody instructions, which, being taught, return\\
To plague th' inventor''\\
\hfill---Shakespeare, \textit{Macbeth}
\vfill
\justifying ``Each day the mythical return Enzian dreamed of seems less possible. Once it
was necessary to know uniforms, insignia, airplane markings, to observe
boundaries. But by now too many choices have been made. The single root lost,
way back there in the May desolation. Each bird has his branch now, and each
one is the Zone.''\\
\hfill---Thomas Pynchon, \textit{Gravity's Rainbow}
\end{frame}
\end{document}