GPGPU-Sim Tutorial Slides - UBC ECEaamodt/gpgpu-sim/tutorial/GPGPU-Sim-Tutorial-… ·...
-
Upload
vuonghuong -
Category
Documents
-
view
251 -
download
2
Transcript of GPGPU-Sim Tutorial Slides - UBC ECEaamodt/gpgpu-sim/tutorial/GPGPU-Sim-Tutorial-… ·...
GPGPU-Sim: A Performance
Simulator for Massively
Multithreaded Processor Research
Tor M. Aamodt, Ali Bakhoda, and Wilson W. L. Fung
University of British Columbia
December 2009 1 GPGPU-Sim Tutorial (MICRO-42)
Version of simulator corresponding to these slides = GPGPU-Sim 2.1.1b
Overview
• Brief Review of GPU Programming Models
• GPGPU-Sim Overview
• GPGPU-Sim Internals
– Microarchitecture model
– Software organization
– Example modifications
• GPGPU-Sim Tools
December 2009 2 GPGPU-Sim Tutorial (MICRO-42)
!
What is a GPU?
• GPU = Graphics Processing Unit
• Interesting because:
– Highly Parallel
– Highly programmable
– Commodity hardware (“desktop supercomputing”)
• Nvidia’s GTX285: 30 x 8-wide multiprocessors
• 10,000’s of concurrent threads
December 2009 3 GPGPU-Sim Tutorial (MICRO-42)
Why the excitement about GPUs?
Source: AMD, Hot Chips 19
December 2009 4 GPGPU-Sim Tutorial (MICRO-42)
While CPU clock stays same and adding cache…
GPU peak throughput tracking Moore’s Law
GPGPU: GPU Computing
4 core CPU  +  240 core GPU
– Heterogeneous computing
December 2009 5 GPGPU-Sim Tutorial (MICRO-42)
Programming Model
• Traditional viewpoint
– Run on CPU until reaching data parallel code
sections which are offloaded onto the GPU
• Correct viewpoint? (if you want 100x
speedup)
– GPU = computation workhorse
– CPU = sequential code “accelerator” and I/O
offload engine
December 2009 6 GPGPU-Sim Tutorial (MICRO-42)
GPU Microarchitecture Overview
(10,000')
Interconnection Network
Shader
Core
Shader
Core
Shader
Core
Shader
Core
Memory
Controller
GDDR3
Memory
Controller
GDDR3
Memory
Controller
GDDR3
December 2009 7 GPGPU-Sim Tutorial (MICRO-42)
GPU
Off-chip DRAM
CUDA and OpenCL
• Extensions of C to support coprocessor model
• We support both
– This tutorial we’ll focus on CUDA
• More features and applications today
December 2009 8 GPGPU-Sim Tutorial (MICRO-42)
CUDA Memory Model
• Memory spaces
– Shared memory
– Global
– Local
– Constant
– Texture
Source: CUDA programming manual  December 2009 9 GPGPU-Sim Tutorial (MICRO-42)
CUDA Thread Hierarchy
• Kernel = grid
of blocks of
warps of
threads
• Threads are
scalar threads
Source: CUDA programming manual  December 2009 10 GPGPU-Sim Tutorial (MICRO-42)
CUDA syntax highlights.
• Declaration specifiers to indicate where things live
__global__ void KernelFunc(...); // kernel callable from host
__device__  void DeviceFunc(...); // function callable on device
• Parallel kernel launch
KernelFunc<<<500, 128>>>(...); // 500 blocks, 128 threads each
• Special variables for thread identification in kernels
dim3 threadIdx; dim3 blockIdx; dim3 blockDim;
December 2009 11 GPGPU-Sim Tutorial (MICRO-42)
CUDA Example code
Standard C Code
void saxpy_serial(int n, float a, float *x, float *y)
{
  for (int i = 0; i < n; ++i)
    y[i] = a*x[i] + y[i];
}
// Invoke serial SAXPY kernel
saxpy_serial(n, 2.0, x, y);
CUDA code
__global__ void saxpy_parallel(int n, float a, float *x, float *y)
{
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n) y[i] = a*x[i] + y[i];
}
main() {
  … // omitted: allocate and initialize memory
  // Invoke parallel SAXPY kernel with 256 threads/block
  int nblocks = (n + 255) / 256;
  saxpy_parallel<<<nblocks, 256>>>(n, 2.0, x, y);
  … // omitted: transfer results from GPU to CPU
}
High performance computing with CUDA, SC09 Tutorial,
David Luebke, NVIDIA
December 2009 12 GPGPU-Sim Tutorial (MICRO-42)
GPGPU-Sim in a Nutshell
• Microarchitecture timing model of
contemporary GPUs
• Run unmodified CUDA/OpenCL
• NOTE: This version of these slides corresponds
to GPGPU-Sim version 2.1.1b
December 2009 13 GPGPU-Sim Tutorial (MICRO-42)
Accuracy of Simulator
0
50
100
150
200
250
0 50 100 150 200 250
GPGPU-Sim IPC
Quadro FX 5800 IPC
HW - GPGPU-Sim Comparison
December 2009 14 GPGPU-Sim Tutorial (MICRO-42)
GPGPU-Sim Software Dependencies
• Linux
• CUDA
• CUDA SDK
• Standard Developer Environment
– GCC, Bison, etc.
• For OpenCL
– CUDA capable GPU HW
December 2009 15 GPGPU-Sim Tutorial (MICRO-42)
Setting up the simulator
• CUDAHOME: cuda install location
• CUDA_INSTALL_PATH: cuda install location
• NVIDIA_CUDA_SDK_LOCATION: sdk location
• PATH: add CUDAHOME/bin
• LD_LIBRARY_PATH: add $CUDAHOME/lib
December 2009 16 GPGPU-Sim Tutorial (MICRO-42)
DEMO
December 2009 17 GPGPU-Sim Tutorial (MICRO-42)
Overview
• Brief Review of GPU Programming Models
• GPGPU-Sim Overview
• GPGPU-Sim Internals
– Microarchitecture model
– Software organization
– Example modifications
• GPGPU-Sim Tools
December 2009 18 GPGPU-Sim Tutorial (MICRO-42)
Overview of this section
• What GPGPU-Sim simulates
– Functional model for PTX + CUDA/OpenCL
– Timing model for the compute part of a GPU
• How GPGPU-Sim interfaces with CUDA applications
December 2009 19 GPGPU-Sim Tutorial (MICRO-42)
What GPGPU-Sim simulates
• Functional model for PTX
– PTX = Parallel Thread eXecution
• A low-level, data-parallel virtual machine
• Scalar ISA
– Not SASS, Not DirectX, Not shader model N, Not
AMD’s ISA, Not x86, Not Larrabee. Only PTX.
• Timing model for the compute part of a GPU
– Not for CPU or PCIe
– Only model microarchitecture timing relevant to
compute
December 2009 20 GPGPU-Sim Tutorial (MICRO-42)
Functional model (PTX)
• Low-level, data-parallel virtual machine
– Assembly-like: Instructions
– Parallel threads running in blocks
– Specify resources with no limit
• HW dispatches thread blocks according to its limit
• Scalar ISA
– HW groups scalar threads into SIMD warps as a performance optimization
• Converging part of CUDA tool chain:
.cu
.cl
NVCC
OpenCL Drv
PTX ptxas
G80
GT200
Fermi
December 2009 21 GPGPU-Sim Tutorial (MICRO-42)
Functional model (PTX)
for (int d = blockDim.x; d > 0; d /= 2)
{
__syncthreads();
if (tid < d) {
float f0 = shared[tid];
float f1 = shared[tid + d];
if (f1 < f0)
shared[tid] = f1;
}
}
$Lt_0_6146:
bar.sync 0;
setp.le.s32 %p3, %r7, %r1;
@%p3 bra $Lt_0_6402;
ld.shared.f32 %f3, [%rd9+0];
add.s32 %r9, %r7, %r1;
cvt.s64.s32 %rd18, %r9;
mul.lo.u64 %rd19, %rd18, 4;
add.u64 %rd20, %rd6, %rd19;
ld.shared.f32 %f4, [%rd20+0];
setp.gt.f32 %p4, %f3, %f4;
@!%p4 bra $Lt_0_6914;
st.shared.f32 [%rd9+0], %f4;
$Lt_0_6914:
$Lt_0_6402:
shr.s32 %r10, %r7, 31;
mov.s32 %r11, 1;
and.b32 %r12, %r10, %r11;
add.s32 %r13, %r12, %r7;
shr.s32 %r7, %r13, 1;
mov.u32 %r14, 0;
setp.gt.s32 %p5, %r7, %r14;
@%p5 bra $Lt_0_6146;
•! Scalar PTX ISA
•! Scalar control flow (if-branch, for-loops) •! Parallel Intrinsic (syncthreads())
•! Register allocation not done in PTX
December 2009 22 GPGPU-Sim Tutorial (MICRO-42)
// some initialization code omitted
Timing Model for
Compute parts of a GPU
• GPGPU-Sim models timing for:
– Shader Core (SM, SIMD Unit)
– Caches (Texture, Constant, …)
– Interconnection Network
– Memory Controllers
– Graphics DRAM
• GPGPU-Sim does NOT model timing for:
– CPU, PCIe
– Graphics Specific HW (Rasterizer, Clipping, Display… etc.)
GPU
PCIe
Interconnect
Gfx DRAM
Mem Ctrl Shdr Cores
Cache
Raster… Gfx HW
CPU
December 2009 23 GPGPU-Sim Tutorial (MICRO-42)
Timing model for
GPU micro-architecture
• GPGPU-Sim simulates the
timing model of a GPU
running each launched CUDA
kernel.
– Reports # cycles spent running
the kernels.
– Excludes any time spent on data
transfer on PCIe bus.
– CPU is assumed to be idle when
the GPU is working.
Time
GPU HW
GPU HW
CPU
CPU
Kernel Launch
Done
Kernel Launch
Done
CPU
GPGPU-Sim
GPGPU-Sim
December 2009 24 GPGPU-Sim Tutorial (MICRO-42)
• GPGPU-Sim is a detailed cycle-level simulator:
– Cycle-level model for each part of the microarchitecture
– Ignoring rare corner cases for efficiency
• e.g. no TLB miss, no DRAM refresh
• Different from cycle-accurate simulator:
– Does not match hardware 100%
• Why?
– GPU HW execute SASS:
• PTX → Translate + Optimize → SASS
– We can only guess the actual HW implementation…
Timing model for
GPU micro-architecture
GPGPU-Sim is ~0.89 correlated to the real HW.
December 2009 25 GPGPU-Sim Tutorial (MICRO-42)
Interfacing GPGPU-Sim to Apps
• GPGPU-Sim compiles itself into a shared runtime library and implements the API:
– libcudart.so  => CUDA runtime API
– libOpenCL.so => OpenCL API
• Works for CUDA 2.0+ (2.3 recommended)
– For CUDA 1.1, you can statically link GPGPU-Sim into your CUDA app
• Modify your LD_LIBRARY_PATH env. var. to run your CUDA app on GPGPU-Sim (See Manual)
– Need a config file (gpgpusim.config) and an interconnection config file as well
We provide the config files for modeling a Quadro FX 5800.
December 2009 26 GPGPU-Sim Tutorial (MICRO-42)
Interfacing GPGPU-Sim to Apps
•! When should you statically link GPGPU-Sim into
your CUDA app?
–! App works only in CUDA 1.1
•! We provide a common makefile that works like
the one in NVIDIA GPU Compute SDK.
December 2009 27 GPGPU-Sim Tutorial (MICRO-42)
CUDA Compilation w/ GPU HW
28 December 2009 GPGPU-Sim Tutorial (MICRO-42)
GPGPU-Sim Compilation Flow
29
Most complexity hidden by using the DLL interface to GPGPU-Sim (LD_LIBRARY_PATH)
December 2009 GPGPU-Sim Tutorial (MICRO-42)
Tutorial Outline
• Brief Review of GPU Programming Models
• GPGPU-Sim Overview
• GPGPU-Sim Internals
– Microarchitecture model
– Software organization and hacking tips
– Example modifications
• GPGPU-Sim Tools
30 December 2009 GPGPU-Sim Tutorial (MICRO-42)
Overview of this section
• What are warps?
• How GPGPU-Sim models pipelined warp
execution
• Key microarchitecture structures modeled by
GPGPU-Sim
31 December 2009 GPGPU-Sim Tutorial (MICRO-42)
Shader Core
CTA
(Thread Block)
32 Threads
32 Threads
32 Threads
CTA
(Thread Block)
32 Threads
32 Threads
32 Threads
Thread Hierarchy Revisited
• Recall, kernel = grid of blocks of warps of threads
• Thread blocks (CTAs) contain up to 512 threads
• Threads are grouped into warps in hardware
Thread Block
(CTA)
32 Threads
32 Threads
32 Threads
Warps
Each block is dispatched
to a shader core as a unit
of work: All of its warps
run in the core’s pipeline
until they are all done.
32
Source: NVIDIA
December 2009 GPGPU-Sim Tutorial (MICRO-42)
Warp = SIMT Execution of Scalar
Threads
• Warp = Scalar threads grouped to execute in lockstep
• SIMT vs SIMD
– SIMD: HW pipeline width must be known by software
– SIMT: pipeline width hidden from software (*)
Thread Warp 3
Thread Warp 8
Thread Warp 7
Thread Warp
Scalar Thread
W
Scalar Thread
X
Scalar Thread
Y
Scalar Thread
Z
Common PC
SIMT Pipeline
33
(*) Can still write software that assumes threads in a warp execute in lockstep (e.g. see reduction in NVIDIA
SDK)  December 2009 GPGPU-Sim Tutorial (MICRO-42)
GPU Microarchitecture Overview
(10,000')
Off-chip DRAM
Interconnection Network
Shader
Core
Shader
Core
Shader
Core
Shader
Core
Memory
Controller
GDDR3
Memory
Controller
GDDR3
Memory
Controller
GDDR3
34 December 2009 GPGPU-Sim Tutorial (MICRO-42)
Inside a Shader Core
• Fine-grained multithreading
– Interleave thread execution to hide latencies
– Register values of all threads stay in register file
– One instruction per thread in
pipeline at a time (No branch
prediction)
Decode
R F
R F
R F
A L U
A L U
A L U
Memory
Thread Warp 6
Thread Warp 1 Thread Warp 2 Data
Threads accessing memory hierarchy
Thread Warp 3 Thread Warp 8
Writeback
Threads available for scheduling
Thread Warp 7
Fetch
SIMT Pipeline
35 December 2009 GPGPU-Sim Tutorial (MICRO-42)
Pipeline stages
• Stages GPGPU-Sim simulates in a
shader core pipeline:
– Fetch
– Decode
– Execute
– Dummy Pre-Mem Stages
– Memory
– Writeback
Fetch
Decode
Execute
Pre-Mem
Memory
Writeback
36 December 2009 GPGPU-Sim Tutorial (MICRO-42)
Fetch Stage
• Schedule a warp to start execution in pipeline
• Handles branch divergence by masking off non-
active threads
– Based on chosen Thread Scheduling Policy
• Default: Immediate post dominator (PDOM)
– Stack based control flow:  Levinthal et al., Chap – a SIMD graphics processor. SIGGRAPH 1984
• Dynamic warp formation (DWF)
– Fung et al., Dynamic Warp Formation and Scheduling for Efficient GPU Control Flow, MICRO 2007
Fetch
Decode
Execute
Pre-Mem
Memory
Writeback
37 December 2009 GPGPU-Sim Tutorial (MICRO-42)
PDOM Divergence Handling for Warps
A
B C
D
A -- 1111 B D 1110 C D 0001
Next PC Recv PC Amask D -- 1111
Control Flow Stack
One per warp
A; if (some condition) { B; } else { C; } D;
38
TOS
D
1
1 1
1
A
0
0 0
1
C
1
1 1
0
B
1
1 1
1
D
Time
Execution Sequence
December 2009 GPGPU-Sim Tutorial (MICRO-42)
Decode Stage
• For all the active threads in the warp
– Decode instructions
– Functionally execute the instructions
– Get necessary info from functional simulator
• Memory space (global, local, constant, texture)
• Memory address (used for coalescing in Mem stage)
• Next PC (for branch divergence handling)
Fetch
Decode
Execute
Pre-Mem
Memory
Writeback
39 December 2009 GPGPU-Sim Tutorial (MICRO-42)
Execute/Pre-memory stages
• Execute stage
– Empty stage in GPGPU-Sim
– The actual functional execution was in decode
stage
• Pre-mem stages
– Empty stages to adjust pipeline length
– They are command line configurable
Fetch
Decode
Execute
Pre-Mem
Memory
Writeback
40 December 2009 GPGPU-Sim Tutorial (MICRO-42)
Memory Stage Overview
• We simulate it as 5 parallel units
– Based on the instruction’s memory space
• Details modeled in memory stage:
– Shared Memory bank conflicts
– Coalescing
– MSHRs
Fetch
Decode
Execute
Pre-Mem
Memory
Writeback
41
Global Texture Constant Local Shared
December 2009 GPGPU-Sim Tutorial (MICRO-42)
Writeback Stage
• Finishing Instructions update register file
• Arbitration for register file
– Returning memory instructions and the currently
running instruction compete for register file
bandwidth
– Returning memory instructions have higher
priority
Fetch
Decode
Execute
Pre-Mem
Memory
Writeback
42 December 2009 GPGPU-Sim Tutorial (MICRO-42)
Pre-Mem
Global Texture Constant Local Shared
Texture Cache
Constant Cache
Shared Memory
Execute
Decode
Fetch
Writeback
Off-Chip DRAM
Data Flow
Instruction Flow
Pipeline Stage
Physical Memory
Memory Stage Details
43 December 2009 GPGPU-Sim Tutorial (MICRO-42)
Constant Cache
• A Read-only cache for constant memory
• GPGPU-Sim simulates 2 read ports
– A warp can access 2 constant cache locations in a
single cycle
– If more than 2 locations accessed
• reads are serialized causing pipeline stalls
– (# of ports is configurable)
44 December 2009 GPGPU-Sim Tutorial (MICRO-42)
Texture Cache
• Read-only cache
• GPGPU-Sim supports 1-D and 2-D textures
• 2-D locality should be preserved when texture
cache blocks are fetched from memory
– GPGPU-Sim uses a 4-D blocking address scheme to
promote spatial locality in 2-D
• Based on Hakura et al. The Design and Analysis of a Cache
Architecture for Texture Mapping, ISCA 1997
45 December 2009 GPGPU-Sim Tutorial (MICRO-42)
Shared Memory
• Explicitly managed scratchpad memory
– As fast as register files in absence of bank conflicts
• Threads in a block can cooperate via shared
memory
• Each shader core has its own shared memory
• 16KB per Shader core in current NVIDIA GPUs
46 December 2009 GPGPU-Sim Tutorial (MICRO-42)
Shared Memory (cont.)
• Many threads accessing memory
– Therefore Shared memory is highly banked
• Each bank serves one address per cycle
• Multiple accesses to a bank in a single cycle cause bank conflicts
– Conflicting accesses must be serialized
• Shared memory in NVIDIA GPUs has 16 banks
– Conflict detection is done for a half-warp (16 threads)
– GPGPU-Sim models this
47 December 2009 GPGPU-Sim Tutorial (MICRO-42)
Shared Memory Bank Conflicts
No bank conflict    8-way bank conflict
Figures taken from CUDA
manual by NVIDIA
48 December 2009 GPGPU-Sim Tutorial (MICRO-42)
Global Memory
• Global memory is the off-chip DRAM memory
– The largest and slowest memory available
– Accesses must go through interconnect, memory
controller and off-chip DRAM
– Not cached in HW
– But GPGPU-Sim supports caching it
49 December 2009 GPGPU-Sim Tutorial (MICRO-42)
Coalescing
• Combining memory accesses made by threads in a warp into fewer transactions
– E.g. if threads in a warp are accessing consecutive 2-byte sized locations in memory
• Send one 64-byte request to DRAM (coalescing)
• Instead of 32 2-byte requests
• This reduces the number of transactions between shader cores and DRAM
– Less work for Interconnect, Memory Controller and DRAM
50 December 2009 GPGPU-Sim Tutorial (MICRO-42)
Coalescing (Cont.)
• CUDA Capability 1.3 (e.g. GTX280)
– Coalescing done per half-warp
– Can create 128-byte, 64-byte or 32-byte
transactions
• GPGPU-Sim
– Coalescing done for a full warp
– Only creates 64-byte transactions
51 December 2009 GPGPU-Sim Tutorial (MICRO-42)
Coalescing (cont.)
• GPGPU-Sim coalescing example
Warp
Warp
one 64-Byte
Transaction
2 64-Byte
Transactions
= 2-bytes in memory
52 December 2009 GPGPU-Sim Tutorial (MICRO-42)
Miss Status Holding Registers
• MSHRs keep track of outstanding memory requests
– keep track of threads, target registers, request addresses
• Each coalesced memory transaction consumes an MSHR
– MSHRs are limited (configurable)
– Pipeline stalls if shader core runs out of MSHRs
• One approach that might make sense (No details available from NVIDIA)
53 December 2009 GPGPU-Sim Tutorial (MICRO-42)
GPU Microarchitecture Overview
(10,000')
Off-chip DRAM
Interconnection Network
Shader
Core
Shader
Core
Shader
Core
Shader
Core
Memory
Controller
GDDR3
Memory
Controller
GDDR3
Memory
Controller
GDDR3
54 December 2009 GPGPU-Sim Tutorial (MICRO-42)
Clock domains
• Simulate independent clock domains for
– Shader cores
• Usually set to ¼ of core clock to compensate for using
SIMD width of 32 instead of 8
– Interconnection network
– L2 cache (if enabled)
– DRAM
• This is real clock (command clock)
• Effective clock is 2x this clock due to DDR
55 December 2009 GPGPU-Sim Tutorial (MICRO-42)
Clock Domain Crossing
• We simulate send and receive buffers at clock
crossing boundaries
• The buffers are filled and drained in different
clock domains
• E.g. consider the buffer from interconnect to
memory controller
– Filled at interconnect clock rate
– Drained at DRAM clock rate
56 December 2009 GPGPU-Sim Tutorial (MICRO-42)
Interconnection Network Model
• Intersim (Booksim) a flit level simulator
– Topologies (Mesh, Torus, Butterfly, …)
– Routing (Dimension Order, Adaptive, etc.)
– Flow Control (Virtual Channels, Credits)
• We simulate two separate networks
– From Shader cores to memory controllers
• Read Requests, Write requests
– From memory controllers to shader cores
• Read Replies
57 December 2009 GPGPU-Sim Tutorial (MICRO-42)
Topology Examples
58 December 2009 GPGPU-Sim Tutorial (MICRO-42)
Interconnection Network Config
• Booksim has its own config file
– Topology (topology, k, n)
– Virtual channels (num_vcs)
– Buffers per VC (vc_buf_size)
– Routing (routing_function)
– Speedups (input_speedup, internal_speedup)
– Allocators (vc_allocator, sw_allocator)
• Specific to GPGPU-sim
– Channel Width (flit_size)
– Setting memory controller locations (use_map)
59 December 2009 GPGPU-Sim Tutorial (MICRO-42)
Interconnect Injection Interfaces
Clock Boundary
Shader
Core  Router
Core Clock
Domain
Interconnect Clock
Domain
1 Flit / Cycle 1 Packet / Cycle
60 December 2009 GPGPU-Sim Tutorial (MICRO-42)
Interconnect Injection Interfaces
Clock Boundary
Memory
Controller  Router
DRAM
Clock Domain
Interconnect Clock
Domain
1 Flit / Cycle 1 Packet / Cycle
61 December 2009 GPGPU-Sim Tutorial (MICRO-42)
Interconnect Injection Interfaces
Clock Boundary
L2 Cache  Router
L2 Clock
Domain
Interconnect Clock
Domain
1 Flit / Cycle 1 Packet / Cycle
62 December 2009 GPGPU-Sim Tutorial (MICRO-42)
Interconnect Ejection Interfaces
• 1 Ejection/boundary buffer per VC (1 flit / cycle)
• A credit is sent back to router as a flit goes from ejection to boundary buffer
Clock
Boundary
Router  Shader Core
Core Clock
Domain Interconnect Clock
Domain
Ejection Buffers Boundary Buffers
Credit return buffer
1 Credit / Cycle
1 Flit / Cycle
1 Flit / Cycle
1 Packet / Cycle
(Round Robin)
63
# of VCs
December 2009 GPGPU-Sim Tutorial (MICRO-42)
Interconnect Ejection Interfaces
• 1 Ejection/boundary buffer per VC (1 flit / cycle)
• A credit is sent back to router as a flit goes from ejection to boundary buffer
Clock
Boundary
Router  Memory
Controller
DRAM Clock
Domain Interconnect Clock
Domain
Ejection Buffers Boundary Buffers
Credit return buffer
1 Credit / Cycle
1 Flit / Cycle
1 Flit / Cycle
1 Packet / Cycle
(Round Robin)
64
# of VCs
December 2009 GPGPU-Sim Tutorial (MICRO-42)
Interconnect Ejection Interfaces
• 1 Ejection/boundary buffer per VC (1 flit / cycle)
• A credit is sent back to router as a flit goes from ejection to boundary buffer
Clock
Boundary
Router  L2 Cache
L2 Clock
Domain Interconnect Clock
Domain
Ejection Buffers Boundary Buffers
Credit return buffer
1 Credit / Cycle
1 Flit / Cycle
1 Flit / Cycle
1 Packet / Cycle
(Round Robin)
65
# of VCs
December 2009 GPGPU-Sim Tutorial (MICRO-42)
GPU Microarchitecture Overview
(10,000')
Interconnection Network
Shader
Core
Shader
Core
Shader
Core
Shader
Core
Memory
Controller
GDDR3
Memory
Controller
GDDR3
Memory
Controller
GDDR3
66 December 2009 GPGPU-Sim Tutorial (MICRO-42)
Off-chip DRAM
Memory Address Mapping
• Off-chip memory partitioned among several
memory channels
– GT200 has 8 memory channels
– G80 had 6 memory channels
– Each memory channel has a memory controller
• Successive 256-byte regions of memory are
assigned to successive memory channels
– Address mapping is configurable in GPGPU-Sim
UNSW CUDA Tutorial by NVIDIA part 4 optimizing CUDA
67 December 2009 GPGPU-Sim Tutorial (MICRO-42)
Mem. Address Mapping (Cont.)
0x0000
0x0100
0x0200
0x0300
0x0400
0x0500
0x0600
0x0700
0x0800
DRAM Channel 0
DRAM Channel 1
DRAM Channel 2
DRAM Channel 3
DRAM Channel 4
DRAM Channel 5
DRAM Channel 6
DRAM Channel 7
Interconnection Network
Shader
Core
Shader
Core
Shader
Core
68 December 2009 GPGPU-Sim Tutorial (MICRO-42)
DRAM
• DRAM Memory
– Off-chip, high-density and high capacity
• DRAM access time is not constant
– It has non-uniform access latencies
• That’s why we model it!
69 December 2009 GPGPU-Sim Tutorial (MICRO-42)
DRAM
Column Decoder
Memory Array
Row Decoder
Memory Controller
Row Buffer Row Buffer
Row Decoder
Column Decoder
Row Buffer
Column Decoder
Row Buffer
DRAM Access
70
• Row access
– Activate a row or page of a
DRAM bank
– Load it to row buffer
• Column access
– Select and return a block of
data in row buffer
• Precharge
– Write back the opened row
into DRAM
– Otherwise it will be lost!
December 2009 GPGPU-Sim Tutorial (MICRO-42)
DRAM Row Access Locality
tRC = row cycle time
tRP = row precharge time
tRCD = row activate time
71 December 2009 GPGPU-Sim Tutorial (MICRO-42)
DRAM Bank-level Parallelism
72
• To increase DRAM
performance and utilization
• Multiple banks per DRAM chip
• To increase bus width
• Multiple chips per Memory
Controller
December 2009 GPGPU-Sim Tutorial (MICRO-42)
Scheduling DRAM Requests
• Scheduling policies supported
• First in first out (FIFO)
• In-order scheduling
• First Ready First Come First Serve (FR-FCFS)
• Out of order scheduling
• Requires associative search
73 December 2009 GPGPU-Sim Tutorial (MICRO-42)
Overview
• Brief Review of GPU Programming Models
• GPGPU-Sim Overview
• GPGPU-Sim Internals
– Microarchitecture model
– Software organization
– Example modifications
• GPGPU-Sim Tools
December 2009 74 GPGPU-Sim Tutorial (MICRO-42)
Overview of this section
• Introduce GPGPU-Sim modules
• Interfacing with CUDA and OpenCL
• Details of PTX simulation
• Timing Model
December 2009 75 GPGPU-Sim Tutorial (MICRO-42)
Three different modules:
• CUDA/OpenCL API library interface
• PTX instruction set emulator
• Timing model
December 2009 76 GPGPU-Sim Tutorial (MICRO-42)
Interface to CUDA/OpenCL
• CUDA and OpenCL applications include code that runs on the
host.  Both use an API to communicate between host and
GPU.  The API is defined in various header files.  Implementation is in a DLL and/or static library.
• Currently, we run host code on the simulator host platform
and do not model any impact of API or host on execution
time.   I.e., if you run GPGPU-Sim on a Core 2 Duo machine,
the host code runs natively on your Core 2 Duo machine.
December 2009 77 GPGPU-Sim Tutorial (MICRO-42)
Interface to CUDA API
• We implement versions of OpenCL / CUDA interface calls in a
new DLL, interface adjust LD_LIBRARY_PATH and CUDA/
OpenCL application runs on simulator rather than GPU hardware.
• Given our research focus, we have implemented only what
was required to get applications we were interested in
running.
• Code can be cleaned up significantly.
December 2009 78 GPGPU-Sim Tutorial (MICRO-42)
Interface to CUDA API      Example of interface code between host and simulator.   This is the code
that actually starts running the functional and timing models.  Call to
cudaLaunch is generated by nvcc from “<<<>>>” notation.
/*
 * Simulator replacement for the CUDA runtime's cudaLaunch(): starts simulation of
 * the kernel identified by `symbol` using the grid/block dimensions and kernel
 * parameters previously stored in g_cudaGridDim, g_cudaBlockDim and
 * g_ptx_sim_params.
 *
 * If the environment variable PTX_SIM_MODE_FUNC is set, its value is parsed into
 * g_ptx_sim_mode first.  A non-zero g_ptx_sim_mode runs the functional simulator,
 * zero runs the performance (timing) simulator.
 *
 * Always records cudaSuccess in g_last_cudaError and returns it.
 */
__host__ cudaError_t CUDARTAPI cudaLaunch( const char *symbol )
{
   const char *mode_name;
   char *env_mode;

   printf("\n\n\n");

   /* Optional override of the simulation mode from the environment. */
   env_mode = getenv("PTX_SIM_MODE_FUNC");
   if( env_mode != NULL )
      sscanf(env_mode, "%u", &g_ptx_sim_mode);

   mode_name = "performance simulation";
   if( g_ptx_sim_mode )
      mode_name = "functional simulation";
   printf("GPGPU-Sim PTX: cudaLaunch for %p (mode=%s)\n", symbol, mode_name);

   /* Hand the launch to whichever simulator the mode selects. */
   if( !g_ptx_sim_mode )
      gpgpu_ptx_sim_main_perf( symbol, g_cudaGridDim, g_cudaBlockDim, g_ptx_sim_params );
   else
      gpgpu_ptx_sim_main_func( symbol, g_cudaGridDim, g_cudaBlockDim, g_ptx_sim_params );

   /* The parameter list has been consumed by this launch. */
   g_ptx_sim_params = NULL;

   g_last_cudaError = cudaSuccess;
   return g_last_cudaError;
}
December 2009 79 GPGPU-Sim Tutorial (MICRO-42)
GPGPU-Sim Startup Details
1. Some CUDA code called before main() during
initialization of global variables.
__cudaRegisterFunction, __cudaRegisterShared
These provide information about device code (and
how to call it).
2. First call to any CUDA API function causes simulator
initialization.
– Read environment variables (debug info, sim mode)
– Parse option files
– Initialize GPU uArch Model
– Load/parse PTX kernels, determine post-dominators
December 2009 80 GPGPU-Sim Tutorial (MICRO-42)
PTX Parsing, Post-Dominator Detection
• We constructed a flex/bison parser to read in PTX code.
• Reads PTX text generated by nvopencc
• Default is to read PTX text embedded in binary
• Using flex/bison gives us flexibility so when (not if?) NVIDIA
changes their PTX syntax, we can make changes to our parser
to match.
• Post-dominators used for warp divergence simulation are
determined at this point using standard compiler control flow analysis algorithms.
December 2009 81 GPGPU-Sim Tutorial (MICRO-42)
Parsing PTX
• Next two slides illustrate snippets of code
from lexer and parser
December 2009 82 GPGPU-Sim Tutorial (MICRO-42)
cuda-sim/ptx.l – find tokens
abs TC; ptx_lval.int_value = ABS_OP; return OPCODE;
add TC; ptx_lval.int_value = ADD_OP; return OPCODE;
and TC; ptx_lval.int_value = AND_OP; return OPCODE;
…
\.align TC; return ALIGN_DIRECTIVE;
\.byte TC; return BYTE_DIRECTIVE;
\.const\[[0-9]+\] TC; return CONST_DIRECTIVE;
…
"%tid" TC; ptx_lval.int_value = TID_ID; return SPECIAL_REGISTER;
…
\.u32 TC; return U32_TYPE;
\.u64 TC; return U64_TYPE;
\.f16 TC; return F16_TYPE;
\.f32 TC; return F32_TYPE;
…
\.equ TC; return EQU_OPTION;
\.neu TC; return NEU_OPTION;
\.ltu TC; return LTU_OPTION;
…
"]" TC; return RIGHT_SQUARE_BRACKET;
"<" TC; return LEFT_ANGLE_BRACKET;
">" TC; return RIGHT_ANGLE_BRACKET;
"(" TC; return LEFT_PAREN;
…
December 2009 83 GPGPU-Sim Tutorial (MICRO-42)
cuda-sim/ptx.y – read instructions …
%token <string_value> STRING
%token <int_value> OPCODE
%token ALIGN_DIRECTIVE
%token BYTE_DIRECTIVE
…
%%
input: /* empty */
| input directive_statement
| input function_defn
| input function_decl
;
function_defn: function_decl { set_symtab($1); } LEFT_BRACE statement_list RIGHT_BRACE
{ end_function(); }
| function_decl { set_symtab($1); } block_spec LEFT_BRACE statement_list RIGHT_BRACE { end_function(); }
;
…
instruction: opcode_spec LEFT_PAREN operand RIGHT_PAREN { set_return(); } COMMA operand
COMMA LEFT_PAREN operand_list RIGHT_PAREN
| opcode_spec operand COMMA LEFT_PAREN operand_list RIGHT_PAREN
| opcode_spec operand_list
| opcode_spec
;
… December 2009 84 GPGPU-Sim Tutorial (MICRO-42)
How are threads simulated
(functionally)?
• Thread = program counter +
           set of registers +
           set of local memory locations

CTA (block) = set of threads with access to a
shared memory.
No notion of “warp” in functional simulator!
(complicates “vote” instruction) December 2009 85 GPGPU-Sim Tutorial (MICRO-42)
How instructions are simulated
(functionally)?
• Code for simulating instructions in cuda-sim/instruction.cc
• Threads initialized during launching of blocks.
• Functional execution at decode: “giant switch statement”
simulation approach (emulation)
• Each thread in timing model has pointer to thread context in
functional model.
• Lookup instruction “object” corresponding to program
counter (we do not attempt to “encode” instruction)
December 2009 86 GPGPU-Sim Tutorial (MICRO-42)
How are values communicated
between threads?
• Communication can happen either through “global memory”
or through “shared memory”.
• We simulate all instructions as they reach decode stage of
pipeline, this includes loads and stores that access memory.
Exception is for atomics (we simulate them functionally once
atomic operation reaches DRAM controller in timing model)
• Most CUDA code avoids intra-kernel communication through
global memory (memory ordering not well defined)
December 2009 87 GPGPU-Sim Tutorial (MICRO-42)
Overview of this section
• Introduce GPGPU-Sim modules
• Interfacing with CUDA and OpenCL
• Details of PTX simulation
• Timing Model
December 2009 88 GPGPU-Sim Tutorial (MICRO-42)
Timing Model:
Initialization
• init_gpu() in gpu-sim.c
– Allocate and initialize microarchitecture model
and statistic collection structures
– Called at the first CUDA API call
December 2009 89 GPGPU-Sim Tutorial (MICRO-42)
Timing Model:
Main Simulation Loop
• run_gpu_sim() in gpu-sim.c
– Called when a kernel is launched…
– Init kernel-specific info (e.g. PDOM)
– While (Not all thread done) {
      gpu_sim_loop(): Main Simulation Loop
    }
– gpu_print_stat(): Generate stat log
December 2009 90 GPGPU-Sim Tutorial (MICRO-42)
Timing Model:
Main Simulation Loop
• Inside gpu_sim_loop():
– Check all clock domains
• if (clock_mask & CORE) { … }
– Execute the ones that are ready
– Currently, 4 domains:
• CORE  – Shader Core + Thread Block Issue
• ICNT  – Interconnection Network
• DRAM – DRAM + Request Scheduler
• L2    – Memory-Side L2 Cache
December 2009 91 GPGPU-Sim Tutorial (MICRO-42)
Timing Model:
Thread Block Issue
• issue_block2core():
– Issue new thread blocks to core
– Initializes the threads inside the new blocks
– Two modes:
• Normal – Fill a core with blocks until it is full
• Spread – Distribute block among cores
SC 0 B0 B1 B2
SC 1 B3 B4 B5
SC 2 B6 B7 B8
B9 B10 B11
SC 0 B0 B3 B6
SC 1 B1 B4
SC 2 B2 B5
December 2009 92 GPGPU-Sim Tutorial (MICRO-42)
Timing Model:
Shader Core
• shader_cycle() in shader.c:
– shader_writeback()
– shader_memory(), shader_const_memory(),
shader_texture_memory()
– shader_pre_memory()
– shader_execute()
– shader_decode()
– shader_fetch()
• Called in reverse order (order of stall propagation)
– Same trick used in many other cycle level simulators
December 2009 93 GPGPU-Sim Tutorial (MICRO-42)
Timing Model:
Important Structures
• shader_core_ctx_t
– Contain state of a shader core
• thread_ctx_t
– Contain ptx_thd_info, i.e.
ptx_thread_info in cuda-sim
• inst_t
– Pipeline register and dynamic
instance of an instruction
shader_core_ctx_t
Fetch Stage Logic
Decode Stage Logic
TS_IF
IF_ID
ID_EX
Scheduler Logic
December 2009 94 GPGPU-Sim Tutorial (MICRO-42)
Timing Model:
Important Structures
• mshr_entry
– Memory request info used inside shader core
– Has a copy of the pipeline_reg that created the request
• mem_fetch_t
– Memory request structure that is passed
between modules in the memory sub-system.
• Outside of shader core
– Generated in fq_push()
– Destroyed in fq_pop() for reads and at
dram_pop() for writes
Shader Core
Memory Subsystem: !! Interconnect !! L2 !! DRAM
Mem Stage Wrbk Stage
mshr_entry_t mshr_entry_t mshr_entry_t mshr_entry_t
mem_fetch_t
mem_fetch_t
mem_fetch_t
December 2009 95 GPGPU-Sim Tutorial (MICRO-42)
Timing Model:
Shader Core Pipeline Stage
• General Anatomy:
If (pipeline_regs[next_stage] not empty)
    return; // stall
Copy pipeline_regs[current_stage]
→ pipeline_regs[next_stage]
// Step 1 in stage
Foreach (thread in warp) { … }
// Step 2 in stage
Foreach (thread in warp) { … }
// Step N in stage
Foreach (thread in warp) { … if (stall_condition) return; }
Retry
Next Cycle
Scan through
pipeline reg for all threads in
next stage
All threads in
warp together for each step
December 2009 96 GPGPU-Sim Tutorial (MICRO-42)
Timing model:
Fetch Stage
If (pipeline_regs[IF_ID] not empty)
    decode_stall = true;
Switch (simd_model) {
case PDOM: shader_fetch_simd_postdominator(…) case DWF: shader_fetch_simd_dwf(…)
}
If (not decode_stall) {
    Stat collection }
If (pipeline_regs[TS_IF] not empty)
    return; // stall
wid = pdom_sche_find_next_warp(…) If (wid == -1) return; // no warp rdy
Handle branch divergence Issue warp to pipeline_regs[TS_IF];
new_warp_TS = true;
If (not decode_stall and new_warp_TS) {
    pipeline_regs[TS_IF] → pipeline_regs[IF_ID]  }
December 2009 97 GPGPU-Sim Tutorial (MICRO-42)
Timing model:
Decode Stage If (pipeline_regs[ID_EX] not empty)
    return; //stall
pipeline_regs[IF_ID] → pipeline_regs[ID_EX]
Foreach (thread in warp) {
    ptx_decode_inst (thread->ptx_thd_info, …); }
Foreach (thread in warp) {     ptx_exec_inst (thread->ptx_thd_info, …);
    pipeline_regs[IF_ID][tid].inst_type = …;     pipeline_regs[IF_ID][tid].space = …;
    pipeline_regs[IF_ID][tid].memaddr = …;
    Handle barrier // bar.sync
}
Separate decode and
functional execute
•! Put thread to sleep (nullify
pipeline_regs[IF_ID]) if not all thread arrived
•! Wake up threads in block after all thread arrived
Save info for memory stage
December 2009 98 GPGPU-Sim Tutorial (MICRO-42)
Timing model:
Execute Stage
If (pipeline_regs[EX_MM] not empty)
    return; //stall
pipeline_regs[ID_EX] → pipeline_regs[EX_MM]
If (pipeline_regs[EX_MM] not empty)
    return; //stall
pipeline_regs[ID_EX] → pipeline_regs[pre_mem]
OR
December 2009 99 GPGPU-Sim Tutorial (MICRO-42)
Timing model:
Memory Stage
• General Anatomy (memory, const, texture): If (pipeline_regs[MM_WB] not empty)  return; // stall
M:+1L)%4,2142,/3)T/h/,;()%5V5)ct)M.0b&14G)M./3+61&0CG)M.064u)".,4))
If (structural_hazard) set(stall_counter), rc_fail = true; If (rc_fail) {
    Revert stats;     return;
} else {     Set MSHR (pipeline_regs[EX_MM])
    Send memory request with fq_push (…)
    pipeline_regs[EX_MM] = NOP }
Access Cache;
If (Miss or No_Cache) {     Alloc MSHR + ICNT buffer
    If (not available) rc_fail = true; }
pipeline_regs[EX_MM] → pipeline_regs[MM_WB]
If (stall_counter > 0) stall_counter-- return; // stall
December 2009 100 GPGPU-Sim Tutorial (MICRO-42)
Timing model:
Writeback
Arbitrate between pipeline_regs[MM_WB] and Returning memory access :
    Returning memory access always win     Unlock_tid = threads that win arbitration
Check for returning memory access:
    Use get_MSHR_returnhead() to obtain MSHR
If (not stalled_by_MSHR)
    pipeline_regs[MM_WB] = NOP
Cache update (if applicable)
If (ptx_thread_done(unlock_tid.ptx_thd_info)) {
    Decrement active thread count for block(unlock_tid) } else {
    Wake up unlock_tid for scheduling }
Pipeline_reg MSHR
December 2009 101 GPGPU-Sim Tutorial (MICRO-42)
Timing model:
Interfaces • Shader Core
– fq_push(): Push request to memory subsystem
– fq_pop(): Put MSHR of returning request into
        return queue
– fq_has_buffer(): Check for ICNT buffer space
• Interconnection Network:
– icnt_has_buffer(): Check for input buffer space
– icnt_push(): Push packet into network
– icnt_pop(): Pop packet from network
– icnt_transfer(): Run network for a cycle December 2009 102 GPGPU-Sim Tutorial (MICRO-42)
Timing model:
Interfaces
• Memory Controller (L2 + DRAM):
– mem_ctrl_full(): Memory queue full?
– mem_push(): Push request into memory queue
– mem_top(): Obtain info for completed request
– mem_pop(): Pop completed request
mem_req = mem_ctrl_top(); if (icnt_has_buffer(mem_req.info)) { icnt_push(mem_req); mem_ctrl_pop(); }
Flow Control
December 2009 103 GPGPU-Sim Tutorial (MICRO-42)
Adding Options
• Use option_parser module
– Automatically parses options to linked variables
– option_parser_register(opp, …) link options to
variables
• See gpu_reg_options() in gpu-sim.c for
examples
December 2009 104 GPGPU-Sim Tutorial (MICRO-42)
Overview
• Brief Review of GPU Programming Models
• GPGPU-Sim Overview
• GPGPU-Sim Internals
– Microarchitecture model
– Software organization
– Example modifications
• GPGPU-Sim Tools
December 2009 105 GPGPU-Sim Tutorial (MICRO-42)
Example Modification:
Adding register bank conflicts model
• Each instruction accesses 3 to 4 operands
• Multi-port register is too expensive for GPU
• GPU uses multiple single ported SRAM to implement the register file
– Maps different registers to different banks
– Each instruction fetch operands from multiple banks
• Register bank conflict = When two operands are from the same bank
December 2009 106 GPGPU-Sim Tutorial (MICRO-42)
Example Modification:
Adding register bank conflicts model
• HW Soln #1: Serialize register fetch -> Stall pipeline or lower throughput
• HW Soln #2: Operand Collector Architecture (US Patent App: 11/555,649)
– Interleave operand fetch from different threads to achieve full utilization
Bank 0 Bank 1 Bank 2 Bank 3
R0 R1 R2 R3
R4 R5 R6 R7
R8 R9 R10 R11
… … … …
add.s32 R3, R1, R2; No Conflict
mul.s32 R3, R0, R4; Conflict at bank 0
December 2009 107 GPGPU-Sim Tutorial (MICRO-42)
Example Modification:
Steps
• Obtain register bank access info in cuda-sim
• Expose info to timing model
• Implement register bank conflict detection and stall
• Create configuration option
December 2009 108 GPGPU-Sim Tutorial (MICRO-42)
Example Modification:
Obtain register bank access
• PTX uses virtual register: %f23, %r95, etc…
• We choose to use: – Bank accessed = Virtual Reg. No. MOD #banks
• Obtain this number when the register is added during PTX parsing (in ptx_ir.cc): void add_identifier( const char *identifier, ...) {
... switch ( g_space_spec) { case REG_DIRECTIVE: regnum = g_current_symbol_table->next_reg_num(); // code to obtain virtual reg no. here g_last_symbol->set_regno(regnum, arch_regnum); ... } ...
}
Modify class symbol in ptx_ir.h
December 2009 109 GPGPU-Sim Tutorial (MICRO-42)
Example Modification:
Communicate Info to Timing Model
• Cuda-sim communicates operand access info to timing model via ptx_decode_inst(…) – Calls operand_info::reg_num() which calls symbol::reg_num()
– We mimic that (also in ptx_ir.h):
class operand_info { … int arch_reg_num() const {
return m_value.m_symbolic->arch_reg_num(); } … };
Add accessor to class symbol in ptx_ir.h
December 2009 110 GPGPU-Sim Tutorial (MICRO-42)
Example Modification:
Communicate Info to Timing Model
• Cuda-sim communicates operand access info to timing model via ptx_decode_inst(…) – Needs to modify signature of ptx_decode_inst(…) in ptx_ir.h, and in cuda-sim.cc:
class function_info { ... void function_info::ptx_decode_inst( ptx_thread_info *thread,
... ... int *vectorout, int *arch_reg );
... };
Apply to all calls of ptx_decode_inst(),
and the extern “C” interface that links this function to the timing model in C
Export operands’
virtual register number in an array
December 2009 111 GPGPU-Sim Tutorial (MICRO-42)
Example Modification:
Expose Access Info to Timing Model • Now modify internals of ptx_decode_inst(…) in cuda-sim.cc:
void function_info::ptx_decode_inst(...) { ... for (; op != pI->op_iter_end(); op++, n++) {
const operand_info &o = *op; if (has_dst && n==0) {
if (o.is_reg()) { *o1 = o.reg_num(); arch_reg[0] = o.arch_reg_num();
} else {...} } else {
if (o.is_reg()) { int reg_num = o.reg_num(); arch_reg[m + 4] = o.arch_reg_num(); ... m++;
} else {...} }
} ... };
Dest. Operand
(RF Write)
Src. Operand
(RF Read)
December 2009 112 GPGPU-Sim Tutorial (MICRO-42)
Example Modification:
Timing Model Modification
• Pass ptx_decode_inst(…) pointer to array arch_reg used to return the virtual register number of all accessed operands.
– Bank Accessed = Virtual Reg. No. MOD #banks
– Conflict occurs when two operands access the same bank for values in different registers
• Two parts to this: – Detecting number of cycles to stall for a bank conflict
– Implementing the stalling mechanism
December 2009 113 GPGPU-Sim Tutorial (MICRO-42)
/*
 * Register bank conflict model: configurable options, the temporary structure that
 * tracks which registers access each bank, and the stall performance counter.
 */

/* Configurable option: non-zero turns the register bank conflict model on. */
int gpgpu_reg_bank_conflict_model = 0;

#define MAX_REG_BANKS 32
/* Configurable option: number of register banks.  It bounds the index into
 * g_reg_bank_access[], so it must not be greater than MAX_REG_BANKS. */
unsigned int gpgpu_num_reg_banks = 8;

#define MAX_BANK_CONFLICT 8 /* tex can have four source and four destination regs */

/* Value of one bank with no access recorded: zero counts, every rd_regs slot free (-1). */
#define EMPTY_REG_BANK_ACCESS { .tot = 0, .rd = 0, .wr = 0, .rd_regs = { -1, -1, -1, -1 } }

/*
 * An initializer list shorter than the array zero-fills the remaining elements, so
 * "{ EMPTY_REG_BANK_ACCESS }" would leave rd_regs[] == 0 (i.e. "register 0 already
 * counted") instead of -1 ("free slot") in every bank but the first.  Spell out one
 * EMPTY_REG_BANK_ACCESS per bank instead.
 */
#define EMPTY_REG_BANK_ACCESS_X2  EMPTY_REG_BANK_ACCESS,     EMPTY_REG_BANK_ACCESS
#define EMPTY_REG_BANK_ACCESS_X4  EMPTY_REG_BANK_ACCESS_X2,  EMPTY_REG_BANK_ACCESS_X2
#define EMPTY_REG_BANK_ACCESS_X8  EMPTY_REG_BANK_ACCESS_X4,  EMPTY_REG_BANK_ACCESS_X4
#define EMPTY_REG_BANK_ACCESS_X16 EMPTY_REG_BANK_ACCESS_X8,  EMPTY_REG_BANK_ACCESS_X8
#define EMPTY_REG_BANK_ACCESS_X32 EMPTY_REG_BANK_ACCESS_X16, EMPTY_REG_BANK_ACCESS_X16

/* Compile-time guard: the X32 initializer above must cover exactly MAX_REG_BANKS banks. */
typedef char reg_bank_init_covers_all_banks[(MAX_REG_BANKS == 32) ? 1 : -1];

struct reg_bank_access {
   unsigned tot;     /* number of different registers recorded for this bank */
   unsigned rd;      /* not touched by the detection code shown in this tutorial */
   unsigned wr;      /* not touched by the detection code shown in this tutorial */
   int rd_regs[4];   /* registers already recorded for this bank; -1 marks a free slot */
} g_reg_bank_access[MAX_REG_BANKS] = { EMPTY_REG_BANK_ACCESS_X32 };

// just to use as "shorthand" for clearing accesses each cycle
static const struct reg_bank_access empty_reg_bank_access = EMPTY_REG_BANK_ACCESS;

/* Performance counter: incremented once for every cycle decode stalls on a bank conflict. */
unsigned int gpu_reg_bank_conflict_stalls = 0;
Example Modification:
Register Bank Conflict Detection
• Implement a temporary structure that tracks the register ID accessing various banks
• Put this before shader_decode() in shader.c:
Configurable Options
Performance Counter December 2009 114
GPGPU-Sim Tutorial (MICRO-42)
/* Count, per register bank, how many different source registers this instruction reads.
 * arch_reg[4..7] hold the source operands' register numbers (-1 = slot unused).
 * NOTE: this `if` block is intentionally left open here; the remainder of the block
 * (finding the maximum and applying the stall) follows in the next code fragment. */
if ( gpgpu_reg_bank_conflict_model && first_valid_thread != -1 ) {
   for (i = 4; i < 8; i++) {
      if( arch_reg[i] == -1 ) continue;
      int skip = 0;
      int bank = arch_reg[i] % gpgpu_num_reg_banks;
      int opndreg = shader->pipeline_reg[first_valid_thread][IF_ID].in[i-4];
      /* j is declared outside the search loop because the slot index it stops at is
       * used after the loop; a for-init declaration would already be out of scope. */
      int j;
      for (j = 0; j < 4; j++) {
         if (g_reg_bank_access[bank].rd_regs[j] == -1)
            break;               /* first free slot: register not recorded yet */
         else if (g_reg_bank_access[bank].rd_regs[j] == opndreg) {
            skip = 1;            /* same register again: not an additional access */
            break;
         }
      }
      if (!skip) {
         assert( j < 4 );        /* rd_regs has four slots, one per source operand */
         g_reg_bank_access[bank].tot++;
         g_reg_bank_access[bank].rd_regs[j] = opndreg;
      }
   }
Example Modification:
Register Bank Conflict Detection
• Add inside shader_decode() in shader.c:
• Add this after the first loop that calls ptx_decode_inst():
int arch_reg[MAX_REG_OPERANDS] = { -1 };
ptx_decode_inst( shader->thread[tid].ptx_thd_info, ..., &vectorout, arch_reg );
Count # different access to each bank
Continue on next slide
December 2009 115 GPGPU-Sim Tutorial (MICRO-42)
Example Modification:
Register Bank Conflict Detection
• (continued from last slide):
unsigned max_access=0; int r; inst_t* conflict_inst = &shader->pipeline_reg[first_valid_thread][IF_ID]; for( r = 0; r < gpgpu_num_reg_banks; r++ ) { if( g_reg_bank_access[r].tot > max_access ) max_access = g_reg_bank_access[r].tot; g_reg_bank_access[r] = empty_reg_bank_access; } if( max_access >= 1 ) { assert( max_access <= MAX_REG_OPERANDS ); conflict_inst->reg_bank_access_pending = max_access - 1; if( max_access > 1 ) { conflict_inst->reg_bank_conflict_stall_checked = 1; return; // stall pipeline } } shader->pipeline_reg[first_valid_thread][IF_ID].reg_bank_conflict_stall_checked = 1; }
Stall = Maximum # different access among all banks
Add these new members to
inst_t in shader.h
December 2009 116 GPGPU-Sim Tutorial (MICRO-42)
Example Modification:
Register Bank Conflict Stalls • Last slide: Adding two new members to inst_t
– Track # cycles to stall
– Indicate that the stall for register fetching has happened so do not stall for it again.
– Make sure they are both initialized to 0 in shader_issue_thread():
/* Issue thread `tid` into SIMD lane `wlane` of the shader core's TS_IF pipeline
 * register and account for the thread no longer being available for fetch.
 * `active_mask` is stored as the instruction's warp_active_mask.
 * The `...` below stands for code elided on the slide. */
void shader_issue_thread(shader_core_ctx_t *shader, int tid, int wlane, unsigned active_mask )
{
   if ( gpgpu_cuda_sim ) {
      ...
      /* Register bank conflict model: a newly issued instruction has not been checked
       * for conflicts yet and has no stall cycles pending. */
      shader->pipeline_reg[wlane][TS_IF].reg_bank_conflict_stall_checked = 0;
      shader->pipeline_reg[wlane][TS_IF].reg_bank_access_pending = 0;
      /* Give the dynamic instruction a fresh id from the global counter. */
      shader->pipeline_reg[wlane][TS_IF].uid = g_next_shader_inst_uid++;
      shader->pipeline_reg[wlane][TS_IF].warp_active_mask = active_mask;
      /* Record the cycle of issue (total cycles so far plus the current cycle count). */
      shader->pipeline_reg[wlane][TS_IF].ts_cycle = gpu_tot_sim_cycle + gpu_sim_cycle;
   }
   /* The thread must have been available for fetch; consume one unit. */
   assert( shader->thread[tid].avail4fetch > 0 );
   shader->thread[tid].avail4fetch--;
   /* The per-warp counter is kept in the entry of the warp's first thread,
    * i.e. at index tid rounded down to a multiple of warp_size. */
   assert( shader->thread[tid - (tid % warp_size)].n_avail4fetch > 0 );
   shader->thread[tid - (tid % warp_size)].n_avail4fetch--;
}
December 2009 117 GPGPU-Sim Tutorial (MICRO-42)
Example Modification:
Register Bank Conflict Stalls
/* Register bank conflict stall: if any lane's IF_ID instruction has already been
 * checked for bank conflicts and still has accesses pending, consume one pending
 * access, count one stall cycle and leave the stage for this cycle. */
if ( gpgpu_reg_bank_conflict_model ) {
   for (i = 0; i < pipe_simd_width; i++) {
      inst_t *lane_inst = &shader->pipeline_reg[i][IF_ID];

      /* Only instructions whose conflict check has run may stall (avoids deadlock). */
      if ( !lane_inst->reg_bank_conflict_stall_checked )
         continue;
      if ( !(lane_inst->reg_bank_access_pending > 0) )
         continue;

      assert( shader->pipeline_reg[i][IF_ID].reg_bank_access_pending <= 8 );
      lane_inst->reg_bank_access_pending--;
      gpu_reg_bank_conflict_stalls++;
      return; // stall
   }
}
• Add this to the beginning of shader_decode():
if ( gpgpu_reg_bank_conflict_model && first_valid_thread != -1 && !shader->pipeline_reg[first_valid_thread][IF_ID].reg_bank_conflict_stall_checked) { for (i = 4; i < 8; i++) { if( arch_reg[i] == -1 ) continue; ...
• To avoid deadlock:
Only apply stall if the register bank conflict is checked
December 2009 118 GPGPU-Sim Tutorial (MICRO-42)
Example Modification:
Create Configuration Options
option_parser_register(opp, "-gpgpu_reg_bank_conflict_model", OPT_BOOL, &gpgpu_reg_bank_conflict_model, "Turn on register bank conflict model (default = off)", "0");
option_parser_register(opp, "-gpgpu_num_reg_banks", OPT_INT32, &gpgpu_num_reg_banks, "Number of register banks (default = 8)", "8");
• Add this to gpu_reg_options() in gpu-sim.c:
• Add externs for the two global variables in gpu-sim.c: extern int gpgpu_reg_bank_conflict_model; extern int gpgpu_num_reg_banks;
• Add a fprintf in shader_print_accstats() in shader.c:
/* Report the number of decode stall cycles caused by register bank conflicts.
 * The counter is an unsigned int, so it is printed with %u rather than %d. */
fprintf(fout, "gpu_reg_bank_conflict_stalls = %u\n", gpu_reg_bank_conflict_stalls);
December 2009 119 GPGPU-Sim Tutorial (MICRO-42)
Example Modification:
Done! • Try running this with the option turned on and off
• Try varying the number of register banks
• What is left out: – Register writes at the writeback stage
– Drop in this sample for simplicity
December 2009 120 GPGPU-Sim Tutorial (MICRO-42)
Overview
• Brief Review of GPU Programming Models
• GPGPU-Sim Overview
• GPGPU-Sim Internals
– Microarchitecture model
– Software organization
– Example modifications
• GPGPU-Sim Tools
December 2009 121 GPGPU-Sim Tutorial (MICRO-42)
AerialVision: Visualizing
Complex Dynamics in GPU
Visualizer Tool for GPGPU-Sim
December 2009 122 GPGPU-Sim Tutorial (MICRO-42)
December 2009 GPGPU-Sim Tutorial (MICRO-42) 123
Motivation
• Common case in architectural research: – Researchers implemented their new proposals in an architecture simulator (in this case GPGPU-Sim)
– Simulation results across a suite of benchmarks usually look like this:
A B C D E F
Need to understand reason behind slowdowns!
Need to validate these results!
Ultimately: Gain more insight about performance trends
December 2009 GPGPU-Sim Tutorial (MICRO-42) 124
Motivation
• Current ways to gain insights (GPGPU-Sim)
– Performance statistics log at end of kernel launch
• Ignores runtime dynamics in the microarchitecture
– GDB cycle by cycle stepping
• Information overload if tracking more than one unit
• Slow – hours to step through 1000s of cycle
• GPU is parallel and programmable
– Lots of insight gained from having a global view
• Time-lapse performance variations
• View multiple units in parallel
– Relate performance statistics to source code
• Similar goal to VTune, but for CUDA running on GPGPU-Sim
December 2009 GPGPU-Sim Tutorial (MICRO-42) 125
Motivation
• Why is runtime dynamics important for GPUs? – Many core accelerator architecture
– Intermittent contentions affect performance, but not captured by end-to-end performance statistics
Transpose with two different address mapping DRAM utilization is ~uniform for both
case!
Bottleneck goes undetected!
We like this type of
plot so much that we go and build a tool for
this purpose... December 2009 GPGPU-Sim Tutorial (MICRO-42) 126
Motivation
• CUDA kernels are growing large
–!%9%%;#GH9$X$�ODD$-.2;4$$
–! GH9AG$X$�CDD$-.2;4$$
–! v$
• Many GPGPU-Sim users are CUDA app dev
– using the simulator to help understand the performance of
their CUDA applications in development
– Need to pin point performance bottleneck in the kernel
code
– Benefits hardware architects as well
December 2009 GPGPU-Sim Tutorial (MICRO-42) 127
Introducing AerialVision
• Visualizer for GPGPU-Sim – Time Lapse View: Performance metrics vs. time
– Source Code View: Performance metrics vs. the CUDA source code
– Implemented in Python: very extensible
• GPGPU-Sim modified to generate inputs for AerialVision:
CUDA
Application
GPGPU-Sim Visualization
Trace
PTX
Instruction Statistics
AerialVision visualizer.cc
ptx-stats.cc
December 2009 GPGPU-Sim Tutorial (MICRO-42) 128
Outline (remainder of tutorial)
• AerialVision Motivation + Introduction
• Time Lapse View
• Source Code View
• Demo
• Overhead
• Summary
December 2009 GPGPU-Sim Tutorial (MICRO-42) 129
Time Lapse View
• Visualize performance metrics versus time – Plot up to 5 metrics in a figure for direct visual comp.
– Infinite # figures (until memory runs out)
• Different type of plots for different metrics – Line plots → global singulars (e.g. IPC)
– Parallel intensity plot → metrics across multi. HW units
– Stack bar charts → component breakdowns
– PC-Histogram → relate thread dynamics and source code
• Plots are generated with Matplotlib – Navigation tool bar for zoom and pan
– More formatting options exposed with extra widgets
December 2009 GPGPU-Sim Tutorial (MICRO-42) 130
Time Lapse View –
Metric(s) Selection
Add a new tab
(a new figure)
Select a visualizer
trace file
Select metric to plot
Select type of plot
Add more plots to
the figure
Configure
each subplot
Click this to start plotting!
Shows what you have selected for each plot
December 2009 GPGPU-Sim Tutorial (MICRO-42) 131
Time Lapse View –
Figure Figure plotting selected metrics Switch between different figures
Formatting
Tool
Navigation Tool December 2009 GPGPU-Sim Tutorial (MICRO-42) 132
Time Lapse View –
Parallel Intensity Plot • View performance metrics for multiple parallel hardware units
vs. time – Change color mapping with “Change Colormap Max/Min”
Mapping from color to value of the metric
Data for each
shader core
December 2009 GPGPU-Sim Tutorial (MICRO-42) 133
Time Lapse View –
Stacked Bar Chart
• Shows the breakdown of a metric vs. time – Warp divergence
– Load/Store latency breakdown Each component
is represented by a unique color.
December 2009 GPGPU-Sim Tutorial (MICRO-42) 134
Time Lapse View –
PC-Histogram (CFLog) • A time series of histograms, representing the portion of the
program that the threads have touched during a given sample period.
• A thread is considered to have touched an instruction: – After it has fetched the instruction
– Until it fetches a new instruction
CFLOG stands for
“Control Flow” Log
Each line here represents:
-! A PTX instruction
OR
-! A line in CUDA source code.
Color at each dot indicates
# threads touching the
instruction during that
sampling period.
December 2009 GPGPU-Sim Tutorial (MICRO-42) 135
Time Lapse View –
Navigation and Formatting Tools
• Change Colormap Max/Min – Configure how metric values are mapped to the color spectrum
– Allow user to normalize among all plots
– Choose a different color scheme
• Change Binning – Modify the frequency of tick labels on the axes
• Edit Labels – Edit the labels on x and y axes, and the title/colormap label
– Choose the fonts size for labels
Pan Zoom Change plot spacing
Save
December 2009 GPGPU-Sim Tutorial (MICRO-42) 136
Source Code View
• Analyze CUDA performance on GPGPU-Sim
– Narrow bottleneck down to a single line of source
code in your application
• Features:
– Shows performance metrics side-by-side with the
source code
– Navigation graph
• Shows the big picture
• Single-click to jump to the “code of interest”
Performance Profiler
December 2009 GPGPU-Sim Tutorial (MICRO-42) 137
Source Code View –
Metric(s) Selection
Select the CUDA C
file or PTX file to show with the metric
Choose how metrics
should be combined (More on this next)
Choose the metric
to be displayed
Click this to proceed
December 2009 GPGPU-Sim Tutorial (MICRO-42) 138
Source Code View –
Combine Metrics • GPGPU-Sim samples performance metrics for each PTX instruction.
• Each line of CUDA C → multiple lines of PTX instructions. – This mapping can be acquired via debug info generated by NVCC
• Question: How should the metrics from multiple PTX instructions be reduced? – Max – Latency / Execution Count
– Sum – Quantity counters
a[i] = b[i] + c; ld.s32 %r2, [%r1];
add.f32 %r4, %r2, %r3;
st.s32 %r4, [%r1];
200
1
20
200 221
Sum? Max?
•! We provide suggestions for each metric in the manual.
•! Ratio between metrics? We use Max for both.
December 2009 GPGPU-Sim Tutorial (MICRO-42) 139
Source Code View –
Navigation Metric
Data
CUDA C Source Code
Navigation graph
Format
Tool etc…
December 2009 GPGPU-Sim Tutorial (MICRO-42) 140
Source Code View –
Navigation
Line number in CUDA C File
Right-Click Here
Viewer jumps to here,
with the clicked line highlighted
December 2009 GPGPU-Sim Tutorial (MICRO-42) 141
Overhead + Extension
•! GHGH9IJ.)$2;;*$+"$*"$;U+#($Q"#0$+"$8;2;#(+;$.2V7+4$="#$';#.(-x.4."2$–! J.)7-(>"2$4V;;*$�Fb}$4-"Q;#$
– Data dumped to HDD = 3.4kB per sample
• Turned on by default – sampling frequency = 1000 cycles
• Adding new metrics for AerialVision – See visualizer.cc and ptx-sim.cc and manual
– No need to modify AerialVision
Configurable
See manual for options
December 2009 GPGPU-Sim Tutorial (MICRO-42) 142
Summary
• AerialVision – Visualizer for GPGPU-Sim
– Time Lapse View
• Metric vs. Time
– Source Code View
• Metric vs. Code
• Read manual for more detail info
–!T)(.-$74$.=$<"7$1(:;$(2<$f7;4>"2$"$