GPGPU-Sim Tutorial Slides - UBC ECEaamodt/gpgpu-sim/tutorial/GPGPU-Sim-Tutorial-… ·...

36
!"!"#$%&’( * "+,-.,’/01+ %&’23/4., -., 5/66&7+38 52394:,+/;+; ",.1+66., <+6+/,1: !"# %& ’()"*+, ’-. /(01"*(, (2* 3.-4"2 3& 5& 6728 92.:;#4.+< "= /#.>41 ?"-7)@.( A;B;)@;# CDDE F GHGH9IJ.) !7+"#.(- K%L?MNIOCP Version of simulator corresponding to these slides = GPGPU-Sim 2.1.1b =7+,7&+> /#.;= M;:.;Q "= GH9 H#"8#()).28 %"*;-4 GHGH9IJ.) N:;#:.;Q GHGH9IJ.) L2+;#2(-4 %.B#"(#B1.+;B+7#; )"*;- J"RQ(#; "#8(2.S(>"2 TU()V-; )"*.WB(>"24 GHGH9IJ.) !""-4 A;B;)@;# CDDE C GHGH9IJ.) !7+"#.(- K%L?MNIOCP ! ?:/4 &6 / !"#@ GH9 X G#(V1.B4 H#"B;44.28 92.+ L2+;#;4>28 @;B(74;Y Z.81-< H(#(--;- Z.81-< V#"8#())(@-; ?"))"*.+< 1(#*Q(#; K[*;40+"V 47V;#B")V7>28\P ]:.*.(^4 G!_C‘aY bD U ‘IQ.*; )7->V#"B;44"#4 FD,DDD^4 "= B"2B7##;2+ +1#;(*4 A;B;)@;# CDDE b GHGH9IJ.) !7+"#.(- K%L?MNIOCP ?:8 4:+ +A1&4+’+04 /B.24 !"#6@ J"7#B;Y ’%A, Z"+ ?1.V4 FE A;B;)@;# CDDE O GHGH9IJ.) !7+"#.(- K%L?MNIOCP While CPU clock stays same and adding cache… GPU peak throughput tracking Moore’s Law

Transcript of GPGPU-Sim Tutorial Slides - UBC ECEaamodt/gpgpu-sim/tutorial/GPGPU-Sim-Tutorial-… ·...

Page 1: GPGPU-Sim Tutorial Slides - UBC ECEaamodt/gpgpu-sim/tutorial/GPGPU-Sim-Tutorial-… · ghgh9y$gh9$?")v7>28$ $$$o$b"#;$?h9$$$$$c$$$$$cod$b"#;$gh9$$ –!z;+;#"8;2;"74$b")v7>28$ a;b;)@;#$cdde$

GPGPU-Sim: A Performance

Simulator for Massively

Multithreaded Processor Research

Tor M. Aamodt, Ali Bakhoda, and Wilson W. L. Fung

University of British Columbia

December 2009 1 GPGPU-Sim Tutorial (MICRO-42)

Version of simulator corresponding to these slides = GPGPU-Sim 2.1.1b

Overview

• Brief Review of GPU Programming Models

• GPGPU-Sim Overview

• GPGPU-Sim Internals

– Microarchitecture model

– Software organization

– Example modifications

• GPGPU-Sim Tools

December 2009 2 GPGPU-Sim Tutorial (MICRO-42)

!

What is a GPU?

• GPU = Graphics Processing Unit

• Interesting because:

– Highly Parallel

– Highly programmable

– Commodity hardware (“desktop supercomputing”)

• Nvidia’s GTX285: 30 x 8-wide multiprocessors

• 10,000’s of concurrent threads

December 2009 3 GPGPU-Sim Tutorial (MICRO-42)

Why the excitement about GPUs?

Source: AMD, Hot Chips 19

December 2009 4 GPGPU-Sim Tutorial (MICRO-42)

While CPU clock stays same and adding cache…

GPU peak throughput tracking Moore’s Law

Page 2: GPGPU-Sim Tutorial Slides - UBC ECEaamodt/gpgpu-sim/tutorial/GPGPU-Sim-Tutorial-… · ghgh9y$gh9$?")v7>28$ $$$o$b"#;$?h9$$$$$c$$$$$cod$b"#;$gh9$$ –!z;+;#"8;2;"74$b")v7>28$ a;b;)@;#$cdde$

GPGPU: GPU Computing

4 core CPU + 240 core GPU

– Heterogeneous computing

December 2009 5 GPGPU-Sim Tutorial (MICRO-42)

Programming Model

• Traditional viewpoint

– Run on CPU until reaching data parallel code

sections which are offloaded onto the GPU

• Correct viewpoint? (if you want 100x

speedup)

– GPU = computation workhorse

– CPU = sequential code “accelerator” and I/O

offload engine

December 2009 6 GPGPU-Sim Tutorial (MICRO-42)

GPU Microarchitecture Overview

DEFGFFFHI

Interconnection Network

Shader

Core

Shader

Core

Shader

Core

Shader

Core

Memory

Controller

GDDR3

Memory

Controller

GDDR3

Memory

Controller

GDDR3

December 2009 7 GPGPU-Sim Tutorial (MICRO-42)

GPU

Off-chip DRAM

CUDA and OpenCL

• Extensions of C to support coprocessor model

• We support both

– This tutorial we’ll focus on CUDA

• More features and applications today

December 2009 8 GPGPU-Sim Tutorial (MICRO-42)

Page 3: GPGPU-Sim Tutorial Slides - UBC ECEaamodt/gpgpu-sim/tutorial/GPGPU-Sim-Tutorial-… · ghgh9y$gh9$?")v7>28$ $$$o$b"#;$?h9$$$$$c$$$$$cod$b"#;$gh9$$ –!z;+;#"8;2;"74$b")v7>28$ a;b;)@;#$cdde$

CUDA Memory Model

• Memory spaces

– Shared memory

– Global

– Local

– Constant

– Texture

Source: CUDA programming manual  December 2009 9 GPGPU-Sim Tutorial (MICRO-42)

CUDA Thread Hierarchy

• Kernel = grid

of blocks of

warps of

threads

• Threads are

scalar threads

Source: CUDA programming manual  December 2009 10 GPGPU-Sim Tutorial (MICRO-42)

CUDA syntax highlights.

• Declaration specifiers to indicate where things live __global__ void KernelFunc(...); // kernel callable from host

__device__  void DeviceFunc(...); // function callable on device

• Parallel kernel launch KernelFunc<<<500, 128>>>(...); // 500 blocks, 128 threads each

• Special variables for thread identification in kernels dim3 threadIdx; dim3 blockIdx; dim3 blockDim;

December 2009 11 GPGPU-Sim Tutorial (MICRO-42)

CUDA Example code

Standard C Code void saxpy_serial(int n, float a, float *x, float *y)

{

 for (int i = 0; i < n; ++i) y[i] = a*x[i] + y[i]; }

// Invoke serial SAXPY kernel  saxpy_serial(n, 2.0, x, y);

CUDA code

__global__ void saxpy_parallel(int n, float a, float *x, float *y)  {

   int i = blockIdx.x*blockDim.x + threadIdx.x;

   if(i<n)            y[i]=a*x[i]+y[i];

}  main() {

  … // omitted: allocate and initialize memory

  // Invoke parallel SAXPY kernel with 256 threads/block   int nblocks = (n + 255) / 256;

  saxpy_parallel<<<nblocks, 256>>>(n, 2.0, x, y);   … // omitted: transfer results from GPU to CPU

}

High performance computing with CUDA, SC09 Tutorial,

David Luebke, NVIDIA

December 2009 12 GPGPU-Sim Tutorial (MICRO-42)

Page 4: GPGPU-Sim Tutorial Slides - UBC ECEaamodt/gpgpu-sim/tutorial/GPGPU-Sim-Tutorial-… · ghgh9y$gh9$?")v7>28$ $$$o$b"#;$?h9$$$$$c$$$$$cod$b"#;$gh9$$ –!z;+;#"8;2;"74$b")v7>28$ a;b;)@;#$cdde$

GPGPU-Sim in a Nutshell

• Microarchitecture timing model of

contemporary GPUs

• Run unmodified CUDA/OpenCL

• NOTE: This version of these slides corresponds

to GPGPU-Sim version 2.1.1b

December 2009 13 GPGPU-Sim Tutorial (MICRO-42)

Accuracy of Simulator

0

50

100

150

200

250

0 50 100 150 200 250

GP

GP

U-S

im I

PC

Quadro FX 5800 IPC

HW - GPGPU-Sim Comparison

December 2009 14 GPGPU-Sim Tutorial (MICRO-42)

GPGPU-Sim Software Dependencies

• Linux

• CUDA

• CUDA SDK

• Standard Developer Environment

– GCC, Bison, etc.

• For OpenCL

– CUDA capable GPU HW

December 2009 15 GPGPU-Sim Tutorial (MICRO-42)

Setting up the simulator

• CUDAHOME: cuda install location

• CUDA_INSTALL_PATH: cuda install location

• NVIDIA_CUDA_SDK_LOCATION: sdk location

• PATH: add CUDAHOME/bin

• LD_LIBRARY_PATH: add $CUDAHOME/lib

December 2009 16 GPGPU-Sim Tutorial (MICRO-42)

Page 5: GPGPU-Sim Tutorial Slides - UBC ECEaamodt/gpgpu-sim/tutorial/GPGPU-Sim-Tutorial-… · ghgh9y$gh9$?")v7>28$ $$$o$b"#;$?h9$$$$$c$$$$$cod$b"#;$gh9$$ –!z;+;#"8;2;"74$b")v7>28$ a;b;)@;#$cdde$

DEMO

December 2009 17 GPGPU-Sim Tutorial (MICRO-42)

Overview

• Brief Review of GPU Programming Models

• GPGPU-Sim Overview

• GPGPU-Sim Internals

– Microarchitecture model

– Software organization

– Example modifications

• GPGPU-Sim Tools

December 2009 18 GPGPU-Sim Tutorial (MICRO-42)

Overview of this section

• What GPGPU-Sim simulates

– Functional model for PTX + CUDA/OpenCL

– Timing model for the compute part of a GPU

• How GPGPU-Sim interfaces with CUDA applications

December 2009 19 GPGPU-Sim Tutorial (MICRO-42)

What GPGPU-Sim simulates

•! Functional model for PTX

–! PTX = Parallel Thread eXecution

•! A low-level, data-parallel virtual machine

•! Scalar ISA

–! Not SASS, Not DirectX, Not shader model N, Not

AMD’s ISA, Not x86, Not Larrabee. Only PTX.

•! Timing model for the compute part of a GPU

–! Not for CPU or PCIe

–! Only model microarchitecture timing relevant to

compute

December 2009 20 GPGPU-Sim Tutorial (MICRO-42)

Page 6: GPGPU-Sim Tutorial Slides - UBC ECEaamodt/gpgpu-sim/tutorial/GPGPU-Sim-Tutorial-… · ghgh9y$gh9$?")v7>28$ $$$o$b"#;$?h9$$$$$c$$$$$cod$b"#;$gh9$$ –!z;+;#"8;2;"74$b")v7>28$ a;b;)@;#$cdde$

Functional model (PTX)

• Low-level, data-parallel virtual machine – Assembly-like: Instructions

– Parallel threads running in blocks

– Specify resources with no limit • HW dispatches thread blocks according to its limit

• Scalar ISA – HW groups scalar threads into SIMD warps as a performance optimization

• Converging part of CUDA tool chain:

.cu

.cl

NVCC

OpenCL Drv

PTX ptxas

G80

GT200

Fermi December 2009 21 GPGPU-Sim Tutorial (MICRO-42)

Functional model (PTX)

for (int d = blockDim.x; d > 0; d /= 2)

{

__syncthreads();

if (tid < d) {

float f0 = shared[tid];

float f1 = shared[tid + d];

if (f1 < f0)

shared[tid] = f1;

}

}

$Lt_0_6146:

bar.sync 0;

setp.le.s32 %p3, %r7, %r1;

@%p3 bra $Lt_0_6402;

ld.shared.f32 %f3, [%rd9+0];

add.s32 %r9, %r7, %r1;

cvt.s64.s32 %rd18, %r9;

mul.lo.u64 %rd19, %rd18, 4;

add.u64 %rd20, %rd6, %rd19;

ld.shared.f32 %f4, [%rd20+0];

setp.gt.f32 %p4, %f3, %f4;

@!%p4 bra $Lt_0_6914;

st.shared.f32 [%rd9+0], %f4;

$Lt_0_6914:

$Lt_0_6402:

shr.s32 %r10, %r7, 31;

mov.s32 %r11, 1;

and.b32 %r12, %r10, %r11;

add.s32 %r13, %r12, %r7;

shr.s32 %r7, %r13, 1;

mov.u32 %r14, 0;

setp.gt.s32 %p5, %r7, %r14;

@%p5 bra $Lt_0_6146;

•! Scalar PTX ISA

•! Scalar control flow (if-branch, for-loops) •! Parallel Intrinsic (syncthreads())

•! Register allocation not done in PTX

December 2009 22 GPGPU-Sim Tutorial (MICRO-42)

// some initialization code omitted

Timing Model for

Compute parts of a GPU

• GPGPU-Sim models timing for:

– Shader Core (SM, SIMD Unit)

– Caches (Texture, Constant, …)

– Interconnection Network

– Memory Controllers

– Graphics DRAM

• GPGPU-Sim does NOT model timing for:

– CPU, PCIe

– Graphics Specific HW (Rasterizer, Clipping, Display… etc.)

GPU

PCIe

Inte

rco

nn

ect

Gfx DRAM

Mem Ctrl Shdr Cores

Cache

Raster… Gfx HW

CPU

December 2009 23 GPGPU-Sim Tutorial (MICRO-42)

Timing model for

GPU micro-architecture

• GPGPU-Sim simulates the

timing model of a GPU

running each launched CUDA

kernel.

– Reports # cycles spent running

the kernels.

– Exclude any time spent on data

transfer on PCIe bus.

– CPU is assumed to be idle when

the GPU is working.

Time

GPU HW

GPU HW

CPU

CPU

Kernel Launch

Done

Kernel Launch

Done

CPU

GPGPU-Sim

GPGPU-Sim

December 2009 24 GPGPU-Sim Tutorial (MICRO-42)

Page 7: GPGPU-Sim Tutorial Slides - UBC ECEaamodt/gpgpu-sim/tutorial/GPGPU-Sim-Tutorial-… · ghgh9y$gh9$?")v7>28$ $$$o$b"#;$?h9$$$$$c$$$$$cod$b"#;$gh9$$ –!z;+;#"8;2;"74$b")v7>28$ a;b;)@;#$cdde$

• GPGPU-Sim is a detailed cycle-level simulator:

– Cycle-level model for each part of the microarchitecture

– Ignoring rare corner cases for efficiency

• e.g. no TLB miss, no DRAM refresh

• Different from cycle-accurate simulator:

– Does not match hardware 100%

• Why?

– GPU HW execute SASS:

• PTX → Translate + Optimize → SASS

– We can only guess the actual HW implementation…

Timing model for

GPU micro-architecture

GPGPU-Sim is ~0.89 correlated to the real HW.

December 2009 25 GPGPU-Sim Tutorial (MICRO-42)

Interfacing GPGPU-Sim to Apps

• GPGPU-Sim compiles itself into a shared runtime library and implements the API: – libcudart.so → CUDA runtime API

– libOpenCL.so → OpenCL API

• Works for CUDA 2.0+ (2.3 recommended) – For CUDA 1.1, you can statically link GPGPU-Sim into your CUDA app

• Modify your LD_LIBRARY_PATH env. var. to run your CUDA app on GPGPU-Sim (See Manual)  – Need a config file (gpgpusim.config) and an interconnection config file as well

We provide the config files for modeling a Quadro FX 5800.

December 2009 26 GPGPU-Sim Tutorial (MICRO-42)

Interfacing GPGPU-Sim to Apps

•! When should you statically link GPGPU-Sim into

your CUDA app?

–! App works only in CUDA 1.1

•! We provide a common makefile that works like

the one in NVIDIA GPU Compute SDK.

December 2009 27 GPGPU-Sim Tutorial (MICRO-42)

CUDA Compilation w/ GPU HW

28 December 2009 GPGPU-Sim Tutorial (MICRO-42)

Page 8: GPGPU-Sim Tutorial Slides - UBC ECEaamodt/gpgpu-sim/tutorial/GPGPU-Sim-Tutorial-… · ghgh9y$gh9$?")v7>28$ $$$o$b"#;$?h9$$$$$c$$$$$cod$b"#;$gh9$$ –!z;+;#"8;2;"74$b")v7>28$ a;b;)@;#$cdde$

GPGPU-Sim Compilation Flow

29

Most complexity hidden by using the DLL interface to GPGPU-Sim (LD_LIBRARY_PATH)

December 2009 GPGPU-Sim Tutorial (MICRO-42)

Tutorial Outline

• Brief Review of GPU Programming Models

• GPGPU-Sim Overview

• GPGPU-Sim Internals

– Microarchitecture model

– Software organization and hacking tips

– Example modifications

• GPGPU-Sim Tools

30 December 2009 GPGPU-Sim Tutorial (MICRO-42)

Overview of this section

• What are warps?

• How GPGPU-Sim models pipelined warp

execution

• Key microarchitecture structures modeled by

GPGPU-Sim

31 December 2009 GPGPU-Sim Tutorial (MICRO-42)

Shader Core

CTA

(Thread Block)

32 Threads

32 Threads

32 Threads

CTA

(Thread Block)

32 Threads

32 Threads

32 Threads

Thread Hierarchy Revisited

• Recall, kernel = grid of blocks of warps of threads

• Thread blocks (CTAs) contain up to 512 threads

• Threads are grouped into warps in hardware

Thread Block

(CTA)

32 Threads

32 Threads

32 Threads

Warps

Each block is dispatched

to a shader core as a unit

of work: All of its warps

run in the core’s pipeline

until they are all done.

32

Source: NVIDIA

December 2009 GPGPU-Sim Tutorial (MICRO-42)

Page 9: GPGPU-Sim Tutorial Slides - UBC ECEaamodt/gpgpu-sim/tutorial/GPGPU-Sim-Tutorial-… · ghgh9y$gh9$?")v7>28$ $$$o$b"#;$?h9$$$$$c$$$$$cod$b"#;$gh9$$ –!z;+;#"8;2;"74$b")v7>28$ a;b;)@;#$cdde$

Warp = SIMT Execution of Scalar

Threads

• Warp = Scalar threads grouped to execute in lockstep • SIMT vs SIMD

– SIMD: HW pipeline width must be known by software

– SIMT: pipeline width hidden from software (*)

Thread Warp 3

Thread Warp 8

Thread Warp 7

Thread Warp

Scalar Thread

W

Scalar Thread

X

Scalar Thread

Y

Scalar Thread

Z

Common PC

SIMT Pipeline

33

(*) Can still write software that assumes threads in a warp execute in lockstep (e.g. see reduction in NVIDIA

SDK)  December 2009 GPGPU-Sim Tutorial (MICRO-42)

!"#)5&1,./,1:&4+142,+)=7+,7&+>)

DEFGFFFHI

))))))))))))))))))))))))))=P$1:&Q)N<*5)

J04+,1.00+19.0)K+4>.,L)

%:/;+,)

M.,+)

%:/;+,)

M.,+)

%:/;+,)

M.,+)

%:/;+,)

M.,+)

5+'.,8)

M.04,.33+,)

!NN<O)

5+'.,8)

M.04,.33+,)

!NN<O)

5+'.,8)

M.04,.33+,)

!NN<O)

bO$A;B;)@;#$CDDE$ GHGH9IJ.)$!7+"#.(-$K%L?MNIOCP$

Inside a Shader Core

• Fine-grained multithreading

– Interleave thread execution to hide latencies

– Register values of all threads stay in register file

– One instruction per thread in

pipeline at a time (No branch

prediction)

Decode

R F

R F

R F

A L U

A L U

A L U

Memory

Thread Warp 6

Thread Warp 1 Thread Warp 2 Data

Threads accessing memory hierarchy

Thread Warp 3 Thread Warp 8

Writeback

Threads available for scheduling

Thread Warp 7

Fetch

SIMT Pipeline

35 December 2009 GPGPU-Sim Tutorial (MICRO-42)

Pipeline stages

• Stages GPGPU-Sim simulates in a

shader core pipeline:

– Fetch

– Decode

– Execute

– Dummy Pre-Mem Stages

– Memory

– Writeback

Fetch

Decode

Execute

Pre-Mem

Memory

Writeback

36 December 2009 GPGPU-Sim Tutorial (MICRO-42)

Page 10: GPGPU-Sim Tutorial Slides - UBC ECEaamodt/gpgpu-sim/tutorial/GPGPU-Sim-Tutorial-… · ghgh9y$gh9$?")v7>28$ $$$o$b"#;$?h9$$$$$c$$$$$cod$b"#;$gh9$$ –!z;+;#"8;2;"74$b")v7>28$ a;b;)@;#$cdde$

Fetch Stage

• Schedule a warp to start execution in pipeline

• Handles branch divergence by masking off non-

active threads

– Based on chosen Thread Scheduling Policy

• Default: Immediate post dominator (PDOM) – Stack based control flow:  Levinthal et al., Chap – a SIMD graphics processor. SIGGRAPH 1984

• Dynamic warp formation (DWF)  – Fung et al., Dynamic Warp Formation and Scheduling for Efficient GPU Control Flow, MICRO 2007

Fetch

Decode

Execute

Pre-Mem

Memory

Writeback

37 December 2009 GPGPU-Sim Tutorial (MICRO-42)

PDOM Divergence Handling for Warps

A

B C

D

A -- 1111 B D 1110 C D 0001

Next PC Recv PC Amask D -- 1111

Control Flow Stack

One per warp

A; if (some condition) { B; } else { C; } D;

b`$

TOS

D

1

1 1

1

A

0

0 0

1

C

1

1 1

0

B

1

1 1

1

D

Time

Execution Sequence

December 2009 GPGPU-Sim Tutorial (MICRO-42)

Decode Stage

• For all the active threads in the warp

– Decode instructions

– Functionally execute the instructions

– Get necessary info from functional simulator

• Memory space (global, local, constant, texture)

• Memory address (used for coalescing in Mem stage)

• Next PC (for branch divergence handling)

Fetch

Decode

Execute

Pre-Mem

Memory

Writeback

39 December 2009 GPGPU-Sim Tutorial (MICRO-42)

Execute/Pre-memory stages

• Execute stage

– Empty stage in GPGPU-Sim

– The actual functional execution was in decode

stage

• Pre-mem stages

– Empty stages to adjust pipeline length

– They are command line configurable

Fetch

Decode

Execute

Pre-Mem

Memory

Writeback

40 December 2009 GPGPU-Sim Tutorial (MICRO-42)

Page 11: GPGPU-Sim Tutorial Slides - UBC ECEaamodt/gpgpu-sim/tutorial/GPGPU-Sim-Tutorial-… · ghgh9y$gh9$?")v7>28$ $$$o$b"#;$?h9$$$$$c$$$$$cod$b"#;$gh9$$ –!z;+;#"8;2;"74$b")v7>28$ a;b;)@;#$cdde$

Memory Stage Overview

• We simulate it as 5 parallel units

– Based on the instruction’s memory space

• Details modeled in memory stage:

– Shared Memory bank conflicts

– Coalescing

– MSHRs

Fetch

Decode

Execute

Pre-Mem

Memory

Writeback

41

Global Texture Constant Local Shared

December 2009 GPGPU-Sim Tutorial (MICRO-42)

Writeback Stage

• Finishing Instructions update register file

• Arbitration for register file

– Returning memory instructions and the currently

running instruction compete for register file

bandwidth

– Returning memory instructions have higher

priority

Fetch

Decode

Execute

Pre-Mem

Memory

Writeback

42 December 2009 GPGPU-Sim Tutorial (MICRO-42)

Pre-Mem

Global Texture Constant Local Shared

Texture Cache

Constant Cache

Shared Memory

Execute

Decode

Fetch

Writeback

Off-Chip DRAM

Data Flow

Instruction Flow

Pipeline Stage

Physical Memory

Memory Stage Details

43 December 2009 GPGPU-Sim Tutorial (MICRO-42)

Constant Cache

• A Read-only cache for constant memory

• GPGPU-Sim simulates 2 read ports

– A warp can access 2 constant cache locations in a

single cycle

– If more than 2 locations accessed

• reads are serialized causing pipeline stalls

– (# of ports is configurable)

44 December 2009 GPGPU-Sim Tutorial (MICRO-42)

Page 12: GPGPU-Sim Tutorial Slides - UBC ECEaamodt/gpgpu-sim/tutorial/GPGPU-Sim-Tutorial-… · ghgh9y$gh9$?")v7>28$ $$$o$b"#;$?h9$$$$$c$$$$$cod$b"#;$gh9$$ –!z;+;#"8;2;"74$b")v7>28$ a;b;)@;#$cdde$

Texture Cache

• Read-only cache

• GPGPU-Sim supports 1-D and 2-D textures

• 2-D locality should be preserved when texture

cache blocks are fetched from memory

– GPGPU-Sim uses a 4-D blocking address scheme to

promote spatial locality in 2-D • Based on Hakura et al. The Design and Analysis of a Cache

Architecture for Texture Mapping, ISCA 1997

45 December 2009 GPGPU-Sim Tutorial (MICRO-42)

Shared Memory

• Explicitly managed scratchpad memory

– As fast as register files in absence of bank conflicts

• Threads in a block can cooperate via shared

memory

• Each shader core has its own shared memory

• 16KB per Shader core in current NVIDIA GPUs

46 December 2009 GPGPU-Sim Tutorial (MICRO-42)

Shared Memory (cont.)

• Many threads accessing memory  – Therefore Shared memory is highly banked

• Each bank serves one address per cycle

• Multiple accesses to a bank in a single cycle cause bank conflicts – Conflicting accesses must be serialized

• Shared memory in NVIDIA GPUs has 16 banks – Conflict detection is done for a half-warp (16 threads)

– GPGPU-Sim models this

47 December 2009 GPGPU-Sim Tutorial (MICRO-42)

Shared Memory Bank Conflicts

No bank conflict   8-way bank conflict

Figures taken from CUDA

manual by NVIDIA

48 December 2009 GPGPU-Sim Tutorial (MICRO-42)

Page 13: GPGPU-Sim Tutorial Slides - UBC ECEaamodt/gpgpu-sim/tutorial/GPGPU-Sim-Tutorial-… · ghgh9y$gh9$?")v7>28$ $$$o$b"#;$?h9$$$$$c$$$$$cod$b"#;$gh9$$ –!z;+;#"8;2;"74$b")v7>28$ a;b;)@;#$cdde$

Global Memory

• Global memory is the off-chip DRAM memory

– The largest and slowest memory available

– Accesses must go through interconnect, memory

controller and off-chip DRAM

– Not cached in HW

– But GPGPU-Sim supports caching it

49 December 2009 GPGPU-Sim Tutorial (MICRO-42)

Coalescing

• Combining memory accesses made by threads in a warp into fewer transactions

– E.g. if threads in a warp are accessing consecutive 2-byte sized locations in memory

• Send one 64-byte request to DRAM (coalescing)

• Instead of 32 2-byte requests

• This reduces the number of transactions between shader cores and DRAM – Less work for Interconnect, Memory Controller and DRAM

50 December 2009 GPGPU-Sim Tutorial (MICRO-42)

Coalescing (Cont.)

• CUDA Capability 1.3 (e.g. GTX280)

– Coalescing done per half-warp

– Can create 128-byte, 64-byte or 32-byte

transactions

• GPGPU-Sim

– Coalescing done for a full warp

– Only creates 64-byte transactions

51 December 2009 GPGPU-Sim Tutorial (MICRO-42)

Coalescing (cont.)

• GPGPU-Sim coalescing example

Warp

Warp

one 64-Byte

Transaction

2 64-Byte

Transactions

= 2-bytes in memory

52 December 2009 GPGPU-Sim Tutorial (MICRO-42)

Page 14: GPGPU-Sim Tutorial Slides - UBC ECEaamodt/gpgpu-sim/tutorial/GPGPU-Sim-Tutorial-… · ghgh9y$gh9$?")v7>28$ $$$o$b"#;$?h9$$$$$c$$$$$cod$b"#;$gh9$$ –!z;+;#"8;2;"74$b")v7>28$ a;b;)@;#$cdde$

Miss Status Holding Registers

• MSHRs keep track of outstanding memory requests

– keep track of threads, target registers, request addresses

• Each coalesced memory transaction consumes an MSHR – MSHRs are limited (configurable)

– Pipeline stalls if shader core runs out of MSHRs

• One approach that might make sense (No details available from NVIDIA)

53 December 2009 GPGPU-Sim Tutorial (MICRO-42)

!"#)5&1,./,1:&4+142,+)=7+,7&+>)

DEFGFFFHI

))))))))))))))))))))))))))=P$1:&Q)N<*5)J04+,1.00+19.0)K+4>.,L)

%:/;+,)

M.,+)

%:/;+,)

M.,+)

%:/;+,)

M.,+)

%:/;+,)

M.,+)

5+'.,8)

M.04,.33+,)

!NN<O)

5+'.,8)

M.04,.33+,)

!NN<O)

5+'.,8)

M.04,.33+,)

!NN<O)

aO$A;B;)@;#$CDDE$ GHGH9IJ.)$!7+"#.(-$K%L?MNIOCP$

Clock domains

• Simulate independent clock domains for

– Shader cores

• Usually set to 1/4 of core clock to compensate for using

SIMD width of 32 instead of 8

– Interconnection network

– L2 cache (if enabled)

– DRAM

• This is real clock (command clock)

• Effective clock is 2x this clock due to DDR

55 December 2009 GPGPU-Sim Tutorial (MICRO-42)

Clock Domain Crossing

• We simulate send and receive buffers at clock

crossing boundaries

• The buffers are filled and drained in different

clock domains

• E.g. consider the buffer from interconnect to

memory controller

– Filled at interconnect clock rate

– Drained at DRAM clock rate

56 December 2009 GPGPU-Sim Tutorial (MICRO-42)

Page 15: GPGPU-Sim Tutorial Slides - UBC ECEaamodt/gpgpu-sim/tutorial/GPGPU-Sim-Tutorial-… · ghgh9y$gh9$?")v7>28$ $$$o$b"#;$?h9$$$$$c$$$$$cod$b"#;$gh9$$ –!z;+;#"8;2;"74$b")v7>28$ a;b;)@;#$cdde$

Interconnection Network Model

• Intersim (Booksim) a flit level simulator

– Topologies (Mesh, Torus, Butterfly, …)

– Routing (Dimension Order, Adaptive, etc.)

– Flow Control (Virtual Channels, Credits)

• We simulate two separate networks

– From Shader cores to memory controllers

• Read Requests, Write requests

– From memory controllers to shader cores

• Read Replies

57 December 2009 GPGPU-Sim Tutorial (MICRO-42)

Topology Examples

58 December 2009 GPGPU-Sim Tutorial (MICRO-42)

Interconnection Network Config

• Booksim has its own config file – Topology (topology, k, n)

– Virtual channels (num_vcs)

– Buffers per VC (vc_buf_size)

– Routing (routing_function)

– Speedups (input_speedup, internal_speedup)

– Allocators (vc_allocator, sw_allocator)

• Specific to GPGPU-sim

– Channel Width (flit_size)

– Setting memory controller locations (use_map)

59 December 2009 GPGPU-Sim Tutorial (MICRO-42)

Interconnect Injection Interfaces

Clock Boundary

Shader

Core  Router

Core Clock

Domain

Interconnect Clock

Domain

1 Flit / Cycle 1 Packet / Cycle

60 December 2009 GPGPU-Sim Tutorial (MICRO-42)

Page 16: GPGPU-Sim Tutorial Slides - UBC ECEaamodt/gpgpu-sim/tutorial/GPGPU-Sim-Tutorial-… · ghgh9y$gh9$?")v7>28$ $$$o$b"#;$?h9$$$$$c$$$$$cod$b"#;$gh9$$ –!z;+;#"8;2;"74$b")v7>28$ a;b;)@;#$cdde$

Interconnect Injection Interfaces

Clock Boundary

Memory

Controller  Router

DRAM

Clock Domain

Interconnect Clock

Domain

1 Flit / Cycle 1 Packet / Cycle

61 December 2009 GPGPU-Sim Tutorial (MICRO-42)

Interconnect Injection Interfaces

Clock Boundary

L2 Cache   Router

L2 Clock

Domain

Interconnect Clock

Domain

1 Flit / Cycle 1 Packet / Cycle

62 December 2009 GPGPU-Sim Tutorial (MICRO-42)

Interconnect Ejection Interfaces • 1 Ejection/boundary buffer per VC (1 flit / cycle)

• A credit is sent back to router as a flit goes from ejection to boundary buffer

Clock

Boundary

Router Shader Core

Core Clock

Domain Interconnect Clock

Domain

Ejection Buffers Boundary Buffers

Credit return buffer

1 Credit / Cycle

1 Flit / Cycle

1 Flit / Cycle

1 Packet / Cycle

(Round Robin)

hb$

# of VCs

December 2009 GPGPU-Sim Tutorial (MICRO-42)

Interconnect Ejection Interfaces • 1 Ejection/boundary buffer per VC (1 flit / cycle)

• A credit is sent back to router as a flit goes from ejection to boundary buffer

Clock

Boundary

Router Memory

Controller

DRAM Clock

Domain Interconnect Clock

Domain

Ejection Buffers Boundary Buffers

Credit return buffer

1 Credit / Cycle

1 Flit / Cycle

1 Flit / Cycle

1 Packet / Cycle

(Round Robin)

hO$

# of VCs

December 2009 GPGPU-Sim Tutorial (MICRO-42)

Page 17: GPGPU-Sim Tutorial Slides - UBC ECEaamodt/gpgpu-sim/tutorial/GPGPU-Sim-Tutorial-… · ghgh9y$gh9$?")v7>28$ $$$o$b"#;$?h9$$$$$c$$$$$cod$b"#;$gh9$$ –!z;+;#"8;2;"74$b")v7>28$ a;b;)@;#$cdde$

Interconnect Ejection Interfaces • 1 Ejection/boundary buffer per VC (1 flit / cycle)

• A credit is sent back to router as a flit goes from ejection to boundary buffer

Clock

Boundary

Router L2 Cache

L2 Clock

Domain Interconnect Clock

Domain

Ejection Buffers Boundary Buffers

Credit return buffer

1 Credit / Cycle

1 Flit / Cycle

1 Flit / Cycle

1 Packet / Cycle

(Round Robin)

ha$

# of VCs

December 2009 GPGPU-Sim Tutorial (MICRO-42)

!"#)5&1,./,1:&4+142,+)=7+,7&+>)

DEFGFFFHI

J04+,1.00+19.0)K+4>.,L)

%:/;+,)

M.,+)

%:/;+,)

M.,+)

%:/;+,)

M.,+)

%:/;+,)

M.,+)

5+'.,8)

M.04,.33+,)

!NN<O)

5+'.,8)

M.04,.33+,)

!NN<O)

5+'.,8)

M.04,.33+,)

!NN<O)

hh$A;B;)@;#$CDDE$ GHGH9IJ.)$!7+"#.(-$K%L?MNIOCP$

Off-chip DRAM

Memory Address Mapping

• Off-chip memory partitioned among several

memory channels

– GT200 has 8 memory channels

– G80 had 6 memory channels

– Each memory channel has a memory controller

• Successive 256-byte regions of memory are

assigned to successive memory channels

– Address mapping is configurable in GPGPU-Sim

UNSW CUDA Tutorial by NVIDIA part 4 optimizing CUDA 67 December 2009 GPGPU-Sim Tutorial (MICRO-42)

Mem. Address Mapping (Cont.)

0x0000

0x0100

0x0200

0x0300

0x0400

0x0500

0x0600

0x0700

0x0800 DRAM

Channel 0 DRAM

Channel 1 DRAM

Channel 2 DRAM

Channel 3

DRAM Channel 4

DRAM Channel 5

DRAM Channel 6

DRAM Channel 7

Interconnection Network

Shader

Core

Shader

Core

Shader

Core

68 December 2009 GPGPU-Sim Tutorial (MICRO-42)

Page 18: GPGPU-Sim Tutorial Slides - UBC ECEaamodt/gpgpu-sim/tutorial/GPGPU-Sim-Tutorial-… · ghgh9y$gh9$?")v7>28$ $$$o$b"#;$?h9$$$$$c$$$$$cod$b"#;$gh9$$ –!z;+;#"8;2;"74$b")v7>28$ a;b;)@;#$cdde$

DRAM

• DRAM Memory

– Off-chip, high-density and high capacity

• DRAM access time is not constant

– It has non-uniform access latencies

• That’s why we model it!

69 December 2009 GPGPU-Sim Tutorial (MICRO-42)

DRAM

Column Decoder

Memory Array

Ro

w D

eco

der

Mem

ory

C

on

tro

ller

Row Buffer Row Buffer

Ro

w D

eco

der

Column Decoder

Row Buffer

Column Decoder

Row Buffer

DRAM Access

70

• Row access

– Activate a row or page of a

DRAM bank

– Load it to row buffer

• Column access

– Select and return a block of

data in row buffer

• Precharge

– Write back the opened row

into DRAM

– Otherwise it will be lost! December 2009 GPGPU-Sim Tutorial (MICRO-42)

DRAM Row Access Locality

tRC = row cycle time

tRP = row precharge time

tRCD = row activate time

71 December 2009 GPGPU-Sim Tutorial (MICRO-42)

DRAM Bank-level Parallelism

72

• To increase DRAM

performance and utilization • Multiple banks per DRAM chip

• To increase bus width • Multiple chips per Memory

Controller

December 2009 GPGPU-Sim Tutorial (MICRO-42)

Page 19: GPGPU-Sim Tutorial Slides - UBC ECEaamodt/gpgpu-sim/tutorial/GPGPU-Sim-Tutorial-… · ghgh9y$gh9$?")v7>28$ $$$o$b"#;$?h9$$$$$c$$$$$cod$b"#;$gh9$$ –!z;+;#"8;2;"74$b")v7>28$ a;b;)@;#$cdde$

Scheduling DRAM Requests

• Scheduling policies supported • First in first out (FIFO)

• In-order scheduling

• First Ready First Come First Serve (FR-FCFS)

• Out of order scheduling

• Requires associative search

73 December 2009 GPGPU-Sim Tutorial (MICRO-42)

Overview

• Brief Review of GPU Programming Models

• GPGPU-Sim Overview

• GPGPU-Sim Internals

– Microarchitecture model

– Software organization

– Example modifications

• GPGPU-Sim Tools

December 2009 74 GPGPU-Sim Tutorial (MICRO-42)

Overview of this section

• Introduce GPGPU-Sim modules

• Interfacing with CUDA and OpenCL

• Details of PTX simulation

• Timing Model

December 2009 75 GPGPU-Sim Tutorial (MICRO-42)

Three different modules:

• CUDA/OpenCL API library interface

• PTX instruction set emulator

• Timing model

December 2009 76 GPGPU-Sim Tutorial (MICRO-42)

Page 20: GPGPU-Sim Tutorial Slides - UBC ECEaamodt/gpgpu-sim/tutorial/GPGPU-Sim-Tutorial-… · ghgh9y$gh9$?")v7>28$ $$$o$b"#;$?h9$$$$$c$$$$$cod$b"#;$gh9$$ –!z;+;#"8;2;"74$b")v7>28$ a;b;)@;#$cdde$

Interface to CUDA/OpenCL

• CUDA and OpenCL applications include code that runs on the

host.  Both use an API to communicate between host and

GPU.  The API is defined in various header files.  Implementation is in a DLL and/or static library.

• Currently, we run host code on the simulator host platform

and do not model any impact of API or host on execution

time.   I.e., if you run GPGPU-Sim on a Core 2 Duo machine,

the host code runs natively on your Core 2 Duo machine.

December 2009 77 GPGPU-Sim Tutorial (MICRO-42)

Interface to CUDA API

• We implement versions of OpenCL / CUDA interface calls in a

new DLL, interface adjust LD_LIBRARY_PATH and CUDA/

OpenCL application runs on simulator rather than GPU hardware.

• Given our research focus, we have implemented only what

was required to get applications we were interested in

running.

• Code can be cleaned up significantly.

December 2009 78 GPGPU-Sim Tutorial (MICRO-42)

Interface to CUDA API      Example of interface code between host and simulator. This is the code

that actually starts running the functional and timing models. Call to

cudaLaunch is generated by nvcc from “<<<>>>” notation.

__host__ cudaError_t CUDARTAPI cudaLaunch( const char *symbol ) {

printf("\n\n\n"); char *mode = getenv("PTX_SIM_MODE_FUNC");

if( mode ) sscanf(mode,"%u", &g_ptx_sim_mode); printf("GPGPU-Sim PTX: cudaLaunch for %p (mode=%s)\n", symbol,

g_ptx_sim_mode?"functional simulation":"performance simulation"); if( g_ptx_sim_mode )

gpgpu_ptx_sim_main_func( symbol, g_cudaGridDim, g_cudaBlockDim, g_ptx_sim_params ); else gpgpu_ptx_sim_main_perf( symbol, g_cudaGridDim, g_cudaBlockDim, g_ptx_sim_params );

g_ptx_sim_params=NULL; return g_last_cudaError = cudaSuccess;

}

December 2009 79 GPGPU-Sim Tutorial (MICRO-42)

GPGPU-Sim Startup Details

1. Some CUDA code called before main() during

initialization of global variables.

__cudaRegisterFunction, __cudaRegisterShared

These provide information about device code (and

how to call it).

2. First call to any CUDA API function causes simulator

initialization.

– Read environment variables (debug info, sim mode)

– Parse option files

– Initialize GPU uArch Model

– Load/parse PTX kernels, determine post-dominators

December 2009 80 GPGPU-Sim Tutorial (MICRO-42)

Page 21: GPGPU-Sim Tutorial Slides - UBC ECEaamodt/gpgpu-sim/tutorial/GPGPU-Sim-Tutorial-… · ghgh9y$gh9$?")v7>28$ $$$o$b"#;$?h9$$$$$c$$$$$cod$b"#;$gh9$$ –!z;+;#"8;2;"74$b")v7>28$ a;b;)@;#$cdde$

PTX Parsing, Post-Dominator Detection

• We constructed a flex/bison parser to read in PTX code.

• Reads PTX text generated by nvopencc

• Default is to read PTX text embedded within binary

• Using flex/bison gives us flexibility so when (not if?) NVIDIA

changes their PTX syntax, we can make changes to our parser

to match.

• Post-dominators used for warp divergence simulation are

determined at this point using standard compiler control flow analysis algorithms.

December 2009 81 GPGPU-Sim Tutorial (MICRO-42)

Parsing PTX

• Next two slides illustrate snippets of code

from lexer and parser

December 2009 82 GPGPU-Sim Tutorial (MICRO-42)

cuda-sim/ptx.l – find tokens

abs TC; ptx_lval.int_value = ABS_OP; return OPCODE;

add TC; ptx_lval.int_value = ADD_OP; return OPCODE;

and TC; ptx_lval.int_value = AND_OP; return OPCODE;

\.align TC; return ALIGN_DIRECTIVE;

\.byte TC; return BYTE_DIRECTIVE;

\.const\[[0-9]+\] TC; return CONST_DIRECTIVE;

"%tid" TC; ptx_lval.int_value = TID_ID; return SPECIAL_REGISTER;

\.u32 TC; return U32_TYPE;

\.u64 TC; return U64_TYPE;

\.f16 TC; return F16_TYPE;

\.f32 TC; return F32_TYPE;

\.equ TC; return EQU_OPTION;

\.neu TC; return NEU_OPTION;

\.ltu TC; return LTU_OPTION;

"]" TC; return RIGHT_SQUARE_BRACKET;

"<" TC; return LEFT_ANGLE_BRACKET;

">" TC; return RIGHT_ANGLE_BRACKET;

"(" TC; return LEFT_PAREN;

December 2009 83 GPGPU-Sim Tutorial (MICRO-42)

cuda-sim/ptx.y – read instructions …

%token <string_value> STRING

%token <int_value> OPCODE

%token ALIGN_DIRECTIVE

%token BYTE_DIRECTIVE

%%

input: /* empty */

| input directive_statement

| input function_defn

| input function_decl

;

function_defn: function_decl { set_symtab($1); } LEFT_BRACE statement_list RIGHT_BRACE

{ end_function(); }

| function_decl { set_symtab($1); } block_spec LEFT_BRACE statement_list RIGHT_BRACE { end_function(); }

;

instruction: opcode_spec LEFT_PAREN operand RIGHT_PAREN { set_return(); } COMMA operand

COMMA LEFT_PAREN operand_list RIGHT_PAREN

| opcode_spec operand COMMA LEFT_PAREN operand_list RIGHT_PAREN

| opcode_spec operand_list

| opcode_spec

;

… December 2009 84 GPGPU-Sim Tutorial (MICRO-42)

Page 22: GPGPU-Sim Tutorial Slides - UBC ECEaamodt/gpgpu-sim/tutorial/GPGPU-Sim-Tutorial-… · ghgh9y$gh9$?")v7>28$ $$$o$b"#;$?h9$$$$$c$$$$$cod$b"#;$gh9$$ –!z;+;#"8;2;"74$b")v7>28$ a;b;)@;#$cdde$

How are threads simulated

(functionally)?

• Thread = program counter +

          set of registers +

          set of local memory locations



CTA (block) = set of threads with access to a

shared memory.

No notion of “warp” in functional simulator…

(complicates “vote” instruction) December 2009 85 GPGPU-Sim Tutorial (MICRO-42)

How instructions are simulated

(functionally)?

• Code for simulating instructions in cuda-sim/instruction.cc

• Threads initialized during launching of blocks.

• Functional execution at decode: “giant switch statement”

simulation approach (emulation)

• Each thread in timing model has pointer to thread context in

functional model.

• Lookup instruction “object” corresponding to program

counter (we do not attempt to “encode” instruction)

December 2009 86 GPGPU-Sim Tutorial (MICRO-42)

How are values communicated

between threads?

• Communication can happen either through “global memory”

or through “shared memory”.

• We simulate all instructions as they reach decode stage of

pipeline, this includes loads and stores that access memory.

Exception is for atomics (we simulate them functionally once

atomic operation reaches DRAM controller in timing model)

• Most CUDA code avoids intra-kernel communication through

global memory (memory ordering not well defined)

December 2009 87 GPGPU-Sim Tutorial (MICRO-42)

Overview of this section

• Introduce GPGPU-Sim modules

• Interfacing with CUDA and OpenCL

• Details of PTX simulation

• Timing Model

December 2009 88 GPGPU-Sim Tutorial (MICRO-42)

Page 23: GPGPU-Sim Tutorial Slides - UBC ECEaamodt/gpgpu-sim/tutorial/GPGPU-Sim-Tutorial-… · ghgh9y$gh9$?")v7>28$ $$$o$b"#;$?h9$$$$$c$$$$$cod$b"#;$gh9$$ –!z;+;#"8;2;"74$b")v7>28$ a;b;)@;#$cdde$

Timing Model:

Initialization

• init_gpu() in gpu-sim.c

– Allocate and initialize microarchitecture model

and statistic collection structures

– Called at the first CUDA API call

December 2009 89 GPGPU-Sim Tutorial (MICRO-42)

Timing Model:

Main Simulation Loop

• run_gpu_sim() in gpu-sim.c

– Called when a kernel is launched…

– Init kernel-specific info (e.g. PDOM)

– While (Not all thread done) {

      gpu_sim_loop(): Main Simulation Loop

    }

– gpu_print_stat(): Generate stat log

December 2009 90 GPGPU-Sim Tutorial (MICRO-42)

Timing Model:

Main Simulation Loop

• Inside gpu_sim_loop():

– Check all clock domains

• if (clock_mask & CORE) { … }

– Execute the ones that are ready

– Currently, 4 domains:

• CORE – Shader Core + Thread Block Issue

• ICNT – Interconnection Network

• DRAM – DRAM + Request Scheduler

• L2 – Memory-Side L2 Cache

December 2009 91 GPGPU-Sim Tutorial (MICRO-42)

Timing Model:

Thread Block Issue

• issue_block2core():

– Issue new thread blocks to core

– Initializes the threads inside the new blocks

– Two modes:

• Normal – Fill a core with blocks until it is full

• Spread – Distribute block among cores

SC 0 B0 B1 B2

SC 1 B3 B4 B5

SC 2 B6 B7 B8

B9 B10 B11

SC 0 B0 B3 B6

SC 1 B1 B4

SC 2 B2 B5

December 2009 92 GPGPU-Sim Tutorial (MICRO-42)

Page 24: GPGPU-Sim Tutorial Slides - UBC ECEaamodt/gpgpu-sim/tutorial/GPGPU-Sim-Tutorial-… · ghgh9y$gh9$?")v7>28$ $$$o$b"#;$?h9$$$$$c$$$$$cod$b"#;$gh9$$ –!z;+;#"8;2;"74$b")v7>28$ a;b;)@;#$cdde$

Timing Model:

Shader Core

• shader_cycle() in shader.c:

– shader_writeback()

– shader_memory(), shader_const_memory(),

shader_texture_memory()

– shader_pre_memory()

– shader_execute()

– shader_decode()

– shader_fetch()

• Called in reverse order (order of stall propagation)

– Same trick used in many other cycle level simulators

December 2009 93 GPGPU-Sim Tutorial (MICRO-42)

Timing Model:

Important Structures

• shader_core_ctx_t

– Contain state of a shader core

• thread_ctx_t

– Contain ptx_thd_info, i.e.

ptx_thread_info in cuda-sim

• inst_t

– Pipeline register and dynamic

instance of an instruction

shader_core_ctx_t

Fetch Stage Logic

Decode Stage Logic

TS_IF

IF_ID

ID_EX

Scheduler Logic

December 2009 94 GPGPU-Sim Tutorial (MICRO-42)

Timing Model:

Important Structures

• mshr_entry

– Memory request info used inside shader core

– Has a copy of the pipeline_reg that created the request

• mem_fetch_t

– Memory request structure that is passed

between modules in the memory sub-system.

• Outside of shader core

– Generated in fq_push()

– Destroyed in fq_pop() for reads and at

dram_pop() for writes

Shader Core

Memory Subsystem: !! Interconnect !! L2 !! DRAM

Mem Stage Wrbk Stage

mshr_entry_t mshr_entry_t mshr_entry_t mshr_entry_t

mem_fetch_t

mem_fetch_t

mem_fetch_t

December 2009 95 GPGPU-Sim Tutorial (MICRO-42)

Timing Model:

Shader Core Pipeline Stage

• General Anatomy:

If (pipeline_regs[next_stage] not empty)

    return; // stall

Copy pipeline_regs[current_stage]

→ pipeline_regs[next_stage]

// Step 1 in stage

Foreach (thread in warp) { … }

// Step 2 in stage

Foreach (thread in warp) { … }

// Step N in stage

Foreach (thread in warp) { … if (stall_condition) return; }

Retry

Next Cycle

Scan through

pipeline reg for all threads in

next stage

All threads in

warp together for each step

December 2009 96 GPGPU-Sim Tutorial (MICRO-42)

Page 25: GPGPU-Sim Tutorial Slides - UBC ECEaamodt/gpgpu-sim/tutorial/GPGPU-Sim-Tutorial-… · ghgh9y$gh9$?")v7>28$ $$$o$b"#;$?h9$$$$$c$$$$$cod$b"#;$gh9$$ –!z;+;#"8;2;"74$b")v7>28$ a;b;)@;#$cdde$

Timing model:

Fetch Stage

If (pipeline_regs[IF_ID] not empty)

    decode_stall = true;

Switch (simd_model) {

case PDOM: shader_fetch_simd_postdominator(…) case DWF: shader_fetch_simd_dwf(…)

}

If (not decode_stall) {

    Stat collection }

If (pipeline_regs[TS_IF] not empty)

    return; // stall

wid = pdom_sche_find_next_warp(…) If (wid == -1) return; // no warp rdy

Handle branch divergence Issue warp to pipeline_regs[TS_IF];

new_warp_TS = true;

If (not decode_stall and new_warp_TS) {

    pipeline_regs[TS_IF] → pipeline_regs[IF_ID]  }

December 2009 97 GPGPU-Sim Tutorial (MICRO-42)

Timing model:

Decode Stage If (pipeline_regs[ID_EX] not empty)

    return; //stall

pipeline_regs[IF_ID] → pipeline_regs[ID_EX]

Foreach (thread in warp) {

    ptx_decode_inst (thread->ptx_thd_info, …); }

Foreach (thread in warp) {     ptx_exec_inst (thread->ptx_thd_info, …);

    pipeline_regs[IF_ID][tid].inst_type = …;     pipeline_regs[IF_ID][tid].space = …;

    pipeline_regs[IF_ID][tid].memaddr = …;

    Handle barrier // bar.sync

}

Separate decode and

functional execute

•! Put thread to sleep (nullify

pipeline_regs[IF_ID]) if not all thread arrived

•! Wake up threads in block after all thread arrived

Save info for memory stage

December 2009 98 GPGPU-Sim Tutorial (MICRO-42)

Timing model:

Execute Stage

If (pipeline_regs[EX_MM] not empty)

    return; //stall

pipeline_regs[ID_EX] → pipeline_regs[EX_MM]

If (pipeline_regs[EX_MM] not empty)

    return; //stall

pipeline_regs[ID_EX] → pipeline_regs[pre_mem]

OR

December 2009 99 GPGPU-Sim Tutorial (MICRO-42)

Timing model:

Memory Stage

• General Anatomy (memory, const, texture): If (pipeline_regs[MM_WB] not empty)  return; // stall

Check Structural Hazard: SMEM BK Conflict, Coalescing, Const$ Port

If (structural_hazard) set(stall_counter), rc_fail = true; If (rc_fail) {

    Revert stats;     return;

} else {     Set MSHR (pipeline_regs[EX_MM])

    Send memory request with fq_push (…)

    pipeline_regs[EX_MM] = NOP }

Access Cache;

If (Miss or No_Cache) {     Alloc MSHR + ICNT buffer

    If (not available) rc_fail = true; }

pipeline_regs[EX_MM] → pipeline_regs[MM_WB]

If (stall_counter > 0) stall_counter-- return; // stall

December 2009 100 GPGPU-Sim Tutorial (MICRO-42)

Page 26: GPGPU-Sim Tutorial Slides - UBC ECEaamodt/gpgpu-sim/tutorial/GPGPU-Sim-Tutorial-… · ghgh9y$gh9$?")v7>28$ $$$o$b"#;$?h9$$$$$c$$$$$cod$b"#;$gh9$$ –!z;+;#"8;2;"74$b")v7>28$ a;b;)@;#$cdde$

Timing model:

Writeback

Arbitrate between pipeline_regs[MM_WB] and Returning memory access :

    Returning memory access always win     Unlock_tid = threads that win arbitration

Check for returning memory access:

    Use get_MSHR_returnhead() to obtain MSHR

If (not stalled_by_MSHR)

    pipeline_regs[MM_WB] = NOP

Cache update (if applicable)

If (ptx_thread_done(unlock_tid.ptx_thd_info)) {

    Decrement active thread count for block(unlock_tid) } else {

    Wake up unlock_tid for scheduling }

Pipeline_reg MSHR

December 2009 101 GPGPU-Sim Tutorial (MICRO-42)

Timing model:

Interfaces • Shader Core

– fq_push(): Push request to memory subsystem

– fq_pop(): Put MSHR of returning request into

        return queue

– fq_has_buffer(): Check for ICNT buffer space

• Interconnection Network:

– icnt_has_buffer(): Check for input buffer space

– icnt_push(): Push packet into network

– icnt_pop(): Pop packet from network

– icnt_transfer(): Run network for a cycle December 2009 102 GPGPU-Sim Tutorial (MICRO-42)

Timing model:

Interfaces

• Memory Controller (L2 + DRAM):

– mem_ctrl_full(): Memory queue full?

– mem_push(): Push request into memory queue

– mem_top(): Obtain info for completed request

– mem_pop(): Pop completed request

mem_req = mem_ctrl_top(); if (icnt_has_buffer(mem_req.info)) { icnt_push(mem_req); mem_ctrl_pop(); }

Flow Control

December 2009 103 GPGPU-Sim Tutorial (MICRO-42)

Adding Options

• Use option_parser module

– Automatically parses options to linked variables

– option_parser_register(opp, …) link options to

variables

• See gpu_reg_options() in gpu-sim.c for

examples

December 2009 104 GPGPU-Sim Tutorial (MICRO-42)

Page 27: GPGPU-Sim Tutorial Slides - UBC ECEaamodt/gpgpu-sim/tutorial/GPGPU-Sim-Tutorial-… · ghgh9y$gh9$?")v7>28$ $$$o$b"#;$?h9$$$$$c$$$$$cod$b"#;$gh9$$ –!z;+;#"8;2;"74$b")v7>28$ a;b;)@;#$cdde$

Overview

• Brief Review of GPU Programming Models

• GPGPU-Sim Overview

• GPGPU-Sim Internals

– Microarchitecture model

– Software organization

– Example modifications

• GPGPU-Sim Tools

December 2009 105 GPGPU-Sim Tutorial (MICRO-42)

Example Modification:

Adding register bank conflicts model

• Each instruction accesses 3 to 4 operands

• Multi-port register is too expensive for GPU

• GPU uses multiple single ported SRAM to implement the register file

– Maps different registers to different banks

– Each instruction fetch operands from multiple banks

• Register bank conflict = When two operands are from the same bank

December 2009 106 GPGPU-Sim Tutorial (MICRO-42)

Example Modification:

Adding register bank conflicts model

• HW Soln #1: Serialize register fetch -> Stall pipeline or lower throughput

• HW Soln #2: Operand Collector Architecture (US Patent App: 11/555,649)

– Interleave operand fetch from different threads to achieve full utilization

Bank 0 Bank 1 Bank 2 Bank 3

R0 R1 R2 R3

R4 R5 R6 R7

R8 R9 R10 R11

… … … …

add.s32 R3, R1, R2; No Conflict

mul.s32 R3, R0, R4; Conflict at bank 0

December 2009 107 GPGPU-Sim Tutorial (MICRO-42)

Example Modification:

Steps

• Obtain register bank access info in cuda-sim

• Expose info to timing model

• Implement register bank conflict detection and stall

• Create configuration option

December 2009 108 GPGPU-Sim Tutorial (MICRO-42)

Page 28: GPGPU-Sim Tutorial Slides - UBC ECEaamodt/gpgpu-sim/tutorial/GPGPU-Sim-Tutorial-… · ghgh9y$gh9$?")v7>28$ $$$o$b"#;$?h9$$$$$c$$$$$cod$b"#;$gh9$$ –!z;+;#"8;2;"74$b")v7>28$ a;b;)@;#$cdde$

Example Modification:

Obtain register bank access

• PTX uses virtual register: %f23, %r95, etc…

• We choose to use: – Bank accessed = Virtual Reg. No. MOD #Banks

• Obtain this number when the register is added during PTX parsing (in ptx_ir.cc): void add_identifier( const char *identifier, ...) {

... switch ( g_space_spec) { case REG_DIRECTIVE: regnum = g_current_symbol_table->next_reg_num(); // code to obtain virtual reg no. here g_last_symbol->set_regno(regnum, arch_regnum); ... } ...

}

Modify class symbol in ptx_ir.h

December 2009 109 GPGPU-Sim Tutorial (MICRO-42)

Example Modification:

Communicate Info to Timing Model

• Cuda-sim communicates operand access info to timing model via ptx_decode_inst(…) – Calls operand_info::reg_num() which calls symbol::reg_num()

– We mimic that (also in ptx_ir.h):

class operand_info { … int arch_reg_num() const {

return m_value.m_symbolic->arch_reg_num(); } … };

Add accessor to class symbol in ptx_ir.h

December 2009 110 GPGPU-Sim Tutorial (MICRO-42)

Example Modification:

Communicate Info to Timing Model

• Cuda-sim communicates operand access info to timing model via ptx_decode_inst(…) – Needs to modify signature of ptx_decode_inst(…) in ptx_ir.h, and in cuda-sim.cc:

class function_info { ... void function_info::ptx_decode_inst( ptx_thread_info *thread,

... ... int *vectorout, int *arch_reg );

... };

Apply to all calls of ptx_decode_inst(),

and the extern “C” interface that links this function to the timing model in C

Export operands’

virtual register number in an array

December 2009 111 GPGPU-Sim Tutorial (MICRO-42)

Example Modification:

Expose Access Info to Timing Model • Now modify internals of ptx_decode_inst(…) in cuda-sim.cc:

void function_info::ptx_decode_inst(...) { ... for (; op != pI->op_iter_end(); op++, n++) {

const operand_info &o = *op; if (has_dst && n==0) {

if (o.is_reg()) { *o1 = o.reg_num(); arch_reg[0] = o.arch_reg_num();

} else {...} } else {

if (o.is_reg()) { int reg_num = o.reg_num(); arch_reg[m + 4] = o.arch_reg_num(); ... m++;

} else {...} }

} ... };

Dest. Operand

(RF Write)

Src. Operand

(RF Read)

December 2009 112 GPGPU-Sim Tutorial (MICRO-42)

Page 29: GPGPU-Sim Tutorial Slides - UBC ECEaamodt/gpgpu-sim/tutorial/GPGPU-Sim-Tutorial-… · ghgh9y$gh9$?")v7>28$ $$$o$b"#;$?h9$$$$$c$$$$$cod$b"#;$gh9$$ –!z;+;#"8;2;"74$b")v7>28$ a;b;)@;#$cdde$

Example Modification:

Timing Model Modification

• Pass ptx_decode_inst(…) pointer to array arch_reg used to return the virtual register number of all accessed operands.

– Bank Accessed = Virtual Reg. No. MOD #Banks

– Conflict occurs when two operands access the same bank for values in different registers

• Two parts to this: – Detecting number of cycles to stall for a bank conflict

– Implementing the stalling mechanism

December 2009 113 GPGPU-Sim Tutorial (MICRO-42)

int gpgpu_reg_bank_conflict_model = 0;

#define MAX_REG_BANKS 32 unsigned int gpgpu_num_reg_banks = 8; // this needs to be less than MAX_REG_BANKS

#define MAX_BANK_CONFLICT 8 /* tex can have four source and four destination regs */ #define EMPTY_REG_BANK_ACCESS { .tot = 0, .rd = 0, .wr = 0, .rd_regs = { -1, -1, -1, -1 } }

struct reg_bank_access { unsigned tot; unsigned rd; unsigned wr; int rd_regs[4];

} g_reg_bank_access[MAX_REG_BANKS] = { EMPTY_REG_BANK_ACCESS };

// just to use as "shorthand" for clearing accesses each cycle static const struct reg_bank_access empty_reg_bank_access = EMPTY_REG_BANK_ACCESS;

unsigned int gpu_reg_bank_conflict_stalls = 0;

Example Modification:

Register Bank Conflict Detection

• Implement a temporary structure that tracks the register ID accessing various banks

• Put this before shader_decode() in shader.c:

Configurable Options

Performance Counter December 2009 114

GPGPU-Sim Tutorial (MICRO-42)

if ( gpgpu_reg_bank_conflict_model && first_valid_thread != -1 ) { for (i = 4; i < 8; i++) { if( arch_reg[i] == -1 ) continue; int skip = 0; int bank = arch_reg[i] % gpgpu_num_reg_banks; int opndreg = shader->pipeline_reg[first_valid_thread][IF_ID].in[i-4]; for (int j = 0; j < 4; j++) { if (g_reg_bank_access[bank].rd_regs[j] == -1) break; else if (g_reg_bank_access[bank].rd_regs[j] == opndreg) { skip = 1; break; } } if (!skip) { g_reg_bank_access[bank].tot++; g_reg_bank_access[bank].rd_regs[j] = opndreg; } }

Example Modification:

Register Bank Conflict Detection

• Add inside shader_decode() in shader.c:

• Add this after the first loop that calls ptx_decode_inst():

int arch_reg[MAX_REG_OPERANDS] = { -1 };

ptx_decode_inst( shader->thread[tid].ptx_thd_info, ..., &vectorout, arch_reg );

Count # different access to each bank

Continue on next slide

December 2009 115 GPGPU-Sim Tutorial (MICRO-42)

Example Modification:

Register Bank Conflict Detection

• (continued from last slide):

unsigned max_access=0; int r; inst_t* conflict_inst = &shader->pipeline_reg[first_valid_thread][IF_ID]; for( r = 0; r < gpgpu_num_reg_banks; r++ ) { if( g_reg_bank_access[r].tot > max_access ) max_access = g_reg_bank_access[r].tot; g_reg_bank_access[r] = empty_reg_bank_access; } if( max_access >= 1 ) { assert( max_access <= MAX_REG_OPERANDS ); conflict_inst->reg_bank_access_pending = max_access - 1; if( max_access > 1 ) { conflict_inst->reg_bank_conflict_stall_checked = 1; return; // stall pipeline } } shader->pipeline_reg[first_valid_thread][IF_ID].reg_bank_conflict_stall_checked = 1; }

Stall = Maximum # different access among all banks

Add these new members to

inst_t in shader.h

December 2009 116 GPGPU-Sim Tutorial (MICRO-42)

Page 30: GPGPU-Sim Tutorial Slides - UBC ECEaamodt/gpgpu-sim/tutorial/GPGPU-Sim-Tutorial-… · ghgh9y$gh9$?")v7>28$ $$$o$b"#;$?h9$$$$$c$$$$$cod$b"#;$gh9$$ –!z;+;#"8;2;"74$b")v7>28$ a;b;)@;#$cdde$

Example Modification:

Register Bank Conflict Stalls • Last slide: Adding two new members to inst_t

– Track # cycles to stall

– Indicate that the stall for register fetching has happened so do not stall for it again.

– Make sure they are both initialized to 0 in shader_issue_thread():

void shader_issue_thread(shader_core_ctx_t *shader, int tid, int wlane, unsigned active_mask ) { if ( gpgpu_cuda_sim ) { ... shader->pipeline_reg[wlane][TS_IF].reg_bank_conflict_stall_checked = 0; shader->pipeline_reg[wlane][TS_IF].reg_bank_access_pending = 0; shader->pipeline_reg[wlane][TS_IF].uid = g_next_shader_inst_uid++; shader->pipeline_reg[wlane][TS_IF].warp_active_mask = active_mask; shader->pipeline_reg[wlane][TS_IF].ts_cycle = gpu_tot_sim_cycle + gpu_sim_cycle; } assert( shader->thread[tid].avail4fetch > 0 ); shader->thread[tid].avail4fetch--; assert( shader->thread[tid - (tid % warp_size)].n_avail4fetch > 0 ); shader->thread[tid - (tid % warp_size)].n_avail4fetch--; }

December 2009 117 GPGPU-Sim Tutorial (MICRO-42)

Example Modification:

Register Bank Conflict Stalls

if ( gpgpu_reg_bank_conflict_model ) { for (i=0; i<pipe_simd_width;i++) { if ( shader->pipeline_reg[i][IF_ID].reg_bank_conflict_stall_checked ) { if ( shader->pipeline_reg[i][IF_ID].reg_bank_access_pending > 0 ) { assert( shader->pipeline_reg[i][IF_ID].reg_bank_access_pending <= 8 ); shader->pipeline_reg[i][IF_ID].reg_bank_access_pending--; gpu_reg_bank_conflict_stalls++; return; // stall } } } }

• Add this to the beginning of shader_decode():

if ( gpgpu_reg_bank_conflict_model && first_valid_thread != -1 && !shader->pipeline_reg[first_valid_thread][IF_ID].reg_bank_conflict_stall_checked) { for (i = 4; i < 8; i++) { if( arch_reg[i] == -1 ) continue; ...

• To avoid deadlock:

Only apply stall if the register bank conflict is checked

December 2009 118 GPGPU-Sim Tutorial (MICRO-42)

Example Modification:

Create Configuration Options

option_parser_register(opp, "-gpgpu_reg_bank_conflict_model", OPT_BOOL, &gpgpu_reg_bank_conflict_model, "Turn on register bank conflict model (default = off)", "0");

option_parser_register(opp, "-gpgpu_num_reg_banks", OPT_INT32, &gpgpu_num_reg_banks, "Number of register banks (default = 8)", "8");

• Add this to gpu_reg_options() in gpu-sim.c:

• Add externs for the two global variables in gpu-sim.c: extern int gpgpu_reg_bank_conflict_model; extern int gpgpu_num_reg_banks;

• Add a fprintf in shader_print_accstats() in shader.c:

fprintf(fout, "gpu_reg_bank_conflict_stalls = %d\n", gpu_reg_bank_conflict_stalls);

December 2009 119 GPGPU-Sim Tutorial (MICRO-42)

Example Modification:

Done! • Try running this with the option turned on and off

• Try varying the number of register banks

• What is left out: – Register writes at the writeback stage

– Dropped in this sample for simplicity

December 2009 120 GPGPU-Sim Tutorial (MICRO-42)

Page 31: GPGPU-Sim Tutorial Slides - UBC ECEaamodt/gpgpu-sim/tutorial/GPGPU-Sim-Tutorial-… · ghgh9y$gh9$?")v7>28$ $$$o$b"#;$?h9$$$$$c$$$$$cod$b"#;$gh9$$ –!z;+;#"8;2;"74$b")v7>28$ a;b;)@;#$cdde$

Overview

• Brief Review of GPU Programming Models

• GPGPU-Sim Overview

• GPGPU-Sim Internals

– Microarchitecture model

– Software organization

– Example modifications

• GPGPU-Sim Tools

December 2009 121 GPGPU-Sim Tutorial (MICRO-42)

AerialVision: Visualizing

Complex Dynamics in GPU

Visualizer Tool for GPGPU-Sim

December 2009 122 GPGPU-Sim Tutorial (MICRO-42)

December 2009 GPGPU-Sim Tutorial (MICRO-42) 123

Motivation

• Common case in architectural research: – Researchers implemented their new proposals in an architecture simulator (in this case GPGPU-Sim)

– Simulation results across a suite of benchmarks usually look like this:

A B C D E F

Need to understand reason behind slowdowns!

Need to validate these results!

Ultimately: Gain more insight about performance trends

December 2009 GPGPU-Sim Tutorial (MICRO-42) 124

Motivation

• Current ways to gain insights (GPGPU-Sim)

– Performance statistics log – end of kernel launch

• Ignores runtime dynamics in the microarchitecture

– GDB cycle by cycle stepping

• Information overload if tracking more than one unit

• Slow – hours to step through 1000s of cycle

• GPU is parallel and programmable

– Lots of insight gained from having a global view

• Time-lapse performance variations

• View multiple units in parallel

– Relate performance statistics to source code

• Similar goal to VTune, but for CUDA running on GPGPU-Sim

Page 32: GPGPU-Sim Tutorial Slides - UBC ECEaamodt/gpgpu-sim/tutorial/GPGPU-Sim-Tutorial-… · ghgh9y$gh9$?")v7>28$ $$$o$b"#;$?h9$$$$$c$$$$$cod$b"#;$gh9$$ –!z;+;#"8;2;"74$b")v7>28$ a;b;)@;#$cdde$

December 2009 GPGPU-Sim Tutorial (MICRO-42) 125

Motivation

• Why is runtime dynamics important for GPUs? – Many core accelerator architecture

– Intermittent contentions affect performance, but not captured by end-to-end performance statistics

Transpose with two different address mapping DRAM utilization is ~uniform for both

case!

Bottleneck goes undetected!

We like this type of

plot so much that we go and build a tool for

this purpose... December 2009 GPGPU-Sim Tutorial (MICRO-42) 126

Motivation

• CUDA kernels are growing large

– MUMMerGPU = ~400 lines

– GPUDG = ~200 lines

– …

• Many GPGPU-Sim users are CUDA app dev

– using the simulator to help understand the performance of

their CUDA applications in development

– Need to pin point performance bottleneck in the kernel

code

– Benefits hardware architects as well

December 2009 GPGPU-Sim Tutorial (MICRO-42) 127

Introducing AerialVision

• Visualizer for GPGPU-Sim – Time Lapse View: Performance metrics vs. time

– Source Code View: Performance metrics vs. the CUDA source code

– Implemented in Python: very extensible

• GPGPU-Sim modified to generate inputs for AerialVision:

CUDA

Application

GPGPU-Sim Visualization

Trace

PTX

Instruction Statistics

AerialVision visualizer.cc

ptx-stats.cc

December 2009 GPGPU-Sim Tutorial (MICRO-42) 128

Outline (remainder of tutorial)

• AerialVision Motivation + Introduction

• Time Lapse View

• Source Code View

• Demo

• Overhead

• Summary

Page 33: GPGPU-Sim Tutorial Slides - UBC ECEaamodt/gpgpu-sim/tutorial/GPGPU-Sim-Tutorial-… · ghgh9y$gh9$?")v7>28$ $$$o$b"#;$?h9$$$$$c$$$$$cod$b"#;$gh9$$ –!z;+;#"8;2;"74$b")v7>28$ a;b;)@;#$cdde$

December 2009 GPGPU-Sim Tutorial (MICRO-42) 129

Time Lapse View

• Visualize performance metrics versus time – Plot up to 5 metrics in a figure for direct visual comp.

– Infinite # figures (until memory runs out)

• Different type of plots for different metrics – Line plots → global singulars (e.g. IPC)

– Parallel intensity plot → metrics across multi. HW units

– Stack bar charts → component breakdowns

– PC-Histogram → relate thread dynamics and source code

• Plots are generated with Matplotlib – Navigation tool bar for zoom and pan

– More formatting options exposed with extra widgets

December 2009 GPGPU-Sim Tutorial (MICRO-42) 130

Time Lapse View –

Metric(s) Selection

Add a new tab

(a new figure)

Select a visualizer

trace file

Select metric to plot

Select type of plot

Add more plots to

the figure

Configure

each subplot

Click this to start plotting!

Shows what you have selected for each plot

December 2009 GPGPU-Sim Tutorial (MICRO-42) 131

Time Lapse View –

Figure Figure plotting selected metrics Switch between different figures

Formatting

Tool

Navigation Tool December 2009 GPGPU-Sim Tutorial (MICRO-42) 132

Time Lapse View –

Parallel Intensity Plot • View performance metrics for multiple parallel hardware units

vs. time – Change color mapping with “Change Colormap Max/Min”

Mapping from color to value of the metric

Data for each

shader core

Page 34: GPGPU-Sim Tutorial Slides - UBC ECEaamodt/gpgpu-sim/tutorial/GPGPU-Sim-Tutorial-… · ghgh9y$gh9$?")v7>28$ $$$o$b"#;$?h9$$$$$c$$$$$cod$b"#;$gh9$$ –!z;+;#"8;2;"74$b")v7>28$ a;b;)@;#$cdde$

December 2009 GPGPU-Sim Tutorial (MICRO-42) 133

Time Lapse View –

Stacked Bar Chart

• Shows the breakdown of a metric vs. time – Warp divergence

– Load/Store latency breakdown Each component

is represented by a unique color.

December 2009 GPGPU-Sim Tutorial (MICRO-42) 134

Time Lapse View –

PC-Histogram (CFLog) • A time series of histograms, representing the portion of the

program that the threads have touched during a given sample period.

• A thread is considered to have touched an instruction: – After it has fetched the instruction

– Until it fetches a new instruction

CFLOG stands for

“Control Flow” Log

Each line here represents:

-! A PTX instruction

OR

-! A line in CUDA source code.

Color at each dot indicates

# threads touching the

instruction during that

sampling period.

December 2009 GPGPU-Sim Tutorial (MICRO-42) 135

Time Lapse View –

Navigation and Formatting Tools

• Change Colormap Max/Min – Configure how metric values are mapped to the color spectrum

– Allow user to normalize among all plots

– Choose a different color scheme

• Change Binning – Modify the frequency of tick labels on the axes

• Edit Labels – Edit the labels on x and y axes, and the title/colormap label

– Choose the fonts size for labels

Pan Zoom Change plot spacing

Save

December 2009 GPGPU-Sim Tutorial (MICRO-42) 136

Source Code View

• Analyze CUDA performance on GPGPU-Sim

– Narrow bottleneck down to a single line of source

code in your application

• Features:

– Shows performance metrics side-by-side with the

source code

– Navigation graph

• Shows the big picture

• Single-click to jump to the “code of interest”

Performance Profiler

Page 35: GPGPU-Sim Tutorial Slides - UBC ECEaamodt/gpgpu-sim/tutorial/GPGPU-Sim-Tutorial-… · ghgh9y$gh9$?")v7>28$ $$$o$b"#;$?h9$$$$$c$$$$$cod$b"#;$gh9$$ –!z;+;#"8;2;"74$b")v7>28$ a;b;)@;#$cdde$

December 2009 GPGPU-Sim Tutorial (MICRO-42) 137

Source Code View –

Metric(s) Selection

Select the CUDA C

file or PTX file to show with the metric

Choose how metrics

should be combined (More on this next)

Choose the metric

to be displayed

Click this to proceed

December 2009 GPGPU-Sim Tutorial (MICRO-42) 138

Source Code View –

Combine Metrics • GPGPU-Sim samples performance metrics for each PTX instruction.

• Each line of CUDA C → multiple lines of PTX instructions. – This mapping can be acquired via debug info generated by NVCC

• Question: How should the metrics from multiple PTX instructions be reduced? – Max – Latency / Execution Count

– Sum – Quantity counters

a[i] = b[i] + c; ld.s32 %r2, [%r1];

add.f32 %r4, %r2, %r3;

st.s32 %r4, [%r1];

200

1

20

200 221

Sum? Max?

•! We provide suggestions for each metric in the manual.

•! Ratio between metrics? We use Max for both.

December 2009 GPGPU-Sim Tutorial (MICRO-42) 139

Source Code View –

Navigation Metric

Data

CUDA C Source Code

Navigation graph

Format

Tool etc…

December 2009 GPGPU-Sim Tutorial (MICRO-42) 140

Source Code View –

Navigation

Line number in CUDA C File

Right-Click Here

Viewer jumps to here,

with the clicked line highlighted

Page 36: GPGPU-Sim Tutorial Slides - UBC ECEaamodt/gpgpu-sim/tutorial/GPGPU-Sim-Tutorial-… · ghgh9y$gh9$?")v7>28$ $$$o$b"#;$?h9$$$$$c$$$$$cod$b"#;$gh9$$ –!z;+;#"8;2;"74$b")v7>28$ a;b;)@;#$cdde$

December 2009 GPGPU-Sim Tutorial (MICRO-42) 141

Overhead + Extension

• GPGPU-Sim need to do extra work to generate inputs for AerialVision – Simulation speed ~13% slower

– Data dumped to HDD = 3.4kB per sample

• Turned on by default – sampling frequency = 1000 cycles

• Adding new metrics for AerialVision – See visualizer.cc and ptx-sim.cc and manual

– No need to modify AerialVision

Configurable

See manual for options

December 2009 GPGPU-Sim Tutorial (MICRO-42) 142

Summary

• AerialVision – Visualizer for GPGPU-Sim

– Time Lapse View

• Metric vs. Time

– Source Code View

• Metric vs. Code

• Read manual for more detail info

– Email us if you have any question ☺