ADVANCED THRUST PROGRAMMING WITH EXECUTION POLICIES · 2 PITCH Execution-policies are: Extremely...
Transcript of ADVANCED THRUST PROGRAMMING WITH EXECUTION POLICIES · 2 PITCH Execution-policies are: Extremely...
April 4-7, 2016 | Silicon Valley
Steven Dalton, April 6th
ADVANCED THRUST PROGRAMMING WITH EXECUTION POLICIES
2
PITCH
Execution-policies are:
Extremely important and a core design feature of Thrust
Not well-understood or widely used
Effective mechanism for providing library extensibility
Useful for small applications, necessary for libraries built around Thrust
3
FUSED OPERATIONS
thrust::device_vector<int> vec(N, 1);
thrust::transform(vec.begin(), vec.end(),
vec.begin(),
thrust::negate<int>());
thrust::reduce(vec.begin(), vec.end(),
thrust::plus<int>());
thrust::device_vector<int> vec(N, 1);
thrust::transform_reduce(vec.begin(), vec.end(),
thrust::negate<int>(),
int(0),
thrust::plus<int>());
4
FANCY ITERATORS
thrust::reduce(thrust::constant_iterator(1),
thrust::constant_iterator(1) + N,
thrust::plus<int>());
thrust::device_vector<int> vec(N, 1);
thrust::reduce(vec.begin(), vec.end(),
thrust::plus<int>());
5
SORT
#include <vector>
#include <algorithm>
void main(void)
{
std::vector<int> vec(10, …);
std::sort(
vec.begin(),
vec.end());
}
Sort header
Data
Sort
6
THRUST SORT
#include <thrust/device_vector.h>
#include <thrust/sort.h>
void main(void)
{
thrust::device_vector<int> vec(10, …);
thrust::sort(
vec.begin(),
vec.end());
}
Sort header
Data
Sort
7
THRUST SORT
#include <thrust/device_vector.h>
#include <thrust/sort.h>
void main(void)
{
thrust::device_vector<int> vec(10, …);
thrust::sort(
vec.begin(),
vec.end());
}
Backend Systems
CPP OMP CUDA
8
MOTIVATION
void func1(…)
{
thrust::device_vector<int> vec(10, …);
thrust::sort(
vec.begin(),
vec.end());
}
void func2(…)
{
thrust::device_vector<int> vec(10, …);
thrust::sort(
vec.begin(),
vec.end());
}
void func3(…)
{
thrust::device_vector<int> vec(10, …);
thrust::sort(
vec.begin(),
vec.end());
}
void func4(…)
{
thrust::sort(
vec.begin(),
vec.end());
}
Profiling Thrust-based library
Several sorting calls across multiple functions/files
9
PROFILING
Possible Thrust profiling solutions
How? What would new thrust::sort require?
How would you profile STL routines?
REDESIGN INTERFACE DO-IT-YOURSELF
LD_PRELOAD=prof_thrust.so exec_file
INTERCEPT CALLS
thrust::sort(exec,vec.begin(),vec.end());
EXECUTION POLICIES
timer t; thrust::sort(begin, end); t.elapsed_milliseconds();
10
PROFILING
Possible Thrust profiling solutions
How? What would new thrust::sort require?
How would you profile STL routines?
REDESIGN INTERFACE
timer t; thrust::sort(begin, end); t.elapsed_milliseconds();
DO-IT-YOURSELF
LD_PRELOAD=prof_thrust.so exec_file
INTERCEPT CALLS
thrust::sort(exec,vec.begin(),vec.end());
EXECUTION POLICIES
11
PROFILING
Possible Thrust profiling solutions
How? What would new thrust::sort require?
How would you profile STL routines?
REDESIGN INTERFACE DO-IT-YOURSELF
LD_PRELOAD=prof_thrust.so exec_file
INTERCEPT CALLS
thrust::sort(exec,vec.begin(),vec.end());
EXECUTION POLICIES
timer t; thrust::sort(begin, end); t.elapsed_milliseconds();
12
PROFILING
Possible Thrust profiling solutions
How? What would new thrust::sort require?
How would you profile STL routines?
REDESIGN INTERFACE DO-IT-YOURSELF
LD_PRELOAD=prof_thrust.so exec_file
INTERCEPT CALLS
thrust::sort(exec,vec.begin(),vec.end());
EXECUTION POLICIES
timer t; thrust::sort(begin, end); t.elapsed_milliseconds();
13
THRUST SORT
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <thrust/system/cuda/execution_policy.h>
void main(void)
{
cudaStream_t s; cudaStreamCreate(&s);
thrust::device_vector<int> vec(10, …);
thrust::sort(thrust::cuda::par.on(s),
vec.begin(),
vec.end());
}
Policy header
Sort with policy
14
THRUST SORT
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <thrust/system/cuda/execution_policy.h>
void main(void)
{
cudaStream_t s; cudaStreamCreate(&s);
thrust::device_vector<int> vec(10, …);
thrust::sort(thrust::cuda::par.on(s),
vec.begin(),
vec.end());
}
Policy header
Sort with policy
WHAT?
HOW?
15
EXECUTION-POLICY DESIGN PATTERN
template<typename Policy, typename Iterator>
void sort(Policy& exec, Iterator begin, Iterator end)
{
// add generic sort to local context
using generic::sort;
// use ADL lookup for dispatching sort
sort(derived_cast(exec), begin, end);
}
template<typename Iterator>
void sort(Iterator begin, Iterator end)
{
// no policy specified
// use generic sort
sort(exec, begin, end);
}
16
CUSTOM POLICY
struct my_policy : thrust::device_execution_policy{};
template<typename Iterator>
void sort(my_policy, Iterator begin, Iterator end)
{
PROFILE_START; // start profiling specific code
thrust::sort(begin, end);
PROFILE_STOP; // end profiler specific code
}
void main(void)
{
thrust::device_vector<int> vec(10);
my_policy exec;
thrust::sort(exec, vec.begin(), vec.end());
}
17
CG
template<typename Matrix, typename Vector>
void cg(Matrix& A, Vector& x, Vector& y)
{
size_t N = A.num_rows;
Vector y(N), z(N), r(N), p(N);
multiply(A, x, y);
axpby(b, y, r, 1, -1);
while(…) {
multiply(A, p, y);
double alpha = rz / dot(y, p);
axpy(y, r, -alpha);
double rz_old = rz;
rz = dot(r,z);
double beta = rz / rz_old;
axpby(z, p, p, 1, beta);
}
}
18
CG PROFILE
Thrust kernel launch
19
GRAPPLE
void main(void)
{
// construct grapple policy
grapple::grapple_system exec;
// call thrust sort with grapple profiling
thrust::sort(exec, vec.begin(), vec.end());
// automatically print summary before exiting
}
Profiler for Thrust applications
In reality : Just another execution policy
Automatically intercepts all Thrust calls
NO CHANGES TO THRUST REQUIRED!
20
CG + GRAPPLE
template<typename Policy, typename Matrix, typename Vector>
void cg(Policy& exec, Matrix& A, Vector& x, Vector& y)
{
size_t N = A.num_rows;
Vector y(exec, N), z(exec, N), r(exec, N), p(exec, N);
multiply(exec, A, x, y);
axpby(exec, b, y, r, 1, -1);
while(…) {
multiply(exec, A, p, y);
double alpha = rz / dot(exec, y, p);
axpy(exec, y, r, -alpha);
double rz_old = rz;
rz = dot(exec, r,z);
double beta = rz / rz_old;
axpby(exec, z, p, p, 1, beta);
}
}
Pass user policy into all inner routines
Implemented using thrust::inner_product
21
CG + GRAPPLE PROFILE
GeForce GTX TITAN : 875.500 Mhz (Ordinal 0)
14 SMs enabled. Compute Capability sm_35
FreeMem: 5868MB TotalMem: 6143MB 64-bit pointers.
Mem Clock: 3004.000 Mhz x 384 bits (288.4 GB/s)
ECC Disabled
CUDA v7.0
PTX Version : sm_30
GCC v4.8.2
Thrust v1.8.2
[ 0][cuda] krylov::cg : 10.543 (ms), allocated : 1000000 bytes
[ 1][cuda] multiply : 3.57744 (ms), allocated : 1748000 bytes
[ 2][cuda] offsets_to_indices : 1.14803 (ms), allocated : 0 bytes
[ 3][cuda] fill : 0.050848 (ms), allocated : 0 bytes
[ 4][cuda] scatter_if : 0.056288 (ms), allocated : 0 bytes
[ 5][cuda] inclusive_scan : 1.00077 (ms),
22
CG + GRAPPLE PROFILE
GeForce GTX TITAN : 875.500 Mhz (Ordinal 0)
14 SMs enabled. Compute Capability sm_35
FreeMem: 5868MB TotalMem: 6143MB 64-bit pointers.
Mem Clock: 3004.000 Mhz x 384 bits (288.4 GB/s)
ECC Disabled
CUDA v7.0
PTX Version : sm_30
GCC v4.8.2
Thrust v1.8.2
[ 0][cuda] krylov::cg : 10.543 (ms), allocated : 1000000 bytes
[ 1][cuda] multiply : 3.57744 (ms), allocated : 1748000 bytes
[ 2][cuda] offsets_to_indices : 1.14803 (ms), allocated : 0 bytes
[ 3][cuda] fill : 0.050848 (ms), allocated : 0 bytes
[ 4][cuda] scatter_if : 0.056288 (ms), allocated : 0 bytes
[ 5][cuda] inclusive_scan : 1.00077 (ms),
23
CG + GRAPPLE PROFILE
GeForce GTX TITAN : 875.500 Mhz (Ordinal 0)
14 SMs enabled. Compute Capability sm_35
FreeMem: 5868MB TotalMem: 6143MB 64-bit pointers.
Mem Clock: 3004.000 Mhz x 384 bits (288.4 GB/s)
ECC Disabled
CUDA v7.0
PTX Version : sm_30
GCC v4.8.2
Thrust v1.8.2
[ 0][cuda] krylov::cg : 10.543 (ms), allocated : 1000000 bytes
[ 1][cuda] multiply : 3.57744 (ms), allocated : 1748000 bytes
[ 2][cuda] offsets_to_indices : 1.14803 (ms), allocated : 0 bytes
[ 3][cuda] fill : 0.050848 (ms), allocated : 0 bytes
[ 4][cuda] scatter_if : 0.056288 (ms), allocated : 0 bytes
[ 5][cuda] inclusive_scan : 1.00077 (ms),
24
CG + GRAPPLE PROFILE
GeForce GTX TITAN : 875.500 Mhz (Ordinal 0)
14 SMs enabled. Compute Capability sm_35
FreeMem: 5868MB TotalMem: 6143MB 64-bit pointers.
Mem Clock: 3004.000 Mhz x 384 bits (288.4 GB/s)
ECC Disabled
CUDA v7.0
PTX Version : sm_30
GCC v4.8.2
Thrust v1.8.2
[ 0][cuda] krylov::cg : 10.543 (ms), allocated : 1000000 bytes
[ 1][cuda] multiply : 3.57744 (ms), allocated : 1748000 bytes
[ 2][cuda] offsets_to_indices : 1.14803 (ms), allocated : 0 bytes
[ 3][cuda] fill : 0.050848 (ms), allocated : 0 bytes
[ 4][cuda] scatter_if : 0.056288 (ms), allocated : 0 bytes
[ 5][cuda] inclusive_scan : 1.00077 (ms),
25
CG + GRAPPLE PROFILE
GeForce GTX TITAN : 875.500 Mhz (Ordinal 0)
14 SMs enabled. Compute Capability sm_35
FreeMem: 5868MB TotalMem: 6143MB 64-bit pointers.
Mem Clock: 3004.000 Mhz x 384 bits (288.4 GB/s)
ECC Disabled
CUDA v7.0
PTX Version : sm_30
GCC v4.8.2
Thrust v1.8.2
[ 0][cuda] krylov::cg : 10.543 (ms), allocated : 1000000 bytes
[ 1][cuda] multiply : 3.57744 (ms), allocated : 1748000 bytes
[ 2][cuda] offsets_to_indices : 1.14803 (ms), allocated : 0 bytes
[ 3][cuda] fill : 0.050848 (ms), allocated : 0 bytes
[ 4][cuda] scatter_if : 0.056288 (ms), allocated : 0 bytes
[ 5][cuda] inclusive_scan : 1.00077 (ms),
26
GRAPPLE FEATURES
Interface level performance profiling
Annotation of memory usage
Stack frame reference for function calls
Execution system oriented annotation (e.g., cpp, omp, cuda, …)
Extensible registration system
Single stepping
Runtime data inspection, pre- and post-checking
(Some In Progress)
27
GRAPPLE (HIGH-LEVEL)
template<typename Iterator>
void sort(grapple_system& exec, Iterator begin, Iterator end)
{
// mark beginning of grapple sort call
exec.start(SORT);
// cast grapple to system specific policy
sort(exec.policy(begin), begin, end);
// mark ending of grapple sort call
exec.stop();
}
28
C++ STANDARD
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <thrust/system/cuda/execution_policy.h>
void main(void)
{
cudaStream_t s; cudaStreamCreate(&s);
thrust::device_vector<int> vec(10, …);
thrust::sort(thrust::cuda::par.on(s),
vec.begin(),
vec.end());
}
Parallelism TS accepted as part of C++17
29
C++ STANDARD
#include <vector>
#include <algorithm>
#include <thrust/system/cuda/execution_policy.h>
void main(void)
{
cudaStream_t s; cudaStreamCreate(&s);
std::vector<int,uvm_allocator> vec(10, …);
std::sort(thrust::cuda::par.on(s),
vec.begin(),
vec.end());
}
Parallelism TS accepted as part of C++17
Could look like…
April 4-7, 2016 | Silicon Valley
THANK YOU
JOIN THE NVIDIA DEVELOPER PROGRAM AT developer.nvidia.com/join