C++AMP on Linux 
Miller Lee
About Me 
● Miller Lee 
● Junior student at NCTU CS 
● Interests: C/C++, PL, CA, OS, compilers, parallel programming, optimization
Why C++ AMP? 
● GPUs can be 10x+ faster than CPUs for parallel code 
● CUDA and OpenCL are still too complex/verbose for programmers
GPU computing requires explicit data transfers
What we need in GPU programming 
1. Put data-parallel code into a kernel for the GPU to execute 
2. Pass the arguments to the GPU 
○ We cannot pass arguments on the stack 
3. An index to identify the current thread 
4. Move data between GPU and CPU memory
OpenCL as an example
Device code in OpenCL 

__kernel void
matrixMul(__global float* C, __global float* A,
          __global float* B, int wA, int wB)
{
    int tx = get_global_id(0);
    int ty = get_global_id(1);
    float value = 0;
    for (int k = 0; k < wA; ++k)
    {
        float elementA = A[ty * wA + k];
        float elementB = B[k * wB + tx];
        value += elementA * elementB;
    }
    C[ty * wA + tx] = value;
}
Host code in OpenCL 1.2 
1. Allocate and initialize memory on the host side 
2. Initialize OpenCL 
3. Allocate device memory and move the data 
4. Load and build the device code 
5. Launch the kernel 
a. Append the arguments 
6. Move the data back from the device
int
main(int argc, char** argv)
{
    // set seed for rand()
    srand(2006);

    // 1. allocate host memory for matrices A and B
    unsigned int size_A = WA * HA;
    unsigned int mem_size_A = sizeof(float) * size_A;
    float* h_A = (float*) malloc(mem_size_A);
    unsigned int size_B = WB * HB;
    unsigned int mem_size_B = sizeof(float) * size_B;
    float* h_B = (float*) malloc(mem_size_B);

    // 2. initialize host memory
    randomInit(h_A, size_A);
    randomInit(h_B, size_B);

    // 3. allocate host memory for the result C
    unsigned int size_C = WC * HC;
    unsigned int mem_size_C = sizeof(float) * size_C;
    float* h_C = (float*) malloc(mem_size_C);

    // 4. Initialize OpenCL
    // OpenCL specific variables
    cl_context clGPUContext;
    cl_command_queue clCommandQue;
    cl_program clProgram;
    cl_kernel clKernel;        // used but never declared in the original listing
    size_t dataBytes;
    size_t kernelLength;
    cl_int errcode;

    // OpenCL device memory for matrices
    cl_mem d_A;
    cl_mem d_B;
    cl_mem d_C;

    clGPUContext = clCreateContextFromType(0,
                       CL_DEVICE_TYPE_GPU,
                       NULL, NULL, &errcode);
    shrCheckError(errcode, CL_SUCCESS);

    // get the list of GPU devices associated with the context
    errcode = clGetContextInfo(clGPUContext,
                  CL_CONTEXT_DEVICES, 0, NULL,
                  &dataBytes);
    cl_device_id *clDevices = (cl_device_id *) malloc(dataBytes);
    errcode |= clGetContextInfo(clGPUContext,
                   CL_CONTEXT_DEVICES, dataBytes,
                   clDevices, NULL);
    shrCheckError(errcode, CL_SUCCESS);

    // create a command queue
    clCommandQue = clCreateCommandQueue(clGPUContext,
                       clDevices[0], 0, &errcode);
    shrCheckError(errcode, CL_SUCCESS);

    // set up device memory
    d_C = clCreateBuffer(clGPUContext,
              CL_MEM_READ_WRITE,
              mem_size_C, NULL, &errcode);   // was mem_size_A in the original
    d_A = clCreateBuffer(clGPUContext,
              CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
              mem_size_A, h_A, &errcode);
    d_B = clCreateBuffer(clGPUContext,
              CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
              mem_size_B, h_B, &errcode);

    // 5. Load and build the OpenCL kernel
    char *clMatrixMul = oclLoadProgSource("kernel.cl",
                            "// My comment\n",
                            &kernelLength);
    shrCheckError(clMatrixMul != NULL, shrTRUE);
    clProgram = clCreateProgramWithSource(clGPUContext,
                    1, (const char **)&clMatrixMul,
                    &kernelLength, &errcode);
    shrCheckError(errcode, CL_SUCCESS);
    errcode = clBuildProgram(clProgram, 0,
                  NULL, NULL, NULL, NULL);
    shrCheckError(errcode, CL_SUCCESS);
    clKernel = clCreateKernel(clProgram,
                   "matrixMul", &errcode);
    shrCheckError(errcode, CL_SUCCESS);

    // 6. Launch the OpenCL kernel
    size_t localWorkSize[2], globalWorkSize[2];
    int wA = WA;
    int wC = WC;
    errcode = clSetKernelArg(clKernel, 0,
                  sizeof(cl_mem), (void *)&d_C);
    errcode |= clSetKernelArg(clKernel, 1,
                  sizeof(cl_mem), (void *)&d_A);
    errcode |= clSetKernelArg(clKernel, 2,
                  sizeof(cl_mem), (void *)&d_B);
    errcode |= clSetKernelArg(clKernel, 3,
                  sizeof(int), (void *)&wA);
    errcode |= clSetKernelArg(clKernel, 4,
                  sizeof(int), (void *)&wC);
    shrCheckError(errcode, CL_SUCCESS);

    localWorkSize[0] = 16;
    localWorkSize[1] = 16;
    globalWorkSize[0] = 1024;
    globalWorkSize[1] = 1024;
    errcode = clEnqueueNDRangeKernel(clCommandQue,
                  clKernel, 2, NULL, globalWorkSize,
                  localWorkSize, 0, NULL, NULL);
    shrCheckError(errcode, CL_SUCCESS);

    // 7. Retrieve the result from the device
    errcode = clEnqueueReadBuffer(clCommandQue,
                  d_C, CL_TRUE, 0, mem_size_C,
                  h_C, 0, NULL, NULL);
    shrCheckError(errcode, CL_SUCCESS);

    // 8. clean up memory
    free(h_A);
    free(h_B);
    free(h_C);
    clReleaseMemObject(d_A);
    clReleaseMemObject(d_B);
    clReleaseMemObject(d_C);
    free(clDevices);
    free(clMatrixMul);
    clReleaseContext(clGPUContext);
    clReleaseKernel(clKernel);
    clReleaseProgram(clProgram);
    clReleaseCommandQueue(clCommandQue);
    return 0;
}
Nearly 200 lines of code
What is C++ AMP 
● C++ Accelerated Massive Parallelism 
○ Designed for data-level parallelism 
○ Extension of C++11, proposed by Microsoft 
○ An open specification with multiple implementations aiming at standardization 
■ MS Visual Studio 2013 
■ MCW CLAMP 
● GPU data modeled as C++14-like containers for multidimensional arrays 
● GPU kernels modeled as C++11 lambdas
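
Before the matrix example later in the deck, a minimal 1-D sketch gives the flavor (assuming the standard <amp.h> header and the concurrency namespace shared by MSVC and CLAMP):

#include <amp.h>                        // C++ AMP header
#include <vector>
using namespace concurrency;

int main() {
    std::vector<int> v(1024, 1);
    array_view<int, 1> av(1024, v);     // wrap host data in a GPU-visible container
    parallel_for_each(av.extent,        // one logical GPU thread per element
        [=](index<1> idx) restrict(amp) {
            av[idx] *= 2;
        });
    av.synchronize();                   // copy results back into v
    return 0;
}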
Comparisons 

|       | C++AMP | Thrust | Bolt | OpenACC | SYCL |
|-------|--------|--------|------|---------|------|
| Intro | simple, elegant, performance(?), proposed by Microsoft | library, proposed by NVIDIA (CUDA) | library, proposed by AMD | annotations and pragmas, proposed by PGI | wrapper for OpenCL, proposed by Codeplay |
Matrix Multiplication in C++AMP 

void MultiplyWithAMP(int* aMatrix, int* bMatrix, int* productMatrix,
                     int ha, int hb, int hc) {
    array_view<int, 2> a(ha, hb, aMatrix);
    array_view<int, 2> b(hb, hc, bMatrix);
    array_view<int, 2> product(ha, hc, productMatrix);
    parallel_for_each(
        product.extent,
        [=](index<2> idx) restrict(amp) {
            int row = idx[0];
            int col = idx[1];
            for (int inner = 0; inner < hb; inner++) {  // was "inner < 2", left over from the fixed-size example
                product[idx] += a(row, inner) * b(inner, col);
            }
        }
    );
    product.synchronize();
}
(For contrast, the slide shows the OpenCL host and device code from the earlier slides alongside the C++AMP version.)
Only ~20 lines of code. 
But what about performance?
C++AMP programming model 

void MultiplyWithAMP(int* aMatrix, int* bMatrix, int* productMatrix) {
    array_view<int, 2> a(3, 2, aMatrix);
    array_view<int, 2> b(2, 3, bMatrix);
    array_view<int, 2> product(3, 3, productMatrix);
    parallel_for_each(
        product.extent,
        [=](index<2> idx) restrict(amp) {
            int row = idx[0];
            int col = idx[1];
            for (int inner = 0; inner < 2; inner++) {
                product[idx] += a(row, inner) * b(inner, col);
            }
        }
    );
    product.synchronize();
}

GPU data modeled as data containers (array_view)
C++AMP programming model 

void MultiplyWithAMP(int* aMatrix, int* bMatrix, int* productMatrix) {
    array_view<int, 2> a(3, 2, aMatrix);
    array_view<int, 2> b(2, 3, bMatrix);
    array_view<int, 2> product(3, 3, productMatrix);
    parallel_for_each(
        product.extent,
        [=](index<2> idx) restrict(amp) {
            int row = idx[0];
            int col = idx[1];
            for (int inner = 0; inner < 2; inner++) {
                product[idx] += a(row, inner) * b(inner, col);
            }
        }
    );
    product.synchronize();
}

Execution interface: marking an implicitly parallel region for GPU execution
C++AMP programming model 

void MultiplyWithAMP(int* aMatrix, int* bMatrix, int* productMatrix) {
    array_view<int, 2> a(3, 2, aMatrix);
    array_view<int, 2> b(2, 3, bMatrix);
    array_view<int, 2> product(3, 3, productMatrix);
    parallel_for_each(
        product.extent,
        [=](index<2> idx) restrict(amp) {
            int row = idx[0];
            int col = idx[1];
            for (int inner = 0; inner < 2; inner++) {
                product[idx] += a(row, inner) * b(inner, col);
            }
        }
    );
    product.synchronize();
}

Kernels modeled as lambdas; arguments are implicitly modeled as captured variables
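
Note that restrict() is not limited to lambdas; a plain function restricted to both targets can be called from host code and from inside a kernel alike (a small sketch):

// Compiled for both the CPU and the GPU: restrict(amp, cpu) lets this
// function be called from normal host code and from inside
// parallel_for_each kernels.
int twice(int x) restrict(amp, cpu) {
    return x * 2;
}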
MCW C++AMP (CLAMP) 
● Clang/LLVM-based 
○ Translates C++AMP code to OpenCL C code and generates an OpenCL SPIR file 
○ Plus a template library 
● Runtime support: gmac / OpenCL / HSA Okra 
● An open-source project 
○ One of only two C++ AMP implementations recognized by the HSA Foundation (the other is MSVC) 
○ Supported by Microsoft and the HSA Foundation
MCW C++ AMP Compiler 
● Device path 
○ Generates OpenCL C code via CBackend 
○ Emits the kernel function 
● Host path 
○ Prepares to launch the kernel 

Diagram: C++ AMP source code → Clang/LLVM 3.3 → Device Code + Host Code
Execution process 

Diagram: C++ AMP source code → Clang/LLVM 3.3 → Device Code; C++ AMP source code → Clang/LLVM 3.3 → Host Code. Both sit on top of the gmac and OpenCL runtimes (labeled "our work" on the slide).
gmac 
● Unified virtual address space in software 
● Can have high overhead sometimes 
● In HSA (AMD Kaveri), gmac is no longer needed
Compiling C++AMP to OpenCL 
● C++AMP → LLVM IR → subset of C 
● Argument passing (lambda capture vs. function calls) 
● Explicit vs. implicit memory transfer 
● The heavy lifting is done by the compiler and runtime
lambda capture 

struct add {
    int a;
    add(int a) : a(a) {}
    int operator()(int x) const {
        return a + x;
    }
};

int main(void)
{
    int a = 3;
    auto fn = [=] (int x) { return a + x; };  // equivalent to add(a)
    int c = fn(3);
    return 0;
}

Captured variables must be placed on the argument list of the OpenCL kernel.
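
By the same desugaring, the matrix kernel's lambda is conceptually a functor whose members are the three captured array_views (a sketch of what the compiler sees, not the literal generated code):

// Conceptual desugaring of the matrix kernel's lambda: the captured
// array_views become members, and operator() is the kernel body.
struct MatrixMulFunctor {
    array_view<int, 2> a, b, product;
    void operator()(index<2> idx) const restrict(amp) {
        int row = idx[0];
        int col = idx[1];
        for (int inner = 0; inner < 2; inner++)
            product[idx] += a(row, inner) * b(inner, col);
    }
};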
What do we need to do? 
● Kernel function 
○ Emit the kernel function with the required arguments 
● On the host side 
○ A function that recursively traverses the captured object and appends its members to the OpenCL argument list 
● On the device side 
○ Reconstruct the object in device code for later use
Example 

struct A { int a; };
struct B : A { int b; };
struct C { B b; int c; };

struct C c;
c.c = 100;
auto fn = [=] () { int qq = c.c; };
Kernel code 

__kernel void fn(int a, int b, int c)  // the slide omits the kernel name; "fn" is a placeholder
{
    C c(a, b, c);  // rebuild the captured object from the flattened arguments
    ...
}
Deserialization constructor 

struct C
{
    B b;
    int c;
    C (int a, int b, int c) : c(c), b(a, b) {}
};
Serialization constructor 

struct C
{
    B b;
    int c;
    void __cxxamp_serialize(Concurrency::Serialize s) {
        b.__cxxamp_serialize(s);    // serialize the nested member first
        s.Append(sizeof(int), &c);  // then append this object's own field
    }
};
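
On the host side, such a Serialize object only needs to forward each Append to clSetKernelArg with an incrementing argument index. A plausible sketch, assuming the runtime wraps OpenCL directly (everything here except __cxxamp_serialize/Append is hypothetical):

// Hypothetical sketch of the runtime's Serialize helper: walk the
// captured object and push each field into the next OpenCL kernel
// argument slot.
#include <CL/cl.h>
#include <cstddef>

namespace Concurrency {
class Serialize {
    cl_kernel kernel_;
    cl_uint next_arg_;   // next OpenCL argument index
public:
    explicit Serialize(cl_kernel k) : kernel_(k), next_arg_(0) {}
    void Append(size_t size, const void *data) {
        clSetKernelArg(kernel_, next_arg_++, size, data);
    }
};
} // namespace Concurrency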
Translation 

parallel_for_each(product.extent,
    [=](index<2> idx) restrict(amp) {
        int row = idx[0];
        int col = idx[1];
        for (int inner = 0; inner < 2; inner++) {
            product[idx] += a(row, inner) * b(inner, col);
        }
    }
);

__kernel void
matrixMul(__global float* C, __global float* A,
          __global float* B, int wA, int wB)
{
    int tx = get_global_id(0);
    int ty = get_global_id(1);
    float value = 0;
    for (int k = 0; k < wA; ++k)
    {
        float elementA = A[ty * wA + k];
        float elementB = B[k * wB + tx];
        value += elementA * elementB;
    }
    C[ty * wA + tx] = value;
}

● Append the arguments 
● Set the index 
● Emit the kernel function 
● Implicit memory management
Future work 
● Future work for us 
○ restrict(auto) 
○ HSA-related work
Future work for you 
● Try this out!! 
● Many of us are spoiled and don't want to go back to writing OpenCL directly anymore :-) 
● Related links 
○ Driver 
○ Clang 
○ sandbox