C++AMP on Linux 
Miller Lee
About Me 
● Miller Lee 
● Junior student at NCTU CS 
● Interests: C/C++, PL, CA, OS, compilers, parallel programming, optimization
Why C++ AMP? 
● GPUs can be 10x+ faster than CPUs for parallel code 
● CUDA and OpenCL are still too complex/verbose for programmers
GPU computing requires explicit data transfers
What we need in GPU programming 
1. Put data-parallel code into a kernel for the GPU to execute 
2. Pass the arguments to the GPU 
○ We cannot pass arguments on the stack 
3. An index to identify the current thread 
4. Move data between GPU and CPU memory
OpenCL as an example
Device code in OpenCL 

__kernel void
matrixMul(__global float* C, __global float* A,
          __global float* B, int wA, int wB)
{
    int tx = get_global_id(0);
    int ty = get_global_id(1);
    float value = 0;
    for (int k = 0; k < wA; ++k)
    {
        float elementA = A[ty * wA + k];
        float elementB = B[k * wB + tx];
        value += elementA * elementB;
    }
    C[ty * wA + tx] = value;
}
Host code in OpenCL 1.2 
1. Allocate and initialize memory on the host side 
2. Initialize OpenCL 
3. Allocate device memory and move the data 
4. Load and build the device code 
5. Launch the kernel 
a. Append the arguments 
6. Move the data back from the device
int
main(int argc, char** argv)
{
    // set seed for rand()
    srand(2006);

    // 1. allocate host memory for matrices A and B
    unsigned int size_A = WA * HA;
    unsigned int mem_size_A = sizeof(float) * size_A;
    float* h_A = (float*) malloc(mem_size_A);
    unsigned int size_B = WB * HB;
    unsigned int mem_size_B = sizeof(float) * size_B;
    float* h_B = (float*) malloc(mem_size_B);

    // 2. initialize host memory
    randomInit(h_A, size_A);
    randomInit(h_B, size_B);

    // 3. allocate host memory for the result C
    unsigned int size_C = WC * HC;
    unsigned int mem_size_C = sizeof(float) * size_C;
    float* h_C = (float*) malloc(mem_size_C);

    // 4. Initialize OpenCL
    // OpenCL specific variables
    cl_context clGPUContext;
    cl_command_queue clCommandQue;
    cl_program clProgram;
    cl_kernel clKernel;        // used but never declared in the original listing
    size_t dataBytes;
    size_t kernelLength;
    cl_int errcode;

    // OpenCL device memory for matrices
    cl_mem d_A;
    cl_mem d_B;
    cl_mem d_C;

    clGPUContext = clCreateContextFromType(0,
                       CL_DEVICE_TYPE_GPU,
                       NULL, NULL, &errcode);
    shrCheckError(errcode, CL_SUCCESS);

    // get the list of GPU devices associated with the context
    errcode = clGetContextInfo(clGPUContext,
                  CL_CONTEXT_DEVICES, 0, NULL,
                  &dataBytes);
    cl_device_id *clDevices = (cl_device_id *) malloc(dataBytes);
    errcode |= clGetContextInfo(clGPUContext,
                   CL_CONTEXT_DEVICES, dataBytes,
                   clDevices, NULL);
    shrCheckError(errcode, CL_SUCCESS);

    // create a command queue
    clCommandQue = clCreateCommandQueue(clGPUContext,
                       clDevices[0], 0, &errcode);
    shrCheckError(errcode, CL_SUCCESS);

    // set up device memory
    d_C = clCreateBuffer(clGPUContext,
              CL_MEM_READ_WRITE,
              mem_size_C, NULL, &errcode);   // was mem_size_A in the original
    d_A = clCreateBuffer(clGPUContext,
              CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
              mem_size_A, h_A, &errcode);
    d_B = clCreateBuffer(clGPUContext,
              CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
              mem_size_B, h_B, &errcode);

    // 5. Load and build the OpenCL kernel
    char *clMatrixMul = oclLoadProgSource("kernel.cl",
                            "// My comment\n",
                            &kernelLength);
    shrCheckError(clMatrixMul != NULL, shrTRUE);
    clProgram = clCreateProgramWithSource(clGPUContext,
                    1, (const char **)&clMatrixMul,
                    &kernelLength, &errcode);
    shrCheckError(errcode, CL_SUCCESS);
    errcode = clBuildProgram(clProgram, 0,
                  NULL, NULL, NULL, NULL);
    shrCheckError(errcode, CL_SUCCESS);
    clKernel = clCreateKernel(clProgram,
                   "matrixMul", &errcode);
    shrCheckError(errcode, CL_SUCCESS);

    // 6. Launch the OpenCL kernel
    size_t localWorkSize[2], globalWorkSize[2];
    int wA = WA;
    int wC = WC;
    errcode = clSetKernelArg(clKernel, 0,
                  sizeof(cl_mem), (void *)&d_C);
    errcode |= clSetKernelArg(clKernel, 1,
                  sizeof(cl_mem), (void *)&d_A);
    errcode |= clSetKernelArg(clKernel, 2,
                  sizeof(cl_mem), (void *)&d_B);
    errcode |= clSetKernelArg(clKernel, 3,
                  sizeof(int), (void *)&wA);
    errcode |= clSetKernelArg(clKernel, 4,
                  sizeof(int), (void *)&wC);
    shrCheckError(errcode, CL_SUCCESS);

    localWorkSize[0] = 16;
    localWorkSize[1] = 16;
    globalWorkSize[0] = 1024;
    globalWorkSize[1] = 1024;
    errcode = clEnqueueNDRangeKernel(clCommandQue,
                  clKernel, 2, NULL, globalWorkSize,
                  localWorkSize, 0, NULL, NULL);
    shrCheckError(errcode, CL_SUCCESS);

    // 7. Retrieve the result from the device
    errcode = clEnqueueReadBuffer(clCommandQue,
                  d_C, CL_TRUE, 0, mem_size_C,
                  h_C, 0, NULL, NULL);
    shrCheckError(errcode, CL_SUCCESS);

    // 8. clean up memory
    free(h_A);
    free(h_B);
    free(h_C);
    clReleaseMemObject(d_A);
    clReleaseMemObject(d_B);
    clReleaseMemObject(d_C);
    free(clDevices);
    free(clMatrixMul);
    clReleaseContext(clGPUContext);
    clReleaseKernel(clKernel);
    clReleaseProgram(clProgram);
    clReleaseCommandQueue(clCommandQue);
    return 0;
}
Nearly 200 lines of code
What is C++ AMP 
● C++ Accelerated Massive Parallelism 
○ Designed for data-level parallelism 
○ Extension of C++11, proposed by Microsoft 
○ An open specification with multiple implementations aiming at standardization 
■ MS Visual Studio 2013 
■ MCW CLAMP 
● GPU data modeled as C++14-like containers for multidimensional arrays 
● GPU kernels modeled as C++11 lambdas
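
Before the matrix example later in the deck, a minimal 1-D sketch gives the flavor (assuming the standard <amp.h> header and the concurrency namespace shared by MSVC and CLAMP):

#include <amp.h>                        // C++ AMP header
#include <vector>
using namespace concurrency;

int main() {
    std::vector<int> v(1024, 1);
    array_view<int, 1> av(1024, v);     // wrap host data in a GPU-visible container
    parallel_for_each(av.extent,        // one logical GPU thread per element
        [=](index<1> idx) restrict(amp) {
            av[idx] *= 2;
        });
    av.synchronize();                   // copy results back into v
    return 0;
}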
Comparisons 

|       | C++AMP | Thrust | Bolt | OpenACC | SYCL |
|-------|--------|--------|------|---------|------|
| Intro | simple, elegant, performance(?), proposed by Microsoft | library, proposed by NVIDIA (CUDA) | library, proposed by AMD | annotations and pragmas, proposed by PGI | wrapper for OpenCL, proposed by Codeplay |
Matrix Multiplication in C++AMP 

void MultiplyWithAMP(int* aMatrix, int* bMatrix, int* productMatrix,
                     int ha, int hb, int hc) {
    array_view<int, 2> a(ha, hb, aMatrix);
    array_view<int, 2> b(hb, hc, bMatrix);
    array_view<int, 2> product(ha, hc, productMatrix);
    parallel_for_each(
        product.extent,
        [=](index<2> idx) restrict(amp) {
            int row = idx[0];
            int col = idx[1];
            for (int inner = 0; inner < hb; inner++) {  // was "inner < 2", left over from the fixed-size example
                product[idx] += a(row, inner) * b(inner, col);
            }
        }
    );
    product.synchronize();
}
(For contrast, the slide shows the OpenCL host and device code from the earlier slides alongside the C++AMP version.)
Only ~20 lines of code. 
But what about performance?
C++AMP programming model 

void MultiplyWithAMP(int* aMatrix, int* bMatrix, int* productMatrix) {
    array_view<int, 2> a(3, 2, aMatrix);
    array_view<int, 2> b(2, 3, bMatrix);
    array_view<int, 2> product(3, 3, productMatrix);
    parallel_for_each(
        product.extent,
        [=](index<2> idx) restrict(amp) {
            int row = idx[0];
            int col = idx[1];
            for (int inner = 0; inner < 2; inner++) {
                product[idx] += a(row, inner) * b(inner, col);
            }
        }
    );
    product.synchronize();
}

GPU data modeled as data containers (array_view)
C++AMP programming model 

void MultiplyWithAMP(int* aMatrix, int* bMatrix, int* productMatrix) {
    array_view<int, 2> a(3, 2, aMatrix);
    array_view<int, 2> b(2, 3, bMatrix);
    array_view<int, 2> product(3, 3, productMatrix);
    parallel_for_each(
        product.extent,
        [=](index<2> idx) restrict(amp) {
            int row = idx[0];
            int col = idx[1];
            for (int inner = 0; inner < 2; inner++) {
                product[idx] += a(row, inner) * b(inner, col);
            }
        }
    );
    product.synchronize();
}

Execution interface: marking an implicitly parallel region for GPU execution
C++AMP programming model 

void MultiplyWithAMP(int* aMatrix, int* bMatrix, int* productMatrix) {
    array_view<int, 2> a(3, 2, aMatrix);
    array_view<int, 2> b(2, 3, bMatrix);
    array_view<int, 2> product(3, 3, productMatrix);
    parallel_for_each(
        product.extent,
        [=](index<2> idx) restrict(amp) {
            int row = idx[0];
            int col = idx[1];
            for (int inner = 0; inner < 2; inner++) {
                product[idx] += a(row, inner) * b(inner, col);
            }
        }
    );
    product.synchronize();
}

Kernels modeled as lambdas; arguments are implicitly modeled as captured variables
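
Note that restrict() is not limited to lambdas; a plain function restricted to both targets can be called from host code and from inside a kernel alike (a small sketch):

// Compiled for both the CPU and the GPU: restrict(amp, cpu) lets this
// function be called from normal host code and from inside
// parallel_for_each kernels.
int twice(int x) restrict(amp, cpu) {
    return x * 2;
}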
MCW C++AMP (CLAMP) 
● Clang/LLVM-based 
○ Translates C++AMP code to OpenCL C code and generates an OpenCL SPIR file 
○ Plus a template library 
● Runtime support: gmac / OpenCL / HSA Okra 
● An open-source project 
○ One of only two C++ AMP implementations recognized by the HSA Foundation (the other is MSVC) 
○ Supported by Microsoft and the HSA Foundation
MCW C++ AMP Compiler 
● Device path 
○ Generates OpenCL C code via CBackend 
○ Emits the kernel function 
● Host path 
○ Prepares to launch the kernel 

Diagram: C++ AMP source code → Clang/LLVM 3.3 → Device Code + Host Code
Execution process 

Diagram: C++ AMP source code → Clang/LLVM 3.3 → Device Code; C++ AMP source code → Clang/LLVM 3.3 → Host Code. Both sit on top of the gmac and OpenCL runtimes (labeled "our work" on the slide).
gmac 
● Unified virtual address space in software 
● Can have high overhead sometimes 
● In HSA (AMD Kaveri), gmac is no longer needed
Compiling C++AMP to OpenCL 
● C++AMP → LLVM IR → subset of C 
● Argument passing (lambda capture vs. function calls) 
● Explicit vs. implicit memory transfer 
● The heavy lifting is done by the compiler and runtime
lambda capture 

struct add {
    int a;
    add(int a) : a(a) {}
    int operator()(int x) const {
        return a + x;
    }
};

int main(void)
{
    int a = 3;
    auto fn = [=] (int x) { return a + x; };  // equivalent to add(a)
    int c = fn(3);
    return 0;
}

Captured variables must be placed on the argument list of the OpenCL kernel.
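
By the same desugaring, the matrix kernel's lambda is conceptually a functor whose members are the three captured array_views (a sketch of what the compiler sees, not the literal generated code):

// Conceptual desugaring of the matrix kernel's lambda: the captured
// array_views become members, and operator() is the kernel body.
struct MatrixMulFunctor {
    array_view<int, 2> a, b, product;
    void operator()(index<2> idx) const restrict(amp) {
        int row = idx[0];
        int col = idx[1];
        for (int inner = 0; inner < 2; inner++)
            product[idx] += a(row, inner) * b(inner, col);
    }
};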
What do we need to do? 
● Kernel function 
○ Emit the kernel function with the required arguments 
● On the host side 
○ A function that recursively traverses the captured object and appends its members to the OpenCL argument list 
● On the device side 
○ Reconstruct the object in device code for later use
Example 

struct A { int a; };
struct B : A { int b; };
struct C { B b; int c; };

struct C c;
c.c = 100;
auto fn = [=] () { int qq = c.c; };
Kernel code 

__kernel void fn(int a, int b, int c)  // the slide omits the kernel name; "fn" is a placeholder
{
    C c(a, b, c);  // rebuild the captured object from the flattened arguments
    ...
}
Deserialization constructor 

struct C
{
    B b;
    int c;
    C (int a, int b, int c) : c(c), b(a, b) {}
};
Serialization constructor 

struct C
{
    B b;
    int c;
    void __cxxamp_serialize(Concurrency::Serialize s) {
        b.__cxxamp_serialize(s);    // serialize the nested member first
        s.Append(sizeof(int), &c);  // then append this object's own field
    }
};
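
On the host side, such a Serialize object only needs to forward each Append to clSetKernelArg with an incrementing argument index. A plausible sketch, assuming the runtime wraps OpenCL directly (everything here except __cxxamp_serialize/Append is hypothetical):

// Hypothetical sketch of the runtime's Serialize helper: walk the
// captured object and push each field into the next OpenCL kernel
// argument slot.
#include <CL/cl.h>
#include <cstddef>

namespace Concurrency {
class Serialize {
    cl_kernel kernel_;
    cl_uint next_arg_;   // next OpenCL argument index
public:
    explicit Serialize(cl_kernel k) : kernel_(k), next_arg_(0) {}
    void Append(size_t size, const void *data) {
        clSetKernelArg(kernel_, next_arg_++, size, data);
    }
};
} // namespace Concurrency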
Translation 

parallel_for_each(product.extent,
    [=](index<2> idx) restrict(amp) {
        int row = idx[0];
        int col = idx[1];
        for (int inner = 0; inner < 2; inner++) {
            product[idx] += a(row, inner) * b(inner, col);
        }
    }
);

__kernel void
matrixMul(__global float* C, __global float* A,
          __global float* B, int wA, int wB)
{
    int tx = get_global_id(0);
    int ty = get_global_id(1);
    float value = 0;
    for (int k = 0; k < wA; ++k)
    {
        float elementA = A[ty * wA + k];
        float elementB = B[k * wB + tx];
        value += elementA * elementB;
    }
    C[ty * wA + tx] = value;
}

● Append the arguments 
● Set the index 
● Emit the kernel function 
● Implicit memory management
Future work 
● Future work for us 
○ restrict(auto) 
○ HSA-related work
Future work for you 
● Try this out!! 
● Many of us are spoiled and don't want to go back to writing OpenCL directly anymore :-) 
● Related links 
○ Driver 
○ Clang 
○ sandbox