CUDA – First Programs
“Hello, world” is traditionally the first program we write. We can do the same for CUDA. Here it is:
In file hello.cu:
#include "stdio.h"
int main()
{
printf("Hello, worldn");
return 0;
}
On our Tesla machine, you can compile and run this with:
$ nvcc hello.cu
$ ./a.out
You can change the output file name with the -o flag: nvcc -o hello hello.cu
If you don't want to type the leading ./ (which refers to the current working directory) every time, you can edit your .bashrc file to add the current directory to your path. Add
export PATH=$PATH:.
to the .bashrc file. Some would recommend not doing this for security reasons.
You might consider this program to be cheating, since it doesn’t really use any CUDA functionality.
Everything runs on the host. However, the point is that CUDA C programs can do everything a regular C
program can do.
Here is a slightly more interesting (but inefficient and only useful as an example) program that adds two
numbers together using a kernel function:
#include "stdio.h"
__global__ void add(int a, int b, int *c)
{
*c = a + b;
}
int main()
{
int a,b,c;
int *dev_c;
a=3;
b=4;
cudaMalloc((void**)&dev_c, sizeof(int));
add<<<1,1>>>(a,b,dev_c);
cudaMemcpy(&c, dev_c, sizeof(int), cudaMemcpyDeviceToHost);
printf("%d + %d is %dn", a, b, c);
cudaFree(dev_c);
return 0;
}
To do in class: walk through the program; show a similar program in straight C and note that it runs much faster! Why? (For a single addition, the kernel launch and the host/device memory transfers cost far more than the arithmetic itself.)
cudaMalloc returns cudaSuccess if it succeeded; we could check this return value to make sure the allocation worked before the kernel runs.
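For example, a minimal check might look like this (the error-handling style is just one option; cudaGetErrorString is a standard CUDA runtime call that turns the error code into a readable message):
cudaError_t err = cudaMalloc((void**)&dev_c, sizeof(int));
if (err != cudaSuccess)
{
    // bail out rather than launch the kernel with an invalid pointer
    printf("cudaMalloc failed: %s\n", cudaGetErrorString(err));
    return 1;
}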
Example: Summing Vectors
This is a simple problem. Given two vectors (i.e. arrays), we would like to add them together in a third
array. For example:
A = {0, 2, 4, 6, 8}
B = {1, 1, 2, 2, 1}
Then A + B =
C = {1, 3, 6, 8, 9}
In this example the array is 5 elements long, so our approach will be to create 5 different threads. The
first thread is responsible for computing C[0] = A[0] + B[0]. The second thread is responsible for
computing C[1] = A[1] + B[1], and so forth.
Here is how we can do this with traditional C code:
#include "stdio.h"
#define N 10
void add(int *a, int *b, int *c)
{
int tID = 0;
while (tID < N)
{
c[tID] = a[tID] + b[tID];
tID += 1;
}
}
int main()
{
int a[N], b[N], c[N];
// Fill Arrays
for (int i = 0; i < N; i++)
{
a[i] = i;
b[i] = 1;
}
add (a, b, c);
for (int i = 0; i < N; i++)
{
printf("%d + %d = %dn", a[i], b[i], c[i]);
}
return 0;
}
This is a rather roundabout way to add two arrays; the reason is that it translates a little more directly to the CUDA version. To compile and run it we use g++ (since declaring the loop variable inside the for statement is a C++-style notation that pre-C99 C compilers reject). Here is the CUDA version:
#include "stdio.h"
#define N 10
__global__ void add(int *a, int *b, int *c)
{
int tID = blockIdx.x;
if (tID < N)
{
c[tID] = a[tID] + b[tID];
}
}
int main()
{
int a[N], b[N], c[N];
int *dev_a, *dev_b, *dev_c;
cudaMalloc((void **) &dev_a, N*sizeof(int));
cudaMalloc((void **) &dev_b, N*sizeof(int));
cudaMalloc((void **) &dev_c, N*sizeof(int));
// Fill Arrays
for (int i = 0; i < N; i++)
{
a[i] = i;
b[i] = 1;
}
cudaMemcpy(dev_a, a, N*sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(dev_b, b, N*sizeof(int), cudaMemcpyHostToDevice);
add<<<N,1>>>(dev_a, dev_b, dev_c);
cudaMemcpy(c, dev_c, N*sizeof(int), cudaMemcpyDeviceToHost);
for (int i = 0; i < N; i++)
{
printf("%d + %d = %dn", a[i], b[i], c[i]);
}
cudaFree(dev_a); // free device memory, as in the first example
cudaFree(dev_b);
cudaFree(dev_c);
return 0;
}
blockIdx.x gives us the block ID, which ranges from 0 to N-1. What if we used add<<<1,N>>>
instead? Then we would index by the thread ID instead, using the variable threadIdx.x.
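A minimal sketch of that variant (only the kernel's index variable and the launch configuration change; the rest of main stays the same):
__global__ void add(int *a, int *b, int *c)
{
    int tID = threadIdx.x; // index of this thread within the single block
    if (tID < N)
    {
        c[tID] = a[tID] + b[tID];
    }
}
with the launch changed to add<<<1,N>>>(dev_a, dev_b, dev_c); so that one block runs N threads.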
As another example, let’s add two 2D arrays. We can define a 2D array of ints as follows:
int c[2][3];
The following code illustrates how the 2D array is laid out in memory:
for (int i=0; i < 2; i++)
for (int j=0; j< 3; j++)
printf("[%d][%d] at %ldn",i,j,&c[i][j]);
Output:
[0][0] at 140733933298160
[0][1] at 140733933298164
[0][2] at 140733933298168
[1][0] at 140733933298172
[1][1] at 140733933298176
[1][2] at 140733933298180
We can see that we have a layout where the next cell in the j dimension occupies the next sequential
integer in memory, where an int is 4 bytes:
c[0][0] at &c        c[0][1] at &c + 4    c[0][2] at &c + 8
c[1][0] at &c + 12   c[1][1] at &c + 16   c[1][2] at &c + 20
In general, the address of a cell can be computed by:
&c + (sizeof(int) * size-of-j-dimension * i) + (sizeof(int) * j)
In our example the size of the j dimension is 3. For example, the address of the cell at c[1][1] would be computed as the base address + (4*3*1) + (4*1) = &c + 16.
C does this addressing for us when we use the 2D array notation; but if we flatten the array ourselves into one dimension, we can compute INDEX = i*WIDTH + j
and then access the element via c[INDEX].
CUDA requires us to allocate device memory as a one-dimensional array, so we use the mapping above to treat that one-dimensional allocation as a 2D array.
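For instance, the same mapping on the host looks like this (a minimal sketch; WIDTH and c_flat are illustrative names, not from the program below):
#define WIDTH 3
int c_flat[2*WIDTH];      // one-dimensional storage for a 2x3 array
int i = 1, j = 1;
c_flat[i*WIDTH + j] = 42; // same cell as c[1][1] in the 2D layout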
To make the mapping a little easier in the kernel function, we can declare the blocks to be in a grid with the same dimensions as the 2D array. The built-in variables blockIdx.x and blockIdx.y then give the column and row of the element each block is responsible for.
#include "stdio.h"
#define COLUMNS 3
#define ROWS 2
__global__ void add(int *a, int *b, int *c)
{
int x = blockIdx.x;
int y = blockIdx.y;
int i = (COLUMNS*y) + x;
c[i] = a[i] + b[i];
}
int main()
{
int a[ROWS][COLUMNS], b[ROWS][COLUMNS], c[ROWS][COLUMNS];
int *dev_a, *dev_b, *dev_c;
cudaMalloc((void **) &dev_a, ROWS*COLUMNS*sizeof(int));
cudaMalloc((void **) &dev_b, ROWS*COLUMNS*sizeof(int));
cudaMalloc((void **) &dev_c, ROWS*COLUMNS*sizeof(int));
for (int y = 0; y < ROWS; y++) // Fill Arrays
for (int x = 0; x < COLUMNS; x++)
{
a[y][x] = x;
b[y][x] = y;
}
cudaMemcpy(dev_a, a, ROWS*COLUMNS*sizeof(int),
cudaMemcpyHostToDevice);
cudaMemcpy(dev_b, b, ROWS*COLUMNS*sizeof(int),
cudaMemcpyHostToDevice);
dim3 grid(COLUMNS,ROWS);
add<<<grid,1>>>(dev_a, dev_b, dev_c);
cudaMemcpy(c, dev_c, ROWS*COLUMNS*sizeof(int),
cudaMemcpyDeviceToHost);
for (int y = 0; y < ROWS; y++) // Output Arrays
{
for (int x = 0; x < COLUMNS; x++)
{
printf("[%d][%d]=%d ",y,x,c[y][x]);
}
printf("n");
}
cudaFree(dev_a); // free device memory
cudaFree(dev_b);
cudaFree(dev_c);
return 0;
}
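As a sanity check: since a[y][x] = x and b[y][x] = y, every element of the result should satisfy c[y][x] = x + y, so the first printed row should be 0 1 2 and the second 1 2 3. Getting those values confirms the grid-to-index mapping is right.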
