Basic Example: Matrix Multiplication using CUDA

General-purpose Programming of Massively Parallel Graphics Processors
Shiraz University, Spring 2010
Instructor: Reza Azimi

Some materials/slides are adapted from:
Andreas Moshovos' Course at the University of Toronto
UIUC course by Wen-Mei Hwu and David Kirk




Matrix Multiplication on the Host (CPU)



void MatrixMulOnHost(float* M, float* N, float* P, int Width) {
  for (int i = 0; i < Width; ++i)
    for (int j = 0; j < Width; ++j) {
      float sum = 0;
      for (int k = 0; k < Width; ++k) {
        float a = M[i * Width + k];
        float b = N[k * Width + j];
        sum += a * b;
      }
      P[i * Width + j] = sum;
    }
}

[Figure: element (i, j) of P is the dot product of row i of M and column j of N; all three matrices are WIDTH x WIDTH.]

Adapted From: David Kirk/NVIDIA and Wen-mei W. Hwu, UIUC
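As a quick sanity check of the row-major indexing above, a minimal host-only driver can be used; the main() wrapper and the 2 x 2 values below are illustrative assumptions, not part of the original slides.

  #include <stdio.h>

  /* Multiply two 2x2 matrices with MatrixMulOnHost and print the result. */
  int main(void) {
    float M[4] = {1, 2,
                  3, 4};
    float N[4] = {5, 6,
                  7, 8};
    float P[4];

    MatrixMulOnHost(M, N, P, 2);

    /* Expected output: 19 22 / 43 50 */
    printf("%g %g\n%g %g\n", P[0], P[1], P[2], P[3]);
    return 0;
  }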
A First CUDA Kernel: One Thread per Output Element




__global__
void MatrixMulKernel(float* d_M,
                     float* d_N,
                     float* d_P,
                     int Width) {
  int row = threadIdx.y;
  int col = threadIdx.x;
  float P_val = 0;
  for (int k = 0; k < Width; ++k) {
    float M_elem = d_M[row * Width + k];
    float N_elem = d_N[k * Width + col];
    P_val += M_elem * N_elem;
  }
  d_P[row * Width + col] = P_val;
}

[Figure: thread (threadIdx.y, threadIdx.x) reads row `row` of d_M and column `col` of d_N and writes one element of d_P; all matrices are WIDTH x WIDTH.]

Adapted From: David Kirk/NVIDIA and Wen-mei W. Hwu, UIUC




Host Code: Allocating and Copying Matrices to the Device




void MatrixMulOnDevice(float* M,
                       float* N,
                       float* P,
                       int Width)
{
   int matrix_size = Width * Width * sizeof(float);
   float *d_M, *d_N, *d_P;

   // Allocate and load M and N into device memory
   cudaMalloc((void**)&d_M, matrix_size);
   cudaMemcpy(d_M, M, matrix_size, cudaMemcpyHostToDevice);

   cudaMalloc((void**)&d_N, matrix_size);
   cudaMemcpy(d_N, N, matrix_size, cudaMemcpyHostToDevice);

   // Allocate P on the device
   cudaMalloc((void**)&d_P, matrix_size);

Adapted From: David Kirk/NVIDIA and Wen-mei W. Hwu, UIUC




Host Code: Launching the Kernel




   // Setup the execution configuration
   dim3 dimGrid(1, 1);
   dim3 dimBlock(Width, Width);

   // Launch the device computation threads!
   MatrixMulKernel<<<dimGrid, dimBlock>>>(d_M, d_N, d_P, Width);

   // Copy back the results from device to host
   cudaMemcpy(P, d_P, matrix_size, cudaMemcpyDeviceToHost);

   // Free up the device memory matrices
   cudaFree(d_P);
   cudaFree(d_M);
   cudaFree(d_N);
}

© David Kirk/NVIDIA and Wen-mei W. Hwu, 2007-2009
ECE 498AL Spring 2010, University of Illinois, Urbana-Champaign
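The slide code omits error checking for brevity. In practice each cudaMalloc/cudaMemcpy call can fail; a small wrapper of the following kind is common (the CHECK_CUDA macro is an illustrative sketch, not part of the original slides):

  #include <stdio.h>
  #include <stdlib.h>
  #include <cuda_runtime.h>

  /* Abort with a readable message if a CUDA runtime call fails. */
  #define CHECK_CUDA(call)                                           \
    do {                                                             \
      cudaError_t err = (call);                                      \
      if (err != cudaSuccess) {                                      \
        fprintf(stderr, "CUDA error %s at %s:%d\n",                  \
                cudaGetErrorString(err), __FILE__, __LINE__);        \
        exit(1);                                                     \
      }                                                              \
    } while (0)

  /* Example use inside MatrixMulOnDevice:
       CHECK_CUDA(cudaMalloc((void**)&d_M, matrix_size));
       CHECK_CUDA(cudaMemcpy(d_M, M, matrix_size, cudaMemcpyHostToDevice));
  */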




Only One Thread Block Used

- One block of threads computes matrix d_P
- Each thread:
  - Loads a row of matrix d_M
  - Loads a column of matrix d_N
  - Performs one multiply and add for each pair of d_M and d_N elements
  - Computes one element of d_P
- Size of the matrix is limited by the number of threads allowed in a thread block

[Figure: Grid 1 contains a single Block 1; thread (2, 2) combines one row of d_M with one column of d_N to produce one element of d_P.]

Adapted From: David Kirk/NVIDIA and Wen-mei W. Hwu, UIUC
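To make the limit concrete (my arithmetic, using the 512 threads-per-block figure quoted later in this deck): with one thread per element of d_P, a single block needs Width x Width threads, so

  Width * Width <= 512  =>  Width <= 22
  (22 x 22 = 484 threads fits in one block; 23 x 23 = 529 does not)

Anything larger requires either more work per thread or more than one block, which is exactly what the next slides do.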




Assigning a Tile of Work to Each Thread




[Figure: d_P is divided into TILE_WIDTH x TILE_WIDTH tiles; thread (threadIdx.y, threadIdx.x) is assigned one tile.]

Each thread is assigned to a tile of TILE_WIDTH x TILE_WIDTH entries of d_P.




Solution 1: Give Each Thread More Work

__global__ void MatrixMulKernel(float* d_M,
                                float* d_N,
                                float* d_P,
                                int Width) {
  int start_row = threadIdx.y * TILE_WIDTH;
  int end_row   = start_row + TILE_WIDTH;
  int start_col = threadIdx.x * TILE_WIDTH;
  int end_col   = start_col + TILE_WIDTH;

  for (int row = start_row; row < end_row; row++) {
    for (int col = start_col; col < end_col; col++) {
      float P_val = 0;
      for (int k = 0; k < Width; ++k) {
        float M_elem = d_M[row * Width + k];
        float N_elem = d_N[k * Width + col];
        P_val += M_elem * N_elem;
      }
      d_P[row * Width + col] = P_val;
    }
  }
}

With one block we utilize only one multiprocessor!
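The slides do not show the launch configuration for this version, so the following is a sketch under my own assumptions (the TILE_WIDTH value and variable names are illustrative): the grid is still a single block, but each thread now covers a TILE_WIDTH x TILE_WIDTH tile, so only Width/TILE_WIDTH threads are needed per dimension (assuming Width is a multiple of TILE_WIDTH).

  #define TILE_WIDTH 4   // illustrative tile size

  // One block; each thread computes a TILE_WIDTH x TILE_WIDTH tile of d_P.
  dim3 dimGrid(1, 1);
  dim3 dimBlock(Width / TILE_WIDTH, Width / TILE_WIDTH);
  MatrixMulKernel<<<dimGrid, dimBlock>>>(d_M, d_N, d_P, Width);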




Using Multiple Thread Blocks


[Figure: d_P is partitioned among thread blocks. blockIdx.x/blockIdx.y select a block of blockDim.x x blockDim.y threads, and threadIdx.x/threadIdx.y select the thread within the block; each element is assigned to a thread, and each sub-matrix is assigned to a thread block.]




Solution 2: Use Multiple Thread Blocks

__global__
void MatrixMulKernel(float* d_M,
                     float* d_N,
                     float* d_P,
                     int Width) {
  int row = blockIdx.y * blockDim.y + threadIdx.y;
  int col = blockIdx.x * blockDim.x + threadIdx.x;
  float P_val = 0;

  for (int k = 0; k < Width; ++k) {
    float M_elem = d_M[row * Width + k];
    float N_elem = d_N[k * Width + col];
    P_val += M_elem * N_elem;
  }
  d_P[row * Width + col] = P_val;
}




Host Code: Launching the Kernel with Multiple Blocks




   int block_size = 64;

   // Setup the execution configuration
   dim3 dimGrid(Width/block_size, Width/block_size);
   dim3 dimBlock(block_size, block_size);

   // Launch the device computation threads!
   MatrixMulKernel<<<dimGrid, dimBlock>>>(d_M, d_N, d_P, Width);

   ...

Size of the matrix is limited by the number of threads allowed on a device.
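Two caveats worth noting here (my observations, not from the slides): the integer division in dimGrid silently drops trailing rows and columns when Width is not a multiple of block_size, and the block must respect the per-block thread limit quoted on the next slide. With a 512-thread limit, block_size = 64 gives 64 x 64 = 4096 threads per block and the launch would fail, while block_size = 16 (256 threads) fits. A hedged guard that could sit just before the launch inside MatrixMulOnDevice:

  // Illustrative checks (names follow the slide code).
  if (Width % block_size != 0) {
      fprintf(stderr, "Width must be a multiple of block_size\n");
      return;
  }
  if (block_size * block_size > 512) {   // per-block thread limit quoted in this deck
      fprintf(stderr, "block_size^2 = %d exceeds 512 threads per block\n",
              block_size * block_size);
      return;
  }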




How Many Threads Does the Device Support?




- Max Number of Threads per Block: 512
- Max Number of Blocks per Streaming Multiprocessor: 8
- Number of Streaming Multiprocessors: 30
- Total Number of Threads Available = 30 x 8 x 512 = 122,880

Let me double-check this!
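These figures describe a particular GT200-class GPU (compute capability 1.x); the per-device numbers can be queried at runtime. A sketch using the CUDA runtime API (note that the 8-blocks-per-SM figure is an architectural limit of compute capability 1.x and is not exposed as a device property, so it is hard-coded below as an assumption):

  #include <stdio.h>
  #include <cuda_runtime.h>

  int main(void) {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);      // properties of device 0

    int blocks_per_sm = 8;                  // compute 1.x limit (assumed)
    long total = (long)prop.multiProcessorCount * blocks_per_sm
               * prop.maxThreadsPerBlock;

    printf("SMs: %d, max threads/block: %d, estimated total threads: %ld\n",
           prop.multiProcessorCount, prop.maxThreadsPerBlock, total);
    return 0;
  }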




Combining Thread Blocks and Tiles



[Figure: d_P is partitioned among thread blocks (blockIdx.x/blockIdx.y, blockDim.x x blockDim.y threads each); within a block, thread (threadIdx.y, threadIdx.x) covers a TILE_WIDTH x TILE_WIDTH tile.]




Combining Thread Blocks and Tiles (Kernel)




__global__ void MatrixMulKernel(float* d_M,
                                float* d_N,
                                float* d_P,
                                int Width) {
  // Each thread computes a TILE_WIDTH x TILE_WIDTH tile of d_P; a block of
  // blockDim.y x blockDim.x threads therefore covers
  // (blockDim.y * TILE_WIDTH) x (blockDim.x * TILE_WIDTH) elements.
  int start_row = (blockDim.y * blockIdx.y + threadIdx.y) * TILE_WIDTH;
  int end_row   = start_row + TILE_WIDTH;
  int start_col = (blockDim.x * blockIdx.x + threadIdx.x) * TILE_WIDTH;
  int end_col   = start_col + TILE_WIDTH;

  for (int row = start_row; row < end_row; row++) {
    for (int col = start_col; col < end_col; col++) {
      float P_val = 0;
      for (int k = 0; k < Width; ++k) {
        float M_elem = d_M[row * Width + k];
        float N_elem = d_N[k * Width + col];
        P_val += M_elem * N_elem;
      }
      d_P[row * Width + col] = P_val;
    }
  }
}
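The slides do not show the launch configuration for this combined version; the sketch below assumes Width is a multiple of block_size * TILE_WIDTH, and both constants are illustrative choices rather than values from the deck.

  #define TILE_WIDTH 4        // tile computed by each thread (illustrative)
  int block_size = 16;        // 16 x 16 = 256 threads per block, within the 512 limit

  // Each block covers (block_size * TILE_WIDTH)^2 elements of d_P.
  dim3 dimBlock(block_size, block_size);
  dim3 dimGrid(Width / (block_size * TILE_WIDTH),
               Width / (block_size * TILE_WIDTH));
  MatrixMulKernel<<<dimGrid, dimBlock>>>(d_M, d_N, d_P, Width);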




