CUDA:具有共享内存和非 block 大小倍数的矩阵大小的平铺矩阵-矩阵乘法

我正在努力熟悉 CUDA 编程,并且乐在其中。我目前正在阅读这份讲解矩阵乘法的 pdf,其中分别给出了使用共享内存和不使用共享内存的实现。两个版本的完整代码可以在这里找到。这些代码与 CUDA 矩阵乘法示例中的代码几乎完全相同。非共享内存版本可以在任意矩阵大小下运行,与 block 大小无关;而共享内存版本要求矩阵大小必须是 block 大小的倍数(我将 block 大小设置为 4,默认值为 16)。

pdf 末尾建议的问题之一是更改它,以便共享内存版本也可以使用 block 大小的非倍数。我认为这将是一个简单的索引检查,就像在非共享版本中一样:

// Global row and column index derived from this thread's block and thread IDs.
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
// NOTE(review): `>` still lets row == A.height and col == B.width through;
// confirm whether `>=` was intended.
if(row > A.height || col > B.width) return;

但这行不通。这是完整的代码,去掉了 main 方法(有点乱,抱歉),我对它做了一些修改:

// Host-side driver for the matrix product C = A * B.
//
// Copies A and B into freshly allocated device buffers, launches MatMulKernel
// on a grid of BLOCK_SIZE x BLOCK_SIZE thread blocks (rounded up so that the
// whole of C is covered), copies the result back into C.elements and releases
// the device buffers. The status of every CUDA call is printed to stdout.
//
// A, B : input matrices whose `elements` point to host memory.
// C    : output matrix; C.elements must point to host memory with room for
//        C.width * C.height floats.
void MatMul(const Matrix A, const Matrix B, Matrix C) {
  // Load A and B to device memory
  Matrix d_A;
  d_A.width = d_A.stride = A.width;
  d_A.height = A.height;
  // Widen before multiplying so the byte count is computed in size_t.
  size_t size = (size_t)A.width * A.height * sizeof(float);
  cudaError_t err = cudaMalloc(&d_A.elements, size);
  printf("CUDA malloc A: %s\n",cudaGetErrorString(err));
  err = cudaMemcpy(d_A.elements, A.elements, size, cudaMemcpyHostToDevice);
  printf("Copy A to device: %s\n",cudaGetErrorString(err));

  Matrix d_B;
  d_B.width = d_B.stride = B.width;
  d_B.height = B.height;
  size = (size_t)B.width * B.height * sizeof(float);
  err = cudaMalloc(&d_B.elements, size);
  printf("CUDA malloc B: %s\n",cudaGetErrorString(err));
  err = cudaMemcpy(d_B.elements, B.elements, size, cudaMemcpyHostToDevice);
  printf("Copy B to device: %s\n",cudaGetErrorString(err));

  // Allocate C in device memory
  Matrix d_C;
  d_C.width = d_C.stride = C.width;
  d_C.height = C.height;
  size = (size_t)C.width * C.height * sizeof(float);
  err = cudaMalloc(&d_C.elements, size);
  printf("CUDA malloc C: %s\n",cudaGetErrorString(err));

  // Invoke kernel; the grid is rounded up (ceil-div) so that matrix sizes
  // that are not a multiple of BLOCK_SIZE are still fully covered.
  dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
  dim3 dimGrid((B.width + dimBlock.x - 1) / dimBlock.x,
               (A.height + dimBlock.y - 1) / dimBlock.y);
  MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);
  // A bad launch configuration is only reported by cudaGetLastError();
  // faults during execution surface at the synchronizing call.
  err = cudaGetLastError();
  if (err == cudaSuccess)
    err = cudaDeviceSynchronize();
  printf("Run kernel: %s\n", cudaGetErrorString(err));

  // Read C from device memory
  err = cudaMemcpy(C.elements, d_C.elements, size, cudaMemcpyDeviceToHost);
  printf("Copy C off of device: %s\n",cudaGetErrorString(err));

  // Free device memory
  cudaFree(d_A.elements);
  cudaFree(d_B.elements);
  cudaFree(d_C.elements);
}

// Get a matrix element.
// Returns A.elements[row * A.stride + col]. No bounds check is performed, so
// the caller must make sure (row, col) addresses valid storage.
__device__ float GetElement(const Matrix A, int row, int col) {
  return A.elements[row * A.stride + col];
}

// Set a matrix element.
// Stores `value` at A.elements[row * A.stride + col]. No bounds check is
// performed, so the caller must make sure (row, col) addresses valid storage.
__device__ void SetElement(Matrix A, int row, int col, float value) {
  A.elements[row * A.stride + col] = value;
}

// Get the BLOCK_SIZExBLOCK_SIZE sub-matrix Asub of A that is
// located col sub-matrices to the right and row sub-matrices down
// from the upper-left corner of A.
// The returned Matrix is a view: it shares A's storage and stride and only
// offsets the element pointer. Its width/height are always BLOCK_SIZE, even
// when the tile extends past the edge of A.
__device__ Matrix GetSubMatrix(Matrix A, int row, int col) {
  Matrix Asub;
  Asub.width = BLOCK_SIZE;
  Asub.height = BLOCK_SIZE;
  Asub.stride = A.stride;
  Asub.elements = &A.elements[A.stride * BLOCK_SIZE * row + BLOCK_SIZE * col];
  return Asub;
}

// Matrix multiplication kernel called by MatMul()
//
// Each thread block computes one BLOCK_SIZE x BLOCK_SIZE tile of C and each
// thread one element of that tile, staging tiles of A and B in shared memory.
// The shared tiles are indexed with threadIdx, so the kernel expects to be
// launched with BLOCK_SIZE x BLOCK_SIZE threads per block.
//
// NOTE(review): this is the variant whose results are reported as wrong for
// matrix sizes that are not a multiple of BLOCK_SIZE. Its logic is kept as
// written; the individual problems are flagged inline.
__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C) {
  // Block row and column
  int blockRow = blockIdx.y;
  int blockCol = blockIdx.x;

  // Global row/column of the element of C this thread maps to.
  int rowTest = blockIdx.y * blockDim.y + threadIdx.y;
  int colTest = blockIdx.x * blockDim.x + threadIdx.x;
  // NOTE(review): `>` still admits rowTest == A.height and colTest == B.width.
  // Threads that return here also skip the shared-memory loads and the
  // __syncthreads() barriers below, although the remaining threads of the
  // block still read the As/Bs entries those threads would have written.
  if (rowTest > A.height || colTest > B.width)
    return;

  // Each thread block computes one sub-matrix Csub of C
  Matrix Csub = GetSubMatrix(C, blockRow, blockCol);

  // Each thread computes one element of Csub
  // by accumulating results into Cvalue
  float Cvalue = 0.0f;
  // Thread row and column within Csub
  int row = threadIdx.y;
  int col = threadIdx.x;
  // Loop over all the sub-matrices of A and B that are
  // required to compute Csub
  // Multiply each pair of sub-matrices together
  // and accumulate the results
  // (the trip count is ceil(A.width / BLOCK_SIZE), so a partial last tile is
  // visited as well)
  for (int m = 0; m < (BLOCK_SIZE + A.width - 1)/BLOCK_SIZE; ++m) {
    // Get sub-matrix Asub of A
    Matrix Asub = GetSubMatrix(A, blockRow, m);

    // Get sub-matrix Bsub of B
    Matrix Bsub = GetSubMatrix(B, m, blockCol);

    // Shared memory used to store Asub and Bsub respectively
    __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

    // Load Asub and Bsub from device memory to shared memory
    // Each thread loads one element of each sub-matrix
    // NOTE(review): these loads are not range-checked against A.width /
    // B.height, so for a partial tile they fetch elements that do not belong
    // to the tile instead of contributing zeros.
    As[row][col] = GetElement(Asub, row, col);
    Bs[row][col] = GetElement(Bsub, row, col);

    // Synchronize to make sure the sub-matrices are loaded
    // before starting the computation
    __syncthreads();

    // Multiply Asub and Bsub together
    for (int e = 0; e < BLOCK_SIZE; ++e)
      Cvalue += As[row][e] * Bs[e][col];

    // Synchronize to make sure that the preceding
    // computation is done before loading two new
    // sub-matrices of A and B in the next iteration
    __syncthreads();
  }

  // Write Csub to device memory
  // Each thread writes one element
  SetElement(Csub, row, col, Cvalue);
}

我所做的值得注意的改动:我在 MatMulKernel 中添加了一个检查,用来判断当前线程是否试图处理 C 中不存在的位置。这似乎不起作用。虽然它确实改变了结果,但这些变化看不出任何规律,只是靠后的条目(x 或 y 值较大的)似乎受到的影响更大(而且我得到了更多的非整数结果)。我还更改了原来的 dimGrid 计算方法,以及 MatMulKernel 中关于 m 的循环条件(之前只是用宽度或高度除以 block 大小,这似乎是错误的)。



当矩阵维度不是 tile(分块)维度的倍数时,可能会出现某些 tile 只部分覆盖矩阵的情况。对于这些没有完全落在矩阵内的 tile,其中位于矩阵范围之外的元素应当被正确地置零。因此,将您的代码扩展到任意大小的矩阵并不难,但这并不等同于一个简单的索引检查。下面贴出我自己实现的、支持任意大小矩阵的平铺矩阵-矩阵乘法内核:

// Tiled matrix multiplication C = A * B for arbitrarily sized row-major
// matrices.
//
// A is ARows x ACols, B is BRows x BCols, C is CRows x CCols. Each thread
// accumulates one element of C; tiles of A and B are staged in shared memory.
// Tile entries that fall outside A or B are loaded as zero, so partial tiles
// at the matrix edges contribute nothing to the dot product.
//
// The shared tiles are indexed with threadIdx and the global row/column are
// derived from TILE_DIM, so the kernel expects TILE_DIM x TILE_DIM threads
// per block.
__global__ void MatMul(float* A, float* B, float* C, int ARows, int ACols, int BRows,
    int BCols, int CRows, int CCols)
{
    float CValue = 0.0f;

    // Global row/column of the element of C handled by this thread.
    int Row = blockIdx.y*TILE_DIM + threadIdx.y;
    int Col = blockIdx.x*TILE_DIM + threadIdx.x;

    __shared__ float As[TILE_DIM][TILE_DIM];
    __shared__ float Bs[TILE_DIM][TILE_DIM];

    // ceil(ACols / TILE_DIM) tiles along the shared dimension.
    for (int k = 0; k < (TILE_DIM + ACols - 1)/TILE_DIM; k++) {

         // Every thread stores to As/Bs (a value or zero) and nobody returns
         // early, so all threads of the block reach the barriers below.
         if (k*TILE_DIM + threadIdx.x < ACols && Row < ARows)
             As[threadIdx.y][threadIdx.x] = A[Row*ACols + k*TILE_DIM + threadIdx.x];
         else
             As[threadIdx.y][threadIdx.x] = 0.0f;

         if (k*TILE_DIM + threadIdx.y < BRows && Col < BCols)
             Bs[threadIdx.y][threadIdx.x] = B[(k*TILE_DIM + threadIdx.y)*BCols + Col];
         else
             Bs[threadIdx.y][threadIdx.x] = 0.0f;

         // Both tiles must be completely written before any thread reads them.
         __syncthreads();

         for (int n = 0; n < TILE_DIM; ++n)
             CValue += As[threadIdx.y][n] * Bs[n][threadIdx.x];

         // Nobody may overwrite the tiles for the next iteration while other
         // threads are still reading the current ones.
         __syncthreads();
    }

    // Only threads that map to a real element of C write a result.
    if (Row < CRows && Col < CCols)
        C[Row*CCols + Col] = CValue;
}

