免责声明:我是 cuda 初学者。
typedef struct
{
int row_;
int col_;
float* element_;
int step;
}Matrix_t;
#define BLOCK_SIZE 64
__device__ float getElement(const Matrix_t A, int row, int col);
__device__ Matrix_t getSubMat(Matrix_t A, int row, int col);
__device__ void setElement(Matrix_t A, int row, int col, float value);
__global__ void MatrixDot(Matrix_t A, Matrix_t B, float* dot_);
float Matrix_dot_(float* M_dev_1, float* M_dev_2, int Number_of_cols, int Number_of_rows, int step);
Matrix_t 用于通过 ptr() 运算符将 cv::cuda::GpuMat 链接到 C 接口(interface),以获取指向元素的 GPU 指针。
__device__ float getElement(const Matrix_t A, int row, int col)
{
return A.element_[row* A.step + col];
}
__device__ void setElement(Matrix_t A, int row, int col, float value)
{
A.element_[row*A.step + col] = value;
}
__device__ Matrix_t getSubMat(Matrix_t A, int row, int col)
{
Matrix_t A_sub;
A_sub.row_ = BLOCK_SIZE;
A_sub.col_ = BLOCK_SIZE;
A_sub.step = A.step;
A_sub.element_ = &A.element_[A.step * BLOCK_SIZE * row + BLOCK_SIZE * col];
return A_sub;
}
这是内核:
__global__ void MatrixDot(Matrix_t A, Matrix_t B, float* dot_)
{
int blockRow = blockIdx.y;
int blockCol = blockIdx.x;
float SubDotValue = 0.0f;
int row = threadIdx.y;
int col = threadIdx.x;
for(int m = 0; m < (A.row_ / BLOCK_SIZE); ++m)
{
//get subA & subB
Matrix_t A_sub = getSubMat(A, blockRow, m);
Matrix_t B_sub = getSubMat(B, blockRow, m);
//set Asub & Bsub to the __shared__ memory
__shared__ float ASub[BLOCK_SIZE][BLOCK_SIZE];
__shared__ float BSub[BLOCK_SIZE][BLOCK_SIZE];
ASub[row][col] = getElement(A_sub, row, col);
BSub[row][col] = getElement(B_sub, row, col);
//Synchronize before calculations:
__syncthreads();
//Get the dot product of the vector Asub[] Bsub[]
for(int el_ = 0; el_ < BLOCK_SIZE; ++el_)
{
SubDotValue += ASub[row][el_] * BSub[row][el_];
}
__syncthreads();
}
dot_[row] = SubDotValue;
}
和包装器:
float Matrix_dot_(float* M_dev_1,float* M_dev_2, int Number_of_cols, int Number_of_rows, int step)
{
float retval = 0;
float* retval_partial;
float* retval_device;
Matrix_t A;
A.col_ = Number_of_cols;
A.row_ = Number_of_rows;
A.element_ = M_dev_1;
A.step = step;
Matrix_t B;
B.col_ = Number_of_cols;
B.row_ = Number_of_rows;
B.element_ = M_dev_2;
B.step = step;
retval_partial = (float*)malloc( B.row_*sizeof(float) );
cudaError_t err = cudaMalloc( (void**)&retval_device,B.row_/ BLOCK_SIZE *sizeof(float) );
printf("\n Cuda malloc: %s", cudaGetErrorString(err));
std::cout<<std::flush;
dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
dim3 dimGrid(B.row_ / BLOCK_SIZE, B.col_ / BLOCK_SIZE);
MatrixDot<<<dimGrid, dimBlock>>>(A, B, retval_device);
err = cudaThreadSynchronize();
std::cout<<std::flush;
printf("\n Cuda kernel run: %s", cudaGetErrorString(err));
err = cudaMemcpy(retval_partial, retval_device, B.row_ / BLOCK_SIZE* sizeof(float), cudaMemcpyDeviceToHost);
printf("\n Cuda cudaMemcpy: %s", cudaGetErrorString(err));
err = cudaFree(retval_device);
printf("\n Cuda cudaFree: %s", cudaGetErrorString(err));
for(int i = 0; i<B.row_/ BLOCK_SIZE ; ++i)
{
retval+=retval_partial[i];
}
free(retval_partial);
return retval;
}
和主要的:
int main(int argc, const char * argv[])
{
cv::cuda::DeviceInfo devInfo;
cv::cuda::setDevice(devInfo.deviceID());
cv::Mat cudatestA = cv::Mat(64*3, 64*3, CV_32FC1, 2);
cv::Mat cudatestB = cv::Mat(64*3, 64*3, CV_32FC1, 2);
double tr = (double) cv::getTickCount();
double res = cudatestA.dot(cudatestB);
tr = ((double)cv::getTickCount()-tr)/(double)cv::getTickFrequency();
cv::cuda::GpuMat ctA(cudatestA);
cv::cuda::GpuMat ctB(cudatestB);
double tm_ = (double) cv::getTickCount();
float res_m = 0;
res_m = Matrix_dot_((float* )ctA.ptr(), (float*)ctB.ptr(), ctA.cols, ctA.rows, ctA.step);
tm_ = ((double)cv::getTickCount()-tm_)/(double)cv::getTickFrequency();
printf("\nCPU: %0.4fms, res: %0.4f\nGPU_M: %0.4fms, res: %0.4f\n", tr*1000.0f, res, tm_*1000.0f,res_m);
return 0;
}
我目前卡在几个点上:
1)它总是输出0。
2) 它只能对定义的 BLOCK_SIZE (64) 的矩阵 M*N 倍数起作用。
对于 1) 我不知道我的逻辑在哪里中断,我可以毫无问题地让点积在 vector 上工作,但是每行之间的跨度引起的矩阵问题阻止我使用代码(代码删除为该网站告诉我代码太多)。
最佳答案
部分答案:
在您的内核中,您没有进行良好的求和,也没有采用良好的元素,并且您的 dim 似乎倒置了
__global__ void MatrixDot(Matrix_t A, Matrix_t B, float* dot_)
{
//int blockRow = blockIdx.y;
//int blockCol = blockIdx.x;
int blockRow = blockIdx.x;
int blockCol = blockIdx.y;
float SubDotValue = 0.0f;
//int row = threadIdx.y;
//int col = threadIdx.x;
int row = threadIdx.x;
int col = threadIdx.y;
for(int m = 0; m < (A.row_ / BLOCK_SIZE); ++m)
{
//get subA & subB
Matrix_t A_sub = getSubMat(A, m, blockCol);//getSubMat(A, blockRow, m)
Matrix_t B_sub = getSubMat(B, m, blockCol);//getSubMat(B, blockRow, m)
//set Asub & Bsub to the __shared__ memory
__shared__ float ASub[BLOCK_SIZE][BLOCK_SIZE];
__shared__ float BSub[BLOCK_SIZE][BLOCK_SIZE];
ASub[row][col] = getElement(A_sub, row, col);
BSub[row][col] = getElement(B_sub, row, col);
//Synchronize before calculations:
__syncthreads();
//Get the dot product of the vector Asub[] Bsub[]
for(int el_ = 0; el_ < BLOCK_SIZE; ++el_)
{
SubDotValue += ASub[row][el_] * BSub[row][el_];
}
__syncthreads();
}
dot_[blockRow*BLOCK_SIZE + row] = SubDotValue; //dot_[row] = SubDotValue;
}
而且您的包装器也没有分配您需要的大小:
cudaError_t err = cudaMalloc( (void**)&retval_device,B.row_/ BLOCK_SIZE *sizeof(float) );
应该是:
cudaError_t err = cudaMalloc( (void**)&retval_device,B.row_*sizeof(float) );
请注意,其他相关分配也必须更改(懒惰我)。
并且您在 main 中的调用需要将 GpuMat 步长除以 GpuMat 的一个元素的大小
res_m = Matrix_dot_((float* )ctA.ptr(), (float*)ctB.ptr(), ctA.cols, ctA.rows, ctA.step/ctA.elemsize1());
您可能还想更改您的 Matrix_t 结构以使用 const float* 而不是 float 以便能够使用:
GpuMat_.ptr<float>();
代替:
(float*)GpuMat.ptr();
请注意,对于 N 行的矩阵,您将启动 N^2 个线程来执行相同的操作。我没有足够的 Cuda 知识来解决这个问题。
关于c++ - 矩阵点积,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/47309531/