这是一个说明问题的最小程序。我使用的是具有 128 个 CUDA 核心的 GTS 250,Windows 7 上的 CUDA 5.0。
void cuda_ops_test(int N, float* R)
{
//Values of input matrix in CPU
fprintf(stderr, "\nValues of R: \n");
for (int i=0; i<N; ++i)
fprintf(stderr, "%f, ", R[i]);
fprintf(stderr, "\n");
//Initialize CUDA/CUBLAS
cublasHandle_t handle;
cublasStatus_t status;
status = cublasCreate(&handle);
if (status == CUBLAS_STATUS_SUCCESS)
fprintf(stderr, "CUBLAS initialization succeeded.\n");
//Allocate device memory
float *dR = 0;
cudaError_t alloc_status;
alloc_status = cudaMalloc((void**)&dR, N*sizeof(dR[0]));
if(alloc_status == cudaSuccess)
fprintf (stderr, "\nDevice memory allocation succeeded.\n");
//Load array into memory
status = cublasSetMatrix(1, N, sizeof(R[0]), R, N, dR, N);
if(status == CUBLAS_STATUS_SUCCESS)
fprintf (stderr, "\nDevice write succeeded.\n");
//First operation: summation
float ans;
status = cublasSasum(handle, N, dR, 1, &ans);
if (status == CUBLAS_STATUS_SUCCESS)
fprintf(stderr, "\ncublasSasum produced no error. Sum of dR: %d\n", ans);
else
fprintf(stderr, "\ncublasSasum error: %d.\n", status);
//Second operation: y = ax+y
const float alpha = 2.0;
status = cublasSaxpy(handle, N,
&alpha,
dR, 1,
dR, 1);
if (status == CUBLAS_STATUS_SUCCESS)
fprintf(stderr, "\ncublasSaxpy produced no error.\n");
else
fprintf(stderr, "\ncublasSaxpy error: %d.\n", status);
// transfer device dR to host R
status = cublasGetMatrix (1, N, sizeof(dR[0]), dR, N, R, N);
if(status == CUBLAS_STATUS_SUCCESS)
fprintf (stderr, "\nDevice read succeded\n");
//Display post-op values of R
fprintf(stderr, "\nValues of R, after cublasSaxpy: \n");
for (int i=0; i<N; ++i)
fprintf(stderr, "%f, ", R[i]);
fprintf(stderr, "\n");
//Attempt to zero with cudaMemset
cudaError_t stat = cudaMemset(dR, 0, N*sizeof(dR[0]));
if (stat==cudaSuccess)
fprintf(stderr, "\nZeroing with cudaMemset on R produced no error.\n");
//Again transfer device dR to host R, after zeroing
status = cublasGetMatrix (1, N, sizeof(dR[0]), dR, N, R, N);
if(status == CUBLAS_STATUS_SUCCESS)
fprintf (stderr, "\nDevice read succeded.\n");
//Display values of R again
fprintf(stderr, "\nValues of R, after zeroing with cudaMemset: \n");
for (int i=0; i<N; ++i)
fprintf(stderr, "%f, ", R[i]);
fprintf(stderr, "\n");
cudaFree(dR);
}
以下是输出,表明当数据加载到 GPU 内存中时,实际上没有发生任何操作:
Values of R: 0.123020, 0.367809, 0.834681, 0.035096, 0.517014, 0.662984, 0.426221, 0.104678,
CUBLAS initialization succeeded.
Device memory allocation succeeded.
cublasSasum produced no error. Sum of dR: 0
cublasSaxpy produced no error.
Values of R, after cublasSaxpy: 0.123020, 0.367809, 0.834681, 0.035096, 0.517014, 0.662984, 0.426221, 0.104678,
Zeroing with cudaMemset on R produced no error.
Values of R, after zeroing with cudaMemset: 0.123020, 0.367809, 0.834681, 0.035096, 0.517014, 0.662984, 0.426221, 0.104678,
发生什么事了? (节日快乐。:) )
最佳答案
您的代码有几个错误。正如我在评论中提到的,您错过了这样一个事实:您的设备读取和设备写入消息都没有被打印出来,因为这些函数(cublasSetMatrix、cublasGetMatrix)实际上失败了。
要修复 cublasSetMatrix 和 cublasGetMatrix 调用,请将 lda 和 ldb 参数更改为 1:
status = cublasSetMatrix(1, N, sizeof(R[0]), R, 1, dR, 1);
...
status = cublasGetMatrix (1, N, sizeof(dR[0]), dR, 1, R, 1);
这些函数的文档指出:“源矩阵 A 和目标矩阵 B 的前导维度分别在 lda 和 ldb 中给出。前导维度表示已分配矩阵的行数。”此处传输的是 1 行 N 列的矩阵,因此前导维度应为 1,而不是 N。
在打印 cublasSasum 操作结果的那一行中,您的 printf 语句错误地使用了 int 格式说明符来打印浮点值,这是行不通的。请将 %d 更改为 %f:
fprintf(stderr, "\ncublasSasum produced no error. Sum of dR: %f\n", ans);
通过这些更改,我能够得到合理的结果:
Values of R:
0.123020, 0.367809, 0.834681, 0.035096, 0.517014, 0.662984, 0.426221, 0.104678,
CUBLAS initialization succeeded.
Device memory allocation succeeded.
Device write succeeded.
cublasSasum produced no error. Sum of dR: 3.071503
cublasSaxpy produced no error.
Device read succeded
Values of R, after cublasSaxpy:
0.369060, 1.103427, 2.504043, 0.105288, 1.551042, 1.988952, 1.278663, 0.314034,
Zeroing with cudaMemset on R produced no error.
Device read succeded.
Values of R, after zeroing with cudaMemset:
0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
请注意,这个 SO 问答提供了一个方便实用的 cublas 错误解析函数的提示。将它构建到 cublas 函数调用的包装器或错误检查宏中并不困难。
关于cuBlas、cuda 功能不起作用,没有产生错误,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/14034524/