c++ - cublas surprisingly slow compared to cusparse

Tags: c++ cuda gpu cublas

I am trying to run some tests comparing cusparse and cublas performance at different sparsity levels (on a Titan X). Here is the main code, in a file named "testcusparsevector.cpp":

#include <stdio.h>
#include <iostream>
#include <vector>
#include <cstdlib>
#include <fstream>
#include <time.h>
#include <cuda_runtime.h>
#include <cublas.h>
#include <cusparse_v2.h>
#include <cublas_v2.h>
#include <assert.h>
#define M 6
#define N 5
#define IDX2C(i,j,ld) (((j)*(ld))+(i))


// /home/gpu1/Install/OpenBLAS-0.2.14


#define CHECK_EQ(a,b) do { \
    if ((a) != (b)) { \
        cout <<__FILE__<<" : "<< __LINE__<<" : check failed because "<<a<<"!="<<b<<endl;\
        exit(1);\
    }\
} while(0)

#define CUBLAS_CHECK(condition) \
do {\
    cublasStatus_t status = condition; \
    CHECK_EQ(status, CUBLAS_STATUS_SUCCESS); \
} while(0)

#define CUSPARSE_CHECK(condition)\
do {\
    cusparseStatus_t status = condition; \
    switch(status)\
    {\
        case CUSPARSE_STATUS_NOT_INITIALIZED:\
            cout << "CUSPARSE_STATUS_NOT_INITIALIZED" << endl;\
            break;\
        case CUSPARSE_STATUS_ALLOC_FAILED:\
            cout << "CUSPARSE_STATUS_ALLOC_FAILED" << endl;\
            break;\
        case CUSPARSE_STATUS_INVALID_VALUE:\
            cout << "CUSPARSE_STATUS_INVALID_VALUE" << endl;\
            break;\
        case CUSPARSE_STATUS_ARCH_MISMATCH:\
            cout << "CUSPARSE_STATUS_ARCH_MISMATCH" << endl;\
            break;\
        case CUSPARSE_STATUS_MAPPING_ERROR:\
            cout << "CUSPARSE_STATUS_MAPPING_ERROR" << endl;\
            break;\
        case CUSPARSE_STATUS_EXECUTION_FAILED:\
            cout << "CUSPARSE_STATUS_EXECUTION_FAILED" << endl;\
            break;\
        case CUSPARSE_STATUS_INTERNAL_ERROR:\
            cout << "CUSPARSE_STATUS_INTERNAL_ERROR" << endl;\
            break;\
        case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:\
            cout << "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED" << endl;\
            break;\
        case CUSPARSE_STATUS_ZERO_PIVOT:\
            cout << "CUSPARSE_STATUS_ZERO_PIVOT" << endl;\
    }\
    CHECK_EQ(status, CUSPARSE_STATUS_SUCCESS); \
} while(0)

#define CUDA_CHECK(condition)\
do {\
    cudaError_t error = condition;\
    CHECK_EQ(error, cudaSuccess);\
} while(0)

//check after kernel function
#define CUDA_POST_KERNEL_CHECK CUDA_CHECK(cudaPeekAtLastError())



#define __TIMING__ 1

#if __TIMING__


#define INIT_TIMER  cudaEvent_t start, stop; \
    float milliseconds = 0; \
    float sum = 0;\
    cudaEventCreate( &start );\
    cudaEventCreate( &stop );

#define TIC {  cudaEventRecord( start ); }

#if __CUDNN__
    #define PREDEFNAME "CUDNN"
#else
    #define PREDEFNAME "CUDA"
#endif

#define TOC(a) { cudaEventRecord( stop ); \
        cudaEventSynchronize( stop ); \
        cudaEventElapsedTime( &milliseconds, start, stop );  \
        printf( "GPU Execution time of %s_%s: %f ms\n",PREDEFNAME, a, milliseconds ); \
        sum += milliseconds;\
        fflush(stdout); }

#define CLOSE_TIMER {cudaEventDestroy(start); cudaEventDestroy(stop); }
#endif

using namespace std;

void dispArray(double* array, size_t width, size_t height) {
    for (int i=0; i < height;i++ ) {
        for (int j=0;j < width;j++) {
            cout << array[j*height+i] << ' ';
        }
        cout << endl;
    }
    cout << endl;
}

int main()
{
    srand(time(NULL));
    const int num_loop = 1;
    const int inside_loop = 1000;
    // const int WIDTH = 512*3*3;
    // const int HEIGHT = 512;
    // const int WIDTHOUT = 36;
    const int WIDTH = 4608;
    const int HEIGHT = 512;
    const int WIDTHOUT = 144;
    // const int WIDTH = 18500;
    // const int HEIGHT = 512;
    // const int WIDTHOUT = 1;
    // const int WIDTH = 3;
    // const int HEIGHT = 5;
    // const int WIDTHOUT = 2;
    INIT_TIMER
    ofstream myfile;
    myfile.open("test_sparsity.log");

    cudaError_t cudaStat;    
    cusparseStatus_t stat;
    cusparseHandle_t handle;
    cublasHandle_t handleblas;

    double *devPtrOutput;
    double *devPtrOutput2;
    double *devPtrRand;
    double *devPtrSec;
    CUDA_CHECK(cudaMalloc((void **)&(devPtrOutput), sizeof(double)*HEIGHT*WIDTHOUT));
    CUDA_CHECK(cudaMalloc((void **)&(devPtrOutput2), sizeof(double)*HEIGHT*WIDTHOUT));

    CUDA_CHECK(cudaMalloc((void **)&(devPtrRand), sizeof(double)*WIDTH*WIDTHOUT));
    CUDA_CHECK(cudaMalloc((void **)&(devPtrSec), sizeof(double)*WIDTH*HEIGHT));
    const double alpha=1.0;
    const double beta=0.0;
    double *csrVal;
    int *csrRowPtr;
    int *csrColInd;

    const bool SPARSE = true;
    long a = clock();
    long temp = clock();
    cusparseMatDescr_t descr;
    CUSPARSE_CHECK(cusparseCreateMatDescr(&descr));
    cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL);
    cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO);
    int nnz;
    CUSPARSE_CHECK(cusparseCreate(&handle));
    CUBLAS_CHECK(cublasCreate(&handleblas));
    int *nnzPerRow_gpu;
    CUDA_CHECK(cudaMalloc((void **)&(nnzPerRow_gpu), sizeof(int)*HEIGHT));
    CUDA_CHECK(cudaMalloc((void **)&(csrRowPtr), sizeof(int)*(HEIGHT+1)));
    double density_array[1] = {0.9999};//, 0.8, 0.7, 0.6, 0.5,      0.4, 0.3, 0.2, 0.1 ,0.09,     0.08, 0.07, 0.06, 0.05 ,0.04,     0.03, 0.02, 0.01};
    for (int inddense=0;inddense < 1;inddense++) {
        double DENSITY = density_array[inddense];
        int num_non_zeros = DENSITY * (WIDTH * HEIGHT);

        CUDA_CHECK(cudaMalloc((void **)&(csrColInd), sizeof(int)*num_non_zeros));
        CUDA_CHECK(cudaMalloc((void **)&(csrVal), sizeof(double)*num_non_zeros));
        INIT_TIMER
        for (int iter=0; iter < num_loop;iter++) {
            vector<double> randVec(WIDTH*WIDTHOUT, 0);
            vector<double> secArray(WIDTH*HEIGHT, 0);
            vector<int> temp(WIDTH*HEIGHT, 1);

            for (int j = 0; j < WIDTH*WIDTHOUT; j++) {
                randVec[j]=(double)(rand()%100000)/100;
            }

            for (int x, i = 0; i < num_non_zeros;i++) {
                do
                {
                    x = rand() % (WIDTH*HEIGHT);
                } while(temp[x] == 0);
                temp[x]=0;
                secArray[x]=(double)(rand()%100000)/100;
            }
            int count = 0;
            for(int i=0;i < WIDTH*HEIGHT;i++) {
                if (secArray[i] != 0) {
                    count++;
                }
            }

            // randVec = {2,2,2,3,3,3};
            // secArray = {0,5,0,2,5,8,7,0,0,0,0,2,0,4,4};
            CUDA_CHECK(cudaMemcpy(devPtrRand, &randVec[0], sizeof(double)*WIDTH*WIDTHOUT, cudaMemcpyHostToDevice));
            CUDA_CHECK(cudaMemcpy(devPtrSec, &secArray[0], sizeof(double)*WIDTH*HEIGHT, cudaMemcpyHostToDevice));


            if (SPARSE) {
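                // count nonzeros per row, then convert the dense HEIGHT x WIDTH
                // column-major matrix (lda = HEIGHT) to CSR, all on the device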
                CUSPARSE_CHECK(cusparseDnnz(handle, CUSPARSE_DIRECTION_ROW, HEIGHT, WIDTH, descr, devPtrSec, HEIGHT, nnzPerRow_gpu, &nnz));
                CUSPARSE_CHECK(cusparseDdense2csr(handle, HEIGHT, WIDTH, descr,devPtrSec,HEIGHT,nnzPerRow_gpu,csrVal,csrRowPtr,csrColInd));
            }       
            // vector<double> tempcsrVal(nnz,0);
            // vector<int> tempcsrRowPtr(HEIGHT+1);
            // vector<int> tempcsrColInd(nnz,0);
            // CUDA_CHECK(cudaMemcpy(&tempcsrVal[0], csrVal, sizeof(double)*nnz, cudaMemcpyDeviceToHost));
            // CUDA_CHECK(cudaMemcpy(&tempcsrRowPtr[0], csrRowPtr, sizeof(int)*(HEIGHT+1), cudaMemcpyDeviceToHost));
            // CUDA_CHECK(cudaMemcpy(&tempcsrColInd[0], csrColInd, sizeof(int)*nnz, cudaMemcpyDeviceToHost));
            // for (int i =0; i < nnz;i++) {
                // cout << tempcsrVal[i] << " ";
            // }
            // cout << endl;
            // for (int i =0; i < HEIGHT+1;i++) {
                // cout << tempcsrRowPtr[i] << " ";
            // }
            // cout << endl;
            // for (int i =0; i < nnz;i++) {
                // cout << tempcsrColInd[i] << " ";
            // }
            // cout << endl;
            cudaDeviceSynchronize();
            TIC
            for (int i=0 ; i < inside_loop;i++) {
                if (WIDTHOUT == 1) {
                    // TIC
                    CUSPARSE_CHECK(cusparseDcsrmv(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                    HEIGHT, WIDTH, nnz, &alpha, descr, csrVal, csrRowPtr, csrColInd, 
                    devPtrRand, &beta, devPtrOutput));
                    // TOC("csrmv")
                } else {
                    // TIC
                    CUSPARSE_CHECK(cusparseDcsrmm(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 
                        HEIGHT, WIDTHOUT, WIDTH, nnz, &alpha, descr, csrVal, csrRowPtr, 
                        csrColInd, devPtrRand, WIDTH, &beta, devPtrOutput, HEIGHT));
                    // TOC("csrmm")
                }
            }
            TOC("csr")
            TIC
            for (int i=0 ; i < inside_loop;i++) {
                if (WIDTHOUT == 1) {
                    // TIC
                    CUBLAS_CHECK(cublasDgemv(handleblas, CUBLAS_OP_N, HEIGHT, WIDTH, &alpha, devPtrSec, HEIGHT , devPtrRand, 1, &beta, devPtrOutput2, 1));
                    // TOC("dgemv")
                } else {
                    // TIC
                    CUBLAS_CHECK(cublasDgemm(handleblas, CUBLAS_OP_N, CUBLAS_OP_N, HEIGHT, WIDTHOUT, WIDTH, &alpha, devPtrSec, HEIGHT, devPtrRand, WIDTH, &beta, devPtrOutput2, HEIGHT));
                    // TOC("dgemm")
                }
            }
            TOC("blas")


            #if 0
            vector<double> output(HEIGHT*WIDTHOUT, 0);
            vector<double> output2(HEIGHT*WIDTHOUT, 0);
            CUDA_CHECK(cudaMemcpy(&output[0], devPtrOutput, sizeof(double)*HEIGHT*WIDTHOUT, cudaMemcpyDeviceToHost));
            CUDA_CHECK(cudaMemcpy(&output2[0], devPtrOutput2, sizeof(double)*HEIGHT*WIDTHOUT, cudaMemcpyDeviceToHost));
            dispArray(&output[0], WIDTHOUT, HEIGHT);
            cout << endl;
            for (int i=0;i < WIDTHOUT * HEIGHT;i++) {
                if (output[i] != output2[i]) {
                    cout << "error: " << i << " " << (output[i] - output2[i]) << " " << output[i] << endl;
                }
            }
            #endif

        }

        cout << DENSITY << " " << sum/num_loop << endl;
        myfile << DENSITY << " " << sum/num_loop << endl;
        cudaFree(csrColInd);
        cudaFree(csrVal);
    }
    myfile.close();
    cudaFree(csrRowPtr);
    cudaFree(nnzPerRow_gpu);
    cudaFree(devPtrOutput);
    cudaFree(devPtrOutput2);
    cudaFree(devPtrRand);
    cudaFree(devPtrSec);
    cusparseDestroyMatDescr(descr);
    cublasDestroy(handleblas);
    cusparseDestroy(handle);

}

But after compiling the code with

g++ -std=c++1y -O3 -I/usr/local/cuda/include -o testcusparsevector testcusparsevector.cpp -L/usr/local/cuda/lib64 -lcudart -lcublas -lcusparse

this is the output:

GPU Execution time of CUDA_csr: 4818.447266 ms
GPU Execution time of CUDA_blas: 5024.459961 ms

This should mean that even with a density of 0.999, cusparseDcsrmm is still faster than cublasDgemm. I have checked that the results are correct, and compared with other examples, the problem seems to come from cublas, which is far too slow.

Do you have any idea where this is coming from?

EDIT: I tried changing the values to float, and the results are more in line with what I expected. Apparently, cublas is not made for double computation...

Thanks in advance.

Best Answer

The Titan X (like all current members of the Maxwell GPU family) has a 1:32 ratio of double-precision to single-precision floating-point throughput.

Sparse matrix operations are generally memory-bandwidth-bound, whereas dense matrix-matrix multiplication is the classic example of a compute-bound problem.

So in your example you have taken a problem that is normally compute-bound, and by running it as a sparse matrix multiplication you have moved it onto a processor whose memory bandwidth is relatively large but whose double-precision compute throughput is relatively small.
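
To put rough numbers on that, here is a minimal back-of-envelope sketch (illustrative only, not part of the benchmark) for the sizes in the question (HEIGHT=512, WIDTH=4608, density 0.9999); it compares the minimum bytes that must be moved for the A matrix, which is where the two paths differ:

#include <cstdio>

int main() {
    // Sizes from the question: A is HEIGHT x WIDTH, fp64, density ~0.9999
    const double rows = 512.0, cols = 4608.0, density = 0.9999;
    const double dense_bytes = rows * cols * 8.0;   // one fp64 per element
    const double nnz = density * rows * cols;
    const double csr_bytes = nnz * (8.0 + 4.0)      // fp64 value + int column index per nonzero
                           + (rows + 1.0) * 4.0;    // int row pointers
    printf("dense A: %.1f MB\n", dense_bytes / 1e6);  // ~18.9 MB
    printf("CSR   A: %.1f MB\n", csr_bytes / 1e6);    // ~28.3 MB, ~50% larger at near-full density
    return 0;
}

At near-full density the "sparse" path actually moves about 50% more bytes while doing essentially the same flops. Moreover, the dense GEMM amortizes its traffic through tiled reuse in shared memory and registers, which is exactly what makes it compute-bound; the CSR kernel cannot, so it stays bandwidth-bound. That cusparseDcsrmm still wins here is therefore a statement about the Titan X's weak fp64 throughput, not about cusparse being fast.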

That combination of circumstances can blur the line between the two APIs, whereas ordinarily the CUBLAS API would win this comparison comfortably.

If you switch the code to use float instead of double (which I believe you have already tried), you will see CUBLAS win again. Likewise, if you run the code as-is on a GPU with a different single- to double-precision throughput ratio, you will also see CUBLAS win again.
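
For reference, a minimal sketch of what the single-precision variants of the two timed calls look like; the F-suffixed pointers (csrValF, devPtrRandF, devPtrSecF, devPtrOutputF, devPtrOutput2F) are hypothetical float allocations mirroring the double buffers in your code:

const float alphaf = 1.0f, betaf = 0.0f;
// single-precision sparse * dense, same arguments as the cusparseDcsrmm call
CUSPARSE_CHECK(cusparseScsrmm(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
    HEIGHT, WIDTHOUT, WIDTH, nnz, &alphaf, descr, csrValF, csrRowPtr,
    csrColInd, devPtrRandF, WIDTH, &betaf, devPtrOutputF, HEIGHT));
// single-precision dense GEMM, same arguments as the cublasDgemm call
CUBLAS_CHECK(cublasSgemm(handleblas, CUBLAS_OP_N, CUBLAS_OP_N,
    HEIGHT, WIDTHOUT, WIDTH, &alphaf, devPtrSecF, HEIGHT,
    devPtrRandF, WIDTH, &betaf, devPtrOutput2F, HEIGHT));

(The CSR arrays would likewise have to be rebuilt with cusparseSnnz/cusparseSdense2csr.)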

apparently, cublas is not made for double computation...

Rather than putting it that way, I would say that the GTX Titan X is not (primarily) made for double computation. Try a Tesla K80, K40, or another GPU with a double-to-single throughput ratio closer to parity.
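
If you want to confirm which device and architecture a given machine has, here is a small standalone sketch using the standard CUDA runtime API:

#include <cstdio>
#include <cuda_runtime.h>

int main() {
    int count = 0;
    cudaGetDeviceCount(&count);
    for (int d = 0; d < count; d++) {
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, d);
        // compute capability 5.x (Maxwell, e.g. the Titan X) has 1:32 fp64:fp32 throughput;
        // 3.5/3.7 (Tesla K40/K80) is 1:3
        printf("device %d: %s (compute capability %d.%d)\n",
               d, prop.name, prop.major, prop.minor);
    }
    return 0;
}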

Here is the output of your program running on a (non-boosted) Tesla K40:

$ ./testcusparsevector
GPU Execution time of CUDA_csr: 8870.386719 ms
GPU Execution time of CUDA_blas: 1045.211792 ms

Disclaimer: I have not tried to study your code in detail. I looked it over and saw nothing obviously wrong, but there may be issues I did not spot.

Regarding "c++ - cublas surprisingly slow compared to cusparse", the original question is on Stack Overflow: https://stackoverflow.com/questions/36331291/
