c++ - 快速计算许多标量积

标签 c++ performance algorithm cuda parallel-processing

<分区>

我有一个计算 1-10 百万标量积的程序。

看起来像这样。 ts 和 A 是大约 1000-10000 3D 点的数组(每个元素是一个 3x1 vector )。目前，在 ts.size() = 10,000 和 A.size() = 1000 的情况下，我的代码大约需要 41ms。到目前为止，我还没有进行任何并行化。例如，在 CUDA 中，计算会更快吗？我没有这样的经验。或者还有其他办法吗？谢谢。

for(int i = 0; i< ts.size(); i++){
    for(int j = 0; j< A.size(); j++){
        if(abs(scalarProduct(ts.at(i), A.at(j))) <epsilon){
            score[i] +=1;
        }
    }
}

这是我对标量积的实现。

double scalarProduct(const Point &p1,const Point &p2)
{
return (p1.getX()*p2.getX() + p1.getY()*p2.getY() + p1.getZ()*p2.getZ()) ;
}

我可以改用 Lapack 或 Eigen，将问题表述为矩阵乘法吗？我已经在 Matlab 中做到了，它只慢了 5 倍。任何加速都会很棒。使用 OpenMP，我想我的速度可以4x。

最佳答案

这个答案由两部分组成:

加速许多独立标量积的计算；
解决您的具体问题。

第 1 部分

计算大量独立标量积的问题是一个令人尴尬的并行问题。如果您的目标是仅加速上述标量积，将其余计算保留在 CPU 上，那么我同意 Calvin 的观点，即大部分时间将花在设备-> 大型 N*M< 的内存事务上 结果矩阵。但是，如果您从上述交易中清除时间，加速计算将是值得的。下面的代码显示了这一点，在配备 NVIDIA Kepler K20c 卡的 Intel Xeon E5-2650 2.00 GHz 八核处理器上进行了测试，其时序如下:

CPU: 27ms;     GPU (without D2H transaction): 0.08ms;     GPU (with D2H transaction): 23ms

#include <stdio.h>
#include <time.h>

#define BLOCKSIZE_X 16
#define BLOCKSIZE_Y 16

/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
    if (code != cudaSuccess) 
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

/*******************/
/* iDivUp FUNCTION */
/*******************/
int iDivUp(int a, int b) { return ((a % b) != 0) ? (a / b + 1) : (a / b); }

/*************************************************/
/* DEVICE FUNCTION PERFORMING THE SCALAR PRODUCT */
/*************************************************/
__host__ __device__ float scalarProduct(float p1x, float p1y, float p1z, float p2x, float p2y, float p2z)
{
    return (p1x * p2x + p1y * p2y + p1z * p2z) ;
}

/*******************/
/* KERNEL FUNCTION */
/*******************/
__global__ void kernel(const float* __restrict__ p1x, const float* __restrict__ p1y, const float* __restrict__ p1z, 
              const float* __restrict__ p2x, const float* __restrict__ p2y, const float* __restrict__ p2z, 
              float* __restrict__ output, const int N, const int M) {

    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    int idy = threadIdx.y + blockIdx.y * blockDim.y;

    if ((idx < N) && (idy < M))

        output[idy * N + idx] = scalarProduct(p1x[idx], p1y[idx], p1z[idx], p2x[idy], p2y[idy], p2z[idy]);

}

/********/
/* MAIN */
/********/
int main() {

    const int N = 10000;
    const int M = 1000;

    // --- Host side allocations
    float *Ax = (float*)malloc(N*sizeof(float));
    float *Ay = (float*)malloc(N*sizeof(float));
    float *Az = (float*)malloc(N*sizeof(float));

    float *Bx = (float*)malloc(M*sizeof(float));
    float *By = (float*)malloc(M*sizeof(float));
    float *Bz = (float*)malloc(M*sizeof(float));

    float *C = (float*)malloc(N*M*sizeof(float));
    float *D = (float*)malloc(N*M*sizeof(float));

    // --- Device side allocations
    float *d_Ax; gpuErrchk(cudaMalloc((void**)&d_Ax, N*sizeof(float)));
    float *d_Ay; gpuErrchk(cudaMalloc((void**)&d_Ay, N*sizeof(float)));
    float *d_Az; gpuErrchk(cudaMalloc((void**)&d_Az, N*sizeof(float)));

    float *d_Bx; gpuErrchk(cudaMalloc((void**)&d_Bx, M*sizeof(float)));
    float *d_By; gpuErrchk(cudaMalloc((void**)&d_By, M*sizeof(float)));
    float *d_Bz; gpuErrchk(cudaMalloc((void**)&d_Bz, M*sizeof(float)));

    float *d_C; gpuErrchk(cudaMalloc((void**)&d_C, N*M*sizeof(float)));

    // --- Initialization
    srand(time(NULL));
    for (int i=0; i<N; i++) {
        Ax[i] = rand() / RAND_MAX;
        Ay[i] = rand() / RAND_MAX;
        Az[i] = rand() / RAND_MAX;
    }

    for (int i=0; i<M; i++) {
        Bx[i] = rand() / RAND_MAX;
        By[i] = rand() / RAND_MAX;
        Bz[i] = rand() / RAND_MAX;
    }

    // --- Host side computations
    double t1 = clock();
    for (int i=0; i<N; i++) 
        for (int j=0; j<M; j++) 
            C[i*M + j] = scalarProduct(Ax[i], Ay[i], Az[i], Bx[j], By[j], Bz[j]);
    double t2 = clock();
    printf("CPU elapsed time: %3.4f ms \n", 1000.*((double)(t2-t1))/CLOCKS_PER_SEC);

    // --- Device side computations
    dim3 dimBlock(BLOCKSIZE_X, BLOCKSIZE_Y);
    dim3 dimGrid(iDivUp(N, BLOCKSIZE_X), iDivUp(M, BLOCKSIZE_Y));

    float time;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    // --- Host to device memory transfers
    gpuErrchk(cudaMemcpy(d_Ax, Ax, N*sizeof(float), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_Ay, Ay, N*sizeof(float), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_Az, Az, N*sizeof(float), cudaMemcpyHostToDevice));

    gpuErrchk(cudaMemcpy(d_Bx, Bx, M*sizeof(float), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_By, By, M*sizeof(float), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_Bz, Bz, M*sizeof(float), cudaMemcpyHostToDevice));

    // --- Computations
    kernel<<<dimGrid, dimBlock>>>(d_Ax, d_Ay, d_Az, d_Bx, d_By, d_Bz, d_C, N, M);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    gpuErrchk(cudaMemcpy(D, d_C, N*M*sizeof(float), cudaMemcpyDeviceToHost));

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Elapsed time:  %3.4f ms \n", time);


    for (int i=0; i<N*M; i++) {
        if (D[i] != C[i]) {
            printf("Mismatch at i = %i; Host= %f, Device = %f\n", i, C[i], D[i]);
            return 1;
        }
    }
    printf("Results match!\n");
    cudaDeviceReset();
    return 0;
}

第 2 部分

为了解决您的特定问题，即使考虑 D2H 内存事务(非常便宜)，CUDA 也是值得的。下面的代码证实了这一点，在与上面相同的系统上进行了测试，其时间如下:

CPU: 46ms;     GPU (with D2H transaction): 0.31ms;

#include <stdio.h>
#include <time.h>

#define BLOCKSIZE_X 16
#define BLOCKSIZE_Y 16

/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
    if (code != cudaSuccess) 
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

/*******************/
/* iDivUp FUNCTION */
/*******************/
int iDivUp(int a, int b) { return ((a % b) != 0) ? (a / b + 1) : (a / b); }

/*************************************************/
/* DEVICE FUNCTION PERFORMING THE SCALAR PRODUCT */
/*************************************************/
__host__ __device__ float scalarProduct(float p1x, float p1y, float p1z, float p2x, float p2y, float p2z)
{
    return (p1x * p2x + p1y * p2y + p1z * p2z) ;
}

/*******************/
/* KERNEL FUNCTION */
/*******************/
__global__ void kernel(const float* __restrict__ p1x, const float* __restrict__ p1y, const float* __restrict__ p1z, 
              const float* __restrict__ p2x, const float* __restrict__ p2y, const float* __restrict__ p2z, 
              float* __restrict__ output, const int N, const int M) {

    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    int idy = threadIdx.y + blockIdx.y * blockDim.y;

    if ((idx < N) && (idy < M))

        if(abs(scalarProduct(p1x[idx], p1y[idx], p1z[idx], p2x[idy], p2y[idy], p2z[idy])) < 0.01f) 
            output[idx] = 1.f;
        else
            output[idx] = 0.f;

}

/********/
/* MAIN */
/********/
int main() {

    const int N = 10000;
    const int M = 1000;

    // --- Host side allocations
    float *Ax = (float*)malloc(N*sizeof(float));
    float *Ay = (float*)malloc(N*sizeof(float));
    float *Az = (float*)malloc(N*sizeof(float));

    float *Bx = (float*)malloc(M*sizeof(float));
    float *By = (float*)malloc(M*sizeof(float));
    float *Bz = (float*)malloc(M*sizeof(float));

    float *C = (float*)malloc(N*sizeof(float));
    float *D = (float*)malloc(N*sizeof(float));

    // --- Device side allocations
    float *d_Ax; gpuErrchk(cudaMalloc((void**)&d_Ax, N*sizeof(float)));
    float *d_Ay; gpuErrchk(cudaMalloc((void**)&d_Ay, N*sizeof(float)));
    float *d_Az; gpuErrchk(cudaMalloc((void**)&d_Az, N*sizeof(float)));

    float *d_Bx; gpuErrchk(cudaMalloc((void**)&d_Bx, M*sizeof(float)));
    float *d_By; gpuErrchk(cudaMalloc((void**)&d_By, M*sizeof(float)));
    float *d_Bz; gpuErrchk(cudaMalloc((void**)&d_Bz, M*sizeof(float)));

    float *d_C; gpuErrchk(cudaMalloc((void**)&d_C, N*sizeof(float)));

    // --- Initialization
    srand(time(NULL));
    for (int i=0; i<N; i++) {
        Ax[i] = rand() / RAND_MAX;
        Ay[i] = rand() / RAND_MAX;
        Az[i] = rand() / RAND_MAX;
    }

    for (int i=0; i<M; i++) {
        Bx[i] = rand() / RAND_MAX;
        By[i] = rand() / RAND_MAX;
        Bz[i] = rand() / RAND_MAX;
    }

    // --- Host side computations
    double t1 = clock();
    for (int i=0; i<N; i++) 
        for (int j=0; j<M; j++)
            if(abs(scalarProduct(Ax[i], Ay[i], Az[i], Bx[j], By[j], Bz[j])) < 0.01f) 
                C[i] = 1.f;
            else
                C[i] = 0.f;
    double t2 = clock();
    printf("CPU elapsed time: %3.4f ms \n", 1000.*((double)(t2-t1))/CLOCKS_PER_SEC);

    // --- Device side computations
    float time;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    // --- Host to device memory transfers
    gpuErrchk(cudaMemcpy(d_Ax, Ax, N*sizeof(float), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_Ay, Ay, N*sizeof(float), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_Az, Az, N*sizeof(float), cudaMemcpyHostToDevice));

    gpuErrchk(cudaMemcpy(d_Bx, Bx, M*sizeof(float), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_By, By, M*sizeof(float), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_Bz, Bz, M*sizeof(float), cudaMemcpyHostToDevice));

    // --- Computations
    kernel<<<iDivUp(N, BLOCKSIZE_X), BLOCKSIZE_X>>>(d_Ax, d_Ay, d_Az, d_Bx, d_By, d_Bz, d_C, N, M);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    gpuErrchk(cudaMemcpy(D, d_C, N*sizeof(float), cudaMemcpyDeviceToHost));

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Elapsed time:  %3.4f ms \n", time);


    for (int i=0; i<N; i++) {
        if (D[i] != C[i]) {
            printf("Mismatch at i = %i; Host= %f, Device = %f\n", i, C[i], D[i]);
            return 1;
        }
    }
    printf("Results match!\n");
    cudaDeviceReset();
    return 0;
}

关于c++ - 快速计算许多标量积，我们在Stack Overflow上找到一个类似的问题： https://stackoverflow.com/questions/24888334/

上一篇：C++ vector : push_back Objects vs push_back Pointers performance

下一篇：c++ - **运算符不适用于二维数组作为参数

相关文章：

database - 在数据库中使用 b 树

c# - 用较小的基于二维图 block 的形状填充非矩形二维图 block 空间的算法

c++ - 将矩阵划分为 p 行

c++ - 在 C++11 中声明多参数构造函数的最佳方法是什么

c++ - 运行时访问图书管理员类？

c++ - 将 const wchar_t * 转换为 const std::basic_string<char_t> 的更好方法

python - 与倒数第二个相比，最后一个索引对 numpy 数组的访问时间的影响更大

c# - 使用 sp_executesql 运行时相同的 SQL 查询很快，如果在查询分析器中作为动态查询执行则非常慢，为什么？

algorithm - Matlab 多元回归

mysql - 我想默认获取最后一行 mysql 查询