我有一个计算 1-10
百万标量积的程序。
看起来像这样。 ts
和 A
是大约 1000
-10000
3D 点的数组(每个元素是一个 3x1
vector )。目前,在 ts.size() = 10,000
和 A.size() = 1000
的情况下,我的代码大约需要 41ms
。到目前为止,我还没有进行任何并行化。例如,在 CUDA 中,计算会更快吗?我没有这样的经验。或者还有其他办法吗?谢谢。
for(int i = 0; i< ts.size(); i++){
for(int j = 0; j< A.size(); j++){
if(abs(scalarProduct(ts.at(i), A.at(j))) <epsilon){
score[i] +=1;
}
}
}
这是我对标量积的实现。
double scalarProduct(const Point &p1,const Point &p2)
{
return (p1.getX()*p2.getX() + p1.getY()*p2.getY() + p1.getZ()*p2.getZ()) ;
}
我可以改用 Lapack 或 Eigen,将问题表述为矩阵乘法吗?我已经在 Matlab 中做到了,它只慢了 5
倍。任何加速都会很棒。使用 OpenMP,我想我的速度可以4x
。
这个答案由两部分组成:
- 加速许多独立标量积的计算;
- 解决您的具体问题。
第 1 部分
计算大量独立标量积的问题是一个令人尴尬的并行问题。如果您的目标是仅加速上述标量积,将其余计算保留在 CPU 上,那么我同意 Calvin 的观点,即大部分时间将花在设备-> 大型 N*M< 的内存事务上
结果矩阵。但是,如果您从上述交易中清除时间,加速计算将是值得的。下面的代码显示了这一点,在配备 NVIDIA Kepler K20c 卡的 Intel Xeon E5-2650 2.00 GHz 八核处理器上进行了测试,其时序如下:
CPU: 27ms; GPU (without D2H transaction): 0.08ms; GPU (with D2H transaction): 23ms
#include <stdio.h>
#include <time.h>
#define BLOCKSIZE_X 16
#define BLOCKSIZE_Y 16
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
/*******************/
/* iDivUp FUNCTION */
/*******************/
int iDivUp(int a, int b) { return ((a % b) != 0) ? (a / b + 1) : (a / b); }
/*************************************************/
/* DEVICE FUNCTION PERFORMING THE SCALAR PRODUCT */
/*************************************************/
__host__ __device__ float scalarProduct(float p1x, float p1y, float p1z, float p2x, float p2y, float p2z)
{
return (p1x * p2x + p1y * p2y + p1z * p2z) ;
}
/*******************/
/* KERNEL FUNCTION */
/*******************/
__global__ void kernel(const float* __restrict__ p1x, const float* __restrict__ p1y, const float* __restrict__ p1z,
const float* __restrict__ p2x, const float* __restrict__ p2y, const float* __restrict__ p2z,
float* __restrict__ output, const int N, const int M) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
int idy = threadIdx.y + blockIdx.y * blockDim.y;
if ((idx < N) && (idy < M))
output[idy * N + idx] = scalarProduct(p1x[idx], p1y[idx], p1z[idx], p2x[idy], p2y[idy], p2z[idy]);
}
/********/
/* MAIN */
/********/
int main() {
const int N = 10000;
const int M = 1000;
// --- Host side allocations
float *Ax = (float*)malloc(N*sizeof(float));
float *Ay = (float*)malloc(N*sizeof(float));
float *Az = (float*)malloc(N*sizeof(float));
float *Bx = (float*)malloc(M*sizeof(float));
float *By = (float*)malloc(M*sizeof(float));
float *Bz = (float*)malloc(M*sizeof(float));
float *C = (float*)malloc(N*M*sizeof(float));
float *D = (float*)malloc(N*M*sizeof(float));
// --- Device side allocations
float *d_Ax; gpuErrchk(cudaMalloc((void**)&d_Ax, N*sizeof(float)));
float *d_Ay; gpuErrchk(cudaMalloc((void**)&d_Ay, N*sizeof(float)));
float *d_Az; gpuErrchk(cudaMalloc((void**)&d_Az, N*sizeof(float)));
float *d_Bx; gpuErrchk(cudaMalloc((void**)&d_Bx, M*sizeof(float)));
float *d_By; gpuErrchk(cudaMalloc((void**)&d_By, M*sizeof(float)));
float *d_Bz; gpuErrchk(cudaMalloc((void**)&d_Bz, M*sizeof(float)));
float *d_C; gpuErrchk(cudaMalloc((void**)&d_C, N*M*sizeof(float)));
// --- Initialization
srand(time(NULL));
for (int i=0; i<N; i++) {
Ax[i] = rand() / RAND_MAX;
Ay[i] = rand() / RAND_MAX;
Az[i] = rand() / RAND_MAX;
}
for (int i=0; i<M; i++) {
Bx[i] = rand() / RAND_MAX;
By[i] = rand() / RAND_MAX;
Bz[i] = rand() / RAND_MAX;
}
// --- Host side computations
double t1 = clock();
for (int i=0; i<N; i++)
for (int j=0; j<M; j++)
C[i*M + j] = scalarProduct(Ax[i], Ay[i], Az[i], Bx[j], By[j], Bz[j]);
double t2 = clock();
printf("CPU elapsed time: %3.4f ms \n", 1000.*((double)(t2-t1))/CLOCKS_PER_SEC);
// --- Device side computations
dim3 dimBlock(BLOCKSIZE_X, BLOCKSIZE_Y);
dim3 dimGrid(iDivUp(N, BLOCKSIZE_X), iDivUp(M, BLOCKSIZE_Y));
float time;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
// --- Host to device memory transfers
gpuErrchk(cudaMemcpy(d_Ax, Ax, N*sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_Ay, Ay, N*sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_Az, Az, N*sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_Bx, Bx, M*sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_By, By, M*sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_Bz, Bz, M*sizeof(float), cudaMemcpyHostToDevice));
// --- Computations
kernel<<<dimGrid, dimBlock>>>(d_Ax, d_Ay, d_Az, d_Bx, d_By, d_Bz, d_C, N, M);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
gpuErrchk(cudaMemcpy(D, d_C, N*M*sizeof(float), cudaMemcpyDeviceToHost));
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("Elapsed time: %3.4f ms \n", time);
for (int i=0; i<N*M; i++) {
if (D[i] != C[i]) {
printf("Mismatch at i = %i; Host= %f, Device = %f\n", i, C[i], D[i]);
return 1;
}
}
printf("Results match!\n");
cudaDeviceReset();
return 0;
}
第 2 部分
为了解决您的特定问题,即使考虑 D2H 内存事务(非常便宜),CUDA 也是值得的。下面的代码证实了这一点,在与上面相同的系统上进行了测试,其时间如下:
CPU: 46ms; GPU (with D2H transaction): 0.31ms;
#include <stdio.h>
#include <time.h>
#define BLOCKSIZE_X 16
#define BLOCKSIZE_Y 16
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
/*******************/
/* iDivUp FUNCTION */
/*******************/
int iDivUp(int a, int b) { return ((a % b) != 0) ? (a / b + 1) : (a / b); }
/*************************************************/
/* DEVICE FUNCTION PERFORMING THE SCALAR PRODUCT */
/*************************************************/
__host__ __device__ float scalarProduct(float p1x, float p1y, float p1z, float p2x, float p2y, float p2z)
{
return (p1x * p2x + p1y * p2y + p1z * p2z) ;
}
/*******************/
/* KERNEL FUNCTION */
/*******************/
__global__ void kernel(const float* __restrict__ p1x, const float* __restrict__ p1y, const float* __restrict__ p1z,
const float* __restrict__ p2x, const float* __restrict__ p2y, const float* __restrict__ p2z,
float* __restrict__ output, const int N, const int M) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
int idy = threadIdx.y + blockIdx.y * blockDim.y;
if ((idx < N) && (idy < M))
if(abs(scalarProduct(p1x[idx], p1y[idx], p1z[idx], p2x[idy], p2y[idy], p2z[idy])) < 0.01f)
output[idx] = 1.f;
else
output[idx] = 0.f;
}
/********/
/* MAIN */
/********/
int main() {
const int N = 10000;
const int M = 1000;
// --- Host side allocations
float *Ax = (float*)malloc(N*sizeof(float));
float *Ay = (float*)malloc(N*sizeof(float));
float *Az = (float*)malloc(N*sizeof(float));
float *Bx = (float*)malloc(M*sizeof(float));
float *By = (float*)malloc(M*sizeof(float));
float *Bz = (float*)malloc(M*sizeof(float));
float *C = (float*)malloc(N*sizeof(float));
float *D = (float*)malloc(N*sizeof(float));
// --- Device side allocations
float *d_Ax; gpuErrchk(cudaMalloc((void**)&d_Ax, N*sizeof(float)));
float *d_Ay; gpuErrchk(cudaMalloc((void**)&d_Ay, N*sizeof(float)));
float *d_Az; gpuErrchk(cudaMalloc((void**)&d_Az, N*sizeof(float)));
float *d_Bx; gpuErrchk(cudaMalloc((void**)&d_Bx, M*sizeof(float)));
float *d_By; gpuErrchk(cudaMalloc((void**)&d_By, M*sizeof(float)));
float *d_Bz; gpuErrchk(cudaMalloc((void**)&d_Bz, M*sizeof(float)));
float *d_C; gpuErrchk(cudaMalloc((void**)&d_C, N*sizeof(float)));
// --- Initialization
srand(time(NULL));
for (int i=0; i<N; i++) {
Ax[i] = rand() / RAND_MAX;
Ay[i] = rand() / RAND_MAX;
Az[i] = rand() / RAND_MAX;
}
for (int i=0; i<M; i++) {
Bx[i] = rand() / RAND_MAX;
By[i] = rand() / RAND_MAX;
Bz[i] = rand() / RAND_MAX;
}
// --- Host side computations
double t1 = clock();
for (int i=0; i<N; i++)
for (int j=0; j<M; j++)
if(abs(scalarProduct(Ax[i], Ay[i], Az[i], Bx[j], By[j], Bz[j])) < 0.01f)
C[i] = 1.f;
else
C[i] = 0.f;
double t2 = clock();
printf("CPU elapsed time: %3.4f ms \n", 1000.*((double)(t2-t1))/CLOCKS_PER_SEC);
// --- Device side computations
float time;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
// --- Host to device memory transfers
gpuErrchk(cudaMemcpy(d_Ax, Ax, N*sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_Ay, Ay, N*sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_Az, Az, N*sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_Bx, Bx, M*sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_By, By, M*sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_Bz, Bz, M*sizeof(float), cudaMemcpyHostToDevice));
// --- Computations
kernel<<<iDivUp(N, BLOCKSIZE_X), BLOCKSIZE_X>>>(d_Ax, d_Ay, d_Az, d_Bx, d_By, d_Bz, d_C, N, M);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
gpuErrchk(cudaMemcpy(D, d_C, N*sizeof(float), cudaMemcpyDeviceToHost));
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("Elapsed time: %3.4f ms \n", time);
for (int i=0; i<N; i++) {
if (D[i] != C[i]) {
printf("Mismatch at i = %i; Host= %f, Device = %f\n", i, C[i], D[i]);
return 1;
}
}
printf("Results match!\n");
cudaDeviceReset();
return 0;
}