c++ - CUDA:如何在设备上填充动态大小的 vector 并将其内容返回给另一个设备功能?

标签 c++ cuda

我想知道在设备上填充动态大小数组(int *row,在下面的代码中)然后返回其内容以供另一个设备功能使用的正确技术。

为了将问题背景化,下面的代码尝试使用在 GPU 上运行的高斯-勒让德求积法跨越勒让德多项式基组中的任意函数。

#include <math.h>
#include <stdlib.h>
#include <stdio.h>

__device__  double *d_droot, *d_dweight;

/*How could be returned the array or the pointer to the array int *row, on the device,  that is filled by this function? */
__device__
void Pascal_Triangle(int n_row, int * row) {

  int a[100][100];
  int i, j;

  //first row and first coloumn has the same value=1
  for (i = 1; i <= n_row; i++) {
    a[i][1] = a[1][i] = 1;
  }

  //Generate the full Triangle
  for (i = 2; i <= n_row; i++) {
    for (j = 2; j <= n_row - i; j++) {
      if (a[i - 1][j] == 0 || a[i][j - 1] == 0) {
    break;
      }
      a[i][j] = a[i - 1][j] + a[i][j - 1];
    }
  }

  row = new int[n_row];

  for (i = 1; i <= n_row; i++) {
    row[i] = a[i][n_row-1];
  }

}


__device__
double Legendre_poly(int order, double x)
{
  int n,k;
  double val=0;
  int *binomials;
  for(n=order; n>=0; n--)
    {
      Pascal_Triangle(n, binomials); /*Here are the problems*/
      for(k=0; k<=n; k++)
    val += binomials[k]*pow(x-1,n-k)*pow(x-1,k);
    }

  return val;
}


__device__ __host__
double f(double alpha,double x)
{
  /*function expanded on a basis of Legendre palynomials. */
  return exp(-alpha*x*x);
}


/*Kernel that computes the expansion by quadratures*/
__global__ void Span(int n, double alpha, double a, double b, double *coefficients)
{
  /*
    Parameters:
    n: Total number of expansion coeficients
    a: Upper integration limit
    b: Lower integration limit
    d_droots[]: roots for the quadrature
    d_dweight[]: weights for the quadrature
    coefficients[]: allocate N expansion coefficients.
  */

  double c1 = (b - a) / 2, c2 = (b + a) / 2, sum = 0;
  int dummy;

  int i = blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n)
    {
      coefficients[i] = 0.0;
      for (dummy = 0; dummy < 5; dummy++)
    coefficients[i] += d_dweight[dummy] * f(alpha,c1 * d_droot[dummy] + c2)*Legendre_poly(dummy,c1 * d_droot[dummy] + c2)*c1;
    }

}


int main(void)
{
  int N = 1<<23;
  int N_nodes = 5;


  double *droot, *dweight, *dresult, *d_dresult, *d_droot_temp, *d_dweight_temp;


  /*double version in host*/
  droot =(double*)malloc(N_nodes*sizeof(double));
  dweight =(double*)malloc(N_nodes*sizeof(double));
  dresult =(double*)malloc(N*sizeof(double)); /*will recibe the results of N quadratures!*/


  /*double version in device*/
  cudaMalloc(&d_droot_temp, N_nodes*sizeof(double));
  cudaMalloc(&d_dweight_temp, N_nodes*sizeof(double));
  cudaMalloc(&d_dresult, N*sizeof(double)); /*results for N quadratures will be contained here*/


  /*double version of the roots and weights*/
  droot[0] = 0.90618;
  droot[1] = 0.538469;
  droot[2] = 0.0;
  droot[3] = -0.538469;
  droot[4] = -0.90618;


  dweight[0] = 0.236927;
  dweight[1] = 0.478629;
  dweight[2] = 0.568889;
  dweight[3] = 0.478629;
  dweight[4] = 0.236927;



  /*double copy host-> device*/
  cudaMemcpy(d_droot_temp, droot, N_nodes*sizeof(double), cudaMemcpyHostToDevice);
  cudaMemcpy(d_dweight_temp, dweight, N_nodes*sizeof(double), cudaMemcpyHostToDevice);
  cudaMemcpyToSymbol(d_droot, &d_droot_temp, sizeof(double *));
  cudaMemcpyToSymbol(d_dweight, &d_dweight_temp, sizeof(double *));

  // Perform the expansion

  Span<<<(N+255)/256, 256>>>(N,1.0,  -3.0, 3.0, d_dresult); /*This kerlnel works OK*/






  cudaMemcpy(dresult, d_dresult, N*sizeof(double), cudaMemcpyDeviceToHost);



  cudaFree(d_dresult);
  cudaFree(d_droot_temp);
  cudaFree(d_dweight_temp);

}

这里是编译上面代码的makefile:

objects = main.o 

all: $(objects)
        nvcc   -arch=sm_20 $(objects) -o span

%.o: %.cpp
        nvcc -x cu -arch=sm_20  -I. -dc $< -o $@

clean:
        rm -f *.o span

提前感谢您的任何建议。

最佳答案

(对不起,我之前的回答有误)

您正在将一个指针传递给这个函数:

void Pascal_Triangle(int n_row, int * row) {

然后你试图用一个新值覆盖这个指针:

row = new int[n_row];

一旦您从此函数返回,调用环境中的 row 将保持不变。 (这是一个普通的 C/C++ 问题,并非特定于 CUDA。)

这可能是一个令人困惑的问题,但是指针值按值传递给函数Pascal_Triangle。您不能在函数中修改指针值,并期望修改后的值显示在调用环境中。 (您可以修改指针指向的位置的内容,这是通过指针传递 row 的通常原因。)

有几种方法可以解决这个问题。最简单的可能只是通过引用传递指针:

void Pascal_Triangle(int n_row, int * &row) {

您的代码似乎还有其他缺陷。我建议你雇用 proper cuda error checking并使用 cuda-memcheck 运行您的代码。

特别是,内核中的 new 运算符的行为方式与内核中的 malloc 类似,并且它有 similar limitations .

您的设备堆空间即将用完,因此您的许多 new 操作都失败了,并返回了 NULL 指针。

作为对此的测试,在您的 new 操作之后放置这样一行是很好的调试习惯:

if (row == NULL) assert(0);

(您还需要包含 assert.h)

如果你这样做,你会发现这个断言被命中了。

我没有计算出您的代码实际需要多少设备堆空间,但它似乎使用了很多。在 C++ 中,通常会在完成后deletenew 进行的分配。您可能想研究释放使用 new 完成的分配,或者(甚至更好)重新使用分配(即每个线程分配一次),并完全避免重新分配。

这里是对你的代码的修改,它演示了上面的内容(每个线程一个分配)并且编译和运行对我来说没有错误:

#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#include <assert.h>

__device__  double *d_droot, *d_dweight;

/*How could be returned the array or the pointer to the array int *row, on the device,  that is filled by this function? */
__device__
void Pascal_Triangle(int n_row, int *row) {

  int a[100][100];
  int i, j;

  //first row and first coloumn has the same value=1
  for (i = 1; i <= n_row; i++) {
    a[i][1] = a[1][i] = 1;
  }

  //Generate the full Triangle
  for (i = 2; i <= n_row; i++) {
    for (j = 2; j <= n_row - i; j++) {
      if (a[i - 1][j] == 0 || a[i][j - 1] == 0) {
    break;
      }
      a[i][j] = a[i - 1][j] + a[i][j - 1];
    }
  }

  for (i = 1; i <= n_row; i++) {
    row[i] = a[i][n_row-1];
  }

}


__device__
double Legendre_poly(int order, double x, int *my_storage)
{
  int n,k;
  double val=0;
  int *binomials = my_storage;
  if (binomials == NULL) assert(0);
  for(n=order; n>=0; n--)
    {
      Pascal_Triangle(n, binomials); /*Here are the problems*/
      for(k=0; k<=n; k++)
    val += binomials[k]*pow(x-1,n-k)*pow(x-1,k);
    }
  return val;
}


__device__ __host__
double f(double alpha,double x)
{
  /*function expanded on a basis of Legendre palynomials. */
  return exp(-alpha*x*x);
}


/*Kernel that computes the expansion by quadratures*/
__global__ void Span(int n, double alpha, double a, double b, double *coefficients)
{
  /*
    Parameters:
    n: Total number of expansion coeficients
    a: Upper integration limit
    b: Lower integration limit
    d_droots[]: roots for the quadrature
    d_dweight[]: weights for the quadrature
    coefficients[]: allocate N expansion coefficients.
  */

  double c1 = (b - a) / 2, c2 = (b + a) / 2, sum = 0;
  int dummy;

  int i = blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n)
    {
      #define MY_LIM 5
      int *thr_storage = new int[MY_LIM];
      if (thr_storage == NULL) assert(0);
      coefficients[i] = 0.0;
      for (dummy = 0; dummy < MY_LIM; dummy++)
        coefficients[i] += d_dweight[dummy] * f(alpha,c1 * d_droot[dummy] + c2)*Legendre_poly(dummy,c1 * d_droot[dummy] + c2, thr_storage)*c1;
      delete thr_storage;
    }

}


int main(void)
{
  cudaDeviceSetLimit(cudaLimitMallocHeapSize, (1048576ULL*1024));
  int N = 1<<23;
  int N_nodes = 5;


  double *droot, *dweight, *dresult, *d_dresult, *d_droot_temp, *d_dweight_temp;


  /*double version in host*/
  droot =(double*)malloc(N_nodes*sizeof(double));
  dweight =(double*)malloc(N_nodes*sizeof(double));
  dresult =(double*)malloc(N*sizeof(double)); /*will recibe the results of N quadratures!*/


  /*double version in device*/
  cudaMalloc(&d_droot_temp, N_nodes*sizeof(double));
  cudaMalloc(&d_dweight_temp, N_nodes*sizeof(double));
  cudaMalloc(&d_dresult, N*sizeof(double)); /*results for N quadratures will be contained here*/


  /*double version of the roots and weights*/
  droot[0] = 0.90618;
  droot[1] = 0.538469;
  droot[2] = 0.0;
  droot[3] = -0.538469;
  droot[4] = -0.90618;


  dweight[0] = 0.236927;
  dweight[1] = 0.478629;
  dweight[2] = 0.568889;
  dweight[3] = 0.478629;
  dweight[4] = 0.236927;



  /*double copy host-> device*/
  cudaMemcpy(d_droot_temp, droot, N_nodes*sizeof(double), cudaMemcpyHostToDevice);
  cudaMemcpy(d_dweight_temp, dweight, N_nodes*sizeof(double), cudaMemcpyHostToDevice);
  cudaMemcpyToSymbol(d_droot, &d_droot_temp, sizeof(double *));
  cudaMemcpyToSymbol(d_dweight, &d_dweight_temp, sizeof(double *));

  // Perform the expansion

  Span<<<(N+255)/256, 256>>>(N,1.0,  -3.0, 3.0, d_dresult); /*This kerlnel works OK*/






  cudaMemcpy(dresult, d_dresult, N*sizeof(double), cudaMemcpyDeviceToHost);



  cudaFree(d_dresult);
  cudaFree(d_droot_temp);
  cudaFree(d_dweight_temp);

}

这段代码有几个优点:

  1. 它可以在设备堆上以更小的预留空间运行
  2. 它比您的代码试图执行的大量分配要快得多。

编辑:

你可以这样代替 assert:

/*Kernel that computes the expansion by quadratures*/
__global__ void Span(int n, double alpha, double a, double b, double *coefficients)
{
  /*
    Parameters:
    n: Total number of expansion coeficients
    a: Upper integration limit
    b: Lower integration limit
    d_droots[]: roots for the quadrature
    d_dweight[]: weights for the quadrature
    coefficients[]: allocate N expansion coefficients.
  */

  double c1 = (b - a) / 2, c2 = (b + a) / 2, sum = 0;
  int dummy;

  int i = blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n)
    {
      #define MY_LIM 5
      int *thr_storage = new int[MY_LIM];
      if (thr_storage == NULL) printf("allocation failure!\");
      else {
        coefficients[i] = 0.0;
        for (dummy = 0; dummy < MY_LIM; dummy++)
          coefficients[i] += d_dweight[dummy] * f(alpha,c1 * d_droot[dummy] + c2)*Legendre_poly(dummy,c1 * d_droot[dummy] + c2, thr_storage)*c1;
        delete thr_storage;
        }
    }

}

关于c++ - CUDA:如何在设备上填充动态大小的 vector 并将其内容返回给另一个设备功能?,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/28860120/

相关文章:

c++ - 如何以最少的重复管理并行和顺序版本代码?

c++ - 如何让 qmake 在链接阶段的 -lpthread 之后添加 -lm

c++ - 为什么循环停止?

c++ - 为什么 round() 和 ceil() 不返回整数?

c++ - 在共享库中全局声明的非 POD 对象的语义是什么?

c++ - 调用 cv::stereoBM 构造函数时出错

c++ - 是否可以重载 std::chrono::duration_cast ?

c++ - cudaMemcpy 结构设备主机不工作

c++ - 如何确定我的 GPU 是否进行 16/32/64 位算术运算?

c++ - 使用 nvcc (CUDA-RINSIDE) 正确链接目标文件