我想知道在设备上填充动态大小数组(int *row,在下面的代码中)然后返回其内容以供另一个设备功能使用的正确技术。
为了将问题背景化,下面的代码尝试使用在 GPU 上运行的高斯-勒让德求积法跨越勒让德多项式基组中的任意函数。
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
__device__ double *d_droot, *d_dweight;
/*How could be returned the array or the pointer to the array int *row, on the device, that is filled by this function? */
__device__
void Pascal_Triangle(int n_row, int * row) {
int a[100][100];
int i, j;
//first row and first coloumn has the same value=1
for (i = 1; i <= n_row; i++) {
a[i][1] = a[1][i] = 1;
}
//Generate the full Triangle
for (i = 2; i <= n_row; i++) {
for (j = 2; j <= n_row - i; j++) {
if (a[i - 1][j] == 0 || a[i][j - 1] == 0) {
break;
}
a[i][j] = a[i - 1][j] + a[i][j - 1];
}
}
row = new int[n_row];
for (i = 1; i <= n_row; i++) {
row[i] = a[i][n_row-1];
}
}
__device__
double Legendre_poly(int order, double x)
{
int n,k;
double val=0;
int *binomials;
for(n=order; n>=0; n--)
{
Pascal_Triangle(n, binomials); /*Here are the problems*/
for(k=0; k<=n; k++)
val += binomials[k]*pow(x-1,n-k)*pow(x-1,k);
}
return val;
}
__device__ __host__
double f(double alpha,double x)
{
/*function expanded on a basis of Legendre palynomials. */
return exp(-alpha*x*x);
}
/*Kernel that computes the expansion by quadratures*/
__global__ void Span(int n, double alpha, double a, double b, double *coefficients)
{
/*
Parameters:
n: Total number of expansion coeficients
a: Upper integration limit
b: Lower integration limit
d_droots[]: roots for the quadrature
d_dweight[]: weights for the quadrature
coefficients[]: allocate N expansion coefficients.
*/
double c1 = (b - a) / 2, c2 = (b + a) / 2, sum = 0;
int dummy;
int i = blockIdx.x*blockDim.x + threadIdx.x;
if (i < n)
{
coefficients[i] = 0.0;
for (dummy = 0; dummy < 5; dummy++)
coefficients[i] += d_dweight[dummy] * f(alpha,c1 * d_droot[dummy] + c2)*Legendre_poly(dummy,c1 * d_droot[dummy] + c2)*c1;
}
}
int main(void)
{
int N = 1<<23;
int N_nodes = 5;
double *droot, *dweight, *dresult, *d_dresult, *d_droot_temp, *d_dweight_temp;
/*double version in host*/
droot =(double*)malloc(N_nodes*sizeof(double));
dweight =(double*)malloc(N_nodes*sizeof(double));
dresult =(double*)malloc(N*sizeof(double)); /*will recibe the results of N quadratures!*/
/*double version in device*/
cudaMalloc(&d_droot_temp, N_nodes*sizeof(double));
cudaMalloc(&d_dweight_temp, N_nodes*sizeof(double));
cudaMalloc(&d_dresult, N*sizeof(double)); /*results for N quadratures will be contained here*/
/*double version of the roots and weights*/
droot[0] = 0.90618;
droot[1] = 0.538469;
droot[2] = 0.0;
droot[3] = -0.538469;
droot[4] = -0.90618;
dweight[0] = 0.236927;
dweight[1] = 0.478629;
dweight[2] = 0.568889;
dweight[3] = 0.478629;
dweight[4] = 0.236927;
/*double copy host-> device*/
cudaMemcpy(d_droot_temp, droot, N_nodes*sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(d_dweight_temp, dweight, N_nodes*sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(d_droot, &d_droot_temp, sizeof(double *));
cudaMemcpyToSymbol(d_dweight, &d_dweight_temp, sizeof(double *));
// Perform the expansion
Span<<<(N+255)/256, 256>>>(N,1.0, -3.0, 3.0, d_dresult); /*This kerlnel works OK*/
cudaMemcpy(dresult, d_dresult, N*sizeof(double), cudaMemcpyDeviceToHost);
cudaFree(d_dresult);
cudaFree(d_droot_temp);
cudaFree(d_dweight_temp);
}
这里是编译上面代码的makefile:
objects = main.o
all: $(objects)
nvcc -arch=sm_20 $(objects) -o span
%.o: %.cpp
nvcc -x cu -arch=sm_20 -I. -dc $< -o $@
clean:
rm -f *.o span
提前感谢您的任何建议。
最佳答案
(对不起,我之前的回答有误)
您正在将一个行
指针传递给这个函数:
void Pascal_Triangle(int n_row, int * row) {
然后你试图用一个新值覆盖这个指针:
row = new int[n_row];
一旦您从此函数返回,调用环境中的 row
将保持不变。 (这是一个普通的 C/C++ 问题,并非特定于 CUDA。)
这可能是一个令人困惑的问题,但是行
的指针值被按值传递给函数Pascal_Triangle
。您不能在函数中修改指针值,并期望修改后的值显示在调用环境中。 (您可以修改指针指向的位置的内容,这是通过指针传递 row
的通常原因。)
有几种方法可以解决这个问题。最简单的可能只是通过引用传递指针:
void Pascal_Triangle(int n_row, int * &row) {
您的代码似乎还有其他缺陷。我建议你雇用 proper cuda error checking并使用 cuda-memcheck
运行您的代码。
特别是,内核中的 new
运算符的行为方式与内核中的 malloc
类似,并且它有 similar limitations .
您的设备堆空间即将用完,因此您的许多 new
操作都失败了,并返回了 NULL 指针。
作为对此的测试,在您的 new
操作之后放置这样一行是很好的调试习惯:
if (row == NULL) assert(0);
(您还需要包含 assert.h
)
如果你这样做,你会发现这个断言被命中了。
我没有计算出您的代码实际需要多少设备堆空间,但它似乎使用了很多。在 C++ 中,通常会在完成后delete
由 new
进行的分配。您可能想研究释放使用 new
完成的分配,或者(甚至更好)重新使用分配(即每个线程分配一次),并完全避免重新分配。
这里是对你的代码的修改,它演示了上面的内容(每个线程一个分配)并且编译和运行对我来说没有错误:
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#include <assert.h>
__device__ double *d_droot, *d_dweight;
/*How could be returned the array or the pointer to the array int *row, on the device, that is filled by this function? */
__device__
void Pascal_Triangle(int n_row, int *row) {
int a[100][100];
int i, j;
//first row and first coloumn has the same value=1
for (i = 1; i <= n_row; i++) {
a[i][1] = a[1][i] = 1;
}
//Generate the full Triangle
for (i = 2; i <= n_row; i++) {
for (j = 2; j <= n_row - i; j++) {
if (a[i - 1][j] == 0 || a[i][j - 1] == 0) {
break;
}
a[i][j] = a[i - 1][j] + a[i][j - 1];
}
}
for (i = 1; i <= n_row; i++) {
row[i] = a[i][n_row-1];
}
}
__device__
double Legendre_poly(int order, double x, int *my_storage)
{
int n,k;
double val=0;
int *binomials = my_storage;
if (binomials == NULL) assert(0);
for(n=order; n>=0; n--)
{
Pascal_Triangle(n, binomials); /*Here are the problems*/
for(k=0; k<=n; k++)
val += binomials[k]*pow(x-1,n-k)*pow(x-1,k);
}
return val;
}
__device__ __host__
double f(double alpha,double x)
{
/*function expanded on a basis of Legendre palynomials. */
return exp(-alpha*x*x);
}
/*Kernel that computes the expansion by quadratures*/
__global__ void Span(int n, double alpha, double a, double b, double *coefficients)
{
/*
Parameters:
n: Total number of expansion coeficients
a: Upper integration limit
b: Lower integration limit
d_droots[]: roots for the quadrature
d_dweight[]: weights for the quadrature
coefficients[]: allocate N expansion coefficients.
*/
double c1 = (b - a) / 2, c2 = (b + a) / 2, sum = 0;
int dummy;
int i = blockIdx.x*blockDim.x + threadIdx.x;
if (i < n)
{
#define MY_LIM 5
int *thr_storage = new int[MY_LIM];
if (thr_storage == NULL) assert(0);
coefficients[i] = 0.0;
for (dummy = 0; dummy < MY_LIM; dummy++)
coefficients[i] += d_dweight[dummy] * f(alpha,c1 * d_droot[dummy] + c2)*Legendre_poly(dummy,c1 * d_droot[dummy] + c2, thr_storage)*c1;
delete thr_storage;
}
}
int main(void)
{
cudaDeviceSetLimit(cudaLimitMallocHeapSize, (1048576ULL*1024));
int N = 1<<23;
int N_nodes = 5;
double *droot, *dweight, *dresult, *d_dresult, *d_droot_temp, *d_dweight_temp;
/*double version in host*/
droot =(double*)malloc(N_nodes*sizeof(double));
dweight =(double*)malloc(N_nodes*sizeof(double));
dresult =(double*)malloc(N*sizeof(double)); /*will recibe the results of N quadratures!*/
/*double version in device*/
cudaMalloc(&d_droot_temp, N_nodes*sizeof(double));
cudaMalloc(&d_dweight_temp, N_nodes*sizeof(double));
cudaMalloc(&d_dresult, N*sizeof(double)); /*results for N quadratures will be contained here*/
/*double version of the roots and weights*/
droot[0] = 0.90618;
droot[1] = 0.538469;
droot[2] = 0.0;
droot[3] = -0.538469;
droot[4] = -0.90618;
dweight[0] = 0.236927;
dweight[1] = 0.478629;
dweight[2] = 0.568889;
dweight[3] = 0.478629;
dweight[4] = 0.236927;
/*double copy host-> device*/
cudaMemcpy(d_droot_temp, droot, N_nodes*sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(d_dweight_temp, dweight, N_nodes*sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(d_droot, &d_droot_temp, sizeof(double *));
cudaMemcpyToSymbol(d_dweight, &d_dweight_temp, sizeof(double *));
// Perform the expansion
Span<<<(N+255)/256, 256>>>(N,1.0, -3.0, 3.0, d_dresult); /*This kerlnel works OK*/
cudaMemcpy(dresult, d_dresult, N*sizeof(double), cudaMemcpyDeviceToHost);
cudaFree(d_dresult);
cudaFree(d_droot_temp);
cudaFree(d_dweight_temp);
}
这段代码有几个优点:
- 它可以在设备堆上以更小的预留空间运行
- 它比您的代码试图执行的大量分配要快得多。
编辑:
你可以这样代替 assert
:
/*Kernel that computes the expansion by quadratures*/
__global__ void Span(int n, double alpha, double a, double b, double *coefficients)
{
/*
Parameters:
n: Total number of expansion coeficients
a: Upper integration limit
b: Lower integration limit
d_droots[]: roots for the quadrature
d_dweight[]: weights for the quadrature
coefficients[]: allocate N expansion coefficients.
*/
double c1 = (b - a) / 2, c2 = (b + a) / 2, sum = 0;
int dummy;
int i = blockIdx.x*blockDim.x + threadIdx.x;
if (i < n)
{
#define MY_LIM 5
int *thr_storage = new int[MY_LIM];
if (thr_storage == NULL) printf("allocation failure!\");
else {
coefficients[i] = 0.0;
for (dummy = 0; dummy < MY_LIM; dummy++)
coefficients[i] += d_dweight[dummy] * f(alpha,c1 * d_droot[dummy] + c2)*Legendre_poly(dummy,c1 * d_droot[dummy] + c2, thr_storage)*c1;
delete thr_storage;
}
}
}
关于c++ - CUDA:如何在设备上填充动态大小的 vector 并将其内容返回给另一个设备功能?,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/28860120/