c++ - 在cuda中分配结构数组后变量丢失

我有一个用 C 语言编写的结构体，其中包含结构体数组，我需要在 GPU 中复制该结构体。为此，我正在编写一个函数，该函数将结构中的变量的一些 cudaMalloc 和 cudaMemcpy 从主机传输到设备。

该结构的一个简单版本(真正的版本内部有各种结构和变量/数组)是:

struct Node {

    float* position;

};

struct Graph{
    unsigned int nNode;
    Node* node;
    unsigned int nBoundary;
    unsigned int* boundary;
};

我的问题是我一定在内存分配和结构复制中做错了什么。当我使用 Graph 复制变量时，我可以看到它们已正确复制(通过在内核中访问它，如下例所示)。例如，我可以检查 graph.nBoundary=3。

但是，如果我不分配和复制Node *的内存，我只能看到这一点。如果这样做，我会得到 -858993460 而不是 3。有趣的是，Node * 没有错误分配，因为我可以检查 graph.node[0].pos[0] 的值并且它具有正确的值。

这只发生在graph.nBoundary上。所有其他变量都保持正确的数值，但在运行 Node* 的 cudaMemcpy 时，这个变量会“出错”。

我做错了什么以及为什么会发生这种情况？我该如何修复它？

如果您需要更多信息，请告诉我。

<小时/>

MCVE:

#include <algorithm>
#include <cuda_runtime_api.h>
#include <cuda.h>

// A point, part of some elements
struct Node {

    float* position;

};

struct Graph{
    unsigned int nNode;
    Node* node;
    unsigned int nBoundary;
    unsigned int* boundary;
};
Graph* cudaGraphMalloc(const Graph* inGraph);
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

__global__ void testKernel(Graph* graph,unsigned int * d_res){
    d_res[0] = graph->nBoundary;

};
int main()
{

    // Generate some fake data on the CPU
    Graph graph;
    graph.node = (Node*)malloc(2 * sizeof(Node));
    graph.boundary = (unsigned int*)malloc(3 * sizeof(unsigned int));
    for (int i = 0; i < 3; i++){
        graph.boundary[i] = i + 10;
    }
    graph.nBoundary = 3;
    graph.nNode = 2;
    for (int i = 0; i < 2; i++){
        // They can have different sizes in the original code
        graph.node[i].position = (float*)malloc(3 * sizeof(float));
        graph.node[i].position[0] = 45;
        graph.node[i].position[1] = 1;
        graph.node[i].position[2] = 2;
    }

    // allocate GPU memory
    Graph * d_graph = cudaGraphMalloc(&graph);
    // some dummy variables to test on GPU.
    unsigned int * d_res, *h_res;
    cudaMalloc((void **)&d_res, sizeof(unsigned int));
    h_res = (unsigned int*)malloc(sizeof(unsigned int));

    //Run kernel
    testKernel << <1, 1 >> >(d_graph, d_res);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaMemcpy(h_res, d_res, sizeof(unsigned int), cudaMemcpyDeviceToHost));

    printf("%u\n", graph.nBoundary);
    printf("%d", h_res[0]);

    return 0;
}

Graph* cudaGraphMalloc(const Graph* inGraph){
    Graph* outGraph;
    gpuErrchk(cudaMalloc((void**)&outGraph, sizeof(Graph)));

    //copy constants
    gpuErrchk(cudaMemcpy(&outGraph->nNode, &inGraph->nNode, sizeof(unsigned int), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(&outGraph->nBoundary, &inGraph->nBoundary, sizeof(unsigned int), cudaMemcpyHostToDevice));


    // copy boundary
    unsigned int * d_auxboundary, *h_auxboundary;
    h_auxboundary = inGraph->boundary;
    gpuErrchk(cudaMalloc((void**)&d_auxboundary, inGraph->nBoundary*sizeof(unsigned int)));
    gpuErrchk(cudaMemcpy(d_auxboundary, h_auxboundary, inGraph->nBoundary*sizeof(unsigned int), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(&outGraph->boundary, d_auxboundary, sizeof(unsigned int *), cudaMemcpyDeviceToDevice));


    //Create nodes 
    Node * auxnode;
    gpuErrchk(cudaMalloc((void**)&auxnode, inGraph->nNode*sizeof(Node)));

    // Crate auxiliary pointers to grab them from host and pass them to device
    float ** d_position, ** h_position;
    d_position = static_cast<float **>(malloc(inGraph->nNode*sizeof(float*)));
    h_position = static_cast<float **>(malloc(inGraph->nNode*sizeof(float*)));

    for (int i = 0; i < inGraph->nNode; i++){

        // Positions
        h_position[i] = inGraph->node[i].position;
        gpuErrchk(cudaMalloc((void**)&d_position[i], 3 * sizeof(float)));
        gpuErrchk(cudaMemcpy(d_position[i], h_position[i], 3 * sizeof(float), cudaMemcpyHostToDevice));
        gpuErrchk(cudaMemcpy(&auxnode[i].position, d_position[i], sizeof(float *), cudaMemcpyDeviceToDevice));

    }
    ///////////////////////////////////////////////////////////////////////////////////////////////////////////
    ///////////////////////////////////////////////////////////////////////////////////////////////////////////
    ////////////// If I comment the following section, nBoundary can be read by the kernel
    ///////////////////////////////////////////////////////////////////////////////////////////////////////////
    ///////////////////////////////////////////////////////////////////////////////////////////////////////////

    gpuErrchk(cudaMemcpy(&outGraph->node, auxnode, inGraph->nNode*sizeof(Node *), cudaMemcpyDeviceToDevice));



    return outGraph;
}

最佳答案

问题出在函数 cudaGraphMalloc 中，您尝试将设备内存分配给已在设备上分配的 outGraph 成员。在此过程中，您取消引用主机上的设备指针，这是非法的。

要将设备内存分配给设备上存在的struct类型变量的成员，我们首先必须创建该struct类型的临时主机变量，然后分配设备内存到其成员，然后将其复制到设备上存在的结构体中。

我回答过类似的问题here 。请看一下。

固定代码可能如下所示:

#include <algorithm>
#include <cuda_runtime.h>
#include <cuda.h>

// A point, part of some elements
struct Node {

    float* position;

};

struct Graph {
    unsigned int nNode;
    Node* node;
    unsigned int nBoundary;
    unsigned int* boundary;
};
Graph* cudaGraphMalloc(const Graph* inGraph);
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

__global__ void testKernel(Graph* graph, unsigned int * d_res) {
    d_res[0] = graph->nBoundary;

};
int main()
{

    // Generate some fake data on the CPU
    Graph graph;
    graph.node = (Node*)malloc(2 * sizeof(Node));
    graph.boundary = (unsigned int*)malloc(3 * sizeof(unsigned int));
    for (int i = 0; i < 3; i++) {
        graph.boundary[i] = i + 10;
    }
    graph.nBoundary = 3;
    graph.nNode = 2;
    for (int i = 0; i < 2; i++) {
        // They can have different sizes in the original code
        graph.node[i].position = (float*)malloc(3 * sizeof(float));
        graph.node[i].position[0] = 45;
        graph.node[i].position[1] = 1;
        graph.node[i].position[2] = 2;
    }

    // allocate GPU memory
    Graph * d_graph = cudaGraphMalloc(&graph);
    // some dummy variables to test on GPU.
    unsigned int * d_res, *h_res;
    cudaMalloc((void **)&d_res, sizeof(unsigned int));
    h_res = (unsigned int*)malloc(sizeof(unsigned int));

    //Run kernel
    testKernel << <1, 1 >> >(d_graph, d_res);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaMemcpy(h_res, d_res, sizeof(unsigned int), cudaMemcpyDeviceToHost));

    printf("%u\n", graph.nBoundary);
    printf("%u\n", h_res[0]);

    return 0;
}

Graph* cudaGraphMalloc(const Graph* inGraph) 
{
    //Create auxiliary Graph variable on host
    Graph temp;

    //copy constants
    temp.nNode = inGraph->nNode;
    temp.nBoundary = inGraph->nBoundary;

    // copy boundary
    gpuErrchk(cudaMalloc((void**)&(temp.boundary), inGraph->nBoundary * sizeof(unsigned int)));
    gpuErrchk(cudaMemcpy(temp.boundary, inGraph->boundary, inGraph->nBoundary * sizeof(unsigned int), cudaMemcpyHostToDevice));


    //Create nodes 
    size_t nodeBytesTotal = temp.nNode * sizeof(Node);
    gpuErrchk(cudaMalloc((void**)&(temp.node), nodeBytesTotal));

    for (int i = 0; i < temp.nNode; i++)
    {
        //Create auxiliary node on host
        Node auxNodeHost;

        //Allocate device memory to position member of auxillary node
        size_t nodeBytes = 3 * sizeof(float);
        gpuErrchk(cudaMalloc((void**)&(auxNodeHost.position), nodeBytes));
        gpuErrchk(cudaMemcpy(auxNodeHost.position, inGraph->node[i].position, nodeBytes, cudaMemcpyHostToDevice));

        //Copy auxillary host node to device
        Node* dPtr = temp.node + i;
        gpuErrchk(cudaMemcpy(dPtr, &auxNodeHost, sizeof(Node), cudaMemcpyHostToDevice));
    }


    Graph* outGraph;
    gpuErrchk(cudaMalloc((void**)&outGraph, sizeof(Graph)));
    gpuErrchk(cudaMemcpy(outGraph, &temp, sizeof(Graph), cudaMemcpyHostToDevice));

    return outGraph;
}

请注意，您必须保留内部设备指针的主机拷贝(即辅助主机变量)。这是因为您稍后必须释放设备内存，并且主代码中只有 Graph 的设备拷贝，因此您将无法从主机访问其成员来调用cudaFree 。在本例中，变量 Node auxNodeHost (在每次迭代中创建)和 Graph temp 就是这些变量。

上面的代码并未执行此操作，仅用于演示目的。

在 Windows 10、Visual Studio 2015、CUDA 9.2、NVIDIA 驱动程序 397.44 上进行测试。

关于c++ - 在cuda中分配结构数组后变量丢失，我们在Stack Overflow上找到一个类似的问题： https://stackoverflow.com/questions/55682899/

c++ - 在cuda中分配结构数组后变量丢失

上一篇：c - 如何读取文件直到遇到换行符

下一篇：c - 为什么 getchar() 在 scanf() 之后不等我按 Enter？