c++ - Solving a sparse positive definite linear system in CUDA

Tags: c++ cuda linear-algebra solver cusolver

We are having trouble with cuSOLVER's cusolverSpScsrlsvchol function, probably due to a misunderstanding of the cuSOLVER library.

Motivation: we are solving the Poisson equation -div grad x = b on a rectangular grid. In 2 dimensions with the 5-point stencil (1, 1, -4, 1, 1), the Laplacian on the grid provides a (fairly sparse) matrix A. In addition, the charge distribution on the grid gives a (dense) vector b. A is positive definite and symmetric.
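
At an interior grid node (i, j), with the grid spacing absorbed into b, this stencil gives the discrete equation

-x_{i-1,j} - x_{i,j-1} + 4 x_{i,j} - x_{i,j+1} - x_{i+1,j} = b_{i,j}

which is exactly the -laplace branch assembled in the code below (center 4, four neighbors -1).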

Now we use NVIDIA's new cuSOLVER library, which ships with CUDA 7.0, to solve A * x = b for x. It provides the function cusolverSpScsrlsvchol, which should perform a sparse Cholesky factorization for floats.
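
For reference, its documented signature looks like this (parameter names here follow our code rather than the docs):

cusolverStatus_t
cusolverSpScsrlsvchol(cusolverSpHandle_t handle, int m, int nnz,
                      const cusparseMatDescr_t descrA,
                      const float *csrVal, const int *csrRowPtr,
                      const int *csrColInd, const float *b,
                      float tol, int reorder, float *x, int *singularity);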

Note: we are able to solve the system correctly with the alternative sparse QR factorization function cusolverSpScsrlsvqr. For a 4 x 4 grid with all b entries on the boundary equal to 1 and the rest 0, we get for x:

1 1 0.999999 1 1 1 0.999999 1 1 1 1 1 1 1 1 1 

Our problems:

  1. cusolverSpScsrlsvchol returns wrong results for x:

    1 3.33333 2.33333 1 3.33333 2.33333 1.33333 1 2.33333 1.33333 0.666667 1 1 1 1 1
    
  2. (solved, see the answer below) Converting the CSR matrix A to a dense matrix and printing it gives weird numbers (1e-44 and the like). The corresponding data in CSR format is correct and was verified with Python numpy.

  3. (solved, see the answer below) We cannot even find the alternative sparse LU with partial pivoting, cusolverSpScsrlsvlu (see the sketch after this list):

    $ nvcc -std=c++11 cusparse_test3.cu -o cusparse_test3 -lcusparse -lcusolver
    cusparse_test3.cu(208): error: identifier "cusolverSpScsrlsvlu" is undefined
    
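Regarding point 3: the cuSOLVER documentation remarks that for csrlsvlu only the CPU (host) path is provided, so the symbol that actually exists is cusolverSpScsrlsvluHost, which takes host pointers. A minimal sketch, assuming the CSR arrays are available on the host (our code below builds csrRowPtr on the device, so it would first have to be copied back):

// Sketch only: cusolverSpScsrlsvluHost expects *host* pointers, unlike the
// device-pointer routines used elsewhere in this post.
int singularity = 0;
cusolverStatus_t status = cusolverSpScsrlsvluHost(
    cusolver_handle, m, nnz, descrA,
    csrVal.data(), csrRowPtr.data(), csrColInd.data(),
    b.data(), tol, reorder, x.data(), &singularity);
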

What are we doing wrong? Thanks for your help!

Our C++ CUDA code:

#include <iostream>
#include <cuda_runtime.h>
#include <cuda.h>
#include <cusolverSp.h>
#include <cusparse.h>
#include <vector>
#include <cassert>


// create poisson matrix with Dirichlet bc. of a rectangular grid with
// dimension NxN
void assemble_poisson_matrix_coo(std::vector<float>& vals, std::vector<int>& row, std::vector<int>& col,
                     std::vector<float>& rhs, int Nrows, int Ncols) {

    // nnz: 5 entries per row (node) for nodes in the interior,
    // 1 entry per row (node) for nodes on the boundary, since we set them explicitly to 1.
    int nnz = 5*Nrows*Ncols - (2*(Ncols-1) + 2*(Nrows-1))*4;
    vals.resize(nnz);
    row.resize(nnz);
    col.resize(nnz);
    rhs.resize(Nrows*Ncols);

    int counter = 0;
    for(int i = 0; i < Nrows; ++i) {
        for (int j = 0; j < Ncols; ++j) {
            int idx = j + Ncols*i;
            if (i == 0 || j == 0 || j == Ncols-1 || i == Nrows-1) {
                vals[counter] = 1.;
                row[counter] = idx;
                col[counter] = idx;
                counter++;
                rhs[idx] = 1.;
//                if (i == 0) {
//                    rhs[idx] = 3.;
//                }
            } else { // -laplace stencil
                // above
                vals[counter] = -1.;
                row[counter] = idx;
                col[counter] = idx-Ncols;
                counter++;
                // left
                vals[counter] = -1.;
                row[counter] = idx;
                col[counter] = idx-1;
                counter++;
                // center
                vals[counter] = 4.;
                row[counter] = idx;
                col[counter] = idx;
                counter++;
                // right
                vals[counter] = -1.;
                row[counter] = idx;
                col[counter] = idx+1;
                counter++;
                // below
                vals[counter] = -1.;
                row[counter] = idx;
                col[counter] = idx+Ncols;
                counter++;

                rhs[idx] = 0;
            }
        }
    }
    assert(counter == nnz);
}



int main() {
    // --- create library handles:
    cusolverSpHandle_t cusolver_handle;
    cusolverStatus_t cusolver_status;
    cusolver_status = cusolverSpCreate(&cusolver_handle);
    std::cout << "status create cusolver handle: " << cusolver_status << std::endl;

    cusparseHandle_t cusparse_handle;
    cusparseStatus_t cusparse_status;
    cusparse_status = cusparseCreate(&cusparse_handle);
    std::cout << "status create cusparse handle: " << cusparse_status << std::endl;

    // --- prepare matrix:
    int Nrows = 4;
    int Ncols = 4;
    std::vector<float> csrVal;
    std::vector<int> cooRow;
    std::vector<int> csrColInd;
    std::vector<float> b;

    assemble_poisson_matrix_coo(csrVal, cooRow, csrColInd, b, Nrows, Ncols);

    int nnz = csrVal.size();
    int m = Nrows * Ncols;
    std::vector<int> csrRowPtr(m+1);

    // --- prepare solving and copy to GPU:
    std::vector<float> x(m);
    float tol = 1e-5;
    int reorder = 0;
    int singularity = 0;

    float *db, *dcsrVal, *dx;
    int *dcsrColInd, *dcsrRowPtr, *dcooRow;
    cudaMalloc((void**)&db, m*sizeof(float));
    cudaMalloc((void**)&dx, m*sizeof(float));
    cudaMalloc((void**)&dcsrVal, nnz*sizeof(float));
    cudaMalloc((void**)&dcsrColInd, nnz*sizeof(int));
    cudaMalloc((void**)&dcsrRowPtr, (m+1)*sizeof(int));
    cudaMalloc((void**)&dcooRow, nnz*sizeof(int));

    cudaMemcpy(db, b.data(), b.size()*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(dcsrVal, csrVal.data(), csrVal.size()*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(dcsrColInd, csrColInd.data(), csrColInd.size()*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(dcooRow, cooRow.data(), cooRow.size()*sizeof(int), cudaMemcpyHostToDevice);

    cusparse_status = cusparseXcoo2csr(cusparse_handle, dcooRow, nnz, m,
                                       dcsrRowPtr, CUSPARSE_INDEX_BASE_ZERO);
    std::cout << "status cusparse coo2csr conversion: " << cusparse_status << std::endl;

    cudaDeviceSynchronize(); // matrix format conversion has to be finished!

    // --- everything ready for computation:

    cusparseMatDescr_t descrA;

    cusparse_status = cusparseCreateMatDescr(&descrA);
    std::cout << "status cusparse createMatDescr: " << cusparse_status << std::endl;

    // optional: print the dense matrix allocated on the GPU
    // (cusparseScsr2dense fills dA in column-major order, so this loop actually
    // prints the transpose; for judging symmetry this makes no difference)

    std::vector<float> A(m*m, 0);
    float *dA;
    cudaMalloc((void**)&dA, A.size()*sizeof(float));

    cusparseScsr2dense(cusparse_handle, m, m, descrA, dcsrVal,
                       dcsrRowPtr, dcsrColInd, dA, m);

    cudaMemcpy(A.data(), dA, A.size()*sizeof(float), cudaMemcpyDeviceToHost);
    std::cout << "A: \n";
    for (int i = 0; i < m; ++i) {
        for (int j = 0; j < m; ++j) {
            std::cout << A[i*m + j] << " ";
        }
        std::cout << std::endl;
    }

    cudaFree(dA);

    std::cout << "b: \n";
    cudaMemcpy(b.data(), db, m*sizeof(float), cudaMemcpyDeviceToHost);
    for (auto a : b) {
        std::cout << a << ",";
    }
    std::cout << std::endl;


    // --- solving!!!!

//    cusolver_status = cusolverSpScsrlsvchol(cusolver_handle, m, nnz, descrA, dcsrVal,
//                       dcsrRowPtr, dcsrColInd, db, tol, reorder, dx,
//                       &singularity);

     cusolver_status = cusolverSpScsrlsvqr(cusolver_handle, m, nnz, descrA, dcsrVal,
                        dcsrRowPtr, dcsrColInd, db, tol, reorder, dx,
                        &singularity);

    cudaDeviceSynchronize();

    std::cout << "singularity (should be -1): " << singularity << std::endl;

    std::cout << "status cusolver solving (!): " << cusolver_status << std::endl;

    cudaMemcpy(x.data(), dx, m*sizeof(float), cudaMemcpyDeviceToHost);

    // relocated these 2 lines from above to solve (2):
    cusparse_status = cusparseDestroy(cusparse_handle);
    std::cout << "status destroy cusparse handle: " << cusparse_status << std::endl;

    cusolver_status = cusolverSpDestroy(cusolver_handle);
    std::cout << "status destroy cusolver handle: " << cusolver_status << std::endl;

    for (auto a : x) {
        std::cout << a << " ";
    }
    std::cout << std::endl;



    cudaFree(db);
    cudaFree(dx);
    cudaFree(dcsrVal);
    cudaFree(dcsrColInd);
    cudaFree(dcsrRowPtr);
    cudaFree(dcooRow);

    return 0;
}

Accepted answer

1. cusolverSpScsrlsvchol returns wrong results for x: 1 3.33333 2.33333 1 3.33333 2.33333 1.33333 1 2.33333 1.33333 0.666667 1 1 1 1 1

You said:

A is positive definite and symmetric.

No, it is not. It is not symmetric.

cusolverSpScsrlsvqr() does not require the A matrix to be symmetric.

cusolverSpScsrlsvchol() does have that requirement:

A is an m×m symmetric positive definite sparse matrix

Here is the printout of the A matrix that your code provides:

A:
1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
0  1  0  0  0 -1  0  0  0  0  0  0  0  0  0  0
0  0  1  0  0  0 -1  0  0  0  0  0  0  0  0  0
0  0  0  1  0  0  0  0  0  0  0  0  0  0  0  0
0  0  0  0  1 -1  0  0  0  0  0  0  0  0  0  0
0  0  0  0  0  4 -1  0  0 -1  0  0  0  0  0  0
0  0  0  0  0 -1  4  0  0  0 -1  0  0  0  0  0
0  0  0  0  0  0 -1  1  0  0  0  0  0  0  0  0
0  0  0  0  0  0  0  0  1 -1  0  0  0  0  0  0
0  0  0  0  0 -1  0  0  0  4 -1  0  0  0  0  0
0  0  0  0  0  0 -1  0  0 -1  4  0  0  0  0  0
0  0  0  0  0  0  0  0  0  0 -1  1  0  0  0  0
0  0  0  0  0  0  0  0  0  0  0  0  1  0  0  0
0  0  0  0  0  0  0  0  0 -1  0  0  0  1  0  0
0  0  0  0  0  0  0  0  0  0 -1  0  0  0  1  0
0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  1 

If that were symmetric, I would expect the second row:

0 1 0 0 0 -1 0 0 0 0 0 0 0 0 0 0

to match the second column:

0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
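
A quick host-side check makes this easy to catch before handing the matrix to a Cholesky solver. A minimal sketch, assuming the CSR arrays (rowPtr, colInd, val) are available on the host:

#include <cmath>
#include <vector>

// For every stored entry A(r, c), look up A(c, r) in the CSR structure and
// compare. A missing transpose entry is treated as 0 and fails the check.
bool csr_is_symmetric(int m, const std::vector<int>& rowPtr,
                      const std::vector<int>& colInd,
                      const std::vector<float>& val) {
    for (int r = 0; r < m; ++r) {
        for (int k = rowPtr[r]; k < rowPtr[r+1]; ++k) {
            int c = colInd[k];
            float a_cr = 0.f;
            for (int l = rowPtr[c]; l < rowPtr[c+1]; ++l)
                if (colInd[l] == r) { a_cr = val[l]; break; }
            if (std::fabs(val[k] - a_cr) > 1e-6f) return false;
        }
    }
    return true;
}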

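One standard way to obtain a genuinely symmetric system here (a sketch of one possible fix, not part of the original answer) is to eliminate the Dirichlet nodes during assembly: keep the identity rows for boundary nodes, but drop the -1 couplings from interior rows to boundary nodes and move the known boundary values (all 1 in this setup) to the right-hand side. The result is symmetric positive definite, so cusolverSpScsrlsvchol applies:

// Sketch: symmetric assembly via elimination of Dirichlet boundary nodes.
// Boundary rows stay pure identity rows; interior rows couple only to other
// interior nodes, and each coupling to a boundary node is moved to the RHS.
// Uses push_back instead of a precomputed nnz; nnz = vals.size() afterwards.
void assemble_poisson_matrix_coo_sym(std::vector<float>& vals, std::vector<int>& row,
                                     std::vector<int>& col, std::vector<float>& rhs,
                                     int Nrows, int Ncols) {
    auto on_boundary = [&](int i, int j) {
        return i == 0 || j == 0 || i == Nrows-1 || j == Ncols-1;
    };
    vals.clear(); row.clear(); col.clear();
    rhs.assign(Nrows*Ncols, 0.f);
    for (int i = 0; i < Nrows; ++i) {
        for (int j = 0; j < Ncols; ++j) {
            int idx = j + Ncols*i;
            if (on_boundary(i, j)) { // fixed value 1, identity row
                vals.push_back(1.f); row.push_back(idx); col.push_back(idx);
                rhs[idx] = 1.f;
                continue;
            }
            // columns stay sorted within the row: above, left, center, right, below
            const int offs[5] = {-Ncols, -1, 0, 1, Ncols};
            for (int k = 0; k < 5; ++k) {
                int nidx = idx + offs[k];
                int ni = nidx / Ncols, nj = nidx % Ncols;
                if (offs[k] != 0 && on_boundary(ni, nj)) {
                    rhs[idx] += 1.f; // known boundary value x = 1, moved to the RHS
                } else {
                    vals.push_back(offs[k] == 0 ? 4.f : -1.f);
                    row.push_back(idx); col.push_back(nidx);
                }
            }
        }
    }
}

With the boundary data above, the exact solution of this symmetrized system is x = 1 everywhere, which should match the QR result.
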
Incidentally, some advice about Stack Overflow: if you answer your own question, my suggestion is that you make it a complete answer. Some people may see an answered question and skip it. It would probably have been better to edit such content into your question, thereby focusing the question (in my view) on a single issue. SO also does not work as well, in my opinion, when you ask several questions in one post. That makes the question unnecessarily harder to answer, and I don't think it served you well here.

The original question and answer can be found on Stack Overflow: https://stackoverflow.com/questions/30454089/
