algorithm - QR 分解求解 CUDA 中的线性系统

标签 algorithm cuda gpgpu gpu

我正在写一个GPU上的图像恢复算法,详情在

Cuda: least square solving , poor in speed

求解线性系统的QR分解法

Ax=b  

工作原理如下

min||Ax-b|| ---> ||QRx-b||  ---> ||(Q^T)QRx-(Q^T)b|| ---> ||Rx-(Q^T)b||

其中 R 是上三角矩阵。由此产生的上三角线性系统很容易求解。

我想用CULA工具来实现这个方法。 CULA 例程 GEQRF 计算 QR 因式分解。手册说:

On exit, the elements on and above the diagonal of the array contain the min(M,N)-by-N upper trapezoidal matrix R (R is upper triangular if m >= n); the elements below the diagonal, with the array TAU, represent the orthogonal/unitary matrix Q as a product of min(m,n) elementary reflectors.

我无法弄清楚 Q 的存储位置,算法对我来说似乎太复杂了。你能给点建议吗?

谢谢!

最佳答案

截至 2015 年 2 月,CUDA 7.0(现在为候选版本)提供了新的 cuSOLVER 库,包括计算矩阵的 QR 分解的可能性。这与 cuBLAS 库结合使用,能够根据 cuSOLVER 用户指南附录 C 中阐述的指南求解线性系统。

您必须遵循三个步骤:

1) geqrf:它通过返回A的上三角部分的上三角矩阵R来计算矩阵的QR分解和矩阵 Q 以 Householder 向量的形式存储在 A 的下三角部分,而 Householder 向量的比例因子由 TAU< 返回 参数;

2) ormqr:它通过覆盖C返回Q和矩阵C的乘积;

3) trsm:求解上三角线性系统。

下面,我将提供这些例程用法的完整示例。

#include "cuda_runtime.h"
#include "device_launch_paraMeters.h"

#include<iostream>
#include<fstream>
#include<iomanip>
#include<stdlib.h>
#include<stdio.h>
#include<assert.h>

#include <cusolverDn.h>
#include <cublas_v2.h>
#include <cuda_runtime_api.h>

#include "Utilities.cuh"
#include "TimingGPU.cuh"

#define BLOCK_SIZE 32

#define prec_save 10

/***************/
/* COPY KERNEL */
/***************/
__global__ void copy_kernel(const double * __restrict d_in, double * __restrict d_out, const int M, const int N) {

    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    const int j = blockIdx.y * blockDim.y + threadIdx.y;

    if ((i < N) && (j < N)) d_out[j * N + i] = d_in[j * M + i];
}

/****************************************************/
/* LOAD INDIVIDUAL REAL MATRIX FROM txt FILE TO CPU */
/****************************************************/
// --- Load individual real matrix from txt file
template <class T>
void loadCPUrealtxt(T * __restrict h_out, const char *filename, const int M) {

    std::ifstream infile;
    infile.open(filename);
    for (int i = 0; i < M; i++) {
        double temp;
        infile >> temp;
        h_out[i] = (T)temp;
    }

    infile.close();

}

/************************************/
/* SAVE REAL ARRAY FROM GPU TO FILE */
/************************************/
template <class T>
void saveGPUrealtxt(const T * d_in, const char *filename, const int M) {

    T *h_in = (T *)malloc(M * sizeof(T));

    gpuErrchk(cudaMemcpy(h_in, d_in, M * sizeof(T), cudaMemcpyDeviceToHost));

    std::ofstream outfile;
    outfile.open(filename);
    for (int i = 0; i < M; i++) outfile << std::setprecision(prec_save) << h_in[i] << "\n";
    outfile.close();

}

/********/
/* MAIN */
/********/
int main(){

    // --- Extension of Appendix C.1 of cuSOLVER library User's Guide
    // --- See also http://www.netlib.org/lapack/lug/node40.html

    // --- ASSUMPTION Nrows >= Ncols
    const int Nrows = 500;
    const int Ncols = 500;

    TimingGPU timerGPU;
    double timingQR, timingSolve;

    // --- cuSOLVE input/output parameters/arrays
    int work_size = 0;
    int *devInfo;           gpuErrchk(cudaMalloc(&devInfo, sizeof(int)));

    // --- CUDA solver initialization
    cusolverDnHandle_t solver_handle;
    cusolveSafeCall(cusolverDnCreate(&solver_handle));

    // --- CUBLAS initialization
    cublasHandle_t cublas_handle;
    cublasSafeCall(cublasCreate(&cublas_handle));

    /***********************/
    /* SETTING THE PROBLEM */
    /***********************/
    // --- Setting the host, Nrows x Ncols matrix
    double *h_A = (double *)malloc(Nrows * Ncols * sizeof(double));
    loadCPUrealtxt(h_A, "D:\\Project\\solveNonSquareLinearSystemQRCUDA\\solveNonSquareLinearSystemQRCUDA\\testMatrix.txt", Nrows * Ncols);

    // --- Setting the device matrix and moving the host matrix to the device
    double *d_A;            gpuErrchk(cudaMalloc(&d_A, Nrows * Ncols * sizeof(double)));
    gpuErrchk(cudaMemcpy(d_A, h_A, Nrows * Ncols * sizeof(double), cudaMemcpyHostToDevice));

    // --- Initializing the data matrix C (Of course, this step could be done by a kernel function directly on the device).
    // --- Notice that, in this case, only the first column of C contains actual data, the others being empty (zeroed). However, cuBLAS trsm
    //     has the capability of solving triangular linear systems with multiple right hand sides.
    double *h_C = (double *)calloc(Nrows * Nrows, sizeof(double));
    loadCPUrealtxt(h_C, "D:\\Project\\solveNonSquareLinearSystemQRCUDA\\solveNonSquareLinearSystemQRCUDA\\testVector.txt", Nrows);

    double *d_C;            gpuErrchk(cudaMalloc(&d_C, Nrows * Nrows * sizeof(double)));
    gpuErrchk(cudaMemcpy(d_C, h_C, Nrows * Nrows * sizeof(double), cudaMemcpyHostToDevice));

    /**********************************/
    /* COMPUTING THE QR DECOMPOSITION */
    /**********************************/
    timerGPU.StartCounter();

    // --- CUDA QR GEQRF preliminary operations
    double *d_TAU;      gpuErrchk(cudaMalloc((void**)&d_TAU, min(Nrows, Ncols) * sizeof(double)));
    cusolveSafeCall(cusolverDnDgeqrf_bufferSize(solver_handle, Nrows, Ncols, d_A, Nrows, &work_size));
    double *work;   gpuErrchk(cudaMalloc(&work, work_size * sizeof(double)));

    // --- CUDA GEQRF execution: The matrix R is overwritten in upper triangular part of A, including diagonal 
    //     elements. The matrix Q is not formed explicitly, instead, a sequence of householder vectors are
    //     stored in lower triangular part of A.
    cusolveSafeCall(cusolverDnDgeqrf(solver_handle, Nrows, Ncols, d_A, Nrows, d_TAU, work, work_size, devInfo));
    int devInfo_h = 0;  gpuErrchk(cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost));
    if (devInfo_h != 0) std::cout << "Unsuccessful gerf execution\n\n";

    timingQR = timerGPU.GetCounter();
    printf("Timing for QR calculation = %f [ms]\n", timingQR);

    /*****************************/
    /* SOLVING THE LINEAR SYSTEM */
    /*****************************/
    timerGPU.StartCounter();

    // --- CUDA ORMQR execution: Computes the multiplication Q^T * C and stores it in d_C
    cusolveSafeCall(cusolverDnDormqr(solver_handle, CUBLAS_SIDE_LEFT, CUBLAS_OP_T, Nrows, Ncols, min(Nrows, Ncols), d_A, Nrows, d_TAU, d_C, Nrows, work, work_size, devInfo));

    // --- Reducing the linear system size
    double *d_R; gpuErrchk(cudaMalloc(&d_R, Ncols * Ncols * sizeof(double)));
    double *d_B; gpuErrchk(cudaMalloc(&d_B, Ncols * sizeof(double)));
    dim3 Grid(iDivUp(Ncols, BLOCK_SIZE), iDivUp(Ncols, BLOCK_SIZE));
    dim3 Block(BLOCK_SIZE, BLOCK_SIZE);
    copy_kernel << <Grid, Block >> >(d_A, d_R, Nrows, Ncols);
    gpuErrchk(cudaMemcpy(d_B, d_C, Ncols * sizeof(double), cudaMemcpyDeviceToDevice));

    // --- Solving an upper triangular linear system - compute x = R \ Q^T * B
    const double alpha = 1.;
    cublasSafeCall(cublasDtrsm(cublas_handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N,
        CUBLAS_DIAG_NON_UNIT, Ncols, 1, &alpha, d_R, Ncols, d_B, Ncols));

    timingSolve = timerGPU.GetCounter();
    printf("Timing for solution of the linear system = %f [ms]\n", timingSolve);
    printf("Overall timing = %f [ms]\n", timingQR + timingSolve);

    /************************/
    /* CHECKING THE RESULTS */
    /************************/
    // --- The upper triangular part of A contains the elements of R. Showing this.
    saveGPUrealtxt(d_A, "D:\\Project\\solveNonSquareLinearSystemQRCUDA\\solveNonSquareLinearSystemQRCUDA\\d_R.txt", Nrows * Ncols);

    // --- The first Nrows elements of d_C contain the result of Q^T * C
    saveGPUrealtxt(d_C, "D:\\Project\\solveNonSquareLinearSystemQRCUDA\\solveNonSquareLinearSystemQRCUDA\\d_QTC.txt", Nrows);

    // --- Initializing the output Q matrix (Of course, this step could be done by a kernel function directly on the device)
    double *h_Q = (double *)malloc(Nrows * Nrows * sizeof(double));
    for (int j = 0; j < Nrows; j++)
        for (int i = 0; i < Nrows; i++)
            if (j == i) h_Q[j + i*Nrows] = 1.;
            else        h_Q[j + i*Nrows] = 0.;

    double *d_Q;            gpuErrchk(cudaMalloc(&d_Q, Nrows * Nrows * sizeof(double)));
    gpuErrchk(cudaMemcpy(d_Q, h_Q, Nrows * Nrows * sizeof(double), cudaMemcpyHostToDevice));

    // --- Calculation of the Q matrix
    cusolveSafeCall(cusolverDnDormqr(solver_handle, CUBLAS_SIDE_LEFT, CUBLAS_OP_N, Nrows, Ncols, min(Nrows, Ncols), d_A, Nrows, d_TAU, d_Q, Nrows, work, work_size, devInfo));

    // --- d_Q contains the elements of Q. Showing this.
    saveGPUrealtxt(d_Q, "D:\\Project\\solveNonSquareLinearSystemQRCUDA\\solveNonSquareLinearSystemQRCUDA\\d_Q.txt", Nrows * Nrows);

    // --- At this point, d_C contains the elements of Q^T * C, where C is the data vector. Showing this.
    // --- According to the above, only the first column of d_C makes sense.
    //gpuErrchk(cudaMemcpy(h_C, d_C, Nrows * Nrows * sizeof(double), cudaMemcpyDeviceToHost));
    //printf("\n\n");
    //for (int j = 0; j < Nrows; j++)
    //  for (int i = 0; i < Nrows; i++)
    //      printf("C[%i, %i] = %f\n", j, i, h_C[j + i*Nrows]);

    // --- Check final result
    saveGPUrealtxt(d_B, "D:\\Project\\solveNonSquareLinearSystemQRCUDA\\solveNonSquareLinearSystemQRCUDA\\d_B.txt", Ncols);

    cusolveSafeCall(cusolverDnDestroy(solver_handle));

    return 0;
}

运行此类示例所需的 Utilities.cuUtilities.cuh 文件在 github page 中维护。 . TimingGPU.cuTimingGPU.cuh 文件保存在 github page 中。 .

可以通过以下Matlab代码生成数据并检查结果:

clear all
close all
clc

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% GENERATE RANDOM NON-SQUARE MATRIX WITH DESIRED CONDITION NUMBER %
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% --- Credit to https://math.stackexchange.com/questions/198515/can-we-generate-random-singular-matrices-with-desired-condition-number-using-mat
Nrows = 500;                % --- Number of rows
Ncols = 500;                % --- Number of columns
% condNumber = 10 * sqrt(2);  % --- Desired condition number
% A = randn(Nrows, Ncols);
% [U, S, V] = svd(A);
% S(S~=0) = linspace(condNumber, 1, min(Nrows, Ncols));
% A = U * S * V';

% --- Setting the problem solution
x = ones(Ncols, 1);

% y = A * x;
% 
% Asave = reshape(A, Nrows * Ncols, 1);
% save testMatrix.txt Asave -ascii -double
% save testVector.txt y -ascii -double

load testMatrix.txt
load testVector.txt
A = reshape(testMatrix, Nrows, Ncols);
y = testVector;

[Q, R] = qr(A);

xMatlab = R \ (Q.' * y);

fprintf('Percentage rms of solution in Matlab %f\n', 100 * sqrt(sum(sum(abs(xMatlab - x).^2)) / sum(sum(abs(x).^2))));

fprintf('Percentage rms of Q * R - A %f\n', 100 * sqrt(sum(sum(abs(Q * R - A).^2)) / sum(sum(abs(A).^2))));

load d_R.txt
d_R = reshape(d_R, Nrows, Ncols);

d_R = d_R(1 : Ncols, :);
R   = R(1 : Ncols, :);

fprintf('Percentage rms of matrix R between Matlab and CUDA %f\n', 100 * sqrt(sum(sum(abs(triu(R) - triu(d_R)).^2)) / sum(sum(abs(triu(d_R)).^2))));

load d_QTC.txt
fprintf('Percentage rms of Q^T * y - d_QTC %f\n', 100 * sqrt(sum(sum(abs(Q.' * y - d_QTC).^2)) / sum(sum(abs(d_QTC).^2))));

load d_B.txt
fprintf('Percentage rms of solution in Matlab %f\n', 100 * sqrt(sum(sum(abs(d_B - x).^2)) / sum(sum(abs(x).^2))));

请根据需要评论/取消评论行。

时间

计时(以毫秒为单位)(在 GTX960 卡上执行的测试,cc. 5.2):

Size         QR decomposition       Solving system       Overall
100x100      0.89                   1.41                 2.30
200x200      5.97                   3.23                 9.20
500x500      17.08                  21.6                 38.7

关于algorithm - QR 分解求解 CUDA 中的线性系统,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/22399794/

相关文章:

java - 如何在 Big-O(N) 时间内将 3 个排序数组合并为 1 个排序数组?

C++-amp 库和软件要求

c++ - 我是否需要在 CUDA 中跨多个 GPU 镜像输入缓冲区/纹理?

python - PyCUDA NPP 库兼容性

c - 在不同范围内的Cuda内核中生成随机数

python - 安装了 beignet 以在 Intel 上使用 OpenCL,但 OpenCL 程序仅在以 root 身份运行时才有效

algorithm - 霍夫曼算法逆匹配

python - 查找所有点对之间的欧氏距离

java - 我怎样才能影响 minimax 算法更喜欢立即奖励?

cuda - Udacity 并行编程,未指定的启动失败 cudaGetLastError()