cuda - 有没有办法在cuBLAS中执行 "saypx"？

cublasSaxpy 计算 y' = a * x + y，其中 x 和 y 是向量，a 是标量。

事实证明我需要计算 y' = a * y + x。我不知道如何扭转 cuBLAS 库来做到这一点。

(当然，我可以计算 y' = a * y，然后 y' = y' + x，但是在这种情况下 y' 被读取得太频繁。我可以编写自己的 CUDA 代码来做到这一点，但是那么它可能不会像 cuBLAS 代码那么快。我只是很惊讶没有明显的方法直接执行“saypx”。)

[已添加] Intel 版本的 cblas 中有类似于“saxpby”的函数，它可以满足我的需要。但奇怪的是，这不在 cuBLAS 中。

[添加#2]看起来我可以使用 cudnnAddTensor 函数，并对描述符进行一些别名(我有一个指向张量的 FilterDescriptor，AddTensor 不会接受它，但我应该能够为 TensorDescriptor 指定别名相同的内存和形状。)

最佳答案

据我所知，在 CUBLAS 或标准 BLAS 中都没有办法完成您所要求的操作。您在 MKL 中发现的是 Intel 添加的扩展，但我不记得在其他主机和加速器 BLAS 实现中看到过类似的东西。

好消息是，您的断言“我可以编写自己的 CUDA 代码来完成此操作，但它可能不会像 cuBLAS 代码那么快”，这是不正确的，至少对于像 saxpy 这样微不足道的操作来说是这样。即使是 saxpy 的简单实现也会非常接近 CUBLAS，因为实际上没有那么多需要读取两个数组、执行 FMAD 并写回结果。只要内存合并正确，编写高性能代码就非常简单。例如:

#include <vector>
#include <algorithm>
#include <cassert>
#include <iostream>
#include <cmath>

#include "cublas_v2.h"

typedef enum
{ 
    AXPY = 0,
    AXPBY = 1
} saxpy_op_t;

__device__ __host__ __inline__ 
float axpby_op(float y, float x, float a)
{
    return a * y + x;
}

__device__ __host__ __inline__ 
float axpy_op(float y, float x, float a)
{
    return y + a * x;
}

template<typename T>
class pitched_accessor
{
    T * p;
    size_t pitch;

    public:
    __host__ __device__
    pitched_accessor(T *p_, size_t pitch_) : p(p_), pitch(pitch_) {};

    __host__ __device__
    T& operator[](size_t idx) { return p[pitch*idx]; };

    __host__ __device__ 
    const T& operator[](size_t idx) const { return p[pitch*idx]; };
};


template<saxpy_op_t op>
__global__ 
void saxpy_kernel(pitched_accessor<float> y, pitched_accessor<float> x, 
                  const float a, const unsigned int N1)
{
    unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x;
    unsigned int stride = gridDim.x * blockDim.x;

    #pragma unroll 8
    for(; idx < N1; idx += stride) {
        switch (op) {
            case AXPY:
                y[idx] = axpy_op(y[idx], x[idx], a);
                break;
            case AXPBY:
                y[idx] = axpby_op(y[idx], x[idx], a);
                break;
        }
    }
}

__host__ void saxby(const unsigned int N, const float a, 
                    float *x, int xinc, float *y, int yinc)
{
    int gridsize, blocksize;
    cudaOccupancyMaxPotentialBlockSize(&gridsize, &blocksize, saxpy_kernel<AXPBY>);
    saxpy_kernel<AXPBY><<<gridsize, blocksize>>>(pitched_accessor<float>(y, yinc), 
                                                 pitched_accessor<float>(x, xinc), a, N);
}

__host__ void saxpy(const unsigned int N, const float a, 
                    float *x, int xinc, float *y, int yinc)
{
    int gridsize, blocksize;
    cudaOccupancyMaxPotentialBlockSize(&gridsize, &blocksize, saxpy_kernel<AXPY>);
    saxpy_kernel<AXPY><<<gridsize, blocksize>>>(pitched_accessor<float>(y, yinc), 
                                                pitched_accessor<float>(x, xinc), a, N);
}

void check_result(std::vector<float> &yhat, float result, float tolerance=1e-5f)
{
    auto it = yhat.begin();
    for(; it != yhat.end(); ++it) {
        float err = std::fabs(*it - result);
        assert( err < tolerance ); 
    }
}

int main()
{

    const int N = 1<<22;

    std::vector<float> x_h(N);
    std::vector<float> y_h(N);

    const float a = 2.f, y0 = 1234.f, x0 = 532.f;
    std::fill(y_h.begin(), y_h.end(), y0);
    std::fill(x_h.begin(), x_h.end(), x0);

    float *x_d, *y_d;
    size_t sz = sizeof(float) * size_t(N);
    cudaMalloc((void **)&x_d, sz);
    cudaMalloc((void **)&y_d, sz);

    cudaMemcpy(x_d, &x_h[0], sz, cudaMemcpyHostToDevice);

    {
        cudaMemcpy(y_d, &y_h[0], sz, cudaMemcpyHostToDevice);
        saxby(N, a, x_d, 1, y_d, 1);
        std::vector<float> yhat(N);
        cudaMemcpy(&yhat[0], y_d, sz, cudaMemcpyDeviceToHost);
        check_result(yhat, axpby_op(y0, x0, a));
    }

    {
        cudaMemcpy(y_d, &y_h[0], sz, cudaMemcpyHostToDevice);
        saxpy(N, a, x_d, 1, y_d, 1);
        std::vector<float> yhat(N);
        cudaMemcpy(&yhat[0], y_d, sz, cudaMemcpyDeviceToHost);
        check_result(yhat, axpy_op(y0, x0, a));
    }

    {
        cublasHandle_t handle;
        cublasCreate(&handle);
        cudaMemcpy(y_d, &y_h[0], sz, cudaMemcpyHostToDevice);
        cublasSaxpy(handle, N, &a, x_d, 1, y_d, 1);
        std::vector<float> yhat(N);
        cudaMemcpy(&yhat[0], y_d, sz, cudaMemcpyDeviceToHost);
        check_result(yhat, axpy_op(y0, x0, a));
        cublasDestroy(handle);
    }

    return int(cudaDeviceReset());
}

这表明，一个非常简单的 axpy 内核可以轻松地适应执行标准操作和您想要的版本，并且在我测试的计算 5.2 设备上的 CUBLAS 运行时间的 10% 内运行:

$ nvcc -std=c++11 -arch=sm_52 -Xptxas="-v" -o saxby saxby.cu -lcublas
ptxas info    : 0 bytes gmem
ptxas info    : Compiling entry function '_Z12saxpy_kernelIL10saxpy_op_t0EEv16pitched_accessorIfES2_fj' for 'sm_52'
ptxas info    : Function properties for _Z12saxpy_kernelIL10saxpy_op_t0EEv16pitched_accessorIfES2_fj
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 17 registers, 360 bytes cmem[0]
ptxas info    : Compiling entry function '_Z12saxpy_kernelIL10saxpy_op_t1EEv16pitched_accessorIfES2_fj' for 'sm_52'
ptxas info    : Function properties for _Z12saxpy_kernelIL10saxpy_op_t1EEv16pitched_accessorIfES2_fj
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 17 registers, 360 bytes cmem[0]

$ nvprof ./saxby
==26806== NVPROF is profiling process 26806, command: ./saxby
==26806== Profiling application: ./saxby
==26806== Profiling result:
Time(%)      Time     Calls       Avg       Min       Max  Name
 54.06%  11.190ms         5  2.2381ms     960ns  2.9094ms  [CUDA memcpy HtoD]
 40.89%  8.4641ms         3  2.8214ms  2.8039ms  2.8310ms  [CUDA memcpy DtoH]
  1.73%  357.59us         1  357.59us  357.59us  357.59us  void saxpy_kernel<saxpy_op_t=1>(pitched_accessor<float>, pitched_accessor<float>, float, unsigned int)
  1.72%  355.15us         1  355.15us  355.15us  355.15us  void saxpy_kernel<saxpy_op_t=0>(pitched_accessor<float>, pitched_accessor<float>, float, unsigned int)
  1.60%  332.21us         1  332.21us  332.21us  332.21us  void axpy_kernel_val<float, int=0>(cublasAxpyParamsVal<float>)

关于cuda - 有没有办法在cuBLAS中执行 "saypx"？，我们在Stack Overflow上找到一个类似的问题： https://stackoverflow.com/questions/36635340/

cuda - 有没有办法在cuBLAS中执行 "saypx"？

上一篇：正则表达式:如何使这段代码更短？ (重音字符)

下一篇：Vim 自动缩进不正确