我有一个二维类型的问题,并且已经通过实现一个内核(kernel)解决了它。由于问题是二维的,如果在内核中能够用 d_A[i][j] 来代替按列主序(column-major)存储时使用的 d_A[i + m*j],可读性会好得多。
如果只使用 cudaMalloc 分配内存,就无法通过 d_A[i][j] 的形式访问元素。为此我必须使用什么函数?如果能给出一个例子,我将不胜感激。
在 C++ 中,这是通过分配二维内存来实现的,例如 double** A = new double[10][10];
这和 cudaMallocPitch 有关系吗?还是说带 pitch(行间距)的版本只是用来优化二维数据的对齐和合并内存访问(coalesced memory access)?
您可以先定义一个支持步长(stride,下面的代码中命名为 Strip)的 vector 类,然后让二维矩阵的 operator[] 返回一个已正确设置步长的 vector;第二个 [] 实际上是在这个 vector 上调用的。下面是一个例子:
// Qualifier for functions that must be callable from both host and device
// code. The CUDA execution-space specifiers are only emitted when a CUDA
// compiler is driving the build (__CUDACC__ defined); otherwise the macro
// expands to nothing so the same definitions also build as plain host C++.
#ifdef __CUDACC__
#define _devhost_ __device__ __host__
#else
#define _devhost_
#endif

// Signed integer type used in this file for sizes, indices and byte offsets.
typedef long SizeT;
// Returns `ptr` advanced by `offset` BYTES (not elements).
// The pointer is viewed as a byte pointer for the arithmetic and then cast
// back to `const T*`; no alignment or bounds checks are performed.
template<typename T>
_devhost_ const T* pointer_offset(const T* ptr, SizeT offset) {
    const uint8_t* const base = reinterpret_cast<const uint8_t*>(ptr);
    const uint8_t* const shifted = base + offset;
    return reinterpret_cast<const T*>(shifted);
}
// Compile-time tag used as the IsConst template argument of Vector and
// DenseMatrix.
enum ConstEnum {
    NonConst = 0,  // default template argument
    Const = 1      // selects the read-only specializations
};
// Compile-time tag used as the IsOwner template argument of Vector and
// DenseMatrix.
enum OwnerEnum {
    NonOwner = 0,  // default template argument
    Owner = 1
};
// Compile-time tag used as the HasStrip template argument of Vector.
// The "strip" is the step between consecutive elements (i.e. a stride) and is
// measured in bytes, not in elements.
enum StripEnum {
    NonStrip = 0,  // default template argument
    Strip = 1      // selects the Vector specialization that stores a byte step
};
// Primary template declaration for Vector; behaviour is supplied by
// specializations selected through the three tag parameters.
//   ValueType - element type
//   Alloc     - allocator type (not used by the specializations shown here)
//   IsConst   - NonConst (default) or Const
//   IsOwner   - NonOwner (default) or Owner
//   HasStrip  - NonStrip (default) or Strip (explicit byte step per element)
template<typename ValueType,
         typename Alloc,
         ConstEnum IsConst = NonConst,
         OwnerEnum IsOwner = NonOwner,
         StripEnum HasStrip = NonStrip>
class Vector;
// Primary template declaration for DenseMatrix; behaviour is supplied by
// specializations selected through the tag parameters.
//   ValueType - element type
//   Alloc     - allocator type (not used by the specialization shown here)
//   IsConst   - NonConst (default) or Const
//   IsOwner   - NonOwner (default) or Owner
template<typename ValueType,
         typename Alloc,
         ConstEnum IsConst = NonConst,
         OwnerEnum IsOwner = NonOwner>
class DenseMatrix;
// Read-only view over `len_` consecutive elements starting at `ptr_`.
// This specialization (IsConst = Const, default IsOwner / HasStrip) defines no
// destructor, so it never releases the buffer it points to.
template<typename ValueType, typename Alloc>
class Vector<ValueType, Alloc, Const> {
protected:
    ValueType* ptr_;  // first element; stored non-const but only exposed as const
    SizeT len_;       // number of elements

public:
    // Empty view: null pointer, zero length.
    _devhost_ Vector() : ptr_(0), len_(0) {}

    // Wraps an existing buffer of `len` elements. Constness is cast away for
    // storage only; every accessor below hands out const access.
    _devhost_ Vector(const ValueType* ptr, SizeT len)
        : ptr_(const_cast<ValueType*>(ptr)), len_(len) {}

    // Element `i` of the contiguous buffer; no bounds checking.
    _devhost_ const ValueType& operator[] (SizeT i) const {
        return *(ptr_ + i);
    }

    // Number of elements given at construction.
    _devhost_ SizeT size() const { return len_; }

    // Pointer to the first element.
    _devhost_ const ValueType* data() const { return ptr_; }
};
// Read-only view whose consecutive elements are `strip_` BYTES apart rather
// than packed back to back. Extends the contiguous const view with a byte
// step and replaces operator[] accordingly.
template<typename ValueType, typename Alloc>
class Vector<ValueType, Alloc, Const, NonOwner, Strip>
    : public Vector<ValueType, Alloc, Const> {
protected:
    typedef Vector<ValueType, Alloc, Const> Base;

    SizeT strip_;  // distance in bytes between element i and element i + 1

    // Unqualified name lookup inside a template does not search base classes
    // that depend on template arguments, so the inherited members have to be
    // brought into scope explicitly.
    using Base::ptr_;
    using Base::len_;

public:
    // Empty view (via the base default constructor) with a packed step.
    _devhost_ Vector() : strip_(sizeof(ValueType)) {}

    // View of `len` elements starting at `ptr`, `strip` bytes apart.
    // The default step of sizeof(ValueType) describes a packed buffer.
    _devhost_ Vector(const ValueType* ptr, SizeT len,
                     SizeT strip = sizeof(ValueType))
        : Base(ptr, len), strip_(strip) {}

    // Element `i`, located `i * strip_` bytes after the first element;
    // no bounds checking.
    _devhost_ const ValueType& operator[] (SizeT i) const {
        const SizeT byte_offset = i * strip_;
        return *pointer_offset(ptr_, byte_offset);
    }

    // NOTE: the inherited size() and data() remain callable, but they ignore
    // the byte step, so their results may not mean what a caller expects for
    // a strided view.
};
// Read-only view of a dense matrix stored in column-major order: element
// (irow, icol) lives at vals_[irow + icol * nrows_]. This specialization
// defines no destructor, so it never releases the buffer it points to.
template<typename ValueType, typename Alloc>
class DenseMatrix<ValueType, Alloc, Const> {
protected:
    ValueType* vals_;       // first element; stored non-const, exposed as const
    SizeT nrows_, ncols_;   // matrix dimensions

public:
    // Empty matrix: null data pointer, zero rows and columns.
    _devhost_ DenseMatrix() : vals_(0), nrows_(0), ncols_(0) {}

    // Wraps an existing column-major buffer of n_rows * n_cols elements.
    // The data pointer is taken from the `vals` argument (the previous code
    // assigned the uninitialized member to itself and ignored the argument).
    _devhost_ DenseMatrix(const ValueType* vals, SizeT n_rows, SizeT n_cols)
        : vals_(const_cast<ValueType*>(vals)),
          nrows_(n_rows),
          ncols_(n_cols) {}

    _devhost_ SizeT num_rows() const { return nrows_; }
    _devhost_ SizeT num_cols() const { return ncols_; }

    // Total number of elements (rows * columns).
    _devhost_ SizeT numel() const { return nrows_ * ncols_; }

    // Pointer to the first element of the buffer.
    _devhost_ const ValueType* data() const { return vals_; }

    // Element (irow, icol) using column-major addressing; no bounds checking.
    _devhost_ const ValueType& at(SizeT irow, SizeT icol) const {
        return vals_[irow + icol * nrows_];
    }

    // Type returned by operator[]: a strided read-only view of one row.
    typedef Vector<ValueType, Alloc, Const, NonOwner, Strip> ConstIndexer;

    // Row `irow` as a strided view, which makes m[irow][icol] equivalent to
    // at(irow, icol). The view starts at the row's first element, has ncols_
    // entries, and steps one whole column (nrows_ elements, in bytes) at a time.
    _devhost_ ConstIndexer operator[] (SizeT irow) const {
        return ConstIndexer(vals_ + irow, ncols_, nrows_ * sizeof(ValueType));
    }

    // View of `n_cols` consecutive columns starting at column `icol`.
    // Columns are contiguous in column-major storage, so the sub-matrix is
    // again a plain dense matrix with the same number of rows.
    _devhost_ DenseMatrix<ValueType, Alloc, Const> get_cols(SizeT icol,
                                                            SizeT n_cols) const {
        return DenseMatrix<ValueType, Alloc, Const>(vals_ + icol * nrows_,
                                                    nrows_, n_cols);
    }

    // Column `icol` as a contiguous read-only vector of nrows_ elements.
    _devhost_ Vector<ValueType, Alloc, Const> get_col(SizeT icol) const {
        return Vector<ValueType, Alloc, Const>(vals_ + icol * nrows_, nrows_);
    }
};