python - 使用 PyCuda 的遗传细胞自动机,如何有效地将每个细胞的大量数据传递给 CUDA 内核?

标签 python c optimization cuda pycuda

我正在使用 PyCuda 开发遗传元胞自动机。每个细胞都会有大量的基因组数据,以及细胞参数。我想知道什么是最有效的方法来 1) 将单元格数据传递给 CUDA 内核,然后 2) 处理这些数据。

我从一个特别糟糕的 (imo),但仍然有效的解决方案开始。它在单独的数组中传递每个参数,然后使用 switch-case 和大量重复代码处理它们。

然后,我意识到我很快就会以每个内核函数的大量参数结束,并决定重写它。

第二种解决方案是将所有单元格参数存储在具有额外维度的单个数组中。这在代码中要优雅得多,但令人惊讶的是代码运行速度慢了 10 倍!

为了更清楚,我需要为每个单元格存储的完整数据列表:

  • (Fc, Mc, Tc):3x (int) - 细胞当前的“味道”、质量和温度
  • (Rfc, Rmc, Rtc): 3x (int) - 单元格的当前寄存器
  • 每个邻居的 (Fi, Mi, Ti):8*3x (int) - 传入值
  • 每个邻居的 (Rfi, Rmi, Rti):8*3x (int) - 输入值
  • 门方向:1x (uchar)
  • 执行指针:1x (uchar)
  • 当前微操作内存:32x (uchar)
  • 最后一步的微操作内存:32x (uchar)

我将一个自动机步骤分成两个阶段。首先(发射阶段)是为每个细胞邻居计算(Fi,Mi,Ti)。第二个(吸收阶段)是将 8x(Fi, Mi, Ti) 值与当前细胞状态混合。尚未实现基因组或寄存器,但我需要它的数据以备将来使用。

所以,我的第一个解决方案的代码是:

Mk = 64
Tk = 1000

emit_gpu = ElementwiseKernel("int3 *cells, int3 *dcells0, int3 *dcells1, int3 *dcells2, int3 *dcells3, int3 *dcells4, int3 *dcells5, int3 *dcells6, int3 *dcells7, int w, int h", """
    int x = i / h;
    int y = i % h;

    int3 cell = cells[i];
    float M = (float) cell.y;
    float T = (float) cell.z;
    int Mi = (int) (fmin(1, T / Tk) * M);
    cells[i].y -= Mi;
    cells[i].z -= (int) (T * fmin(1, T / Tk) / 1);

    int Fi = cell.x;
    int Mbase = Mi / 8;
    int Mpart = Mi % 8;
    int Madd;
    int Ti = cell.z;
    int ii, xo, yo;

    for (int cc = 0; cc < 9; cc++) {
      int c = (cc + Fi) % 9;
      if (c == 4) continue;
      xo = x + c%3 - 1;
      if (xo < 0) xo = w + xo;
      if (xo >= w) xo = xo - w;
      yo = y + c/3 - 1;
      if (yo < 0) yo = h + yo;
      if (xo >= w) yo = yo - h;
      ii = xo * h + yo;
      if (Mpart > 0) { Madd = 1; Mpart--;} else Madd = 0;
      switch(c) {
        case 0: dcells0[ii] = make_int3(Fi, Mbase + Madd, Ti); break;
        case 1: dcells1[ii] = make_int3(Fi, Mbase + Madd, Ti); break;
        case 2: dcells2[ii] = make_int3(Fi, Mbase + Madd, Ti); break;
        case 3: dcells3[ii] = make_int3(Fi, Mbase + Madd, Ti); break;
        case 5: dcells4[ii] = make_int3(Fi, Mbase + Madd, Ti); break;
        case 6: dcells5[ii] = make_int3(Fi, Mbase + Madd, Ti); break;
        case 7: dcells6[ii] = make_int3(Fi, Mbase + Madd, Ti); break;
        case 8: dcells7[ii] = make_int3(Fi, Mbase + Madd, Ti); break;
        default: break;
      }

    } 
""", "ca_prepare", preamble="""
#define Tk %s
""" % Tk)

absorb_gpu = ElementwiseKernel("int3 *cells, int3 *dcells0, int3 *dcells1, int3 *dcells2, int3 *dcells3, int3 *dcells4, int3 *dcells5, int3 *dcells6, int3 *dcells7, int *img, int w, int h", """
    int3 cell = cells[i];

    int3 dcell = dcells0[i];
    cell = cell + calc_d(cell.x, cell.y, cell.z, dcell.x, dcell.y, dcell.z);
    cell.x = cell.x % 360;
    if (cell.x < 0) cell.x += 360;

    dcell = dcells1[i];
    cell = cell + calc_d(cell.x, cell.y, cell.z, dcell.x, dcell.y, dcell.z);
    cell.x = cell.x % 360;
    if (cell.x < 0) cell.x += 360;
    if (cell.z > Tk) cell.z = Tk;

    dcell = dcells2[i];
    cell = cell + calc_d(cell.x, cell.y, cell.z, dcell.x, dcell.y, dcell.z);
    cell.x = cell.x % 360;
    if (cell.x < 0) cell.x += 360;
    if (cell.z > Tk) cell.z = Tk;

    dcell = dcells3[i];
    cell = cell + calc_d(cell.x, cell.y, cell.z, dcell.x, dcell.y, dcell.z);
    cell.x = cell.x % 360;
    if (cell.x < 0) cell.x += 360;
    if (cell.z > Tk) cell.z = Tk;

    dcell = dcells4[i];
    cell = cell + calc_d(cell.x, cell.y, cell.z, dcell.x, dcell.y, dcell.z);
    cell.x = cell.x % 360;
    if (cell.x < 0) cell.x += 360;
    if (cell.z > Tk) cell.z = Tk;

    dcell = dcells5[i];
    cell = cell + calc_d(cell.x, cell.y, cell.z, dcell.x, dcell.y, dcell.z);
    cell.x = cell.x % 360;
    if (cell.x < 0) cell.x += 360;
    if (cell.z > Tk) cell.z = Tk;

    dcell = dcells6[i];
    cell = cell + calc_d(cell.x, cell.y, cell.z, dcell.x, dcell.y, dcell.z);
    cell.x = cell.x % 360;
    if (cell.x < 0) cell.x += 360;
    if (cell.z > Tk) cell.z = Tk;

    dcell = dcells7[i];
    cell = cell + calc_d(cell.x, cell.y, cell.z, dcell.x, dcell.y, dcell.z);
    cell.x = cell.x % 360;
    if (cell.x < 0) cell.x += 360;
    if (cell.z > Tk) cell.z = Tk;

    cells[i] = cell;
    img[i] = hsv2rgb(cell);

""", "ca_calc", preamble="""
#include <math.h>
#define Mk %s
#define Tk %s

__device__ int3 operator+(const int3 &a, const int3 &b) {
    return make_int3(a.x+b.x, a.y+b.y, a.z+b.z);
}

__device__ int3 calc_d(int Fc, int Mc, int Tc, int Fi, int Mi, int Ti) {
    int dF = Fi - Fc;
    if (dF > 180) Fc += 360;
    if (dF < -180) Fc -= 360;
    float sM = Mi + Mc;
    if (sM != 0) sM = Mi / sM; else sM = 0;
    dF = (int) (Fi - Fc) * sM;
    int dM = Mi;
    int dT = fabs((float) (Fi - Fc)) * fmin((float) Mc, (float) Mi) / Mk + (Ti - Tc) * sM;
    return make_int3(dF, dM, dT);
}

__device__ uint hsv2rgb(int3 pixel) {
    // skipped for brevity
}
""" % (Mk, Tk, RAM))

第二个和当前的解决方案:

Mk = 64
Tk = 1000
CELL_LEN = 120 # number of parameters per cell

emit_gpu = ElementwiseKernel("int *cells, int w, int h", """
    int x = i / h;
    int y = i % h;
    int ii = i * CN;

    int Fc = cells[ii];
    int Mc = cells[ii+1];
    int Tc = cells[ii+2];
    float M = (float) Mc;
    float T = (float) Tc;
    int Mi = (int) (fmin(1, T / Tk) * M);
    cells[ii+1] = Mc - Mi;
    cells[ii+2] = Tc - (int) (T * fmin(1, T / Tk));

    int Mbase = Mi / 8;
    int Mpart = Mi % 8;
    int Madd;
    int iii, xo, yo;

    for (int cc = 0; cc < 9; cc++) {
      int c = (cc + Fc) % 9;
      if (c == 4) continue;
      xo = x + c%3 - 1;
      if (xo < 0) xo = w + xo; else if (xo >= w) xo = xo - w;
      yo = y + c/3 - 1;
      if (yo < 0) yo = h + yo; else if (xo >= w) yo = yo - h;
      if (Mpart > 0) { Madd = 1; Mpart--;} else Madd = 0;
      if (c > 4) c--;
      iii = (xo * h + yo) * CN + 6 + c*3;

      cells[iii] = Fc;
      cells[iii+1] = Mbase + Madd;
      cells[iii+2] = Tc;

    } 
""", "ca_emit", preamble="""
#define Tk %s
#define CN %s
""" % (Tk, CELL_LEN))

absorb_gpu = ElementwiseKernel("int *cells, int *img, int w, int h", """
    int ii = i * CN;
    int Fc = cells[ii];
    int Mc = cells[ii+1];
    int Tc = cells[ii+2];

    for (int c=0; c < 8; c++){
      int iii = ii + c * 3 + 6;
      int Fi = cells[iii];
      int Mi = cells[iii+1];
      int Ti = cells[iii+2];

      int dF = Fi - Fc;
      if (dF > 180) Fc += 360;
      if (dF < -180) Fc -= 360;
      float sM = Mi + Mc;
      if (sM != 0) sM = Mi / sM; else sM = 0;
      dF = (int) (Fi - Fc) * sM;
      int dM = Mi;
      int dT = fabs((float) (Fi - Fc)) * fmin((float) Mc, (float) Mi) / Mk + (Ti - Tc) * sM;
      Fc += dF;
      Mc += dM;
      Tc += dT;
      Fc = Fc % 360;
      if (Fc < 0) Fc += 360;
      if (Tc > Tk) Tc = Tk;
    }      

    cells[ii] = Fc;
    cells[ii+1] = Mc;
    cells[ii+2] = Tc;
    cells[ii+18] = (cells[ii+18] + 1) % 8;

    img[i] = hsv2rgb(Fc, Tc, Mc);

""", "ca_absorb", preamble="""
#include <math.h>
#define Mk %s
#define Tk %s
#define CN %s

__device__ uint hsv2rgb(int hue, int sat, int val) {
    // skipped for brevity
}
""" % (Mk, Tk, CELL_LEN))

两种变体产生完全相同的 CA 行为,但后者运行速度要慢得多。

GTX 泰坦:

  • 字段大小:1900x1080 个单元格
  • 解决方案 #1:~200 步/秒
  • 解决方案 #2:~20 步/秒

GT 630M:

  • 字段大小:1600x900 个单元格
  • 解决方案#1:~7.8 步/秒
  • 解决方案 #2:~1.5 步/秒

如果需要,请随意使用这两种解决方案:

Solution #1 full source

Solution #2 full source

欢迎任何线索或建议:

  1. 为什么性能变慢?
  2. 是否有可能将解决方案 #2 的性能至少提高到 #1 的水平?
  3. 或者其他解决方案会更好?

最佳答案

好的,我设法将第二个解决方案的运行速度提高了将近 15 倍。进行了以下更改:

  • 将主要参数数组从int 转换为int4。这使得它比使用 int3 的解决方案更快。虽然,额外的空间未被使用(.w 维度)。 [3 倍加速]
  • 重新打包 WIDTHxHEIGHT 组中的相关参数。因此,形状从 (WIDTH, HEIGHT, N) 变为 (N, WIDTH, HEIGHT)。这使得内存访问更加高效,因为组内的元素往往会一起处理。 [5 倍加速]

优化后的代码如下:

Mk = 64
Tk = 1000

emit_gpu = ElementwiseKernel("int4 *cells, int w, int h, int cn", """
    int x = i / h;
    int y = i % h;

    int4 cell = cells[i];
    int Fc = cell.x;
    int Mc = cell.y;
    int Tc = cell.z;
    float M = (float) Mc;
    float T = (float) Tc;
    int Mi = (int) (fmin(1, T / Tk) * M);
    cells[i] = make_int4(Fc, Mc - Mi, Tc - (int) (T * fmin(1, T / Tk)), 0);

    int Mbase = Mi / 8;
    int Mpart = Mi % 8;
    int Madd;
    int ii;
    int xo, yo;

    int cnn = 0;
    for (int dx = -1; dx < 2; dx++) {
      xo = x + dx;
      if (xo < 0) xo = w + xo; else if (xo >= w) xo = xo - w;
      for (int dy = -1; dy < 2; dy++) {
        if (dx == 0 && dy == 0) continue;
        cnn += cn;
        yo = y + dy;
        if (yo < 0) yo = h + yo; else if (yo >= h) yo = yo - h;
        if (Mpart > 0) { Madd = 1; Mpart--;} else Madd = 0;
        ii = (xo * h + yo) + cnn;
        cells[ii] = make_int4(Fc, Mbase + Madd, Tc, 0);
      }
    } 
""", "ca_emit", preamble="""
#define Tk %s
#define CN %s
""" % (Tk, CELL_LEN))

absorb_gpu = ElementwiseKernel("int4 *cells, int *img, int w, int h, int cn", """
    int ii = i;
    int4 cell = cells[i];
    int Fc = cell.x;
    int Mc = cell.y;
    int Tc = cell.z;

    for (int c=0; c < 8; c++){
      ii += cn;
      cell = cells[ii];
      int Fi = cell.x;
      int Mi = cell.y;
      int Ti = cell.z;

      int dF = Fi - Fc;
      if (dF > 180) Fc += 360;
      if (dF < -180) Fc -= 360;
      float sM = Mi + Mc;
      if (sM != 0) sM = Mi / sM; else sM = 0;
      dF = (int) (Fi - Fc) * sM;
      int dM = Mi;
      int dT = fabs((float) (Fi - Fc)) * fmin((float) Mc, (float) Mi) / Mk + (Ti - Tc) * sM;
      Fc += dF;
      Mc += dM;
      Tc += dT;
      Fc = Fc % 360;
      if (Fc < 0) Fc += 360;
      if (Tc > Tk) Tc = Tk;
    }      

    cells[i] = make_int4(Fc, Mc, Tc, 0);
    img[i] = hsv2rgb(Fc, Tc, Mc);

""", "ca_absorb", preamble="""
#include <math.h>
#define Mk %s
#define Tk %s

__device__ uint hsv2rgb(int hue, int sat, int val) {
    // skipped for brevity
}
""" % (Mk, Tk))

感谢 Park Young-Bae 提供重新打包的线索,也感谢 Alexey Shchepin 解决一些优化问题。

关于python - 使用 PyCuda 的遗传细胞自动机,如何有效地将每个细胞的大量数据传递给 CUDA 内核?,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/27500963/

相关文章:

python - Python中的10个字符组成的字符串集在RAM中的大小比预期的大10倍

python - Pandas.to_datetime() 仅在数据框中的列上失败

C 带 block : Stack-based blocks going out of scope

database - 多个索引可以一起工作吗?

optimization - 如果我启用 C 优化 -O2 或 -fstrict-overflow(-O1 很好),我的 C 程序会崩溃

python - 对不存在的数组使用 numpy View

c# - 为什么我们不需要动态语言的接口(interface)?

c - 替换C中的多行单行注释(转义换行符)

C 编程 while 循环和数组结构

c++ - 代码优化;切换与 if 的