C++ 数组访问优化

我正在修改一个开源项目(粉末玩具)并始终使用/O2(最大化速度)选项进行编译，启用 SSE2 代码生成并且刚刚检查了这个:

void membwand(void * destv, void * srcv, size_t destsize, size_t srcsize)
{
    size_t i;
    unsigned char * dest = (unsigned char*)destv;
    unsigned char * src = (unsigned char*)srcv;
    for(i = 0; i < destsize; i++){
        dest[i] = dest[i] & src[i%srcsize];
    }
}

这里我替换了下面几行

membwand(gravy, gravmask, size_dst, size_src);

membwand(gravx, gravmask, size_dst, size_src);

    // gravy, gravx and gravmask are 1MB each

与

membwand2(gravy,gravx, gravmask, size_dst, size_src);   
    // sizes are same, why not put all in a single function?

实现为:

void membwand2(void * destv1, void * destv2,void * srcv, size_t destsize, size_t srcsize)
{
    if(destsize!=srcsize)
    {
        size_t i;
        unsigned char * dest1 = (unsigned char*)destv1;
        unsigned char * dest2 = (unsigned char*)destv2;
        unsigned char * src = (unsigned char*)srcv;
        for(i = 0; i < destsize; i++)
        {
            dest1[i] = dest1[i] & src[i%srcsize];
            dest2[i] = dest2[i] & src[i%srcsize];
        }
    }
    else
    {
        size_t i;
        unsigned char * dest1 = (unsigned char*)destv1;unsigned char * dest2 = (unsigned char*)destv2;
        unsigned char * src = (unsigned char*)srcv;
        unsigned char tmpChar0=0;unsigned char tmpChar1=0;unsigned char tmpChar2=0;
        unsigned char tmpChar3=0;unsigned char tmpChar4=0;unsigned char tmpChar5=0;
        unsigned char tmpChar6=0;unsigned char tmpChar7=0;unsigned char tmpChar8=0;
        unsigned char tmpChar9=0;unsigned char tmpChar10=0;unsigned char tmpChar11=0;
        unsigned char tmpChar12=0;unsigned char tmpChar13=0;unsigned char tmpChar14=0;
        unsigned char tmpChar15=0;unsigned char tmpChar16=0;unsigned char tmpChar17=0;
        unsigned char tmpChar18=0;unsigned char tmpChar19=0;unsigned char tmpChar20=0;
        unsigned char tmpChar21=0;unsigned char tmpChar22=0;unsigned char tmpChar23=0;
        unsigned char tmpChar24=0;unsigned char tmpChar25=0;unsigned char tmpChar26=0;
        unsigned char tmpChar27=0;unsigned char tmpChar28=0;unsigned char tmpChar29=0;
        unsigned char tmpChar30=0;unsigned char tmpChar31=0;
        for(i = 0; i < destsize; i+=32)
        {
            tmpChar0=src[i];tmpChar1=src[i+1];tmpChar2=src[i+2];tmpChar3=src[i+3];
            tmpChar4=src[i+4];tmpChar5=src[i+5];tmpChar6=src[i+6];tmpChar7=src[i+7];

            tmpChar8=src[i+8];tmpChar9=src[i+9];tmpChar10=src[i+10];tmpChar11=src[i+11];
            tmpChar12=src[i+12];tmpChar13=src[i+13];tmpChar14=src[i+14];tmpChar15=src[i+15];

            tmpChar16=src[i+16];tmpChar17=src[i+17];tmpChar18=src[i+18];tmpChar19=src[i+19];
            tmpChar20=src[i+20];tmpChar21=src[i+21];tmpChar22=src[i+22];tmpChar23=src[i+23];

            tmpChar24=src[i+24];tmpChar25=src[i+25];tmpChar26=src[i+26];tmpChar27=src[i+27];
            tmpChar28=src[i+28];tmpChar29=src[i+29];tmpChar30=src[i+30];tmpChar31=src[i+31];


            dest1[i] = dest1[i] & tmpChar0;
            dest1[i+1] = dest1[i+1] & tmpChar1;
            dest1[i+2] = dest1[i+2] & tmpChar2;
            dest1[i+3] = dest1[i+3] & tmpChar3;
            dest1[i+4] = dest1[i+4] & tmpChar4;
            dest1[i+5] = dest1[i+5] & tmpChar5;
            dest1[i+6] = dest1[i+6] & tmpChar6;
            dest1[i+7] = dest1[i+7] & tmpChar7;
            dest1[i+8] = dest1[i+8] & tmpChar8;
            dest1[i+9] = dest1[i+9] & tmpChar9;
            dest1[i+10] = dest1[i+10] & tmpChar10;
            dest1[i+11] = dest1[i+11] & tmpChar11;
            dest1[i+12] = dest1[i+12] & tmpChar12;
            dest1[i+13] = dest1[i+13] & tmpChar13;
            dest1[i+14] = dest1[i+14] & tmpChar14;
            dest1[i+15] = dest1[i+15] & tmpChar15;
            dest1[i+16] = dest1[i+16] & tmpChar16;
            dest1[i+17] = dest1[i+17] & tmpChar17;
            dest1[i+18] = dest1[i+18] & tmpChar18;
            dest1[i+19] = dest1[i+19] & tmpChar19;
            dest1[i+20] = dest1[i+20] & tmpChar20;
            dest1[i+21] = dest1[i+21] & tmpChar21;
            dest1[i+22] = dest1[i+22] & tmpChar22;
            dest1[i+23] = dest1[i+23] & tmpChar23;
            dest1[i+24] = dest1[i+24] & tmpChar24;
            dest1[i+25] = dest1[i+25] & tmpChar25;
            dest1[i+26] = dest1[i+26] & tmpChar26;
            dest1[i+27] = dest1[i+27] & tmpChar27;
            dest1[i+28] = dest1[i+28] & tmpChar28;
            dest1[i+29] = dest1[i+29] & tmpChar29;
            dest1[i+30] = dest1[i+30] & tmpChar30;
            dest1[i+31] = dest1[i+31] & tmpChar31;

            dest2[i] = dest2[i] & tmpChar0;
            dest2[i+1] = dest2[i+1] & tmpChar1;
            dest2[i+2] = dest2[i+2] & tmpChar2;
            dest2[i+3] = dest2[i+3] & tmpChar3;
            dest2[i+4] = dest2[i+4] & tmpChar4;
            dest2[i+5] = dest2[i+5] & tmpChar5;
            dest2[i+6] = dest2[i+6] & tmpChar6;
            dest2[i+7] = dest2[i+7] & tmpChar7;
            dest2[i+8] = dest2[i+8] & tmpChar8;
            dest2[i+9] = dest2[i+9] & tmpChar9;
            dest2[i+10] = dest2[i+10] & tmpChar10;
            dest2[i+11] = dest2[i+11] & tmpChar11;
            dest2[i+12] = dest2[i+12] & tmpChar12;
            dest2[i+13] = dest2[i+13] & tmpChar13;
            dest2[i+14] = dest2[i+14] & tmpChar14;
            dest2[i+15] = dest2[i+15] & tmpChar15;
            dest2[i+16] = dest2[i+16] & tmpChar16;
            dest2[i+17] = dest2[i+17] & tmpChar17;
            dest2[i+18] = dest2[i+18] & tmpChar18;
            dest2[i+19] = dest2[i+19] & tmpChar19;
            dest2[i+20] = dest2[i+20] & tmpChar20;
            dest2[i+21] = dest2[i+21] & tmpChar21;
            dest2[i+22] = dest2[i+22] & tmpChar22;
            dest2[i+23] = dest2[i+23] & tmpChar23;
            dest2[i+24] = dest2[i+24] & tmpChar24;
            dest2[i+25] = dest2[i+25] & tmpChar25;
            dest2[i+26] = dest2[i+26] & tmpChar26;
            dest2[i+27] = dest2[i+27] & tmpChar27;
            dest2[i+28] = dest2[i+28] & tmpChar28;
            dest2[i+29] = dest2[i+29] & tmpChar29;
            dest2[i+30] = dest2[i+30] & tmpChar30;
            dest2[i+31] = dest2[i+31] & tmpChar31;


        }
    }
}

两行版本的 Omp 挂钟计时为 17 毫秒，单行版本为 1.5 毫秒。 (经过数千次迭代)

问题:是什么导致了 9 倍的加速？ src[] 或缓存利用的数据重用？也许代码之前不适合矢量化，并且展开启用了它，这可能是索引缺少的模数操作吗？我应该在循环顶部添加#pragma omp 吗？

编辑:#pragma omp parallel for num_threads(4) 以某种方式使情况变得更糟。也许只有 1MB 并不能隐藏多线程开销？

Edit2: 省略模运算符使其为 2.5 毫秒，但性能从 2.5 毫秒增加到 1.5 毫秒(~%60 增加)一定来自展开/缓存/...

注意:启用“favor fast code”、启用整个程序优化、完全优化(/Ox)和启用内在函数并没有改变速度(也许启用 SSE2 和/O2 就足够了)

中央处理器:FX8150 IDE:MSVC

最佳答案

我认为loop unrolling是助推器。您可以用谷歌搜索它以获取详细信息。

当然模运算可能会很慢..你可以通过替换来测试

src[i%srcsize]

与

src[i]

为了计时测试。

关于C++ 数组访问优化，我们在Stack Overflow上找到一个类似的问题： https://stackoverflow.com/questions/21753744/

C++ 数组访问优化

上一篇：python - 在 Mac OS X (10.7) 上安装 graph-tool - 已经安装了 Boost，但不断出现此错误

下一篇：c++ - 需要帮助理解从 B.Stroustrup 的新书中摘录的这段文字