我是 SSE 编码的新手。我想为我的算法编写一个 SSE 代码。我想将以下 C 代码转换为 SSE 代码。
/* Scalar reference: for every interior pixel, build an 8-bit code in which
 * each bit records whether the centre pixel is <= one of its 8 neighbours
 * (0x80 = upper-left ... 0x01 = lower-right, row by row). */
for (int i = 1; i < height - 1; i++)        /* row i+1 is read, so stop one row early */
{
    for (int j = 1; j < width - 1; j++)     /* column j+1 is read, so stop one column early */
    {
        int index = 0;
        if (input[width*i + j] <= input[width*(i-1) + (j-1)]) index += 0x80;
        if (input[width*i + j] <= input[width*(i-1) + (j  )]) index += 0x40;
        if (input[width*i + j] <= input[width*(i-1) + (j+1)]) index += 0x20;
        if (input[width*i + j] <= input[width*(i  ) + (j-1)]) index += 0x10;
        if (input[width*i + j] <= input[width*(i  ) + (j+1)]) index += 0x08;
        if (input[width*i + j] <= input[width*(i+1) + (j-1)]) index += 0x04;
        if (input[width*i + j] <= input[width*(i+1) + (j  )]) index += 0x02;
        if (input[width*i + j] <= input[width*(i+1) + (j+1)]) index += 0x01;
        /* The result is stored shifted by one row and one column. */
        output[width*(i-1) + (j-1)] = index;
    }
}
这是我的 SSE 代码:
// Asker's SSE attempt: three row pointers (previous, current, next row) are
// walked over the image and, for each step of 4 columns, the 3x3 neighbourhood
// is loaded into nine vectors.
unsigned char *dst_d = outputbuffer
// NOTE(review): the declaration above is missing its terminating ';'.
float *CT_image_0 = inputbuffer;
float *CT_image_1 = CT_image_0 + width;
float *CT_image_2 = CT_image_1 + width;
for(int i=1;i<height;i++)
{
for(int j=1;j<width;j+=4)
{
// Naming: CT_current_RC is loaded from row pointer CT_image_R starting at
// column (j-1)+C, i.e. R selects the row and C the horizontal offset.
__m128 CT_current_00 = _mm_loadu_ps((CT_image_0+j-1));
__m128 CT_current_10 = _mm_loadu_ps((CT_image_1+j-1));
__m128 CT_current_20 = _mm_loadu_ps((CT_image_2+j-1));
__m128 CT_current_01 = _mm_loadu_ps(((CT_image_0+1)+j-1));
__m128 CT_current_11 = _mm_loadu_ps(((CT_image_1+1)+j-1));
__m128 CT_current_21 = _mm_loadu_ps(((CT_image_2+1)+j-1));
__m128 CT_current_02 = _mm_loadu_ps(((CT_image_0+2)+j-1));
__m128 CT_current_12 = _mm_loadu_ps(((CT_image_1+2)+j-1));
__m128 CT_current_22 = _mm_loadu_ps(((CT_image_2+2)+j-1));
// Centre pixels (middle row, column j onwards) that every neighbour is compared against.
__m128 val = CT_current_11;
// Below I tried to write the SSE instructions, but this part is wrong :(
// -- How can I do the "index += ..." operation with the _mm_cmple_ss return value?
// The eight comparison results are simply summed with +=; no per-neighbour
// flag value (0x80 ... 0x01) is applied and nothing is written to dst_d.
__m128 sample6= _mm_cmple_ss(val,CT_current_00);
sample6 += _mm_cmple_ss(val,CT_current_01);
sample6 += _mm_cmple_ss(val,CT_current_02);
sample6 += _mm_cmple_ss(val,CT_current_10);
sample6 +=_mm_cmple_ss(val,CT_current_12);
sample6 +=_mm_cmple_ss(val,CT_current_20);
sample6 +=_mm_cmple_ss(val,CT_current_21);
sample6 +=_mm_cmple_ss(val,CT_current_22);
}
// Advance all three row pointers by one image row.
CT_image_0 +=width;
CT_image_1 +=width;
CT_image_2 +=width;
// The output pointer advances by width-2 per row.
dst_d += (width-2);
}
我绞尽脑汁,(作为一个外行)尝试过使用 if 条件……请给我一些思路。
最佳答案
需要改进的部分显然是这一段:
// Quoted from the question: the eight comparison results are accumulated with +=.
__m128 sample6= _mm_cmple_ss(val,CT_current_00);
sample6 += _mm_cmple_ss(val,CT_current_01);
sample6 += _mm_cmple_ss(val,CT_current_02);
sample6 += _mm_cmple_ss(val,CT_current_10);
sample6 +=_mm_cmple_ss(val,CT_current_12);
sample6 +=_mm_cmple_ss(val,CT_current_20);
sample6 +=_mm_cmple_ss(val,CT_current_21);
sample6 +=_mm_cmple_ss(val,CT_current_22);
您需要将所有比较结果组合成一组标志,例如像这样:
__m128i out = _mm_setzero_si128(); // init output flags to all zeroes
__m128i test;
test = _mm_cmple_ss(val, CT_current_00); // compare
test = _mm_and_si128(test, _mm_set1_epi32(0x80)); // mask all but required flag
out = _mm_or_si128(out, test); // merge flags to output mask
test = _mm_cmple_ss(val, CT_current_01);
test = _mm_and_si128(test, _mm_set1_epi32(0x40));
out = _mm_or_si128(out, test);
// ... repeat for each offset and flag value
// ... then finally extract 4 bytes from `out`
// ... and store at output[width*(i-1)+(j-1)]
关于SSE中的比较操作,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/26402272/