延迟是这里最大的问题。我发现尝试通过 OpenGL 将带有 RGBA 覆盖的 3 个 1920x1080 视频源渲染到单个窗口有限制。我能够渲染两个带叠加层的窗口或 3 个不带叠加层的窗口,但当引入第三个窗口时,渲染停顿很明显。我认为这个问题是由于过度使用 glAlphaFunc() 来覆盖 RGB 视频纹理和基于 RGBA 的纹理。为了减少过度使用,我的想法是将一些覆盖功能移到 CPU 中(因为我有很多 CPU - 双六核至强)。执行此操作的理想位置是将源 RGB 图像复制到映射的 PBO 并将 RGB 值替换为 A > 0 的 RGBA 叠加层中的值。
我尝试过使用英特尔 IPP 方法,但没有一种方法不涉及多次调用并会导致过多的延迟。我试过直接使用 C 代码,但这比我允许的 33 毫秒要长。我需要帮助来创建优化的程序集或基于 SSE 的例程,以提供最小的延迟。
使用 > g++ -fopenmp -O2 -mtune=native 编译下面的代码
为清楚起见,基本 C 函数:
void copyAndOverlay(const uint8_t* aSourceRGB, const uint8_t* aOverlayRGBA, uint8_t* aDestinationRGB, int aWidth, int aHeight) {
int i;
#pragma omp parallel for
for (i=0; i<aWidth*aHeight; ++i) {
if (0 == aOverlayRGBA[i*4+3]) {
aDestinationRGB[i*3] = aSourceRGB[i*3]; // R
aDestinationRGB[i*3+1] = aSourceRGB[i*3+1]; // G
aDestinationRGB[i*3+2] = aSourceRGB[i*3+2]; // B
} else {
aDestinationRGB[i*3] = aOverlayRGBA[i*4]; // R
aDestinationRGB[i*3+1] = aOverlayRGBA[i*4+1]; // G
aDestinationRGB[i*3+2] = aOverlayRGBA[i*4+2]; // B
}
}
}
uint64_t getTime() {
struct timeval tNow;
gettimeofday(&tNow, NULL);
return (uint64_t)tNow.tv_sec * 1000000 + (uint64_t)tNow.tv_usec;
}
int main(int argc, char **argv) {
int pixels = _WIDTH_ * _HEIGHT_ * 3;
uint8_t *rgba = new uint8_t[_WIDTH_ * _HEIGHT_ * 4];
uint8_t *src = new uint8_t[pixels];
uint8_t *dst = new uint8_t[pixels];
uint64_t tStart = getTime();
for (int t=0; t<1000; ++t) {
copyAndOverlay(src, rgba, dst, _WIDTH_, _HEIGHT_);
}
printf("delta: %lu\n", (getTime() - tStart) / 1000);
delete [] rgba;
delete [] src;
delete [] dst;
return 0;
}
最佳答案
这是一个 SSE4 实现,它比您随问题发布的代码快 5 倍多一点(没有循环的并行化)。如所写,它仅适用于 16 字节对齐且大小为 64 的倍数的 RGBA 缓冲区,以及 16 字节对齐且大小为 48 的倍数的 RGB 缓冲区。大小要求将与您的 1920x1080 分辨率完美匹配,并且您可能需要添加代码以确保您的缓冲区是 16 字节对齐的。
void copyAndOverlay(const uint8_t* aSourceRGB, const uint8_t* aOverlayRGBA, uint8_t* aDestinationRGB, int aWidth, int aHeight) {
__m128i const ocmp = _mm_setzero_si128();
__m128i const omskshf1 = _mm_set_epi32(0x00000000, 0x0F0F0F0B, 0x0B0B0707, 0x07030303);
__m128i const omskshf2 = _mm_set_epi32(0x07030303, 0x00000000, 0x0F0F0F0B, 0x0B0B0707);
__m128i const omskshf3 = _mm_set_epi32(0x0B0B0707, 0x07030303, 0x00000000, 0x0F0F0F0B);
__m128i const omskshf4 = _mm_set_epi32(0x0F0F0F0B, 0x0B0B0707, 0x07030303, 0x00000000);
__m128i const ovalshf1 = _mm_set_epi32(0x00000000, 0x0E0D0C0A, 0x09080605, 0x04020100);
__m128i const ovalshf2 = _mm_set_epi32(0x04020100, 0x00000000, 0x0E0D0C0A, 0x09080605);
__m128i const ovalshf3 = _mm_set_epi32(0x09080605, 0x04020100, 0x00000000, 0x0E0D0C0A);
__m128i const ovalshf4 = _mm_set_epi32(0x0E0D0C0A, 0x09080605, 0x04020100, 0x00000000);
__m128i const blndmsk1 = _mm_set_epi32(0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000);
__m128i const blndmsk2 = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000);
__m128i const blndmsk3 = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000);
__m128i a, b, c, x, y, z, w, p, q, r, s;
uint8_t const *const aSourceRGBPast = aSourceRGB + 3 * aWidth * aHeight;
while (aSourceRGB != aSourceRGBPast) {
// source:
// aaabbbcccdddeeef
// ffggghhhiiijjjkk
// klllmmmnnnoooppp
//
// overlay:
// aaaabbbbccccdddd
// eeeeffffgggghhhh
// iiiijjjjkkkkllll
// mmmmnnnnoooopppp
// load source
a = _mm_load_si128((__m128i const*)(aSourceRGB ));
b = _mm_load_si128((__m128i const*)(aSourceRGB + 16));
c = _mm_load_si128((__m128i const*)(aSourceRGB + 32));
// load overlay
x = _mm_load_si128((__m128i const*)(aOverlayRGBA ));
y = _mm_load_si128((__m128i const*)(aOverlayRGBA + 16));
z = _mm_load_si128((__m128i const*)(aOverlayRGBA + 32));
w = _mm_load_si128((__m128i const*)(aOverlayRGBA + 48));
// compute blend mask, put 0xFF in bytes equal to zero
p = _mm_cmpeq_epi8(x, ocmp);
q = _mm_cmpeq_epi8(y, ocmp);
r = _mm_cmpeq_epi8(z, ocmp);
s = _mm_cmpeq_epi8(w, ocmp);
// align overlay to be condensed to 3-byte color
x = _mm_shuffle_epi8(x, ovalshf1);
y = _mm_shuffle_epi8(y, ovalshf2);
z = _mm_shuffle_epi8(z, ovalshf3);
w = _mm_shuffle_epi8(w, ovalshf4);
// condense overlay to 3-btye color
x = _mm_blendv_epi8(x, y, blndmsk1);
y = _mm_blendv_epi8(y, z, blndmsk2);
z = _mm_blendv_epi8(z, w, blndmsk3);
// align blend mask to be condensed to 3-byte color
p = _mm_shuffle_epi8(p, omskshf1);
q = _mm_shuffle_epi8(q, omskshf2);
r = _mm_shuffle_epi8(r, omskshf3);
s = _mm_shuffle_epi8(s, omskshf4);
// condense blend mask to 3-btye color
p = _mm_blendv_epi8(p, q, blndmsk1);
q = _mm_blendv_epi8(q, r, blndmsk2);
r = _mm_blendv_epi8(r, s, blndmsk3);
// select from overlay and source based on blend mask
x = _mm_blendv_epi8(x, a, p);
y = _mm_blendv_epi8(y, b, q);
z = _mm_blendv_epi8(z, c, r);
// write colors to destination
_mm_store_si128((__m128i*)(aDestinationRGB ), x);
_mm_store_si128((__m128i*)(aDestinationRGB + 16), y);
_mm_store_si128((__m128i*)(aDestinationRGB + 32), z);
// update poniters
aSourceRGB += 48;
aOverlayRGBA += 64;
aDestinationRGB += 48;
}
}
关于c++ - 使用 RGB 源和 RGBA 叠加实现近乎实时的 CPU 功能,如 glAlphaFunc(GL_GREATER),我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/22018826/