optimization - 改进 SSE (SSSE3) YUV 转 RGB 代码

标签 optimization assembly rgb sse yuv

我正在寻找优化我为将 YUV 转换为 RGB(平面和打包 YUV 函数)而编写的一些 SSE 代码。

我目前正在使用 SSSE3,但如果有来自更高版本的 SSE 的有用功能,那也没关系。

我主要对如何解决处理器停顿等问题感兴趣。

有人知道对 SSE 代码进行静态分析的任何工具吗?

;
; Copyright (C) 2009-2010 David McPaul
;
; All rights reserved. Distributed under the terms of the MIT License.
;

; A rather unoptimised set of ssse3 yuv to rgb converters
; does 8 pixels per loop

; inputer:
; reads 128 bits of yuv 8 bit data and puts
; the y values converted to 16 bit in xmm0
; the u values converted to 16 bit and duplicated into xmm1
; the v values converted to 16 bit and duplicated into xmm2

; conversion:
; does the yuv to rgb conversion using 16 bit integer and the
; results are placed into the following registers as 8 bit clamped values
; r values in xmm3
; g values in xmm4
; b values in xmm5

; outputer:
; writes out the rgba pixels as 8 bit values with 0 for alpha

; xmm6 used for scratch
; xmm7 used for scratch

; cglobal <name>
; Opens an exported function: declares the underscore-prefixed symbol _<name>
; as global, makes <name> an alias for _<name> for the rest of the file,
; aligns to a 16-byte boundary and emits the entry label.
%macro cglobal 1
global _%1
%define %1 _%1
align 16
%1:
%endmacro

; conversion code
%macro yuv2rgbsse2 0
; Converts 8 pixels from YUV to RGB using 16-bit saturating integer math.
;
; In:    xmm0 = 8 x y (16 bit), xmm1 = 8 x u (16 bit), xmm2 = 8 x v (16 bit)
; Out:   xmm3 = 8 x r, xmm4 = 8 x g, xmm5 = 8 x b (16 bit, not yet clamped to 0..255)
; Clobb: xmm0, xmm1, xmm2 (bias removed in place), xmm6, xmm7
;
; The multiplications are approximated by sums of arithmetic right shifts:
;   r = y + v + v>>2 + v>>3 + v>>5                       (v * 1.40625)
;   g = y - (u>>2 + u>>4 + u>>5)                         (u * 0.34375)
;         - (v>>1 + v>>3 + v>>4 + v>>5)                  (v * 0.71875)
;   b = y + u + u>>1 + u>>2 + u>>6                       (u * 1.765625)
; y is used unscaled after the bias of 16 is removed.
; The scratch registers are shifted cumulatively, so each psraw count below is
; relative to the previous shift, not to the original value.

    ; y = y - 16
    movdqa  xmm7, [Const16]         ; constant comes from the data section
    psubsw  xmm0, xmm7

    ; u = u - 128, v = v - 128
    movdqa  xmm7, [Const128]        ; must be reloaded: xmm7 still holds Const16 here
    psubsw  xmm1, xmm7
    psubsw  xmm2, xmm7

    ; seed r and b with y
    movdqa  xmm3, xmm0              ; r = y
    pshufd  xmm5, xmm0, 0xE4        ; b = y (0xE4 is the identity shuffle, i.e. a plain copy)

    ; r = y + v + v>>2 + v>>3 + v>>5
    paddsw  xmm3, xmm2              ; r += v
    movdqa  xmm7, xmm1              ; scratch7 = u (consumed by the b section below)
    pshufd  xmm6, xmm2, 0xE4        ; scratch6 = v

    psraw   xmm6, 2                 ; scratch6 = v>>2
    paddsw  xmm3, xmm6
    psraw   xmm6, 1                 ; scratch6 = v>>3
    paddsw  xmm3, xmm6
    psraw   xmm6, 2                 ; scratch6 = v>>5
    paddsw  xmm3, xmm6

    ; b = y + u + u>>1 + u>>2 + u>>6
    paddsw  xmm5, xmm1              ; b += u
    psraw   xmm7, 1                 ; scratch7 = u>>1
    paddsw  xmm5, xmm7
    psraw   xmm7, 1                 ; scratch7 = u>>2
    paddsw  xmm5, xmm7
    psraw   xmm7, 4                 ; scratch7 = u>>6 (a further divide by 16)
    paddsw  xmm5, xmm7

    ; g = y - u>>2 - u>>4 - u>>5 - v>>1 - v>>3 - v>>4 - v>>5
    movdqa  xmm7, xmm2              ; scratch7 = v
    pshufd  xmm6, xmm1, 0xE4        ; scratch6 = u
    movdqa  xmm4, xmm0              ; g = y

    psraw   xmm6, 2                 ; scratch6 = u>>2
    psubsw  xmm4, xmm6
    psraw   xmm6, 2                 ; scratch6 = u>>4
    psubsw  xmm4, xmm6
    psraw   xmm6, 1                 ; scratch6 = u>>5
    psubsw  xmm4, xmm6

    psraw   xmm7, 1                 ; scratch7 = v>>1
    psubsw  xmm4, xmm7
    psraw   xmm7, 2                 ; scratch7 = v>>3
    psubsw  xmm4, xmm7
    psraw   xmm7, 1                 ; scratch7 = v>>4
    psubsw  xmm4, xmm7
    psraw   xmm7, 1                 ; scratch7 = v>>5
    psubsw  xmm4, xmm7
%endmacro

; outputer
%macro rgba32sse2output 0
; Packs 8 pixels of 16-bit r/g/b into 32 bytes of b,g,r,0 and stores them at [edi].
;
; In:    xmm3 = 8 x r, xmm4 = 8 x g, xmm5 = 8 x b (16 bit), edi = output pointer
; Out:   32 bytes written to [edi] .. [edi+31]
; Clobb: xmm0, xmm3, xmm4, xmm5, xmm7
; Note:  movntdq needs a 16-byte aligned memory operand.

    pxor        xmm7, xmm7          ; xmm7 = 0, doubles as the alpha / padding source

    ; saturate each channel to 0..255; the 8 results land in the low 8 bytes
    packuswb    xmm5, xmm7          ; b
    packuswb    xmm4, xmm7          ; g
    packuswb    xmm3, xmm7          ; r

    ; build 16-bit "bg" and "r0" pairs, one pair per pixel
    punpcklbw   xmm5, xmm4          ; bg bg bg bg bg bg bg bg
    punpcklbw   xmm3, xmm7          ; r0 r0 r0 r0 r0 r0 r0 r0

    ; merge the pairs into 32-bit pixels
    movdqa      xmm0, xmm5          ; keep a copy of bg for the upper four pixels
    punpcklwd   xmm5, xmm3          ; pixels 0..3: bgr0 bgr0 bgr0 bgr0
    punpckhwd   xmm0, xmm3          ; pixels 4..7: bgr0 bgr0 bgr0 bgr0

    ; non-temporal stores, bypassing the cache
    movntdq     [edi], xmm5
    movntdq     [edi+16], xmm0
%endmacro

SECTION .data align=16

; Each table is exactly 16 bytes, so every one of them stays 16-byte aligned
; for use as a movdqa / pshufb memory operand.

; bias removed from y (8 x 16 bit)
Const16     times 8 dw 16

; bias removed from u and v (8 x 16 bit)
Const128    times 8 dw 128

; pshufb masks for packed y u y v input. An index byte of 0x80 produces a zero
; byte, so every selected source byte is widened to a 16-bit word.

; u sits at bytes 1, 5, 9, 13; each one is emitted twice (one per pixel of the pair)
UMask       db 0x01, 0x80, 0x01, 0x80, 0x05, 0x80, 0x05, 0x80
            db 0x09, 0x80, 0x09, 0x80, 0x0d, 0x80, 0x0d, 0x80

; v sits at bytes 3, 7, 11, 15; each one is emitted twice
VMask       db 0x03, 0x80, 0x03, 0x80, 0x07, 0x80, 0x07, 0x80
            db 0x0b, 0x80, 0x0b, 0x80, 0x0f, 0x80, 0x0f, 0x80

; y sits at every even byte
YMask       db 0x00, 0x80, 0x02, 0x80, 0x04, 0x80, 0x06, 0x80
            db 0x08, 0x80, 0x0a, 0x80, 0x0c, 0x80, 0x0e, 0x80

; Stack argument addresses, valid once the function has executed
; "push ebp / mov ebp, esp": [ebp] is the saved ebp, [ebp+4] the return
; address, and the arguments follow in declaration order from [ebp+8].
; Use them inside brackets, e.g. mov esi, [fromPtr].

; void Convert_YUV422_RGBA32_SSSE3(void *fromPtr, void *toPtr, int width)
width equ ebp+16
toPtr equ ebp+12
fromPtr equ ebp+8

; void Convert_YUV420P_RGBA32_SSSE3(void *fromYPtr, void *fromUPtr, void *fromVPtr, void *toPtr, int width)
; The "1" suffix keeps these two names distinct from the YUV422 ones above.
width1 equ ebp+24
toPtr1 equ ebp+20
fromVPtr equ ebp+16
fromUPtr equ ebp+12
fromYPtr equ ebp+8

SECTION .text align=16

;-----------------------------------------------------------------------
; void Convert_YUV422_RGBA32_SSSE3(void *fromPtr, void *toPtr, int width)
;
; Converts one row of packed y u y v pixels to 32-bit b,g,r,0 pixels,
; 8 pixels (16 input bytes, 32 output bytes) per iteration.
;
; In:    stack arguments, read relative to ebp (32-bit code)
; Out:   (width / 8) * 32 bytes written at toPtr
; Clobb: xmm0-xmm7, flags; edi, esi and ecx are saved and restored
; Notes: - fromPtr and toPtr must be 16-byte aligned (movdqa load, movntdq store).
;        - Only width / 8 whole groups are converted; the remaining
;          width % 8 pixels are left untouched.
;        - A zero or negative width converts nothing.
;-----------------------------------------------------------------------
cglobal Convert_YUV422_RGBA32_SSSE3
    push    ebp
    mov     ebp, esp
    push    edi
    push    esi
    push    ecx

    mov     esi, [fromPtr]
    mov     edi, [toPtr]
    mov     ecx, [width]

    ; ecx = number of 8-pixel groups. width is a signed int, so shift
    ; arithmetically: a logical shift would turn a negative width into a
    ; huge positive count that the signed test below could never reject.
    sar     ecx, 3
    test    ecx, ecx                ; sets defined flags for the signed branch
    jle     .done

.loop:                              ; ecx = groups remaining, always > 0 here
    ; YUV422 packed input: y u y v  y u y v  y u y v  y u y v
    movdqa  xmm0, [esi]
    pshufd  xmm1, xmm0, 0xE4        ; identity shuffle: copy for the u extraction
    movdqa  xmm2, xmm0              ; copy for the v extraction
    pshufb  xmm0, [YMask]           ; 8 x y widened to 16 bit
    pshufb  xmm1, [UMask]           ; each u widened and duplicated for its pixel pair
    pshufb  xmm2, [VMask]           ; each v widened and duplicated for its pixel pair

    yuv2rgbsse2

    rgba32sse2output

    add     edi, 32                 ; 8 output pixels * 4 bytes
    add     esi, 16                 ; 8 input pixels * 2 bytes
    sub     ecx, 1
    jnz     .loop

.done:
    pop     ecx
    pop     esi
    pop     edi
    mov     esp, ebp
    pop     ebp
    ret

;-----------------------------------------------------------------------
; void Convert_YUV420P_RGBA32_SSSE3(void *fromYPtr, void *fromUPtr,
;                                   void *fromVPtr, void *toPtr, int width)
;
; Converts one row of planar YUV to 32-bit b,g,r,0 pixels, 8 pixels per
; iteration: 8 bytes of y, 4 bytes of u and 4 bytes of v in, 32 bytes out.
; Each u/v sample is shared by two horizontally adjacent pixels.
;
; In:    stack arguments, read relative to ebp (32-bit code)
; Out:   (width / 8) * 32 bytes written at toPtr
; Clobb: xmm0-xmm7, flags; edi, esi, ecx, eax and ebx are saved and restored
; Notes: - toPtr must be 16-byte aligned (movntdq store); the plane loads
;          use movq / movd.
;        - Only width / 8 whole groups are converted; the remaining
;          width % 8 pixels are left untouched.
;        - A zero or negative width converts nothing.
;-----------------------------------------------------------------------
cglobal Convert_YUV420P_RGBA32_SSSE3
    push    ebp
    mov     ebp, esp
    push    edi
    push    esi
    push    ecx
    push    eax
    push    ebx

    mov     esi, [fromYPtr]
    mov     eax, [fromUPtr]
    mov     ebx, [fromVPtr]
    mov     edi, [toPtr1]
    mov     ecx, [width1]

    ; ecx = number of 8-pixel groups. width is a signed int, so shift
    ; arithmetically: a logical shift would turn a negative width into a
    ; huge positive count that the signed test below could never reject.
    sar     ecx, 3
    test    ecx, ecx                ; sets defined flags for the signed branch
    jle     .done

.loop:                              ; ecx = groups remaining, always > 0 here
    ; YUV420 planar input
    movq    xmm0, [esi]             ; 8 y values (8 bit)  yyyyyyyy00000000
    movd    xmm1, [eax]             ; 4 u values (8 bit)  uuuu000000000000
    movd    xmm2, [ebx]             ; 4 v values (8 bit)  vvvv000000000000

    ; widen y to 16 bit
    pxor        xmm7, xmm7          ; xmm7 = 0
    punpcklbw   xmm0, xmm7          ; y0y0y0y0y0y0y0y0

    ; widen u to 16 bit and duplicate each sample for its pixel pair
    punpcklbw   xmm1, xmm7          ; u0u0u0u000000000
    punpcklwd   xmm1, xmm7          ; u000u000u000u000
    pshuflw     xmm1, xmm1, 0xA0    ; words 0,0,2,2: duplicate in the low half
    pshufhw     xmm1, xmm1, 0xA0    ; same for the high half

    ; widen v to 16 bit and duplicate each sample for its pixel pair
    punpcklbw   xmm2, xmm7          ; v0v0v0v000000000
    punpcklwd   xmm2, xmm7          ; v000v000v000v000
    pshuflw     xmm2, xmm2, 0xA0    ; words 0,0,2,2: duplicate in the low half
    pshufhw     xmm2, xmm2, 0xA0    ; same for the high half

    yuv2rgbsse2

    rgba32sse2output

    add     edi, 32                 ; 8 output pixels * 4 bytes
    add     esi, 8                  ; 8 y samples
    add     eax, 4                  ; 4 u samples
    add     ebx, 4                  ; 4 v samples
    sub     ecx, 1
    jnz     .loop

.done:
    pop     ebx
    pop     eax
    pop     ecx
    pop     esi
    pop     edi
    mov     esp, ebp
    pop     ebp
    ret

; Empty, non-executable note section; by GNU toolchain convention this marks
; the object as not needing an executable stack.
SECTION .note.GNU-stack noalloc noexec nowrite progbits

最佳答案

如果您将 u & v 保持在一个寄存器中交错,并使用“pmaddwd”和预先计算的常量而不是您的移位加法方法,则可以将转换代码压缩到大约三分之一,并同时消除大多数停顿:

; Sketch of the conversion using pmaddwd with precomputed coefficients.
; In:  xmm0 = y y y y y y y y   (8 x 16 bit)
;      xmm3 = u v u v u v u v   (u and v interleaved, 16 bit)
; Out: xmm3, xmm4, xmm5 = the three colour channels (16 bit), one per
;      coefficient table const_1 / const_2 / const_3.
; const_1..const_4 are not defined in this snippet. const_1..const_3 are
; presumably coefficient pairs scaled by 2^14, given the shift by 14 below.

psubsw  xmm3, [Const128]            ; remove the chroma bias
psubsw  xmm0, [Const16]             ; remove the luma bias
movdqa  xmm4, xmm3                  ; one copy of uv per output channel
movdqa  xmm5, xmm3
pmaddwd xmm3, [const_1]             ; each dword = u*c_u + v*c_v
pmaddwd xmm4, [const_2]
pmaddwd xmm5, [const_3]
psrad   xmm3, 14                    ; drop the fixed-point fraction
psrad   xmm4, 14
psrad   xmm5, 14
; Narrow the four dwords back to eight words, duplicating each result for its
; pixel pair. SSSE3 pshufb takes two operands (destination, mask).
pshufb  xmm3, [const_4]             ; or pshuflw & pshufhw
pshufb  xmm4, [const_4]
pshufb  xmm5, [const_4]
paddsw  xmm3, xmm0                  ; add y to every channel
paddsw  xmm4, xmm0
paddsw  xmm5, xmm0

如果您希望它运行得更快,那么使用 PMADDUBSW 应该可以让您一次处理 16 个像素,而复杂度略有增加。

大多数处理器都能从在循环中加入一条 prefetchnta [esi+256] 指令中获益(非英特尔处理器尤其如此,它们因硬件预取器效果不佳而出名;英特尔处理器也会受益,只是程度较小)。

编辑:使用 PMADDUBSW 的代码可能如下所示(不保证正确性):
; pshufb masks and bit masks used by the pmaddubsw sketch.
; Labels cannot contain a space; the code refers to them as const_a / const_b.

; selects the u,v bytes (1,3) four times, then the next u,v bytes (5,7) four times
const_a:
times 4 db 1,3
times 4 db 5,7
; same pattern for the u,v bytes at 9,11 and 13,15
const_b:
times 4 db 9,11
times 4 db 13,15
; keeps the low byte of every 16-bit word
const_c: times 8 dw 0x00ff
; keeps the low three bytes of every 32-bit pixel (clears the alpha byte)
const_d: times 4 dd 0x00ffffff

const_uv_to_rgb_mul:
...
const_uv_to_rgb_add:
...

; Sketch of a packed y u y v conversion built on pmaddubsw
; (explicitly not guaranteed to be correct by its author).
; In:  esi = 16 bytes of packed y u y v input, edi = output pointer
; const_uv_to_rgb_mul / const_uv_to_rgb_add are left unspecified here.

movdqa  xmm4, [esi]
movdqa  xmm0, xmm4
movdqa  xmm1, xmm4
pshufb  xmm0, [const_a]             ; u,v bytes of the first two pixel pairs
pshufb  xmm1, [const_b]             ; u,v bytes of the last two pixel pairs
pand    xmm4, [const_c]             ; keep only the y bytes, widened to words

; xmm0: uv0 uv0 uv0 uv0 uv2 uv2 uv2 uv2
; xmm1: uv4 uv4 uv4 uv4 ...
; xmm4: y0 0 y1 0 y2 0 y3 0 y4 0 y5 0 y6 0 y7 0

pmaddubsw   xmm0, [const_uv_to_rgb_mul]
pmaddubsw   xmm1, [const_uv_to_rgb_mul]
paddsw      xmm0, [const_uv_to_rgb_add]
paddsw      xmm1, [const_uv_to_rgb_add]
psraw       xmm0, 6
psraw       xmm1, 6

; r01 g01 b01 0 r23 g23 b23 0

; Broadcast each 64-bit half. The immediate packs four 2-bit dword selectors:
; 2,3,2,3 repeats the upper half, 0,1,0,1 repeats the lower half.
pshufd  xmm2, xmm0, 2+3*4+2*16+3*64
pshufd  xmm0, xmm0, 0+1*4+0*16+1*64
pshufd  xmm3, xmm1, 2+3*4+2*16+3*64
pshufd  xmm1, xmm1, 0+1*4+0*16+1*64

; xmm0: r01 g01 b01 0 r01 g01 b01 0
; xmm2: r23 g23 b23 0 r23 g23 b23 0
; xmm1: r45 g45 b45 0 r45 g45 b45 0
; xmm3: r67 g67 b67 0 r67 g67 b67 0

; NOTE(review): xmm4 holds y0..y7 as consecutive words, while xmm0..xmm3 hold
; r,g,b,0 word groups per pixel; the lanes do not appear to line up as
; written. y probably needs a per-pixel shuffle first. Confirm before use.
paddsw  xmm0, xmm4                  ; add y
paddsw  xmm1, xmm4
paddsw  xmm2, xmm4
paddsw  xmm3, xmm4

packuswb    xmm0, xmm2              ; pack with saturation into 0-255 range
packuswb    xmm1, xmm3
pand        xmm0, [const_d]         ; zero out the alpha byte
pand        xmm1, [const_d]
movntdq     [edi], xmm0
movntdq     [edi+16], xmm1

关于optimization - 改进 SSE (SSSE3) YUV 转 RGB 代码,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/4572548/

相关文章:

windows - 当您最小化应用程序时,Windows 会做些什么吗?

r - R中的非线性离散优化

assembly - 什么是机器周期?

linux - 为什么 %rbp 没有指向任何东西?

assembly - 为什么编译器要在从函数返回的 MIPS "j"指令之后放置一条指令?

android - 如何防止在 Nexus 4 上的 Android Chrome 中下载 "dekstop"css?

algorithm - 具有 K 个额外节点的最小生成树

jquery - 获取 #000 格式的背景颜色,而不是 RGB

c++ - 如何通过组合红色和蓝色无符号字节来优化混合?

javascript - 随喜好的随机颜色