c++ - 如何强制 emscripten/em++/llvm 从 .rodata 加载常量和/或执行更好的 SIMD 优化?

标签 c++ assembly v8 webassembly emscripten

我是 SSIM.js 和 jest-image-snapshot 的活跃作者和维护者。目前,我正在努力优化我们的图像处理实现,以利用 WebAssembly 来提高性能。
现在,我注意到无论是从 LLVM 汇编(WebAssembly 文本格式?)输出来看,还是从 Node.js 的实际机器码输出(--print-wasm-code)来看,生成的代码都添加了不必要的指令。特别值得注意的是,它在加载常量时会做一些非常奇怪的事情。例如,请看下面三段代码中名为 multiplier 的数组或名为 rounder 的常量。在 GCC 上,multiplier 会存放在汇编的 .rodata 段中并只加载一次,或者被转换为整数,而 rounder 会通过 movd 或 movq 内联。而这里似乎是在循环的每一轮中逐个插入值。它还用 vpblendw 做了一些我完全看不懂的事情。
我该如何解决?

// Fixed-point luma weights for the R, G, B lanes (77 + 150 + 29 == 256), laid
// out as two R,G,B,A groups of 16-bit lanes. The fourth weight is 1 because
// rgba2y() replaces each alpha byte with the rounding constant 128, so that
// lane contributes exactly +128 to the weighted sum.
alignas(64) const static uint16_t multiplierArray[8]= {77,150,29,1,77,150,29,1};

/**
 * Converts packed 8-bit RGBA pixels to 8-bit luma, in place.
 *
 * Each loop iteration consumes four 16-byte vectors (16 pixels, 64 bytes) and
 * produces one 16-byte vector holding one luma byte per pixel, computed as
 * (77*R + 150*G + 29*B + 128) >> 8. Output vector k is written to byte offset
 * 16*k of the same buffer; it is built from input vectors 4k..4k+3, so a store
 * never overwrites input that has not been read yet.
 *
 * @param inputDataBuffer  RGBA pixel data; it is also the destination buffer.
 * @param length           Size of the buffer in bytes. Only whole 64-byte
 *                         groups are handled by the loop shown here; a
 *                         non-positive length processes nothing.
 * @return Always 0.
 */
extern "C"
int rgba2y(void* inputDataBuffer, ptrdiff_t length) {
        typedef __u8x16 v8x16;
        typedef __u16x8 v16x8;
        v8x16* pInputPtr = (v8x16*) inputDataBuffer;
        v8x16* pOutputPtr = (v8x16*) inputDataBuffer;
        // Only byte 0 of this vector (0x80 == 128) is selected by the shuffles
        // below; it takes the place of every alpha byte.
        __m128i rounder = _mm_cvtsi32_si128(0x80808080);
        // Value-initialise instead of XOR-ing an uninitialised variable with
        // itself, which reads an indeterminate value.
        v8x16 zero = {};
        __m128i multiplier = *((const __m128i*)multiplierArray);
        // Equivalent constant built without a memory load:
//      v16x8 multiplier = wasm_i64x2_splat(0x1001d0096004d);
        // Count whole 16-byte vectors once, in an unsigned type, so the loop
        // bound is not a mixed signed/unsigned comparison and cannot wrap.
        const size_t vectorCount = length > 0 ? (size_t)length / sizeof(__m128i) : 0;
        for (size_t i = 0; i + 4 <= vectorCount; i += 4) {
                // i indexes input vectors; lane index 16 selects byte 0 of
                // `rounder`, replacing the alpha channel of each pixel.
                v8x16 iv0 = wasm_v8x16_shuffle(pInputPtr[i],rounder,0,1,2,16,4,5,6,16,8,9,10,16,12,13,14,16);
                v8x16 iv1 = wasm_v8x16_shuffle(pInputPtr[i+1],rounder,0,1,2,16,4,5,6,16,8,9,10,16,12,13,14,16);
                v8x16 iv2 = wasm_v8x16_shuffle(pInputPtr[i+2],rounder,0,1,2,16,4,5,6,16,8,9,10,16,12,13,14,16);
                v8x16 iv3 = wasm_v8x16_shuffle(pInputPtr[i+3],rounder,0,1,2,16,4,5,6,16,8,9,10,16,12,13,14,16);
                // Widen bytes to 16-bit lanes, multiply by the weights, then add
                // adjacent lanes: each pixel becomes (77*R + 150*G, 29*B + 128).
                // rg ba rg ba rg ba rg ba rg ba rg ba rg ba
                __m128i rg0 = _mm_hadd_epi16(_mm_mullo_epi16(_mm_unpacklo_epi8((__m128i)iv0, (__m128i)zero),(__m128i)multiplier), _mm_mullo_epi16(_mm_unpackhi_epi8((__m128i)iv0,(__m128i)zero),(__m128i)multiplier));
                __m128i rg1 = _mm_hadd_epi16(_mm_mullo_epi16(_mm_unpacklo_epi8((__m128i)iv1,(__m128i)zero),(__m128i)multiplier), _mm_mullo_epi16(_mm_unpackhi_epi8((__m128i)iv1,(__m128i)zero),(__m128i)multiplier));
                __m128i rg2 = _mm_hadd_epi16(_mm_mullo_epi16(_mm_unpacklo_epi8((__m128i)iv2,(__m128i)zero),(__m128i)multiplier), _mm_mullo_epi16(_mm_unpackhi_epi8((__m128i)iv2,(__m128i)zero),(__m128i)multiplier));
                __m128i rg3 = _mm_hadd_epi16(_mm_mullo_epi16(_mm_unpacklo_epi8((__m128i)iv3,(__m128i)zero),(__m128i)multiplier), _mm_mullo_epi16(_mm_unpackhi_epi8((__m128i)iv3,(__m128i)zero),(__m128i)multiplier));
                // Second horizontal add yields one 16-bit sum per pixel; the
                // shift by 8 divides by 256.
                // rgba rgba rgba rgba rgba rgba rgba rgba
                __m128i rgba0 = wasm_u16x8_shr(_mm_hadd_epi16(rg0,rg1), 8);
                __m128i rgba1 = wasm_u16x8_shr(_mm_hadd_epi16(rg2,rg3), 8);
                // One output vector per four input vectors, hence i/4.
                pOutputPtr[i/4] = wasm_u8x16_narrow_i16x8(rgba0,rgba1);
        }
        // abbreviated...
        return 0;
}

llvm 程序集是:
    .section    .text.rgba2y,"",@
    .hidden rgba2y                          # -- Begin function rgba2y
    .globl  rgba2y
    .type   rgba2y,@function
rgba2y:                                 # @rgba2y
.Lfunc_begin0:
    .loc    2 56 0                          # rgb2y-sample.cpp:56:0
    .functype   rgba2y (i32, i32) -> (i32)
    .local      i32, i32, v128, v128, v128, v128, v128, v128
# %bb.0:                                # %entry
    #DEBUG_VALUE: rgba2y:length <- %4
    #DEBUG_VALUE: rgba2y:pInputPtrEnd <- undef
    #DEBUG_VALUE: rgba2y:i <- 0
    #DEBUG_VALUE: rgba2y:inputDataBuffer <- %3
    #DEBUG_VALUE: rgba2y:pInputPtr <- %3
    #DEBUG_VALUE: rgba2y:pOutputPtr <- %3
    #DEBUG_VALUE: rgba2y:rounder <- undef
    #DEBUG_VALUE: rgba2y:zero <- undef
    #DEBUG_VALUE: rgba2y:multiplier <- undef
    block
.Ltmp0:
    .loc    2 68 30 prologue_end            # rgb2y-sample.cpp:68:30
    local.get   1
    i32.const   64
    i32.lt_u
.Ltmp1:
    .loc    2 68 2 is_stmt 0                # rgb2y-sample.cpp:68:2
    br_if       0                               # 0: down to label0
.Ltmp2:
# %bb.1:
    .loc    2 0 2                           # rgb2y-sample.cpp:0:2
    i32.const   0
    local.set   2
    i32.const   4
    local.set   3
.LBB0_2:                                # %for.body
                                        # =>This Inner Loop Header: Depth=1
    loop                                        # label1:
.Ltmp3:
    #DEBUG_VALUE: rgba2y:i <- %101
    #DEBUG_VALUE: rgba0 <- undef
    #DEBUG_VALUE: rgba1 <- undef
    .loc    2 69 15 is_stmt 1               # rgb2y-sample.cpp:69:15
    local.get   0
    local.get   2
    i32.const   2
    i32.shl
    i32.add
    local.tee   2
    local.get   2
    v128.load   0
    i32.const   0
    i8x16.splat
    local.tee   4
    i32.const   -128
    i8x16.replace_lane  0
    i32.const   -128
    i8x16.replace_lane  1
    i32.const   -128
    i8x16.replace_lane  2
    i32.const   -128
    i8x16.replace_lane  3
    local.tee   5
    v8x16.shuffle   0, 1, 2, 16, 4, 5, 6, 16, 8, 9, 10, 16, 12, 13, 14, 16
.Ltmp4:
    .loc    2 74 48                         # rgb2y-sample.cpp:74:48
    local.tee   6
.Ltmp5:
    #DEBUG_VALUE: iv0 <- undef
    #DEBUG_VALUE: iv0 <- %153
    local.get   4
    v8x16.shuffle   0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23
    i32.const   77
    .loc    2 74 32 is_stmt 0               # rgb2y-sample.cpp:74:32
    i16x8.splat
    i32.const   150
    i16x8.replace_lane  1
    i32.const   29
    i16x8.replace_lane  2
    i32.const   1
    i16x8.replace_lane  3
    i32.const   160
    i16x8.replace_lane  5
    i32.const   29
    i16x8.replace_lane  6
    i32.const   1
    i16x8.replace_lane  7
    local.tee   7
    i16x8.mul
    .loc    2 74 133                        # rgb2y-sample.cpp:74:133
    local.tee   8
    local.get   6
    local.get   4
    v8x16.shuffle   8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31
    .loc    2 74 117                        # rgb2y-sample.cpp:74:117
    local.get   7
    i16x8.mul
    .loc    2 74 17                         # rgb2y-sample.cpp:74:17
    local.tee   6
    v8x16.shuffle   2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31
    local.get   8
    local.get   6
    v8x16.shuffle   0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29
    i16x8.add
.Ltmp6:
    .loc    2 0 17                          # rgb2y-sample.cpp:0:17
    local.tee   6
.Ltmp7:
    #DEBUG_VALUE: rg0 <- undef
    #DEBUG_VALUE: rg0 <- %153
    .loc    2 70 15 is_stmt 1               # rgb2y-sample.cpp:70:15
    local.get   2
    i32.const   16
    i32.add
    v128.load   0
    local.get   5
    v8x16.shuffle   0, 1, 2, 16, 4, 5, 6, 16, 8, 9, 10, 16, 12, 13, 14, 16
.Ltmp8:
    .loc    2 75 62                         # rgb2y-sample.cpp:75:62
    local.tee   8
.Ltmp9:
    #DEBUG_VALUE: iv1 <- undef
    #DEBUG_VALUE: iv1 <- %157
    local.get   4
    v8x16.shuffle   0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23
    .loc    2 75 46 is_stmt 0               # rgb2y-sample.cpp:75:46
    local.get   7
    i16x8.mul
    .loc    2 75 146                        # rgb2y-sample.cpp:75:146
    local.tee   9
    local.get   8
    local.get   4
    v8x16.shuffle   8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31
    .loc    2 75 130                        # rgb2y-sample.cpp:75:130
    local.get   7
    i16x8.mul
    .loc    2 75 31                         # rgb2y-sample.cpp:75:31
    local.tee   8
    v8x16.shuffle   2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31
    local.get   9
    local.get   8
    v8x16.shuffle   0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29
    i16x8.add
.Ltmp10:
    .loc    2 79 33 is_stmt 1               # rgb2y-sample.cpp:79:33
    local.tee   8
.Ltmp11:
    #DEBUG_VALUE: rg1 <- undef
    #DEBUG_VALUE: rg1 <- %157
    v8x16.shuffle   2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31
    local.get   6
    local.get   8
    v8x16.shuffle   0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29
    i16x8.add
    i32.const   8
    .loc    2 79 18 is_stmt 0               # rgb2y-sample.cpp:79:18
    i16x8.shr_u
    .loc    2 71 15 is_stmt 1               # rgb2y-sample.cpp:71:15
    local.get   2
    i32.const   32
    i32.add
    v128.load   0
    local.get   5
    v8x16.shuffle   0, 1, 2, 16, 4, 5, 6, 16, 8, 9, 10, 16, 12, 13, 14, 16
.Ltmp12:
    .loc    2 76 62                         # rgb2y-sample.cpp:76:62
    local.tee   6
.Ltmp13:
    #DEBUG_VALUE: iv2 <- undef
    #DEBUG_VALUE: iv2 <- %153
    local.get   4
    v8x16.shuffle   0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23
    .loc    2 76 46 is_stmt 0               # rgb2y-sample.cpp:76:46
    local.get   7
    i16x8.mul
    .loc    2 76 146                        # rgb2y-sample.cpp:76:146
    local.tee   8
    local.get   6
    local.get   4
    v8x16.shuffle   8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31
    .loc    2 76 130                        # rgb2y-sample.cpp:76:130
    local.get   7
    i16x8.mul
    .loc    2 76 31                         # rgb2y-sample.cpp:76:31
    local.tee   6
    v8x16.shuffle   2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31
    local.get   8
    local.get   6
    v8x16.shuffle   0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29
    i16x8.add
.Ltmp14:
    .loc    2 0 31                          # rgb2y-sample.cpp:0:31
    local.tee   6
.Ltmp15:
    #DEBUG_VALUE: rg2 <- undef
    #DEBUG_VALUE: rg2 <- %153
    .loc    2 72 15 is_stmt 1               # rgb2y-sample.cpp:72:15
    local.get   2
    i32.const   48
    i32.add
    v128.load   0
    local.get   5
    v8x16.shuffle   0, 1, 2, 16, 4, 5, 6, 16, 8, 9, 10, 16, 12, 13, 14, 16
.Ltmp16:
    .loc    2 77 62                         # rgb2y-sample.cpp:77:62
    local.tee   5
.Ltmp17:
    #DEBUG_VALUE: iv3 <- undef
    #DEBUG_VALUE: iv3 <- %98
    local.get   4
    v8x16.shuffle   0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23
    .loc    2 77 46 is_stmt 0               # rgb2y-sample.cpp:77:46
    local.get   7
    i16x8.mul
    .loc    2 77 146                        # rgb2y-sample.cpp:77:146
    local.tee   8
    local.get   5
    local.get   4
    v8x16.shuffle   8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31
    .loc    2 77 130                        # rgb2y-sample.cpp:77:130
    local.get   7
    i16x8.mul
    .loc    2 77 31                         # rgb2y-sample.cpp:77:31
    local.tee   4
    v8x16.shuffle   2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31
    local.get   8
    local.get   4
    v8x16.shuffle   0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29
    i16x8.add
.Ltmp18:
    .loc    2 80 33 is_stmt 1               # rgb2y-sample.cpp:80:33
    local.tee   4
.Ltmp19:
    #DEBUG_VALUE: rg3 <- undef
    #DEBUG_VALUE: rg3 <- %93
    v8x16.shuffle   2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31
    local.get   6
    local.get   4
    v8x16.shuffle   0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29
    i16x8.add
    i32.const   8
    .loc    2 80 18 is_stmt 0               # rgb2y-sample.cpp:80:18
    i16x8.shr_u
    .loc    2 81 21 is_stmt 1               # rgb2y-sample.cpp:81:21
    i8x16.narrow_i16x8_u
    .loc    2 81 19 is_stmt 0               # rgb2y-sample.cpp:81:19
    v128.store  0
.Ltmp20:
    #DEBUG_VALUE: rgba2y:i <- %170
    .loc    2 0 19                          # rgb2y-sample.cpp:0:19
    local.get   3
    local.tee   3
    local.set   2
.Ltmp21:
    .loc    2 68 11 is_stmt 1               # rgb2y-sample.cpp:68:11
    local.get   3
    i32.const   4
    i32.add
    local.tee   3
    i32.const   4
    .loc    2 68 14 is_stmt 0               # rgb2y-sample.cpp:68:14
    i32.shl
    .loc    2 68 30                         # rgb2y-sample.cpp:68:30
    local.get   1
    i32.le_u
.Ltmp22:
    .loc    2 68 2                          # rgb2y-sample.cpp:68:2
    br_if       0                               # 0: up to label1
.Ltmp23:
.LBB0_3:                                # %for.end
    end_loop
    end_block                               # label0:
    i32.const   0
.Ltmp24:
    .loc    2 84 2 is_stmt 1                # rgb2y-sample.cpp:84:2
                                        # fallthrough-return
    end_function
.Ltmp25:
.Lfunc_end0:
    .size   rgba2y, .Lfunc_end0-rgba2y
                                        # -- End function
V8 生成的实际机器码(--print-wasm-code 的输出)如下:
--- WebAssembly code ---
index: 2
kind: wasm function
compiler: TurboFan
Body (size = 1088 = 1086 + 2 padding)
Instructions (size = 1064)
0xa5976359180     0  55             push rbp
0xa5976359181     1  4889e5         REX.W movq rbp,rsp
0xa5976359184     4  6a0a           push 0xa
0xa5976359186     6  56             push rsi
0xa5976359187     7  4883ec58       REX.W subq rsp,0x58
0xa597635918b     b  488b5e17       REX.W movq rbx,[rsi+0x17]
0xa597635918f     f  83fa40         cmpl rdx,0x40
0xa5976359192    12  0f8307000000   jnc 0xa597635919f  <+0x1f>
0xa5976359198    18  33c9           xorl rcx,rcx
0xa597635919a    1a  e990030000     jmp 0xa597635952f  <+0x3af>
0xa597635919f    1f  b94d000000     movl rcx,0x4d
0xa59763591a4    24  c5f96ec1       vmovd xmm0,rcx
0xa59763591a8    28  c5fb70c000     vpshuflw xmm0,xmm0,0x0
0xa59763591ad    2d  c5f970c000     vpshufd xmm0,xmm0,0x0
0xa59763591b2    32  33c9           xorl rcx,rcx
0xa59763591b4    34  c5f96ec9       vmovd xmm1,rcx
0xa59763591b8    38  c4410057ff     vxorps xmm15,xmm15,xmm15
0xa59763591bd    3d  c4c27100cf     vpshufb xmm1,xmm1,xmm15
0xa59763591c2    42  bf96000000     movl rdi,0x96
0xa59763591c7    47  c5f9c4c701     vpinsrw xmm0,xmm0,rdi,0x1
0xa59763591cc    4c  bf80ffffff     movl rdi,0xffffff80
0xa59763591d1    51  c5f928d1       vmovapd xmm2,xmm1
0xa59763591d5    55  c4e36920d700   vpinsrb xmm2,xmm2,dil,0x0
0xa59763591db    5b  41b81d000000   movl r8,0x1d
0xa59763591e1    61  c4c179c4c002   vpinsrw xmm0,xmm0,r8,0x2
0xa59763591e7    67  c4e36920d701   vpinsrb xmm2,xmm2,dil,0x1
0xa59763591ed    6d  41b901000000   movl r9,0x1
0xa59763591f3    73  c4c179c4c103   vpinsrw xmm0,xmm0,r9,0x3
0xa59763591f9    79  c4e36920d702   vpinsrb xmm2,xmm2,dil,0x2
0xa59763591ff    7f  41bba0000000   movl r11,0xa0
0xa5976359205    85  c4c179c4c305   vpinsrw xmm0,xmm0,r11,0x5
0xa597635920b    8b  c4e36920d703   vpinsrb xmm2,xmm2,dil,0x3
0xa5976359211    91  c4c179c4c006   vpinsrw xmm0,xmm0,r8,0x6
0xa5976359217    97  c4c179c4c107   vpinsrw xmm0,xmm0,r9,0x7
0xa597635921d    9d  488bf9         REX.W movq rdi,rcx
0xa5976359220    a0  41b804000000   movl r8,0x4
0xa5976359226    a6  e90b000000     jmp 0xa5976359236  <+0xb6>
0xa597635922b    ab  0f1f440000     nop
0xa5976359230    b0  498bf8         REX.W movq rdi,r8
0xa5976359233    b3  4d8bc1         REX.W movq r8,r9
0xa5976359236    b6  4c8b4e2f       REX.W movq r9,[rsi+0x2f]
0xa597635923a    ba  493b21         REX.W cmpq rsp,[r9]
0xa597635923d    bd  0f86f4020000   jna 0xa5976359537  <+0x3b7>
0xa5976359243    c3  458d4804       leal r9,[r8+0x4]
0xa5976359247    c7  4d8bd9         REX.W movq r11,r9
0xa597635924a    ca  41c1e304       shll r11, 4
0xa597635924e    ce  8d3cb8         leal rdi,[rax+rdi*4]
0xa5976359251    d1  c5fa6f1c3b     vmovdqu xmm3,[rbx+rdi*1]
0xa5976359256    d6  c5fa6f641f10   vmovdqu xmm4,[rdi+rbx*1+0x10]
0xa597635925c    dc  c5fa6f6c1f20   vmovdqu xmm5,[rdi+rbx*1+0x20]
0xa5976359262    e2  c5fa6f741f30   vmovdqu xmm6,[rdi+rbx*1+0x30]
0xa5976359268    e8  c57810fe       vmovups xmm15,xmm6
0xa597635926c    ec  49ba0001028004050680 REX.W movq r10,0x8006050480020100
0xa5976359276    f6  c441f96ec2     vmovq xmm8,r10
0xa597635927b    fb  49ba08090a800c0d0e80 REX.W movq r10,0x800e0d0c800a0908
0xa5976359285   105  c443b922c201   vpinsrq xmm8,xmm8,r10,0x1
0xa597635928b   10b  c4420100f8     vpshufb xmm15,xmm15,xmm8
0xa5976359290   110  0f10fa         movups xmm7,xmm2
0xa5976359293   113  49ba8080800080808000 REX.W movq r10,0x80808000808080
0xa597635929d   11d  c441f96ec2     vmovq xmm8,r10
0xa59763592a2   122  4c8b15ecffffff REX.W movq r10,[rip+0xffffffec]
0xa59763592a9   129  c443b922c201   vpinsrq xmm8,xmm8,r10,0x1
0xa59763592af   12f  c4c24100f8     vpshufb xmm7,xmm7,xmm8
0xa59763592b4   134  c4c141ebff     vpor xmm7,xmm7,xmm15
0xa59763592b9   139  c57810fd       vmovups xmm15,xmm5
0xa59763592bd   13d  4c8b15aaffffff REX.W movq r10,[rip+0xffffffaa]
0xa59763592c4   144  c441f96ec2     vmovq xmm8,r10
0xa59763592c9   149  4c8b15adffffff REX.W movq r10,[rip+0xffffffad]
0xa59763592d0   150  c443b922c201   vpinsrq xmm8,xmm8,r10,0x1
0xa59763592d6   156  c4420100f8     vpshufb xmm15,xmm15,xmm8
0xa59763592db   15b  0f10f2         movups xmm6,xmm2
0xa59763592de   15e  4c8b15b0ffffff REX.W movq r10,[rip+0xffffffb0]
0xa59763592e5   165  c441f96ec2     vmovq xmm8,r10
0xa59763592ea   16a  4c8b15a4ffffff REX.W movq r10,[rip+0xffffffa4]
0xa59763592f1   171  c443b922c201   vpinsrq xmm8,xmm8,r10,0x1
0xa59763592f7   177  c4c24900f0     vpshufb xmm6,xmm6,xmm8
0xa59763592fc   17c  c4c149ebf7     vpor xmm6,xmm6,xmm15
0xa5976359301   181  c57810fc       vmovups xmm15,xmm4
0xa5976359305   185  4c8b1562ffffff REX.W movq r10,[rip+0xffffff62]
0xa597635930c   18c  c441f96ec2     vmovq xmm8,r10
0xa5976359311   191  4c8b1565ffffff REX.W movq r10,[rip+0xffffff65]
0xa5976359318   198  c443b922c201   vpinsrq xmm8,xmm8,r10,0x1
0xa597635931e   19e  c4420100f8     vpshufb xmm15,xmm15,xmm8
0xa5976359323   1a3  0f10ea         movups xmm5,xmm2
0xa5976359326   1a6  4c8b1568ffffff REX.W movq r10,[rip+0xffffff68]
0xa597635932d   1ad  c441f96ec2     vmovq xmm8,r10
0xa5976359332   1b2  4c8b155cffffff REX.W movq r10,[rip+0xffffff5c]
0xa5976359339   1b9  c443b922c201   vpinsrq xmm8,xmm8,r10,0x1
0xa597635933f   1bf  c4c25100e8     vpshufb xmm5,xmm5,xmm8
0xa5976359344   1c4  c4c151ebef     vpor xmm5,xmm5,xmm15
0xa5976359349   1c9  c57810fb       vmovups xmm15,xmm3
0xa597635934d   1cd  4c8b151affffff REX.W movq r10,[rip+0xffffff1a]
0xa5976359354   1d4  c441f96ec2     vmovq xmm8,r10
0xa5976359359   1d9  4c8b151dffffff REX.W movq r10,[rip+0xffffff1d]
0xa5976359360   1e0  c443b922c201   vpinsrq xmm8,xmm8,r10,0x1
0xa5976359366   1e6  c4420100f8     vpshufb xmm15,xmm15,xmm8
0xa597635936b   1eb  0f10e2         movups xmm4,xmm2
0xa597635936e   1ee  4c8b1520ffffff REX.W movq r10,[rip+0xffffff20]
0xa5976359375   1f5  c441f96ec2     vmovq xmm8,r10
0xa597635937a   1fa  4c8b1514ffffff REX.W movq r10,[rip+0xffffff14]
0xa5976359381   201  c443b922c201   vpinsrq xmm8,xmm8,r10,0x1
0xa5976359387   207  c4c25900e0     vpshufb xmm4,xmm4,xmm8
0xa597635938c   20c  c4c159ebe7     vpor xmm4,xmm4,xmm15
0xa5976359391   211  c5f928df       vmovapd xmm3,xmm7
0xa5976359395   215  c5e168d9       vpunpckhbw xmm3,xmm3,xmm1
0xa5976359399   219  c5c160f9       vpunpcklbw xmm7,xmm7,xmm1
0xa597635939d   21d  c57928c6       vmovapd xmm8,xmm6
0xa59763593a1   221  c53968c1       vpunpckhbw xmm8,xmm8,xmm1
0xa59763593a5   225  c5c960f1       vpunpcklbw xmm6,xmm6,xmm1
0xa59763593a9   229  c57928cd       vmovapd xmm9,xmm5
0xa59763593ad   22d  c53168c9       vpunpckhbw xmm9,xmm9,xmm1
0xa59763593b1   231  c5d160e9       vpunpcklbw xmm5,xmm5,xmm1
0xa59763593b5   235  c57928d4       vmovapd xmm10,xmm4
0xa59763593b9   239  c52968d1       vpunpckhbw xmm10,xmm10,xmm1
0xa59763593bd   23d  c5d960e1       vpunpcklbw xmm4,xmm4,xmm1
0xa59763593c1   241  c5e1d5d8       vpmullw xmm3,xmm3,xmm0
0xa59763593c5   245  c5c1d5f8       vpmullw xmm7,xmm7,xmm0
0xa59763593c9   249  c539d5c0       vpmullw xmm8,xmm8,xmm0
0xa59763593cd   24d  c5c9d5f0       vpmullw xmm6,xmm6,xmm0
0xa59763593d1   251  c531d5c8       vpmullw xmm9,xmm9,xmm0
0xa59763593d5   255  c5d1d5e8       vpmullw xmm5,xmm5,xmm0
0xa59763593d9   259  c529d5d0       vpmullw xmm10,xmm10,xmm0
0xa59763593dd   25d  c5d9d5e0       vpmullw xmm4,xmm4,xmm0
0xa59763593e1   261  c57928df       vmovapd xmm11,xmm7
0xa59763593e5   265  c44101efff     vpxor xmm15,xmm15,xmm15
0xa59763593ea   26a  c463010efb55   vpblendw xmm15,xmm15,xmm3,0x55
0xa59763593f0   270  c443210edfaa   vpblendw xmm11,xmm11,xmm15,0xaa
0xa59763593f6   276  c442212bdf     vpackusdw xmm11,xmm11,xmm15
0xa59763593fb   27b  c57810fb       vmovups xmm15,xmm3
0xa59763593ff   27f  c4c10172d710   vpsrld xmm15,xmm15,16
0xa5976359405   285  c5c172d710     vpsrld xmm7,xmm7,16
0xa597635940a   28a  c4c2412bff     vpackusdw xmm7,xmm7,xmm15
0xa597635940f   28f  c5f928de       vmovapd xmm3,xmm6
0xa5976359413   293  c44101efff     vpxor xmm15,xmm15,xmm15
0xa5976359418   298  c443010ef855   vpblendw xmm15,xmm15,xmm8,0x55
0xa597635941e   29e  c4c3610edfaa   vpblendw xmm3,xmm3,xmm15,0xaa
0xa5976359424   2a4  c4c2612bdf     vpackusdw xmm3,xmm3,xmm15
0xa5976359429   2a9  c4417810f8     vmovups xmm15,xmm8
0xa597635942e   2ae  c4c10172d710   vpsrld xmm15,xmm15,16
0xa5976359434   2b4  c5c972d610     vpsrld xmm6,xmm6,16
0xa5976359439   2b9  c4c2492bf7     vpackusdw xmm6,xmm6,xmm15
0xa597635943e   2be  c57928c5       vmovapd xmm8,xmm5
0xa5976359442   2c2  c44101efff     vpxor xmm15,xmm15,xmm15
0xa5976359447   2c7  c443010ef955   vpblendw xmm15,xmm15,xmm9,0x55
0xa597635944d   2cd  c443390ec7aa   vpblendw xmm8,xmm8,xmm15,0xaa
0xa5976359453   2d3  c442392bc7     vpackusdw xmm8,xmm8,xmm15
0xa5976359458   2d8  c4417810f9     vmovups xmm15,xmm9
0xa597635945d   2dd  c4c10172d710   vpsrld xmm15,xmm15,16
0xa5976359463   2e3  c5d172d510     vpsrld xmm5,xmm5,16
0xa5976359468   2e8  c4c2512bef     vpackusdw xmm5,xmm5,xmm15
0xa597635946d   2ed  c57928cc       vmovapd xmm9,xmm4
0xa5976359471   2f1  c44101efff     vpxor xmm15,xmm15,xmm15
0xa5976359476   2f6  c443010efa55   vpblendw xmm15,xmm15,xmm10,0x55
0xa597635947c   2fc  c443310ecfaa   vpblendw xmm9,xmm9,xmm15,0xaa
0xa5976359482   302  c442312bcf     vpackusdw xmm9,xmm9,xmm15
0xa5976359487   307  c4417810fa     vmovups xmm15,xmm10
0xa597635948c   30c  c4c10172d710   vpsrld xmm15,xmm15,16
0xa5976359492   312  c5d972d410     vpsrld xmm4,xmm4,16
0xa5976359497   317  c4c2592be7     vpackusdw xmm4,xmm4,xmm15
0xa597635949c   31c  c4c141fdfb     vpaddw xmm7,xmm7,xmm11
0xa59763594a1   321  c5c9fdf3       vpaddw xmm6,xmm6,xmm3
0xa59763594a5   325  c4c151fde8     vpaddw xmm5,xmm5,xmm8
0xa59763594aa   32a  c4c159fde1     vpaddw xmm4,xmm4,xmm9
0xa59763594af   32f  c5f928de       vmovapd xmm3,xmm6
0xa59763594b3   333  c44101efff     vpxor xmm15,xmm15,xmm15
0xa59763594b8   338  c463010eff55   vpblendw xmm15,xmm15,xmm7,0x55
0xa59763594be   33e  c4c3610edfaa   vpblendw xmm3,xmm3,xmm15,0xaa
0xa59763594c4   344  c4c2612bdf     vpackusdw xmm3,xmm3,xmm15
0xa59763594c9   349  c57810ff       vmovups xmm15,xmm7
0xa59763594cd   34d  c4c10172d710   vpsrld xmm15,xmm15,16
0xa59763594d3   353  c5c972d610     vpsrld xmm6,xmm6,16
0xa59763594d8   358  c4c2492bf7     vpackusdw xmm6,xmm6,xmm15
0xa59763594dd   35d  c5f928fc       vmovapd xmm7,xmm4
0xa59763594e1   361  c44101efff     vpxor xmm15,xmm15,xmm15
0xa59763594e6   366  c463010efd55   vpblendw xmm15,xmm15,xmm5,0x55
0xa59763594ec   36c  c4c3410effaa   vpblendw xmm7,xmm7,xmm15,0xaa
0xa59763594f2   372  c4c2412bff     vpackusdw xmm7,xmm7,xmm15
0xa59763594f7   377  c57810fd       vmovups xmm15,xmm5
0xa59763594fb   37b  c4c10172d710   vpsrld xmm15,xmm15,16
0xa5976359501   381  c5d972d410     vpsrld xmm4,xmm4,16
0xa5976359506   386  c4c2592be7     vpackusdw xmm4,xmm4,xmm15
0xa597635950b   38b  c5c9fdf3       vpaddw xmm6,xmm6,xmm3
0xa597635950f   38f  c5d9fde7       vpaddw xmm4,xmm4,xmm7
0xa5976359513   393  c5c971d608     vpsrlw xmm6,xmm6,8
0xa5976359518   398  c5d971d408     vpsrlw xmm4,xmm4,8
0xa597635951d   39d  c5d967e6       vpackuswb xmm4,xmm4,xmm6
0xa5976359521   3a1  c5fa7f243b     vmovdqu [rbx+rdi*1],xmm4
0xa5976359526   3a6  443bda         cmpl r11,rdx
0xa5976359529   3a9  0f8601fdffff   jna 0xa5976359230  <+0xb0>
0xa597635952f   3af  488bc1         REX.W movq rax,rcx
0xa5976359532   3b2  488be5         REX.W movq rsp,rbp
0xa5976359535   3b5  5d             pop rbp
0xa5976359536   3b6  c3             retl
0xa5976359537   3b7  488955e8       REX.W movq [rbp-0x18],rdx
0xa597635953b   3bb  48895de0       REX.W movq [rbp-0x20],rbx
0xa597635953f   3bf  c5f81145d0     vmovups [rbp-0x30],xmm0
0xa5976359544   3c4  c5f8114dc0     vmovups [rbp-0x40],xmm1
0xa5976359549   3c9  c5f81155b0     vmovups [rbp-0x50],xmm2
0xa597635954e   3ce  488945a8       REX.W movq [rbp-0x58],rax
0xa5976359552   3d2  48897da0       REX.W movq [rbp-0x60],rdi
0xa5976359556   3d6  4c894598       REX.W movq [rbp-0x68],r8
0xa597635955a   3da  e8615dffff     call 0xa597634f2c0       ;; wasm stub: WasmStackGuard
0xa597635955f   3df  33c9           xorl rcx,rcx
0xa5976359561   3e1  488b55e8       REX.W movq rdx,[rbp-0x18]
0xa5976359565   3e5  488b5de0       REX.W movq rbx,[rbp-0x20]
0xa5976359569   3e9  c5f81045d0     vmovups xmm0,[rbp-0x30]
0xa597635956e   3ee  c5f8104dc0     vmovups xmm1,[rbp-0x40]
0xa5976359573   3f3  c5f81055b0     vmovups xmm2,[rbp-0x50]
0xa5976359578   3f8  488b45a8       REX.W movq rax,[rbp-0x58]
0xa597635957c   3fc  488b7da0       REX.W movq rdi,[rbp-0x60]
0xa5976359580   400  4c8b4598       REX.W movq r8,[rbp-0x68]
0xa5976359584   404  488b75f0       REX.W movq rsi,[rbp-0x10]
0xa5976359588   408  e9b6fcffff     jmp 0xa5976359243  <+0xc3>
0xa597635958d   40d  e8fe5affff     call 0xa597634f090       ;; wasm stub: ThrowWasmTrapMemOutOfBounds
0xa5976359592   412  e8f95affff     call 0xa597634f090       ;; wasm stub: ThrowWasmTrapMemOutOfBounds
0xa5976359597   417  e8f45affff     call 0xa597634f090       ;; wasm stub: ThrowWasmTrapMemOutOfBounds
0xa597635959c   41c  e8ef5affff     call 0xa597634f090       ;; wasm stub: ThrowWasmTrapMemOutOfBounds
0xa59763595a1   421  e8ea5affff     call 0xa597634f090       ;; wasm stub: ThrowWasmTrapMemOutOfBounds
0xa59763595a6   426  90             nop
0xa59763595a7   427  90             nop

Protected instructions:
 pc offset  land pad
       3a1       40d
        e2       412
        dc       417
        d6       41c
        d1       421

Source positions:
 pc offset  position
        d1        43
        d6       239
        dc       416
        e2       545
       3a1       722
       3b7        29
       40d       722
       412       545
       417       416
       41c       239
       421        43

Safepoints (size = 22)
0xa5a7635917fffffffff  000000000000000 (sp -> fp)

RelocInfo (size = 8)
0xa597635955b  wasm stub call
0xa597635958e  wasm stub call
0xa5976359593  wasm stub call
0xa5976359598  wasm stub call
0xa597635959d  wasm stub call
0xa59763595a2  wasm stub call

--- End code ---

最佳答案

Emscripten issue 复制我的答案:
我们不使用 v128.const 的原因是 v128.const 最近才在 V8 中实现。为了避免影响 origin trial(源试用)用户,我们无法更新 LLVM 以发出 v128.const,直到相关的 V8 补丁进入 Chrome 稳定版。我正在密切关注此仪表板,以确定何时是进行此更改的好时机。如果您使用的是 Chrome 的较新版本或其他支持 v128.const 的执行环境,您可以尝试使用 -munimplemented-simd128 标志编译您的项目,这将在 LLVM 中启用 v128.const(但也可能引入您不想要的其他更改)。一旦 v128.const 被广泛支持,LLVM 使用 v128.const 会比从内存中加载向量更好,因为这允许引擎根据具体的运行时平台确定实现该向量的最佳方式。
也可能值得考虑将代码的性能敏感部分移植到直接使用 WebAssembly 内在函数头而不是依赖于模拟的 SSE。这将减少您的代码和底层机器代码之间的一层阻抗不匹配。
最后,如果您注意到任何地方的指令选择不理想,请针对您的具体问题提交 LLVM 错误报告(如果问题出在 代码 -> wasm 一侧)或 V8 错误报告(如果问题出在 wasm -> native 一侧),这将很有帮助。这种反馈对我们来说非常有价值。

关于c++ - 如何强制 emscripten/em++/llvm 从 .rodata 加载常量和/或执行更好的 SIMD 优化?,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/63875389/

相关文章:

c++ - 为什么 std::vector 迭代器在 erase() 调用后失效?

c - 为什么这个 shellcode 程序会发出段错误警告?

用于检查 mips 的 c >= 或 <= 的命令

javascript - Chrome JavaScript CPU 分析器做了什么可能会影响程序的性能(在分析期间)?

JavaScript:浏览器支持 vs 纯语言支持:setTimeout、setInterval

c++ - 正确处理 Persistent<ArrayBuffer>

c++ - 处理文件和图像加载异常的最佳方式是什么?

c++ - tensorflow C++ API : How to read Tensor from files?

c++ - 函数 int86 编译错误 : "Stray 302",

algorithm - 汇编中的 Adobe Type 1 加密算法