我有两种迭代大小为 500 的 vector 的代码设计。一种设计包含 64 位 double 组,第二种设计使用包含 32 位整数的数组。我原以为 32 位设计会更快,因为可以将更多有用的数据打包到缓存中。
编译器 MSVC,CPU Ivy Bridge,编译 64 位模式。
这是代码 1,使用 32 位整数(在 2600 CPU 周期内运行):
#include <vector>
#include <iostream>
int main(){
std::vector<unsigned int> x1;
std::vector<unsigned int> x2;
std::vector<unsigned int> x3;
x1.resize(500);
x2.resize(500);
x3.resize(500);
for(int i =0; i<500; i++){
x1[i] = i;
x2[i] = 2*i;
x3[i] = 4*i;
}
int counter = 0;
while(counter < 1000){
unsigned long long start = 0;
unsigned long long end = 0;
double m = 0;
double n = 0;
start = __rdtsc();
for(int i=0; i < 500; i++){
unsigned int a = x1[i];
unsigned int b = x2[i];
unsigned int g = x3[i];
m = m + (a * g);
n = n + (b * g);
}
end = __rdtscp();
std::cout << (end-start) << "\t\t"<<m << n << std::endl;
counter++;
}
}
生成此 asm (-Os):
start = __rdtscp(&p);
rdtscp
lea r8,[rbp+6Fh]
mov dword ptr [r8],ecx
shl rdx,20h
or rax,rdx
mov r10,rax
unsigned int p;
unsigned int q;
unsigned long long start = 0;
unsigned long long end = 0;
double m = 0;
mov r8,rbx
mov r9d,1F4h
unsigned int a = x1[i];
unsigned int b = x2[i];
unsigned int g = x3[i];
mov edx,dword ptr [r8+r15]
m = m + (a * g);
mov ecx,edx
imul ecx,dword ptr [r8+r14]
xorps xmm0,xmm0
cvtsi2sd xmm0,rcx
addsd xmm7,xmm0
n = n + (b * g);
imul edx,dword ptr [r8]
mov eax,edx
xorps xmm0,xmm0
cvtsi2sd xmm0,rax
addsd xmm8,xmm0
for(int i=0; i < 500; i++){
add r8,4
dec r9
jne main+0E5h (013F681261h)
}
end = __rdtscp(&q);
rdtscp
}
end = __rdtscp(&q);
lea r8,[rbp+6Fh]
mov dword ptr [r8],ecx
shl rdx,20h
or rdx,rax
这是代码 2,使用 64 位 double (代码在 2000 CPU 周期内运行):
#include <vector>
#include <iostream>
int main(){
std::vector<double> x1;
std::vector<double> x2;
std::vector<unsigned long long> x3;
x1.resize(500);
x2.resize(500);
x3.resize(500);
for(int i =0; i<500; i++){
x1[i] = i;
x2[i] = 2*i;
x3[i] = 4*i;
}
int counter = 0;
while(counter < 1000){
unsigned int p;
unsigned int q;
unsigned long long start = 0;
unsigned long long end = 0;
double m = 0;
double n = 0;
start = __rdtscp(&p);
for(int i=0; i < 500; i++){
double a = x1[i];
double b = x2[i];
unsigned long long g = x3[i];
m = m + (a * g);
n = n + (b * g);
}
end = __rdtscp(&q);
std::cout << (end-start) << "\t\t"<<m << n << std::endl;
counter++;
}
}
这是生成的 asm (-Os):
start = __rdtscp(&p);
rdtscp
lea r8,[rbp+6Fh]
mov dword ptr [r8],ecx
shl rdx,20h
or rax,rdx
mov r9,rax
unsigned int p;
unsigned int q;
unsigned long long start = 0;
unsigned long long end = 0;
double m = 0;
mov rdx,rbx
mov r8d,1F4h
double a = x1[i];
double b = x2[i];
unsigned long long g = x3[i];
mov rcx,qword ptr [rdx+r15]
xorps xmm1,xmm1
m = m + (a * g);
cvtsi2sd xmm1,rcx
test rcx,rcx
jns main+120h (013F32129Ch)
addsd xmm1,xmm9
movaps xmm0,xmm1
mulsd xmm0,mmword ptr [rdx+r14]
addsd xmm6,xmm0
n = n + (b * g);
mulsd xmm1,mmword ptr [rdx]
addsd xmm7,xmm1
for(int i=0; i < 500; i++){
add rdx,8
dec r8
jne main+10Ah (013F321286h)
}
end = __rdtscp(&q);
rdtscp
}
end = __rdtscp(&q);
lea r8,[rbp+6Fh]
mov dword ptr [r8],ecx
shl rdx,20h
or rdx,rax
最佳答案
不同的是第一段代码中整数到 double 的转换( vector 中包含unsigned int
,乘积是整数运算,但累加使用double
,在汇编程序中,这会将 cvtsi2sd
指令添加到您的代码中。
在第二个代码中,你到处都使用 double ,所以你没有转换,代码运行得更快。
在定点和浮点处理单元之间有更严格区分的 CPU 上,这种差异很多会更明显(POWER 平台就是一个例子)。 X86 平台在这方面非常宽容。
关于c++ - 64 位 vector 的点积比 32 位无符号整数 vector 快一倍?,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/23507597/