为什么第一段代码比第二段慢很多?
public long Gen(int mod)
{
long a, b = 0x7fffffffffffffff - mod;
do
{
a = (long)(Gen() >> 1);
} while (a > b);
return a % mod;
}
public long Gen(int mod)
{
long a, b = 0x7fffffffffffffff - mod;
do
{
a = (long)(Gen() >> 1);
} while (a > b);
return a % 12345;
}
gen 函数是一个 64 位无符号 PRNG(见下文)。
问题是第一段代码慢得多,使用变量计算模数基本上增加了计算随机数所需时间的 3 倍!更神秘的是,当你去掉循环,并使用变量计算模数时,速度与第二段代码相似。
这里发生了一些奇怪的事情,因为你不能告诉我使用变量的模数比这样慢几倍:
public ulong Gen()
{
counter = (counter + 1) & 3;
if (counter == 0)
{
state[0]++;
ulong x0 = state[0];
ulong x1 = state[1];
ulong x2 = state[2];
ulong x3 = state[3];
for (int i = 0; i < 2; i++)
{
x0 += x1; x1 ^= ((x0 << 32) | (x0 >> (64 - 32)));
x1 += x0; x0 ^= ((x1 << 32) | (x1 >> (64 - 32)));
x2 += x3; x3 ^= ((x2 << 32) | (x2 >> (64 - 32)));
x3 += x2; x2 ^= ((x3 << 32) | (x3 >> (64 - 32)));
x0 += x2; x2 ^= ((x0 << 27) | (x0 >> (64 - 27)));
x2 += x0; x0 ^= ((x2 << 27) | (x2 >> (64 - 27)));
x1 += x3; x3 ^= ((x1 << 27) | (x1 >> (64 - 27)));
x3 += x1; x1 ^= ((x3 << 27) | (x3 >> (64 - 27)));
x0 += x3; x3 ^= ((x0 << 11) | (x0 >> (64 - 11)));
x3 += x0; x0 ^= ((x3 << 11) | (x3 >> (64 - 11)));
x1 += x2; x2 ^= ((x1 << 11) | (x1 >> (64 - 11)));
x2 += x1; x1 ^= ((x2 << 11) | (x2 >> (64 - 11)));
}
block[0] = x0;
block[1] = x1;
block[2] = x2;
block[3] = x3;
}
return block[counter];
}
要求的最小可复制版本:
using System;
using System.Diagnostics;
class Program
{
static void Main(string[] args)
{
Stopwatch sw = new Stopwatch();
Arx rng = new Arx();
long a = 0;
// constant = fast
sw.Start();
for (int i = 0; i < 10000000; i++)
{
a += rng.GenConstant(123);
}
sw.Stop();
Console.WriteLine(sw.ElapsedMilliseconds);
Console.WriteLine("{0:x16}", a);
sw.Reset();
// no loop = fast
sw.Start();
for (int i = 0; i < 10000000; i++)
{
a += rng.GenNoLoop(123);
}
sw.Stop();
Console.WriteLine(sw.ElapsedMilliseconds);
Console.WriteLine("{0:x16}", a);
sw.Reset();
// modulus variable = slow
sw.Start();
for (int i = 0; i < 10000000; i++)
{
a += rng.GenVariable(123);
}
sw.Stop();
Console.WriteLine(sw.ElapsedMilliseconds);
Console.WriteLine("{0:x16}", a);
sw.Reset();
}
}
class Arx
{
static public ulong[] state = new ulong[4];
static public ulong[] outBlock = new ulong[4];
static int counter = -1;
public Arx(ulong seed = 0)
{
if (seed == 0)
state[1] = (ulong)DateTime.UtcNow.Ticks;
else
state[1] = seed;
}
public ulong Gen()
{
counter = (counter + 1) & 3;
if (counter == 0)
{
state[0]++;
ulong x0 = state[0];
ulong x1 = state[1];
ulong x2 = state[2];
ulong x3 = state[3];
for (int i = 0; i < 2; i++)
{
x0 += x1; x1 ^= ((x0 << 32) | (x0 >> (64 - 32)));
x1 += x0; x0 ^= ((x1 << 32) | (x1 >> (64 - 32)));
x2 += x3; x3 ^= ((x2 << 32) | (x2 >> (64 - 32)));
x3 += x2; x2 ^= ((x3 << 32) | (x3 >> (64 - 32)));
x0 += x2; x2 ^= ((x0 << 27) | (x0 >> (64 - 27)));
x2 += x0; x0 ^= ((x2 << 27) | (x2 >> (64 - 27)));
x1 += x3; x3 ^= ((x1 << 27) | (x1 >> (64 - 27)));
x3 += x1; x1 ^= ((x3 << 27) | (x3 >> (64 - 27)));
x0 += x3; x3 ^= ((x0 << 11) | (x0 >> (64 - 11)));
x3 += x0; x0 ^= ((x3 << 11) | (x3 >> (64 - 11)));
x1 += x2; x2 ^= ((x1 << 11) | (x1 >> (64 - 11)));
x2 += x1; x1 ^= ((x2 << 11) | (x2 >> (64 - 11)));
}
outBlock[0] = x0;
outBlock[1] = x1;
outBlock[2] = x2;
outBlock[3] = x3;
}
return outBlock[counter];
}
public long GenConstant(int mod)
{
long a, b = 0x7fffffffffffffff - mod;
do
{
a = (long)(Gen() >> 1);
} while (a > b);
return a % 12345;
}
public long GenVariable(int mod)
{
long a, b = 0x7fffffffffffffff - mod;
do
{
a = (long)(Gen() >> 1);
} while (a > b);
return a % mod;
}
public long GenNoLoop(int mod)
{
long a = (long)(Gen() >> 1);
return a % mod;
}
}
最佳答案
这是一个优化器问题。
首先毫无疑问,使用变量比使用常量慢因为加载变量需要更多时间。
但是当你删除循环部分时,方法变得简单并且优化器使它们内联。请注意,当方法内联时,rng.GenNoLoop(123)
中的表达式 a % mod
可以被识别为常量。所以它们现在是相同的。
要恢复未优化的状态,您需要将一个真实变量传递给GenNoLoop
。
static int mod = 123;
static void Main(string[] args)
{
rng.GenNoLoop(mod);
}
另一种选择是强制方法不内联
[MethodImpl(MethodImplOptions.NoInlining)]
public long GenNoLoop(int mod)
关于C# 奇怪的模数速度行为,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/56711861/