更新:之前提到的 Span 问题已在 .NET Core 2.1 版本(目前处于预览阶段)中得到修复。修复之后,Span 向量版本实际上比数组向量版本*更快*……
注意:我是在 “Intel Xeon E5-1660 v4” 上测试的,CPU-Z 显示它支持的指令集为 “MMX、SSE、SSE2、SSE3、SSSE3、SSE4.1、SSE4.2、EM64T、VT-x、AES、AVX、AVX2、FMA3、RSX”,所以硬件方面应该没问题……
在回答了一个与 Vector 相关的问题之后,我想尝试实现一些 BLAS 函数。我发现那些只做读取/求和的操作(例如点积)效果非常好,但需要把结果写回数组的操作表现很差——虽然比非 SIMD 版本快,但快得很有限。
那么我是做错了什么,还是需要在 JIT 中做更多的工作?
示例(假设 x.Length = y.Length、两个数组都不为 null 等等):
/// <summary>
/// Scalar reference version of the daxpy update: every y[k] becomes y[k] + x[k] * alpha.
/// </summary>
/// <param name="alpha">Scale factor multiplied into each element of <paramref name="x"/>.</param>
/// <param name="x">Input values; its length decides how many elements are processed.</param>
/// <param name="y">Array updated in place; it is indexed from 0 to x.Length - 1.</param>
public static void daxpy(double alpha, double[] x, double[] y)
{
    var k = 0;
    while (k < x.Length)
    {
        y[k] += alpha * x[k];
        k++;
    }
}
向量形式变为:
/// <summary>
/// daxpy update (y[i] = y[i] + x[i] * alpha) that processes Vector&lt;double&gt;.Count
/// elements per iteration when hardware acceleration is available, then finishes the
/// remaining elements with a scalar loop.
/// </summary>
/// <param name="alpha">Scale factor multiplied into each element of <paramref name="x"/>.</param>
/// <param name="x">Input values; its length decides how many elements are processed.</param>
/// <param name="y">Array updated in place; it is indexed from 0 to x.Length - 1.</param>
public static void daxpy(double alpha, double[] x, double[] y)
{
    var i = 0;
    if (Vector.IsHardwareAccelerated)
    {
        // alpha never changes inside the loop, so broadcast it into a vector once.
        var valpha = new Vector<double>(alpha);
        var width = Vector<double>.Count;

        // Exclusive upper bound of the start indices at which a full vector still fits in x.
        var length = x.Length + 1 - width;
        for (; i < length; i += width)
        {
            var vx = new Vector<double>(x, i);
            var vy = new Vector<double>(y, i);
            (vy + vx * valpha).CopyTo(y, i);
        }
    }

    // Scalar tail; this is the whole loop when no hardware acceleration is available.
    for (; i < x.Length; ++i)
    {
        y[i] = y[i] + x[i] * alpha;
    }
}
另外,既然在 .NET Core 2.0 上试验,我想着也试试 Span,包括朴素形式和 Vector 形式:
/// <summary>
/// Scalar daxpy update over spans: every y[k] becomes y[k] + x[k] * alpha.
/// </summary>
/// <param name="alpha">Scale factor multiplied into each element of <paramref name="x"/>.</param>
/// <param name="x">Input values; its length decides how many elements are processed.</param>
/// <param name="y">Span updated in place; it is indexed from 0 to x.Length - 1.</param>
public static void daxpy(double alpha, Span<double> x, Span<double> y)
{
    var count = x.Length;
    for (var k = 0; k != count; k++)
    {
        y[k] = y[k] + alpha * x[k];
    }
}
以及 Span 的向量形式:
/// <summary>
/// daxpy update (y[i] += x[i] * alpha) over spans. When hardware acceleration is available
/// both spans are reinterpreted as spans of Vector&lt;double&gt; and updated a whole vector at
/// a time; the elements that do not fill a complete vector are handled by a scalar loop.
/// </summary>
/// <param name="alpha">Scale factor multiplied into each element of <paramref name="x"/>.</param>
/// <param name="x">Input values; its length decides how many elements are processed.</param>
/// <param name="y">Span updated in place; expected to be as long as <paramref name="x"/>.</param>
public static void daxpy(double alpha, Span<double> x, Span<double> y)
{
    if (Vector.IsHardwareAccelerated)
    {
        // MemoryMarshal.Cast is the shipped API for this reinterpretation; the
        // NonPortableCast extension only existed in preview builds.
        var vx = MemoryMarshal.Cast<double, Vector<double>>(x);
        var vy = MemoryMarshal.Cast<double, Vector<double>>(y);
        var valpha = new Vector<double>(alpha);
        for (var i = 0; i < vx.Length; ++i)
        {
            vy[i] += vx[i] * valpha;
        }

        // Drop the elements already handled so the scalar loop below only sees the tail.
        x = x.Slice(Vector<double>.Count * vx.Length);
        y = y.Slice(Vector<double>.Count * vy.Length);
    }

    for (var i = 0; i < x.Length; ++i)
    {
        y[i] += x[i] * alpha;
    }
}
所以所有这些的相对时间是:
Naive 1.0
Vector 0.8
Span Naive 2.5 ==> Update: Span Naive 1.1
Span Vector 0.9 ==> Update: Span Vector 0.6
那我是不是做错了什么?我几乎想不出一个更简单的例子,所以我不这么认为?
最佳答案
你可能想用 2.1 测试而不是 2.0; 在我的笔记本电脑上(与我的台式机相比,它的 SIMD 很差),我得到:
daxpy_naive x10000: 144ms
daxpy_arr_vector x10000: 77ms
daxpy_span x10000: 173ms
daxpy_vector x10000: 67ms
daxpy_vector_no_slice x10000: 67ms
使用代码:
using System;
using System.Diagnostics;
using System.Numerics;
using System.Runtime.InteropServices;
/// <summary>
/// Console micro-benchmark that times five implementations of the daxpy update
/// (y[i] = y[i] + x[i] * alpha): scalar over arrays, Vector&lt;double&gt; over arrays,
/// scalar over spans, and two Vector&lt;double&gt;-over-span variants.
/// </summary>
class Program
{
    static void Main(string[] args)
    {
        double alpha = 0.5;
        double[] x = new double[16 * 1024], y = new double[x.Length];

        // Fixed seed so every run works on the same input data.
        var rand = new Random(12345);
        for (int i = 0; i < x.Length; i++)
        {
            x[i] = rand.NextDouble();
        }

        // One unlogged pass first (presumably to get every method JIT-compiled),
        // then the pass whose timings are printed.
        RunAll(alpha, x, y, 1, false);
        RunAll(alpha, x, y, 10000, true);
    }

    /// <summary>
    /// Times each daxpy variant in turn, calling it <paramref name="loop"/> times on the
    /// same arrays. Note that <paramref name="y"/> keeps accumulating across all calls.
    /// </summary>
    /// <param name="alpha">Scale factor forwarded to every variant.</param>
    /// <param name="x">Input array forwarded to every variant.</param>
    /// <param name="y">Array updated in place by every variant.</param>
    /// <param name="loop">Number of calls per variant.</param>
    /// <param name="log">When true, one timing line per variant is written to the console.</param>
    private static void RunAll(double alpha, double[] x, double[] y, int loop, bool log)
    {
        // Collect once up front so garbage from earlier work is less likely to disturb the timings.
        GC.Collect(GC.MaxGeneration);
        GC.WaitForPendingFinalizers();

        Measure(nameof(daxpy_naive), loop, log, () => daxpy_naive(alpha, x, y));
        Measure(nameof(daxpy_arr_vector), loop, log, () => daxpy_arr_vector(alpha, x, y));
        Measure(nameof(daxpy_span), loop, log, () => daxpy_span(alpha, x, y));
        Measure(nameof(daxpy_vector), loop, log, () => daxpy_vector(alpha, x, y));
        Measure(nameof(daxpy_vector_no_slice), loop, log, () => daxpy_vector_no_slice(alpha, x, y));
    }

    // Invokes body `loop` times under a stopwatch and, when `log` is set, prints "<name> x<loop>: <ms>ms".
    private static void Measure(string name, int loop, bool log, Action body)
    {
        var watch = Stopwatch.StartNew();
        for (int i = 0; i < loop; i++)
        {
            body();
        }
        watch.Stop();

        if (log)
        {
            Console.WriteLine($"{name} x{loop}: {watch.ElapsedMilliseconds}ms");
        }
    }

    /// <summary>Scalar daxpy over arrays: y[i] = y[i] + x[i] * alpha for every index of x.</summary>
    /// <param name="alpha">Scale factor multiplied into each element of <paramref name="x"/>.</param>
    /// <param name="x">Input values; its length decides how many elements are processed.</param>
    /// <param name="y">Array updated in place; it is indexed from 0 to x.Length - 1.</param>
    public static void daxpy_naive(double alpha, double[] x, double[] y)
    {
        for (var i = 0; i < x.Length; ++i)
        {
            y[i] = y[i] + x[i] * alpha;
        }
    }

    /// <summary>
    /// daxpy over arrays that loads, updates and stores Vector&lt;double&gt;.Count elements per
    /// iteration when hardware acceleration is available, followed by a scalar tail loop.
    /// </summary>
    /// <param name="alpha">Scale factor multiplied into each element of <paramref name="x"/>.</param>
    /// <param name="x">Input values; its length decides how many elements are processed.</param>
    /// <param name="y">Array updated in place; it is indexed from 0 to x.Length - 1.</param>
    public static void daxpy_arr_vector(double alpha, double[] x, double[] y)
    {
        var i = 0;
        if (Vector.IsHardwareAccelerated)
        {
            // alpha never changes inside the loop, so broadcast it into a vector once.
            var valpha = new Vector<double>(alpha);
            var width = Vector<double>.Count;

            // Exclusive upper bound of the start indices at which a full vector still fits in x.
            var length = x.Length + 1 - width;
            for (; i < length; i += width)
            {
                var vx = new Vector<double>(x, i);
                var vy = new Vector<double>(y, i);
                (vy + vx * valpha).CopyTo(y, i);
            }
        }

        // Scalar tail; this is the whole loop when no hardware acceleration is available.
        for (; i < x.Length; ++i)
        {
            y[i] = y[i] + x[i] * alpha;
        }
    }

    /// <summary>Scalar daxpy over spans: y[i] += x[i] * alpha for every index of x.</summary>
    /// <param name="alpha">Scale factor multiplied into each element of <paramref name="x"/>.</param>
    /// <param name="x">Input values; its length decides how many elements are processed.</param>
    /// <param name="y">Span updated in place; it is indexed from 0 to x.Length - 1.</param>
    public static void daxpy_span(double alpha, Span<double> x, Span<double> y)
    {
        for (var i = 0; i < x.Length; ++i)
        {
            y[i] += x[i] * alpha;
        }
    }

    /// <summary>
    /// daxpy over spans. With hardware acceleration both spans are reinterpreted as spans of
    /// Vector&lt;double&gt; and updated a vector at a time; the spans are then sliced so the
    /// scalar loop only covers the elements that did not fill a complete vector.
    /// </summary>
    /// <param name="alpha">Scale factor multiplied into each element of <paramref name="x"/>.</param>
    /// <param name="x">Input values; its length decides how many elements are processed.</param>
    /// <param name="y">Span updated in place; expected to be as long as <paramref name="x"/>.</param>
    public static void daxpy_vector(double alpha, Span<double> x, Span<double> y)
    {
        if (Vector.IsHardwareAccelerated)
        {
            // MemoryMarshal.Cast is the shipped API for this reinterpretation; the
            // NonPortableCast extension only existed in preview builds.
            var vx = MemoryMarshal.Cast<double, Vector<double>>(x);
            var vy = MemoryMarshal.Cast<double, Vector<double>>(y);
            var valpha = new Vector<double>(alpha);
            for (var i = 0; i < vx.Length; ++i)
            {
                vy[i] += vx[i] * valpha;
            }

            // Drop the elements already handled so the scalar loop below only sees the tail.
            x = x.Slice(Vector<double>.Count * vx.Length);
            y = y.Slice(Vector<double>.Count * vy.Length);
        }

        for (var i = 0; i < x.Length; ++i)
        {
            y[i] += x[i] * alpha;
        }
    }

    /// <summary>
    /// Same as <see cref="daxpy_vector"/>, but instead of slicing the spans it carries the
    /// element index forward so the scalar loop resumes right after the vectorised part.
    /// </summary>
    /// <param name="alpha">Scale factor multiplied into each element of <paramref name="x"/>.</param>
    /// <param name="x">Input values; its length decides how many elements are processed.</param>
    /// <param name="y">Span updated in place; expected to be as long as <paramref name="x"/>.</param>
    public static void daxpy_vector_no_slice(double alpha, Span<double> x, Span<double> y)
    {
        int i = 0;
        if (Vector.IsHardwareAccelerated)
        {
            var vx = MemoryMarshal.Cast<double, Vector<double>>(x);
            var vy = MemoryMarshal.Cast<double, Vector<double>>(y);
            var valpha = new Vector<double>(alpha);
            for (i = 0; i < vx.Length; ++i)
            {
                vy[i] += vx[i] * valpha;
            }

            // Convert the vector count back into an element index for the scalar tail.
            i = Vector<double>.Count * vx.Length;
        }

        for (; i < x.Length; ++i)
        {
            y[i] += x[i] * alpha;
        }
    }
}
使用 dotnet build -c Release 构建,
再用 dotnet run -c Release 运行;
dotnet --version 报告的版本是 “2.2.0-preview1-008000”(不久前的一个每日构建版本)。
在我的台式机上,我预计差距会更明显。
关于C# Vector<double>.CopyTo 比非 SIMD 版本快多少?,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/49308115/