c# - 并行化直方图函数

标签 c# .net image-processing parallel-processing histogram

我为一个简单的函数分别实现了普通版本和并行版本,该函数根据 32bppArgb 位图计算直方图。普通版本在 1920x1080 图像上大约需要 0.03 秒,而并行版本需要 0.07 秒。

线程开销真的那么大吗?除了 Parallel.For 之外还有其他构造可以加速这个过程吗?我需要加快速度,因为我正在处理 30fps 视频。

这是简化的代码:

/// <summary>
/// Per-channel (alpha, red, green, blue) 256-bin histogram of a 4-bytes-per-pixel
/// image, together with the largest bin count of every channel (MaxA/MaxR/MaxG/MaxB)
/// and the largest bin count across all channels (MaxT).
/// </summary>
public sealed class Histogram
{
    // One bin for every possible byte value of a channel.
    private const int BinCount = 256;

    // Bytes per pixel; the channels are read in the byte order B, G, R, A.
    private const int BytesPerPixel = 4;

    public int MaxA = 0;
    public int MaxR = 0;
    public int MaxG = 0;
    public int MaxB = 0;
    public int MaxT = 0;

    public int [] A = null;
    public int [] R = null;
    public int [] G = null;
    public int [] B = null;

    /// <summary>Allocates the four channel arrays and resets all statistics.</summary>
    public Histogram ()
    {
        this.A = new int [BinCount];
        this.R = new int [BinCount];
        this.G = new int [BinCount];
        this.B = new int [BinCount];

        this.Initialize();
    }

    /// <summary>Resets every maximum and every bin of every channel to zero.</summary>
    public void Initialize ()
    {
        this.MaxA = 0;
        this.MaxR = 0;
        this.MaxG = 0;
        this.MaxB = 0;
        this.MaxT = 0;

        System.Array.Clear(this.A, 0, this.A.Length);
        System.Array.Clear(this.R, 0, this.R.Length);
        System.Array.Clear(this.G, 0, this.G.Length);
        System.Array.Clear(this.B, 0, this.B.Length);
    }

    /// <summary>
    /// Locks the whole bitmap read-only as Format32bppArgb, computes its histogram
    /// and unlocks the bitmap again, whether or not the computation throws.
    /// </summary>
    /// <param name="bitmap">The bitmap to analyse.</param>
    /// <param name="parallel">True to distribute the rows over several threads.</param>
    /// <exception cref="System.ArgumentNullException"><paramref name="bitmap"/> is null.</exception>
    public void ComputeHistogram (System.Drawing.Bitmap bitmap, bool parallel = false)
    {
        if (bitmap == null)
        {
            throw new System.ArgumentNullException("bitmap");
        }

        System.Drawing.Imaging.BitmapData data = bitmap.LockBits
        (
            new System.Drawing.Rectangle(0, 0, bitmap.Width, bitmap.Height),
            System.Drawing.Imaging.ImageLockMode.ReadOnly,
            System.Drawing.Imaging.PixelFormat.Format32bppArgb
        );

        try
        {
            this.ComputeHistogram(data, parallel);
        }
        finally
        {
            // A single unlock on every exit path (success or exception).
            bitmap.UnlockBits(data);
        }
    }

    /// <summary>
    /// Computes the histogram of already locked pixel data. The data is read as
    /// 4 bytes per pixel in B, G, R, A order; other layouts give meaningless counts.
    /// </summary>
    /// <param name="data">Locked bitmap data; only Width, Height, Stride and Scan0 are read.</param>
    /// <param name="parallel">True to distribute the rows over several threads.</param>
    /// <exception cref="System.ArgumentNullException"><paramref name="data"/> is null.</exception>
    public void ComputeHistogram (System.Drawing.Imaging.BitmapData data, bool parallel = false)
    {
        if (data == null)
        {
            throw new System.ArgumentNullException("data");
        }

        System.IntPtr scan0 = data.Scan0;
        int height = data.Height;

        // The signed stride is used for row offsets so that bottom-up data (negative
        // stride) is walked correctly; the row length comes from the pixel width so
        // that bytes beyond the locked rectangle are never counted.
        int stride = data.Stride;
        int rowBytes = data.Width * BytesPerPixel;

        this.Initialize();

        if (parallel)
        {
            object mergeGate = new object();

            unsafe
            {
                // Every worker counts into its own private set of arrays (index 0..3 =
                // B, G, R, A) and merges them into the shared arrays exactly once.
                // Incrementing the shared arrays directly from several threads is a
                // data race that loses counts.
                System.Threading.Tasks.Parallel.For<int [][]>
                (
                    0,
                    height,
                    new System.Threading.Tasks.ParallelOptions() { MaxDegreeOfParallelism = System.Environment.ProcessorCount },
                    () => new int [][] { new int [BinCount], new int [BinCount], new int [BinCount], new int [BinCount] },
                    (y, loopState, local) =>
                    {
                        byte* row = ((byte*) scan0) + ((long) stride * y);

                        AccumulateRow(row, rowBytes, local [0], local [1], local [2], local [3]);

                        return local;
                    },
                    local =>
                    {
                        lock (mergeGate)
                        {
                            AddInto(this.B, local [0]);
                            AddInto(this.G, local [1]);
                            AddInto(this.R, local [2]);
                            AddInto(this.A, local [3]);
                        }
                    }
                );
            }
        }
        else
        {
            unsafe
            {
                for (int y = 0; y < height; y++)
                {
                    byte* row = ((byte*) scan0) + ((long) stride * y);

                    AccumulateRow(row, rowBytes, this.B, this.G, this.R, this.A);
                }
            }
        }

        this.MaxA = Largest(this.A);
        this.MaxR = Largest(this.R);
        this.MaxG = Largest(this.G);
        this.MaxB = Largest(this.B);

        this.MaxT = System.Math.Max(System.Math.Max(this.MaxA, this.MaxR), System.Math.Max(this.MaxG, this.MaxB));
    }

    // Counts the B, G, R, A bytes of one pixel row into the given channel arrays.
    private static unsafe void AccumulateRow (byte* row, int rowBytes, int [] b, int [] g, int [] r, int [] a)
    {
        for (int x = 0; x < rowBytes; x += BytesPerPixel)
        {
            b [row [x + 0]]++;
            g [row [x + 1]]++;
            r [row [x + 2]]++;
            a [row [x + 3]]++;
        }
    }

    // Adds every bin of source to the corresponding bin of target.
    private static void AddInto (int [] target, int [] source)
    {
        for (int i = 0; i < target.Length; i++)
        {
            target [i] += source [i];
        }
    }

    // Returns the largest bin count, or 0 when all bins are empty.
    private static int Largest (int [] bins)
    {
        int largest = 0;

        for (int i = 0; i < bins.Length; i++)
        {
            if (largest < bins [i])
            {
                largest = bins [i];
            }
        }

        return largest;
    }
}

最佳答案

好吧,首先,你的并行循环中有一个巨大的错误:

您将有多个线程访问、递增和更新共享数组 - 由于固有的竞争条件,仅在同一图像上多次运行您的示例代码会导致截然不同的结果。

但这不是你问的。

至于为什么您看到并行实现的性能反而下降,简单的答案是:您在每个并行任务的主体中所做的工作可能不足以抵消创建新任务、对其进行调度等带来的“启动成本”。

可能更关键的是,我认为由于在内存中到处跳转,您正在严重冲击 L1/L2 缓存——每个任务线程都会尝试将它认为需要的内容加载到缓存中,但由于您在各处进行索引,访问模式不再一致,因此每次尝试访问位图缓冲区或内部数组时,都很可能发生缓存未命中。

还有一种在不使用不安全代码的情况下获取位图的只读数据的同样高效的方法......实际上,让我们先这样做:

因此,通过调用 LockBits,您有一个指向非托管内存的指针。让我们复制它:

// Lock the whole bitmap read-only so its pixel bytes can be copied out.
System.Drawing.Imaging.BitmapData data = bitmap.LockBits
(
    new System.Drawing.Rectangle(0, 0, bitmap.Width, bitmap.Height),
    System.Drawing.Imaging.ImageLockMode.ReadOnly,
    System.Drawing.Imaging.PixelFormat.Format32bppArgb
);

// For later usage
int imageStride;
int imageHeight;
byte[] buffer;

try
{
    // Stride is negative for bottom-up bitmaps; sizes need its magnitude.
    imageStride = Math.Abs(data.Stride);
    imageHeight = data.Height;

    // allocate space to hold the data
    buffer = new byte[imageStride * imageHeight];

    // Source will be the bitmap scan data. With a negative stride Scan0 is the
    // row at the highest address, so the block starts at the last row instead.
    // (The row order in the buffer does not matter for a histogram.)
    IntPtr pointer = data.Stride >= 0
        ? data.Scan0
        : IntPtr.Add(data.Scan0, data.Stride * (imageHeight - 1));

    // the CLR marshalling system knows how to move blocks of bytes around, FAST.
    Marshal.Copy(pointer, buffer, 0, buffer.Length);
}
finally
{
    // Unlock on every path: the copy is all we need, and a failed allocation
    // or copy must not leave the bitmap locked.
    bitmap.UnlockBits(data);
}

ComputeHistogram(buffer, imageStride, imageHeight, parallel);

现在,至于竞争条件 - 您可以通过使用 Interlocked 调用来增加计数,以合理的性能方式克服这个问题(注意!!!多线程编程很困难,而且我的解决方案完全有可能并不完美!)

/// <summary>
/// Computes the per-channel histogram of a managed copy of the pixel data and
/// then the per-channel and overall maxima.
/// </summary>
/// <param name="data">Pixel bytes, 4 bytes per pixel in B, G, R, A order.</param>
/// <param name="stride">Number of bytes per row in <paramref name="data"/>.</param>
/// <param name="height">Number of rows in <paramref name="data"/>.</param>
/// <param name="parallel">True to process the rows with Parallel.For.</param>
/// <exception cref="ArgumentNullException"><paramref name="data"/> is null.</exception>
public void ComputeHistogram (byte[] data, int stride, int height, bool parallel = false)
{
    if (data == null)
    {
        throw new ArgumentNullException("data");
    }

    this.Initialize();

    if (parallel)
    {
        System.Threading.Tasks.Parallel.For
        (
            0,
            height,
            new ParallelOptions() { MaxDegreeOfParallelism = Environment.ProcessorCount },
            y =>
            {
                int startIndex = (stride * y);
                int endIndex = stride * (y+1);
                for (int x = startIndex; x < endIndex; x += 4)
                {
                    // Several threads update the same bins, so every increment
                    // has to be an atomic read-modify-write.
                    Interlocked.Increment(ref this.B[data[x]]);
                    Interlocked.Increment(ref this.G[data[x+1]]);
                    Interlocked.Increment(ref this.R[data[x+2]]);
                    Interlocked.Increment(ref this.A[data[x+3]]);
                }
            }
        );
    }
    else
    {
        // Only one thread touches the bins here, so plain increments are
        // enough; the rows are contiguous and can be walked in a single pass.
        int endIndex = stride * height;
        for (int x = 0; x < endIndex; x += 4)
        {
            this.B[data[x]]++;
            this.G[data[x+1]]++;
            this.R[data[x+2]]++;
            this.A[data[x+3]]++;
        }
    }

    // Largest bin of each channel, then the largest of those four.
    this.MaxA = this.A.Max();
    this.MaxR = this.R.Max();
    this.MaxG = this.G.Max();
    this.MaxB = this.B.Max();
    this.MaxT = new[] { this.MaxA, this.MaxB, this.MaxG, this.MaxR }.Max();
}

那么,这对运行时行为有何影响?

不是很多,但至少并行分支现在可以计算出正确的结果。 :)

使用一个非常简陋的测试程序:

// Quick-and-dirty timing rig: averages 10 runs of loading the bitmap and
// computing its histogram, first sequentially and then in parallel.
void Main()
{
    foreach (var useParallel in new[] { false, true })
    {
        var totalRunTime = TimeSpan.Zero;
        var sw = new Stopwatch();
        var runCount = 10;
        for (int run = 0; run < runCount; run++)
        {
            // Start every run from a freshly collected heap.
            GC.Collect();
            GC.WaitForPendingFinalizers();
            GC.Collect();
            sw.Reset();
            sw.Start();
            // The timed region (load + histogram) is unchanged; the bitmap is
            // now disposed after the stopwatch stops instead of being leaked.
            using (var bmp = (Bitmap) Bitmap.FromFile(@"c:\temp\banner.bmp"))
            {
                var hist = new Histogram();
                hist.ComputeHistogram(bmp, useParallel);
                sw.Stop();
            }
            totalRunTime = totalRunTime.Add(sw.Elapsed);
        }
        Console.WriteLine("Parallel={0}, Avg={1} ms", useParallel, totalRunTime.TotalMilliseconds / runCount);
    }
}

我得到这样的结果:

Parallel=False, Avg=1.69777 ms
Parallel=True, Avg=5.33584 ms

如您所见,我们仍未解决您最初的问题。 :)

那么让我们尝试一下让并行工作“更好”:

让我们看看“给任务更多的工作”有什么作用:

if (parallel)
{
    var batchSize = 2;

    // Round up: with a plain "height / batchSize" the trailing rows are
    // silently skipped whenever height is not a multiple of batchSize.
    var batchCount = (height + batchSize - 1) / batchSize;
    var totalBytes = stride * height;

    System.Threading.Tasks.Parallel.For
    (
        0,
        batchCount,
        new ParallelOptions() { MaxDegreeOfParallelism = Environment.ProcessorCount },
        batch =>
        {
            int startIndex = (stride * batch * batchSize);

            // The last batch may hold fewer than batchSize rows.
            int endIndex = Math.Min(startIndex + (stride * batchSize), totalBytes);

            for (int x = startIndex; x < endIndex; x += 4)
            {
                // Several threads update the same bins, so every increment
                // has to be an atomic read-modify-write.
                Interlocked.Increment(ref this.B[data[x]]);
                Interlocked.Increment(ref this.G[data[x+1]]);
                Interlocked.Increment(ref this.R[data[x+2]]);
                Interlocked.Increment(ref this.A[data[x+3]]);
            }
        }
    );
}

结果:

Parallel=False, Avg=1.70273 ms
Parallel=True, Avg=4.82591 ms

哦,这看起来很有希望......我想知道当我们改变 batchSize 时会发生什么?

让我们这样改变我们的测试平台:

// Timing rig that additionally sweeps the batch size over the powers of two
// from 1 to 512 and prints the average of 10 runs for each combination.
void Main()
{
    foreach (var useParallel in new[] { false, true })
    {
        for (int batchSize = 1; batchSize < 1024; batchSize <<= 1)
        {
            var totalRunTime = TimeSpan.Zero;
            var sw = new Stopwatch();
            var runCount = 10;
            for (int run = 0; run < runCount; run++)
            {
                // Start every run from a freshly collected heap.
                GC.Collect();
                GC.WaitForPendingFinalizers();
                GC.Collect();
                sw.Reset();
                sw.Start();
                // The timed region (load + histogram) is unchanged; the bitmap is
                // now disposed after the stopwatch stops instead of being leaked.
                using (var bmp = (Bitmap) Bitmap.FromFile(@"c:\temp\banner.bmp"))
                {
                    var hist = new Histogram();
                    hist.ComputeHistogram(bmp, useParallel, batchSize);
                    sw.Stop();
                }
                totalRunTime = totalRunTime.Add(sw.Elapsed);
            }
            Console.WriteLine("Parallel={0}, BatchSize={1} Avg={2} ms", useParallel, batchSize, totalRunTime.TotalMilliseconds / runCount);
        }
    }
}

结果:(只显示parallel=true,因为非parallel不会改变)

Parallel=True, BatchSize=1 Avg=5.57644 ms
Parallel=True, BatchSize=2 Avg=5.49982 ms
Parallel=True, BatchSize=4 Avg=5.20434 ms
Parallel=True, BatchSize=8 Avg=5.1721 ms
Parallel=True, BatchSize=16 Avg=5.00405 ms
Parallel=True, BatchSize=32 Avg=4.44973 ms
Parallel=True, BatchSize=64 Avg=2.28332 ms
Parallel=True, BatchSize=128 Avg=1.39957 ms
Parallel=True, BatchSize=256 Avg=1.29156 ms
Parallel=True, BatchSize=512 Avg=1.28656 ms

一旦批量大小达到 64-128 的范围,耗时似乎就逐渐逼近某条渐近线;当然,实际效果可能会因位图大小等因素而有所不同。

希望对您有所帮助!这是我等待生产构建完成的一天的有趣消遣! :)

关于c# - 并行化直方图函数,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/14898473/

相关文章:

c# - 注释斜杠//缩进太左(Rider)

.net - 如何更改 WPF 中网格的网格线颜色?

c - 在实时视频流中跟踪和替换标记

c# - 自动对焦例程检测非常小的模糊差异

c# - 如何对 DataTable 对象执行复杂的 SQL 查询?

c# - 使用 MOQ 模拟系统事件

c# - Windows 服务设计帮助

c# - 从 coldfusion 调用 .net 对象的方法

c# - 检测二维图像中的标记

c# - 使用MediaCapture进行实时流式传输