c# - 将文本绘制到png文件上的性能方法？

我需要绘制正方形的二维网格，并在其上居中放置文本到一个(透明的)PNG文件中。
图块需要具有足够高的分辨率，以免文本出现像素化。
为了进行测试，我创建了一个2048x2048px 32位(透明)的PNG图像，其中包含128x128px的图块，例如:
问题是我需要以合理的性能执行此操作。到目前为止，我尝试过的所有方法都花费了100毫秒以上的时间，而我可接受的最长耗时是<10毫秒。除此之外，生成这些图像的程序还需要是跨平台的，并支持 WebAssembly (不过，即使您只有例如如何使用posix线程来实现的想法，我也很乐意以此为起点)。
Net5实现

using System.Diagnostics;
using System;
using System.Drawing;

namespace ImageGeneratorBenchmark
{
    /// <summary>
    /// Benchmark that fills a square bitmap with randomly coloured tiles, draws each
    /// tile's index centred on it, saves the result as "Test.png" and prints the
    /// elapsed wall-clock time.
    /// </summary>
    class Program
    {
        // Number of tiles per row and per column.
        static int rowColCount = 16;

        // Side length of one tile in pixels.
        static int tileSize = 128;

        /// <summary>
        /// Runs the benchmark once. The stopwatch covers drawing and saving.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            var watch = Stopwatch.StartNew();

            int sideLength = rowColCount * tileSize;

            // Bitmap, Graphics, StringFormat and Font are all IDisposable;
            // scope them so they are released deterministically.
            using (Bitmap bitmap = new Bitmap(sideLength, sideLength))
            {
                using (Graphics graphics = Graphics.FromImage(bitmap))
                using (StringFormat format = new StringFormat())
                using (Font font = SystemFonts.DefaultFont) // fetched once instead of once per tile
                {
                    Brush[] usedBrushes = { Brushes.Blue, Brushes.Red, Brushes.Green, Brushes.Orange, Brushes.Yellow };

                    int totalCount = rowColCount * rowColCount;
                    Random random = new Random();

                    // Centre the text horizontally and vertically around the anchor point.
                    format.LineAlignment = StringAlignment.Center;
                    format.Alignment = StringAlignment.Center;

                    for (int i = 0; i < totalCount; i++)
                    {
                        // Top-left corner of tile i (row-major order).
                        int x = i % rowColCount * tileSize;
                        int y = i / rowColCount * tileSize;

                        graphics.FillRectangle(usedBrushes[random.Next(0, usedBrushes.Length)], x, y, tileSize, tileSize);
                        graphics.DrawString(i.ToString(), font, Brushes.Black, x + tileSize / 2, y + tileSize / 2, format);
                    }
                }

                // The Graphics object has been disposed at this point, so all drawing
                // has been flushed to the bitmap before it is encoded.
                bitmap.Save("Test.png");
            }

            watch.Stop();
            Console.WriteLine($"Output took {watch.ElapsedMilliseconds} ms.");
        }
    }
}

这在我的计算机上大约需要 115ms 。我在这里使用System.Drawing.Common nuget。
保存位图大约需要55毫秒，在循环中绘制到图形对象也大约需要60毫秒，而40毫秒可以归因于绘制文本。
Rust 实现

use std::path::Path;
use std::time::Instant;
use image::{Rgba, RgbaImage};
use imageproc::{drawing::{draw_text_mut, draw_filled_rect_mut, text_size}, rect::Rect};
use rusttype::{Font, Scale};
use rand::Rng;

// Bookkeeping for a square grid of equally sized tiles ("segments") inside one
// atlas image. All fields default to zero via `Default`.
#[derive(Default)]
struct TextureAtlas {
    segment_size: u16, // Side length of one tile; `main` uses it as a pixel size
    row_col_count: u8, // Number of tiles in both the horizontal and the vertical direction
    current_segment: u32 // Index of the next tile that will be used
}

/// Benchmark entry point: fills every tile of the atlas with a random colour,
/// draws the tile's index centred on it, saves the image to `test.png` and
/// prints the elapsed time (drawing and saving).
fn main() {
    let before = Instant::now();

    let mut atlas = TextureAtlas {
        segment_size: 128,
        row_col_count: 16,
        ..Default::default()
    };

    let path = Path::new("test.png");
    let colors = [
        Rgba([132u8, 132u8, 132u8, 255u8]),
        Rgba([132u8, 255u8, 32u8, 120u8]),
        Rgba([200u8, 255u8, 132u8, 255u8]),
        Rgba([255u8, 0u8, 0u8, 255u8]),
    ];

    // Derive all dimensions from the atlas so the image size, the tile count
    // and the per-tile offsets cannot drift apart (128 * 16 = 2048 here).
    let tile = u32::from(atlas.segment_size);
    let tiles_per_side = u32::from(atlas.row_col_count);
    let side = tile * tiles_per_side;

    let mut image = RgbaImage::new(side, side);

    let font = Vec::from(include_bytes!("../assets/DejaVuSans.ttf") as &[u8]);
    let font = Font::try_from_vec(font).unwrap();

    let font_size = 40.0;
    let scale = Scale {
        x: font_size,
        y: font_size,
    };

    // Fetch the thread-local generator once instead of once per tile.
    let mut rng = rand::thread_rng();

    // Draw random color rects for benchmarking
    for _ in 0..tiles_per_side * tiles_per_side {
        // Top-left corner of the current tile (row-major order).
        let index = atlas.current_segment;
        let tile_x = (index % tiles_per_side) * tile;
        let tile_y = (index / tiles_per_side) * tile;

        let fill = colors[rng.gen_range(0..colors.len())];
        draw_filled_rect_mut(
            &mut image,
            Rect::at(tile_x as i32, tile_y as i32).of_size(tile, tile),
            fill,
        );

        let number = index.to_string();
        // `as_str` only borrows the String, so it is not a conversion.
        // NOTE(review): the ~15 ms originally attributed to this line is more
        // likely spent measuring/rendering a different string each iteration;
        // confirm with a profiler.
        let text = number.as_str();
        let (w, h) = text_size(scale, &font, text);

        // Centre the text: tile centre minus half of the measured text box.
        draw_text_mut(
            &mut image,
            Rgba([0u8, 0u8, 0u8, 255u8]),
            tile_x + tile / 2 - w as u32 / 2,
            tile_y + tile / 2 - h as u32 / 2,
            scale,
            &font,
            text,
        );

        atlas.current_segment += 1;
    }

    image.save(path).unwrap();

    println!("Output took {:?}", before.elapsed());
}

对于Rust，我使用的是imageproc crate。以前，我使用piet-common crate，但输出时间超过300毫秒。使用imageproc crate，我在 Release模式下达到了约 110ms ，这与C#版本相当，但我认为它与WebAssembly一起使用会更好。
当我使用静态字符串而不是从循环中转换数字时(请参见注释)，我获得了不到100ms的执行时间。对于Rust而言，绘制图像仅需要30毫秒左右，而保存则需要80毫秒。
C++实现

#include <iostream>
#include <cstdio>
#include <cstdlib>
#define cimg_display 0
#define cimg_use_png
#include "CImg.h"
#include <chrono>
#include <random>
#include <string>

using namespace cimg_library;
using namespace std;

/* Generate random numbers in an inclusive range. */
/**
 * Returns a uniformly distributed random integer in an inclusive range.
 *
 * The bounds may be given in either order; the result always lies between the
 * smaller and the larger of the two, both included.
 *
 * @param min One bound of the range (inclusive).
 * @param max The other bound of the range (inclusive).
 * @return A random integer r with min(min, max) <= r <= max(min, max).
 */
int random(int min, int max)
{
    // A single engine, seeded once on first use.
    static std::mt19937 engine{std::random_device{}()};

    // Accept swapped bounds: the distribution requires lo <= hi.
    const int lo = (min < max) ? min : max;
    const int hi = (min < max) ? max : min;

    // The distribution avoids the modulo bias of `rand() % n` and the signed
    // overflow of `max + 1` when max == INT_MAX.
    std::uniform_int_distribution<int> dist(lo, hi);
    return dist(engine);
}

int main() {
    auto t1 = std::chrono::high_resolution_clock::now();

    static int tile_size = 128;
    static int row_col_count = 16;

    // Create 2048x2048px image.
    CImg<unsigned char> image(tile_size*row_col_count, tile_size*row_col_count, 1, 3);

    // Make some colours.
    unsigned char cyan[] = { 0, 255, 255 };
    unsigned char black[] = { 0, 0, 0 };
    unsigned char yellow[] = { 255, 255, 0 };
    unsigned char red[] = { 255, 0, 0 };
    unsigned char green[] = { 0, 255, 0 };
    unsigned char orange[] = { 255, 165, 0 };

    unsigned char colors [] = { // This is terrible, but I don't now C++ very well.
        cyan[0], cyan[1], cyan[2],
        yellow[0], yellow[1], yellow[2],
        red[0], red[1], red[2],
        green[0], green[1], green[2],
        orange[0], orange[1], orange[2],
    };

    int total_count = row_col_count * row_col_count;

    for (size_t i = 0; i < total_count; i++)
    {
        int x = i % row_col_count * tile_size;
        int y = i / row_col_count * tile_size;

        int random_color_index = random(0, 4);
        unsigned char current_color [] = { colors[random_color_index * 3], colors[random_color_index * 3 + 1], colors[random_color_index * 3 + 2] };

        image.draw_rectangle(x, y, x + tile_size, y + tile_size, current_color, 1.0); // Force use of transparency. -> Does not work. Always outputs 24bit PNGs.

        auto s = std::to_string(i);

        CImg<unsigned char> imgtext;
        unsigned char color = 1;
        imgtext.draw_text(0, 0, s.c_str(), &color, 0, 1, 40); // Measure the text by drawing to an empty instance, so that the bounding box will be set automatically.

        image.draw_text(x + tile_size / 2 - imgtext.width() / 2, y + tile_size / 2 - imgtext.height() / 2, s.c_str(), black, 0, 1, 40);
    }

    // Save result image as PNG (libpng and GraphicsMagick are required).
    image.save_png("Test.png");

    auto t2 = std::chrono::high_resolution_clock::now();
    auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1).count();

    std::cout << "Output took " << duration << "ms.";
    getchar();
}

我还使用CImg在C++中重新实现了相同的程序。对于.png输出，还需要libpng和GraphicsMagick。我不太熟练使用C++，甚至没有费心进行优化，因为在Release模式下保存操作花费了大约200ms，而整个图像生成(目前尚未非常优化)仅花费了30ms。因此，该解决方案也达不到我的目标。
我现在在哪里
我现在所在位置的图表。当我取得一些进展时，我将对此进行更新。

为什么我要尝试这样做以及为什么让我如此困扰
评论中要求我提供更多背景信息。我知道这个问题变得臃肿了，但是如果您有兴趣，请继续阅读...
因此，基本上我需要为.gltf文件构建一个Texture Atlas。我需要从数据生成一个.gltf文件，并且.gltf文件中的基元也将根据输入数据分配一个纹理。为了优化少量绘制调用，我将尽可能多的几何体放入一个单一图元，然后使用纹理坐标将纹理映射到模型。现在，GPU具有纹理可以具有的最大大小。我将使用2048x2048像素，因为大多数设备至少支持该功能。这意味着，如果我有超过256个对象，则需要向.gltf中添加一个新的图元并生成另一个纹理图集。在某些情况下，一个纹理图集可能就足够了，而在其他情况下，则需要多达15-20个。
纹理将具有(半)透明的背景，可能是文本，也可能是一些线条/阴影或简单的符号，可以使用路径进行绘制。
我已经在Rust中设置了整个系统，并且.gltf生成确实非常有效:我可以在10ms左右的时间内生成54000个顶点(例如= 1500个盒子)。现在，我需要生成6个纹理图集，这在多核系统上并不是真正的问题(.gltf 7个线程，纹理6个线程)。问题是生成一个文件大约需要100毫秒(或现在为55毫秒)，这会使整个过程慢5倍以上。
不幸的是，情况变得更糟，因为另一种常见情况是15000个对象。生成顶点(实际上还有很多自定义属性)并组装.gltf仍然只花费96毫秒(540000个顶点/20MB .gltf)，但是那时候我需要生成59个纹理图集。我正在使用8核系统，因此到那时我无法并行运行它们，并且每个线程将不得不生成约9个图集(这意味着55ms * 9 = 495ms)，所以这又是5倍，实际上造成了相当明显的滞后。实际上，当前需要花费超过2.5 s的时间，因为我已经更新为使用速度更快的代码，而且速度似乎有所增加。
我需要做什么
我确实知道写出4194304个32位像素需要花费一些时间。但据我所知，因为我只写图像的不同部分(例如，只写到上部图块等)，所以应该可以构建一个使用多个线程来执行此操作的程序。这就是我想尝试的方法，如果有人能提示如何使我的Rust程序运行得更快，我将不胜感激。
如果有帮助，我也愿意用C或任何其他语言重写它，可以将其编译为wasm并可以通过Rust的FFI进行调用。因此，如果您对性能更高的库提出建议，我也将非常感谢。
编辑
更新1:我采纳了评论中针对C#版本提出的所有改进建议。感谢所有人。现在是115毫秒，几乎和Rust版本一样快，这让我相信我在这个方向上快要走到尽头了，我真的需要找到一种使它并行化的方法，以便取得进一步的重大改进。
更新2:多亏了@ pinkfloydx33，我使用dotnet publish -p:PublishReadyToRun=true --runtime win10-x64 --configuration Release发布二进制文件后能够运行大约60ms(包括第一次运行)。
同时，我自己也尝试了其他方法，即使用Python的Pillow(〜400ms)，在C#和Rust中都使用Skia(〜314ms和〜260ms)，并且我还使用CImg(以及libpng和GraphicsMagick)在C++中重新实现了该程序。

最佳答案

我可以通过以下方法将全部绘制工作(创建网格和文本)的耗时降低至4-5ms:

尽可能缓存值(Random，StringFormat，Math.Pow)

使用ArrayPool作为暂存缓冲区

使用DrawString重载来接受带有以下选项的StringFormat:

居中(代替手动计算)的Alignment和LineAlignment

FormatFlags和Trimming选项可禁用诸如溢出/包装之类的功能，因为我们只是在写少量数字(尽管可以忽略不计，但产生了影响)

使用GenericMonospace字体家族中的自定义Font代替SystemFonts.DefaultFont
这减少了〜15ms

摆弄各种Graphics选项，例如TextRenderingHint和SmoothingMode
我得到了不同的结果，所以您可能想摆弄一些

使用Color数组和ToArgb函数创建一个int，代表像素颜色的4个byte
使用LockBits、(半)unsafe代码和Span来:
用代表随机颜色ARGB值的int填充一个1px高、size * count px宽(整个图像宽度)的缓冲区
将该缓冲区复制size次(此时代表一整行方块的高度)
如此重复
之所以需要unsafe，是为了从锁定位的Scan0指针创建Span<>

最后，使用GDI/native在图形上绘制文本

这样，我就可以通过使用Image.Save(Stream)重载将实际的保存过程节省一些时间。我使用了FileStream，其自定义缓冲区大小为16kb(超过默认的4kb)，这似乎是最有效的选择。这使总的端到端时间减少到大约40毫秒(在我的机器上)。

// Shared random source for picking tile colours.
private static readonly Random Random = new();

// Palette the tile colours are drawn from.
private static readonly Color[] UsedColors = { Color.Blue, Color.Red, Color.Green, Color.Orange, Color.Yellow };

// Centres text in its layout rectangle and disables wrapping, clipping and trimming.
private static readonly StringFormat Format = new()
{
    Alignment = StringAlignment.Center,
    LineAlignment = StringAlignment.Center,
    FormatFlags = StringFormatFlags.NoWrap | StringFormatFlags.FitBlackBox | StringFormatFlags.NoClip,
    Trimming = StringTrimming.None,
    HotkeyPrefix = HotkeyPrefix.None
};

/// <summary>
/// Draws a <paramref name="count"/> x <paramref name="count"/> grid of randomly coloured
/// squares, writes each square's index centred on it, and optionally saves the
/// bitmap as "Test.png".
/// </summary>
/// <param name="count">Number of squares per row and per column.</param>
/// <param name="size">Side length of one square in pixels.</param>
/// <param name="save">When true, the bitmap is encoded as PNG and written to "Test.png".</param>
private static unsafe void DrawGrid(int count, int size, bool save)
{
    var intsPerRow = size * count;          // pixels (one int each) per image row
    var sizePerFullRow = intsPerRow * size; // pixels in one full row of squares
    var colorsLen = UsedColors.Length;

    using var bitmap = new Bitmap(intsPerRow, intsPerRow, PixelFormat.Format32bppArgb);

    var bmpData = bitmap.LockBits(new Rectangle(0, 0, bitmap.Width, bitmap.Height), ImageLockMode.WriteOnly, PixelFormat.Format32bppArgb);
    var arr = ArrayPool<int>.Shared.Rent(intsPerRow);
    try
    {
        // View the locked pixel memory as a span of 32-bit ARGB values.
        var byteSpan = new Span<byte>(bmpData.Scan0.ToPointer(), Math.Abs(bmpData.Stride) * bmpData.Height);
        var intSpan = MemoryMarshal.Cast<byte, int>(byteSpan);
        var buff = arr.AsSpan(0, intsPerRow);

        for (int y = 0, offset = 0; y < count; ++y)
        {
            // fill buffer with an entire 1px row of colors
            for (var bOffset = 0; bOffset < intsPerRow; bOffset += size)
                buff.Slice(bOffset, size).Fill(UsedColors[Random.Next(0, colorsLen)].ToArgb());

            // duplicate the pixel high row until we've created a row of squares in full
            var len = offset + sizePerFullRow;
            for (; offset < len; offset += intsPerRow)
                buff.CopyTo(intSpan.Slice(offset, intsPerRow));
        }
    }
    finally
    {
        // Always hand the scratch buffer back and unlock the bitmap, even on failure.
        ArrayPool<int>.Shared.Return(arr);
        bitmap.UnlockBits(bmpData);
    }

    using var graphics = Graphics.FromImage(bitmap);
    graphics.TextRenderingHint = TextRenderingHint.ClearTypeGridFit;

    // some or all of these may not even matter?
    // you may try removing/modifying the rest
    graphics.CompositingQuality = CompositingQuality.HighSpeed;
    graphics.InterpolationMode = InterpolationMode.Default;
    graphics.SmoothingMode = SmoothingMode.HighSpeed;
    graphics.PixelOffsetMode = PixelOffsetMode.HighSpeed;

    // Font is IDisposable; release it when the method returns.
    using var font = new Font(FontFamily.GenericMonospace, 14, FontStyle.Regular);

    var lenSquares = count * count;
    for (var i = 0; i < lenSquares; ++i)
    {
        // Top-left corner of square i (row-major order).
        var x = i % count * size;
        var y = i / count * size;
        var rect = new Rectangle(x, y, size, size);
        graphics.DrawString(i.ToString(), font, Brushes.Black, rect, Format);
    }

    if (save)
    {
        // 16 KB stream buffer instead of the default.
        using var fs = new FileStream("Test.png", FileMode.Create, FileAccess.Write, FileShare.Write, 16 * 1024);
        bitmap.Save(fs, ImageFormat.Png);
    }
}
以下是在Release模式下、于Visual Studio外部运行时使用Stopwatch测得的计时(以毫秒为单位)。至少应该忽略前1或2个计时，因为这些方法此时尚未完全完成JIT编译。具体结果会因您的PC等而异。
仅图像生成:
Elapsed: 38 Elapsed: 6 Elapsed: 4 Elapsed: 4 Elapsed: 4 Elapsed: 4 Elapsed: 5 Elapsed: 4 Elapsed: 5 Elapsed: 4 Elapsed: 4
图像生成和保存:
Elapsed: 95 Elapsed: 48 Elapsed: 41 Elapsed: 40 Elapsed: 37 Elapsed: 42 Elapsed: 42 Elapsed: 39 Elapsed: 38 Elapsed: 40 Elapsed: 41
我认为对于保存缓慢的问题没有什么可以做的。我查看了Image.Save的源代码。它调用Native/GDI，传入Stream的句柄、native 图像指针以及代表PNG编码器(ImageCodecInfo)的Guid。任何缓慢都发生在那一端。 更新:我已经验证了保存到MemoryStream时得到的速度相同，因此这与保存到文件无关，而完全取决于GDI/native在幕后所做的事情。
我还尝试使用直接unsafe(指针)和/或Unsafe和MemoryMarshal(例如CopyBlock)的技巧来进一步缩小图像，并展开循环。这些方法要么产生相同的结果，要么产生更糟的结果，使事情难以遵循。
注意:使用PublishReadyToRun=true作为控制台应用程序发布似乎也有所帮助。
更新
我意识到以上只是示例，因此这可能不适用于您的最终目标。经过进一步的广泛审查，我发现所花费时间的大部分实际上是Image::Save的一部分。我们保存到哪种类型的Stream都没有关系，即使MemoryStream也表现出相同的速度(显然不考虑文件I/O)。我相信这与在图像/图形中包含GDI对象有关-在我们的例子中是DrawString绘制的文本。
作为“简单”测试，我更新了上面的内容，以使文本绘制发生在全白的辅助图像上。在不保存该图像的情况下，我遍历了其各个像素，并根据粗略的颜色判断(因为我们要处理抗锯齿)，在主位图上手动设置了相应的像素。整个端到端过程在我的计算机上花费了不到20毫秒的时间。由于只是快速测试，渲染出的图像并不完美，但是它证明您可以手动完成部分操作，并仍然节省不少时间。问题出在文本绘制上，但我们可以利用GDI，而无需在最终图像中实际使用它。您只需要找到最佳平衡点。我还尝试过使用索引格式，并预先在调色板中填充颜色，这似乎也有一些帮助。无论如何，仅供参考。

关于c# - 将文本绘制到png文件上的性能方法？，我们在Stack Overflow上找到一个类似的问题： https://stackoverflow.com/questions/66241394/

c# - 将文本绘制到png文件上的性能方法？

上一篇：Rust 解压枚举的内容

下一篇：vector - 如何在Vec中获取对元素的多个可变引用？