c - Win32/C : Convert line endings to DOS/Windows format

标签 c windows string replace line-endings

我在读取文件的 Windows API 项目中有以下 C 函数,它基于行尾(UNIX、MAC、DOS)将行尾替换为适用于 Windows 的正确行尾(\r\n):

// Standard C header needed for string functions
#include <string.h>

// Defines for line-ending conversion function
#define LESTATUS INT 
#define LE_NO_CHANGES_NEEDED (0)
#define LE_CHANGES_SUCCEEDED (1)
#define LE_CHANGES_FAILED   (-1)

/// <summary>
/// If the line endings in a block of data loaded from a file contain UNIX (\n) or MAC (\r) line endings, this function replaces it with DOS (\r\n) endings.
/// </summary>
/// <param name="inData">An array of bytes of input data.</param>
/// <param name="inLen">The size, in bytes, of inData.</param>
/// <param name="outData">An array of bytes to be populated with output data.  This array must already be allocated</param>
/// <param name="outLen">The maximum number of bytes that can be stored in outData.</param>
/// <param name="bytesWritten">A pointer to an integer that receives the number of bytes written into outData.</param>
/// <returns>
/// If no changes were necessary (the file already contains \r\n line endings), then the return value is LE_NO_CHANGES_NEEDED.<br/>
/// If changes were necessary, and it was possible to store the entire output buffer, the return value is LE_CHANGES_SUCCEEDED.<br/>
/// If changes were necessary but the output buffer was too small, the return value is LE_CHANGES_FAILED.<br/>
/// </returns>
LESTATUS ConvertLineEndings(BYTE* inData, INT inLen, BYTE* outData, INT outLen, INT* bytesWritten)
{
    char *posR = strstr(inData, "\r");
    char *posN = strstr(inData, "\n");
    // Case 1: the file already contains DOS/Windows line endings.
    // So, copy the input array into the output array as-is (if we can)
    // Report an error if the output array is too small to hold the input array; report success otherwise.
    if (posN != NULL && posR != NULL)
    {
        if (outLen >= inLen)
        {
            strcpy(outData, inData);
            return LE_NO_CHANGES_NEEDED;
        }
        return LE_CHANGES_FAILED;
    }
    // Case 2: the file contains UNIX line endings.
    else if (posN != NULL && posR == NULL)
    {
        int i = 0;
        int track = 0;
        for (i = 0; i < inLen; i++)
        {
            if (inData[i] != '\n')
            {
                outData[track] = inData[i];
                track++;
                if (track>outLen) return LE_CHANGES_FAILED;
            }
            else
            {
                outData[track] = '\r';
                track++;
                if (track > outLen) return LE_CHANGES_FAILED;
                outData[track] = '\n';
                track++;
                if (track > outLen) return LE_CHANGES_FAILED;
            }
            *bytesWritten = track;
        }
    }
    // Case 3: the file contains Mac-style line endings.
    else if (posN == NULL && posR != NULL)
    {
        int i = 0;
        int track = 0;
        for (i = 0; i < inLen; i++)
        {
            if (inData[i] != '\r')
            {
                outData[track] = inData[i];
                track++;
                if (track>outLen) return LE_CHANGES_FAILED;
            }
            else
            {
                outData[track] = '\r';
                track++;
                if (track > outLen) return LE_CHANGES_FAILED;
                outData[track] = '\n';
                track++;
                if (track > outLen) return LE_CHANGES_FAILED;
            }
            *bytesWritten = track;
        }
    }
    return LE_CHANGES_SUCCEEDED;
}

但是,我觉得这个函数很长(将近 70 行)并且可以以某种方式减少。我在谷歌上搜索过,但找不到任何有用的东西; C 库或 Windows API 中是否有任何函数允许我执行字符串替换而不是在 O(n) 时间内手动逐字节搜索字符串?

最佳答案

每个角色都需要精确看一次,不多也不少。您的代码的第一行已经进行了重复比较,因为两个 strstr 调用都从相同的位置开始。你可以使用类似的东西

char *posR = strstr(inData, "\r");
if (posR && posR[1] == '\n')
   // Case 1: the file already contains DOS/Windows line endings.

如果失败,如果找到 \r,则从结束处继续,如果 posR == NULL,则再次从顶部开始。但是随后您使 strstr 已经“查看”每个字符,直到结束!

两个附加说明:

  1. 不需要 strstr 因为您正在寻找单个字符;下次使用strchr
  2. strXXX 函数都假定您的输入是格式正确的 C 字符串:它应该以终止符 0 结尾。但是,您已经在 inLen 中提供了长度,因此您没有检查零。如果在 inLen 字节之前的输入中可能有也可能没有 0,您需要采取适当的措施。基于此函数的用途,我假设您根本不需要检查零。

我的建议:从头开始查看每个字符一次,只有当它是或者 \r 时才采取行动或 \n。如果您遇到的第一个是 \r 并且 下一个是 \n,那么您就完成了。 (这假设行尾不是“混合的”。)

如果您没有在第一个循环中返回,那么除了 \r\n 之外还有其他内容,您可以从那一点继续。但是您仍然只需要对或者 \r \n 采取行动!所以我建议这个更短的代码(和一个 enum 而不是你的定义):

enum LEStatus_e { LE_CHANGES_FAILED=-1, LE_NO_CHANGES_NEEDED, LE_CHANGES_SUCCEEDED };

enum LEStatus_e ConvertLineEndings(BYTE *inData, INT inLen, BYTE *outData, INT outLen, INT *bytesWritten)
{
    INT sourceIndex = 0, destIndex;

    if (outLen < inLen)
        return LE_CHANGES_FAILED;

    /*  Find first occurrence of either \r or \n
        This will return immediately for No Change Needed */
    while (sourceIndex < inLen)
    {
        if (inData[sourceIndex] == '\r')
        {
            if (sourceIndex < inLen-1 && inData[sourceIndex+1] == '\n')
            {
                memcpy (outData, inData, inLen);
                *bytesWritten = inLen;
                return LE_NO_CHANGES_NEEDED;
            }
            break;
        }
        if (inData[sourceIndex] == '\n')
            break;
        sourceIndex++;
    }
    /* We processed this far already: */
    memcpy (outData, inData, sourceIndex);
    if (sourceIndex == inLen)
        return LE_NO_CHANGES_NEEDED;
    destIndex = sourceIndex;

    while (sourceIndex < inLen)
    {
        switch (inData[sourceIndex])
        {
            case '\n':
            case '\r':
                sourceIndex++;
                if (destIndex+2 >= outLen)
                    return LE_CHANGES_FAILED;
                outData[destIndex++] = '\r';
                outData[destIndex++] = '\n';
                break;
            default:
                outData[destIndex++] = inData[sourceIndex++];
        }
    }
    *bytesWritten = destIndex;
    return LE_CHANGES_SUCCEEDED;
}

有一些使用其他结构的古老且罕见的“纯文本”格式;来自内存,类似于 \r\n\n。如果您希望能够清理任何东西,您可以在单个 \n 之后为所有 \r 添加一个跳过,并且相同对于相反的情况。这也将清除任何“混合”行尾,因为它也会正确处理 \r\n

关于c - Win32/C : Convert line endings to DOS/Windows format,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/30540607/

相关文章:

c - 处理多个 UDP 流

string - fatal error : high- and low-surrogate code points are not valid Unicode scalar values

string - 多行字符串文字仅在 REPL 和 Worksheet 中表现正常

python - 如何检查一个单词是否适合指定数量的列?

c - 需要在C代码中运行 'read'命令

c - 程序返回错误答案和 "-nan(ind)"。我做错了什么?

c - 将指向结构的指针保存到指针数组中

c++ - TCP Telnet 传输中丢失字符

c - 运行存储在动态允许的内存中的 shellcode

C++ - 如何隐藏其他应用程序的窗口