javascript - 在 Javascript 中解析带有可变封闭字符的 CSV 字符串

标签 javascript regex csv

我发现了这个函数(here),它可以在 JavaScript 中将 CSV 字符串解析为多维数组。但是我需要更新它,使其除了已经支持的双引号 (") 之外,还允许使用单引号 (') 作为封闭字符。即我需要它能够解析:

CSVToArray('"a, comma","2","3"');
// => [['a, comma', '2', '3']]

还有这个

CSVToArray("'a, comma','2','3'");
// => [['a, comma', '2', '3']]

函数:

// Parses a delimited string into an array of arrays (rows of
// string fields). The default delimiter is the comma, but this
// can be overridden in the second argument.
//
// strData      - the delimited text to parse.
// strDelimiter - optional field delimiter (default ","). It is matched
//                literally and may be longer than one character.
//
// Returns an array of rows, each row being an array of field strings.
// A field may be wrapped in double quotes; inside such a field the
// delimiter and line breaks are ordinary text and "" stands for one
// literal double quote.
function CSVToArray( strData, strDelimiter ){
    // Check to see if the delimiter is defined. If not,
    // then default to comma.
    strDelimiter = (strDelimiter || ",");

    // Escape regex metacharacters so the delimiter is matched literally.
    // Blindly prefixing a backslash would turn letters such as "t" into
    // escape sequences and would break multi-character delimiters.
    var strEscapedDelimiter = strDelimiter.replace(
            /[.*+?^${}()|[\]\\]/g,
            "\\$&"
            );

    // Create a regular expression to parse the CSV values.
    // No "i" flag: a case variant of a letter delimiter must not match.
    var objPattern = new RegExp(
            (
                    // Delimiters.
                    "(" + strEscapedDelimiter + "|\\r?\\n|\\r|^)" +

                    // Quoted fields.
                    "(?:\"([^\"]*(?:\"\"[^\"]*)*)\"|" +

                    // Standard fields: any run of characters that does not
                    // start a delimiter and is not a quote or line break.
                    "((?:(?!" + strEscapedDelimiter + ")[^\"\\r\\n])*))"
            ),
            "g"
            );


    // Create an array to hold our data. Give the array
    // a default empty first row.
    var arrData = [[]];

    // Create an array to hold our individual pattern
    // matching groups.
    var arrMatches = null;

    // The value of the field captured by the current match.
    var strMatchedValue;


    // Keep looping over the regular expression matches
    // until we can no longer find a match.
    while ((arrMatches = objPattern.exec( strData )) !== null){

            // Get the delimiter that was found.
            var strMatchedDelimiter = arrMatches[ 1 ];

            // The data starts with a field delimiter, so the first
            // field is empty. Record it, otherwise every column of
            // the first row would shift left by one.
            if (
                    arrMatches.index === 0 &&
                    (strMatchedDelimiter === strDelimiter)
                    ){

                    arrData[ 0 ].push( "" );

            }

            // Check to see if the given delimiter has a length
            // (is not the start of string) and if it matches
            // field delimiter. If it does not, then we know
            // that this delimiter is a row delimiter.
            if (
                    strMatchedDelimiter.length &&
                    (strMatchedDelimiter !== strDelimiter)
                    ){

                    // Since we have reached a new row of data,
                    // add an empty row to our data array.
                    arrData.push( [] );

            }


            // Now that we have our delimiter out of the way,
            // let's check to see which kind of value we
            // captured (quoted or unquoted). Compare against
            // undefined: an empty quoted field ("") is a valid
            // match but is falsy.
            if (arrMatches[ 2 ] !== undefined){

                    // We found a quoted value. When we capture
                    // this value, unescape any double quotes.
                    strMatchedValue = arrMatches[ 2 ].replace( /""/g, "\"" );

            } else {

                    // We found a non-quoted value.
                    strMatchedValue = arrMatches[ 3 ];

            }


            // Now that we have our value string, let's add
            // it to the data array.
            arrData[ arrData.length - 1 ].push( strMatchedValue );

            // A zero-length match (empty input, or an unterminated
            // quote at the very start) leaves lastIndex untouched;
            // advance it manually or exec() would return the same
            // match forever.
            if (arrMatches[ 0 ].length === 0){
                    objPattern.lastIndex++;
            }
    }

    // Return the parsed data.
    return( arrData );
}

最佳答案

您的基本模式包含一些缺陷,并且不容易扩展:分隔符总是以 \\ 为前缀,即使它们不应该如此。

我重写了代码,使其更可靠。支持多种引号类型,并且正确转义了分隔符。

fiddle :http://jsfiddle.net/qz53J/

// Parses a delimited string into a matrix (array of rows, each row an
// array of field strings).
//
// strData      - the delimited text to parse.
// strDelimiter - optional field delimiter (default ","). It is matched
//                literally and may be longer than one character.
//
// A field may be enclosed in either double or single quotes. Inside an
// enclosed field the delimiter, line breaks and the other quote character
// are ordinary text, and a doubled enclosing quote stands for one literal
// quote. Rows are separated by \r\n, \n or \r.
function CSVToArray( strData, strDelimiter ){
    // Properly escape the delimiter, if existent.
    // If no delimiter is given, use a comma.
    // Every regex metacharacter is escaped, including "\" and "]".
    strDelimiter = (strDelimiter || ",").replace(/[.*+?^${}()|[\]\\]/g, '\\$&');

    //What are the quotation characters? "'
    var quotes = "\"'";

    // Create a regular expression to parse the CSV values.
    // match[1] = Contains the delimiter if the RegExp is not at the begin
    // match[2] = quote, if any
    // match[3] = string inside quotes, if match[2] exists
    // match[4] = non-quoted strings
    var objPattern = new RegExp(
                // Delimiter or marker of new row. \r\n is consumed as ONE
                // row break so CRLF data does not yield blank rows.
        "(?:(" + strDelimiter + ")|\\r?\\n|\\r|^)" +
                // Quoted fields. Each step consumes either one character
                // that is not the enclosing quote or one doubled quote;
                // the two alternatives never overlap, so an unterminated
                // quote cannot trigger exponential backtracking.
        "(?:([" + quotes + "])((?:(?!\\2)[\\s\\S]|\\2\\2)*)\\2" +
                // Standard fields: characters that neither start a
                // delimiter nor are a quote or line break.
        "|((?:(?!" + strDelimiter + ")[^" + quotes + "\\n\\r])*))"
    , "g");

    // Create a matrix (2d array) to hold data, which will be returned.
    var arrData = [];

    // Execute the RegExp until no match is found
    var arrMatches;
    var strMatchedValue;
    while ( (arrMatches = objPattern.exec( strData )) !== null ){
            if ( !arrMatches[ 1 ] ){
                    // If the first group of the RegExp is empty, no
                    // delimiter is matched. This only occurs at the
                    // beginning of a new row.
                    // Add an empty row to our data array.
                    arrData.push( [] );
            } else if ( arrData.length === 0 ){
                    // The data starts with a delimiter: the first field
                    // is empty and there is no row yet to append to.
                    arrData.push( [ "" ] );
            }

            var quote = arrMatches[ 2 ];
            if ( quote ){
                    // We found a quoted value. When we capture
                    // this value, unescape any doubled quotes.
                    strMatchedValue = arrMatches[ 3 ].replace(
                        new RegExp( quote + quote, "g" ),
                        quote
                    );
            } else {
                    // We found a non-quoted value.
                    strMatchedValue = arrMatches[ 4 ];
            }
            // Add the found value to the array
            arrData[ arrData.length - 1 ].push( strMatchedValue );

            // A zero-length match (empty input, or an unterminated quote
            // at the very start) does not move lastIndex; advance it by
            // hand so exec() cannot return the same match forever.
            if ( arrMatches[ 0 ].length === 0 ){
                    objPattern.lastIndex++;
            }
    }
    // Return the parsed data.
    return arrData;
}

关于javascript - 在 Javascript 中解析带有可变封闭字符的 CSV 字符串,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/7878762/

相关文章:

javascript - ExtJS TreeGrid 中的复选框列

javascript - 如何使用 JavaScript 或 jQuery 更改数组内对象的值?

以任意顺序匹配多个可选字符串的 JavaScript 正则表达式

java - 为什么 [\\s*] 不等同于\\s*?

regex - 匹配 *.ts 但不匹配 *.d.ts 的 Webpack 正则表达式测试

java - getParts() 也获取文本输入

javascript - 实现一个名为 `processLastItem` 的高阶函数

Python UTF-16 CSV 阅读器

html - AWK 将 CSV 转换为 HTML 表格

r - 如何编写R来循环设置目录中每个文件的每个工作表