java - CSVReader's readNext() does not loop over all the rows of the csv [Edit: How to handle erroneous CSV (remove unescaped quotes)]

Tags: java csv error-handling opencsv

FileReader fr = new FileReader(inp);
CSVReader reader = new CSVReader(fr, ',', '"');

// writer
File writtenFromWhile = new File(dliRootPath + writtenFromWhilePath);
writtenFromWhile.createNewFile();
CSVWriter writeFromWhile = new CSVWriter(new FileWriter(writtenFromWhile), ',', '"');

int insideWhile = 0;
String[] currRow = null;
while ((currRow = reader.readNext()) != null) {
    insideWhile++;
    writeFromWhile.writeNext(currRow);
}
System.out.println("inside While: " + insideWhile);
System.out.println("lines read (acc.to CSV reader): " + reader.getLinesRead());

The output is:

inside While: 162199
lines read (acc.to CSV reader): 256865

Even though all the rows get written to the output CSV (when viewed in a text editor; Excel shows far fewer rows), the while loop does not iterate as many times as there are lines in the input CSV. My main aim is to implement some additional logic inside the while loop for every row. I have spent two whole days trying to debug this (in the larger program) without any result.

Please explain how I can make the while loop run 256865 times.


The data in question, to give the complete picture:

Here is the CSV that I am reading in the code snippet above.

My complete program tries to separate out the records from this CSV that are not present in this CSV, based on the fields title and author (i.e. if the author and title are the same in two records, they are treated as duplicates even if the other fields differ, and should not be written to the output file). Here is my complete code (the difference should be around 300000 records, but I only get ~210000 in my output file; a simpler sketch of the author+title matching idea follows after the code):

//TODO ask id
/*
 * id is also among the fields getting matched (thisRow[0] is the id);
 * you can replace it with thisRow[fieldAndColumnNo.get(0)] to eliminate the id
 */

package mainOne;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import com.opencsv.CSVReader;
import com.opencsv.CSVWriter;

public class Diff_V3 {
    static String dliRootPath = "/home/gurnoor/Incoming/Untitled Folder 2/";
    static String dli = "new-dli-IITG.csv";
    static String oldDli = "dli-iisc.csv";
    static String newFile = "newSampleFile.csv";// not used
    static String unqFile = "UniqueFileFinal.csv";
    static String log = "Diff_V3_log.txt";
    static String splittedNewDliDir = "/home/gurnoor/Incoming/Untitled Folder 2/splitted new file";
    static String splittedOldDliDir = "/home/gurnoor/Incoming/Untitled Folder 2/splitted old file";

    // debug
    static String testFilePath = "testFile.csv";
    static int insidepopulateMapFromSplittedCSV = 0;

    public static void main(String[] args) throws IOException, CustomException {
        // _readSample(dliRootPath+dli, dliRootPath+newFile);
        // System.out.println(areIDsunique(dliRootPath + dli, 550841));
        // open in geany to get the total no of lines

        // TODO implement separate function to check equals
        // File filteredFile = new File(dliRootPath + "filteredFile.csv");
        // filteredFile.createNewFile();
        File logFile = new File(dliRootPath + log);
        logFile.createNewFile();
        new File(dliRootPath + testFilePath).createNewFile();
        List<String> fieldsToBeMatched = new ArrayList<>();
        fieldsToBeMatched.add("dc.contributor.author[]");
        fieldsToBeMatched.add("dc.title[]");
        filterUniqueFileds(new File(splittedNewDliDir), new File(splittedOldDliDir), fieldsToBeMatched);

    }

    /**
     * NOTE: might remove the row where fieldToBeMatched is null
     * 
     * @param newDir
     * @param oldDir
     * @param fieldsToBeMatched
     * @throws IOException
     * @throws CustomException
     */
    private static void filterUniqueFileds(File newDir, File oldDir, List<String> fieldsToBeMatched)
            throws IOException, CustomException {

        CSVReader reader = new CSVReader(new FileReader(new File(dliRootPath + dli)), '|');
        // writer
        File unqFileOp = new File(dliRootPath + unqFile);
        unqFileOp.createNewFile();
        CSVWriter writer = new CSVWriter(new FileWriter(unqFileOp), '|');

        // logWriter
        BufferedWriter logWriter = new BufferedWriter(new FileWriter(new File(dliRootPath + log)));

        String[] headingRow = reader.readNext(); // allRows.get(0);
        writer.writeNext(headingRow);
        int headingLen = headingRow.length;

        // old List
        System.out.println("[INFO] reading old list...");
        // CSVReader oldReader = new CSVReader(new FileReader(new
        // File(dliRootPath + oldDli)));
        Map<String, List<String>> oldMap = new HashMap<>();
        oldMap = populateMapFromSplittedCSV(oldMap, oldDir); // populateMapFromCSV(oldMap, oldReader);
        // oldReader.close();
        System.out.println("[INFO] Read old List. Size = " + oldMap.size());
        printMapToCSV(oldMap, dliRootPath + testFilePath);

        // map of fieldName, ColumnNo
        Map<String, Integer> fieldAndColumnNoInNew = new HashMap<>(getColumnNo(fieldsToBeMatched, headingRow));
        Map<String, Integer> fieldAndColumnNoInOld = new HashMap<>(
                getColumnNo(fieldsToBeMatched, oldMap.get("id").toArray(new String[0])));
        // error check: did columnNo get populated?
        if (fieldAndColumnNoInNew.isEmpty()) {
            reader.close();
            writer.close();
            throw new CustomException("field to be matched not present in input CSV");
        }

        // TODO implement own array compare using areEqual()
        // error check
        // if( !Arrays.equals(headingRow, (String[]) oldMap.get("id").toArray())
        // ){
        // System.out.println("heading in new file, old file: \n"+
        // Arrays.toString(headingRow));
        // System.out.println(Arrays.toString((String[])
        // oldMap.get("id").toArray()));
        // reader.close();
        // writer.close();
        // oldReader.close();
        // throw new CustomException("Heading rows are not same in old and new
        // file");
        // }

        int noOfRecordsInOldList = 0, noOfRecordsWritten = 0, checkManually = 0;
        String[] thisRow;
        while ((thisRow = reader.readNext()) != null) {
            // for(int l=allRows.size()-1; l>=0; l--){
            // thisRow=allRows.get(l);

            // error check
            if (thisRow.length != headingLen) {
                String error = "Line no: " + reader.getLinesRead() + " in file: " + dliRootPath + dli
                        + " not read. Check manually";

                System.err.println(error);
                logWriter.append(error + "\n");
                logWriter.flush();
                checkManually++;
                continue;
            }

            // write if not present in oldMap
            if (!oldMap.containsKey(thisRow[0])) {
                writer.writeNext(thisRow);
                writer.flush();
                noOfRecordsWritten++;
            } else {
                // check if all reqd fields match
                List<String> twinRow = oldMap.get(thisRow[0]);
                boolean writtenToOp = false;
                // for (int k = 0; k < fieldsToBeMatched.size(); k++) {
                List<String> newFields = new ArrayList<>(fieldAndColumnNoInNew.keySet());
                List<String> oldFields = new ArrayList<>(fieldAndColumnNoInOld.keySet());
                // redundant sanity check
                if (newFields.size() != oldFields.size()) {
                    reader.close();
                    writer.close();
                    CustomException up = new CustomException("something is really wrong");
                    throw up;
                }
                // for(String fieldName : fieldAndColumnNoInNew.keySet()){
                for (int m = 0; m < newFields.size(); m++) {
                    int columnInNew = fieldAndColumnNoInNew.get(newFields.get(m)).intValue();
                    int columnInOld = fieldAndColumnNoInOld.get(oldFields.get(m)).intValue();
                    String currFieldTwin = twinRow.get(columnInOld);
                    String currField = thisRow[columnInNew];
                    if (!areEqual(currField, currFieldTwin)) {
                        writer.writeNext(thisRow);
                        writer.flush();
                        writtenToOp = true;
                        noOfRecordsWritten++;
                        System.out.println(noOfRecordsWritten);
                        break;
                    }
                }
                if (!writtenToOp) {
                    noOfRecordsInOldList++;
                    // System.out.println("[INFO] present in old List: \n" +
                    // Arrays.toString(thisRow) + " AND\n"
                    // + twinRow.toString());
                }
            }
        }
        System.out.println("--------------------------------------------------------\nDebug info");
        System.out.println("old File: " + oldMap.size());
        System.out.println("new File:" + reader.getLinesRead());

        System.out.println("no of records in old list (present in both old and new) = " + noOfRecordsInOldList);
        System.out.println("checkManually: " + checkManually);
        System.out.println("noOfRecordsInOldList+checkManually = " + (noOfRecordsInOldList + checkManually));
        System.out.println("no of records written = " + noOfRecordsWritten);
        System.out.println();
        System.out.println("inside populateMapFromSplittedCSV() " + insidepopulateMapFromSplittedCSV + "times");

        logWriter.close();
        reader.close();
        writer.close();

    }

    private static void printMapToCSV(Map<String, List<String>> oldMap, String testFilePath2) throws IOException {
        // writer
        int i = 0;
        CSVWriter writer = new CSVWriter(new FileWriter(new File(testFilePath2)), '|');
        for (String key : oldMap.keySet()) {
            List<String> row = oldMap.get(key);
            String[] tempRow = new String[row.size()];
            tempRow = row.toArray(tempRow);
            writer.writeNext(tempRow);
            writer.flush();
            i++;
        }
        writer.close();
        System.out.println("[hello from line 210 ( inside printMapToCSV() ) of ur code] wrote " + i + " lines");
    }

    private static Map<String, List<String>> populateMapFromSplittedCSV(Map<String, List<String>> oldMap, File oldDir)
            throws IOException {

        File defective = new File(dliRootPath + "defectiveOldFiles.csv");
        defective.createNewFile();
        CSVWriter defectWriter = new CSVWriter(new FileWriter(defective));

        CSVReader reader = null;
        for (File oldFile : oldDir.listFiles()) {
            insidepopulateMapFromSplittedCSV++;
            reader = new CSVReader(new FileReader(oldFile), ',', '"');
            oldMap = populateMapFromCSV(oldMap, reader, defectWriter);
            // printMapToCSV(oldMap, dliRootPath+testFilePath);
            System.out.println(oldMap.size());
            reader.close();
        }
        defectWriter.close();
        System.out.println("inside populateMapFromSplittedCSV() " + insidepopulateMapFromSplittedCSV + "times");
        return new HashMap<String, List<String>>(oldMap);
    }

    private static Map<String, Integer> getColumnNo(List<String> fieldsToBeMatched, String[] headingRow) {
        Map<String, Integer> fieldAndColumnNo = new HashMap<>();
        for (String field : fieldsToBeMatched) {
            for (int i = 0; i < headingRow.length; i++) {
                String heading = headingRow[i];
                if (areEqual(field, heading)) {
                    fieldAndColumnNo.put(field, Integer.valueOf(i));
                    break;
                }
            }
        }
        return fieldAndColumnNo;
    }

    private static Map<String, List<String>> populateMapFromCSV(Map<String, List<String>> oldMap, CSVReader oldReader,
            CSVWriter defectWriter) throws IOException {
        int headingLen = 0;
        List<String> headingRow = null;
        // the heading row (stored under key "id") is available once the first
        // file has been read into the map; getLinesRead() is always 0 here
        // because each file gets a fresh reader
        if (oldMap.containsKey("id")) {
            headingRow = oldMap.get("id");
            headingLen = headingRow.size();
        }
        String[] thisRow;
        int insideWhile = 0, addedInMap = 0, doesNotContainKey = 0, containsKey = 0;
        while ((thisRow = oldReader.readNext()) != null) {

            // error check
            // if (oldReader.getLinesRead() > 1) {
            // if (thisRow.length != headingLen) {
            // System.err.println("Line no: " + oldReader.getLinesRead() + " in
            // file: " + dliRootPath + oldDli
            // + " not read. Check manually");
            // defectWriter.writeNext(thisRow);
            // defectWriter.flush();
            // continue;
            // }
            // }

            insideWhile++;
            if (!oldMap.containsKey(thisRow[0])) {
                doesNotContainKey++;
                List<String> fullRow = Arrays.asList(thisRow);
                fullRow = oldMap.put(thisRow[0], fullRow);
                if (fullRow == null) {
                    addedInMap++;
                }
            } else {
                List<String> twinRow = oldMap.get(thisRow[0]);
                boolean writtenToOp = false;

                // for(String fieldName : fieldAndColumnNoInNew.keySet()){
                // compare column by column; bound by both row lengths to avoid
                // an index error (headingRow may still be null for the first file)
                for (int m = 0; m < Math.min(twinRow.size(), thisRow.length); m++) {

                    String currFieldTwin = twinRow.get(m);
                    String currField = thisRow[m];
                    if (!areEqual(currField, currFieldTwin)) {
                        System.err.println("do something!!!!!!  DUPLICATE ID in old file");
                        containsKey++;
                        FileWriter logWriter = new FileWriter(new File(dliRootPath + log), true); // append, so earlier entries are not overwritten
                        System.err.println("[Skipped record] in old file. Row no: " + oldReader.getLinesRead()
                                + "\nRecord: " + Arrays.toString(thisRow));
                        logWriter.append("[Skipped record] in old file. Row no: " + oldReader.getLinesRead()
                                + "\nRecord: " + Arrays.toString(thisRow));
                        logWriter.close();
                        break;
                    }
                }

            }
        }
        System.out.println("inside while:      " + insideWhile);
        System.out.println("oldMap size =      " + oldMap.size());
        System.out.println("addedInMap:        " + addedInMap);
        System.out.println("doesNotContainKey: " + doesNotContainKey);
        System.out.println("containsKey:       " + containsKey);

        return new HashMap<String, List<String>>(oldMap);

    }

    private static boolean areEqual(String field, String heading) {
        // TODO implement, askSubhayan

        return field.trim().equals(heading.trim());
    }

    /**
     * Returns the first duplicate ID OR the string "unique" OR (rarely)
     * totalLinesInCSV != totaluniqueIDs
     * 
     * @param inpCSV
     * @param totalLinesInCSV
     * @return
     * @throws IOException
     */
    private static String areIDsunique(String inpCSV, int totalLinesInCSV) throws IOException {
        CSVReader reader = new CSVReader(new FileReader(new File(inpCSV)), '|'); // use the parameter instead of the hard-coded path
        List<String[]> allRows = new ArrayList<>(reader.readAll());
        reader.close();
        Set<String> id = new HashSet<>();
        for (String[] thisRow : allRows) {
            // report a null/empty id, or an id that has been seen before
            if (thisRow[0] == null || thisRow[0].isEmpty() || !id.add(thisRow[0])) {
                return thisRow[0];
            }
        }
        if (id.size() == totalLinesInCSV) {
            return "unique";
        } else {
            return "totalLinesInCSV != totaluniqueIDs";
        }
    }

    /**
     * Writes the first 20 rows of the input csv into the output file.
     * 
     * @param input
     * @param output
     * @throws IOException
     */
    public static void _readSample(String input, String output) throws IOException {
        File opFile = new File(output); // use the parameter instead of the hard-coded path
        opFile.createNewFile();
        CSVWriter writer = new CSVWriter(new FileWriter(opFile));

        CSVReader reader = new CSVReader(new FileReader(new File(input)), '|');
        for (int i = 0; i < 20; i++) {
            // String[] op;
            // for(String temp: reader.readNext()){
            writer.writeNext(reader.readNext());
            // }
            // System.out.println();
        }
        reader.close();
        writer.flush();
        writer.close();
    }

}
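
Below is a minimal sketch of the author+title matching described above, keyed on a composite (author, title) string instead of the id column. The file names and the column indices AUTHOR_COL and TITLE_COL are illustrative assumptions, not values taken from the actual data:

import java.io.FileReader;
import java.io.FileWriter;
import java.util.HashSet;
import java.util.Set;

import com.opencsv.CSVReader;
import com.opencsv.CSVWriter;

public class AuthorTitleDiffSketch {
    // Assumed positions of dc.contributor.author[] and dc.title[].
    static final int AUTHOR_COL = 1;
    static final int TITLE_COL = 2;

    public static void main(String[] args) throws Exception {
        Set<String> seen = new HashSet<>();
        // Pass 1: remember every (author, title) pair of the old CSV.
        try (CSVReader old = new CSVReader(new FileReader("old.csv"), '|')) {
            String[] row;
            while ((row = old.readNext()) != null) {
                if (row.length > Math.max(AUTHOR_COL, TITLE_COL)) {
                    seen.add(key(row));
                }
            }
        }
        // Pass 2: write only those rows of the new CSV whose pair was not seen.
        try (CSVReader in = new CSVReader(new FileReader("new.csv"), '|');
             CSVWriter out = new CSVWriter(new FileWriter("diff.csv"), '|')) {
            String[] row;
            while ((row = in.readNext()) != null) {
                if (row.length <= Math.max(AUTHOR_COL, TITLE_COL)) {
                    continue; // malformed row; the full program above logs these for manual checking
                }
                if (seen.add(key(row))) { // true only for pairs not seen before
                    out.writeNext(row);
                }
            }
        }
    }

    // '\u0000' is vanishingly unlikely inside the data, so it is a safe separator.
    private static String key(String[] row) {
        return row[AUTHOR_COL].trim() + '\u0000' + row[TITLE_COL].trim();
    }
}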

Best Answer

RC's comment says it all!

If you check the javadocs, you will see that CSVReader has two methods: getLinesRead and getRecordsRead, and both do exactly what their names say. getLinesRead returns the number of lines read with the FileReader; getRecordsRead returns the number of records the CSVReader has parsed. Keep in mind that if the records in your file contain embedded newlines, it takes several line reads to produce a single record. So it is entirely conceivable to have a csv file with 100 records that takes 200 line reads to read them all.
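
Here is a minimal, self-contained sketch of that difference (assuming opencsv 3.6 or later, where CSVReader exposes both counters): one title field contains an embedded newline, so that record spans two physical lines.

import java.io.StringReader;

import com.opencsv.CSVReader;

public class LinesVsRecords {
    public static void main(String[] args) throws Exception {
        // 3 records (header + 2 data rows), but the last title contains an
        // embedded newline, so the input is 4 physical lines long.
        String csv = "id,title\n"
                + "1,\"plain title\"\n"
                + "2,\"title split\nacross two lines\"\n";
        try (CSVReader reader = new CSVReader(new StringReader(csv))) {
            int iterations = 0;
            while (reader.readNext() != null) {
                iterations++;
            }
            System.out.println("while iterations: " + iterations);              // 3
            System.out.println("getRecordsRead(): " + reader.getRecordsRead()); // 3
            System.out.println("getLinesRead():   " + reader.getLinesRead());   // 4
        }
    }
}

So the loop in the question does run once per record; comparing its counter against getRecordsRead() instead of getLinesRead() makes the two numbers agree.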

Regarding java - CSVReader's readNext() does not loop over all the rows of the csv [Edit: How to handle erroneous CSV (remove unescaped quotes)], the original question can be found on Stack Overflow: https://stackoverflow.com/questions/34327574/
