java - 为什么 .doc 文件可以通过 Lucene 建立索引,而 .docx 文件无法建立索引?

标签 java lucene apache-poi docx doc

我无法在 Java 中使用 Lucene 为 .docx 文件建立索引,只有 .doc 文件能被编入索引。当我把 .docx 文件传给 Doc 解析器时,它返回空内容,因此我无法搜索其中的内容。


Here is indexer code.
package luceneapplication;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import parsers.DocFileParser;
import parsers.PdfFileParser;

/**
 * Builds a Lucene index over the Word documents found directly inside
 * {@code sourceFilePath} and then runs a sample search (see {@link #main}).
 *
 * <p>Text extraction is delegated to {@code DocFileParser}. NOTE(review): the
 * parser is HWPF-based, and HWPF reads only the binary {@code .doc} format;
 * {@code .docx} (OOXML) needs POI's XWPF or Apache Tika. Until the parser is
 * extended, {@code .docx} files fail during parsing. That failure is now
 * reported together with its cause instead of being silently swallowed.
 */
public class Indexer {

    private final String sourceFilePath = "C:/temp";    //give the location of the source files location here
    private final String indexFilePath = "C:/com/New folder";   //give the location where you guys want to create index
    private IndexWriter writer = null;
    private File indexDirectory = null;

    /**
     * Runs one complete indexing pass: opens the writer, indexes every eligible
     * file, closes the writer and prints a short summary.
     *
     * @throws FileNotFoundException declared for compatibility; failures are reported on stdout
     * @throws CorruptIndexException declared for compatibility; failures are reported on stdout
     * @throws IOException declared for compatibility; failures are reported on stdout
     */
    private Indexer() throws FileNotFoundException, CorruptIndexException, IOException {
        try {
            long start = System.currentTimeMillis();
            createIndexWriter();
            if (writer == null) {
                // createIndexWriter() has already printed the reason.
                System.out.println("Sorry task cannot be completed");
                return;
            }
            try {
                checkFileValidity();
            } finally {
                // Always release the index write lock, even if the scan failed.
                closeIndexWriter();
            }
            long end = System.currentTimeMillis();
            System.out.println("Total Document Indexed : " + TotalDocumentsIndexed());
            // The original divided by (100 * 60), which is neither seconds nor minutes.
            System.out.println("Total time (ms) : " + (end - start));
        } catch (Exception e) {
            System.out.println("Sorry task cannot be completed: " + e);
        }
    }

    /**
     * Opens the Lucene {@link IndexWriter} on {@code indexFilePath}, creating the
     * directory (including missing parents) when necessary. The writer uses a
     * {@link StandardAnalyzer}. On failure {@code writer} stays {@code null} and
     * the reason is printed.
     */
    private void createIndexWriter() {
        try {
            indexDirectory = new File(indexFilePath);
            // mkdirs() rather than mkdir(): the parent folder may not exist yet.
            if (!indexDirectory.exists() && !indexDirectory.mkdirs()) {
                System.out.println("Sorry cannot get the index writer: cannot create "
                        + indexDirectory.getAbsolutePath());
                return;
            }
            FSDirectory dir = FSDirectory.open(indexDirectory);
            StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_34);
            IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_34, analyzer);
            writer = new IndexWriter(dir, config);
        } catch (Exception ex) {
            System.out.println("Sorry cannot get the index writer: " + ex);
        }
    }

    /**
     * Scans the source directory (non-recursively) and indexes every regular,
     * visible, readable, non-empty file whose name ends in {@code .doc},
     * {@code .docx} or {@code .pdf} (case-insensitive).
     */
    private void checkFileValidity() {
        File[] filesToIndex = new File(sourceFilePath).listFiles();
        if (filesToIndex == null) {
            // listFiles() returns null when the path is not a directory or cannot be read.
            System.out.println("Sorry cannot read source directory " + sourceFilePath);
            return;
        }
        for (File file : filesToIndex) {
            try {
                // isFile() already implies "exists" and "is not a directory".
                boolean readable = file.isFile()
                        && !file.isHidden()
                        && file.canRead()
                        && file.length() > 0;
                if (readable && hasSupportedExtension(file.getName())) {
                    StartIndex(file);
                }
            } catch (Exception e) {
                System.out.println("Sorry cannot index " + file.getAbsolutePath() + " : " + e);
            }
        }
    }

    /**
     * Tells whether {@code name} carries one of the extensions this indexer accepts.
     */
    private static boolean hasSupportedExtension(String name) {
        return hasExtension(name, ".doc")
                || hasExtension(name, ".docx")
                || hasExtension(name, ".pdf");
    }

    /**
     * Case-insensitive suffix test, so that {@code REPORT.DOCX} is accepted as well.
     */
    private static boolean hasExtension(String name, String extension) {
        int offset = name.length() - extension.length();
        return offset >= 0 && name.regionMatches(true, offset, extension, 0, extension.length());
    }

    /**
     * Extracts the text of one file and adds it to the index together with the
     * file name and canonical path. Problems are reported on stdout and the file
     * is skipped; the method does not propagate them.
     *
     * @param file : the file which you want to index
     * @throws FileNotFoundException declared for compatibility; not thrown in practice
     * @throws CorruptIndexException declared for compatibility; not thrown in practice
     * @throws IOException declared for compatibility; not thrown in practice
     */
    public void StartIndex(File file) throws FileNotFoundException, CorruptIndexException, IOException {
        try {
            String name = file.getName();
            String content = null;
            // The original tested only ".docx", so .doc files reached the Field
            // constructor with null content and failed.
            if (hasExtension(name, ".doc") || hasExtension(name, ".docx")) {
                //call the doc file parser and get the content of doc file in txt format
                String path = file.getCanonicalPath();
                System.out.println("Path is:" + path);
                DocFileParser docParser = new DocFileParser();
                System.out.println("DocFileParser contains:" + docParser.toString());
                content = docParser.DocFileContentParser(path);
                System.out.println("file contents :" + content);
            }
            // TODO(review): no extraction is wired up for .pdf here; PdfFileParser is
            // imported by the file but its API is not visible in this listing.
            if (content == null) {
                // Field rejects a null value, so skip with an explicit message.
                System.out.println("error in indexing" + file.getAbsolutePath()
                        + " : no text could be extracted");
                return;
            }
            Document doc = new Document();
            doc.add(new Field("content", content, Field.Store.YES, Field.Index.ANALYZED));
            doc.add(new Field("filename", name,
                    Field.Store.YES, Field.Index.ANALYZED));
            doc.add(new Field("fullpath", file.getCanonicalPath(),
                    Field.Store.YES, Field.Index.ANALYZED));
            writer.addDocument(doc);
            System.out.println("Indexed" + file.getAbsolutePath());
        } catch (Exception e) {
            // Print the cause: a bare "error in indexing" hides why a .docx failed.
            System.out.println("error in indexing" + file.getAbsolutePath() + " : " + e);
        }
    }

    /**
     * Counts the documents currently stored in the index.
     *
     * @return number of documents in the index, or 0 when it cannot be opened
     */
    private int TotalDocumentsIndexed() {
        IndexReader reader = null;
        try {
            reader = IndexReader.open(FSDirectory.open(indexDirectory));
            // numDocs() excludes deleted documents, unlike maxDoc().
            return reader.numDocs();
        } catch (Exception ex) {
            System.out.println("Sorry no index found");
            return 0;
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if closing a read-only reader fails.
                }
            }
        }
    }

    /**
     * Optimizes and closes the IndexWriter. The writer is closed even when
     * optimizing fails, so the index lock is not left behind.
     */
    private void closeIndexWriter() {
        if (writer == null) {
            return;
        }
        try {
            writer.optimize();
        } catch (Exception e) {
            System.out.println("Indexer Cannot be closed: " + e);
        } finally {
            try {
                writer.close();
            } catch (Exception e) {
                System.out.println("Indexer Cannot be closed: " + e);
            }
        }
    }

    /**
     * Main method: builds the index, then searches it for the word "Program".
     *
     * @param arg unused
     * @throws ParseException if the sample query cannot be parsed
     */
    public static void main(String arg[]) throws ParseException {
        try {
            new Indexer();
            new Searcher().searchIndex("Program");
        } catch (IOException ex) {
            System.out.println("Cannot Start :(" + " " + ex);
        }
    }
}

Searcher code
public class Searcher {

    /**
     * Searches the "content" field of the index for {@code instring} and prints
     * the stored "fullpath" of up to 100 matching documents.
     *
     * @param instring query text, parsed with Lucene's query syntax
     * @throws IOException if the index cannot be opened or read
     * @throws ParseException if {@code instring} is not a valid query
     */
    public void searchIndex(String instring) throws IOException, ParseException {
        System.out.println("Searching for ' " + instring + " '");
        FSDirectory indexDir = FSDirectory.open(new File("C:\\com\\New folder"));
        try {
            IndexSearcher searcher = new IndexSearcher(indexDir);
            try {
                Analyzer analyzer1 = new StandardAnalyzer(Version.LUCENE_34);
                QueryParser queryParser = new QueryParser(Version.LUCENE_34, "content", analyzer1);
                Query query = queryParser.parse(instring);
                TopDocs hits = searcher.search(query, 100);
                ScoreDoc[] document = hits.scoreDocs;

                System.out.println("Total no of hits for content: " + hits.totalHits);
                for (int i = 0; i < document.length; i++) {
                    Document doc = searcher.doc(document[i].doc);
                    String filePath = doc.get("fullpath");
                    System.out.println(filePath);
                }
                // A second, disabled search by file name used to be sketched here as
                // commented-out code; it was removed together with its unused parser.
            } finally {
                // The original never closed the searcher, leaking its index reader.
                searcher.close();
            }
        } finally {
            indexDir.close();
        }
    }

and my DocParser code
/**
 * Extracts plain text from Word documents using Apache POI's HWPF classes.
 *
 * <p>NOTE(review): HWPF handles the binary {@code .doc} format only. A
 * {@code .docx} (OOXML) file cannot be read through this class; it needs POI's
 * XWPF classes or a higher-level library such as Apache Tika.
 */
public class DocFileParser {

    /**
     * This method parses the content of the .doc file. i.e. this method will
     * return all the text of the file passed to it.
     *
     * @param fileName : path of the file whose content is wanted
     * @return : returns the text content of the file
     * @throws Exception if the file cannot be opened or is not a document HWPF
     *         can read; the caller decides how to report it
     */
    public String DocFileContentParser(String fileName)throws Exception {
        System.out.println("Filename in DocParser:" + fileName);
        FileInputStream inputstream = new FileInputStream(new File(fileName));
        try {
            POIFSFileSystem fs = new POIFSFileSystem(inputstream);
            System.out.println("POIFSFileSystem:" + fs);
            // The original also built an unused HWPFDocument from fs, which parsed
            // the document a second time for nothing.
            WordExtractor we = new WordExtractor(fs);
            System.out.println("WordExtractor:" + we);
            return we.getTextFromPieces();
        } finally {
            // Close explicitly so the file handle is released however parsing ends.
            inputstream.close();
        }
    }
}

解析器本应返回文件内容,但我什么也没有得到:我在函数调用返回之后用 System.out.println(SYSO)打印结果,显示的内容是空的。请帮帮我,我已经被这个问题困住 3 天了。谢谢。

最佳答案

将评论提升为答案:

您的代码仅适用于 .doc 文件的原因是:您只编写了处理 .doc 文件的代码!正如 Apache POI 的组件页面(components page)所说明的,HWPF(您所使用的)只处理 .doc 文件,而 .docx 文件需要使用 XWPF。

但是,对于简单的文本提取,您几乎肯定不需要直接针对 POI 编写自己的代码。如果是带有特殊规则的高级需求,那当然可以!但对于“请给我纯文本”这类基本需求,已经有更高层次的库可以替您完成这一切。

我强烈建议您改用 Apache Tika。Tika 建立在一大批开源库之上,其中就包括 POI。它可以为 Word 的 .doc 和 .docx 文件生成用于索引的纯文本,而且两者使用的是同样的几行代码。它支持数量庞大且仍在不断增长的文件格式。使用起来更容易,而且您需要的一切都已准备就绪!

关于java - 为什么 .doc 文件可以通过 Lucene 建立索引,而 .docx 文件无法建立索引?,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/32009144/

相关文章:

php - 创建和更新 Zend_Search_Lucene 索引

java - 使用 Apache-POI 在 Excel 超链接中设置屏幕提示

java - IBM Integration 总线,解析 json

java - 创建未知类型的数组

java - Hibernate 时间戳搜索查询返回空列表

elasticsearch - ElasticSearch只能添加字段索引,而不能像lucene Field.Store.NO一样保存原始值

java - 如何使用ProcessBuilder将EAR文件部署到wildfly服务器?

mysql - 使用 NodeJS 进行全文搜索

Java - Apache POI - 读/写 .xlsx 文件 - 文件损坏并变为空

java - Apache POI 3.7 OutOfMemoryError : Java heap space when writing to large no of rows to xlsx files