我无法在 Java 中使用 Lucene 为 .docx 文件建立索引,只有 .doc 文件能被编入索引。当我把 .docx 文件传给 Doc 解析器时,它返回空内容,因此我无法搜索这些文件的内容。
Here is indexer code.
package luceneapplication;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import parsers.DocFileParser;
import parsers.PdfFileParser;
/**
 * Command-line indexer. Scans the files directly inside {@code sourceFilePath},
 * extracts text from the supported document types and adds one Lucene document
 * per file (fields {@code content}, {@code filename}, {@code fullpath}) to the
 * index stored in {@code indexFilePath}.
 */
public class Indexer {

    /** Directory whose direct children are scanned for files to index. */
    private final String sourceFilePath = "C:/temp";
    /** Directory in which the Lucene index is created. */
    private final String indexFilePath = "C:/com/New folder";
    /** Writer used for the whole run; stays null when it could not be created. */
    private IndexWriter writer = null;
    private File indexDirectory = null;
    /** Text extracted from the file currently being indexed; null when nothing was extracted. */
    private String fileContent;

    /**
     * Runs one complete indexing pass: opens the writer, indexes every
     * supported file, closes the writer and prints a summary.
     *
     * @throws FileNotFoundException declared for compatibility; failures are reported on stdout
     * @throws CorruptIndexException declared for compatibility; failures are reported on stdout
     * @throws IOException declared for compatibility; failures are reported on stdout
     */
    private Indexer() throws FileNotFoundException, CorruptIndexException, IOException {
        long start = System.currentTimeMillis();
        createIndexWriter();
        if (writer == null) {
            // createIndexWriter() has already printed the cause.
            System.out.println("Sorry task cannot be completed");
            return;
        }
        try {
            checkFileValidity();
        } catch (Exception e) {
            System.out.println("Sorry task cannot be completed: " + e);
        } finally {
            // Always release the writer, even when the scan failed.
            closeIndexWriter();
        }
        long end = System.currentTimeMillis();
        System.out.println("Total Document Indexed : " + TotalDocumentsIndexed());
        System.out.println("Total time (ms): " + (end - start));
    }

    /**
     * Creates the index directory if necessary and opens an {@link IndexWriter}
     * on it using a {@link StandardAnalyzer}. On failure the cause is printed
     * and {@code writer} is left null.
     */
    private void createIndexWriter() {
        try {
            indexDirectory = new File(indexFilePath);
            if (!indexDirectory.exists()) {
                // mkdirs(): the index path is nested, so parents may be missing too.
                indexDirectory.mkdirs();
            }
            FSDirectory dir = FSDirectory.open(indexDirectory);
            StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_34);
            IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_34, analyzer);
            writer = new IndexWriter(dir, config);
        } catch (Exception ex) {
            System.out.println("Sorry cannot get the index writer: " + ex);
        }
    }

    /**
     * Walks the source directory and indexes every readable, non-empty file
     * with a supported extension. A failure on one file is reported and does
     * not stop the remaining files from being processed.
     */
    private void checkFileValidity() {
        File[] filesToIndex = new File(sourceFilePath).listFiles();
        if (filesToIndex == null) {
            // listFiles() returns null when the path is not a readable directory.
            System.out.println("Sorry cannot read source directory " + sourceFilePath);
            return;
        }
        for (File file : filesToIndex) {
            try {
                if (isReadableFile(file) && isSupportedType(file.getName())) {
                    StartIndex(file);
                }
            } catch (Exception e) {
                System.out.println("Sorry cannot index " + file.getAbsolutePath() + ": " + e);
            }
        }
    }

    /** Returns true for a visible, readable, non-empty regular file. */
    private static boolean isReadableFile(File file) {
        return file.isFile() && !file.isHidden() && file.canRead() && file.length() > 0;
    }

    /** Returns true when the file name ends with .doc, .docx or .pdf (case-insensitive). */
    private static boolean isSupportedType(String name) {
        return hasExtension(name, ".doc") || hasExtension(name, ".docx") || hasExtension(name, ".pdf");
    }

    /** Case-insensitive {@code endsWith}; false when the name is shorter than the extension. */
    private static boolean hasExtension(String name, String ext) {
        return name.regionMatches(true, name.length() - ext.length(), ext, 0, ext.length());
    }

    /**
     * Indexes a single file. Word files are passed to {@link DocFileParser} and
     * the extracted text is stored in the {@code content} field; the file name
     * and canonical path are always stored. When no text was extracted the
     * document is still added, with name and path only, instead of failing on
     * a null field value.
     *
     * @param file the file to index
     * @throws FileNotFoundException declared for compatibility; failures are reported on stdout
     * @throws CorruptIndexException declared for compatibility; failures are reported on stdout
     * @throws IOException declared for compatibility; failures are reported on stdout
     */
    public void StartIndex(File file) throws FileNotFoundException, CorruptIndexException, IOException {
        fileContent = null;
        try {
            String path = file.getCanonicalPath();
            String name = file.getName();
            if (hasExtension(name, ".doc") || hasExtension(name, ".docx")) {
                // DocFileParser throws for formats it cannot read; the catch
                // below then reports the cause and the file is skipped.
                fileContent = new DocFileParser().DocFileContentParser(path);
            }
            // NOTE(review): PdfFileParser is imported by this file but its API is not
            // visible here; wire PDF text extraction in at this point once confirmed.

            Document doc = new Document();
            if (fileContent != null) {
                doc.add(new Field("content", fileContent, Field.Store.YES, Field.Index.ANALYZED));
            } else {
                System.out.println("No text extracted from " + path + "; indexing file name and path only");
            }
            doc.add(new Field("filename", name, Field.Store.YES, Field.Index.ANALYZED));
            doc.add(new Field("fullpath", path, Field.Store.YES, Field.Index.ANALYZED));
            writer.addDocument(doc);
            System.out.println("Indexed" + file.getAbsolutePath());
        } catch (Exception e) {
            System.out.println("error in indexing" + file.getAbsolutePath() + ": " + e);
        }
    }

    /**
     * Opens the index read-only and reports its document count.
     *
     * @return the value of {@code IndexReader.maxDoc()} for the index, or 0 when
     *         the index cannot be opened
     */
    private int TotalDocumentsIndexed() {
        try {
            IndexReader reader = IndexReader.open(FSDirectory.open(indexDirectory));
            try {
                return reader.maxDoc();
            } finally {
                reader.close();
            }
        } catch (Exception ex) {
            System.out.println("Sorry no index found: " + ex);
        }
        return 0;
    }

    /**
     * Optimizes and closes the {@link IndexWriter}. The writer is closed even
     * when optimizing fails; nothing happens when no writer was created.
     */
    private void closeIndexWriter() {
        if (writer == null) {
            return;
        }
        try {
            try {
                writer.optimize();
            } finally {
                writer.close();
            }
        } catch (Exception e) {
            System.out.println("Indexer Cannot be closed: " + e);
        }
    }

    /**
     * Builds the index and then runs a sample search for the term "Program".
     *
     * @param arg command-line arguments (unused)
     * @throws ParseException if the sample query cannot be parsed
     */
    public static void main(String[] arg) throws ParseException {
        try {
            new Indexer();
            new Searcher().searchIndex("Program");
        } catch (IOException ex) {
            System.out.println("Cannot Start :(");
        }
    }
}
Searcher code
public class Searcher {
public void searchIndex(String instring) throws IOException, ParseException {
System.out.println("Searching for ' " + instring + " '");
IndexSearcher searcher = new IndexSearcher(FSDirectory.open(new File("C:\\com\\New folder")));
Analyzer analyzer1 = new StandardAnalyzer(Version.LUCENE_34);
QueryParser queryParser = new QueryParser(Version.LUCENE_34, "content", analyzer1);
QueryParser queryParserfilename = new QueryParser(Version.LUCENE_34, "fullpath", analyzer1);
Query query = queryParser.parse(instring);
// Query queryfilename = queryParserfilename.parse(instring);
TopDocs hits = searcher.search(query, 100);
ScoreDoc[] document = hits.scoreDocs;
System.out.println("Total no of hits for content: " + hits.totalHits);
for (int i = 0; i < document.length; i++) {
Document doc = searcher.doc(document[i].doc);
String filePath = doc.get("fullpath");
System.out.println(filePath);
}
// TopDocs hitfilename = searcher.search(queryfilename, 100);
// ScoreDoc[] documentfilename = hitfilename.scoreDocs;
// System.out.println("Total no of hits according to file name" + hitfilename.totalHits);
// for (int i = 0; i < documentfilename.length; i++) {
// Document doc = searcher.doc(documentfilename[i].doc);
// String filename = doc.get("filename");
// System.out.println(filename);
// }
}
and my DocParser code
/**
 * Extracts plain text from Word documents using Apache POI's HWPF
 * {@code WordExtractor}.
 */
public class DocFileParser {

    /**
     * Returns the text of the Word file at the given path.
     *
     * <p>The file is opened as a {@code POIFSFileSystem} and read with HWPF,
     * which handles the binary {@code .doc} format. A {@code .docx} file is a
     * different format that needs POI's XWPF component, so passing one here
     * fails with an exception instead of returning text.
     *
     * @param fileName path of the file whose content is wanted
     * @return the text extracted from the file
     * @throws Exception if the file cannot be opened or cannot be read as a
     *         {@code .doc} document
     */
    public String DocFileContentParser(String fileName) throws Exception {
        System.out.println("Filename in DocParser:" + fileName);
        FileInputStream inputstream = new FileInputStream(new File(fileName));
        try {
            POIFSFileSystem fs = new POIFSFileSystem(inputstream);
            System.out.println("POIFSFileSystem:" + fs);
            // WordExtractor parses the document itself, so no separate
            // HWPFDocument needs to be built first.
            WordExtractor we = new WordExtractor(fs);
            System.out.println("WordExtractor:" + we);
            return we.getTextFromPieces();
        } finally {
            inputstream.close();
        }
    }
}
我在解析器中返回了内容,但调用方并没有拿到:函数调用返回之后的 System.out.println(SYSO)显示内容为空。请指点一下,我已经被这个问题困扰了 3 天。谢谢。
最佳答案
将评论提升为答案:
您的代码仅适用于 .doc 文件的原因是,您只编写了处理 .doc 文件的代码!正如 Apache POI 的组件页面(components page)所说明的,HWPF(您所使用的)处理 .doc 文件,而 .docx 文件需要 XWPF。
但是,您几乎肯定不想直接基于 POI 自己编写代码来做简单的文本提取。如果是带有特殊规则的高级需求——当然可以!但对于基本的“请给我一些纯文本”这类需求,已有更高级别的库可以替您完成这一切。
我强烈建议您改用 Apache Tika。Tika 建立在一大堆开源库之上,其中包括 POI。它可以为 Word 的 .doc 和 .docx 文件生成用于索引的纯文本,而且两者使用的是同样的几行代码。它支持数量众多且不断增长的文件格式(large and growing number of file formats),使用起来更容易,而且您需要的一切都已准备就绪!
关于java - 为什么 .doc 文件可以通过 Lucene 建立索引,而 .docx 文件无法建立索引?,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/32009144/