c# - Lucene.Net 语音(phonetic)搜索怎么用

标签 c# lucene.net

Lucene.Net 的文档极度缺乏——我一直在努力弄清楚如何使用 Lucene.Net 的语音搜索(phonetic searching)功能。

我主要参考了 Lucene.Net Git 项目中 PhoneticFilter 的测试用例:https://github.com/apache/lucenenet/blob/master/src/Lucene.Net.Tests.Analysis.Phonetic/TestPhoneticFilter.cs

我如何创建索引:

// Build the index entry for this vehicle field by field.
// A missing (null) value is indexed as an empty string; every field is stored.
var doc = new Document();
doc.Add(new TextField("brand", vehicle.Brand ?? string.Empty, Field.Store.YES));
doc.Add(new TextField("range", vehicle.Range ?? string.Empty, Field.Store.YES));
doc.Add(new TextField("model", vehicle.Model ?? string.Empty, Field.Store.YES));
doc.Add(new TextField("year", vehicle.Year ?? string.Empty, Field.Store.YES));

// Hand the finished entry to the index writer.
writer.AddDocument(doc);

接下来,从测试用例来看,似乎可以将 PhoneticFilter 添加到 Analyzer 中:

// Open the index and create the searcher that runs on top of it.
using DirectoryReader reader = DirectoryReader.Open(Directory);
IndexSearcher searcher = new IndexSearcher(reader);

// Analyzer handed to every query parser below: the whole input is kept as a
// single token (KeywordTokenizer) and then run through a Soundex PhoneticFilter.
Analyzer analyzer = Analyzer.NewAnonymous(createComponents: (fieldName, textReader) =>
{
    Tokenizer source = new KeywordTokenizer(textReader);
    TokenStream phonetic = new PhoneticFilter(source, new Soundex(), false);
    return new TokenStreamComponents(source, phonetic);
});

// Fetch enough hits to cover the requested page (offset + page size).
int hitsLimit = searchModel.Start + searchModel.Qty;

// Searched fields paired with their boost, highest priority first.
var boostedFields = new (string Name, float Boost)[]
{
    ("brand", 4.0f),
    ("range", 3.0f),
    ("model", 2.0f),
    ("year", 1.0f),
};

// One optional (SHOULD) clause per field, all parsed from the same search term.
BooleanQuery vehicleFilterQuery = new BooleanQuery();
foreach (var (name, boost) in boostedFields)
{
    var parser = new QueryParser(Lucene.Net.Util.LuceneVersion.LUCENE_48, name, analyzer);
    var fieldQuery = ParseQuery(searchModel.SearchTerm, parser);
    fieldQuery.Boost = boost;
    vehicleFilterQuery.Add(fieldQuery, Occur.SHOULD);
}

// Run the search ordered by relevance, then cut out the requested page.
TopDocs topDocs = searcher.Search(vehicleFilterQuery, null, hitsLimit, Sort.RELEVANCE);
ScoreDoc[] page = topDocs.ScoreDocs
    .Skip(searchModel.Start)
    .Take(searchModel.Qty)
    .ToArray();

return MapToModelList(page, searcher);

实现之后,为了测试,我用 Audee 进行搜索,期望得到一批 Audi 结果;但是常规(拼写正确的)搜索可以正常工作,而语音搜索似乎不起任何作用。

我试过 Metaphone()、DoubleMetaphone()、Soundex()、RefinedSoundex()、Caverphone1()、Caverphone2() 这些编码器:

Metaphone https://lucenenetdocs.azurewebsites.net/api/Lucene.Net.Analysis/Lucene.Net.Analysis.Phonetic.Language.Metaphone.html

DoubleMetaphone https://lucenenetdocs.azurewebsites.net/api/Lucene.Net.Analysis/Lucene.Net.Analysis.Phonetic.Language.DoubleMetaphone.html

Soundex https://lucenenetdocs.azurewebsites.net/api/Lucene.Net.Analysis/Lucene.Net.Analysis.Phonetic.Language.Soundex.html

RefinedSoundex https://lucenenetdocs.azurewebsites.net/api/Lucene.Net.Analysis/Lucene.Net.Analysis.Phonetic.Language.RefinedSoundex.html

Caverphone1 https://lucenenetdocs.azurewebsites.net/api/Lucene.Net.Analysis/Lucene.Net.Analysis.Phonetic.Language.Caverphone1.html

Caverphone2 https://lucenenetdocs.azurewebsites.net/api/Lucene.Net.Analysis/Lucene.Net.Analysis.Phonetic.Language.Caverphone2.html

最佳答案

有一个专门的 DoubleMetaphoneFilter,专为基本的语音匹配而设计。只有在需要显式控制所使用的语音算法时,才需要使用 PhoneticFilter。

下面是一个示例(基于这个 StackOverflow 回答):

// Ensures index backward compatibility
private const LuceneVersion AppLuceneVersion = LuceneVersion.LUCENE_48;

/// <summary>
/// Indexes the sample <c>Vehicles</c> with a Double Metaphone analyzer, searches
/// for the misspelled term "audee" across brand, range, model and year, and
/// verifies that the phonetically matching entries are returned.
/// </summary>
[Test]
public void DoubleMetaphoneExample()
{
    // In-memory index; released when the test ends.
    using var directory = new RAMDirectory();

    // Phonetic analyzer shared by indexing AND query parsing, so both sides
    // produce the same encoded terms. The whole field value is one token.
    using var analyzer = Analyzer.NewAnonymous(createComponents: (fieldName, textReader) =>
    {
        var tokenizer = new KeywordTokenizer(input: textReader);
        var stream = new DoubleMetaphoneFilter(input: tokenizer, maxCodeLength: 8, inject: false);
        return new TokenStreamComponents(tokenizer, stream);
    });

    // Index every sample vehicle; null values are indexed as empty strings.
    var indexConfig = new IndexWriterConfig(AppLuceneVersion, analyzer);
    using (var writer = new IndexWriter(directory, indexConfig))
    {
        foreach (var vehicle in Vehicles)
        {
            writer.AddDocument(new Document
            {
                new TextField("brand", vehicle.Brand ?? string.Empty, Field.Store.YES),
                new TextField("range", vehicle.Range ?? string.Empty, Field.Store.YES),
                new TextField("model", vehicle.Model ?? string.Empty, Field.Store.YES),
                new TextField("year", vehicle.Year ?? string.Empty, Field.Store.YES),
            });
        }
    }
    // Done indexing

    // Begin Search
    var searchModel = new { SearchTerm = "audee" };
    const int hitsLimit = 5;

    using var reader = DirectoryReader.Open(directory);
    var searcher = new IndexSearcher(reader);

    // Searched fields paired with their boost, highest priority first.
    var boostedFields = new (string Name, float Boost)[]
    {
        ("brand", 4.0f),
        ("range", 3.0f),
        ("model", 2.0f),
        ("year", 1.0f),
    };

    // One optional (SHOULD) clause per field, all parsed from the same term
    // with the same analyzer and the same Lucene version used for indexing.
    var vehicleFilterQuery = new BooleanQuery();
    foreach (var (name, boost) in boostedFields)
    {
        var parser = new QueryParser(AppLuceneVersion, name, analyzer);
        var fieldQuery = ParseQuery(searchModel.SearchTerm, parser);
        fieldQuery.Boost = boost;
        vehicleFilterQuery.Add(fieldQuery, Occur.SHOULD);
    }

    var topDocs = searcher.Search(vehicleFilterQuery, null, hitsLimit, Sort.RELEVANCE);

    // Expected hits: doc = 2, 3, 4, 7 (the "Audi"/"Audie" entries of Vehicles).
    // The ids are sorted before comparing so the check does not depend on how
    // equally scored hits are ordered.
    var actualDocIds = new int[topDocs.ScoreDocs.Length];
    for (int i = 0; i < actualDocIds.Length; i++)
    {
        actualDocIds[i] = topDocs.ScoreDocs[i].Doc;
    }
    System.Array.Sort(actualDocIds);

    Assert.That(topDocs.TotalHits, Is.EqualTo(4));
    Assert.That(actualDocIds, Is.EqualTo(new[] { 2, 3, 4, 7 }));
}

/// <summary>
/// Parses <paramref name="searchTerm"/> with the supplied parser.
/// </summary>
/// <param name="searchTerm">Raw search text to parse.</param>
/// <param name="queryParser">Parser (already bound to a field and analyzer) that does the parsing.</param>
/// <returns>The query produced by <paramref name="queryParser"/>.</returns>
private static Search.Query ParseQuery(string searchTerm, QueryParser queryParser)
    => queryParser.Parse(searchTerm);

// Sample data for the example. Entries are indexed in array order, and the
// hits listed in the example (doc = 2, 3, 4, 7) line up with the
// "Audi"/"Audie" entries below. Range is intentionally left unset.
private static Vehicle[] Vehicles = new[]
{
    /* 0 */ new Vehicle { Brand = "Ford", Model = "Taurus", Year = "1986" },
    /* 1 */ new Vehicle { Brand = "Ford", Model = "Fiesta", Year = "1990" },
    /* 2 */ new Vehicle { Brand = "Audi", Model = "A4 45", Year = "2021" },
    /* 3 */ new Vehicle { Brand = "Audi", Model = "Q3 45 S", Year = "2021" },
    /* 4 */ new Vehicle { Brand = "Audie", Model = "Q3 45 S", Year = "2021" },
    /* 5 */ new Vehicle { Brand = "Toyota", Model = "Corolla", Year = "2010" },
    /* 6 */ new Vehicle { Brand = "Toyota", Model = "Hilux", Year = "2015" },
    /* 7 */ new Vehicle { Brand = "Audi", Model = "A4", Year = "2017" },
};

/// <summary>
/// Plain data holder for one vehicle; each property is indexed as a text field
/// of the same (lower-cased) name.
/// </summary>
public class Vehicle
{
    /// <summary>Manufacturer name, e.g. "Audi".</summary>
    public string Brand { get; set; }

    /// <summary>Model range; may be left null.</summary>
    public string Range { get; set; }

    /// <summary>Model designation, e.g. "A4".</summary>
    public string Model { get; set; }

    /// <summary>Model year kept as text, e.g. "2021".</summary>
    public string Year { get; set; }
}

或者,您可以使用 PhoneticFilter 来选择算法。我用 DoubleMetaphone 尝试了您的示例(设置和不设置 MaxCodeLen)并且效果很好。

// Same pipeline, but with the phonetic encoder chosen explicitly.
var encoder = new DoubleMetaphone { MaxCodeLen = 8 };
var stream = new PhoneticFilter(tokenizer, encoder, inject: false);

维基百科上有一些关于语音算法(phonetic algorithms)的一般性介绍。在大多数情况下,Double Metaphone 是处理英语的最佳选择。

关于 c# - Lucene.Net 语音(phonetic)搜索怎么用,我们在 Stack Overflow 上找到一个类似的问题: https://stackoverflow.com/questions/60654369/

相关文章:

Lucene查询- "Match exactly one of x, y, z"

c# - 有没有用C# + ASP.NET实现国际化的基本好教程?

sitecore - 通过 Lucene.Net 获取最近的文章

c# - 方法在读取 TCP 套接字数据时结束

c# - 没有构造函数的结构体

full-text-search - Lucene 检查的搜索条件

c# - 将 Lucene 索引文件存储到远程位置

lucene - 在 Lucene/Lucene.net 搜索中,如何计算每个文档的点击次数?

c# - 类(class)提供进度信息的方式有哪些

c# - 使用通过 Intellisense 生成代码的 MSBuild 任务的正确方法