elasticsearch - 使用Nest搜索无法获得预期结果

标签 elasticsearch nest

我正在使用以下代码创建索引:

        var ElasticSettings = new ConnectionSettings(new Uri(ConnectionString))
            .DefaultIndex(_IndexName)
        .DefaultMappingFor<PictureObject>(M => M
            .Ignore(_ => _._id)
            .Ignore(_ => _.Log))
            .DefaultFieldNameInferrer(_ => _);

    _ElasticClient = new ElasticClient(ElasticSettings);

    if (!_ElasticClient.IndexExists(_IndexName).Exists)
    {
        var I = _ElasticClient.CreateIndex(_IndexName, Ci => Ci
            .Settings(S => S
                .Analysis(A => A
                    .CharFilters(Cf => Cf.Mapping("expressions",
                        E => E.Mappings(ExpressionsList))
                    )
                    .TokenFilters(Tf => Tf.Synonym("synonyms",
                        Descriptor => new SynonymTokenFilter
                        {
                            Synonyms = SynonymsList,
                            Tokenizer = "whitespace"
                        })
                    )
                    .Analyzers(Analyzer => Analyzer
                        .Custom("index", C => C
                            .CharFilters("expressions")
                            .Tokenizer("standard")
                            .Filters("synonyms", "standard", "lowercase", "stop")
                        )
                        .Custom("search", C => C
                            .CharFilters("expressions")
                            .Tokenizer("standard")
                            .Filters("synonyms", "standard", "lowercase", "stop")
                        )
                    )
                )
            )
            .Mappings(Mapping => Mapping
                .Map<PictureObject>(Map => Map
                    .AutoMap()
                    .Properties(P => P
                        .Text(T => T
                            .Name(N => N.Title)
                            .Analyzer("index")
                            .SearchAnalyzer("search")
                        )
                        .Text(T => T
                            .Name(N => N.Tags)
                            .Analyzer("index")
                            .SearchAnalyzer("search")
                        )
                    )
                )
            )
        );

我要搜索的字段是 Title(标题)和 Tags(标签)。

我的同义词是这种格式:

[ "big => large, huge", "small => tiny, minuscule", ]



我的表达式(expressions)是这种格式:

[ "stormy weather => storm", "happy day => joy", ]



我正在使用以下两种方法进行测试:
var Test1 = _ElasticClient.Search<PictureObject>(S => S
        .From(From)
        .Size(Take)
        .Query(_ => _.Fuzzy(Fuzz => Fuzz.Field(F => F.Tags).Field(T => T.Title).Value(Terms).MaxExpansions(2)))).Documents;

var resTest2 = _ElasticClient.Search<PictureObject>(S => S
        .Query(_ => _.QueryString(F => F.Query(Terms)))
        .From(From)
        .Size(Take));

当尝试完全匹配标签字段中的术语时,这两个函数将返回不同的结果。
尝试使用同义词时,结果再次有所不同。

(最终,我也想处理拼写错误,但现在我只使用逐字字符串进行测试)

我遗漏了什么?(我对这个 API 还不太熟悉,因此错误可能非常明显)

编辑:
这是一个可以编译的完整示例。
namespace Test
{
    using System;
    using System.Collections.Generic;
    using Nest;

    /// <summary>
    /// Sample document type that this repro indexes into and searches from Elasticsearch.
    /// </summary>
    public class MyData
    {
        // Configured as the document id via IdProperty("Id") in the connection settings.
        public string Id;
        // Mapped as a text field using the custom "index" / "search" analyzers.
        public string Title;
        // Mapped as a text field using the custom "index" / "search" analyzers.
        public string Tags;
    }

    /// <summary>
    /// Console repro for the question: recreates the "testindex" index with a custom
    /// analysis chain, indexes three sample documents and runs Fuzzy and QueryString
    /// searches against them.
    /// </summary>
    public static class Program
    {
        /// <summary>
        /// Entry point. Deletes the index if it exists, creates it again, indexes the
        /// sample data, refreshes the index and issues the test queries. The results are
        /// only stored in locals; nothing is printed or asserted.
        /// </summary>
        public static void Main()
        {
            const string INDEX_NAME = "testindex";

            // Rules fed to the mapping char filter named "expressions" below.
            var ExpressionsList = new[]
            {
                "bad weather => storm",
                "happy day => sun"
            };

            // Rules fed to the synonym token filter named "synonyms" below.
            // All three rules use the "left => right" form.
            var SynonymsList = new[]
            {
                "big => large, huge",
                "small => tiny, minuscule",
                "sun => sunshine, shiny, sunny"
            };

            // connect
            var ElasticSettings = new ConnectionSettings(new Uri("http://elasticsearch:9200"))
                .DefaultIndex(INDEX_NAME)
                .DefaultFieldNameInferrer(_ => _) // identity inferrer: stop the camel case
                .DefaultMappingFor<MyData>(M => M.IdProperty("Id"));

            var Client = new ElasticClient(ElasticSettings);

            // erase the old index, if any
            if (Client.IndexExists(INDEX_NAME).Exists) Client.DeleteIndex(INDEX_NAME);

            // create the index
            // NOTE(review): the response is stored in I but never inspected, so a failed
            // index creation would go unnoticed here.
            var I = Client.CreateIndex(INDEX_NAME, Ci => Ci
                .Settings(S => S
                    .Analysis(A => A
                        .CharFilters(Cf => Cf.Mapping("expressions",
                            E => E.Mappings(ExpressionsList))
                        )
                        .TokenFilters(Tf => Tf.Synonym("synonyms",
                            Descriptor => new SynonymTokenFilter
                            {
                                Synonyms = SynonymsList,
                                Tokenizer = "whitespace"
                            })
                        )
                        // The "index" and "search" analyzers below are configured identically.
                        // NOTE(review): "synonyms" is listed before "lowercase" in both filter
                        // chains; confirm that this order is intended.
                        .Analyzers(Analyzer => Analyzer
                            .Custom("index", C => C
                                .CharFilters("expressions")
                                .Tokenizer("standard")
                                .Filters("synonyms", "standard", "lowercase", "stop")
                            )
                            .Custom("search", C => C
                                .CharFilters("expressions")
                                .Tokenizer("standard")
                                .Filters("synonyms", "standard", "lowercase", "stop")
                            )
                        )
                    )
                )
                .Mappings(Mapping => Mapping
                    .Map<MyData>(Map => Map
                        .AutoMap()
                        // Title and Tags are explicitly mapped as text fields that use the
                        // custom analyzers defined above.
                        .Properties(P => P
                            .Text(T => T
                                .Name(N => N.Title)
                                .Analyzer("index")
                                .SearchAnalyzer("search")
                            )
                            .Text(T => T
                                .Name(N => N.Tags)
                                .Analyzer("index")
                                .SearchAnalyzer("search")
                            )
                        )
                    )
                )
            );

            // add some data
            var Data = new List<MyData>
            {
                new MyData { Id = "1", Title = "nice stormy weather", Tags = "storm nice" },
                new MyData { Id = "2", Title = "a large storm with sunshine", Tags = "storm large sunshine" },
                new MyData { Id = "3", Title = "a storm during a sunny day", Tags = "sun storm" }
            };

            Client.IndexMany(Data);
            // Refresh right after indexing, before the searches below run.
            Client.Refresh(INDEX_NAME);


            // do some queries
            // Fuzzy queries: A1 uses the correctly spelled "stormy sunny", A2 the misspelt "stromy sunny".
            // NOTE(review): Field(...) is called twice on the same fuzzy descriptor; presumably
            // only one of the two fields ends up in the query - confirm against the NEST docs.
            var TestA1 = Client.Search<MyData>(S => S.Query(_ => _.Fuzzy(Fuzz => Fuzz.Field(F => F.Tags).Field(T => T.Title).Value("stormy sunny").MaxExpansions(2)))).Documents;
            var TestA2 = Client.Search<MyData>(S => S.Query(_ => _.Fuzzy(Fuzz => Fuzz.Field(F => F.Tags).Field(T => T.Title).Value("stromy sunny").MaxExpansions(2)))).Documents;

            // QueryString queries: no field is specified in any of the three calls below.
            var TestB1 = Client.Search<MyData>(S => S.Query(_ => _.QueryString(F => F.Query("stormy sunny")))).Documents;
            // expected to return documents 1, 2, 3 because of synonyms: sun => sunny, shiny, sunshine

            // B2 and B3 use inputs that contain the left-hand sides of the expression rules
            // ("bad weather", "happy day").
            var TestB2 = Client.Search<MyData>(S => S.Query(_ => _.QueryString(F => F.Query("bad weather")))).Documents;
            var TestB3 = Client.Search<MyData>(S => S.Query(_ => _.QueryString(F => F.Query("a large happy day")))).Documents;

            /*
             * I'm expecting the fuzzy queries to handle misspellings
             * Also, I'm expecting the expressions and synonyms to do the substitutions as they're written
             *
             * Ideally I'd like to handle:
             *  - expressions
             *  - synonyms
             *  - misspellings
             *
             * all together
             *
             * I have tried a lot of string examples while debugging and it's really hit or miss.
             * Unfortunately, I haven't kept the strings, but it was enough to see that there is something
             * wrong with my approach in this code.
             */
        }
    }
}

最佳答案

这里有一些指示,可以帮助您走上正确的路

字符过滤器

var ExpressionsList = new[]
{
    "bad weather => storm",
    "happy day => sun"
};


考虑这些是否真的应该作为字符过滤器;它们可以是,但字符过滤器通常用于分词器(tokenizer)可能会错误分词的场合,例如
  • 在分词之前剥离 HTML 标记
  • 标准分词器会删除 &,而理想情况下我们希望保留它,并在字符过滤器中将其替换为 and
  • 标准分词器会将 c# 分词为 c,而理想情况下我们希望保留它,并在字符过滤器中将其替换为 csharp
    也许您确实需要使用字符过滤器,但对于多单词的情况,用同义词(synonym)或同义词图(synonym graph)过滤器来处理可能更好。

    自定义分析器
    index 和 search 这两个自定义分析器完全相同,您可以删除其中一个。另外,如果没有显式设置,text 数据类型字段的 search_analyzer 默认就是已配置的 analyzer,因此这里可以进一步简化。

    同义词

    var SynonymsList = new[]
    {
        "big => large, huge",
        "small => tiny, minuscule",
        "sun => sunshine, shiny, sunny"
    };
    


    这是一个有方向的同义词映射(directional synonym map),即左侧匹配到的词会被替换为右侧的所有备选词。如果所有词都应被视为彼此等价的同义词,那么您可能并不需要带方向的映射,即
    var SynonymsList = new[]
    {
        "big, large, huge",
        "small, tiny, minuscule",
        "sun, sunshine, shiny, sunny"
    };
    

    这将返回所有3个文档

    var TestB1 = Client.Search<MyData>(S => S.Query(_ => _.QueryString(F => F.Query("stormy sunny")))).Documents;
    // expected to return documents 1, 2, 3 because of synonyms: sun => sunny, shiny, sunshine
    


    token 过滤器

    .Custom("index", C => C
        .CharFilters("expressions")
        .Tokenizer("standard")
        .Filters("synonyms", "standard", "lowercase", "stop")
    )
    .Custom("search", C => C
        .CharFilters("expressions")
        .Tokenizer("standard")
        .Filters("synonyms", "standard", "lowercase", "stop")
    )
    


    token 过滤器的顺序很重要,因此您要在小写过滤器之后运行同义词过滤器

    模糊查询

    模糊查询(fuzzy query)是词项级(term-level)查询,查询输入不会经过分析。因此,如果针对一个在索引时经过分析的字段运行模糊查询,查询输入必须与索引时分析所输出的词项相匹配。如果查询输入在索引时会被切分成多个词项,就可能得不到正确的结果:模糊查询的输入会被当作一个完整的词项,而目标文档字段在索引时的值可能已被拆分成多个词项。

    请参阅《权威指南》中的 Fuzziness 一节——它针对的是 Elasticsearch 2.x,但在很大程度上仍适用于更高版本。您可能更希望使用既支持模糊匹配、又会在查询时执行分析的全文查询,例如 query_string、match 或 multi_match 查询。

    一个例子

    放在一起,这是开发时要使用的示例
    /// <summary>
    /// Sample document type that the example indexes into and searches from Elasticsearch.
    /// </summary>
    public class MyData
    {
        // Configured as the document id via IdProperty("Id") in the connection settings.
        public string Id;
        // Mapped as a text field using the custom "index" analyzer.
        public string Title;
        // Mapped as a text field using the custom "index" analyzer.
        public string Tags;
    }
    
    /// <summary>
    /// Development sample: recreates the test index with a single custom analyzer
    /// (mapping char filter plus synonym token filter), indexes three documents and runs a
    /// fuzzy multi_match query over the Tags and Title fields, printing the matching documents.
    /// </summary>
    /// <exception cref="InvalidOperationException">
    /// Thrown when creating the index or bulk-indexing the sample documents fails.
    /// </exception>
    public static void Main()
    {
        const string indexName = "testindex";

        // Rules for the mapping char filter named "expressions".
        var expressions = new[]
        {
            "bad weather => storm",
            "happy day => sun"
        };

        // Comma-separated groups (no "=>"), so the terms in a group are treated as equals.
        var synonyms = new[]
        {
            "big, large, huge",
            "small, tiny, minuscule",
            "sun, sunshine, shiny, sunny"
        };

        // connect
        var settings = new ConnectionSettings(new Uri("http://localhost:9200"))
            .DefaultIndex(indexName)
            .DefaultFieldNameInferrer(s => s) // stop the camel case
            .DefaultMappingFor<MyData>(m => m.IdProperty("Id"))
            // The three calls below only exist to echo every request and response to the
            // console while developing; remove them for production because they add overhead.
            .DisableDirectStreaming()
            .PrettyJson()
            .OnRequestCompleted(callDetails =>
            {
                // Encoding is fully qualified so the sample does not depend on a
                // "using System.Text;" directive being present in the file.
                if (callDetails.RequestBodyInBytes != null)
                {
                    Console.WriteLine(
                        $"{callDetails.HttpMethod} {callDetails.Uri} \n" +
                        $"{System.Text.Encoding.UTF8.GetString(callDetails.RequestBodyInBytes)}");
                }
                else
                {
                    Console.WriteLine($"{callDetails.HttpMethod} {callDetails.Uri}");
                }

                Console.WriteLine();

                if (callDetails.ResponseBodyInBytes != null)
                {
                    Console.WriteLine($"Status: {callDetails.HttpStatusCode}\n" +
                             $"{System.Text.Encoding.UTF8.GetString(callDetails.ResponseBodyInBytes)}\n" +
                             $"{new string('-', 30)}\n");
                }
                else
                {
                    Console.WriteLine($"Status: {callDetails.HttpStatusCode}\n" +
                             $"{new string('-', 30)}\n");
                }
            });

        var client = new ElasticClient(settings);

        // erase the old index, if any
        if (client.IndexExists(indexName).Exists)
        {
            client.DeleteIndex(indexName);
        }

        // create the index
        var createIndexResponse = client.CreateIndex(indexName, c => c
            .Settings(s => s
                .Analysis(a => a
                    .CharFilters(cf => cf
                        .Mapping("expressions", mp => mp
                            .Mappings(expressions)
                        )
                    )
                    .TokenFilters(tf => tf
                        .Synonym("synonyms", sy => sy
                            .Synonyms(synonyms)
                            .Tokenizer("whitespace")
                        )
                    )
                    .Analyzers(an => an
                        // One analyzer serves both index and search time, because the
                        // mappings below set no separate search analyzer.
                        .Custom("index", ca => ca
                            .CharFilters("expressions")
                            .Tokenizer("standard")
                            // Filter order matters: "synonyms" runs after "lowercase".
                            .Filters("standard", "lowercase", "synonyms",  "stop")
                        )
                    )
                )
            )
            .Mappings(m => m
                .Map<MyData>(mm => mm
                    .AutoMap()
                    .Properties(p => p
                        .Text(t => t
                            .Name(n => n.Title)
                            .Analyzer("index")
                        )
                        .Text(t => t
                            .Name(n => n.Tags)
                            .Analyzer("index")
                        )
                    )
                )
            )
        );

        // Fail fast: every later step is meaningless if the index or analyzers were rejected.
        if (!createIndexResponse.IsValid)
        {
            throw new InvalidOperationException(
                $"Creating index '{indexName}' failed: {createIndexResponse.DebugInformation}");
        }

        // add some data
        var data = new List<MyData>
            {
                new MyData { Id = "1", Title = "nice stormy weather", Tags = "storm nice" },
                new MyData { Id = "2", Title = "a large storm with sunshine", Tags = "storm large sunshine" },
                new MyData { Id = "3", Title = "a storm during a sunny day", Tags = "sun storm" }
            };

        var bulkResponse = client.IndexMany(data);
        if (!bulkResponse.IsValid)
        {
            throw new InvalidOperationException(
                $"Indexing the sample documents failed: {bulkResponse.DebugInformation}");
        }

        // Refresh before searching, as the original sample does.
        client.Refresh(indexName);

        // Other inputs to try: "stormy sunny", "bad weather", "a large happy day".
        // The active one is deliberately misspelt to exercise fuzziness.
        var query = "stromy sunny";

        var searchResponse = client.Search<MyData>(s => s
            .Query(q => q
                .MultiMatch(fu => fu
                    .Fields(f => f
                        .Field(ff => ff.Tags)
                        .Field(ff => ff.Title)
                    )
                    .Query(query)
                    .Fuzziness(Fuzziness.EditDistance(2))
                )
            )
        );

        // Print the hits so the outcome is visible without attaching a debugger.
        foreach (var document in searchResponse.Documents)
        {
            Console.WriteLine($"{document.Id}: {document.Title} [{document.Tags}]");
        }
    }
    

    我在连接设置中添加了 .DisableDirectStreaming()、.PrettyJson() 和 .OnRequestCompleted(...) 处理程序,以便您可以看到写入控制台的请求和响应。它们在开发过程中很有用,但由于会增加开销,在生产环境中您可能需要将其移除。开发时使用 LINQPad 这样的小工具会很方便。

    该示例使用启用了模糊匹配的 multi_match 查询,编辑距离为 2(此处或许直接使用自动模糊度即可,它的处理比较合理),并在 Tags 和 Title 字段上运行。对于(拼写错误的)查询 "stromy sunny",它会返回全部三个文档。

    关于elasticsearch - 使用Nest搜索无法获得预期结果,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/52840944/

    相关文章:

    amazon-web-services - 是否会使用 cloudformation 配置对 Amazon Elasticsearch Service 域的 VPC 支持

    elasticsearch - 无法将Filebeat连接到Logstash以使用ELK进行日志记录

    c# - NEST - 索引各个字段

    c# - 精确文本匹配的 NEST 查询

    Spring 中的 ElasticSearch 与 @Query

    c# - 在 Elasticsearch 和嵌套中传递和比较具有不同时区的日期时间值

    Nest 2.0启用跟踪

    c# - 如何使用应存在的2个字段编写Nest查询

    c# - 带有期限和日期范围的汇总