elasticsearch - 在 Elasticsearch 中突出显示术语问题

标签 elasticsearch

  1. We created an index with the settings and mapping below.
 PUT http://localhost:9200/essearch
{ 
"mappings": {
        "object": {
            "_all": {
                "enabled": false
            },
            "properties": {
               "content": {
                    "type": "text",
                    "term_vector": "with_positions_offsets",
                    "similarity": "classic",
                    "analyzer": "content_standard"
                },
                "content_phonic": {
                    "type": "text",
                    "term_vector": "with_positions_offsets",
                    "similarity": "classic",
                    "analyzer": "content_phonetic"
                },
                "content_stemming": {
                    "type": "text",
                    "term_vector": "with_positions_offsets",
                    "similarity": "classic",
                    "analyzer": "content_stemming"
                }
            }
        }
    },
 "settings": {
        "index": {
            "number_of_shards": "1",
            "similarity": {
                "default": {
                    "type": "classic"
                }
            },
            "max_result_window": "50000",
            "mapper": {
                "dynamic": "false"
            },
            "analysis": {
                "filter": {
                    "content_phonetic": {
                        "type": "phonetic",
                        "encoder": "doublemetaphone"
                    },
                    "StopWords": {
                        "type": "stop",
                        "stopwords": [
                            "after",
                            "all",
                            "under",
                            "very",
                            "well"]
                    }
                },
                "analyzer": {
                    "content_phonetic": {
                        "filter": [
                            "content_phonetic"
                        ],
                        "char_filter": [
                            "CharFilters"
                        ],
                        "type": "custom",
                        "tokenizer": "standard"
                    },
                    "content_stemming": {
                        "filter": [
                            "lowercase",
                            "porter_stem"
                        ],
                        "char_filter": [
                            "CharFilters"
                        ],
                        "type": "custom",
                        "tokenizer": "standard"
                    },
                    "content_standard": {
                        "filter": [
                            "lowercase",
                            "StopWords"
                        ],
                        "char_filter": [
                            "CharFilters"
                        ],
                        "type": "custom",
                        "tokenizer": "standard"
                    }
                },
                "char_filter": {
                    "CharFilters": {
                        "type": "mapping",
                        "mappings": [
                            ". => ' '",
                            "' => ' '",
                            "_ => ' '",
                            ": => ' '"
                        ]
                    }
                }
            },
            "number_of_replicas": "0"
        }
    }}

2: Indexed a document

 http://localhost:9200/essearch/object/1
{ "content" : "beginning thirty days after the anticipated COD. 
             Buyer shall be responsible for all natural gas and electrical imbalance charges.
             All prices shall be at the Reference Conditions.
             Buyer’s performance of its obligations under the ECSA with a form of guarantee in an amount. Seller shall assign its rights under said requests to Buyer.  Buyer shall have full dispatch rights subject to operational parameters  (including ramp rates. buyer said to me..."   }

3: Performed Highlight query

    http://localhost:9200/essearch/_search
 {
 "highlight": {
"pre_tags": [ "<term0 style='background-color:Lime'>", "<term1 style='background-color:Chocolate'>", "<term2 style='background-color:Pink'>"
],"post_tags": [ "</term0>", "</term1>", "</term2>" ],
"encoder": "html",
"fields": { "content": { "fragment_size": 50, "number_of_fragments": 0, "type": "fvh" } } },
"_source": false,
"query": {
"bool": {
  "must": [
    {
      "query_string": {
        "query": "(\"under said\") OR (said) OR (buyer)",
        "default_field": "content"}} ],
  "filter": [
    {
      "ids": {
        "values": [ "1" ] } } ] } } }

4: Highlight Query Output

    {
"took": 0,
"timed_out": false,
"_shards": {
    "total": 1,
    "successful": 1,
    "failed": 0
},
"hits": {
    "total": 1,
    "max_score": 0.30490398,
    "hits": [
        {
            "_index": "essearch",
            "_type": "object",
            "_id": "1",
            "_score": 0.30490398,
            "highlight": {
                "content": [
                    "beginning thirty days after the anticipated COD.
                    <term1 style='background-color:Chocolate'>Buyer</term1> 
                    shall be responsible for all natural gas and electrical imbalance charges.
                    All prices shall be at the Reference Conditions.Buyer’s performance of its obligations under the ECSA with a form of guarantee in an amount. Seller shall assign its rights under <term0 style='background-color:Lime'>said</term0> requests    to <term1 style='background-color:Chocolate'>Buyer</term1>. <term1 style='background-color:Chocolate'>Buyer</term1> shall have full dispatch rights subject to operational parameters (including ramp rates. <term1 style='background-color:Chocolate'>buyer</term1> <term0 style='background-color:Lime'>said</term0> to me..."
                ]    }  } ] } }

如您所见,我们按照查询中词项的数量提供了前置和后置标签:查询中有 3 个用 OR 运算符连接的词项,因此一共提供了三组前置/后置标签。执行高亮查询后,按顺序应当将 term1 标签应用于 “said”,但 ES 却将 term0 标签应用于 “said”,并将 term1 标签应用于 “buyer”。

最佳答案

我不确定这是否是您的映射造成的问题,因为我使用相同的文本和查询运行后得到了预期的结果。也许这与您查询中的其他字段有关。我发现使用 highlight_query(高亮查询)来单独指定需要高亮的内容,有助于理清标签的应用顺序。

"beginning thirty days after the anticipated COD. \n             <term2 style='background-color:Pink'>Buyer</term2> shall be responsible for all natural gas and electrical imbalance charges.\n             All prices shall be at the Reference Conditions.\n             <term2 style='background-color:Pink'>Buyer’s</term2> performance of its obligations under the ECSA with a form of guarantee in an amount. Seller shall assign its rights <term0 style='background-color:Lime'>under said</term0> requests to <term2 style='background-color:Pink'>Buyer</term2>.  <term2 style='background-color:Pink'>Buyer</term2> shall have full dispatch rights subject to operational parameters  (including ramp rates. <term2 style='background-color:Pink'>buyer</term2> <term1 style='background-color:Chocolate'>said</term1> to me..."

我的映射:

{
        "mappings": {
            "properties": {
                "text": {
                    "type": "text",
                    "term_vector": "with_positions_offsets",
                    "analyzer": "english"
                }                
            }
        }
    }

我的文档:

{"text": """beginning thirty days after the anticipated COD. 
             Buyer shall be responsible for all natural gas and electrical imbalance charges.
             All prices shall be at the Reference Conditions.
             Buyer’s performance of its obligations under the ECSA with a form of guarantee in an amount. Seller shall assign its rights under said requests to Buyer.  Buyer shall have full dispatch rights subject to operational parameters  (including ramp rates. buyer said to me..."""}

我的查询:

{
"query": {
"bool": {
  "must": [
    {
      "query_string": {
        "query": "(\"under said\") OR (said) OR (buyer)",
        "default_field": "text"}} ]
   } } }

关于elasticsearch - 在 Elasticsearch 中突出显示术语问题,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/52871588/

相关文章:

hadoop - Kibana、Logstash 大数据环境

elasticsearch - Elasticsearch中的嵌套查询

elasticsearch - 标点不正确的结果-ElasticSearch

filter - elasticsearch - 将嵌套字段与文档中的另一个字段进行比较

elasticsearch - 在Elasticsearch中将数字用作类型

node.js - elasticSearch/npm:Elasticdump返回 “self signed certificate”错误

elasticsearch - Elasticsearch:复合聚合支持最小文档数过滤器吗?

当单词以 n-gram 开头时,Elasticsearch Edge NGram 分词器得分更高

python - Django Haystack/Elasticsearch dwithin 只返回一个 SearchResult。期待几个

elasticsearch - 根据特定条件在Elasticsearch中建立索引之前过滤掉文档