语境
我正在尝试在使用Elasticsearch的应用程序中支持smart-case search
。我要支持的用例是使用智能用例语义能够部分匹配任何文本块。我设法以能够模拟智能案例搜索的方式配置索引。它使用最大长度为8的ngram来满足存储需求。
它的工作方式是,每个文档都有一个生成的case-sensitive
和一个使用case-insensitive
的copy_to
字段以及自己的特定索引策略。在搜索给定输入时,我将输入分为几部分。这取决于ngram的长度,空格和双引号转义。检查每个部分的大写字母。找到大写字母时,它将使用区分大小写的字段为该特定部分生成匹配过滤器,否则使用不区分大小写的字段。
事实证明,这种方法可以很好地工作,但是我很难以我想要的方式突出显示工作。为了更好地解释该问题,我在下面添加了我的测试设置的概述。
设定值
curl -X DELETE localhost:9200/custom
curl -X PUT localhost:9200/custom -d '
{
"settings": {
"analysis": {
"filter": {
"default_min_length": {
"type": "length",
"min": 1
},
"squash_spaces": {
"type": "pattern_replace",
"pattern": "\\s{2,}",
"replacement": " "
}
},
"tokenizer": {
"ngram_tokenizer": {
"type": "nGram",
"min_gram": "2",
"max_gram": "8"
}
},
"analyzer": {
"index_raw": {
"type": "custom",
"filter": ["lowercase","squash_spaces","trim","default_min_length"],
"tokenizer": "keyword"
},
"index_case_insensitive": {
"type": "custom",
"filter": ["lowercase","squash_spaces","trim","default_min_length"],
"tokenizer": "ngram_tokenizer"
},
"search_case_insensitive": {
"type": "custom",
"filter": ["lowercase","squash_spaces","trim"],
"tokenizer": "keyword"
},
"index_case_sensitive": {
"type": "custom",
"filter": ["squash_spaces","trim","default_min_length"],
"tokenizer": "ngram_tokenizer"
},
"search_case_sensitive": {
"type": "custom",
"filter": ["squash_spaces","trim"],
"tokenizer": "keyword"
}
}
}
},
"mappings": {
"_default_": {
"_all": { "enabled": false },
"date_detection": false,
"dynamic_templates": [
{
"case_insensitive": {
"match_mapping_type": "string",
"match": "case_insensitive",
"mapping": {
"type": "string",
"analyzer": "index_case_insensitive",
"search_analyzer": "search_case_insensitive"
}
}
},
{
"case_sensitive": {
"match_mapping_type": "string",
"match": "case_sensitive",
"mapping": {
"type": "string",
"analyzer": "index_case_sensitive",
"search_analyzer": "search_case_sensitive"
}
}
},
{
"text": {
"match_mapping_type": "string",
"mapping": {
"type": "string",
"analyzer": "index_raw",
"copy_to": ["case_insensitive","case_sensitive"],
"fields": {
"case_insensitive": {
"type": "string",
"analyzer": "index_case_insensitive",
"search_analyzer": "search_case_insensitive",
"term_vector": "with_positions_offsets"
},
"case_sensitive": {
"type": "string",
"analyzer": "index_case_sensitive",
"search_analyzer": "search_case_sensitive",
"term_vector": "with_positions_offsets"
}
}
}
}
}
]
}
}
}
'
数据
curl -X POST "http://localhost:9200/custom/test" -d '{ "text" : "tHis .is a! Test" }'
询问
用户搜索:
tHis test
,它被分为两部分,因为ngram的最大长度为8:(1)tHis
和(2)test
。对于(1),使用区分大小写的字段,而(2)使用不区分大小写的字段。curl -X POST "http://localhost:9200/_search" -d '
{
"size": 1,
"query": {
"bool": {
"must": [
{
"match": {
"case_sensitive": {
"query": "tHis",
"type": "boolean"
}
}
},
{
"match": {
"case_insensitive": {
"query": "test",
"type": "boolean"
}
}
}
]
}
},
"highlight": {
"pre_tags": [
"<em>"
],
"post_tags": [
"</em>"
],
"number_of_fragments": 0,
"require_field_match": false,
"fields": {
"*": {}
}
}
}
'
响应
{
"took": 10,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 0.057534896,
"hits": [
{
"_index": "custom",
"_type": "test",
"_id": "1",
"_score": 0.057534896,
"_source": {
"text": "tHis .is a! Test"
},
"highlight": {
"text.case_sensitive": [
"<em>tHis</em> .is a! Test"
],
"text.case_insensitive": [
"tHis .is a!<em> Test</em>"
]
}
}
]
}
}
问题:突出显示
如您所见,响应显示智能案例搜索非常有效。但是,我也想使用突出显示向用户提供反馈。我当前的设置使用
"term_vector": "with_positions_offsets"
生成亮点。这确实可以给人以正确的亮点。但是,突出显示的值将分别区分大小写和不区分大小写地返回。"highlight": {
"text.case_sensitive": [
"<em>tHis</em> .is a! Test"
],
"text.case_insensitive": [
"tHis .is a!<em> Test</em>"
]
}
这要求我在将其返回给用户之前,将同一字段上的多个突出显示手动压缩为一个组合突出显示。当高光变得更复杂并且可以重叠时,这将变得非常痛苦。
题
是否有其他设置可以实际获取组合的突出显示。即我希望将此作为我的回应的一部分。
"highlight": {
"text": [
"<em>tHis</em> .is a!<em> Test</em>"
]
}
最佳答案
尝试
利用突出显示查询获取合并结果:
curl -XPOST 'http://localhost:9200_search' -d '
{
"size": 1,
"query": {
"bool": {
"must": [
{
"match": {
"case_sensitive": {
"query": "tHis",
"type": "boolean"
}
}
},
{
"match": {
"case_insensitive": {
"query": "test",
"type": "boolean"
}
}
}
]
}
},
"highlight": {
"pre_tags": [
"<em>"
],
"post_tags": [
"</em>"
],
"number_of_fragments": 0,
"require_field_match": false,
"fields": {
"*.case_insensitive": {
"highlight_query": {
"bool": {
"must": [
{
"match": {
"*.case_insensitive": {
"query": "tHis",
"type": "boolean"
}
}
},
{
"match": {
"*.case_insensitive": {
"query": "test",
"type": "boolean"
}
}
}
]
}
}
}
}
}
}
'
响应
{
"took": 5,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 0.9364339,
"hits": [
{
"_index": "custom",
"_type": "test",
"_id": "1",
"_score": 0.9364339,
"_source": {
"text": "tHis .is a! Test"
},
"highlight": {
"text.case_insensitive": [
"<em>tHis</em> .is a!<em> Test</em>"
]
}
}
]
}
}
警告
摄取以下内容时,请注意其他小写的
test
关键字:curl -X POST "http://localhost:9200/custom/test" -d '{ "text" : "tHis this .is a! Test" }'
对同一查询的响应变为:
{
"took": 5,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 0.9364339,
"hits": [
{
"_index": "custom",
"_type": "test",
"_id": "1",
"_score": 0.9364339,
"_source": {
"text": "tHis this .is a! Test"
},
"highlight": {
"text.case_insensitive": [
"<em>tHis</em><em> this</em> .is a!<em> Test</em>"
]
}
}
]
}
}
如您所见,现在突出显示的内容还包括小写的
this
。对于这样的测试示例,我们不介意。但是,对于复杂的查询,用户可能(并且可能会)对智能案例何时以及如何产生影响感到困惑。尤其是当小写字母匹配项包含仅小写字母匹配的字段时。结论
此解决方案将为您提供所有合并为突出显示的突出显示,但可能包含不需要的结果。
关于elasticsearch - 使用ElasticSearch进行Smartcase搜索/突出显示,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/36960683/