elasticsearch - Elasticsearch-将ngrams用作标记器和过滤器可提供不同的输出

标签 elasticsearch

有人可以解释一下,为什么将ngrams用作tokenizer(标记器)与将其用作过滤器相比会产生不同的输出吗?例如,将其用作 "paracetamol"(扑热息痛)的标记器时,我得到:

{
   "tokens": [
      {
         "token": "par",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "para",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "parac",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "parace",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "paracet",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "paraceta",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "paracetam",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "paracetamo",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "paracetamol",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "ara",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "arac",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "arace",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "aracet",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "araceta",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "aracetam",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "aracetamo",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "aracetamol",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "rac",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "race",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "racet",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "raceta",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "racetam",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "racetamo",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "racetamol",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "ace",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "acet",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "aceta",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "acetam",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "acetamo",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "acetamol",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "cet",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "ceta",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "cetam",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "cetamo",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "cetamol",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "eta",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "etam",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "etamo",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "etamol",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "tam",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "tamo",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "tamol",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "amo",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "amol",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "mol",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      }
   ]
}

而将其用作过滤器时,我得到:
{
   "tokens": [
      {
         "token": "par",
         "start_offset": 0,
         "end_offset": 3,
         "type": "word",
         "position": 1
      },
      {
         "token": "para",
         "start_offset": 0,
         "end_offset": 4,
         "type": "word",
         "position": 2
      },
      {
         "token": "parac",
         "start_offset": 0,
         "end_offset": 5,
         "type": "word",
         "position": 3
      },
      {
         "token": "parace",
         "start_offset": 0,
         "end_offset": 6,
         "type": "word",
         "position": 4
      },
      {
         "token": "paracet",
         "start_offset": 0,
         "end_offset": 7,
         "type": "word",
         "position": 5
      },
      {
         "token": "paraceta",
         "start_offset": 0,
         "end_offset": 8,
         "type": "word",
         "position": 6
      },
      {
         "token": "paracetam",
         "start_offset": 0,
         "end_offset": 9,
         "type": "word",
         "position": 7
      },
      {
         "token": "paracetamo",
         "start_offset": 0,
         "end_offset": 10,
         "type": "word",
         "position": 8
      },
      {
         "token": "paracetamol",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 9
      },
      {
         "token": "ara",
         "start_offset": 1,
         "end_offset": 4,
         "type": "word",
         "position": 10
      },
      {
         "token": "arac",
         "start_offset": 1,
         "end_offset": 5,
         "type": "word",
         "position": 11
      },
      {
         "token": "arace",
         "start_offset": 1,
         "end_offset": 6,
         "type": "word",
         "position": 12
      },
      {
         "token": "aracet",
         "start_offset": 1,
         "end_offset": 7,
         "type": "word",
         "position": 13
      },
      {
         "token": "araceta",
         "start_offset": 1,
         "end_offset": 8,
         "type": "word",
         "position": 14
      },
      {
         "token": "aracetam",
         "start_offset": 1,
         "end_offset": 9,
         "type": "word",
         "position": 15
      },
      {
         "token": "aracetamo",
         "start_offset": 1,
         "end_offset": 10,
         "type": "word",
         "position": 16
      },
      {
         "token": "aracetamol",
         "start_offset": 1,
         "end_offset": 11,
         "type": "word",
         "position": 17
      },
      {
         "token": "rac",
         "start_offset": 2,
         "end_offset": 5,
         "type": "word",
         "position": 18
      },
      {
         "token": "race",
         "start_offset": 2,
         "end_offset": 6,
         "type": "word",
         "position": 19
      },
      {
         "token": "racet",
         "start_offset": 2,
         "end_offset": 7,
         "type": "word",
         "position": 20
      },
      {
         "token": "raceta",
         "start_offset": 2,
         "end_offset": 8,
         "type": "word",
         "position": 21
      },
      {
         "token": "racetam",
         "start_offset": 2,
         "end_offset": 9,
         "type": "word",
         "position": 22
      },
      {
         "token": "racetamo",
         "start_offset": 2,
         "end_offset": 10,
         "type": "word",
         "position": 23
      },
      {
         "token": "racetamol",
         "start_offset": 2,
         "end_offset": 11,
         "type": "word",
         "position": 24
      },
      {
         "token": "ace",
         "start_offset": 3,
         "end_offset": 6,
         "type": "word",
         "position": 25
      },
      {
         "token": "acet",
         "start_offset": 3,
         "end_offset": 7,
         "type": "word",
         "position": 26
      },
      {
         "token": "aceta",
         "start_offset": 3,
         "end_offset": 8,
         "type": "word",
         "position": 27
      },
      {
         "token": "acetam",
         "start_offset": 3,
         "end_offset": 9,
         "type": "word",
         "position": 28
      },
      {
         "token": "acetamo",
         "start_offset": 3,
         "end_offset": 10,
         "type": "word",
         "position": 29
      },
      {
         "token": "acetamol",
         "start_offset": 3,
         "end_offset": 11,
         "type": "word",
         "position": 30
      },
      {
         "token": "cet",
         "start_offset": 4,
         "end_offset": 7,
         "type": "word",
         "position": 31
      },
      {
         "token": "ceta",
         "start_offset": 4,
         "end_offset": 8,
         "type": "word",
         "position": 32
      },
      {
         "token": "cetam",
         "start_offset": 4,
         "end_offset": 9,
         "type": "word",
         "position": 33
      },
      {
         "token": "cetamo",
         "start_offset": 4,
         "end_offset": 10,
         "type": "word",
         "position": 34
      },
      {
         "token": "cetamol",
         "start_offset": 4,
         "end_offset": 11,
         "type": "word",
         "position": 35
      },
      {
         "token": "eta",
         "start_offset": 5,
         "end_offset": 8,
         "type": "word",
         "position": 36
      },
      {
         "token": "etam",
         "start_offset": 5,
         "end_offset": 9,
         "type": "word",
         "position": 37
      },
      {
         "token": "etamo",
         "start_offset": 5,
         "end_offset": 10,
         "type": "word",
         "position": 38
      },
      {
         "token": "etamol",
         "start_offset": 5,
         "end_offset": 11,
         "type": "word",
         "position": 39
      },
      {
         "token": "tam",
         "start_offset": 6,
         "end_offset": 9,
         "type": "word",
         "position": 40
      },
      {
         "token": "tamo",
         "start_offset": 6,
         "end_offset": 10,
         "type": "word",
         "position": 41
      },
      {
         "token": "tamol",
         "start_offset": 6,
         "end_offset": 11,
         "type": "word",
         "position": 42
      },
      {
         "token": "amo",
         "start_offset": 7,
         "end_offset": 10,
         "type": "word",
         "position": 43
      },
      {
         "token": "amol",
         "start_offset": 7,
         "end_offset": 11,
         "type": "word",
         "position": 44
      },
      {
         "token": "mol",
         "start_offset": 8,
         "end_offset": 11,
         "type": "word",
         "position": 45
      }
   ]
}

最佳答案

这两种方法可能会产生相同的输出。
但是根据具体情况,其中一种方法可能会比另一种更合适。
如果您的搜索词中需要包含特殊字符,则可能需要在映射中使用ngram标记器。了解如何同时使用这两种方法会很有用。
Reference

关于elasticsearch - Elasticsearch-将ngrams用作标记器和过滤器可提供不同的输出,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/33476417/

相关文章:

mongodb - 一旦删除并重新创建索引,Elasticsearch不会自动提取现有的mongoDB数据

elasticsearch - 更改ElasticSearch中连字符的解释方式

elasticsearch - 从 cmd 启动 elasticsearch.bat 时无法创建临时 keystore 错误

elasticsearch - 如何在Elastic Search中加密索引数据?

Elasticsearch 错误 : MapperParsingException failed to parse

java - Elasticsearch +jdk

elasticsearch - Grafana 4 模板与 Elasticsearch 5

Elasticsearch查询SQL Server LAG函数模拟

elasticsearch - Elasticsearch多匹配查询并匹配单个字段

elasticsearch - 使用外部属性作为document_id会返回属性的名称,即document_id