我在 MongoDB 3.4 数据库中有一个名为 products 的集合,其中约有 7,000,000 本书,总大小约 40GB。下面是一个图书文档的示例:
{
"_id" : ObjectId("597f17d22be7925d9a056e82"),
"ean13" : "9783891491904",
"price" : NumberInt(2100),
"name" : "My cool title",
"author_name" : "Doe, John",
"warengruppe" : "HC",
"book_category_key" : "728",
"keywords": ["fairy tale", "magic", "fantasy"]
...
}
现在我想对产品集合进行一些文本搜索:
db.products.find({
$text : {
$search: '"harry" "potter" "3" lsxger'
}
}, {
score: {
"$meta": "textScore"
},
ean13: 1,
name: 1,
author_name: 1,
price: 1,
images: 1,
warengruppe: 1
}).sort({
score: {
"$meta": "textScore"
},
name: 1
}).limit(9);
下面是 explain() 的输出结果:
{
"queryPlanner" : {
"plannerVersion" : NumberInt(1),
"namespace" : "mydb.products",
"indexFilterSet" : false,
"parsedQuery" : {
"$text" : {
"$search" : "\"harry\" \"potter\" \"3\" lsxger",
"$language" : "german",
"$caseSensitive" : false,
"$diacriticSensitive" : false
}
},
"winningPlan" : {
"stage" : "PROJECTION",
"transformBy" : {
"score" : {
"$meta" : "textScore"
},
"ean13" : 1.0,
"name" : 1.0,
"author_name" : 1.0,
"price" : 1.0,
"images" : 1.0,
"warengruppe" : 1.0
},
"inputStage" : {
"stage" : "SORT",
"sortPattern" : {
"score" : {
"$meta" : "textScore"
},
"name" : 1.0
},
"limitAmount" : NumberInt(9),
"inputStage" : {
"stage" : "SORT_KEY_GENERATOR",
"inputStage" : {
"stage" : "TEXT",
"indexPrefix" : {
},
"indexName" : "fulltextsearch",
"parsedTextQuery" : {
"terms" : [
"3",
"harry",
"lsxger",
"pott"
],
"negatedTerms" : [
],
"phrases" : [
"harry",
"potter",
"3"
],
"negatedPhrases" : [
]
},
"textIndexVersion" : NumberInt(3),
"inputStage" : {
"stage" : "TEXT_MATCH",
"inputStage" : {
"stage" : "TEXT_OR",
"inputStages" : [
{
"stage" : "IXSCAN",
"keyPattern" : {
"_fts" : "text",
"_ftsx" : NumberInt(1)
},
"indexName" : "fulltextsearch",
"isMultiKey" : true,
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : NumberInt(2),
"direction" : "backward",
"indexBounds" : {
}
},
{
"stage" : "IXSCAN",
"keyPattern" : {
"_fts" : "text",
"_ftsx" : NumberInt(1)
},
"indexName" : "fulltextsearch",
"isMultiKey" : true,
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : NumberInt(2),
"direction" : "backward",
"indexBounds" : {
}
},
{
"stage" : "IXSCAN",
"keyPattern" : {
"_fts" : "text",
"_ftsx" : NumberInt(1)
},
"indexName" : "fulltextsearch",
"isMultiKey" : true,
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : NumberInt(2),
"direction" : "backward",
"indexBounds" : {
}
},
{
"stage" : "IXSCAN",
"keyPattern" : {
"_fts" : "text",
"_ftsx" : NumberInt(1)
},
"indexName" : "fulltextsearch",
"isMultiKey" : true,
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : NumberInt(2),
"direction" : "backward",
"indexBounds" : {
}
}
]
}
}
}
}
}
},
"rejectedPlans" : [
]
},
"executionStats" : {
"executionSuccess" : true,
"nReturned" : NumberInt(9),
"executionTimeMillis" : NumberInt(15441),
"totalKeysExamined" : NumberInt(1206999),
"totalDocsExamined" : NumberInt(1195069),
"executionStages" : {
"stage" : "PROJECTION",
"nReturned" : NumberInt(9),
"executionTimeMillisEstimate" : NumberInt(15294),
"works" : NumberInt(2402085),
"advanced" : NumberInt(9),
"needTime" : NumberInt(2402075),
"needYield" : NumberInt(0),
"saveState" : NumberInt(18814),
"restoreState" : NumberInt(18814),
"isEOF" : NumberInt(1),
"invalidates" : NumberInt(0),
"transformBy" : {
"score" : {
"$meta" : "textScore"
},
"ean13" : 1.0,
"name" : 1.0,
"author_name" : 1.0,
"price" : 1.0,
"images" : 1.0,
"warengruppe" : 1.0
},
"inputStage" : {
"stage" : "SORT",
"nReturned" : NumberInt(9),
"executionTimeMillisEstimate" : NumberInt(15234),
"works" : NumberInt(2402085),
"advanced" : NumberInt(9),
"needTime" : NumberInt(2402075),
"needYield" : NumberInt(0),
"saveState" : NumberInt(18814),
"restoreState" : NumberInt(18814),
"isEOF" : NumberInt(1),
"invalidates" : NumberInt(0),
"sortPattern" : {
"score" : {
"$meta" : "textScore"
},
"name" : 1.0
},
"memUsage" : NumberInt(22949),
"memLimit" : NumberInt(33554432),
"limitAmount" : NumberInt(9),
"inputStage" : {
"stage" : "SORT_KEY_GENERATOR",
"nReturned" : NumberInt(455),
"executionTimeMillisEstimate" : NumberInt(15074),
"works" : NumberInt(2402075),
"advanced" : NumberInt(455),
"needTime" : NumberInt(2401619),
"needYield" : NumberInt(0),
"saveState" : NumberInt(18814),
"restoreState" : NumberInt(18814),
"isEOF" : NumberInt(1),
"invalidates" : NumberInt(0),
"inputStage" : {
"stage" : "TEXT",
"nReturned" : NumberInt(455),
"executionTimeMillisEstimate" : NumberInt(15024),
"works" : NumberInt(2402074),
"advanced" : NumberInt(455),
"needTime" : NumberInt(2401618),
"needYield" : NumberInt(0),
"saveState" : NumberInt(18814),
"restoreState" : NumberInt(18814),
"isEOF" : NumberInt(1),
"invalidates" : NumberInt(0),
"indexPrefix" : {
},
"indexName" : "fulltextsearch",
"parsedTextQuery" : {
"terms" : [
"3",
"harry",
"lsxger",
"pott"
],
"negatedTerms" : [
],
"phrases" : [
"harry",
"potter",
"3"
],
"negatedPhrases" : [
]
},
"textIndexVersion" : NumberInt(3),
"inputStage" : {
"stage" : "TEXT_MATCH",
"nReturned" : NumberInt(455),
"executionTimeMillisEstimate" : NumberInt(14974),
"works" : NumberInt(2402074),
"advanced" : NumberInt(455),
"needTime" : NumberInt(2401618),
"needYield" : NumberInt(0),
"saveState" : NumberInt(18814),
"restoreState" : NumberInt(18814),
"isEOF" : NumberInt(1),
"invalidates" : NumberInt(0),
"docsRejected" : NumberInt(1194614),
"inputStage" : {
"stage" : "TEXT_OR",
"nReturned" : NumberInt(1195069),
"executionTimeMillisEstimate" : NumberInt(4500),
"works" : NumberInt(2402074),
"advanced" : NumberInt(1195069),
"needTime" : NumberInt(1207004),
"needYield" : NumberInt(0),
"saveState" : NumberInt(18814),
"restoreState" : NumberInt(18814),
"isEOF" : NumberInt(1),
"invalidates" : NumberInt(0),
"docsExamined" : NumberInt(1195069),
"inputStages" : [
{
"stage" : "IXSCAN",
"nReturned" : NumberInt(59101),
"executionTimeMillisEstimate" : NumberInt(131),
"works" : NumberInt(59102),
"advanced" : NumberInt(59101),
"needTime" : NumberInt(0),
"needYield" : NumberInt(0),
"saveState" : NumberInt(18814),
"restoreState" : NumberInt(18814),
"isEOF" : NumberInt(1),
"invalidates" : NumberInt(0),
"keyPattern" : {
"_fts" : "text",
"_ftsx" : NumberInt(1)
},
"indexName" : "fulltextsearch",
"isMultiKey" : true,
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : NumberInt(2),
"direction" : "backward",
"indexBounds" : {
},
"keysExamined" : NumberInt(59101),
"seeks" : NumberInt(1),
"dupsTested" : NumberInt(59101),
"dupsDropped" : NumberInt(0),
"seenInvalidated" : NumberInt(0)
},
{
"stage" : "IXSCAN",
"nReturned" : NumberInt(9512),
"executionTimeMillisEstimate" : NumberInt(0),
"works" : NumberInt(9513),
"advanced" : NumberInt(9512),
"needTime" : NumberInt(0),
"needYield" : NumberInt(0),
"saveState" : NumberInt(18814),
"restoreState" : NumberInt(18814),
"isEOF" : NumberInt(1),
"invalidates" : NumberInt(0),
"keyPattern" : {
"_fts" : "text",
"_ftsx" : NumberInt(1)
},
"indexName" : "fulltextsearch",
"isMultiKey" : true,
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : NumberInt(2),
"direction" : "backward",
"indexBounds" : {
},
"keysExamined" : NumberInt(9512),
"seeks" : NumberInt(1),
"dupsTested" : NumberInt(9512),
"dupsDropped" : NumberInt(0),
"seenInvalidated" : NumberInt(0)
},
{
"stage" : "IXSCAN",
"nReturned" : NumberInt(1134940),
"executionTimeMillisEstimate" : NumberInt(1381),
"works" : NumberInt(1134941),
"advanced" : NumberInt(1134940),
"needTime" : NumberInt(0),
"needYield" : NumberInt(0),
"saveState" : NumberInt(18814),
"restoreState" : NumberInt(18814),
"isEOF" : NumberInt(1),
"invalidates" : NumberInt(0),
"keyPattern" : {
"_fts" : "text",
"_ftsx" : NumberInt(1)
},
"indexName" : "fulltextsearch",
"isMultiKey" : true,
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : NumberInt(2),
"direction" : "backward",
"indexBounds" : {
},
"keysExamined" : NumberInt(1134940),
"seeks" : NumberInt(1),
"dupsTested" : NumberInt(1134940),
"dupsDropped" : NumberInt(0),
"seenInvalidated" : NumberInt(0)
},
{
"stage" : "IXSCAN",
"nReturned" : NumberInt(3446),
"executionTimeMillisEstimate" : NumberInt(0),
"works" : NumberInt(3447),
"advanced" : NumberInt(3446),
"needTime" : NumberInt(0),
"needYield" : NumberInt(0),
"saveState" : NumberInt(18814),
"restoreState" : NumberInt(18814),
"isEOF" : NumberInt(1),
"invalidates" : NumberInt(0),
"keyPattern" : {
"_fts" : "text",
"_ftsx" : NumberInt(1)
},
"indexName" : "fulltextsearch",
"isMultiKey" : true,
"isUnique" : false,
"isSparse" : false,
"isPartial" : false,
"indexVersion" : NumberInt(2),
"direction" : "backward",
"indexBounds" : {
},
"keysExamined" : NumberInt(3446),
"seeks" : NumberInt(1),
"dupsTested" : NumberInt(3446),
"dupsDropped" : NumberInt(0),
"seenInvalidated" : NumberInt(0)
}
]
}
}
}
}
}
},
"allPlansExecution" : [
]
},
"serverInfo" : {
"host" : "lvps83-169-23-14.dedicated.hosteurope.de",
"port" : NumberInt(27017),
"version" : "3.4.4",
"gitVersion" : "888390515874a9debd1b6c5d36559ca86b44babd"
},
"ok" : 1.0
}
这大约需要 25 秒或更长时间。我已经为 book_category_key、ean13、author_name、name 和 fulltextsearch 设置了一些索引:
{
"v" : 2,
"name" : "fulltextsearch",
"ns" : "mydb.products",
"background" : true,
"weights" : {
"author_name" : 5,
"ean13" : 10,
"isbn" : 10,
"keywords" : 2,
"languages.search" : 8,
"mainsubject.name" : 3,
"name" : 10
},
"default_language" : "german",
"language_override" : "language_x",
"textIndexVersion" : 3
}
如何提高速度或到哪里寻找更多信息?
最佳答案
搜索耗时约 15 秒。
执行 TEXT_OR 搜索需要 4.5 秒
"stage" : "TEXT_OR",
"nReturned" : NumberInt(1195069),
"executionTimeMillisEstimate" : NumberInt(4500),
其余约 10 秒用于执行 TEXT_MATCH 匹配
"stage" : "TEXT_MATCH",
"nReturned" : NumberInt(455),
"executionTimeMillisEstimate" : NumberInt(14974), //this includes the 4.5
TEXT_OR 阶段表明,必须检查约 120 万个文档。这意味着:
如果文档不在内存中,则从磁盘加载文档需要一段时间。由于您的总内存小于集合大小 (40GB) + 索引 (9GB),因此很有可能必须交换某些数据(您是否检查过连续搜索是否更快?)。 有两种选择:1. 减少索引大小(仅包括部分字段),2. 增加内存。尽管如此,获取文档只占总执行时间的 1/3。
主要问题(约占 2/3 的时间)是对约 120 万个文档进行文本匹配,这显然需要一段时间。所以您得想办法减少需要匹配的文档数量(见下文)
可能有几种策略可以解决这个问题:
您应该考虑使用带有附加条件的复合索引来限制需要扫描的文档总数(例如仅在图书类别 "728" 中搜索……不管它代表什么)(另请参见 MongoDB 文档中的 Limit the Number of Entries Scanned)
将文本索引限制为仅包含实际文本的字段(名称、关键字、作者),并对其他类型的字段(isbn、ean)使用专用索引。您的应用程序可以对用户输入进行有根据的猜测(根据格式判断它是否可能是 ean 或 isbn,并对这类输入直接进行精确查找或正则表达式查找)。这可能会有所帮助,尤其是因为 "3" 很可能会匹配到若干完全不相关的 isbn 或 ean。
也许使用 AND 而不是 OR 来连接搜索词 (
"\"harry potter 3\""
) 也可能会加快这个过程,尽管它会改变搜索的语义。此外,应监控并分析实际用户的搜索行为,找出常见的搜索模式,从而针对实际使用模式进行优化(例如添加一个包含常用搜索词的附加数组,先在该数组字段上进行精确搜索,几秒钟后再用全文搜索结果对其进行细化)
关于“MongoDB 查询耗时很长”,我们在 Stack Overflow 上找到一个类似的问题: https://stackoverflow.com/questions/45461015/