我从一组如下所示的文档开始:
{
state: 'CA',
year: 2014,
accepted: true
}
{
state: 'AL',
year: 2012,
accepted: false
}
{
state: 'CA',
year: 2013,
accepted: false
}
...
我希望最终得到以下格式的新聚合集合:
{
_id: 'CA',
value: {
submittedApplications2012: 34,
submittedApplications2013: 23,
submittedApplications2014: 72,
acceptedApplications2012: 12,
acceptedApplications2013: 7,
acceptedApplications2014: 5
}
}
{
_id: 'AL',
value: {
submittedApplications2012: 73,
submittedApplications2013: 67,
submittedApplications2014: 98,
acceptedApplications2012: 45,
acceptedApplications2013: 34,
acceptedApplications2014: 31
}
}
我编写了一个 mapReduce,它按州名对文档进行分组,并遍历每个州的文档,递增相应的属性:
// Mapper: invoked with a collection document bound to `this`. Emits the
// document's state as the grouping key, paired with a value holding only the
// document's year and accepted flag.
var map = function() {
  emit(this.state, { year: this.year, accepted: this.accepted });
};
// Reducer: tallies submitted and accepted applications per year for one key.
//
// key    - the grouping key (not used in the body).
// values - array of objects carrying `year` and `accepted`; every element is
//          counted as exactly one submitted application for its year.
//
// Returns an object with six counters (submitted/accepted for 2012-2014).
//
// NOTE: only elements whose `year` is strictly 2012, 2013 or 2014 are counted.
// An element that is itself a previously returned counter object has no
// `year` field and therefore contributes nothing to the totals.
var reduce = function(key, values) {
  var totals = {
    submittedApplications2012: 0,
    submittedApplications2013: 0,
    submittedApplications2014: 0,
    acceptedApplications2012: 0,
    acceptedApplications2013: 0,
    acceptedApplications2014: 0
  };

  values.forEach(function(entry) {
    var year = entry.year;
    var isTrackedYear = year === 2012 || year === 2013 || year === 2014;
    if (!isTrackedYear) {
      return;
    }
    // Counter names are the fixed prefix followed by the four-digit year.
    totals['submittedApplications' + year]++;
    if (entry.accepted) {
      totals['acceptedApplications' + year]++;
    }
  });

  return totals;
};
// Run the map-reduce over test_collection using the `map` and `reduce`
// functions defined above; the `out: {inline: 1}` option asks for the results
// to be returned in the command response rather than stored in a collection.
db.test_collection.mapReduce(
map,
reduce,
{out: {inline: 1}}
)
不幸的是,结果不准确。阿拉巴马州(AL)的 submittedApplications2012、submittedApplications2013 和 submittedApplications2014 最终分别只有 9、8 和 3。其他州的最终数字也很低。既然有 10,000 条记录,这些数字应该高得多。
我认为发生这种情况是因为reduce函数被多次调用(参见Reduce is called several times with the same key in mongodb map-reduce)并且reducedObject
对象在后续传递中被覆盖。
如何防止这种情况,以便准确计算提交和接受的申请数量?
以下是一些以原始格式创建测试集合的代码:
// Rebuild test_collection from scratch with 10,000 random documents for
// demonstration purposes. Each document gets a random state, a random year
// (stored via NumberInt) and a random boolean `accepted` flag.
var states = ['AL', 'CA', 'FL', 'TN', 'OH'];
var years = [2012, 2013, 2014];

db.test_collection.drop();

for (var i = 9999; i >= 0; i--) {
  db.test_collection.insert({
    state: states[Math.floor(Math.random() * states.length)],
    year: NumberInt(years[Math.floor(Math.random() * years.length)]),
    accepted: Math.random() >= 0.5
  });
}
最佳答案
我真的不认为mapReduce 是正确的选择。就我个人而言,我会使用聚合框架,因为它的处理速度会更快,因为所有操作都在 native 代码中,无需 JavaScript 翻译代码或对象。
这只需要一个简单的 $group 操作,再借助 $cond 做一点处理,把 true/false 值转换为数字:
// Two-stage aggregation: first count per (state, year), then collect the
// per-year counts into one array per state.
db.test_collection.aggregate([
// Stage 1: group by the compound key { state, year }.
{ "$group": {
"_id": {
"state": "$state",
"year": "$year"
},
// Every document in the group counts as one submitted application.
"submitted": { "$sum": 1 },
// Add 1 when the document's `accepted` field is true, otherwise 0.
"accepted": {
"$sum": {
"$cond": [
"$accepted",
1,
0
]
}
}
}},
// Stage 2: regroup by state alone, pushing each year's totals into `values`.
{ "$group": {
"_id": "$_id.state",
"values": {
"$push": {
"year": "$_id.year",
"submitted": "$submitted",
"accepted": "$accepted"
}
}
}}
])
它会产生这样的输出(为了简洁起见,只显示一个州):
{
"_id" : "CA",
"values" : [
{
"year" : 2014,
"submitted" : 691,
"accepted" : 360
},
{
"year" : 2013,
"submitted" : 653,
"accepted" : 332
},
{
"year" : 2012,
"submitted" : 681,
"accepted" : 350
}
]
}
或者,如果您确实必须在输出中指定所有的键,请使用以下形式。在代码中生成这种结构非常简单,因为聚合管道(实际上任何原生 MongoDB 查询也是如此)本质上就是“数据结构”:
// Single-stage aggregation producing one flat document per state, with an
// explicit submitted/accepted counter field for each of 2012, 2013 and 2014.
db.test_collection.aggregate([
{ "$group": {
"_id": "$state",
// Count documents whose year is 2012.
"submitted2012": {
"$sum": {
"$cond": [
{ "$eq": [ "$year", 2012 ] },
1,
0
]
}
},
// Count documents whose year is 2012 AND whose `accepted` field is true.
"accepted2012": {
"$sum": {
"$cond": [
{ "$and": [
{ "$eq": [ "$year", 2012 ] },
"$accepted"
]},
1,
0
]
}
},
// Same pair of counters for 2013.
"submitted2013": {
"$sum": {
"$cond": [
{ "$eq": [ "$year", 2013 ] },
1,
0
]
}
},
"accepted2013": {
"$sum": {
"$cond": [
{ "$and": [
{ "$eq": [ "$year", 2013 ] },
"$accepted"
]},
1,
0
]
}
},
// Same pair of counters for 2014.
"submitted2014": {
"$sum": {
"$cond": [
{ "$eq": [ "$year", 2014 ] },
1,
0
]
}
},
"accepted2014": {
"$sum": {
"$cond": [
{ "$and": [
{ "$eq": [ "$year", 2014 ] },
"$accepted"
]},
1,
0
]
}
}
}}
])
事实上,它确实就是这么简单:
// Build the $group stage programmatically instead of writing every field by
// hand: documents are grouped by state, and for each year a "submitted<year>"
// and an "accepted<year>" counter field is attached, in that order.
var groupStage = { "$group": { "_id": "$state" } };

[2012,2013,2014].forEach(function(year) {
  var fields = groupStage["$group"];

  // Accumulator that adds 1 for every document satisfying `condition`, else 0.
  var sumWhen = function(condition) {
    return { "$sum": { "$cond": [ condition, 1, 0 ] } };
  };

  // A fresh expression object per use, so the two fields share no references.
  var yearMatches = function() {
    return { "$eq": [ "$year", year ] };
  };

  fields["submitted" + year] = sumWhen(yearMatches());
  fields["accepted" + year] = sumWhen({ "$and": [ yearMatches(), "$accepted" ] });
});
// Run an aggregation pipeline consisting solely of the `groupStage` object.
db.test_collection.aggregate([groupStage])
它的输出:
{
"_id" : "CA",
"submitted2012" : 681,
"accepted2012" : 350,
"submitted2013" : 653,
"accepted2013" : 332,
"submitted2014" : 691,
"accepted2014" : 360
}
使用 mapReduce 执行此操作要慢得多,但您需要注意的关键点是:让“mapper”发出的值与 reducer 自身返回的输出具有相同的结构。这是因为“reducer”实际上不会一次性处理某个键下的所有文档;相反,“reduce”的输出可能会再次作为“输入”,与其他已发出或已“reduced”的值一起传回 reducer 做进一步归约:
// Map-reduce producing one flat counter document per state. The mapper emits
// values in the same shape the reducer returns (field name -> number), so a
// reducer result can be passed back in as one of `values` without changing
// the totals.
db.test_collection.mapReduce(
  // Mapper: emit, keyed by state, an object with "submitted<year>" set to 1
  // and "accepted<year>" set to 1 or 0 depending on the accepted flag.
  function() {
    var obj = {};
    obj["submitted" + this.year] = 1;
    obj["accepted" + this.year] = (this.accepted) ? 1 : 0;
    emit(this.state, obj);
  },
  // Reducer: sum every field of every value into a single object, creating
  // each counter at 0 the first time its field name is seen.
  function(key, values) {
    var obj = {};
    values.forEach(function(value) {
      Object.keys(value).forEach(function(field) {
        if ( !obj.hasOwnProperty(field) ) {
          obj[field] = 0;
        }
        obj[field] += value[field];
      });
    });
    return obj;
  },
  { "out": { "inline": 1 } }
)
得到这样的输出:
{
"_id" : "CA",
"value" : {
"submitted2014" : 691,
"accepted2014" : 360,
"submitted2013" : 653,
"accepted2013" : 332,
"submitted2012" : 681,
"accepted2012" : 350
}
}
郑重声明,像原始聚合示例一样的输出可以这样获得:
// Map-reduce producing, per state, a `values` array with one
// { year, submitted, accepted } entry per year, ordered by year descending.
// Mapper output and reducer output share the shape { values: [...] }, so a
// reducer result can be passed back in as one of `values`.
db.test_collection.mapReduce(
  // Mapper: wrap a single-document count in a one-element `values` array.
  function() {
    var obj = {
      "year": this.year,
      "submitted": 1,
      "accepted": (this.accepted) ? 1 : 0
    };
    emit(this.state, { "values": [obj] });
  },
  // Reducer: merge all incoming `values` arrays, summing counts per year.
  function(key, values) {
    var obj = { "values": [] };
    var accum = {};

    values.forEach(function(value) {
      value.values.forEach(function(data) {
        if ( !accum.hasOwnProperty(data.year) ) {
          accum[data.year] = {
            submitted: 0,
            accepted: 0
          };
        }
        accum[data.year]["submitted"] += data.submitted;
        accum[data.year]["accepted"] += data.accepted;
      });
    });

    // Property names are strings, so convert each year back to a number.
    Object.keys(accum).forEach(function(yearKey) {
      obj.values.push({
        "year": parseInt(yearKey, 10),
        "submitted": accum[yearKey].submitted,
        "accepted": accum[yearKey].accepted
      });
    });

    // A sort comparator must return a negative, zero or positive number;
    // returning a boolean gives engine-dependent ordering. Sort newest first.
    obj.values.sort(function(a, b) {
      return b.year - a.year;
    });

    return obj;
  },
  { "out": { "inline": 1 } }
)
输出键遵循mapReduce规则:
{
"_id" : "CA",
"value" : {
"values" : [
{
"year" : 2014,
"submitted" : 691,
"accepted" : 360
},
{
"year" : 2013,
"submitted" : 653,
"accepted" : 332
},
{
"year" : 2012,
"submitted" : 681,
"accepted" : 350
}
]
}
}
因此,mapReduce 是可行的,但聚合框架无疑是此类任务的更好选择。
<hr/>此外,您的生成脚本如果使用 Bulk 操作可能会更好一些:
// Rebuild test_collection with 10,000 random documents, queuing the inserts
// on an ordered bulk operation and sending them in batches of 1,000.
var states = ['AL', 'CA', 'FL', 'TN', 'OH'];
var years = [2012, 2013, 2014];
var bulk = db.test_collection.initializeOrderedBulkOp();

db.test_collection.drop();

for (var i = 9999; i >= 0; i--) {
  bulk.insert({
    state: states[Math.floor(Math.random() * states.length)],
    year: NumberInt(years[Math.floor(Math.random() * years.length)]),
    accepted: Math.random() >= 0.5
  });

  // Every 1,000th document: send the queued batch and start a new one.
  if ( i % 1000 === 0 ) {
    bulk.execute();
    bulk = db.test_collection.initializeOrderedBulkOp();
  }
}
关于javascript - 使用 MongoDB map-reduce 生成扁平化文档,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/32173228/