javascript - 使用 MongoDB map-reduce 生成扁平化文档

标签 javascript mongodb mapreduce mongodb-query aggregation-framework

我从一组如下所示的文档开始:

{
  state: 'CA',
  year: 2014,
  accepted: true
}
{
  state: 'AL',
  year: 2012,
  accepted: false
}
{
  state: 'CA',
  year: 2013,
  accepted: false
}
...

我希望最终得到以下格式的新聚合集合:

{
  _id: 'CA',
  value: {
    submittedApplications2012: 34,
    submittedApplications2013: 23,
    submittedApplications2014: 72,
    acceptedApplications2012: 12,
    acceptedApplications2013: 7,
    acceptedApplications2014: 5
  }
}
{
  _id: 'AL',
  value: {
    submittedApplications2012: 73,
    submittedApplications2013: 67,
    submittedApplications2014: 98,
    acceptedApplications2012: 45,
    acceptedApplications2013: 34,
    acceptedApplications2014: 31
  }
}

我编写了一个 mapReduce,它按州名对文档进行分组,并遍历每个州的文档,递增相应的属性:

// Map step: invoked with the source document bound to `this`; emits the
// document's year/accepted pair under its state so records group by state.
var map = function() {
  emit(this.state, { year: this.year, accepted: this.accepted });
};

// Reduce step: tallies submitted/accepted applications per year for one state.
// Every element of `values` is counted as a single application, identified by
// its `year` (2012, 2013 or 2014) and `accepted` fields; elements with any
// other `year` value are ignored.
var reduce = function(key, values) {
  var totals = {
    submittedApplications2012: 0,
    submittedApplications2013: 0,
    submittedApplications2014: 0,
    acceptedApplications2012: 0,
    acceptedApplications2013: 0,
    acceptedApplications2014: 0
  };
  var trackedYears = [2014, 2013, 2012];

  values.forEach(function(entry) {
    // indexOf compares with strict equality, so only the numbers 2012-2014 match.
    if (trackedYears.indexOf(entry.year) === -1) {
      return;
    }
    totals['submittedApplications' + entry.year] += 1;
    if (entry.accepted) {
      totals['acceptedApplications' + entry.year] += 1;
    }
  });

  return totals;
};

// Run the map/reduce pair defined above over test_collection; `inline: 1`
// returns the results directly instead of writing them to an output collection.
db.test_collection.mapReduce(
  map,
  reduce,
  {out: {inline: 1}}
)

不幸的是,结果不准确。阿拉巴马州的 submittedApplications2012、submittedApplications2013 和 submittedApplications2014 最终分别为 9、8 和 3。其他州的最终数字也很低。既然有 10,000 条记录,这些数字应该会高得多。

我认为发生这种情况是因为reduce函数被多次调用(参见Reduce is called several times with the same key in mongodb map-reduce)并且reducedObject对象在后续传递中被覆盖。

如何防止这种情况,以便准确计算提交和接受的申请数量?

以下是一些以原始格式创建测试集合的代码:

// Generate a test collection with 10K documents for demo'ing purposes
var i = 10000,
    states = ['AL', 'CA', 'FL', 'TN', 'OH'],
    years = [2012, 2013, 2014];
// Drop any existing collection first so repeated runs do not accumulate documents.
db.test_collection.drop();
// `i--` is truthy for 10000 down to 1, so the body runs exactly 10,000 times.
while (i--) {
  db.test_collection.insert({
    // State and year are each picked at random from the lists above.
    state: states[Math.floor(Math.random() * states.length)],
    // NumberInt stores the year as a 32-bit integer instead of the shell's default double.
    year: NumberInt(years[Math.floor(Math.random() * years.length)]),
    // About half of the generated applications are marked as accepted.
    accepted: Math.random() >= 0.5
  });
}

最佳答案

我真的不认为 mapReduce 是正确的选择。就我个人而言,我会使用聚合框架,因为它的处理速度更快:所有操作都以原生(native)代码执行,无需由 JavaScript 解释执行代码或转换对象。

这样做只需要一个简单的 $group 操作,再借助 $cond 做一些处理,将 true/false 值转换为数字:

// Two-stage pipeline: first count per (state, year), then collect the yearly
// counts of each state into a single document.
db.test_collection.aggregate([
    // Stage 1: one group per (state, year) pair.
    { "$group": {
        "_id": {
            "state": "$state",
            "year": "$year"
        },
        // Every document in the group counts as one submitted application.
        "submitted": { "$sum": 1 },
        "accepted": {
            "$sum": {
                // $cond turns the boolean `accepted` into 1 or 0 so it can be summed.
                "$cond": [
                    "$accepted",
                    1,
                    0
                ]
            }
        }
    }},
    // Stage 2: regroup by state and push each year's totals into a "values" array.
    { "$group": {
        "_id": "$_id.state",
        "values": {
            "$push": {
                "year": "$_id.year",
                "submitted": "$submitted",
                "accepted": "$accepted"
            }
        }
    }}
])

它会产生这样的输出(为了简洁起见,只显示一个州):

{
    "_id" : "CA",
    "values" : [
        {
                "year" : 2014,
                "submitted" : 691,
                "accepted" : 360
        },
        {
                "year" : 2013,
                "submitted" : 653,
                "accepted" : 332
        },
        {
                "year" : 2012,
                "submitted" : 681,
                "accepted" : 350
        }
    ]
}

或者,如果您确实必须在输出中指定所有的键,请使用以下形式。在代码中生成这种结构非常简单,因为聚合管道(实际上任何原生 MongoDB 查询也一样)本质上都只是一个“数据结构”:

// Single $group by state with one explicit counter field per year:
// "submitted<year>" counts documents of that year, "accepted<year>" counts
// documents of that year whose `accepted` flag is true.
db.test_collection.aggregate([
    { "$group": {
        "_id": "$state",
        // Adds 1 for each document whose year is 2012, otherwise 0.
        "submitted2012": { 
            "$sum": {
                "$cond": [
                    { "$eq": [ "$year", 2012 ] },
                    1,
                    0
                ]
            }
        },
        // Adds 1 only when the year is 2012 AND the document was accepted.
        "accepted2012": {
            "$sum": {
                "$cond": [
                    { "$and": [
                        { "$eq": [ "$year", 2012 ] },
                        "$accepted"
                    ]},
                    1,
                    0
                ]
            }
        },
        // Same pair of counters for 2013.
        "submitted2013": { 
            "$sum": {
                "$cond": [
                    { "$eq": [ "$year", 2013 ] },
                    1,
                    0
                ]
            }
        },
        "accepted2013": {
            "$sum": {
                "$cond": [
                    { "$and": [
                        { "$eq": [ "$year", 2013 ] },
                        "$accepted"
                    ]},
                    1,
                    0
                ]
            }
        },
        // Same pair of counters for 2014.
        "submitted2014": { 
            "$sum": {
                "$cond": [
                    { "$eq": [ "$year", 2014 ] },
                    1,
                    0
                ]
            }
        },
        "accepted2014": {
            "$sum": {
                "$cond": [
                    { "$and": [
                        { "$eq": [ "$year", 2014 ] },
                        "$accepted"
                    ]},
                    1,
                    0
                ]
            }
        }
    }}
])

事实上,用代码生成它确实就是这么简单:

// Builds the $group stage in code instead of spelling it out by hand:
// documents are grouped by state and, for each listed year, a
// "submitted<year>" and an "accepted<year>" counter is attached.
var groupStage = { "$group": { "_id": "$state" } };

[2012, 2013, 2014].forEach(function(targetYear) {
    var accumulators = groupStage.$group;

    // Adds 1 for every document of the given year.
    var yearMatches = { "$eq": [ "$year", targetYear ] };
    accumulators["submitted" + targetYear] = {
        "$sum": { "$cond": [ yearMatches, 1, 0 ] }
    };

    // Adds 1 only when the document is of the given year and was accepted.
    var yearMatchesAndAccepted = {
        "$and": [ { "$eq": [ "$year", targetYear ] }, "$accepted" ]
    };
    accumulators["accepted" + targetYear] = {
        "$sum": { "$cond": [ yearMatchesAndAccepted, 1, 0 ] }
    };
});

// Run a pipeline whose only stage is the generated $group stage.
db.test_collection.aggregate([groupStage])

它的输出:

{
    "_id" : "CA",
    "submitted2012" : 681,
    "accepted2012" : 350,
    "submitted2013" : 653,
    "accepted2013" : 332,
    "submitted2014" : 691,
    "accepted2014" : 360
}

使用 mapReduce 执行此操作要慢得多,但您需要注意的关键点是:让“mapper”发出的值与 reducer 本身返回的输出具有完全相同的结构。这是因为“reducer”实际上并不会一次性处理同一分组的所有文档,“reduce”的输出可能会反过来作为“输入”,与其他已发出或已“reduce”的值一起再次传入,以便进一步归约:

db.test_collection.mapReduce(
    // Map: emit one counter object per document, already in the same shape the
    // reducer returns, e.g. { submitted2014: 1, accepted2014: 0 }.
    function() {
        var obj = {};
        obj["submitted" + this.year] = 1;
        obj["accepted" + this.year] = (this.accepted) ? 1 : 0;
        emit(this.state, obj);
    },
    // Reduce: sum the counters field by field. Emitted values and reduced
    // results share one shape, so a previous result passed back in as one of
    // `values` is simply added to the rest.
    function(key, values) {
        var totals = {};
        values.forEach(function(value) {
            // `field` (not `key`) avoids shadowing the reducer's own `key` argument.
            Object.keys(value).forEach(function(field) {
                if ( !totals.hasOwnProperty(field) )
                    totals[field] = 0;
                totals[field] += value[field];
            });
        });
        return totals;
    },
    // Return the results inline rather than writing an output collection.
    { "out": { "inline": 1 } }
)

输出如下:

{
    "_id" : "CA",
    "value" : {
            "submitted2014" : 691,
            "accepted2014" : 360,
            "submitted2013" : 653,
            "accepted2013" : 332,
            "submitted2012" : 681,
            "accepted2012" : 350
    }
}

郑重声明,像原始聚合示例一样的输出可以这样获得:

db.test_collection.mapReduce(
    // Map: emit a one-element "values" array per document so that the emitted
    // value has the same shape as the reducer's return value.
    function() {
        var obj = {
            "year": this.year,
            "submitted": 1,
            "accepted": (this.accepted) ? 1 : 0
        };
        emit(this.state,{ "values": [obj] });
    },
    // Reduce: merge the "values" arrays of all inputs into per-year totals and
    // return them as a "values" array again.
    function(key,values) {
        var obj = { "values": [] };

        // Per-year running totals, keyed by year.
        var accum = {};

        values.forEach(function(value) {
            value.values.forEach(function(data) {
                if ( !accum.hasOwnProperty(data.year) )
                    accum[data.year] = {
                        submitted: 0,
                        accepted: 0
                    };
                accum[data.year]["submitted"] += data.submitted;
                accum[data.year]["accepted"] += data.accepted;
            });
        });

        Object.keys(accum).forEach(function(key) {
            obj.values.push({
                // Object keys are strings; convert back to a number (explicit base 10).
                "year": parseInt(key, 10),
                "submitted": accum[key].submitted,
                "accepted": accum[key].accepted
            });
        });
        // Newest year first. A sort comparator must return a negative, zero or
        // positive number; a boolean (a.year < b.year) can never be negative,
        // which leaves the resulting order engine-dependent.
        obj.values.sort(function(a,b){
            return b.year - a.year;
        });

        return obj;
    },
    // Return the results inline rather than writing an output collection.
    { "out": { "inline": 1  } }
)

输出键遵循mapReduce规则:

{
    "_id" : "CA",
    "value" : {
        "values" : [
            {
                    "year" : 2014,
                    "submitted" : 691,
                    "accepted" : 360
            },
            {
                    "year" : 2013,
                    "submitted" : 653,
                    "accepted" : 332
            },
            {
                    "year" : 2012,
                    "submitted" : 681,
                    "accepted" : 350
            }
        ]
    }
}

因此,mapReduce 是可行的,但聚合框架无疑是此类任务的更好选择。

<hr/>

此外,您的生成脚本改用 Bulk 操作可能会更好一些:

// Same test-data generator as before, but inserts are queued on a bulk
// operation and sent in batches rather than one insert() call per document.
var i = 10000,
    states = ['AL', 'CA', 'FL', 'TN', 'OH'],
    years = [2012, 2013, 2014],
    bulk = db.test_collection.initializeOrderedBulkOp();

db.test_collection.drop();
while (i--) {
  bulk.insert({
    state: states[Math.floor(Math.random() * states.length)],
    year: NumberInt(years[Math.floor(Math.random() * years.length)]),
    accepted: Math.random() >= 0.5
  });
  // Inside the loop i runs from 9999 down to 0, so this fires once per 1000
  // queued inserts (at i = 9000, 8000, ..., 0): send the batch, then start a
  // fresh bulk operation for the next one.
  if ( i % 1000 == 0 ) {
    bulk.execute();
    bulk = db.test_collection.initializeOrderedBulkOp();
  }
}

关于javascript - 使用 MongoDB map-reduce 生成扁平化文档,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/32173228/

相关文章:

hadoop - 使用MapReduce在HDFS中搜索文件

javascript - 在窗口调整大小时动态改变高度到最高的 div

javascript - 获取 Javascript 数组的唯一值

javascript - 如何使用相同的 CSS 代码为一个 id/class 实现多个背景图像?

javascript - Mongoose 通过子文档的_id查找文档

algorithm - 并行化串行算法

javascript - 使用npm而不是yarn安装包时出错

mongodb - 多个数据库与 eve

python - Mongo : Resolver configuration could not be read or specified no nameservers

python - 在 Hadoop Streaming 中生成单独的输出文件