arrays - Mongodb 检查批量插入的重复记录

标签 arrays mongodb mongodb-query mongoskin

我的收藏有这样的记录:

{ "_id":"1", "field1":"foo","field2":"xyz", "field3":"something" ...}
{ "_id":"2", "field1":"bar","field2":"xyz", "field3":"something" ...}
{ "_id":"3", "field1":"foo","field2":"abc", "field3":"something" ...}
{ "_id":"4", "field1":"bar","field2":"lmn", "field3":"something" ...}

在插入新记录之前,我需要检查是否已存在具有相同 field1 和 field2 值的记录。然后丢弃该请求(如果它已经存在)。如果我一次插入一条记录,我可以设法做到这一点。如果我正在执行批量插入(即当我插入文档数组时),我该如何处理?

我有一组 [field1, field2] 组合,我需要查找它们 例如:

queryArray=[ { "field1":"foo","field2":"xyz"},
             { "field1":"bar","field2":"lmn"} ]

预期结果:

result=[  { "_id":"1", "field1":"foo","field2":"xyz", "field3":"something" ...},
          { "_id":"4", "field1":"bar","field2":"lmn", "field3":"something" ...}]

最佳答案

在 field1 和 field2 这两个字段上创建一个 unique compound index(唯一复合索引):

// Create a unique compound index on (field1, field2) so the database itself
// rejects any document whose (field1, field2) pair already exists.
db.collection.createIndex( { "field1": 1, "field2": 1 }, { "unique": true } )

使用 insertMany() 方法来执行批量插入,但将 ordered 选项设置为 false,因为这将确保尝试所有写入操作,即使出现错误也是如此。有序操作在出错后停止,而无序操作继续处理队列中任何剩余的写操作:

// Documents to insert; pairs that already exist in the collection will be
// rejected by the unique compound index on (field1, field2).
var queryArray = [ 
    { "field1": "foo", "field2": "xyz" },
    { "field1": "bar", "field2": "lmn" }
];
// "ordered": false makes mongod attempt every insert even when some fail
// with duplicate-key errors, instead of stopping at the first failure.
try { db.collection.insertMany(queryArray, { "ordered": false }); } 
catch (e) {  print (e); }

这将输出一个文档

{
    "acknowledged" : true,
    "insertedIds" : [ 
        ObjectId("57443e6fa58e5654f3a6c5ae"), 
        ObjectId("57443e6fa58e5654f3a6c5af")
    ]
}

返回的文档中包含 acknowledged 字段:如果操作启用了写关注(write concern)则为 true,如果禁用了写关注则为 false;此外还包含每个成功插入的文档的 _id 数组。

因为 queryArray 中的文档不包含 _id,mongod 会为每个文档创建并添加 _id 字段,并为其分配一个唯一的 ObjectId 值。由于您在 field1 和 field2 这两个字段上强制执行了唯一性约束,上面展示的是尝试写入后的结果;因为该操作是无序的,它会继续处理任何剩余的写入操作。


假设您删除了 ordered 选项(默认设置为 true),您将从该操作中获得以下输出:

// Same two documents as before; both (field1, field2) pairs already exist.
var queryArray = [ 
    { "field1": "foo", "field2": "xyz" },
    { "field1": "bar", "field2": "lmn" }
];
// With the default "ordered": true, insertMany() stops at the first
// duplicate-key error and throws the BulkWriteError shown below.
try { db.collection.insertMany(queryArray); } 
catch (e) {  print (e); }

控制台输出:

{
    "name" : "BulkWriteError",
    "message" : "write error at item 0 in bulk operation",
    "ok" : undefined,
    "nInserted" : 0,
    "nUpserted" : 0,
    "nMatched" : 0,
    "nModified" : 0,
    "nRemoved" : 0,
    "getUpsertedIds" : function () {
      return bulkResult.upserted;
    },
    "getUpsertedIdAt" : function (index) {
      return bulkResult.upserted[index];
    },
    "getRawResponse" : function () {
      return bulkResult;
    },
    "hasWriteErrors" : function () {
      return bulkResult.writeErrors.length > 0;
    },
    "getWriteErrorCount" : function () {
      return bulkResult.writeErrors.length;
    },
    "getWriteErrorAt" : function (index) {
      if(index < bulkResult.writeErrors.length) {
        return bulkResult.writeErrors[index];
      }
      return null;
    },
    "getWriteErrors" : function () {
      return bulkResult.writeErrors;
    },
    "hasWriteConcernError" : function () {
      return bulkResult.writeConcernErrors.length > 0;
    },
    "getWriteConcernError" : function () {
      if(bulkResult.writeConcernErrors.length == 0) {
        return null;
      } else if(bulkResult.writeConcernErrors.length == 1) {
        // Return the error
        return bulkResult.writeConcernErrors[0];
      } else {

        // Combine the errors
        var errmsg = "";
        for(var i = 0; i < bulkResult.writeConcernErrors.length; i++) {
          var err = bulkResult.writeConcernErrors[i];
          errmsg = errmsg + err.errmsg;
          // TODO: Something better
          if (i != bulkResult.writeConcernErrors.length - 1) {
            errmsg = errmsg + " and ";
          }
        }

        return new WriteConcernError({ errmsg : errmsg, code : WRITE_CONCERN_FAILED });
      }
    },
    "tojson" : function (indent, nolint) {
      return tojson(bulkResult, indent, nolint);
    },
    "toString" : function () {
      return "BulkWriteError(" + this.tojson() + ")";
    },
    "shellPrint" : function () {
      return this.toString();
    },
    "hasErrors" : function () {
      return this.hasWriteErrors() || this.hasWriteConcernError();
    },
    "toSingleResult" : function () {
      if(singleBatchType == null) throw Error(
          "Cannot output single WriteResult from multiple batch result");
      return new WriteResult(bulkResult, singleBatchType, writeConcern);
    },
    "stack" : "BulkWriteError({\n\t\"writeErrors\" : [\n\t\t{\n\t\t\t\"index\" : 0,\n\t\t\t\"code\" : 11000,\n\t\t\t\"errmsg\" : \"E11000 duplicate key error index: test.collection.$field1_1_field2_1 dup key: { : \\\"foo\\\", : \\\"xyz\\\" }\",\n\t\t\t\"op\" : {\n\t\t\t\t\"_id\" : ObjectId(\"574441aea58e5654f3a6c5b6\"),\n\t\t\t\t\"field1\" : \"foo\",\n\t\t\t\t\"field2\" : \"xyz\"\n\t\t\t}\n\t\t}\n\t],\n\t\"writeConcernErrors\" : [ ],\n\t\"nInserted\" : 0,\n\t\"nUpserted\" : 0,\n\t\"nMatched\" : 0,\n\t\"nModified\" : 0,\n\t\"nRemoved\" : 0,\n\t\"upserted\" : [ ]\n})\nBulkWriteError@src/mongo/shell/bulk_api.js:372:44\nBulkWriteResult/this.toError@src/mongo/shell/bulk_api.js:335:16\nBulk/this.execute@src/mongo/shell/bulk_api.js:1162:1\nDBCollection.prototype.insertMany@src/mongo/shell/crud_api.js:279:5\n@(shell):1:7\n",
    "toResult" : function () {
      return new BulkWriteResult(bulkResult, singleBatchType, writeConcern);
    }
}

强调返回的写入错误:

"E11000 duplicate key error index: test.collection.$field1_1_field2_1 dup key: { : \\\"foo\\\", : \\\"xyz\\\" }\"

除了 insertMany() 方法,你也可以试试 Bulk() API 方法。特别是,在创建好唯一复合索引之后,您可以调用 initializeUnorderedBulkOp() 方法来执行无序的批量插入。

针对上述情况考虑以下示例:

// Bulk-insert queryArray in unordered batches of 1000 via the Bulk() API.
// The unique compound index on (field1, field2) rejects duplicates, while
// the *unordered* bulk keeps processing the remaining inserts on error.
db.collection('collectionName', function(err, collection) {
    if (err) {
        // Surface the connection/lookup error instead of silently ignoring it.
        return console.error(err);
    }

    var bulk = collection.initializeUnorderedBulkOp();
    var counter = 0; // declared with var — the original leaked an implicit global

    queryArray.forEach(function (doc) {
        bulk.insert(doc);
        counter++;

        if (counter % 1000 === 0) {
            bulk.execute(function(err, result) {
                // you could do something with results, check for duplicate errors
            });
            // Re-initialise synchronously: execute()'s callback fires
            // asynchronously, so re-initialising inside it (as the original
            // did) would let later iterations insert into the already-executed
            // bulk object.
            bulk = collection.initializeUnorderedBulkOp();
        }
    });

    // Clean-up remaining operations in the queue
    if (counter % 1000 !== 0) {
        bulk.execute(function(err, result) {
            // you could do something with results, check for duplicate errors
            console.log(result);
        });
    }
});

关于arrays - Mongodb 检查批量插入的重复记录,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/37412235/

相关文章:

python - 如何从 numpy 二维数组中提取子数组?

arrays - 如何在子文档的每个字段中搜索

node.js - Mongoose-mongoDB 高级查询连接

python - 以资源友好的方式使用 Python 将巨大的 Mongo 结果集写入磁盘

python - 将 SQL 查询转换为 mongo 查询

java - Java 中的 Arrays.fill 多维数组

javascript - 数组数组上的 Lodash 属性迭代器

c - 将数组传递给函数而无需事先显式声明数组

c++ - Mongo C++ 客户端库没有看到我的 boost

python:如何通过http put在mongo中存储数据