我正在尝试从一个大约 400MB 的 CSV 文件中提取一些数据,并将其保存到数据库中以供本地查询。该文件是免费提供的 ip2location lite 数据库,而我尝试导入的目标数据库是嵌入式的 nedb。代码如下:
// NOTE(review): presumably loads variables from a .env file into process.env
// (DB_PATH and IP2LOCATION_PATH are read from process.env further down) —
// confirm against the installed dotenv version.
require('dotenv').load()
const fs = require('fs')
const csv = require('csv-parse')
const es = require('event-stream')
const Datastore = require('nedb')
const BatchStream = require('batch-stream')
// Datastore configured with the file named by DB_PATH.
// NOTE(review): `autoload: true` presumably loads the data file on
// construction — confirm against the nedb documentation.
const db = new Datastore({ filename: process.env.DB_PATH, autoload: true })
// Logger created with the 'setup' label; used for progress and error messages.
const debug = require('debug')('setup')
/**
 * Turns one positional CSV row into a keyed document.
 *
 * The first eight values of `row` are mapped, in order, onto the keys
 * ipLo, ipHi, cc, country, area, city, lat and lng. Values missing from a
 * short row come out as `undefined`; extra values are ignored.
 *
 * @param {Iterable} row - the row's column values, in file order
 * @returns {Object} document with the eight named fields
 */
function massage (row) {
  const [
    rangeStart,
    rangeEnd,
    countryCode,
    countryName,
    region,
    cityName,
    latitude,
    longitude
  ] = row

  return {
    ipLo: rangeStart,
    ipHi: rangeEnd,
    cc: countryCode,
    country: countryName,
    area: region,
    city: cityName,
    lat: latitude,
    lng: longitude
  }
}
/**
 * Streams the CSV file named by IP2LOCATION_PATH into the database.
 *
 * The file is read, parsed as CSV, grouped into batches of 100 rows, and each
 * batch is converted with `massage` and inserted via `db.insert`.
 *
 * @returns {Promise} resolves when the last stage of the pipeline emits
 *   'end'; rejects with the first error raised by any stage or by an insert.
 */
function setup () {
  let qty = 0
  return new Promise((resolve, reject) => {
    const source = fs.createReadStream(process.env.IP2LOCATION_PATH)
    const parser = csv()
    const batcher = new BatchStream({ size: 100 })
    const writer = es.map((batch, cb) => {
      // massage and persist it
      db.insert(batch.map(massage), err => {
        // Forward insert failures explicitly. The previous
        // `cb.apply(this, arguments)` ran inside arrow functions, so
        // `arguments` was setup()'s own (empty) list and the error was lost.
        if (err) return cb(err)
        qty += batch.length
        if (qty % 100 === 0) {
          debug(`Inserted ${qty} documents…`)
        }
        // No data is passed on: nothing downstream consumes the batches.
        cb()
      })
    })

    // pipe() does not forward 'error' events to the next stream, so every
    // stage needs its own handler or a read/parse failure would leave the
    // promise pending forever.
    const stages = [source, parser, batcher, writer]
    stages.forEach(stage => stage.on('error', reject))

    source
      .pipe(parser)
      .pipe(batcher)
      .pipe(writer)
      .on('end', resolve)
  })
}
module.exports = setup

// Run the import only when this file is the program's entry point.
// `require.main === module` is used instead of `!module.parent`, which is
// deprecated and is also true when the module is loaded through ESM `import`.
if (require.main === module) {
  debug('Setting up geo database…')
  setup()
    .then(_ => debug('done!'))
    .catch(err => {
      debug('there was an error :/', err)
      // debug output is hidden unless enabled, so also signal the failure
      // through a non-zero exit status.
      process.exitCode = 1
    })
}
大约 75000 个条目后,我收到以下错误:
<--- Last few GCs --->
80091 ms: Mark-sweep 1372.0 (1435.0) -> 1371.7 (1435.0) MB, 1174.6 / 0 ms (+ 1.4 ms in 1 steps since start of marking, biggest step 1.4 ms) [allocation failure] [GC in old space requested].
81108 ms: Mark-sweep 1371.7 (1435.0) -> 1371.6 (1435.0) MB, 1017.2 / 0 ms [last resort gc].
82158 ms: Mark-sweep 1371.6 (1435.0) -> 1371.6 (1435.0) MB, 1049.9 / 0 ms [last resort gc].
<--- JS stacktrace --->
==== JS stack trace =========================================
Security context: 0x4e36fec9e31 <JS Object>
1: substr [native string.js:~320] [pc=0xdab4e7f1185] (this=0x35500e175a29 <Very long string[65537]>,Q=50,am=65487)
2: __write [/Users/arnold/Develop/mount-meru/node_modules/csv-parse/lib/index.js:304] [pc=0xdab4e7b8f98] (this=0x350ff4f97991 <JS Object>,chars=0x35500e175a29 <Very long string[65537]>,end=0x4e36fe04299 <false>,callback=0x4e36fe04189 <undefined>)
3: arguments adaptor fra...
FATAL ERROR: CALL_AND_RETRY_LAST Allocation failed - JavaScript heap out of memory
1: node::Abort() [/usr/local/Cellar/node/6.3.1/bin/node]
2: node::FatalException(v8::Isolate*, v8::Local<v8::Value>, v8::Local<v8::Message>) [/usr/local/Cellar/node/6.3.1/bin/node]
3: v8::Utils::ReportApiFailure(char const*, char const*) [/usr/local/Cellar/node/6.3.1/bin/node]
4: v8::internal::V8::FatalProcessOutOfMemory(char const*, bool) [/usr/local/Cellar/node/6.3.1/bin/node]
5: v8::internal::Factory::NewByteArray(int, v8::internal::PretenureFlag) [/usr/local/Cellar/node/6.3.1/bin/node]
6: v8::internal::TranslationBuffer::CreateByteArray(v8::internal::Factory*) [/usr/local/Cellar/node/6.3.1/bin/node]
7: v8::internal::LCodeGenBase::PopulateDeoptimizationData(v8::internal::Handle<v8::internal::Code>) [/usr/local/Cellar/node/6.3.1/bin/node]
8: v8::internal::LChunk::Codegen() [/usr/local/Cellar/node/6.3.1/bin/node]
9: v8::internal::OptimizedCompileJob::GenerateCode() [/usr/local/Cellar/node/6.3.1/bin/node]
10: v8::internal::Compiler::GetConcurrentlyOptimizedCode(v8::internal::OptimizedCompileJob*) [/usr/local/Cellar/node/6.3.1/bin/node]
11: v8::internal::OptimizingCompileDispatcher::InstallOptimizedFunctions() [/usr/local/Cellar/node/6.3.1/bin/node]
12: v8::internal::StackGuard::HandleInterrupts() [/usr/local/Cellar/node/6.3.1/bin/node]
13: v8::internal::Runtime_StackGuard(int, v8::internal::Object**, v8::internal::Isolate*) [/usr/local/Cellar/node/6.3.1/bin/node]
14: 0xdab4e60961b
15: 0xdab4e7f1185
16: 0xdab4e7b8f98
[1] 18102 abort npm run setup
到底发生了什么? Stream API 的全部意义不就是不必一次性在内存中存有大量数据,而是能够逐段处理它们吗?看起来错误是直接来自 csv 解析库,对吗?
最佳答案
经过一些调试,我发现内存泄漏出在我使用的第三方库中(具体来说是 nedb)。
我认为它本来也不适合存储这么多文档,所以我决定把它换掉。
我发现一些文章对于解决这个问题很有用:
关于node.js - NodeJS 流超出堆,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/38964964/