javascript - NodeJS Express 网络抓取 header 问题

标签 javascript node.js express

我正在抓取一个粉丝网站以获取角色信息,并将其显示在我的 Web 应用程序中,但我遇到了 "Can't set headers after they are sent"(响应发送后无法再设置响应头)的问题。我试图在请求中使用 promise,但我想我可能对自己的代码实际在做什么存在根本性的误解。

最终目标是通过遍历 Boss 名称数组来抓取 100 多页数据,将数据存储在数组中,最后将其导出以备后用。目前我已经能够把数据存进数组,但尽管代码能够执行并抓取到数据,仍然会报错。

server.js

var express = require('express');
var cheerio = require('cheerio');
var app = express();
var rp = require('request-promise');
// NOTE: fsp is required but not used anywhere in this listing.
var fsp = require('fs-promise');

// GET / : requests every entry of `urls`, collects the page titles into
// `bosses`, and tries to answer with 'Bosses Updated.'.
app.get('/', function(req, res){

  // NOTE: assigned without var/let/const, so in non-strict code this
  // creates/overwrites a global named `urls`.
  urls = [
    'fansite/boss1', 'fansite/boss2'
  ];

  // Accumulator shared by every parse() call made while handling this request.
  var bosses = [];

  // Loads `html` with cheerio, appends the text of each `.page-header__title`
  // element to `bosses`, logs the array and returns that same shared array.
  function parse(html) {

    var $ = cheerio.load(html);

    // .filter() is used purely to iterate: the callback returns nothing and
    // the filtered result is discarded.
    $('.page-header__title').filter(function () {
      var data = $(this);
      // NOTE: `name` is also assigned without a declaration keyword.
      name = data.text();
      bosses.push(name);
    })
    console.log(bosses);
    return bosses;
  }

  // One independent promise chain is started per URL.
  // NOTE: `res.send('Bosses Updated.')` below is a call expression, so it is
  // evaluated while the chain is being built and its return value (not a
  // callback) is what gets passed to .then(). Because it sits inside
  // forEach, it is evaluated once per URL, i.e. more than once for the same
  // response, and it does not wait for rp(url) or parse to finish.
  urls.forEach(function (url) {
    rp(url)
    .then(parse)
    .then(res.send('Bosses Updated.'))  
    .catch(err => console.log('Error:', err));
  });
})

// The port is passed as the string '8081'.
app.listen('8081')
// Logged immediately after listen() is called, not from a listen callback.
console.log('Running on port 8081');
// Export the Express app.
exports = module.exports = app;

输出:

node server.js start
Running on port 8081
[ 'Obor' ]
[ 'Obor', 'Zulrah' ]
Error: Error: Can't set headers after they are sent.
    at ServerResponse.OutgoingMessage.setHeader (_http_outgoing.js:356:11)
    at ServerResponse.header (/Users/aaron/Personal Projects/node-scraper/node_modules/express/lib/response.js:767:10)
    at ServerResponse.send (/Users/aaron/Personal Projects/node-scraper/node_modules/express/lib/response.js:170:12)
    at rp.then.then (/Users/aaron/Personal Projects/node-scraper/server.js:31:21)
    at tryCatcher (/Users/aaron/Personal Projects/node-scraper/node_modules/bluebird/js/release/util.js:16:23)
    at Promise._settlePromiseFromHandler (/Users/aaron/Personal Projects/node-scraper/node_modules/bluebird/js/release/promise.js:512:31)
    at Promise._settlePromise (/Users/aaron/Personal Projects/node-scraper/node_modules/bluebird/js/release/promise.js:569:18)
    at Promise._settlePromise0 (/Users/aaron/Personal Projects/node-scraper/node_modules/bluebird/js/release/promise.js:614:10)
    at Promise._settlePromises (/Users/aaron/Personal Projects/node-scraper/node_modules/bluebird/js/release/promise.js:693:18)
    at Async._drainQueue (/Users/aaron/Personal Projects/node-scraper/node_modules/bluebird/js/release/async.js:133:16)
    at Async._drainQueues (/Users/aaron/Personal Projects/node-scraper/node_modules/bluebird/js/release/async.js:143:10)
    at Immediate.Async.drainQueues (/Users/aaron/Personal Projects/node-scraper/node_modules/bluebird/js/release/async.js:17:14)
    at runCallback (timers.js:672:20)
    at tryOnImmediate (timers.js:645:5)
    at processImmediate [as _immediateCallback] (timers.js:617:5)

最佳答案

如果你想等待所有的 url 都被处理完之后再发送响应,可以使用 Promise.all(下面给出两种等价的写法:普通函数形式和箭头函数形式):

// Start one request per URL in parallel and reply exactly once, after every
// page has been fetched and parsed.
Promise.all(urls.map(function (url) {
  return rp(url).then(parse);
}))
.then(() => res.send('Bosses Updated.'))
.catch(err => {
  console.log('Error:', err);
  // Always answer the client: without a reply on failure the request would
  // hang until it times out. Skip if a response has already gone out.
  if (!res.headersSent) {
    res.status(500).send('Failed to update bosses.');
  }
});

// Same approach written compactly with arrow functions: wait for every URL
// to be fetched and parsed, then reply exactly once.
Promise.all(urls.map(url => rp(url).then(parse)))
.then(() => res.send('Bosses Updated.'))
.catch(err => {
  console.log('Error:', err);
  // Always answer the client: without a reply on failure the request would
  // hang until it times out. Skip if a response has already gone out.
  if (!res.headersSent) {
    res.status(500).send('Failed to update bosses.');
  }
});

关于javascript - NodeJS Express 网络抓取 header 问题,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/48533672/

相关文章:

javascript - 如何从桌面通知中覆盖或删除签名 "electron.app.Electron"

javascript - JS 库的 TypeScript 类型的 API 文档

node.js - 通过Postman和httpie访问CRUD路由时身份验证失败

javascript - 保持文本动态更新并位于 Canvas 中央

node.js - 使用 Node 和 Angular 5 从 Linux 服务器下载 zip 文件时,zip 文件成功下载但为空

java - NodeJS 服务器到 Tomcat 服务器的 CORS 问题

node.js - 对于路径 "586d62878fc14d30e0ac5379"处的值 "_id",转换为 ObjectId 失败

javascript - 1x1 像素图像技术是跨域捕获分析数据的好方法吗?

javascript奇怪的编码

javascript - 从 javascript 对象中删除较少的序列计数