javascript - Nightmare.js和Vo.js-提前退出循环

标签 javascript nightmare

我正在使用Nightmare.js构建一个屏幕抓取工具,并使用Vo.js来控制流程。我有一个来自CSV的邮政编码列表,并循环浏览这些邮政编码,进行搜索,跟踪搜索结果中的每个链接,然后抓取该页面。但是,它似乎只使用第一个邮政编码进行搜索,似乎早就退出了循环。有任何想法吗?这是我的代码:

var Nightmare = require('nightmare'),
    vo = require('vo'),
    fs = require('fs'),
    parse = require('csv-parse'),
    csvWriter = require('csv-write-stream'),
    nightmare = Nightmare(),
    writer = csvWriter(),
    path = process.argv[2]

var searchByPostcode = function*(postcode) {
  var result = yield nightmare
    .goto(URL)
    .select('#body_umbBodyContent_BranchSearch_1_ddlRadius', 20)
    .type('input[id=body_umbBodyContent_BranchSearch_1_txtLocation]', postcode)
    .click('#body_umbBodyContent_BranchSearch_1_btnSearch')
    .wait('.resultsarea .result')
    .evaluate(function () {
      var agents = []
      var results = $('.result a, .alternate_result a')
      urls = results.map(function(r) { return BASE_URL + $(this).attr('href') })
      return urls
    })

  return result
}

var getDetail = function*(url) {
  var result = yield nightmare
    .goto(url)
    .wait('.wizard')
    .evaluate(function() {
      var company = $("div.divlabel:contains('Company:')").next('div').find('a').attr('href')
      var name = $('h1.tint').text().trim()
      var address = $('#address_container div:nth-child(2)').text().trim()
      var website = $("div.divlabel:contains('Website:')").next('div').find('a').attr('href')
      var tel = $("div.divlabel:contains('Telephone:')").next('div').text().trim()
      var email = $("div.divlabel:contains('E-Mail:')").next('div').find('a').text().trim()

      return {
        url: document.URL,
        company: company,
        name: name,
        address: address,
        website: website,
        tel: tel,
        email: email
      }
    })

  return result
}

var run = function*() {
  var agents = []
  var postcodes = fs.readFileSync(path, 'utf8').split('\n')

  for (var i = 0, len = postcodes.length; i < len; i++) {
    console.log(postcodes[i])
    var urls = yield searchByPostcode(postcodes[i])
    console.log(urls)

    for (var i = 0, len = urls.length; i < len; i++) {
      var agent = yield getDetail(urls[i])
      if (agent.email == '' & agent.company != '') {
        company = yield getDetail('http://www.arla.co.uk/' + agent.company)
        agent.website = agent.website == undefined ? company.website : agent.website
        agent.email = agent.email == '' ? company.email : agent.email
        agent.tel = agent.tel == '' ? company.tel : agent.tel
      }
      agents.push(agent)
    }

  }

  yield nightmare.end();

  return agents
}

vo(run)()
  .then(function(agents) {
    writer.pipe(fs.createWriteStream('results.csv'))
    for (var i = 0, len = agents.length; i < len; i++) {
      writer.write(agents[i])
    }
    writer.end()
  })
  .catch(e => console.error(e))

最佳答案

是的,事实证明我是个白痴。我将循环中的两个索引计数器都设置为i,因此i在第一次搜索后被设置为url计数,因此一旦解析完所有url后就退出循环。天哪!

关于javascript - Nightmare.js和Vo.js-提前退出循环,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/40016981/

相关文章:

javascript - 如何使用 beforeClose 回调取消打开弹出窗口? #放大弹出窗口

JavaScript 逻辑问题

javascript - nightmare run function queue in while loop(让循环等待队列完成)

javascript - 如何检查按钮是否被禁用?

javascript - meteor 的 Nightmare 引发异常

node.js - 如何异步执行多个 Nightmare 函数

javascript - 添加上一个下一个导航到列表项淡入/淡出循环

javascript - 与 fullcalendar 和 google calendar 相同的域

javascript - 如何调试 Mustache js 模板?

kubernetes - 在 Kubernetes 集群中运行 E2E 测试