javascript - 等待 Javascript 网页抓取功能完成后再运行下一页?

标签 javascript jquery json node.js

我正在尝试创建一个网络抓取工具(在 Node.js 中),它从站点中提取信息并将其写入文件。它在单个页面上可以正常工作,但是当我在 for 循环中调用该函数来遍历多场游戏时,所有游戏得到的数据都是错误的。

我知道这与 Javascript 的异步特性有关,并且我已经阅读过有关回调函数的内容,但我不确定我是否理解如何将其应用到我的代码中。任何帮助将不胜感激:

// Start a scrape for every game ID from 4648 up to (not including) 4650;
// the ID is what scrapeGame uses in the request URL.
x = 4648;
while (x < 4650) {
    scrapeGame(x);
    x++;
}

// Outline of the scraper for a single game, identified by gameId.
// The body is only sketched here as comments.
function scrapeGame(gameId){
    //request from URL, scrape HTML to arrays as necessary
    //write final array to file
}

本质上,我想要做的是在 for 循环中,告诉它等待完成 scrapeGame(x) 函数,然后再增加 x 并为下一个游戏运行它 - 否则,数组开始互相覆盖,数据会变得一团糟。

编辑:我现在附上了我正在运行的完整代码!问题出现在写入文件之后查看文件时。例如,第一个文件是 8 KB,第二个约 16 KB,第三个约 32 KB,依此类推。看起来在抓取下一场游戏之前,之前的数据没有被清空。

这个程序的想法是从存档网站中提取 Jeopardy 的问题/答案,以便最终为自己构建一个测验应用程序。

//Iterate over arbitrary number of games, scrape each

// Start a scrape for every game ID from 4648 up to (not including) 4650 and
// log the status each scrape reports through its callback.
x = 4648;
while (x < 4650) {
    scrapeGame(x, function(wasScraped) {
        console.log(wasScraped ? 'Scrape Successful' : 'Scrape ERROR');
    });
    x++;
}

// Scrapes one game: requests the clue page, then the responses page, and
// writes the combined rows to 'J_GAME_<gameId>.txt'. `callback` is invoked
// with the value of `status` (see the notes at the bottom of the function).
function scrapeGame(gameId, callback){
    // NOTE: the next line ends with ';', so only `request` is declared by
    // `var`. Each following line down to `status` is a bare assignment to an
    // undeclared name, i.e. a global that every scrapeGame call shares.
    var request = require('request');
        cheerio = require('cheerio');
        fs = require('fs');
        categories = [];
        categorylist = [];
        ids = [];
        clues = [];
        values = ['0','$200','$400','$600','$800','$1000','$400','$800','$1200','$1600','$2000'];
        valuelist = [];
        answers = [];
        array = [];
        file = [];
        status = false;

    var showGameURL = 'http://www.j-archive.com/showgame.php?game_id=' + gameId;
    var showAnswerURL = 'http://www.j-archive.com/showgameresponses.php?game_id=' + gameId;

    // NOTE: there is no else branch for a failed request or a non-200 status,
    // so such a failure is not logged or reported anywhere.
    request(showGameURL, function(err, resp, body){ 
    if(!err && resp.statusCode === 200){
        var $ = cheerio.load(body);
        //add a row to categories to avoid starting at 0
        categories.push('Category List');
        //pull all categories to use for later
        $('td.category_name').each(function(){
            var category = $(this).text();
            categories.push(category);
        });
        //pull all clue IDs (coordinates), store to 1d array
        //pull any id that has "stuck" in the string, to prevent duplicates
        $("[id*='stuck']").each(function(){
            var id = $(this).attr('id');
            id = id.toString();
            // drop the last 6 characters of the id
            id = id.substring(0, id.length - 6);
            ids.push(id);
            //if single J, pick category 1-6
            if (id.indexOf("_J_") !== -1){
                var catid = id.charAt(7);
                categorylist.push(categories[catid]);
                var valId = id.charAt(9);
                valuelist.push(values[valId]);
            }
            //if double J, pick category 7-12
            else if (id.indexOf("_DJ_") !== -1){
                var catid = parseInt(id.charAt(8)) + 6;
                categorylist.push(categories[catid]);
                // +5 skips the first six entries of `values`
                var valId = parseInt(id.charAt(10)) + 5;
                valuelist.push(values[valId]);                
            }
            //if final J, pick category 13
            // (no entry is pushed to valuelist in this branch)
            else {
                categorylist.push(categories[13]);
            }
        });
        //pull all clue texts, store to 1d array
        $('td.clue_text').each(function(){
            var clue = $(this).text();
            clues.push(clue);
        });
        //push pulled values to big array
        array.push(ids);
        array.push(categorylist);
        array.push(valuelist);
        array.push(clues);

        //new request to different URL to pull responses
        request(showAnswerURL, function(err, resp, body){ 
            if(!err && resp.statusCode === 200){
                var $ = cheerio.load(body);

                $('.correct_response').each(function(){
                    var answer = $(this).text();
                    answers.push(answer);
                });
                //push answers to big array
                array.push(answers);
                //combine arrays into 1-d array to prep for writing to file
                // one row per clue id: id|category|value|clue|answer
                for(var i = 0; i < array[0].length; i++){
                    var print = array[0][i] + "|" + array[1][i] + "|" + array[2][i] + "|" + array[3][i] + "|" + array[4][i];
                    var stringPrint = print.toString();
                    file.push(stringPrint);
                }
                //update string, add newlines, etc.
                // serialize to JSON, strip every backslash, then turn the
                // '","' separators between rows into newlines
                var stringFile = JSON.stringify(file);
                stringFile = stringFile.split('\\').join('');
                stringFile = stringFile.split('","').join('\n');
                //write to file, eventually will append to end of one big file
                fs.writeFile('J_GAME_' + gameId +'.txt', stringFile, function(err) {
                    if(err) {
                        console.log(err);
                    } else {
                        console.log("Game #" + gameId + " has been scraped.");
                        status = true;
                    }
                });
            }
        });
    }
});
        //clear arrays used
        // NOTE: these statements sit outside both request callbacks, so they
        // run right after the first request has been issued rather than after
        // the file has been written.
        valuelist = [];
        answers = [];
        categories = [];
        categorylist = [];
        ids = [];
        clues = [];
        array = [];
        file = [];
        //feed callback status
        // NOTE: the only assignment of true to `status` is inside the
        // fs.writeFile callback above; nothing here waits for that callback.
        callback(status);
}

最佳答案

// Iterate over a few gameIDs, used in URL for request.
//
// A plain `for` loop cannot wait for an asynchronous callback: it would start
// every scrape immediately. `break` is also a syntax error inside a callback
// function, because the callback is not part of the loop body. Instead, each
// scrape is started from inside the callback of the previous one, so the
// games are processed strictly one at a time.

/**
 * Scrapes the games gameId, gameId + 1, ..., endId - 1 one after another.
 *
 * @param {number} gameId - ID of the game to scrape now.
 * @param {number} endId - First ID that is NOT scraped (exclusive bound).
 */
function scrapeGames(gameId, endId) {
  if (gameId >= endId) {
    // Every requested game has been handled.
    return;
  }

  // Pass in the id and the function to execute once the scrape has finished.
  // The callback receives a boolean status and, on failure, an error.
  scrapeGame(gameId, function(scrapeResult, err) {
    if (scrapeResult) {
      console.log('Scrape Successful');
      // Only now move on to the next game ID.
      scrapeGames(gameId + 1, endId);
    } else {
      // Scrape failed: report it and stop, because no further scrape is
      // scheduled. To skip the failed game and carry on instead, call
      // scrapeGames(gameId + 1, endId) here as well.
      console.log('Scrape ERROR :: ' + err);
    }
  });
}

scrapeGames(4648, 4650);

/**
 * Scrapes a single game and writes it to `J_GAME_<gameId>.txt`.
 *
 * Two pages are requested in sequence: the clue page (categories, clue IDs,
 * clue texts) and then the responses page (correct responses). One
 * "id|category|value|clue|answer" row is produced per clue ID.
 *
 * All working arrays are local to this call, so concurrent or repeated calls
 * do not share any state and nothing needs to be cleared afterwards.
 *
 * @param {number} gameId - Game ID used in both request URLs and the file name.
 * @param {function(boolean, *=)} callback - Invoked exactly once:
 *     `callback(true)` after the file has been written, or
 *     `callback(false, err)` if either request or the file write fails.
 */
function scrapeGame(gameId, callback) {
  var request = require('request'),
      cheerio = require('cheerio'),
      fs = require('fs'),
      // Index 0 is a placeholder so that category numbers start at 1.
      categories = ['Category List'],
      categorylist = [],
      ids = [],
      clues = [],
      // Index 0 is a placeholder; 1-5 are looked up for "_J_" clues,
      // 6-10 for "_DJ_" clues.
      values = [
          '0',
          '$200',
          '$400',
          '$600',
          '$800',
          '$1000',
          '$400',
          '$800',
          '$1200',
          '$1600',
          '$2000'
      ],
      valuelist = [],
      answers = [],
      file = [],
      showGameURL = 'http://www.j-archive.com/showgame.php?game_id=' + gameId,
      showAnswerURL = 'http://www.j-archive.com/showgameresponses.php?game_id=' + gameId;

  // Builds the error reported to the callback for a failed request: the
  // transport error if there is one, otherwise an error naming the status.
  function requestFailure(url, err, resp) {
    return err || new Error('Unexpected HTTP status ' + resp.statusCode + ' for ' + url);
  }

  request(showGameURL, function(err, resp, body) {
    if (err || resp.statusCode !== 200) {
      // Without this the caller would wait forever for its callback.
      callback(false, requestFailure(showGameURL, err, resp));
      return;
    }

    var $ = cheerio.load(body);

    //pull all categories to use for later
    $('td.category_name').each(function() {
      categories.push($(this).text());
    });

    //pull all clue IDs (coordinates), store to 1d array
    //pull any id that has "stuck" in the string, to prevent duplicates
    $("[id*='stuck']").each(function() {
      var id = $(this).attr('id').toString();
      // Drop the last 6 characters of the id.
      id = id.substring(0, id.length - 6);
      ids.push(id);

      if (id.indexOf("_J_") !== -1) {
        //single J: categories 1-6, values 1-5
        categorylist.push(categories[parseInt(id.charAt(7), 10)]);
        valuelist.push(values[parseInt(id.charAt(9), 10)]);
      } else if (id.indexOf("_DJ_") !== -1) {
        //double J: categories 7-12, values 6-10
        categorylist.push(categories[parseInt(id.charAt(8), 10) + 6]);
        valuelist.push(values[parseInt(id.charAt(10), 10) + 5]);
      } else {
        //final J: category 13 (no value is recorded for it)
        categorylist.push(categories[13]);
      }
    });

    //pull all clue texts, store to 1d array
    $('td.clue_text').each(function() {
      clues.push($(this).text());
    });

    //new request to different URL to pull responses
    request(showAnswerURL, function(err, resp, body) {
      if (err || resp.statusCode !== 200) {
        callback(false, requestFailure(showAnswerURL, err, resp));
        return;
      }

      var $ = cheerio.load(body);

      $('.correct_response').each(function() {
        answers.push($(this).text());
      });

      //combine the columns into one row per clue ID
      for (var i = 0; i < ids.length; i++) {
        file.push(ids[i] + "|" + categorylist[i] + "|" + valuelist[i] + "|" + clues[i] + "|" + answers[i]);
      }

      //serialize, strip every backslash, then turn the '","' separators
      //between rows into newlines
      var stringFile = JSON.stringify(file);
      stringFile = stringFile.split('\\').join('');
      stringFile = stringFile.split('","').join('\n');

      //write to file, eventually will append to end of one big file
      fs.writeFile('J_GAME_' + gameId + '.txt', stringFile, function(err) {
        if (err) {
          // Callback false with error.
          callback(false, err);
        } else {
          console.log("Game #" + gameId + " has been scraped.");
          // Callback true with no error.
          callback(true);
        }
      });
    });
  });
}

关于javascript - 等待 Javascript 网页抓取功能完成后再运行下一页?,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/27975316/

相关文章:

javascript - Chrome 开发者工具暂停 jQuery 1.7 的初始化

javascript - 合并然后排序 2 个 feed

javascript - jQuery.ajax() 解析器错误

javascript - jquery 重新加载后未解析 dojo 组件

javascript - event.waitUntil 在 Service Worker 中做什么以及为什么需要它?

jquery - 如何防止 bootstrap-3 菜单折叠

javascript - 使用 JavaScript 从 JSON 获取特定值

javascript - 样式组件未将样式应用于自定义功能 react 组件

javascript - 合并两个 JavaScript 脚本

javascript - jQuery 动画回调不起作用