我正在尝试创建一个网络抓取工具(在 Node.js 中),它将从站点中提取信息并将其写入文件。我将其构建为可以在一页上正确工作,但是当我尝试在 for 循环中使用该函数来迭代多个游戏时,我在所有游戏中都得到了错误的数据。
我知道这与 Javascript 的异步特性有关,并且我已经阅读过有关回调函数的内容,但我不确定我是否理解如何将其应用到我的代码中。任何帮助将不胜感激:
for(x = 4648; x < 4650; x++){ //iterate over a few gameIDs, used in URL for request
scrapeGame(x);
}
function scrapeGame(gameId){
//request from URL, scrape HTML to arrays as necessary
//write final array to file
}
本质上,我想要做的是在 for 循环中,告诉它等待完成 scrapeGame(x)
函数,然后再增加 x 并为下一个游戏运行它 - 否则,数组开始互相覆盖,数据会变得一团糟。
编辑:我现在已经包含了我尝试运行的完整代码!写入文件后查看文件时出现错误。例如,第一个文件是 8kb,第二个文件是 ~16,第三个文件是 ~32,等等。在运行下一个游戏之前,似乎事情还没有被清除。
该计划的想法是从存档网站中提取 Jeopardy 问题/答案,以便最终为自己构建一个测验应用程序。
//Iterate over arbitrary number of games, scrape each
for(x = 4648; x < 4650; x++){
scrapeGame(x, function(scrapeResult) {
if(scrapeResult){
console.log('Scrape Successful');
} else {
console.log('Scrape ERROR');
}
});
}
function scrapeGame(gameId, callback){
var request = require('request');
cheerio = require('cheerio');
fs = require('fs');
categories = [];
categorylist = [];
ids = [];
clues = [];
values = ['0','$200','$400','$600','$800','$1000','$400','$800','$1200','$1600','$2000'];
valuelist = [];
answers = [];
array = [];
file = [];
status = false;
var showGameURL = 'http://www.j-archive.com/showgame.php?game_id=' + gameId;
var showAnswerURL = 'http://www.j-archive.com/showgameresponses.php?game_id=' + gameId;
request(showGameURL, function(err, resp, body){
if(!err && resp.statusCode === 200){
var $ = cheerio.load(body);
//add a row to categories to avoid starting at 0
categories.push('Category List');
//pull all categories to use for later
$('td.category_name').each(function(){
var category = $(this).text();
categories.push(category);
});
//pull all clue IDs (coordinates), store to 1d array
//pull any id that has "stuck" in the string, to prevent duplicates
$("[id*='stuck']").each(function(){
var id = $(this).attr('id');
id = id.toString();
id = id.substring(0, id.length - 6);
ids.push(id);
//if single J, pick category 1-6
if (id.indexOf("_J_") !== -1){
var catid = id.charAt(7);
categorylist.push(categories[catid]);
var valId = id.charAt(9);
valuelist.push(values[valId]);
}
//if double J, pick category 7-12
else if (id.indexOf("_DJ_") !== -1){
var catid = parseInt(id.charAt(8)) + 6;
categorylist.push(categories[catid]);
var valId = parseInt(id.charAt(10)) + 5;
valuelist.push(values[valId]);
}
//if final J, pick category 13
else {
categorylist.push(categories[13]);
}
});
//pull all clue texts, store to 1d array
$('td.clue_text').each(function(){
var clue = $(this).text();
clues.push(clue);
});
//push pulled values to big array
array.push(ids);
array.push(categorylist);
array.push(valuelist);
array.push(clues);
//new request to different URL to pull responses
request(showAnswerURL, function(err, resp, body){
if(!err && resp.statusCode === 200){
var $ = cheerio.load(body);
$('.correct_response').each(function(){
var answer = $(this).text();
answers.push(answer);
});
//push answers to big array
array.push(answers);
//combine arrays into 1-d array to prep for writing to file
for(var i = 0; i < array[0].length; i++){
var print = array[0][i] + "|" + array[1][i] + "|" + array[2][i] + "|" + array[3][i] + "|" + array[4][i];
var stringPrint = print.toString();
file.push(stringPrint);
}
//update string, add newlines, etc.
var stringFile = JSON.stringify(file);
stringFile = stringFile.split('\\').join('');
stringFile = stringFile.split('","').join('\n');
//write to file, eventually will append to end of one big file
fs.writeFile('J_GAME_' + gameId +'.txt', stringFile, function(err) {
if(err) {
console.log(err);
} else {
console.log("Game #" + gameId + " has been scraped.");
status = true;
}
});
}
});
}
});
//clear arrays used
valuelist = [];
answers = [];
categories = [];
categorylist = [];
ids = [];
clues = [];
array = [];
file = [];
//feed callback status
callback(status);
}
最佳答案
// Iterate over a few gameIDs, used in URL for request.
for (x = 4648; x < 4650; x++) {
// Pass in the callback as an anonymous function.
// So below I am passing in the id and the function I want to execute.
// AND, defining the results I am expecting as passed in arguments.
scrapeGame(x, function(scrapeResult, err) {
// This will *NOT* execute *UNTIL* you call it in the function below.
// That means that the for loop's execution is halted.
// This function receives the status that is passed in,
// in this case, a boolean true/false and an error if any.
if (scrapeResult) {
// Scrape was true, nothing to do.
// The for loop will now move on to the next iteration.
console.log('Scrape Successful');
} else {
// Scrape was false, output error to console.log and
// break loop to handle error.
console.log('Scrape ERROR :: ' + err);
// Notice we are calling break while in the
// scope of the callback function
// Remove the break if you want to just move onto
// the next game ID and not stop the loop
break;
}
});
}
// This function now accepts two arguments.
function scrapeGame(gameId, callback) {
// ************************************************
// ** Do Your Work Here **
// Request from URL, scrape HTML to arrays as necessary.
// Write final array to file.
// After file creation, execute the callback and pass bool
// status (true/false).
// ************************************************
var request = require('request'),
cheerio = require('cheerio'),
fs = require('fs'),
categories = [],
categorylist = [],
ids = [],
clues = [],
values = [
'0',
'$200',
'$400',
'$600',
'$800',
'$1000',
'$400',
'$800',
'$1200',
'$1600',
'$2000'
],
valuelist = [],
answers = [],
array = [],
file = [],
showGameURL = 'http://www.j-archive.com/showgame.php?game_id=' + gameId,
showAnswerURL = 'http://www.j-archive.com/showgameresponses.php?game_id=' + gameId;
request(showGameURL, function(err, resp, body) {
if (!err && resp.statusCode === 200) {
var $ = cheerio.load(body);
//add a row to categories to avoid starting at 0
categories.push('Category List');
//pull all categories to use for later
$('td.category_name').each(function() {
var category = $(this).text();
categories.push(category);
});
//pull all clue IDs (coordinates), store to 1d array
//pull any id that has "stuck" in the string, to prevent duplicates
$("[id*='stuck']").each(function() {
var id = $(this).attr('id');
id = id.toString();
id = id.substring(0, id.length - 6);
ids.push(id);
//if single J, pick category 1-6
if (id.indexOf("_J_") !== -1) {
var catid = id.charAt(7);
categorylist.push(categories[catid]);
var valId = id.charAt(9);
valuelist.push(values[valId]);
}
//if double J, pick category 7-12
else if (id.indexOf("_DJ_") !== -1) {
var catid = parseInt(id.charAt(8)) + 6;
categorylist.push(categories[catid]);
var valId = parseInt(id.charAt(10)) + 5;
valuelist.push(values[valId]);
}
//if final J, pick category 13
else {
categorylist.push(categories[13]);
}
});
//pull all clue texts, store to 1d array
$('td.clue_text').each(function() {
var clue = $(this).text();
clues.push(clue);
});
//push pulled values to big array
array.push(ids);
array.push(categorylist);
array.push(valuelist);
array.push(clues);
//new request to different URL to pull responses
request(showAnswerURL, function(err, resp, body) {
if (!err && resp.statusCode === 200) {
var $ = cheerio.load(body);
$('.correct_response').each(function() {
var answer = $(this).text();
answers.push(answer);
});
//push answers to big array
array.push(answers);
//combine arrays into 1-d array to prep for writing to file
for (var i = 0; i < array[0].length; i++) {
var print = array[0][i] + "|" + array[1][i] + "|" + array[2][i] + "|" + array[3][i] + "|" + array[4][i];
var stringPrint = print.toString();
file.push(stringPrint);
}
//update string, add newlines, etc.
var stringFile = JSON.stringify(file);
stringFile = stringFile.split('\\').join('');
stringFile = stringFile.split('","').join('\n');
//write to file, eventually will append to end of one big file
fs.writeFile('J_GAME_' + gameId + '.txt', stringFile, function(err) {
//clear arrays used
valuelist = [];
answers = [];
categories = [];
categorylist = [];
ids = [];
clues = [];
array = [];
file = [];
if (err) {
// ******************************************
// Callback false with error.
callback(false, err);
// ******************************************
} else {
console.log("Game #" + gameId + " has been scraped.");
// ******************************************
// Callback true with no error.
callback(true);
// ******************************************
}
});
}
});
}
});
}
关于javascript - 等待 Javascript 网页抓取功能完成后再运行下一页?,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/27975316/