我使用 `axios` 和 `cheerio` 创建了一个脚本，用于从 yellowpages 中获取不同的商店名称及其对应的链接，然后通过这些链接从各商店的内页中抓取电话和电子邮件。脚本运行良好。
我现在想做的是利用“下一页”链接（next page link）继续抓取后续页面的内容。只是我不知道如何在 `getLinks()` 函数中加入解析并使用下一页链接的逻辑。
目前这就是我正在尝试的:
const axios = require('axios');
const cheerio = require('cheerio');
// First search-results page to fetch; the query string searches for "Pizza"
// in "San Francisco, CA".
const startUrl = 'https://www.yellowpages.com/search?search_terms=Pizza&geo_location_terms=San+Francisco%2C+CA';
// Site origin that getLinks prepends to each listing's href to build its link.
const host = 'https://www.yellowpages.com';
/**
 * Fetches one search-results page and reports every business listing on it.
 *
 * For each `a.business-name` anchor inside an element whose class attribute
 * is exactly "result", `callback(name, link)` is invoked synchronously, in
 * document order. `name` is the text of the anchor's <span> descendants and
 * `link` is the anchor's href resolved against `host`.
 *
 * @param {string} url - Address of the search-results page to download.
 * @param {string} host - Base URL used to resolve relative listing hrefs.
 * @param {(name: string, link: string) => unknown} callback - Called once per
 *   listing. It may return a promise; all returned promises are awaited.
 * @returns {Promise<void>} Resolves once every callback result has settled
 *   successfully; rejects if the request fails or any callback rejects.
 */
const getLinks = async (url, host, callback) => {
  const { data } = await axios.get(url);
  const $ = cheerio.load(data);
  const pending = [];

  // Iterate a plain array instead of `.each()`: returning the callback's
  // value from `.each()` would abort the loop if a callback returned `false`.
  for (const element of $('[class="result"] a.business-name').toArray()) {
    const anchor = $(element);
    const href = anchor.attr('href');
    // Without an href there is no page to visit, so the listing is skipped
    // rather than reported with a bogus "...undefined" link.
    if (!href) {
      continue;
    }
    const name = anchor.find('span').text();
    // `new URL` resolves relative hrefs against `host` and leaves hrefs that
    // are already absolute intact, which plain concatenation would corrupt.
    const link = new URL(href, host).href;
    pending.push(callback(name, link));
  }

  // Wait for asynchronous callbacks so callers can await the whole page and
  // so their rejections surface here instead of going unhandled.
  await Promise.all(pending);
};
/**
 * Downloads a shop's detail page and hands its contact details to `callback`.
 *
 * @param {string} shopName - Shop name, passed through to `callback` unchanged.
 * @param {string} shopLink - URL of the detail page to download.
 * @param {Function} callback - Invoked as
 *   `callback(shopName, shopLink, phone, email)`, where `phone` is the text of
 *   the first `.contact > p.phone` match and `email` is the `href` attribute
 *   of the first `.business-card-footer > a.email-business` match.
 * @returns {Promise<*>} Resolves to whatever `callback` returns.
 */
const fetchContent = async (shopName, shopLink, callback) => {
  const response = await axios.get(shopLink);
  const $ = cheerio.load(response.data);

  const phoneNode = $('.contact > p.phone').eq(0);
  const phone = phoneNode.text();

  const emailAnchor = $('.business-card-footer > a.email-business').eq(0);
  const email = emailAnchor.attr("href");

  return callback(shopName, shopLink, phone, email);
}
/**
 * Scrapes the listings found at `startUrl` and logs each shop's details.
 *
 * Every listing reported by `getLinks` is passed to `fetchContent`, and the
 * resulting `{ shopName, shopLink, phone, email }` object is written with
 * `console.log`.
 *
 * @returns {Promise<void>} Settles when the awaited `getLinks` call settles;
 *   rejects if that call rejects.
 */
async function scrapeData() {
  // Await the listing pass so failures reach the caller instead of being
  // lost on a floating promise.
  await getLinks(startUrl, host, async (itemName, link) => {
    try {
      await fetchContent(itemName, link, (shopName, shopLink, phone, email) => {
        console.log({ shopName, shopLink, phone, email });
      });
    } catch (err) {
      // One unreachable shop page should not abort the rest of the run, but
      // it must not fail silently either.
      console.error(`Failed to scrape ${link}:`, err);
    }
  });
}
// Entry point: start the scrape and report a failure instead of leaving the
// returned promise floating with no rejection handler.
scrapeData().catch((err) => {
  console.error('Scraping failed:', err);
  process.exitCode = 1;
});
最佳答案
下一页链接通常可以用选择器 `[rel=next]` 匹配到，所以一般可以这样写：
/**
 * Downloads `url` with axios and loads the response body into cheerio.
 *
 * @param {string} url - Address of the page to download.
 * @returns {Promise<*>} Resolves to the value returned by `cheerio.load`.
 */
async function get(url) {
  const response = await axios.get(url);
  return cheerio.load(response.data);
}
/**
 * Walks the search results page by page, starting from the hard-coded search
 * URL and following each page's `[rel=next]` link until a page has none.
 *
 * @returns {Promise<void>} Resolves after the last page has been fetched.
 */
async function run() {
  let pageUrl = 'https://www.yellowpages.com/search?search_terms=Pizza&geo_location_terms=San+Francisco%2C+CA';
  while (pageUrl) {
    const $ = await get(pageUrl);
    // doSomething($)
    const nextHref = $('[rel=next]').attr('href');
    // Resolve against the current page so relative "next" links keep working;
    // a missing href ends the loop.
    pageUrl = nextHref ? new URL(nextHref, pageUrl).href : null;
  }
}
关于javascript - 无法在函数中应用抓取下一页的逻辑,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/69182801/