有人可以帮我解决这个问题吗?
我正在尝试抓取网站并将收集的数据存储在 Json 文件中。我正在使用 Cheerios 和 request-promise。
Json 结构如下:公司 > 包 > 城市
"companies": [
{
"id": 0,
"name": "companyName",
"url": "https://www.url-company.net/",
"packages": [
{
"id": 0,
"name": "general",
"url": "https://www.url-company.net/package",
"cities": [
{
"id": 0,
"name": "cityName",
"url": "https://www.url-company.net/package/city",
},
...]
}
...]
}
..]
我已从此网站中提取了一系列公司。
- 每个公司都有一个特定的网址 --> 从我抓取的每个网址中 每个公司的套餐。
- 每个 PACKAGE 都有一个特定的 url --> 来自 我想抓取每个包裹的每个网址,但我不是 能够做到。
我只能填充公司和packageByCompany,但在尝试填充citiesByPackage时我迷失了:
const rp = require('request-promise');
const cheerio = require('cheerio');
const jsonfile = require('jsonfile');
const baseUrl = 'https://www.base-url-example.net';
scrapeAll();
function scrapeAll() {
return scrapeCompanies().then(function (dataCompanys) {
//Map every endpoint so we can make a request with each URL
var promises = dataCompanys.map(function (company) {
return scrapePackagesByCompany(company) // Populate each company with all the array of packages from this company
});
return Promise.all(promises);
})
.then(function(promiseArray) { // Need help here!!!!
var promise4all = Promise.all(
promiseArray.map(function(company) {
return Promise.all( // This is NOT working, I do not know how to get promises from nested arrays
company.packages.map(function(package) {
return Promise.all(
scrapeCitiesByPackage(package) // Try to populate each package with all the array of cities from this package
);
})
);
})
);
return promise4all;
})
.then(function (data) {
saveScrapedDateIntoJsonFile(data);
return data;
})
.catch(function (err) {
return Promise.reject(err);
});
}
function scrapeCompanies() {
return rp(baseUrl)
.then(function(html){
const data = [];
let companysImg = '#content section .elementor-container > .elementor-row > .elementor-element.elementor-top-column .elementor-widget-wrap .elementor-widget-image >.elementor-widget-container > .elementor-image';
let $ = cheerio.load(html);
$(companysImg).each(function(index, element){
const urlCompany = $(element).find('a').attr('href');
const imgCompany = $(element).find('img').data('lazy-src');
if (urlCompany && imgCompany) {
const nameCompany = urlCompany;
const company = {
id : index,
name: nameCompany,
url : baseUrl + urlCompany,
img: imgCompany,
};
data.push(company);
}
});
return data;
})
.catch(function(err){
//handle error
console.error('errorrr2', err);
});
}
function scrapePackagesByCompany(company) {
return rp(company.url)
.then(function(html){
company.packages = [];
let packagesImg = '#content section .elementor-container > .elementor-row > .elementor-element.elementor-top-column .elementor-widget-wrap .elementor-widget-image >.elementor-widget-container > .elementor-image';
let $ = cheerio.load(html);
$(packagesImg).each(function(index, element){
const urlPackage = $(element).find('a').attr('href');
const imgPackage = $(element).find('img').data('lazy-src');
if (urlPackage && imgPackage) {
const namePackage = urlPackage.text();
const package = {
id : index,
name: namePackage,
url : urlPackage,
img: imgPackage,
};
company.packages.push(package);
}
});
return company;
})
.catch(function(err){
//handle error
console.error('errorrr2', err);
});
}
function scrapeCitiesByPackage(insurancePackage) {
return rp(insurancePackage.url)
.then(function(html){
insurancePackage.cities = [];
let citiesLinks = '#content section .elementor-container > .elementor-row > .elementor-element .elementor-widget.elementor-widget-posts .elementor-posts-container article';
let $ = cheerio.load(html);
$(citiesLinks).each(function(index, element) {
const $linkCity = $(element).find('a');
const urlCity = $linkCity.attr('href');
const nameCity = $linkCity.text();
if (urlCity && nameCity) {
const city = {
id : index,
name: nameCity,
url : urlCity,
};
insurancePackage.cities.push(city);
}
});
return insurancePackage;
})
.catch(function(err){
//handle error
console.error('errorrr2', err);
});
}
function saveScrapedDateIntoJsonFile(data) {
jsonfile.writeFile(
'./data/company.json',
{companies : data },
//data,
{spaces: 2},
function(err) {
console.error('errorrr', err);
});
}
提前致谢:)
最佳答案
您正在尝试的内容可能会起作用,但对于 scrapePackagesByCompany()
和 scrapeCitiesByPackage()
来说,它可以更好地简单地传递数据并执行所有“组装” “在 scrapeAll()
中工作(即将交付的数组捆绑到更高级别的对象中)。
你可以这样写:
scrapeAll()
.catch(function(err) {
console.log(err);
});
function scrapeAll() {
return scrapeCompanies()
.then(function(companies) {
return Promise.all(companies.map(function(company) {
return scrapePackagesByCompany(company)
.then(function(packages) {
company.packages = packages; // assembly
return Promise.all(packages.map(function(package) {
return scrapeCitiesByPackage(package)
.then(function(cities) {
package.cities = cities; // assembly
});
}));
});
}))
.then(function() {
return saveScrapedDateIntoJsonFile(companies);
});
});
}
然后简化 scrapePackagesByCompany()
和 scrapeCitiesByPackage(package)
就相当简单了,这样它们就可以传递 packages
数组和 cities
分别为数组。
关于node.js - 如何从嵌套数组中获取 promise ?,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/57938070/