node.js - 无法使用链接获取不同的标题

我在 node 中创建了一个脚本，使用 promise 结合 request 和 cheerio 来解析此 webpage 中 Province 列下的 links，然后重新使用这些链接从所有此类 pages 中抓取 Office 列下的所有 url，最后使用这些 links 从所有这些 target pages 中收集 title，如本页中的 Cairos main Post Office 所示。

我当前的脚本大部分时间都会卡住。但是，有时它会抛出此错误：UnhandledPromiseRejectionWarning: TypeError: Cannot read property 'parent' of undefined。我逐一检查了每个函数，发现它们单独运行时都能正常工作。

虽然脚本看起来有点大，但它是建立在一个非常简单的逻辑之上的，即从着陆页开始逐层跟随每个 link，直到到达目标页面并取得其中的 title。

这是我目前的尝试:

const request = require('request');
const cheerio = require('cheerio');

// Landing page requested by getLinks().
const link = 'https://www.egyptcodebase.com/en/p/all';
// Prefix prepended to every href scraped from a table row.
const base_link = 'https://www.egyptcodebase.com/en/';

// Module-level accumulator filled by getLinks() with the URLs it scrapes.
const items = [];
// Module-level accumulator filled by getData() with the URLs it scrapes.
const nitems = [];

/**
 * Request the landing page (`link`) and collect the URL found in each row of
 * its `.table` body.
 *
 * Every discovered URL is appended to the module-level `items` array, which
 * is also the value the returned promise resolves with.
 *
 * @returns {Promise<string[]>} Resolves with `items`. Rejects with the request
 *   error, or with any error thrown while parsing the response body.
 */
const getLinks = () => {
    return new Promise((resolve, reject) => {
        request(link, function(error, response, html) {
            // Check the request error before touching `html`: on failure there
            // is no body, and parsing it can throw inside this callback
            // (outside the promise) instead of producing a rejection.
            if (error) return reject(error);
            try {
                const $ = cheerio.load(html);
                $('.table tbody tr').each(function() {
                    const href = $(this).find("a").attr("href");
                    // Skip rows without an anchor; concatenating an undefined
                    // href would produce a bogus ".../undefined" URL.
                    if (href) items.push(base_link + href);
                });
                resolve(items);
            } catch (e) {
                reject(e);
            }
        });
    });
};

/**
 * Request every URL in `links` and collect the URL found in each row of each
 * page's `.table` body.
 *
 * Each discovered URL is still appended to the module-level `nitems` array.
 * The returned promise resolves with one flat array of URLs, ordered by the
 * input `links`, so that it can be handed directly to a function that expects
 * a list of URL strings. (Resolving every page with the shared accumulator
 * would instead yield an array of identical arrays.)
 *
 * @param {string[]} links - Page URLs to request.
 * @returns {Promise<string[]>} Flat list of URLs scraped from all pages.
 *   Rejects with the first request or parsing error.
 */
const getData = (links) => {
    const promises = links
        .map(nurl => new Promise((resolve, reject) => {
            request(nurl, function(error, response, html) {
                // Check the request error before touching `html`: parsing a
                // missing body can throw inside this callback instead of
                // rejecting the promise.
                if (error) return reject(error);
                try {
                    const $ = cheerio.load(html);
                    const pageLinks = [];
                    $('.table tbody tr').each(function() {
                        const href = $(this).find("a").attr("href");
                        // Skip rows without an anchor to avoid ".../undefined".
                        if (!href) return;
                        const url = base_link + href;
                        pageLinks.push(url);
                        nitems.push(url);
                    });
                    resolve(pageLinks);
                } catch (e) {
                    reject(e);
                }
            });
        }));

    // Merge the per-page arrays into a single list of URLs.
    return Promise.all(promises).then(pages => [].concat(...pages));
};

/**
 * Request every URL in `links` and extract the text of the first
 * `.home-title > h2` element on each page.
 *
 * @param {string[]} links - Target page URLs to request.
 * @returns {Promise<string[]>} Titles in the same order as `links`; a page
 *   without a matching element contributes an empty string. Rejects with the
 *   first request or parsing error.
 */
const FetchData = (links) => {
    const promises = links
        .map(nurl => new Promise((resolve, reject) => {
            request(nurl, function(error, response, html) {
                // Check the request error before touching `html`: parsing a
                // missing body can throw inside this callback instead of
                // rejecting the promise.
                if (error) return reject(error);
                try {
                    const $ = cheerio.load(html);
                    resolve($(".home-title > h2").eq(0).text());
                } catch (e) {
                    reject(e);
                }
            });
        }));

    return Promise.all(promises);
};

// Run the three scraping stages in sequence. Each stage's promise is returned
// so the chain stays flat and a rejection from any stage reaches the single
// catch handler instead of surfacing as an UnhandledPromiseRejectionWarning.
getLinks()
    .then(resultList => getData(resultList))
    .then(resultSet => FetchData(resultSet))
    .then(title => {
        console.log(title);
    })
    .catch(error => {
        console.error(error);
        // Signal failure to the caller without aborting pending output.
        process.exitCode = 1;
    });

How can I scrape the titles from target pages making use of all the links from landing pages?

最佳答案

向网站所有者询问您需要的数据会容易得多。
他可能会理解您的要求并免费提供给您，而不是抓取他的网站。

P.S: I was surprised to find a question about how to scrape my own website.
P.S2: If you just need all post office titles I could have given it for you for free :D
P.S3: Your error may be happening because sometimes the page doesn't have the element that you are trying to parse using cheerio.

关于node.js - 无法使用链接获取不同的标题，我们在Stack Overflow上找到一个类似的问题： https://stackoverflow.com/questions/55684725/

node.js - 无法使用链接获取不同的标题

上一篇：javascript - 在 node.js 中检测字符串的语言

下一篇：javascript - 如何在 node.js 和 express 中导出路由？