node.js - 如何使用 Puppeteer 抓取无限滚动网站

标签 node.js web-scraping puppeteer

<分区>

我正在尝试抓取一个无限滚动的网站。

我已经在代码中控制了滚动，但脚本在滚动到网页末尾后仍然会直接退出。

这是我的代码:

const puppeteer = require("puppeteer");

/**
 * Opens the men's polos / t-shirts product listing under `url`, scrolls down
 * one viewport at a time so that lazily loaded products get rendered, then
 * collects the image URL and brand name of every `.product-wrap` element.
 *
 * The page height is re-measured after every scroll step. Measuring it only
 * once before scrolling makes the loop stop at the initial height, even though
 * an infinite-scroll page keeps growing as more content is loaded.
 *
 * @param {string} url - Base URL of the site; the listing path and query
 *   string are appended to it.
 * @param {Function} callBack - Invoked as `callBack(products, true)` once
 *   scraping has finished. NOTE(review): the meaning of the second argument is
 *   defined by the caller and is not visible here.
 * @param {Object} [options] - Optional tuning knobs.
 * @param {number} [options.scrollPauseMs=1600] - Pause after each scroll step
 *   so newly requested content has time to render.
 * @param {number} [options.maxScrolls=500] - Upper bound on scroll steps, so a
 *   feed that never ends cannot keep the scraper running forever.
 * @returns {Promise<void>} Resolves after the callback ran and the browser
 *   has been closed.
 */
module.exports.scraper = async (url, callBack, { scrollPauseMs = 1600, maxScrolls = 500 } = {}) => {
    const browser = await puppeteer.launch({ headless: false });

    try {
        const page = await browser.newPage();

        await page.setUserAgent(
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
        );

        await page.setViewport({ width: 1200, height: 768 });

        const wait = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

        // Current rendered height of <body>; 0 when it has no bounding box.
        const measureBodyHeight = async () => {
            const bodyHandle = await page.$("body");
            try {
                const box = await bodyHandle.boundingBox();
                return box ? box.height : 0;
            } finally {
                await bodyHandle.dispose();
            }
        };

        await page.goto(`${url}/products/?department=men&l2_category=polos-t-shirts`, {
            waitUntil: "networkidle0",
        });

        // Scroll one viewport at a time, pausing to let content load. The
        // height is measured again after each step, so content appended by
        // the infinite scroll extends the loop instead of being cut off.
        const viewportHeight = page.viewport().height;
        let scrolledTo = 0;
        let pageHeight = await measureBodyHeight();
        for (
            let scrolls = 0;
            scrolls < maxScrolls && scrolledTo + viewportHeight < pageHeight;
            scrolls += 1
        ) {
            await page.evaluate((step) => {
                window.scrollBy(0, step);
            }, viewportHeight);
            await wait(scrollPauseMs);
            scrolledTo += viewportHeight;
            pageHeight = await measureBodyHeight();
        }

        const data = await page.evaluate(() => {
            window.scrollTo(0, 0);
            const products = [];
            const productElements = document.querySelectorAll(".product-wrap");

            productElements.forEach((productElement) => {
                const productJson = {};
                try {
                    productJson.imageUrl = productElement.querySelector(".renderedImg").src;
                    productJson.brandName = productElement.querySelector(
                        ".brand-name",
                    ).innerText;
                } catch (e) {
                    // Best effort: a product with a missing image or brand
                    // node is still reported, with whatever fields were read.
                    console.log(e);
                }
                products.push(productJson);
            });
            return products;
        });

        await wait(100);
        callBack(data, true);
    } finally {
        // Always release the browser, even when navigation, scrolling,
        // extraction or the callback throws.
        await browser.close();
    }
};

遇到这种情况怎么抓取?

最佳答案

下面是处理无限滚动的一种策略：在循环中反复执行“滚动，然后比较位置”，直到滚动不再起作用为止。也就是说，如果我们让页面继续滚动，但 scrollTop 的值仍与上一次迭代时相同，就认为已经滚动到底部。在极端情况下，浏览器最终会耗尽堆内存并崩溃，但对于一般的站点，可以把这段代码作为起点：

const puppeteer = require('puppeteer');
const url = 'https://example.com';

/**
 * Loads `url`, keeps scrolling the document down in 1000px steps until a step
 * no longer changes `scrollTop`, then saves a full-page screenshot to
 * `example.png`.
 *
 * The browser is closed in a `finally` block so that a navigation error or a
 * `waitForFunction` timeout does not leave a browser process behind, and any
 * failure is reported through the trailing `.catch()`.
 */
(async () => {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();

    // Mirror the page's console output in the Node.js console.
    page.on('console', async msg => {
      try {
        const args = msg.args();
        const vals = [];
        for (let i = 0; i < args.length; i++) {
          vals.push(await args[i].jsonValue());
        }
        console.log(vals.join('\t'));
      } catch (err) {
        // jsonValue() can reject; fall back to the message's plain text
        // instead of leaving an unhandled rejection in the listener.
        console.log(msg.text());
      }
    });

    await page.goto(url);
    await page.evaluate(()=> {

      const wait = (duration) => {
        console.log('waiting', duration);
        return new Promise(resolve => setTimeout(resolve, duration));
      };

      // Deliberately not returned/awaited: evaluate() returns right away and
      // completion is signalled through window.atBottom, which is polled
      // below with its own timeout.
      (async () => {

        window.atBottom = false;
        const scroller = document.documentElement;  // usually what you want to scroll, but not always
        let lastPosition = -1;
        while(!window.atBottom) {
          scroller.scrollTop += 1000;
          // scrolling down all at once has pitfalls on some sites: scroller.scrollTop = scroller.scrollHeight;
          await wait(300);
          const currentPosition = scroller.scrollTop;
          if (currentPosition > lastPosition) {
            console.log('currentPosition', currentPosition);
            lastPosition = currentPosition;
          }
          else {
            // The scroll had no effect, so treat this as the bottom.
            window.atBottom = true;
          }
        }
        console.log('Done!');

      })();

    });

    await page.waitForFunction('window.atBottom == true', {
      timeout: 900000,
      polling: 1000 // poll for finish every second
    });

    await page.screenshot({path: 'example.png', fullPage: true});
  } finally {
    // Release the browser on success and on failure.
    await browser.close();
  }
})().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});

关于node.js - 如何使用 Puppeteer 抓取无限滚动网站,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/64227537/

相关文章:

javascript - 使用nodejs安装websocket

RSelenium:使用窗口句柄切换窗口

javascript - Puppeteer 中的滚动问题

javascript - 在 puppeteer 中加载页面之前设置本地存储项目?

arrays - Mongoose 从 findByIdAndUpdate 中删除数组(继承模式)

node.js - 如何在 mongoose 中保存从 query.exec() 函数返回的对象

node.js - 从 Google Compute Engine 外部访问 Node.js 服务器

python - 网页抓取 : How to extract just the Information that I need

node.js - 如何在 Node.js 中高效地进行网页抓取?

javascript - Puppeteer 的行为与开发者控制台不同