css - Puppeteer - 无法在酒店网站中拉出正确的 CSS 选择器

标签 css node.js web-scraping puppeteer

目标:将所有酒店名称保存在 .txt 文件中(点击网站中的输入后)。

问题:我认为我的 CSS 选择器 ('[class^="js-hotel-location"] > [class^="js-hotel-name"] > span.l-property-name') 可能不正确,因为我的 candidates 函数返回一个空数组;而且如果去掉 if(!nameEl) return false; 这一行,就会收到 TypeError: Cannot read properties of undefined (reading 'trim')。

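candidates_name.txt 的预期输出: ["Aloft Chicago Downtown River North", "The Gwen, a Luxury Collection Hotel, Michigan Avenue Chicago", "Sheraton Grand Chicago Riverwalk"](即芝加哥市中心北河雅乐轩酒店、芝加哥密歇根大道格温豪华精选酒店、芝加哥河滨步道喜来登大酒店)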

实际输出: []

enter image description here

导致问题的代码: 返回空数组的 candidates 函数:

    // ! STUCK: candidates function returning empty array - likely CSS selector is incorrect
    // Collects the trimmed text of every hotel-name span matched on the results page.
    // NOTE(review): each `>` combinator only matches a DIRECT child; if the page has any
    // wrapper element between these nodes, nothing matches and the callback receives an
    // empty array — verify the selector in the browser console.
    const candidates = await page.$$eval('[class^="js-hotel-location"] > [class^="js-hotel-name"] > span.l-property-name', elements => {
        return elements
          .filter(el => {
            // NOTE(review): `el` is already the matched span and querySelector only searches
            // descendants of `el`, so this lookup presumably yields null for every element and
            // the guard below then drops all of them — confirm against the page markup.
            const nameEl = el.querySelector('[class^="js-hotel-location"] > [class^="js-hotel-name"] > span.l-property-name');
            if(!nameEl) return false;
            const name = nameEl.textContent.trim();
            return name !== "";
          })
          // Reduce each surviving span to its trimmed text.
          .map(el => el.textContent.trim());
    });
    // Write the collected names to disk as a JSON array.
    // NOTE(review): this is the callback-style signature; confirm which `fs` module is in
    // scope where this snippet runs (a promise-based `fs` would need `await` instead).
    fs.writeFile('candidates_name.txt', JSON.stringify(candidates), function (err) {
        if (err) throw err;
    });

重现代码:

constants.js(注意:index.js 中通过 require('./marriot_constants.js') 引入,实际文件名需与该路径保持一致):

module.exports = {
    URL: "https://www.marriott.com/default.mi",
    DESTINATION: "Chicago, IL, USA",
    START_MONTH: "August",
    START_DAY: 7,
    END_MONTH: "August",
    END_DAY: 10,
    PROMO_CODE: "MMP"
}

index.js:

const puppeteer = require('puppeteer-extra');
const constants = require('./marriot_constants.js');
const blockResources = require('puppeteer-extra-plugin-block-resources');
const fs = require('fs/promises');

// ! NOT WORKING: Configure the plugin to block all images and stylesheets
// Resource types handed to the block-resources plugin.
const resourceTypesToBlock = new Set(['image', 'stylesheet']);
const pluginConfig = blockResources({ blockedTypes: resourceTypesToBlock });
// Register the configured plugin on the puppeteer-extra instance before launching.
puppeteer.use(pluginConfig);

// Declared outside the async IIFE so the trailing .finally() can close it.
let browser;
// Main flow: open the hotel site, fill in the search form (destination, dates,
// promo code), submit it, scrape the hotel names and write them to a file.
(async () => {
    browser = await puppeteer.launch();
    // Reuse the first page of the freshly launched browser.
    const [page] = await browser.pages();
    const ua = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36";
    await page.setUserAgent(ua);
    // Very tall viewport — presumably so more of the page renders without scrolling (TODO confirm).
    await page.setViewport({width: 800, height: 10700});
    await page.goto(constants.URL, {waitUntil: "domcontentloaded"});

    // Type the destination into the search field.
    await page.type('[id$="-search-destination"]', constants.DESTINATION);

    // Open the calendar and wait for the date popup to appear.
    await page.click(".search__calendar-value");
    await page.waitForSelector(".search-dates-popup");

    // Clicks "Next Month" (at most 12 times) until the text of the first
    // .DayPicker-Month element contains targetMonth.
    const findMonth = async targetMonth => {
        for(let i = 0; i < 12; i++){ 
            const month = await page.$eval(".DayPicker-Month", el => el.textContent);
            if(month.includes(targetMonth)) break;
            await page.click('[aria-label="Next Month"]');
        }   
    };

    // Clicks the first day cell whose trimmed text equals the given day (passed as a string).
    // NOTE(review): if no cell matches, find() returns undefined and .click() throws.
    const chooseDay = day => page.$$eval(".DayPicker-Day-value", (els, day) =>
        els.find(e => e.textContent.trim() === day).click(), String(day)
    );

    // Pick the start date, then the end date.
    await findMonth(constants.START_MONTH);
    await chooseDay(constants.START_DAY);
    await findMonth(constants.END_MONTH);
    await chooseDay(constants.END_DAY);
    // Presumably confirms the date selection and closes the calendar — TODO confirm.
    await page.click('button.m-button-secondary');

    // Open the special-rates popup, select its 5th radio option and type the promo code into it.
    await page.click('[class^="StyledSpecialRatesDiv"] > [class="t-font-s"]');
    await page.waitForSelector('[class^="StyledPopupMain"]');
    await page.click('[class^="StyledRadioButtonDiv"]:nth-child(5)')
    await page.type('[class^="StyledRadioButtonDiv"]:nth-child(5)', constants.PROMO_CODE);
    await page.waitForSelector('[class="crop-promo-code"] > [class="m-button-secondary m-button-s"]');
    await page.click('[class="crop-promo-code"] > [class="m-button-secondary m-button-s"]');

    // Submit the search form and wait for the results page.
    // NOTE(review): waitForNavigation() is started only after click() resolves; if the
    // navigation finishes first this can wait until timeout — consider starting both together.
    await page.click('[id$="find-a-hotel-homePage-form"] > [class^="StyledFindBtn"]');
    await page.waitForNavigation();

    // ! STUCK: candidates function returning null array or cannot pull Hotel Names - likely CSS selector is incorrect
    // Collects the trimmed text of every hotel-name span matched on the results page.
    // NOTE(review): each `>` combinator only matches a DIRECT child; the answer below reports
    // that the js-hotel-name element is not a direct child of js-hotel-location, so this
    // selector matches nothing.
    const candidates = await page.$$eval('[class^="js-hotel-location"] > [class^="js-hotel-name"] > span.l-property-name', elements => {
        return elements
          .filter(el => {
            // NOTE(review): `el` is already the matched span and querySelector only searches
            // descendants of `el`, so this lookup presumably yields null for every element and
            // the guard below then drops all of them — confirm against the page markup.
            const nameEl = el.querySelector('[class^="js-hotel-location"] > [class^="js-hotel-name"] > span.l-property-name');
            if(!nameEl) return false;
            const name = nameEl.textContent.trim();
            return name !== "";
          })
          // Reduce each surviving span to its trimmed text.
          .map(el => el.textContent.trim());
    });
    // Write the collected names to disk as a JSON array.
    // NOTE(review): `fs` is required from 'fs/promises' at the top of this file, whose
    // writeFile returns a promise; the callback passed here is presumably never invoked and
    // the write is not awaited — confirm and switch to `await fs.writeFile(...)` if so.
    fs.writeFile('candidates_name.txt', JSON.stringify(candidates), function (err) {
        if (err) throw err;
    });
      
})()
// Log any failure from the flow above, then close the browser if it was launched.
.catch(err => console.error(err))
.finally(() => browser?.close());

最佳答案

您可以在浏览器控制台中测试选择器,以以下格式编写并运行它。它将显示您的选择器选择的内容。

// Run in the browser console: shows which elements the given selector matches.
$$('span.l-property-name'); 

就您而言,[class^="js-hotel-name"] 并不是 [class^="js-hotel-location"] 的直接子元素,而 > 只匹配直接子元素,所以您的选择器什么也选不到。

应该是

[class^="js-hotel-location"] [class^="js-hotel-name"] > span.l-property-name

但这也会选中其他度假屋优惠中的名称,因此您的选择器应该是

[id^="property-record-map"] span.l-property-name

您的代码中也不需要过滤器部分:

// Grab every hotel-name span inside a property record and keep its trimmed text.
const candidates = await page.$$eval(
    '[id^="property-record-map"] span.l-property-name',
    (nameSpans) => nameSpans.map((span) => span.textContent.trim()),
);

按照您 constants.js 文件中的设置,选择器返回的酒店数量比您列出的要多。只有在额外按品牌和价格进行筛选之后(步骤见下文),我才能得到如下输出:

["Aloft Chicago Downtown River North", "The Gwen, a Luxury Collection Hotel, Michigan Avenue Chicago", "Sheraton Grand Chicago Riverwalk"]

如果我执行以下额外步骤,按品牌(豪华精选、喜来登、雅乐轩)和价格(200+ 美元)进行筛选:

// add after -> await page.waitForNavigation();

/**
 * Waits for an element matching `selector` to appear on the page, then clicks it.
 * Relies on the `page` object from the surrounding script.
 *
 * @param {string} selector - CSS selector of the element to click.
 * @returns {Promise<void>} Resolves once the click has been performed.
 */
async function waitClick(selector) {
    const target = await page.waitForSelector(selector);
    await target.click();
}

// Open the "All Filters" panel, then its "Brands" section.
await waitClick('[data-component-name="searchFilters"] a.m-button');
await waitClick('a[data-code=Brands]');

// Brand toggles to tick, one after another.
const selectBrands = [
    'a[data-for=brands_LC]',
    'a[data-for=brands_SI]',
    'a[data-for=brands_AL]',
];
for (const brandToggle of selectBrands) {
    await waitClick(brandToggle);
    // Wait until the filter form is visible before touching the next toggle.
    await page.waitForSelector('div.js-sort-and-filter-form', { visible: true });
}

// Open the "Price" section and choose the 200+ USD option.
await waitClick('a[data-code=price]');
await waitClick('label[for=price_3]');

await page.waitForSelector('div.js-sort-and-filter-form', { visible: true });

// Apply the selected filters.
await waitClick('form.js-submit-form button[type=Submit]');

await page.waitForSelector('body');

// Read the trimmed hotel names from the filtered result list.
const candidates = await page.$$eval(
    '[id^="property-record-map"] span.l-property-name',
    (nameSpans) => nameSpans.map((span) => span.textContent.trim()),
);

// next comes the fs.writeFile part

关于css - Puppeteer - 无法在酒店网站中拉出正确的 CSS 选择器,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/76125539/

相关文章:

node.js - target.prototype 是类装饰器上的空对象

jquery - 获取多个元素的文本作为单独的值

python - Selenium 下载整个 html

html - 如何将网页上的两个图标并排对齐?

html - 如何在单行中使用列表标签

javascript - 单击时在 Twitter Bootstrap 中隐藏模式

html - 背景中的多个图像显示无效的属性值错误

javascript - 尝试运行 node.js 样板项目时找不到模块 'Socket.IO-node'

javascript - 如何像 Mocha 在 Node.js 上使用 Jasmine 一样

python - Request.get超时