python - 如何从shopee网站抓取商品?

标签 python beautifulsoup web-crawler

我尝试使用 Python 来抓取商品信息,例如名称和价格。但这次没有成功:即使我用浏览器的开发者工具检查 HTML 代码、找到类名,再用这个类名去查找我想要的内容,也什么都取不到。

但是我得到了这样的结果:找不到任何 class_="col-xs-2-4 shopee-search-item-result__item" 的项目。我是否应该添加更多 header 信息?

打印结果

import requests
import re
import pandas as pd
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import json

# Shop collection page that this script requests and tries to parse.
url = 'https://shopee.tw/shop/1819984/search?shopCollection=9271157'
# Headers sent with the request: target host, a Firefox User-Agent string and
# a cookie string (presumably copied from a browser session -- confirm).
headers = {
'Host': 'shopee.tw',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0',
'Cookie':'SPC_IA=-1; SPC_EC=-; SPC_F=L07IMDECRHjifEKyg7XuNCJ00GNdJGTA; REC_T_ID=246cfcdc-18fa-11ea-b254-f8f21e2be0b8; SPC_T_ID="Fyr1skVDq7FDiJOuTYHBmMfMr2Cw1eZyPbYJhBYoRmf/gvfvkOf5zgjIVXLrYYlg32aSx1PfmhWq7QsQzwM86mdeXG8VU7ERK4N+gfPFd14="; SPC_U=-; SPC_T_IV="/oJN8EB7iQwg7+n5mXd6cw=="; _gcl_au=1.1.788704691.1575727322; _fbp=fb.1.1575727322914.443117835; _ga=GA1.2.1422761069.1575727324; __BWfp=c1575727332595xf5a099d8b; cto_lwid=7ea874b3-f31f-47d7-aef9-60eed0156d33; cto_bundle=0tgQ7V9rU3JlRTU4aWlTc09JNXRaN014Y3ZXa1BtVVcwT2RhOU1UZ0tweUFvWUo2WHRPQjd0JTJCM1duaG5iWXFFRWxpbHZkTFluWUZLSEFudTFreGJueFoxU0EyanhnMWN6ZEVIUVV6cFlhd050emhFMWQ4bmhVelZwVSUyRmwwQUp5c29lOEhPT2ZobE10S1dvT09HYWNhVXV1YWx5R3dSOGw0MHcwZWpiZ2pXU2VHSzdrJTNE; _med=refer; G_ENABLED_IDPS=google; fbm_382498665271383=base_domain=.shopee.tw; SPC_SI=jq6hwq6ju6hig9hfulumcagdqaiopatc; _gid=GA1.2.143857303.1577796150; csrftoken=3Pya3o5WYEvhLOj9FqCqbV3angfwBlko; AMP_TOKEN=%24NOT_FOUND; _dc_gtm_UA-61915057-6=1'
}

# Download the shop page with the url/headers defined above, following redirects.
r = requests.get(url, headers=headers, allow_redirects=True)

# Report how the request went: status code, redirect history, final URL.
for detail in (r.status_code, r.history, r.url):
    print(detail)

# Parse the returned HTML and count the <div> elements carrying the
# product-item class string seen in the browser's inspector.
item_class = "col-xs-2-4 shopee-search-item-result__item"
soup = BeautifulSoup(r.text, 'html.parser')
items = soup.find_all("div", class_=item_class)
print(len(items))
```**strong text**

最佳答案

此页面使用 JavaScript 来显示项目,但 BeautifulSoup/requests 无法运行 JavaScript。

使用 Firefox/Chrome 的 DevTools(“Network”选项卡),我找到了 JavaScript 用来从服务器获取 JSON 数据的网址,因此甚至不需要 BeautifulSoup。

为了正常工作,它需要所有这些 header 。

没有 User-Agent 和 X-Requested-With,它会发送空数据。
没有 Referer,它不会发送价格。

import requests

# JSON endpoint that the shop page's JavaScript calls to load its items
# (found through the browser DevTools "Network" tab), so no HTML parsing
# is needed.
url = 'https://shopee.tw/api/v2/search_items/?by=pop&limit=30&match_id=1819984&newest=0&order=desc&page_type=shop&shop_categoryids=9271157&version=2'

# All three headers are required: without User-Agent and X-Requested-With the
# server sends empty data, and without Referer it does not send prices.
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:73.0) Gecko/20100101 Firefox/73.0',
    'X-Requested-With': 'XMLHttpRequest',
    'Referer': 'https://shopee.tw/shop/1819984/search?shopCollection=9271157',
}

# requests has no default timeout; set one so the script cannot hang forever.
r = requests.get(url, headers=headers, timeout=10)

# Fail early with a clear HTTP error instead of an obscure JSON/KeyError later.
r.raise_for_status()

data = r.json()

# The server may answer with empty data (e.g. when a header is missing),
# so treat a missing or null 'items' entry as "no items".
items = data.get('items') or []

# For testing: print(items[0].keys()) lists every field available on an item.

for item in items:
    print('name:', item['name'])
    # NOTE(review): the label 'prince:' looks like a typo for 'price:'; it is
    # kept unchanged so the output still matches the sample output shown.
    # NOTE(review): the raw price (e.g. 4900000) is presumably a scaled
    # integer -- confirm the scale before displaying it as currency.
    print('prince:', item['price'])
    print('sold:', item['historical_sold'])
    print('---')

结果:

name: 『現貨+預購』 Balea 精華膠囊 7 入
prince: 4900000
sold: 5104
---
name: 💯現貨供應 💯德國 Invisibobble 神奇魔髮圈流線魔髮圈
prince: 7500000
sold: 26
---
<hr/>

顺便说一句:测试时如果想查看所有可以使用的值,可以用 json 模块以缩进格式输出其中一个项目。

import json

# Pretty-print the first item with 4-space indentation to inspect its fields.
first_item = data['items'][0]
formatted = json.dumps(first_item, indent=4)
print(formatted)

结果:

{
    "itemid": 1212735748,
    "welcome_package_info": null,
    "liked": false,
    "recommendation_info": null,
    "bundle_deal_info": null,
    "price_max_before_discount": -1,
    "image": "338673ff6f2b23d63514e5af85269d46",
    "is_cc_installment_payment_eligible": false,
    "shopid": 1819984,
    "can_use_wholesale": true,
    "group_buy_info": null,
    "reference_item_id": "",
    "currency": "TWD",
    "raw_discount": null,
    "show_free_shipping": false,
    "video_info_list": [],
    "ads_keyword": null,
    "collection_id": null,
    "images": [
        "338673ff6f2b23d63514e5af85269d46"
    ],
    "match_type": null,
    "price_before_discount": 0,
    "is_category_failed": false,
    "show_discount": 0,
    "cmt_count": 306,
    "view_count": 93,
    "display_name": null,
    "catid": 67,
    "json_data": null,
    "upcoming_flash_sale": null,
    "is_official_shop": false,
    "brand": "Dm Ebelin",
    "price_min": 4900000,
    "liked_count": 136,
    "can_use_bundle_deal": false,
    "show_official_shop_label": false,
    "coin_earn_label": null,
    "price_min_before_discount": -1,
    "cb_option": 0,
    "sold": 0,
    "deduction_info": null,
    "stock": 3647,
    "status": 1,
    "price_max": 4900000,
    "add_on_deal_info": null,
    "is_group_buy_item": null,
    "flash_sale": null,
    "price": 4900000,
    "shop_location": "\u53f0\u4e2d\u5e02\u6f6d\u5b50\u5340",
    "item_rating": {
        "rating_star": 4.996732,
        "rating_count": [
            306,
            0,
            0,
            0,
            1,
            305
        ],
        "rcount_with_image": 11,
        "rcount_with_context": 139
    },
    "show_official_shop_label_in_title": false,
    "tier_variations": [],
    "is_adult": null,
    "discount": null,
    "flag": 65536,
    "is_non_cc_installment_payment_eligible": false,
    "has_lowest_price_guarantee": false,
    "has_group_buy_stock": false,
    "preview_info": null,
    "welcome_package_type": 0,
    "name": "\u300e\u73fe\u8ca8+\u9810\u8cfc\u300f Balea \u7cbe\u83ef\u81a0\u56ca 7 \u5165",
    "distance": null,
    "adsid": null,
    "ctime": 1527866201,
    "wholesale_tier_list": [
        {
            "min_count": 150,
            "price": 4700000,
            "max_count": 300
        },
        {
            "min_count": 301,
            "price": 4600000,
            "max_count": 1000
        },
        {
            "min_count": 1001,
            "price": 4500000,
            "max_count": null
        }
    ],
    "show_shopee_verified_label": false,
    "campaignid": null,
    "show_official_shop_label_in_normal_position": null,
    "item_status": "normal",
    "shopee_verified": false,
    "hidden_price_display": null,
    "size_chart": null,
    "item_type": 0,
    "shipping_icon_type": null,
    "campaign_stock": null,
    "label_ids": [],
    "service_by_shopee_flag": 0,
    "badge_icon_type": 0,
    "historical_sold": 5104,
    "transparent_background_image": ""
}

关于python - 如何从shopee网站抓取商品?,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/59557071/

相关文章:

java - 使用java读取网页的检查元素数据

python - 使用 Python/Pandas/Numpy 的几何级数(无循环并使用递归)

python - Sklearn LogisticRegressionCV 的类似数组的输入

python - 将 HTML 转换为 CSV

python - 将 python 结果返回到 html

php - 关于从互联网上蜘蛛/抓取/收集音频内容的最佳方式的建议/提示

java - Java 和 Python 中对 HBase 的并行扫描请求具有不同的性能

python - 如果满足某些条件,如何配置 Tkinter 小部件?

python - 尝试/除了抓取 URL 末尾带有 3 个随机数字的网站

python - 部署scrapy项目时出错