python - Scrapy 调用另一个 Url

标签 python web-scraping scrapy

我正在用 Scrapy 抓取一个网站,并且已经能从列表页面获取所有产品。现在我想进一步访问每个产品的 URL,但没有得到预期的结果。这是我的代码:

import scrapy
from scrapy.http import Request

from tutorial.items import DmozItem

class DmozSpider(scrapy.Spider):
    """Spider that reads products from a listing page and builds one item per product."""
    name = "dmoz"
    allowed_domain = ["test.com"]
    start_urls = [
        "http://www.test.com/?page=1"
    ]

    page_index = 1

    def parse(self, response):
        """Build an item for every <li> on the listing page, then request the next page."""
        products = response.xpath('//li')
        items = []
        if products:
            for product in products:
                item = DmozItem()
                item['link'] = product.xpath('@data-url').extract()
                item['sku'] = product.xpath('@data-sku').extract()
                item['brand'] = product.xpath('.//span[contains(@class, "qa-brandName")]/text()').extract()
                item['img'] = product.xpath('.//img[contains(@class, "itm-img")]/@src').extract()
                page_url = "http://www.jabong.com/Lara-Karen-Black-Sweaters-893039.html"
                # Build the detail-page request and attach the item to its meta.
                request = Request(url=page_url, callback=self.parse_page2,
                                  headers={"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"})
                request.meta['item'] = item
                # The Request object is stored on the item and the item is yielded.
                item['other'] = request
                yield item
        else:
            return
        self.page_index += 1
        if self.page_index:
            yield Request(url="http://www.test.com/?page=%s" % (self.page_index),
                          headers={"Referer": "http://www.test.com/", "X-Requested-With": "XMLHttpRequest"},
                          callback=self.parse)

    def parse_page2(self, response):
        """Read the item passed through request meta and set its title from the detail page."""
        item = response.meta['item']
        item['title'] = response.xpath("//span[@id='before_price']/text()")
        yield item

我得到的结果是

{"sku": [], "brand": [], "other": "<Request GET http://www.test.com/>", "link": [], "img": []},

我需要的是 parse_page2 函数返回的数据,而不是 Request GET 对象。

我哪里做错了?

最佳答案

你的 xpaths 在这里似乎是错误的,

试试这个

In [0]: products[0].xpath('./@data-url').extract()
Out[0]: [u'Sangria-Green-Kurtis-Kurtas-1081831.html']

In [1]: products[0].xpath('./a/@unbxdparam_sku').extract()
Out[1]: [u'SA038WA68OIXINDFAS']

In [2]: products[0].xpath('./a/span[contains(@class,"qa-brandName")]/text()').extract()
Out[2]: [u'Sangria']

In [3]: products[0].xpath('./a/span[@class="lazyImage cat-prd-img"]/span/@id').extract()
Out[3]: [u'http://static14.jassets.com/p/Sangria-Green--Kurtis-26-Kurtas-5520-1381801-1-catalog.jpg']

所以代码将是,

BASE_URL = 'http://www.jabong.com/'
for product in products:
    item = DmozItem()
    item_url = product.xpath('./@data-url').extract()
    item_url = self.BASE_URL + item_url[0] if item_url else ''
    item['link'] = product.xpath('./@data-url').extract()
    item['sku'] = product.xpath('./a/@unbxdparam_sku').extract()
    item['brand'] = product[0].xpath('./a/span[contains(@class,"qa-brandName")]/text()').extract()
    item['img'] = product.xpath('./a/span[@class="lazyImage cat-prd-img"]/span/@id').extract()
    if item_url:
        yield Request(url=self.BASE_URL + ,callback=self.parse_page2,
            headers={"Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8}, meta={'item'=item})

编辑

完整的爬虫代码

import scrapy
from scrapy.exceptions import CloseSpider
from scrapy.spider import Spider
from scrapy.http import Request


class JabongItem(scrapy.Item):
    """Container for the data scraped for a single product.

    JabongSpider.parse fills in link, sku, brand and img; name is declared
    here but not populated by the spider shown in this file.
    """
    # define the fields for your item here like:
    name = scrapy.Field()
    link = scrapy.Field()
    sku = scrapy.Field()
    brand = scrapy.Field()
    img = scrapy.Field()



class JabongSpider(scrapy.Spider):
    """Crawl the kurtas/kurtis listing pages and follow every product link.

    Each listing page yields one Request per product; the partially filled
    item rides along in ``meta`` and is yielded by ``parse_page2`` after the
    product page has been downloaded. Pagination stops at the first listing
    page that contains no products.
    """
    name = "jabong"
    allowed_domains = ["jabong.com"]

    BASE_URL = 'http://www.jabong.com/'
    # Listing URL template; %s is replaced by the page number.
    LISTING_URL = "http://www.jabong.com/women/clothing/kurtas-suit-sets/kurtas-kurtis/?page=%s"

    start_urls = [LISTING_URL % 1]
    page_index = 1

    @staticmethod
    def _first(values, default='n/a'):
        """Return the first extracted value, stripped, or *default* if none was found."""
        return values[0].strip() if values else default

    def parse(self, response):
        """Yield a product-page Request per product, then the next listing page.

        Args:
            response: the downloaded listing page.

        Yields:
            Request objects for every product link found, followed by one
            Request for the next listing page.
        """
        products = response.xpath("//li[@data-url]")
        if not products:
            # An empty listing means we ran past the last page: stop paginating.
            return

        for product in products:
            link = product.xpath('@data-url').extract()
            # data-url holds a relative path, so prefix it with the site root.
            link = self.BASE_URL + link[0] if link else ''

            item = JabongItem()
            item['link'] = link
            item['sku'] = self._first(product.xpath('@data-sku').extract())
            item['brand'] = self._first(
                product.xpath('.//span[contains(@class, "qa-brandName")]/text()').extract())
            item['img'] = self._first(
                product.xpath('.//img[contains(@class, "itm-img")]/@src').extract())

            if link:
                yield Request(url=link, callback=self.parse_page2, meta={'item': item})

        # Advance exactly one page; page_index already holds the new page
        # number after the increment, so it is used as-is in the URL.
        self.page_index += 1
        yield Request(url=self.LISTING_URL % self.page_index,
                      callback=self.parse, dont_filter=True)

    def parse_page2(self, response):
        """Yield the item carried in the request's meta.

        Args:
            response: the downloaded product page.
        """
        item = response.meta['item']
        # add whatever extra details you want to item
        yield item

关于python - Scrapy 调用另一个 Url,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/29749854/

相关文章:

Python:删除字符串中的反斜杠

python - 如何在 Python 中获取 timedelta 的总小时数和分钟数

python - 如何为具有多对多字段的 Django 模型创建对象?

python - Pymongo 使用投影运算符查找

python - Dryscrape/webkit_server 内存泄漏

java - 使用 Jsoup 提取 HTML 数据

web-scraping - 使用Scrapy时如何防止twisted.internet.error.ConnectionLost错误?

python - 使用 Beautiful Soup 和 Python 从搜索页面提取 HTML 内容

python-2.7 - Python Scrapy - 将蜘蛛定向到特定管道

python - 使用 scrapy 限制检索项目的数量