python - 为什么scrapy不将数据存储到mongodb中?

标签 python mongodb web-scraping scrapy

我的主文件:

from urllib.parse import urljoin

import scrapy
from scrapy.exceptions import CloseSpider
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.http import Request


class Product(scrapy.Item):
    """Container for one scraped MouthShut review."""
    brand = scrapy.Field()
    title = scrapy.Field()       # review headline (was declared twice; duplicate removed)
    link = scrapy.Field()        # absolute URL of the full review page
    name = scrapy.Field()        # reviewer display name
    date = scrapy.Field()        # review timestamp text
    heading = scrapy.Field()
    data = scrapy.Field()        # full review body paragraphs
    Model_name = scrapy.Field()


class aqaqspider(CrawlSpider):
    """Crawl MouthShut review listing pages for the Yu Yureka and yield Product items."""
    name = "mouth_shut_new"
    allowed_domains = ["mouthshut.com"]
    start_urls = [
        "http://www.mouthshut.com/mobile-phones/Yu-Yureka-reviews-925723476",
    ]
    # Follow the "-page-N" pagination links; parse every listing page the
    # same way as the start URL.
    rules = (
        Rule(
            SgmlLinkExtractor(allow=(r'.*\-page-.*',)),
            callback="parse_start_url",
            follow=True),
    )

    def parse_start_url(self, response):
        """Extract one Product per review entry on a listing page.

        Each entry's link is followed with the partially-filled item carried
        in the request meta; anchor_page completes and yields it.
        """
        products = response.xpath('//div[@id="allreviews"]/ul/li')
        if not products:
            # Empty listing: assume pagination is exhausted and stop the crawl.
            raise CloseSpider("No more products!")

        for product in products:
            item = Product()
            # NOTE(review): extract()[0] raises IndexError if a selector
            # matches nothing — confirm these fields are always present.
            item['name'] = product.xpath('.//li[@class="profile"]/div/a/span/text()').extract()[0]
            item['title'] = product.xpath('.//div[@class="reviewtitle fl"]/strong/a/text()').extract()[0]
            item['date'] = product.xpath('.//div[@class="reviewrate"]//span[@class="datetime"]/span/span/span/text()').extract()[0]
            item['link'] = product.xpath('.//div[@class="reviewtitle fl"]/strong/a/@href').extract()[0]
            if item['link']:
                if 'http://' not in item['link']:
                    # Relative href: resolve against the current page URL.
                    item['link'] = urljoin(response.url, item['link'])
                yield scrapy.Request(item['link'],
                                     meta={'item': item},
                                     callback=self.anchor_page)

    def anchor_page(self, response):
        """Fill in the full review text on the item carried via request meta."""
        old_item = response.request.meta['item']
        old_item['data'] = response.xpath('.//div[@itemprop="description"]/p/text()').extract()
        yield old_item

我的 settings.py:

# -*- coding: utf-8 -*-

# Scrapy settings for mouth project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#

BOT_NAME = 'mouth'

SPIDER_MODULES = ['mouth.spiders']
NEWSPIDER_MODULE = 'mouth.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'mouth (+http://www.yourdomain.com)'


# Enable the MongoDB pipeline; the value (300) is its ordering priority.
ITEM_PIPELINES = {'mouth.pipelines.MongoDBPipeline':300}

# Connection parameters read by mouth/pipelines.py via scrapy.conf.settings.
MONGODB_HOST = 'localhost' # Change in prod
MONGODB_PORT = 27017 # Change in prod
MONGODB_DATABASE = "mobiles_complaints" # Change in prod
MONGODB_COLLECTION = "Yu_Yureka"
MONGODB_USERNAME = "" # Change in prod
MONGODB_PASSWORD = "" # Change in prod

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'consumer (+http://www.yourdomain.com)'

我的 pipelines.py:

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html


import pymongo
from scrapy.conf import settings
from scrapy import log


class MongoDBPipeline(object):
    def __init__(self):
        # Open one connection per pipeline instance, using the MONGODB_*
        # values from settings.py.
        # NOTE(review): pymongo.Connection was removed in PyMongo 3.x;
        # MongoClient is the modern equivalent.
        connection = pymongo.Connection(settings['MONGODB_HOST'], settings['MONGODB_PORT'])
        db = connection[settings['MONGODB_DATABASE']]
        self.collection = db[settings['MONGODB_COLLECTION']]

# BUG (the subject of this question): process_item is indented at module
# level, so it is a plain function, NOT a method of MongoDBPipeline.
# Scrapy therefore never calls it and nothing is written to MongoDB.
def process_item(self, item, spider):
    self.collection.insert(dict(item))
    log.msg("Item wrote to MongoDB database {}, collection {}, at host {}, port {}".format(
        settings['MONGODB_DATABASE'],
        settings['MONGODB_COLLECTION'],
        settings['MONGODB_HOST'],
        settings['MONGODB_PORT']))
    return item

我运行了 scrapy crawl mouth_shut_new。但我的数据没有存储在数据库中。在输出中,它应该显示数据存储在 mongo 中以及集合名称。我缺少什么?

最佳答案

process_item() 方法没有正确缩进,应该是:

class MongoDBPipeline(object):
    def __init__(self):
        # Connect once when the pipeline is created.
        # NOTE(review): pymongo.Connection is removed in PyMongo 3+;
        # use pymongo.MongoClient on current PyMongo versions.
        connection = pymongo.Connection(settings['MONGODB_HOST'], settings['MONGODB_PORT'])
        db = connection[settings['MONGODB_DATABASE']]
        self.collection = db[settings['MONGODB_COLLECTION']]

    # Correctly indented as a method of the class, so Scrapy now calls it
    # once for every item the spider yields.
    def process_item(self, item, spider):
        self.collection.insert(dict(item))
        log.msg("Item wrote to MongoDB database {}, collection {}, at host {}, port {}".format(
            settings['MONGODB_DATABASE'],
            settings['MONGODB_COLLECTION'],
            settings['MONGODB_HOST'],
            settings['MONGODB_PORT']))
        return item

关于python - 为什么scrapy不将数据存储到mongodb中?,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/30701704/

相关文章:

python - 以下扭曲的代码会出现重新输入问题吗?

python - 如何在 Pygame 中创建暂停按钮?

python - Hadoop:在迭代映射作业之间维护内存缓存

python - Portia Spider 日志在爬行期间显示 ['Partial' ]

python - pycurl,如何为多选表单发送POST数据?

java - 如何使用 Java 导航 Web 搜索

python - 使用PIL将一张512*256的图片分成2张每张256*256的图片

c# - 如何使用 MongoDB 的官方 C# 驱动程序检索所有嵌入式文档值?

mongodb - 在没有持久性(mongo)的情况下有哪些操作顺序保证?

mongodb - 如何按字段值 $pull 数组元素(当元素是对象时)