python - 使用 Scrapy 通过电子邮件发送项目和日志

标签 python email scrapy

我试图让 Scrapy 在爬虫完成或中断时向我发送电子邮件。已经有一个用于发送统计信息的内置扩展,但我想将蜘蛛的错误作为 <spidername>-errors.log 附加,并将抓取到的条目作为 <spidername>-items.json 附加。

我已将回调连接到每个信号,但由于某种原因只有最后一个信号触发:

from scrapy import signals
from scrapy.mail import MailSender
from scrapy.exceptions import NotConfigured
from scrapy.utils.serialize import ScrapyJSONEncoder

from collections import defaultdict

try:
    from cStringIO import cStringIO as StringIO
except ImportError:
    from StringIO import StringIO

class StatusMailer(object):
    """Scrapy extension that buffers scraped items and spider errors in
    in-memory StringIO files and mails them as attachments when the
    spider closes."""

    def __init__(self, recipients, mail, crawler):
        self.recipients = recipients
        self.mail = mail
        # One StringIO buffer per attachment filename.
        self.files = defaultdict(StringIO)
        self.encoder = ScrapyJSONEncoder(crawler=crawler)

    @classmethod
    def from_crawler(cls, crawler):
        """Alternate constructor: build from settings and wire signals.

        Raises NotConfigured when STATUSMAILER_RCPTS is empty, which
        tells Scrapy to disable the extension.
        """
        recipients = crawler.settings.getlist("STATUSMAILER_RCPTS")

        if not recipients:
            raise NotConfigured

        mail = MailSender.from_settings(crawler.settings)
        instance = cls(recipients, mail, crawler)

        crawler.signals.connect(instance.item_scraped, signal=signals.item_scraped)
        crawler.signals.connect(instance.spider_error, signal=signals.spider_error)
        crawler.signals.connect(instance.spider_closed, signal=signals.spider_closed)

        return instance

    def item_scraped(self, item, response, spider):
        # Append each item as one JSON line to <spider>.json.
        self.files[spider.name + '.json'].write(self.encoder.encode(item) + '\n')

    def spider_error(self, failure, response, spider):
        # Append the full traceback to <spider>-errors.log.
        self.files[spider.name + '-errors.log'].write(failure.getTraceback() + '\n')

    def spider_closed(self, spider):
        """Send one mail with every collected buffer attached.

        BUG FIX: each StringIO must be rewound before being attached.
        After writing, the stream position sits at the end, so the mailer
        would otherwise read zero bytes for every attachment.
        """
        attachments = []

        for name, contents in self.files.items():
            contents.seek(0)  # rewind so MailSender reads from the start
            attachments.append((name, 'text/plain', contents))

        return self.mail.send(
            to=self.recipients,
            subject="Crawler for %s finished" % spider.name,
            body="",
            attachs=attachments
        )

有没有什么方法可以从 Scrapy 中访问导出的项目和蜘蛛的错误(可能在将这些消息打印到控制台之前制作某种钩子(Hook)来拦截这些消息)?

最佳答案

嗯,看起来问题比我想象的要简单得多。完成写入后,您必须“倒回”StringIO 实例:

def spider_closed(self, spider):
    """Mail every collected buffer as an attachment once the spider stops."""
    attachments = []

    for filename, buffer in self.files.items():
        # Rewind: after writing, the stream position is at the end, and
        # the mailer reads from the current position.
        buffer.seek(0)
        attachments.append((filename, 'text/plain', buffer))

    return self.mail.send(
        to=self.recipients,
        subject="Crawler for %s finished" % spider.name,
        body="",
        attachs=attachments
    )

对于任何感兴趣的人,这是我的电子邮件扩展:

import gzip
import datetime

from scrapy import signals
from scrapy.mail import MailSender
from scrapy.exceptions import NotConfigured
from scrapy.utils.serialize import ScrapyJSONEncoder

from collections import defaultdict

try:
    from cStringIO import cStringIO as StringIO
except ImportError:
    from StringIO import StringIO

def format_size(size):
    """Return *size* (in bytes) as a human-readable string, e.g. '1.5 KB'.

    BUG FIX: the original fell off the end of the loop and returned None
    for sizes of 1024 GB or more; such sizes are now reported in TB.
    """
    for unit in ['bytes', 'KB', 'MB', 'GB']:
        if size < 1024.0:
            return "%3.1f %s" % (size, unit)

        size /= 1024.0

    # Anything that survived four divisions is at least a terabyte.
    return "%3.1f %s" % (size, 'TB')

class GzipCompressor(gzip.GzipFile):
    # Attachment metadata consumed by StatusMailer.spider_closed when it
    # builds the (name, mimetype, file) attachment tuples.
    extension = '.gz'
    mimetype = 'application/gzip'

    def __init__(self):
        # Compress into an in-memory PlainCompressor buffer instead of a
        # file on disk.
        super(GzipCompressor, self).__init__(fileobj=PlainCompressor(), mode='w')
        # Expose the *compressed* bytes: bypass GzipFile.read (which would
        # try to decompress) and read straight from the underlying buffer.
        # NOTE(review): GzipFile only writes the gzip trailer on close();
        # reading before close() may yield an incomplete stream — confirm
        # the buffer is finalized before the mail is sent.
        self.read = self.fileobj.read

class PlainCompressor(StringIO):
    """In-memory 'no-op compressor': a StringIO whose read() always
    starts from the beginning of the buffer."""

    # Attachment metadata consumed when building (name, mimetype, file)
    # tuples for the status mail.
    extension = ''
    mimetype = 'text/plain'

    def read(self, *read_args, **read_kwargs):
        """Rewind to the start, then delegate to StringIO.read()."""
        self.seek(0)

        return StringIO.read(self, *read_args, **read_kwargs)

    @property
    def size(self):
        """Total number of characters written so far."""
        return len(self.getvalue())

class StatusMailer(object):
    """Scrapy extension that buffers scraped items and log lines
    (optionally gzip-compressed) in memory and mails them as attachments,
    together with crawl statistics, when the spider closes."""

    def __init__(self, recipients, mail, compressor, crawler):
        self.recipients = recipients
        self.mail = mail
        self.encoder = ScrapyJSONEncoder(crawler=crawler)
        # One compressor/buffer instance per attachment filename.
        self.files = defaultdict(compressor)

        self.num_items = 0
        self.num_errors = 0

    @classmethod
    def from_crawler(cls, crawler):
        """Build the extension from settings and connect it to signals.

        Raises NotConfigured when no recipients are configured or when
        STATUSMAILER_COMPRESSION has an unsupported value.
        """
        recipients = crawler.settings.getlist('STATUSMAILER_RECIPIENTS')
        compression = crawler.settings.get('STATUSMAILER_COMPRESSION')

        if not compression:
            compressor = PlainCompressor
        elif compression.lower().startswith('gz'):
            compressor = GzipCompressor
        else:
            raise NotConfigured

        if not recipients:
            raise NotConfigured

        mail = MailSender.from_settings(crawler.settings)
        instance = cls(recipients, mail, compressor, crawler)

        crawler.signals.connect(instance.item_scraped, signal=signals.item_scraped)
        crawler.signals.connect(instance.spider_error, signal=signals.spider_error)
        crawler.signals.connect(instance.spider_closed, signal=signals.spider_closed)
        # NOTE(review): request_received only exists in old Scrapy
        # versions — confirm against the Scrapy version in use.
        crawler.signals.connect(instance.request_received, signal=signals.request_received)

        return instance

    def item_scraped(self, item, response, spider):
        # One JSON document per item in <spider>-items.json.
        self.files[spider.name + '-items.json'].write(self.encoder.encode(item))
        self.num_items += 1

    def spider_error(self, failure, response, spider):
        # Tracebacks share <spider>.log with the request trace.
        self.files[spider.name + '.log'].write(failure.getTraceback())
        self.num_errors += 1

    def request_received(self, request, spider):
        self.files[spider.name + '.log'].write(str(request) + '\n')

    def spider_closed(self, spider, reason):
        """Mail the crawl summary with every collected buffer attached."""
        files = []

        for name, compressed in self.files.items():
            files.append((name + compressed.extension, compressed.mimetype, compressed))

        # BUG FIX: self.files is a defaultdict, so the original
        # try/except KeyError could never fire — indexing a missing key
        # silently created a brand-new empty buffer instead of raising.
        # Use an explicit membership test.
        items_name = spider.name + '-items.json'
        if items_name in self.files:
            size = self.files[items_name].size
        else:
            size = 0

        body = '''Crawl statistics:

 - Spider name: {0}
 - Spider finished at: {1}
 - Number of items scraped: {2}
 - Number of errors: {3}
 - Size of scraped items: {4}'''.format(
            spider.name,
            datetime.datetime.now(),
            self.num_items,
            self.num_errors,
            format_size(size)
        )

        return self.mail.send(
            to=self.recipients,
            subject='Crawler for %s: %s' % (spider.name, reason),
            body=body,
            attachs=files
        )

将其添加到您的settings.py:

EXTENSIONS = {
    # Register the mailer extension; 80 is its extension order value.
    'your_package.extensions.StatusMailer': 80
}

并配置它:

# E-mail addresses that receive the crawl report (empty disables the
# extension via NotConfigured).
STATUSMAILER_RECIPIENTS = []
# 'gzip' compresses the attachments; a falsy value sends them as plain text.
STATUSMAILER_COMPRESSION = 'gzip'
#STATUSMAILER_COMPRESSION = None

# SMTP settings consumed by scrapy.mail.MailSender.
MAIL_HOST = 'smtp.gmail.com'
MAIL_PORT = 587
MAIL_USER = ''
MAIL_PASS = ''

关于python - 使用 Scrapy 通过电子邮件发送项目和日志,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/16260753/

相关文章:

python - 登录 Scrapy

python - BeautifulSoup:如何删除空表,同时保留部分空表或非空表

python - 在 Celery 中如何更新主任务的状态直到他的所有子任务完成?

python - Python和代理模拟/可视化环境

java - 发送电子邮件时是否必须提及 SMTP 主机?

python - Scrapy:如何在不存在时创建条件(存在或不存在)XPATH 返回值?

python - 计算 Pandas DataFrame 中每个 USIM 的网上商店订单中的平均商品

vba - 如何将 String 变量的值插入到将出现在电子邮件正文中的某些文本中?

php - 从电子邮件正文中删除 header

Python Scrapy extract_first() 文档