如这里所述,我无法让两个不同的蜘蛛自动将结果添加到 MySQL 数据库中。现在我添加了 if 和 elif 语句后它们可以工作,但会漏掉一些结果:以前 Bath 表中有 52 行,现在只有 41 行;Bristol 表过去有 154 行,现在只有 141 行。我不明白为什么结果不一样。
管道.py
import sys
import MySQLdb
import MySQLdb.cursors
import hashlib
from scrapy.exceptions import DropItem
from scrapy.http import Request
class TestPipeline(object):
    """Scrapy pipeline that stores scraped qualification items in MySQL.

    Each item is routed to the Bristol or Bath table depending on which
    field pair it carries.  The INSERTs use parameterised queries (MySQLdb
    paramstyle is 'format', i.e. %s placeholders) so that single quotes in
    the scraped text -- e.g. "d'Etudes" -- cannot break the SQL statement.
    The previous string-formatted version silently lost every row whose
    text contained a quote.
    """

    def __init__(self):
        # Connection is opened once for the lifetime of the pipeline.
        # charset/use_unicode let the driver encode unicode text itself,
        # so no manual .encode('utf8') is needed before executing.
        self.conn = MySQLdb.connect(
            user='user',
            passwd='pwd',
            db='db',
            host='host',
            charset='utf8',
            use_unicode=True
        )
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        """Insert one item into the table matching its field names.

        Returns the item on success; on a MySQL error the problem is
        printed and the item is implicitly dropped (None returned), as
        in the original best-effort behaviour.
        """
        try:
            if 'BristolQualification' in item:
                # Parameterised query: the driver escapes quotes for us.
                self.cursor.execute(
                    "INSERT INTO Bristol (BristolCountry, BristolQualification) VALUES (%s, %s)",
                    (item['BristolCountry'], "".join(item['BristolQualification'])))
            elif 'BathQualification' in item:
                self.cursor.execute(
                    "INSERT INTO Bath (BathCountry, BathQualification) VALUES (%s, %s)",
                    (item['BathCountry'], "".join(item['BathQualification'])))
            self.conn.commit()
            return item
        except MySQLdb.Error as e:
            print("Error %d: %s" % (e.args[0], e.args[1]))
项目.py
from scrapy.item import Item, Field
class QualificationItem(Item):
    """One country's undergraduate entry requirements.

    Each spider fills only its own pair of fields (the Bristol spider
    sets the Bristol* fields; presumably the Bath spider -- not shown
    here -- sets the Bath* ones), and the pipeline routes the item to
    the matching table based on which pair is present.
    """
    BristolQualification = Field()
    BristolCountry = Field()
    BathQualification = Field()
    BathCountry = Field()
布里斯托尔.py
from scrapy.spider import BaseSpider
from project.items import QualificationItem
from scrapy.selector import HtmlXPathSelector
from scrapy.http.request import Request
from urlparse import urljoin
# Desktop Firefox user-agent string sent with every country-page request
# (see the headers= argument in recursiveSpider.parse below).
USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:27.0) Gecko/20100101 Firefox/27.0'
class recursiveSpider(BaseSpider):
    """Crawl Bristol's "international countries" index and yield one
    QualificationItem per country with its undergraduate entry
    requirements."""
    name = 'bristol'
    # Domain only, no trailing slash: 'bristol.ac.uk/' never matches a
    # request hostname, so Scrapy's offsite middleware would filter every
    # followed request (masked here only because dont_filter=True is set).
    allowed_domains = ['bristol.ac.uk']
    start_urls = ['http://www.bristol.ac.uk/international/countries/']

    def parse(self, response):
        """Follow the link for every country in the index list, passing the
        link text (the country name) along in request meta."""
        hxs = HtmlXPathSelector(response)
        xpath = '//*[@id="all-countries"]/li/ul/li/a/@href'
        a_of_the_link = '//*[@id="all-countries"]/li/ul/li/a/text()'
        for text, link in zip(hxs.select(a_of_the_link).extract(), hxs.select(xpath).extract()):
            yield Request(urljoin(response.url, link),
                          meta={'a_of_the_link': text},
                          headers={'User-Agent': USER_AGENT},
                          callback=self.parse_linkpage,
                          dont_filter=True)

    def parse_linkpage(self, response):
        """Extract the entry-requirement paragraphs for one country page."""
        hxs = HtmlXPathSelector(response)
        item = QualificationItem()
        # Paragraphs between the "Entry requirements..." heading and the
        # next h2 heading.
        xpath = """
               //h2[normalize-space(.)="Entry requirements for undergraduate courses"]
                /following-sibling::p[not(preceding-sibling::h2[normalize-space(.)!="Entry requirements for undergraduate courses"])]
               """
        # [1:] drops the first matched paragraph -- NOTE(review): presumably
        # an introductory line; confirm against the live page layout.
        item['BristolQualification'] = hxs.select(xpath).extract()[1:]
        item['BristolCountry'] = response.meta['a_of_the_link']
        return item
最佳答案
在使用 Sqlite3 进行测试时,我成功地重现了该问题,并且 scrapy 日志中的错误数量与缺失的条目相对应。这些错误是由 BristolQualification
项字段中未转义的单引号引起的(大概 Bath 蜘蛛也遇到了同样的问题),这会破坏生成的 SQL 语句(例如下面代码片段中的 d'Etudes):
Candidates holding a Dipl\xf4me de Technicien Sup\xe9rieur / Sciences Appliqu\xe9es with suitable grades or those with the Dipl\xf4me d'Etudes Universitaires G\xe9n\xe9rales (DEUG) with good grades in suitable subjects will be considered for appropriate undergraduate courses.
通过把资格项字段的连接和编码拆开,并改用参数化查询,我设法让它工作(至少在 SQLite3 中)。下面的代码应该可以工作,但请注意它未经 MySQL 测试;另外 MySQLdb 的参数占位符是 %s 而不是 SQLite3 的 ?,在 MySQL 上需要相应替换。如果出现任何错误,请检查 scrapy 日志并让我知道是否有任何问题。
def process_item(self, item, spider):
    """Insert the scraped item into the matching MySQL table.

    Fixes over the posted answer:
    - MySQLdb's DB-API paramstyle is 'format' (%s placeholders); the
      sqlite3-style '?' placeholders fail against MySQL.
    - ``qualification.encode('utf8')`` discarded its result (str.encode
      returns a new object); the connection's charset='utf8' already
      handles encoding, so the call is dropped entirely.
    The parameterised query lets the driver escape single quotes such as
    the one in "d'Etudes" that broke the string-formatted version.
    """
    try:
        if 'BristolQualification' in item:
            qualification = ''.join(item['BristolQualification'])
            self.cursor.execute(
                "INSERT INTO Bristol (BristolCountry, BristolQualification) VALUES (%s, %s)",
                (item['BristolCountry'], qualification))
        elif 'BathQualification' in item:
            qualification = ''.join(item['BathQualification'])
            self.cursor.execute(
                "INSERT INTO Bath (BathCountry, BathQualification) VALUES (%s, %s)",
                (item['BathCountry'], qualification))
        self.conn.commit()
        return item
    except MySQLdb.Error as e:
        print("Error %d: %s" % (e.args[0], e.args[1]))
关于python - 将我的项目管道更改为 if 和 elif 语句后,Spider 未返回所有结果,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/23045582/