我不知道自己哪里做错了。我正在尝试提取文本并将其存储在列表中。在 Firebug 和 FirePath 中输入 XPath 路径时,显示的文本完全正确;但在代码中使用同一路径时,却返回空列表。我正在尝试抓取 www.insider.in/mumbai:爬虫应进入所有活动链接,并抓取活动标题、地址和其他信息。这是我新编辑的代码:
from scrapy.spider import BaseSpider
from scrapy.selector import Selector
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from scrapy.selector import HtmlXPathSelector
import time
import requests
import csv
# Spider from the question: reads the event links from the listing page, opens
# each event page in a Selenium Firefox driver, and tries to extract title,
# caption, price, venue and description into event_details.
# NOTE(review): indentation was lost in this listing; the code is kept exactly
# as posted (Python 2 print statements included).
class insiderSpider(BaseSpider):
name = 'insider'
allowed_domains = ["insider.in"]
start_urls = ["http://www.insider.in/mumbai/"]
def parse(self,response):
# Starts a Firefox browser; nothing in this listing ever closes it.
driver = webdriver.Firefox()
print response.url
driver.get(response.url)
# Selector built from the Scrapy response for the start URL (the listing page).
s = Selector(response)
#hxs = HtmlXPathSelector(response)
source_link = []
temp = []
title =""
Price = ""
Venue_name = ""
Venue_address = ""
description = ""
event_details = []
# Every href found under the "bottom-details-right" blocks of the listing page.
alllinks = s.xpath('//div[@class="bottom-details-right"]//a/@href').extract()
print alllinks
length_of_alllinks = len(alllinks)
# NOTE(review): the range starts at 1, so alllinks[0] is never examined.
for single_event in range(1,length_of_alllinks):
if "https://insider.in/event" in alllinks[single_event]:
source_link.append(alllinks[single_event])
# The browser navigates to the event page ...
driver.get(alllinks[single_event])
# ... but the selector is rebuilt from the same start-page `response`, not
# from the page the driver just loaded. Every XPath below therefore queries
# the listing page; NOTE(review): this would explain the empty lists in the
# posted output.
s = Selector(response)
#hxs = HtmlXPathSelector(response)
# Fixed pause; `s` was already built above, so waiting does not change what it contains.
time.sleep(3)
title = s.xpath('//div[@class = "cell-title in-headerTitle"]/h1//text()').extract()
print title
temp = s.xpath('//div[@class = "cell-caption centered in-header"]//h3//text()').extract()
print temp
time.sleep(2)
# Number of text nodes matched by the price XPath; the same query is run again below.
a = len(s.xpath('//div[@class = "bold-caption price"]//text()').extract())
if a > 0:
Price = s.xpath('//div[@class = "bold-caption price"]//text()').extract()
time.sleep(2)
else:
# No price text matched: fall back to the literal "RSVP".
Price = "RSVP"
time.sleep(2)
print Price
Venue_name = s.xpath('//div[@class = "address"]//div[@class = "section-title"]//text()').extract()
print Venue_name
# Text nodes inside the address block that have a preceding <br> sibling.
Venue_address = s.xpath('//div[@class ="address"]//div//text()[preceding-sibling::br]').extract()
print Venue_address
description = s.xpath('//div[@class="cell-caption accordion-padding"]//text()').extract()
print description
time.sleep(5)
# One record per event link; event_details is local to parse and is not returned.
event_details.append([title,temp,Price,Venue_name,Venue_address,description])
else:
print "Other part"
编辑后的输出:
[u'https://insider.in/weekender-music-festival-2015', u'https://insider.in/event/east-india-comedy-presents-back-benchers#', u'https://insider.in/event/art-of-story-telling', u'https://insider.in/feelings-in-india-with-kanan-gill', u'https://insider.in/event/the-tall-tales-workshop-capture-your-story', u'https://insider.in/halloween-by-the-pier-2015', u'https://insider.in/event/whats-your-story', u'https://insider.in/event/beyond-contemporary-art']
2015-08-03 12:53:29 [selenium.webdriver.remote.remote_connection] DEBUG: POST http://127.0.0.1:60924/hub/session/f675b909-5515-41d4-a89e-d197c296023d/url {"url": "https://insider.in/event/east-india-comedy-presents-back-benchers#", "sessionId": "f675b909-5515-41d4-a89e-d197c296023d"}
2015-08-03 12:53:29 [selenium.webdriver.remote.remote_connection] DEBUG: Finished Request
[]
[]
RSVP
[]
[]
[]
[[[], [], 'RSVP', [], [], []]]
就连 if 条件也不成立,结果打印了 RSVP。我实在不明白自己哪里做错了,已经在这部分卡了三天。请帮忙。
最佳答案
我去掉了 webdriver 之类的部分,得到了一份可以正常工作的基础代码:
import scrapy
import logging
from scrapy.http import Request
from scrapy.selector import Selector
class insiderSpider(scrapy.Spider):
    """Crawl the insider.in Mumbai listing and collect details of each event.

    ``parse`` schedules one request per event link found on the listing
    page; ``parse_event`` extracts the fields of a single event page from
    the response Scrapy passes in. No browser automation is involved.
    """

    name = 'insider'
    allowed_domains = ["insider.in"]
    start_urls = ["http://www.insider.in/mumbai/"]
    # Changed: event_details is now member data of the class, so every
    # parse_event call appends to the same list (it is shared class-wide).
    event_details = []

    def parse(self, response):
        """Yield a Request for every event link on the listing page.

        Links that do not contain "https://insider.in/event" are not
        followed; 'Other part' is printed for each of them instead.
        """
        alllinks = response.xpath('//div[@class="bottom-details-right"]//a/@href').extract()
        print(alllinks)
        for single_event in alllinks:
            if "https://insider.in/event" in single_event:
                # Each event page is fetched by Scrapy and handed to
                # parse_event with its own response.
                yield Request(url=single_event, callback=self.parse_event)
            else:
                print('Other part')

    def parse_event(self, response):
        """Extract one event's fields and append them to ``self.event_details``.

        The appended record is
        ``[title, temp, Price, Venue_name, Venue_address, description]``.
        ``Price`` is the list of matched price texts, or the string "RSVP"
        when the page has none. Returns nothing; the record and the running
        list are printed.
        """
        title = response.xpath('//div[@class = "cell-title in-headerTitle"]/h1//text()').extract()
        print(title)
        temp = response.xpath('//div[@class = "cell-caption centered in-header"]//h3//text()').extract()
        print(temp)
        # Evaluate the price XPath once; an empty result means no listed price.
        Price = response.xpath('//div[@class = "bold-caption price"]//text()').extract() or "RSVP"
        print(Price)
        # NOTE(review): per XPath 1.0, normalize-space() applied to a node-set
        # uses only the first matched node, so each of the three queries below
        # yields a single whitespace-normalized string.
        Venue_name = response.xpath('normalize-space(//div[@class = "address"]//div[@class = "section-title"]//text())').extract()
        print(Venue_name)
        Venue_address = response.xpath('normalize-space(//div[@class ="address"]//div//text()[preceding-sibling::br])').extract()
        print(Venue_address)
        description = response.xpath('normalize-space(//div[@class="cell-caption accordion-padding"]//text())').extract()
        print(description)
        # Note that event_details is accessed as self.event_details, i.e. the
        # member data declared on the class.
        self.event_details.append([title, temp, Price, Venue_name, Venue_address, description])
        print(self.event_details)
关于python - Scrapy xpath 在 python 中返回空列表,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/31782321/