假设有以下 CrawlSpider:
import scrapy
from scrapy.loader import ItemLoader
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from tutorial.items import TestItem
from scrapy.http import HtmlResponse
class TestCrawlSpider(CrawlSpider):
    """Crawl immobiliare.it listing pages for Roma and Napoli.

    Follows the "next page" pagination link and extracts one TestItem
    per listing block on every visited page.
    """

    name = "test_crawl"
    allowed_domains = ["www.immobiliare.it"]
    start_urls = [
        "http://www.immobiliare.it/Roma/case_in_vendita-Roma.html?criterio=rilevanza",
        "http://www.immobiliare.it/Napoli/case_in_vendita-Napoli.html?criterio=rilevanza",
    ]
    # Follow only the pagination link; CrawlSpider also runs the callback
    # on each followed page because follow=True keeps the rule recursive.
    rules = (
        Rule(
            LinkExtractor(
                allow=(),
                restrict_xpaths=('//a[@class="no-decoration button next_page_act"]',),
            ),
            callback="parse_start_url",
            follow=True,
        ),
    )

    def parse_start_url(self, response):
        """Yield a loaded TestItem for each listing block in *response*.

        City/Longitude/Latitude are placeholders here ('...'); the question
        asks how to fill them from an external dataframe.
        """
        for selector in response.css('div.content'):
            # `loader` instead of the ambiguous single-letter name `l` (PEP 8 E741).
            loader = ItemLoader(item=TestItem(), selector=selector)
            loader.add_css('Price', '.price::text')
            loader.add_value('City', '...')
            loader.add_value('Longitude', '...')
            loader.add_value('Latitude', '...')
            yield loader.load_item()
和对应的items.py:
import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst, MapCompose, Join
class TestItem(scrapy.Item):
    """Item carrying a listing's price plus the geo info of its city."""

    # str.strip, not Python 2's `unicode.strip`: on Python 3 `unicode` is
    # undefined and would raise NameError when this class body executes.
    # MapCompose applies strip() to every extracted price string on output.
    Price = scrapy.Field(
        output_processor=MapCompose(str.strip),
    )
    City = scrapy.Field(serializer=str)
    Latitude = scrapy.Field(serializer=str)
    Longitude = scrapy.Field(serializer=str)
对于每个 start_url,我都有相应的地理信息(“城市”、“经度”、“纬度”)存储在 pandas 数据框中。对于上面的示例,数据框如下所示:
City Latitude Longitude
0 Roma 40.85 14.30
1 Napoli 41.53 12.30
如何使用存储在数据框中的信息填充项目“城市”、“经度”、“纬度”?
最佳答案
我会使用 start_requests() 方法,在 meta 中为每个城市附带对应的地理信息;
先用 .to_dict() 把数据框转成字典,以简化按城市名的查找:
def start_requests(self):
    """Issue one request per start URL, attaching that city's geo info in meta.

    The per-city info travels with the request so the parse callback can read
    it back from ``response.meta["info"]``.
    """
    df = pd.DataFrame(...)  # placeholder: the caller's geo dataframe
    # Map City -> {"Latitude": ..., "Longitude": ...}.
    # orient="index" is essential: the default orient keys the dict by COLUMN
    # name ({"Latitude": {"Roma": ...}, ...}), so d[city] would raise KeyError.
    d = df.set_index('City').to_dict(orient='index')
    # Dots escaped so '.' matches literally; \w+ captures the city segment.
    pattern = re.compile(r"http://www\.immobiliare\.it/(\w+)/")
    for url in self.start_urls:
        city = pattern.search(url).group(1)
        yield scrapy.Request(url, meta={"info": d[city]})
然后,在回调中,从 response.meta
中获取信息字典:
def parse_start_url(self, response):
    """Build one item per listing block, merging page data with the
    per-city geo info that was stashed in the request meta."""
    info = response.meta["info"]
    for block in response.css('div.content'):
        loader = ItemLoader(item=TestItem(), selector=block)
        loader.add_css('Price', '.price::text')
        # Copy the geo fields straight from the meta dict.
        for field in ('City', 'Longitude', 'Latitude'):
            loader.add_value(field, info[field])
        yield loader.load_item()
未测试。
关于python - scrapy - 如何用 Pandas 数据框中的数据填充项目?,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/37882663/