我编写了一个爬虫来从网站中提取价格、名称和评论。但是当我将它们组合在一起时,30 个结果中只显示了 9 个结果。不知道问题出在哪里。我还需要添加下一页链接,我不应该这样做。
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import NoSuchElementException
import unittest, time, re
import time
from scrapy.item import Item, Field
from selenium import webdriver
from scrapy.spider import BaseSpider
from scrapy.contrib.spiders import CrawlSpider, Rule
from bs4 import BeautifulSoup
import urllib2
import sys;
reload(sys);
sys.setdefaultencoding("utf8")
class Agoda(CrawlSpider):
name = 'agoda'
allowed_domains = ["agoda.com"]
start_urls = ["http://www.agoda.com"]
driver = webdriver.Firefox()
driver.get("http://www.agoda.com")
driver.find_element_by_id("ctl00_ctl00_MainContent_area_promo_HomeSearchBox1_TextSearch1_searchText").clear()
driver.find_element_by_id("ctl00_ctl00_MainContent_area_promo_HomeSearchBox1_TextSearch1_searchText").send_keys("Mumbai")
driver.find_element_by_xpath("//select[contains(@id,'ddlCheckInDay')]")
driver.find_element_by_xpath("//option[contains(.,'Mon 09')]").click()
driver.find_element_by_id("ctl00_ctl00_MainContent_area_promo_HomeSearchBox1_SearchButton").click()
driver.find_element_by_id("ctl00_ContentMain_rptAB1936_ctl01_rptSearchResultAB1936_ctl01_lnkResult1936" or "ctl00_ContentMain_rptSearchResult_ctl01_lnkResult" or "ctl00_ContentMain_rptSearchResult_ctl01_lnkResult").click()
#driver.find_element_by_id("ctl00_ContentMain_rptSearchResult_ctl01_lnkResult").click()
time.sleep(40);
#print driver.page_source
TotalResults = driver.find_element_by_xpath("//span[@class='blue ssr_search_text']")
print TotalResults.text
html_source = driver.page_source
soup = BeautifulSoup(html_source)
names = soup("a", {"class":"hot_name"})
prices = soup("span", {"class":"fontxlargeb purple"})
reviews = soup("a", {"class":"fontlargeb"})
hotel_names = [name[1].get_text() for name in enumerate(names)] #or [name[1].get_text() for name in enumerate(names)]
prices = [price[1].get_text() for price in enumerate(prices)]
reviews = [review[1].get_text() for review in enumerate(reviews)] #[price[1].get_text() for price in enumerate(prices)]
name_price_list = zip(hotel_names, prices, reviews)
for name, price, review in name_price_list:
print name, price, review
最佳答案
您可以使用izip_longest来自itertools
Make an iterator that aggregates elements from each of the iterables. If the iterables are of uneven length, missing values are filled-in with fillvalue. Iteration continues until the longest iterable is exhausted
示例:
>>> import itertools
>>> l2 = ['a','b','c']
>>> l1 = [1, 2]
>>> list(itertools.izip_longest(l1, l2))
[(1, 'a'), (2, 'b'), (None, 'c')]
关于python - Zip 函数未显示已抓取数据的完整列表,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/20477279/