我不确定为什么,但我的脚本在到达第 9 页后总是停止抓取。没有错误、异常或警告,所以我有点不知所措。
有人可以帮帮我吗?
附言:Here is the full script in case anybody wants to test it for themselves!
def initiate_crawl():
def refresh_page(url):
ff = create_webdriver_instance()
ff.get(url)
ff.find_element(By.XPATH, '//*[@id="FilterItemView_sortOrder_dropdown"]/div/span[2]/span/span/span/span').click()
ff.find_element(By.XPATH, '//a[contains(text(), "Discount - High to Low")]').click()
items = WebDriverWait(ff, 15).until(
EC.visibility_of_all_elements_located((By.XPATH, '//div[contains(@id, "100_dealView_")]'))
)
print(len(items))
for count, item in enumerate(items):
slashed_price = item.find_elements(By.XPATH, './/span[contains(@class, "a-text-strike")]')
active_deals = item.find_elements(By.XPATH, './/*[contains(text(), "Add to Cart")]')
if len(slashed_price) > 0 and len(active_deals) > 0:
product_title = item.find_element(By.ID, 'dealTitle').text
if product_title not in already_scraped_product_titles:
already_scraped_product_titles.append(product_title)
url = ff.current_url
ff.quit()
refresh_page(url)
break
if count+1 is len(items):
try:
next_button = WebDriverWait(ff, 15).until(
EC.text_to_be_present_in_element((By.PARTIAL_LINK_TEXT, 'Next→'), 'Next→')
)
ff.find_element(By.PARTIAL_LINK_TEXT, 'Next→').click()
url = ff.current_url
ff.quit()
refresh_page(url)
except Exception as error:
print(error)
ff.quit()
refresh_page('https://www.amazon.ca/gp/goldbox/ref=gbps_ftr_s-3_4bc8_dct_10-?gb_f_c2xvdC0z=sortOrder:BY_SCORE,discountRanges:10-25%252C25-50%252C50-70%252C70-&pf_rd_p=f5836aee-0969-4c39-9720-4f0cacf64bc8&pf_rd_s=slot-3&pf_rd_t=701&pf_rd_i=gb_main&pf_rd_m=A3DWYIK6Y9EEQB&pf_rd_r=CQ7KBNXT36G95190QJB1&ie=UTF8')
initiate_crawl()
打印 items 的长度也会引发一些奇怪的行为:它不是总是返回 32(对应每页上的项目数),而是第一页打印 32,第二页打印 64,第三页打印 96,依此类推。我通过使用 //div[contains(@id, "100_dealView_")]/div[contains(@class, "dealContainer")] 而不是 //div[contains(@id, "100_dealView_")] 作为 items 变量的 XPath 解决了这个问题。我希望这就是它在第 9 页遇到问题的原因。我现在正在运行测试。更新:现在正在抓取第 10 页及以后的页面,因此问题已解决。
最佳答案
根据您对这个问题的第 10 次修订(10th revision)中的错误信息...
HTTPConnectionPool(host='127.0.0.1', port=58992): Max retries exceeded with url: /session/e8beed9b-4faa-4e91-a659-56761cb604d7/element (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x0000022D31378A58>: Failed to establish a new connection: [WinError 10061] No connection could be made because the target machine actively refused it'))
...暗示 get()
方法引发 HTTPConnectionPool 错误并显示消息 Max retries exceeded。
一些事情:
根据讨论 max-retries-exceeded exceptions are confusing,这里的回溯(Traceback)有点误导。为方便用户,requests 包装了异常,原始异常是所显示消息的一部分。
requests 从不重试(它为 urllib3 的 HTTPConnectionPool 设置了 retries=0),所以如果没有 MaxRetryError 和 HTTPConnectionPool 关键字,错误信息会规范得多。因此,理想的回溯应该是:NewConnectionError(<class 'socket.error'>: [Errno 10061] No connection could be made because the target machine actively refused it)
解决方案
根据 Selenium 3.14.1 的发行说明:
* Fix ability to set timeout for urllib3 (#6286)
合并是:repair urllib3 can't set timeout!
结论
升级到 Selenium 3.14.1 后,您将能够设置超时并查看规范的 Tracebacks 并能够采取所需的操作。
引用资料
一些相关的引用资料:
- Adding max_retries as an argument
- Removed the bundled charade and urllib3.
- Third party libraries committed verbatim
这个用例
我已经从 codepen.io - A PEN BY Anthony 中获取了您的完整脚本.我不得不对您现有的代码进行一些调整,如下所示:
如你所用:
ua_string = random.choice(ua_strings)
你必须强制导入 random
为:
import random
您已经创建了变量 next_button 但从未使用它。我删除了这个未使用的赋值,将以下几行:
next_button = WebDriverWait(ff, 15).until( EC.text_to_be_present_in_element((By.PARTIAL_LINK_TEXT, 'Next→'), 'Next→') ) ff.find_element(By.PARTIAL_LINK_TEXT, 'Next→').click()
作为:
WebDriverWait(ff, 15).until(EC.text_to_be_present_in_element((By.PARTIAL_LINK_TEXT, 'Next→'), 'Next→')) ff.find_element(By.PARTIAL_LINK_TEXT, 'Next→').click()
您修改后的代码块将是:
# -*- coding: utf-8 -*- from selenium import webdriver from selenium.webdriver.firefox.options import Options from selenium.webdriver.common.by import By from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.support.ui import WebDriverWait import time import random """ Set Global Variables """ ua_strings = ['Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'] already_scraped_product_titles = [] """ Create Instances of WebDriver """ def create_webdriver_instance(): ua_string = random.choice(ua_strings) profile = webdriver.FirefoxProfile() profile.set_preference('general.useragent.override', ua_string) options = Options() options.add_argument('--headless') return webdriver.Firefox(profile) """ Construct List of UA Strings """ def fetch_ua_strings(): ff = create_webdriver_instance() ff.get('https://techblog.willshouse.com/2012/01/03/most-common-user-agents/') ua_strings_ff_eles = ff.find_elements_by_xpath('//td[@class="useragent"]') for ua_string in ua_strings_ff_eles: if 'mobile' not in ua_string.text and 'Trident' not in ua_string.text: ua_strings.append(ua_string.text) ff.quit() """ Log in to Amazon to Use SiteStripe in order to Generate Affiliate Links """ def log_in(ff): ff.find_element(By.XPATH, '//a[@id="nav-link-yourAccount"] | //a[@id="nav-link-accountList"]').click() ff.find_element(By.ID, 'ap_email').send_keys('anthony_falez@hotmail.com') ff.find_element(By.ID, 'continue').click() ff.find_element(By.ID, 'ap_password').send_keys('lo0kyLoOkYig0t4h') ff.find_element(By.NAME, 'rememberMe').click() ff.find_element(By.ID, 'signInSubmit').click() """ Build Lists of Product Page URLs """ def initiate_crawl(): def refresh_page(url): ff = create_webdriver_instance() ff.get(url) ff.find_element(By.XPATH, '//*[@id="FilterItemView_sortOrder_dropdown"]/div/span[2]/span/span/span/span').click() ff.find_element(By.XPATH, '//a[contains(text(), "Discount - High to 
Low")]').click() items = WebDriverWait(ff, 15).until( EC.visibility_of_all_elements_located((By.XPATH, '//div[contains(@id, "100_dealView_")]')) ) for count, item in enumerate(items): slashed_price = item.find_elements(By.XPATH, './/span[contains(@class, "a-text-strike")]') active_deals = item.find_elements(By.XPATH, './/*[contains(text(), "Add to Cart")]') # For Groups of Items on Sale # active_deals = //*[contains(text(), "Add to Cart") or contains(text(), "View Deal")] if len(slashed_price) > 0 and len(active_deals) > 0: product_title = item.find_element(By.ID, 'dealTitle').text if product_title not in already_scraped_product_titles: already_scraped_product_titles.append(product_title) url = ff.current_url # Scrape Details of Each Deal #extract(ff, item.find_element(By.ID, 'dealImage').get_attribute('href')) print(product_title[:10]) ff.quit() refresh_page(url) break if count+1 is len(items): try: print('') print('new page') WebDriverWait(ff, 15).until(EC.text_to_be_present_in_element((By.PARTIAL_LINK_TEXT, 'Next→'), 'Next→')) ff.find_element(By.PARTIAL_LINK_TEXT, 'Next→').click() time.sleep(10) url = ff.current_url print(url) print('') ff.quit() refresh_page(url) except Exception as error: """ ff.find_element(By.XPATH, '//*[@id="pagination-both-004143081429407891"]/ul/li[9]/a').click() url = ff.current_url ff.quit() refresh_page(url) """ print('cannot find ff.find_element(By.PARTIAL_LINK_TEXT, "Next?")') print('Because of... {}'.format(error)) ff.quit() refresh_page('https://www.amazon.ca/gp/goldbox/ref=gbps_ftr_s-3_4bc8_dct_10-?gb_f_c2xvdC0z=sortOrder:BY_SCORE,discountRanges:10-25%252C25-50%252C50-70%252C70-&pf_rd_p=f5836aee-0969-4c39-9720-4f0cacf64bc8&pf_rd_s=slot-3&pf_rd_t=701&pf_rd_i=gb_main&pf_rd_m=A3DWYIK6Y9EEQB&pf_rd_r=CQ7KBNXT36G95190QJB1&ie=UTF8') #def extract_info(ff, url): fetch_ua_strings() initiate_crawl()
控制台输出:使用 Selenium v3.14.0 和 Firefox Quantum v62.0.3,我可以在控制台上提取以下输出:
J.Rosée Si B.Catcher Bluetooth4 FRAM G4164 Major Crim 20% off Oh True Blood Prime-Line Marathon 3 True Blood B.Catcher 4 Film Fav True Blood Texture Pa Westinghou True Blood ThermoPro ... ... ...
Note: I could have optimized your code and performed the same web scraping operations initializing the Firefox Browser Client only once and traverse through various products and their details. But to preserve your logic and innovation I have suggested the minimal changes required to get you through.
关于python - 脚本突然停止爬取,无错误无异常,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/52692120/