我试过在 Windows 和 Ubuntu 上运行脚本,两者都使用 Python 3 和最新版本的 geckodriver,导致不同的行为。完整的脚本如下。
我正在尝试从备考网站获取多个不同测试的数据。有不同的科目,每个科目都有专业,每个科目都有练习测试,每个科目都有几个问题。 scrape
函数遍历了获取每种类型数据的步骤。
subject <--- specialization <---- practice-test *------ question
get_questions
函数是不同之处:
- 在 Windows 中,它的行为符合预期。单击最后一个问题的选择后,将转到结果页面。
在 Ubuntu 中,当在最后一个问题上单击一个选项时,它会重新加载最后一个问题并不断单击相同的选项并重新加载相同的问题。
from selenium import webdriver from selenium.webdriver.common.by import By from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC import pathlib import time import json import os driver=webdriver.Firefox(executable_path="./geckodriver.exe") wait = WebDriverWait(driver, 15) data=[] def setup(): driver.get('https://www.varsitytutors.com/practice-tests') try: go_away_1= driver.find_element_by_class_name("ub-emb-iframe") driver.execute_script("arguments[0].style.visibility='hidden'", go_away_1) go_away_2= driver.find_element_by_class_name("ub-emb-iframe-wrapper") driver.execute_script("arguments[0].style.visibility='hidden'", go_away_2) go_away_3= driver.find_element_by_class_name("ub-emb-visible") driver.execute_script("arguments[0].style.visibility='hidden'", go_away_3) except: pass def get_subjects(subs=[]): subject_clickables_xpath="/html/body/div[3]/div[9]/div/*/div[@data-subject]/div[1]" subject_clickables=driver.find_elements_by_xpath(subject_clickables_xpath) subject_names=map(lambda x : x.find_element_by_xpath('..').get_attribute('data-subject'), subject_clickables) subject_pairs=zip(subject_names, subject_clickables) return subject_pairs def get_specializations(subject): specialization_clickables_xpath="//div//div[@data-subject='"+subject+"']/following-sibling::div//div[@class='public_problem_set']//a[contains(.,'Practice Tests')]" specialization_names_xpath="//div//div[@data-subject='"+subject+"']/following-sibling::div//div[@class='public_problem_set']//a[contains(.,'Practice Tests')]/../.." specialization_names=map(lambda x : x.get_attribute('data-subject'), driver.find_elements_by_xpath(specialization_names_xpath)) specialization_clickables = driver.find_elements_by_xpath(specialization_clickables_xpath) specialization_pairs=zip(specialization_names, specialization_clickables) return specialization_pairs def get_practices(subject, specialization): practice_clickables_xpath="/html/body/div[3]/div[8]/div[3]/*/div[1]/a[1]" practice_names_xpath="//*/h3[@class='subject_header']" lengths_xpath="/html/body/div[3]/div[8]/div[3]/*/div[2]" lengths=map(lambda x : x.text, driver.find_elements_by_xpath(lengths_xpath)) print(lengths) practice_names=map(lambda x : x.text, driver.find_elements_by_xpath(practice_names_xpath)) practice_clickables = driver.find_elements_by_xpath(practice_clickables_xpath) practice_pairs=zip(practice_names, practice_clickables) return practice_pairs def remove_popup(): try: button=wait.until(EC.element_to_be_clickable((By.XPATH,"//button[contains(.,'No Thanks')]"))) button.location_once_scrolled_into_view button.click() except: print('could not find the popup') def get_questions(subject, specialization, practice): remove_popup() questions=[] current_question=None while True: question={} try: WebDriverWait(driver,5).until(EC.presence_of_element_located((By.XPATH,"/html/body/div[3]/div[7]/div[1]/div[2]/div[2]/table/tbody/tr/td[1]"))) question_number=driver.find_element_by_xpath('/html/body/div[3]/div[7]/div[1]/div[2]/div[2]/table/tbody/tr/td[1]').text.replace('.','') question_pre=driver.find_element_by_class_name('question_pre') question_body=driver.find_element_by_xpath('/html/body/div[3]/div[7]/div[1]/div[2]/div[2]/table/tbody/tr/td[2]/p') answer_choices=driver.find_elements_by_class_name('question_row') answers=map(lambda x : x.text, answer_choices) question['id']=question_number question['pre']=question_pre.text question['body']=question_body.text question['answers']=list(answers) questions.append(question) choice=WebDriverWait(driver,20).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR,"input.test_button"))) driver.execute_script("arguments[0].click();", choice[3]) time.sleep(3) except Exception as e: if 'results' in driver.current_url: driver.get(driver.current_url.replace('http://', 'https://')) # last question has been answered; record results remove_popup() pathlib.Path('data/'+subject+'/'+specialization).mkdir(parents=True, exist_ok=True) with open('data/'+subject+'/'+specialization+'/questions.json', 'w') as outfile: json.dump(list(questions), outfile) break else: driver.get(driver.current_url.replace('http://', 'https://')) return questions def scrape(): setup() subjects=get_subjects() for subject_name, subject_clickable in subjects: subject={} subject['name']=subject_name subject['specializations']=[] subject_clickable.click() subject_url=driver.current_url.replace('http://', 'https://') specializations=get_specializations(subject_name) for specialization_name, specialization_clickable in specializations: specialization={} specialization['name']=specialization_name specialization['practices']=[] specialization_clickable.click() specialization_url=driver.current_url.replace('http://', 'https://') practices=get_practices(subject_name, specialization_name) for practice_name, practice_clickable in practices: practice={} practice['name']=practice_name practice_clickable.click() questions=get_questions(subject_name, specialization_name, practice_name) practice['questions']=questions driver.get(specialization_url) driver.get(subject_url) data.append(subject) print(data) scrape()
谁能帮我弄清楚是什么原因造成的?
最佳答案
这只是时机。在加载下一页之前,最后一个问题将比 3 秒 sleep 时间长得多。等待页面消失可以解决此问题并加快脚本执行速度。
from selenium.common.exceptions import StaleElementReferenceException
<snip>
choice=WebDriverWait(driver,20).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR,"input.test_button")))
choice[3].click()
try:
while choice[3].is_displayed():
time.sleep(1)
except StaleElementReferenceException as e:
continue
关于python - Selenium Python 脚本在 Windows 和 Ubuntu 环境中有不同的行为,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/59077712/