python - Selenium Python 脚本在 Windows 和 Ubuntu 环境中有不同的行为

标签 python windows selenium ubuntu webdriver

我试过在 Windows 和 Ubuntu 上运行脚本,两者都使用 Python 3 和最新版本的 geckodriver,导致不同的行为。完整的脚本如下。

我正在尝试从备考网站获取多个不同测试的数据。有不同的科目,每个科目都有专业,每个科目都有练习测试,每个科目都有几个问题。 scrape 函数遍历了获取每种类型数据的步骤。

subject <--- specialization <---- practice-test *------ question

get_questions 函数是不同之处:

  • 在 Windows 中,它的行为符合预期。单击最后一个问题的选择后,将转到结果页面。
  • 在 Ubuntu 中,当在最后一个问题上单击一个选项时,它会重新加载最后一个问题并不断单击相同的选项并重新加载相同的问题。

    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    import pathlib
    import time
    import json
    import os
    
    driver=webdriver.Firefox(executable_path="./geckodriver.exe")
    wait = WebDriverWait(driver, 15)
    data=[]
    
    def setup():
    
       driver.get('https://www.varsitytutors.com/practice-tests')
       try:
          go_away_1= driver.find_element_by_class_name("ub-emb-iframe")
          driver.execute_script("arguments[0].style.visibility='hidden'", go_away_1)
          go_away_2= driver.find_element_by_class_name("ub-emb-iframe-wrapper")
          driver.execute_script("arguments[0].style.visibility='hidden'", go_away_2)
          go_away_3= driver.find_element_by_class_name("ub-emb-visible")
          driver.execute_script("arguments[0].style.visibility='hidden'", go_away_3)
       except:
          pass
    
    def get_subjects(subs=[]):
       subject_clickables_xpath="/html/body/div[3]/div[9]/div/*/div[@data-subject]/div[1]"
       subject_clickables=driver.find_elements_by_xpath(subject_clickables_xpath)
       subject_names=map(lambda x : x.find_element_by_xpath('..').get_attribute('data-subject'), subject_clickables)
       subject_pairs=zip(subject_names, subject_clickables)
       return subject_pairs
    
    def get_specializations(subject):
    
       specialization_clickables_xpath="//div//div[@data-subject='"+subject+"']/following-sibling::div//div[@class='public_problem_set']//a[contains(.,'Practice Tests')]"
       specialization_names_xpath="//div//div[@data-subject='"+subject+"']/following-sibling::div//div[@class='public_problem_set']//a[contains(.,'Practice Tests')]/../.."
       specialization_names=map(lambda x : x.get_attribute('data-subject'), driver.find_elements_by_xpath(specialization_names_xpath))
       specialization_clickables = driver.find_elements_by_xpath(specialization_clickables_xpath)
       specialization_pairs=zip(specialization_names, specialization_clickables)
       return specialization_pairs
    
    def get_practices(subject, specialization):
       practice_clickables_xpath="/html/body/div[3]/div[8]/div[3]/*/div[1]/a[1]"
       practice_names_xpath="//*/h3[@class='subject_header']"
       lengths_xpath="/html/body/div[3]/div[8]/div[3]/*/div[2]"
       lengths=map(lambda x : x.text, driver.find_elements_by_xpath(lengths_xpath))
       print(lengths)
       practice_names=map(lambda x : x.text, driver.find_elements_by_xpath(practice_names_xpath))
       practice_clickables = driver.find_elements_by_xpath(practice_clickables_xpath)
       practice_pairs=zip(practice_names, practice_clickables)
       return practice_pairs
    
    def remove_popup():
       try:
    
          button=wait.until(EC.element_to_be_clickable((By.XPATH,"//button[contains(.,'No Thanks')]")))
          button.location_once_scrolled_into_view
          button.click()
       except:
          print('could not find the popup')
    
    def get_questions(subject, specialization, practice):
       remove_popup()
       questions=[]
       current_question=None
       while True:
          question={}
          try:
             WebDriverWait(driver,5).until(EC.presence_of_element_located((By.XPATH,"/html/body/div[3]/div[7]/div[1]/div[2]/div[2]/table/tbody/tr/td[1]")))
             question_number=driver.find_element_by_xpath('/html/body/div[3]/div[7]/div[1]/div[2]/div[2]/table/tbody/tr/td[1]').text.replace('.','')
             question_pre=driver.find_element_by_class_name('question_pre')
             question_body=driver.find_element_by_xpath('/html/body/div[3]/div[7]/div[1]/div[2]/div[2]/table/tbody/tr/td[2]/p')
             answer_choices=driver.find_elements_by_class_name('question_row')
             answers=map(lambda x : x.text, answer_choices)
             question['id']=question_number
             question['pre']=question_pre.text
             question['body']=question_body.text
             question['answers']=list(answers)
             questions.append(question)
             choice=WebDriverWait(driver,20).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR,"input.test_button")))
             driver.execute_script("arguments[0].click();", choice[3])
             time.sleep(3)
          except Exception as e:
             if 'results' in driver.current_url:
                driver.get(driver.current_url.replace('http://', 'https://'))
                # last question has been answered; record results
                remove_popup()
                pathlib.Path('data/'+subject+'/'+specialization).mkdir(parents=True, exist_ok=True)
                with open('data/'+subject+'/'+specialization+'/questions.json', 'w') as outfile:
                   json.dump(list(questions), outfile)
                   break
             else:
                driver.get(driver.current_url.replace('http://', 'https://'))
       return questions
    
    
    def scrape():
       setup()
       subjects=get_subjects()
       for subject_name, subject_clickable in subjects:
          subject={}
          subject['name']=subject_name
          subject['specializations']=[]
          subject_clickable.click()
          subject_url=driver.current_url.replace('http://', 'https://')
          specializations=get_specializations(subject_name)
          for specialization_name, specialization_clickable in specializations:
             specialization={}
             specialization['name']=specialization_name
             specialization['practices']=[]
             specialization_clickable.click()
             specialization_url=driver.current_url.replace('http://', 'https://')
             practices=get_practices(subject_name, specialization_name)
             for practice_name, practice_clickable in practices:
                practice={}
                practice['name']=practice_name
                practice_clickable.click()
                questions=get_questions(subject_name, specialization_name, practice_name)
                practice['questions']=questions
                driver.get(specialization_url)
             driver.get(subject_url)
          data.append(subject)
       print(data)
    scrape()
    

谁能帮我弄清楚是什么原因造成的?

最佳答案

这只是时机。在加载下一页之前,最后一个问题将比 3 秒 sleep 时间长得多。等待页面消失可以解决此问题并加快脚本执行速度。

  from selenium.common.exceptions import StaleElementReferenceException
<snip>
             choice=WebDriverWait(driver,20).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR,"input.test_button")))
             choice[3].click()
             try:
                while choice[3].is_displayed():
                   time.sleep(1)
             except StaleElementReferenceException as e:
                continue

关于python - Selenium Python 脚本在 Windows 和 Ubuntu 环境中有不同的行为,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/59077712/

相关文章:

python - 在 python 中使用 BeautifulSoup 进行标签和字符串混合查找和替换

windows - 何时使用 Wave 可扩展格式?

linux - Windows 上的 wx python 图像刷新

python - 如何使用带有 Python 的 Selenium WebDriver 获取选定的选项?

java - Selenium 从 WebDriver 获取 HTML(或 JSON)响应

java - WaitforElement 超时错误?

python - 如何在 pandas 中指定空格

python - 如何自动将图像中的零填充添加到预期形状?

python - Bokeh 趋势/补丁图中带有两个参数的悬停工具

c - 程序按预期工作,但打印结果后我收到错误消息