Python reshape ValueError

Tags: python python-3.x selenium beautifulsoup

I am using the scraper from

https://github.com/PHMark/Web-Scraping

I modified the code below to suit my needs:

from bs4 import BeautifulSoup as bs
from selenium import webdriver
import urllib.request, urllib.error, urllib.parse
import re
import ssl
import pandas as pd
import numpy as np
import os

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
prefs = {'download.default_directory' : os.getcwd()}
chrome_options.add_experimental_option('prefs', prefs)

class SoupMaker():
    """
    A class that scrapes indeed's Job ads
    """
    def __init__(self, _url, _driver):
        self.base_url = "https://www.indeed.com"
        self.home_url = self.base_url + _url
        self.job_links = []
        self.driver = _driver
        self.job_datas = []
        self.job_table = []
        
    def read_page(self):        
        self.ctx = ssl.create_default_context()
        self.ctx.check_hostname = False
        self.ctx.verify_mode = ssl.CERT_NONE
        print("Parsing: ", self.home_url)
        self.url = urllib.request.urlopen(self.home_url,
                              context = self.ctx).read()
        _soup1 = bs(self.url, "html.parser")
        self.a_tags = _soup1('a')
        
    def get_job_url(self):
        for link in self.a_tags:
            link = link.get("href", None)
            if link != None:
                cmp_url = re.search("^/.+/.+/jobs/.+", link)
                rc_url = re.search("^/rc.+", link)
                if cmp_url or rc_url:
                    self.job_links.append(self.base_url + link.strip())
                    
    def get_job_info(self):
        for link in self.job_links:
            print("    Scraping: ", link)
            self.driver.get(link)
            self.driver.implicitly_wait(2750)
            _soup2 = bs(self.driver.page_source, "lxml")
            self.title = _soup2.find("title").get_text()
            self.job_descs = _soup2.find_all('div', 'jobsearch-JobComponent-description icl-u-xs-mt--md')
            self.job_origins = _soup2.find_all('div', 'jobsearch-JobMetadataFooter')
            
            self.job_title = re.findall("(.+) - .+ - .+", self.title)[0]
            self.job_location = re.findall(".+ - (.+) - .+", self.title)[0]
            self.description = ''
            for d in self.job_descs:
                self.description += d.get_text("|", strip = True) 
            self.origin = re.findall("^.+ ago", self.job_origins[0].get_text())[0]    
            self.job_datas.append(self.job_title)
            self.job_datas.append(self.job_location)
            self.job_datas.append(self.description)
            self.job_datas.append(self.origin)
            
        self.x = np.array(self.job_datas).reshape((10,4))
        df = pd.DataFrame(data=self.x, columns=['Job Title', 'Job Location',
                                    'Job Description', 'Job Origin'])
        return df
        
if __name__ == '__main__':
    n = int(input("Enter no. of pages to scrape: "))
    n = n*10
    file_name = input("Enter CSV filename: ")
    # driver = webdriver.Chrome(r"C:\chromedriver\chromedriver.exe")
    # driver = webdriver.Chrome('/usr/local/bin/chromedriver')
    driver = webdriver.Chrome('/usr/local/bin/chromedriver', chrome_options=chrome_options)
    writer = pd.ExcelWriter('{0}.xlsx'.format(file_name), engine='xlsxwriter')
    df = []
    
    for i in range(10, n+10, 10):
        #ext = "/jobs?q=&l=United+States&start={0}".format(i-10)
        ext = "/jobs?l=United+States&start={0}".format(i-10)
        if n == 10:
            #ext = "/jobs-in-United+States"
            ext ="/l-United+States-jobs.html"
        s = SoupMaker(ext, driver)
        s.read_page()
        s.get_job_url()
        df.append(s.get_job_info())
        
    result = pd.concat(df)
    result.to_excel(writer, index=False)
    writer.save()
    driver.close()

The script works fine if I scrape just 1 page, but if I try to scrape more than 10 pages, it fails with the following error:

Traceback (most recent call last):
  File "file.py", line 96, in <module>
    df.append(s.get_job_info())
  File "file.py", line 71, in get_job_info
    self.x = np.array(self.job_datas).reshape((10,4))
ValueError: cannot reshape array of size 0 into shape (10,4)

And if the input for the number of pages is larger than 100 or 50, it fails with this error:

Traceback (most recent call last):
  File "file.py", line 100, in <module>
    df.append(s.get_job_info())
  File "file.py", line 64, in get_job_info
    self.job_title = re.findall("(.+) - .+ - .+", self.title)[0]
IndexError: list index out of range

I'd really appreciate it if anyone could help me with this. Thanks in advance!

Best Answer

Just from looking at this, I think the problem is that it isn't actually retrieving any data. If get_job_url() doesn't parse any links, the loop in get_job_info() never runs, so the array stays empty (size 0) and the reshape fails.
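
One way to guard against this (a minimal sketch, not from the original answer): let NumPy infer the row count with -1 instead of hard-coding 10, and return an empty DataFrame when nothing was scraped. The helper name build_dataframe is made up; job_datas is the flat list that get_job_info() fills with four fields per job:

import numpy as np
import pandas as pd

def build_dataframe(job_datas):
    # get_job_info() appends four items per job:
    # title, location, description, origin.
    columns = ['Job Title', 'Job Location', 'Job Description', 'Job Origin']
    if not job_datas or len(job_datas) % 4 != 0:
        # Nothing scraped (or a partial row): return an empty frame
        # instead of letting reshape() raise a ValueError.
        return pd.DataFrame(columns=columns)
    rows = np.array(job_datas).reshape((-1, 4))  # -1 infers the row count
    return pd.DataFrame(data=rows, columns=columns)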

To get a better idea of what's happening, try stepping through with a debugger, or just add print statements to inspect the state. Possibly the URL for 10 pages is wrong and returns a 404 page that doesn't contain any links.
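
To see which page is coming back empty, prints like these inside the loop in __main__ would do (a sketch reusing the question's own names):

s = SoupMaker(ext, driver)
s.read_page()
s.get_job_url()
print("Page:", ext)
print("  <a> tags found:", len(s.a_tags))
print("  job links matched:", len(s.job_links))
if not s.job_links:
    print("  -> no job links parsed; the URL may be wrong or a 404 page")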

We found a similar question about this Python reshape ValueError on Stack Overflow: https://stackoverflow.com/questions/53330860/
