python - How do I skip an index that is out of range?

Tags: python pandas web-scraping beautifulsoup

I'm working through a learning exercise to scrape eBay listings. My list of item boxes has a length of 48, but only 26 of the items have a rating div, so I get IndexError: list index out of range. How can I skip such an item, or write something like "N/A" when item_rating is empty? I tried continue but couldn't get it to work. This is really a general question that applies to other fields as well, such as item_shipping. Thanks in advance.

Updated

import requests
from bs4 import BeautifulSoup
import pandas as pd

URL='https://www.ebay.com/b/Makeup-Products/31786/bn_1865570' #alternate: https://www.ebay.com/b/Eye-Makeup/172020/bn_1880663
response=requests.get(URL)
soup= BeautifulSoup(response.content, 'html.parser')
columns=["Name","Price","Rating","Location"]
#Product features
main_table=soup.find('ul',attrs={'class':'b-list__items_nofooter'})
item_boxes=main_table.find_all('div',attrs={'class':'s-item__info clearfix'})
item = item_boxes[0]

df=pd.DataFrame(columns=columns)

for item in item_boxes:

    item_name = item.findAll('h3')
    try:
       item_name_row = item_name[0].text.replace('\n','')
    except:
       item_name = "N/A"


    item_price = item.find_all('span',{'class':'s-item__price'})
    try:
       item_price_row = item_price[0].text.replace('\n','')
    except:
       item_price_row = "N/A"      

    try:
       item_rating = item.findAll('div',{'class':'s-item__reviews'})[0].div
       item_rating_row = item_rating.text
    except:
       item_rating_row = None

    try:
       item_location = item.find_all('span',{'class':'s-item__location s-item__itemLocation'})[0]
       item_location_row = item_location.text
    except:
       item_location_row = None   

    row = [ item_name_row, item_price_row, item_rating_row, item_location_row ]
    df =df.append(pd.Series(row,index=columns),ignore_index=True)
    df.to_csv('ebay1.csv', index=False)


    if item_rating != None:

      row = [item_name[0].text.replace('\n','') for name in item_name] + [item_price[0].text.replace('\n','') for price in item_price] + [item_rating.text.replace('\n','') for rating in item_rating] + [item_location_row[0].replace('\n','') for location in item_location]

    elif item_location != None:

      row = [item_name[0].text.replace('\n','') for name in item_name] + [item_price[0].text.replace('\n','') for price in item_price] + [item_rating.text.replace('\n','') for rating in item_rating] + [item_location_row[0].replace('\n','') for location in item_location]
    else: 
      row = [ item_name[0].text.replace('\n','') for name in item_name] + [item_price[0].text.replace('\n','') for price in item_price] + [item_rating] + [item_location_row]
    df =df.append(pd.Series(row,index=columns),ignore_index=True)

df.to_csv('ebay4.csv', index=False)


Best Answer

Here you go, this handles the listings that have no rating:

import requests
from bs4 import BeautifulSoup
import pandas as pd

URL='https://www.ebay.com/b/Makeup-Products/31786/bn_1865570'
response=requests.get(URL)
soup= BeautifulSoup(response.content, 'html.parser')
columns=['name',"price","rating"]
#Product features
main_table=soup.find('ul',attrs={'class':'b-list__items_nofooter'})
item_boxes=main_table.find_all('div',attrs={'class':'s-item__info clearfix'})
item = item_boxes[0]

df=pd.DataFrame(columns=columns)

for item in item_boxes:

    item_name = item.findAll('h3')
    item_price = item.find_all('span',{'class':'s-item__price'})
    try:
       item_rating = item.findAll('div',{'class':'s-item__reviews'})[0].div
    except:
       item_rating = None
    if item_rating != None:
      row = [item_name[0].text.replace('\n','') for name in item_name] + [item_price[0].text.replace('\n','') for price in item_price] + [item_rating.text.replace('\n','') for rating in item_rating]
    else: 
      row = [ item_name[0].text.replace('\n','') for name in item_name] + [item_price[0].text.replace('\n','') for price in item_price] + [item_rating]
    df =df.append(pd.Series(row,index=columns),ignore_index=True)

df.to_csv('ebay1.csv', index=False)
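
The same try/except idea generalizes to any optional field the question mentions (shipping, location, and so on). Below is a minimal sketch of a small helper, a suggestion rather than part of the original code; the helper name safe_text and the 's-item__shipping' class in the usage comment are illustrative assumptions:

def safe_text(tags, default="N/A"):
    # Return the cleaned text of the first tag in a find_all() result,
    # or the default when the result set is empty.
    try:
        return tags[0].text.replace('\n', '')
    except (IndexError, AttributeError):
        return default

# Example usage inside the loop (the class name is an assumption, check the page's HTML):
# item_shipping_row = safe_text(item.find_all('span', {'class': 's-item__shipping'}))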

Here is a modified version I used to scrape recolorado data for a neighbor:

import requests
from bs4 import BeautifulSoup
import pandas as pd
URL='https://www.recolorado.com/find-real-estate/80817/1-pg/exclusive-dorder/price-dorder/photo-tab/'
response=requests.get(URL)
soup= BeautifulSoup(response.content, 'html.parser')
columns=['address',"price","active","bedrooms","bathrooms","sqft","courtesy"]
#Product features
main_table=soup.find('div',attrs={'class':'page--column', 'data-id':'listing-results'})
item_boxes=main_table.find_all('div',attrs={'class':'listing--information listing--information__photo'})
df=pd.DataFrame(columns=columns)

for item in item_boxes:

    price = item.find('li', attrs={'class': 'listing--detail listing--detail__photo listing--detail__price'})
    price_row = price.text.replace('\r','').replace('\n','').replace(' ', '')
    #print(price_row)

    address = item.find('h2', attrs={'class': 'listing--street listing--street__photo'})
    address_row = address.text.replace(', ', '')
    #print(address_row)

    active_listing = item.find('div', attrs={'class': 'listing--status listing--status__photo listing--status__Under Contract'})
    try:
       active_row = active_listing.text
    except:
       active_row = "N/A"
    #print(active_row)

    bedrooms = item.find('li', attrs={'class': 'listing--detail listing--detail__photo listing--detail__bedrooms'})
    try:
       bedrooms_row = bedrooms.text.replace('\r','').replace('\n','').replace(' ', '')
    except:
       bedrooms_row = "N/A"
    #print(bedrooms_row)

    bathrooms = item.find('li', attrs={'class': 'listing--detail listing--detail__photo listing--detail__bathrooms'})
    try:
       bathrooms_row = bathrooms.text.replace('\r','').replace('\n','').replace(' ', '')
    except:
       bathrooms_row = "N/A"
    #print(bathrooms_row)

    sqft = item.find('li', attrs={'class': 'listing--detail listing--detail__photo listing--detail__sqft'})
    try:
       sqft_row = sqft.text.replace('\r','').replace('\n','').replace(' ', '')
    except:
       sqft_row = "N/A"
    #print(sqft_row)

    courtesy = item.find('div', attrs={'class': 'listing--courtesy listing--courtesy__photo show-mobile'})
    try:
       courtesy_row = courtesy.text.replace('\r','').replace('\n','').replace(' ', '')
    except:
       courtesy_row = "N/A"
    #print(courtesy_row)

    row = [ address_row, price_row, active_row, bedrooms_row, bathrooms_row, sqft_row, courtesy_row ]
    df =df.append(pd.Series(row,index=columns),ignore_index=True)

df
#                         address     price          active    bedrooms    bathrooms       sqft                                     courtesy
#0    6920 South US Highway 85-87  $699,000             N/A  5Bedrooms●  4Bathrooms●  3,978Sqft        CourtesyofColdwellBankerResidentialBK
#1               7095 Prado Drive  $414,900  Under Contract  9Bedrooms●  4Bathrooms●  3,000Sqft  CourtesyofKellerWilliamsClientsChoiceRealty
#2          7941 Whistlestop Lane  $399,500             N/A  3Bedrooms●  3Bathrooms●  2,577Sqft           CourtesyofRE/MAXRealEstateGroupInc
#3            7287 Van Wyhe Court  $389,900  Under Contract  4Bedrooms●  3Bathrooms●  2,750Sqft                         CourtesyofPinkRealty
#4   10737 Hidden Prairie Parkway  $369,900  Under Contract  4Bedrooms●  3Bathrooms●  2,761Sqft       CourtesyofKellerWilliamsPartnersRealty
#5            7327 Van Wyhe Court  $362,400             N/A  3Bedrooms●  2Bathrooms●  1,640Sqft                         CourtesyofPinkRealty
#6               7354 Chewy Court  $359,000             N/A  3Bedrooms●  2Bathrooms●  1,680Sqft      CourtesyofRedWhiteAndBlueRealtyGroupInc
#7           238 West Iowa Avenue  $355,000             N/A         N/A  4Bathrooms●  1,440Sqft                        CourtesyofAllenRealty
#8         8181 Wagon Spoke Trail  $350,000  Under Contract  4Bedrooms●  3Bathrooms●  2,848Sqft    CourtesyofKellerWilliamsPremierRealty,LLC
#9                     0 Missouri  $350,000             N/A         N/A          N/A        N/A                 CourtesyofRE/MAXNORTHWESTINC
#10  10817 Hidden Prairie Parkway  $340,000  Under Contract  3Bedrooms●  3Bathrooms●  2,761Sqft       CourtesyofKellerWilliamsPartnersRealty
#11         8244 Campground Drive  $335,000  Under Contract  4Bedrooms●  3Bathrooms●  2,018Sqft                         CourtesyofPinkRealty

I'll try re-scraping an eBay page here soon. If you have another link, leave it in the comments and I'll gladly see whether it can be scraped.

Update:

Tried it on another page, and it works:

import requests
from bs4 import BeautifulSoup
import pandas as pd

URL='https://www.ebay.com/b/Eye-Makeup/172020/bn_1880663' #'https://www.ebay.com/b/Makeup-Products/31786/bn_1865570'
response=requests.get(URL)
soup= BeautifulSoup(response.content, 'html.parser')
columns=['name',"price","rating"]
#Product features
main_table=soup.find('ul',attrs={'class':'b-list__items_nofooter'})
item_boxes=main_table.find_all('div',attrs={'class':'s-item__info clearfix'})
item = item_boxes[0]

df=pd.DataFrame(columns=columns)

for item in item_boxes:

    item_name = item.findAll('h3')
    try:
       item_name_row = item_name[0].text.replace('\n','')
    except:
       item_name = "N/A"

    
    item_price = item.find_all('span',{'class':'s-item__price'})
    try:
       item_price_row = item_price[0].text.replace('\n','')
    except:
       item_price_row = "N/A"      
 
    try:
       item_rating = item.findAll('div',{'class':'s-item__reviews'})[0].div
       item_rating_row = item_rating.text
    except:
       item_rating_row = None

    row = [ item_name_row, item_price_row, item_rating_row ]
    df =df.append(pd.Series(row,index=columns),ignore_index=True)
    df.to_csv('ebay1.csv', index=False)


    if item_rating != None:
      row = [item_name[0].text.replace('\n','') for name in item_name] + [item_price[0].text.replace('\n','') for price in item_price] + [item_rating.text.replace('\n','') for rating in item_rating]
    else: 
      row = [ item_name[0].text.replace('\n','') for name in item_name] + [item_price[0].text.replace('\n','') for price in item_price] + [item_rating]
    df =df.append(pd.Series(row,index=columns),ignore_index=True)

df.to_csv('ebay1.csv', index=False)
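
One more note on the pandas side: DataFrame.append is deprecated in recent pandas releases (and removed in pandas 2.0), and calling to_csv inside the loop rewrites the whole CSV on every iteration. Below is a minimal sketch of the same loop collecting plain dicts and writing the file once at the end, assuming item_boxes has already been built as above:

rows = []
for item in item_boxes:
    item_name = item.find_all('h3')
    item_price = item.find_all('span', {'class': 's-item__price'})
    try:
        # The rating div is missing on some listings, so guard the [0] lookup
        rating_text = item.find_all('div', {'class': 's-item__reviews'})[0].div.text.replace('\n', '')
    except (IndexError, AttributeError):
        rating_text = 'N/A'
    rows.append({
        'name': item_name[0].text.replace('\n', '') if item_name else 'N/A',
        'price': item_price[0].text.replace('\n', '') if item_price else 'N/A',
        'rating': rating_text,
    })

# Build the DataFrame once and write the CSV a single time
df = pd.DataFrame(rows, columns=['name', 'price', 'rating'])
df.to_csv('ebay1.csv', index=False)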


For python - How do I skip an index that is out of range?, we found a similar question on Stack Overflow: https://stackoverflow.com/questions/59013771/

Related articles:

javascript - Downloading JavaScript-loaded audio with Selenium and Python

python - Identifying image files from tables in MS Word

python - How to add generator elements to a set() without a loop?

python - Pandas rename df rows from a list

python - JSON file to Pandas df

python - How to exclude a specific html tag (without any id) from multiple tags when using scrapy?

php - How to get the src of all images inside a ul using xpath

python - How to automate a Django-App_EXE created with pyinstaller and innosetup

python - Problem texturing a cube using glDrawArrays

python - Plotting 2 variables from one column of a dataframe