我是python和一般编程的入门者。
我在python中有一个代码,用于将特定网站中的数据抓取到csv数据中。总的来说,它对我来说很好。我通常让它运行一整夜,这样网站响应速度更快,更稳定。
问题是:有时我自己的连接失败或网站中发生了一些不稳定情况,并且脚本返回了错误,这使我失去了很多时间。
我想使用一些错误处理方法来改进代码,以便它可以继续检查insternet连接是否正常工作,并在工作时移至下一个链接,而不是崩溃。你们当中有人知道如何实现吗?
这是我的python代码:
#!-*- coding: utf8 -*-
from requests.packages.urllib3.exceptions import InsecureRequestWarning
from bs4 import BeautifulSoup
from tqdm import tqdm
import datetime
import requests
import pandas
import os
class SigefRequests:
"""Class responsible for accessing, extracting and parsing sigef
information into a csv file.
The output file will be at ./data/outputs
"""
def __init__(self, path):
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
self.url_list = self.reading_url_file(path)
self.headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:54.0) "
"Gecko/20100101 Firefox/54.0",
"Connection": "close",
"Accept-Language": "en-US,en;q=0.5",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,/"
";q=0.8",
"Upgrade-Insecure-Requests": "1"
}
self.session = requests.session()
self.data = {
'código': [],
'denominação': [],
'área': [],
'data de entrada': [],
'situação': [],
'responsável técnico': [],
'ART': [],
'envio': [],
'requerimento': [],
'status': [],
'data': [],
'nome': [],
'cpf/cnpj': [],
'situação - georreferência': [],
'natureza': [],
'número de parcelas': [],
'municípios': [],
'código do imóvel': [],
'shp - polígono': [],
'shp - vértices': [],
'shp - limites': [],
'kml - polígono': [],
'kml - vértices': [],
'kml - limites': [],
'csv - polígono': [],
'csv - vértices': [],
'csv - limites': [],
}
self.export_list = [
"https://sigef.incra.gov.br/geo/exportar/parcela/shp/{}",
"https://sigef.incra.gov.br/geo/exportar/vertice/shp/{}",
"https://sigef.incra.gov.br/geo/exportar/limite/shp/{}",
"https://sigef.incra.gov.br/geo/exportar/parcela/kml/{}",
"https://sigef.incra.gov.br/geo/exportar/vertice/kml/{}",
"https://sigef.incra.gov.br/geo/exportar/limite/kml/{}",
"https://sigef.incra.gov.br/geo/exportar/parcela/csv/{}",
"https://sigef.incra.gov.br/geo/exportar/vertice/csv/{}",
"https://sigef.incra.gov.br/geo/exportar/limite/csv/{}"
]
# Used in __init__
@staticmethod
def reading_url_file(path):
"""This function reads the links.txt file and return a links list.
Parameters
----------
path : str
The path to links.txt file.
(By default this file is in data folder).
Returns
-------
url_list : iterator
The links list.
"""
return open(
os.path.abspath('../' + path)
).readlines()
# Used in __call__
def requesting(self, url):
"""This function makes a GET requisition into the given sigef url.
Parameters
----------
url : str
Sigef's URL.
Returns
-------
response : requests.models.Response
The GET Requisition response.
"""
return self.session.get(url, verify=False, headers=self.headers)
# Used in __call__
@staticmethod
def soup(html):
"""This function parses the html.
Parameters
----------
html : requests.models.Response
Unparsed html.
Returns
-------
parsed_html : bs4.BeautifulSoup
Parsed html.
"""
return BeautifulSoup(html.content, 'html5lib')
# Used in __call__
def filtering_content(self, html):
"""This function filters the page content and looks for the relevant
data.
Parameters
----------
html : bs4.BeautifulSoup
Parsed html.
Returns
-------
"""
tables = html.find_all('table', {
'class': 'table table-hover tabela-atributos'
})
tables_ = [tables[0], tables[1], tables[2], tables[-1]]
content_list = []
for table in tables_:
for row in table.find_all('td'):
content_list.append((row.text.strip()))
content_list.pop(content_list.index('Envio'))
if 'Nenhum requerimento' in content_list:
content_list.insert(9, '-')
content_list.insert(9, '-')
names = []
for row in tables[3].find_all('th'):
names.append(row.text)
table_3_content = []
for row in tables[3].find_all('td'):
table_3_content.append(row.text.strip())
content_list.append(table_3_content[1])
content_list.append(table_3_content[2])
content_list.append(table_3_content[names.index('Número parcelas')])
content_list.append(table_3_content[-1])
try:
content_list.append(table_3_content[names.index(
'Código do Imóvel (SNCR/INCRA)')])
except ValueError:
content_list.append('-')
for elem in self.export_list:
content_list.append(elem.format(content_list[0]))
for elem in content_list:
if u'\u2013' in elem:
content_list[content_list.index(elem)] = \
elem.replace(u'\u2013', '-')
for key, value in zip(self.data.keys(), content_list):
self.data.get(key).append(value)
self.parsing_to_csv()
# Used in filtering_content
def parsing_to_csv(self):
"""This function parses the acquired data into a csv file.
Returns
-------
"""
pandas.DataFrame(self.data).set_index('código').to_csv(os.path.abspath(
'../data/outputs/sigef-{}.csv'.format(datetime.date.today())),
encoding='latin-1', sep=';'
)
def __call__(self, *args, **kwargs):
for url in tqdm(self.url_list):
self.filtering_content(self.soup(self.requesting(url)))
if __name__ == '__main__':
SigefRequests(r'data\links.txt').__call__()
这是当它停止工作时出现的错误的示例:(env) D:\Documentos\LAGESA\Programas\Scraper\up3\sigef-crawler\src>python crawler.py
12%|█████████▎ | 543/4493 [1:59:07<14:26:33, 13.16s/it]
Traceback (most recent call last):
File "D:\Documentos\LAGESA\Programas\Scraper\env\lib\site-packages\urllib3\connection.py", line 159, in _new_conn
conn = connection.create_connection(
File "D:\Documentos\LAGESA\Programas\Scraper\env\lib\site-packages\urllib3\util\connection.py", line 61, in create_connection
for res in socket.getaddrinfo(host, port, family, socket.SOCK_STREAM):
File "C:\Users\joaop\AppData\Local\Programs\Python\Python38\lib\socket.py", line 918, in getaddrinfo
for res in _socket.getaddrinfo(host, port, family, type, proto, flags):
socket.gaierror: [Errno 11001] getaddrinfo failed
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "D:\Documentos\LAGESA\Programas\Scraper\env\lib\site-packages\urllib3\connectionpool.py", line 670, in urlopen
httplib_response = self._make_request(
File "D:\Documentos\LAGESA\Programas\Scraper\env\lib\site-packages\urllib3\connectionpool.py", line 381, in _make_request
self._validate_conn(conn)
File "D:\Documentos\LAGESA\Programas\Scraper\env\lib\site-packages\urllib3\connectionpool.py", line 976, in _validate_conn
conn.connect()
File "D:\Documentos\LAGESA\Programas\Scraper\env\lib\site-packages\urllib3\connection.py", line 308, in connect
conn = self._new_conn()
File "D:\Documentos\LAGESA\Programas\Scraper\env\lib\site-packages\urllib3\connection.py", line 171, in _new_conn
raise NewConnectionError(
urllib3.exceptions.NewConnectionError: <urllib3.connection.HTTPSConnection object at 0x00000295683D0520>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "D:\Documentos\LAGESA\Programas\Scraper\env\lib\site-packages\requests\adapters.py", line 439, in send
resp = conn.urlopen(
File "D:\Documentos\LAGESA\Programas\Scraper\env\lib\site-packages\urllib3\connectionpool.py", line 724, in urlopen
retries = retries.increment(
File "D:\Documentos\LAGESA\Programas\Scraper\env\lib\site-packages\urllib3\util\retry.py", line 439, in increment
raise MaxRetryError(_pool, url, error or ResponseError(cause))
urllib3.exceptions.MaxRetryError: HTTPSConnectionPool(host='sigef.incra.gov.br', port=443): Max retries exceeded with url: /geo/parcela/detalhe/a7144e88-f458-4c25-b275-64b24284fac0/%0A (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x00000295683D0520>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "crawler.py", line 212, in <module>
SigefRequests(r'data\links.txt').__call__()
File "crawler.py", line 208, in __call__
self.filtering_content(self.soup(self.requesting(url)))
File "crawler.py", line 110, in requesting
return self.session.get(url, verify=False, headers=self.headers)
File "D:\Documentos\LAGESA\Programas\Scraper\env\lib\site-packages\requests\sessions.py", line 543, in get
return self.request('GET', url, **kwargs)
File "D:\Documentos\LAGESA\Programas\Scraper\env\lib\site-packages\requests\sessions.py", line 530, in request
resp = self.send(prep, **send_kwargs)
File "D:\Documentos\LAGESA\Programas\Scraper\env\lib\site-packages\requests\sessions.py", line 643, in send
r = adapter.send(request, **kwargs)
File "D:\Documentos\LAGESA\Programas\Scraper\env\lib\site-packages\requests\adapters.py", line 516, in send
raise ConnectionError(e, request=request)
requests.exceptions.ConnectionError: HTTPSConnectionPool(host='sigef.incra.gov.br', port=443): Max retries exceeded with url: /geo/parcela/detalhe/a7144e88-f458-4c25-b275-64b24284fac0/%0A (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x00000295683D0520>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
谢谢您的帮助!
最佳答案
嘿乔奥在python中,如果程序出现特定错误,您可以使用try try语句继续运行。
继承人和例子。
string = "string"
try:
print(int(string))
except ValueError:
print("it didn't work")
没有尝试,除了你得到Traceback (most recent call last):
File "C:\Users\jojop\OneDrive\Desktop\python.py", line 4, in <module>
print(int(string))
ValueError: invalid literal for int() with base 10: 'string'
该消息将为您提供可以在这种情况下使用的错误“ValueError”
关于python - 在python网络抓取脚本中为错误创建异常,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/63302113/