The script is supposed to find the URLs of the sub-pages that contain the articles and collect the required data from them. The data should go into a database, but I don't know how to make the script extract the content of each article from every page of the blog.
import requests
from bs4 import BeautifulSoup
from nltk.tokenize import RegexpTokenizer
import nltk
import matplotlib.pyplot as plt
import seaborn as sns

url = 'https://xxx/'
r = requests.get(url)
# Extract HTML
html = r.text
# Create a BeautifulSoup object from the HTML
soup = BeautifulSoup(html, "html5lib")
# Get the text
text = soup.get_text()
# Create tokenizer (raw string avoids the invalid-escape warning)
tokenizer = RegexpTokenizer(r'\w+')
# Create tokens
tokens = tokenizer.tokenize(text)
# Initialize new list
words = []
# Loop through tokens and lowercase each one
for word in tokens:
    words.append(word.lower())
# Get English stopwords
sw = nltk.corpus.stopwords.words('english')
# Initialize new list and drop stopwords
words_ns = []
for word in words:
    if word not in sw:
        words_ns.append(word)
# plotting
freqdist1 = nltk.FreqDist(words_ns)
freqdist1.plot(25)
print(soup.get_text())
Best answer
You can do the whole thing with requests and BeautifulSoup. The text extraction code comes from @nmgeek; the same question offers other methods to choose from. You can then presumably handle the text with nltk. The approach is nice because you can play with which selectors you add to the list. You can achieve something similar with a list of selectors passed to select, i.e. [item.text for item in soup.select('selector list goes here')].
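For instance, a minimal sketch of that selector-list idea might look like the following; the HTML and the selectors here are placeholders of my own, not taken from the actual blog markup:

from bs4 import BeautifulSoup

html = "<article><h1 class='post-title'>Title</h1><div class='post-content'><p>Body text</p></div></article>"
soup = BeautifulSoup(html, 'lxml')

# a comma-separated selector string acts as an OR over several selectors;
# the list comprehension collects the text of every matching element
texts = [item.text for item in soup.select('.post-title, .post-content p')]
print(texts)  # ['Title', 'Body text']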
Edit: The code below gets you all the article links, but it seems the site starts blocking you after a while. Have a look at rotating IPs and switching User-Agents while looping over all_links.
If you do have to resort to selenium, at least you already have the list of all article links, which you can loop over with selenium and .get.
import requests
from bs4 import BeautifulSoup as bs

url = 'https://teonite.com/blog/page/{}/index.html'
all_links = []

headers = {
    'Accept' : 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'User-Agent' : 'Mozilla/5.0'
}

with requests.Session() as s:
    r = s.get('https://teonite.com/blog/')
    soup = bs(r.content, 'lxml')
    article_links = ['https://teonite.com' + item['href'][2:] for item in soup.select('.post-content a')]
    all_links.append(article_links)
    num_pages = int(soup.select_one('.page-number').text.split('/')[1])

    for page in range(2, num_pages + 1):
        r = s.get(url.format(page))
        soup = bs(r.content, 'lxml')
        article_links = ['https://teonite.com' + item['href'][2:] for item in soup.select('.post-content a')]
        all_links.append(article_links)

    all_links = [item for i in all_links for item in i]

    for article in all_links:
        #print(article)
        r = s.get(article, headers = headers)
        soup = bs(r.content, 'lxml')
        [t.extract() for t in soup(['style', 'script', '[document]', 'head', 'title'])]
        visible_text = soup.getText() # taken from https://stackoverflow.com/a/19760007/6241235 @nmgeek
        # here I think you need to consider IP rotation/User-Agent changing
        try:
            print(soup.select_one('.post-title').text)
        except:
            print(article)
            print(soup.select_one('h1').text)
            break
        # do something with text
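As the comment in the loop above notes, rotating the User-Agent header (and ideally the IP) may help with the blocking. A minimal sketch of User-Agent rotation is below; the User-Agent strings are examples of my own and may not be enough on their own if the site blocks by IP:

import random
import requests

# an example pool of desktop User-Agent strings (assumption: any realistic set will do)
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Safari/605.1.15',
    'Mozilla/5.0 (X11; Linux x86_64; rv:66.0) Gecko/20100101 Firefox/66.0',
]

def fetch_with_random_ua(session, link):
    # GET a single article URL with a randomly chosen User-Agent header
    return session.get(link, headers={'User-Agent': random.choice(USER_AGENTS)})

# usage inside the article loop above:
# with requests.Session() as s:
#     for article in all_links:
#         r = fetch_with_random_ua(s, article)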
Adding selenium does seem to resolve the problem of requests being blocked:
import requests
from bs4 import BeautifulSoup as bs
from selenium import webdriver

url = 'https://teonite.com/blog/page/{}/index.html'
all_links = []

with requests.Session() as s:
    r = s.get('https://teonite.com/blog/')
    soup = bs(r.content, 'lxml')
    article_links = ['https://teonite.com' + item['href'][2:] for item in soup.select('.post-content a')]
    all_links.append(article_links)
    num_pages = int(soup.select_one('.page-number').text.split('/')[1])

    for page in range(2, num_pages + 1):
        r = s.get(url.format(page))
        soup = bs(r.content, 'lxml')
        article_links = ['https://teonite.com' + item['href'][2:] for item in soup.select('.post-content a')]
        all_links.append(article_links)

all_links = [item for i in all_links for item in i]

d = webdriver.Chrome()

for article in all_links:
    d.get(article)
    soup = bs(d.page_source, 'lxml')
    [t.extract() for t in soup(['style', 'script', '[document]', 'head', 'title'])]
    visible_text = soup.getText() # taken from https://stackoverflow.com/a/19760007/6241235 @nmgeek
    try:
        print(soup.select_one('.post-title').text)
    except:
        print(article)
        print(soup.select_one('h1').text)
        break #for debugging
    # do something with text

d.quit()
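Since the question mentions that the data should end up in a database, a minimal sketch of persisting each article into SQLite is shown below; the table and column names (articles, url, title, body) are my own assumptions, so swap in whatever schema you actually need:

import sqlite3

# assumed schema: one row per article (url, title, body)
conn = sqlite3.connect('articles.db')
conn.execute('CREATE TABLE IF NOT EXISTS articles (url TEXT PRIMARY KEY, title TEXT, body TEXT)')

def save_article(url, title, body):
    # insert or update one scraped article
    with conn:
        conn.execute(
            'INSERT OR REPLACE INTO articles (url, title, body) VALUES (?, ?, ?)',
            (url, title, body),
        )

# inside either loop above, instead of just printing:
# save_article(article, soup.select_one('.post-title').text, visible_text)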
Regarding "python - find the URLs of sub-pages containing articles and collect data from them", a similar question can be found on Stack Overflow: https://stackoverflow.com/questions/56330793/