python - Find the URLs of sub-pages containing articles and collect data from them

Tags: python web-scraping beautifulsoup

The script should find the addresses of the sub-pages that contain articles and collect the required data from them. The data should end up in a database, but I don't know how to make the script extract the content of every article from every page of the blog.

import requests
from bs4 import BeautifulSoup
from nltk.tokenize import RegexpTokenizer
import nltk
import matplotlib.pyplot as plt
import seaborn as sns

url = 'https://xxx/'

r = requests.get(url)
# Extract HTML
html = r.text
# Create a BeautifulSoup object from the HTML
soup = BeautifulSoup(html, "html5lib")

# Get the text

text = soup.get_text()
# Create tokenizer
tokenizer = RegexpTokenizer(r'\w+')

# Create tokens
tokens = tokenizer.tokenize(text)

# Initialize new list
words = []

# Loop through list

for word in tokens:
    words.append(word.lower())

# Get English stopwords (requires a one-time nltk.download('stopwords'))
sw = nltk.corpus.stopwords.words('english')

# Initialize new list
words_ns = []

for word in words:
    if word not in sw:
        words_ns.append(word)

# plotting
freqdist1 = nltk.FreqDist(words_ns)
freqdist1.plot(25)

print(soup.get_text())

Best Answer

You can do the entire thing with beautifulsoup, as requested. The text extraction code is from @nmgeek; the same question there offers other methods to choose from. You can then presumably process the text with nltk. The method is nice because you can play with which selectors you add to the list. You can achieve something similar with a selector list passed to select, i.e. [item.text for item in soup.select('selector list goes here')].
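For example, a minimal sketch of that selector-list idea (the HTML fragment and class names below are made-up placeholders, not taken from the actual blog):

from bs4 import BeautifulSoup

html = '''
<article>
  <h1 class="post-title">Title</h1>
  <div class="post-content"><p>First paragraph.</p><p>Second paragraph.</p></div>
</article>
'''
soup = BeautifulSoup(html, 'html5lib')

# A comma-separated selector list collects several element types in one pass
parts = [item.text for item in soup.select('.post-title, .post-content p')]
text = ' '.join(parts)
print(text)   # Title First paragraph. Second paragraph.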

Edit: The code below gets you all the links, but it seems the site blocks you after a while. Have a look at rotating IPs and/or User-Agents inside the loop over all_links.
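A minimal sketch of rotating the User-Agent inside that loop (the header strings, the delay values and the placeholder link are my own illustrative choices, not part of the original answer):

import random
import time
from itertools import cycle
import requests
from bs4 import BeautifulSoup as bs

# Placeholder; in practice use the all_links list built in the code below
all_links = ['https://teonite.com/blog/example-post/']

# Cycle through a small pool of User-Agent strings
user_agents = cycle([
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
    'Mozilla/5.0 (X11; Linux x86_64)',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)',
])

with requests.Session() as s:
    for article in all_links:
        r = s.get(article, headers={'User-Agent': next(user_agents)})
        soup = bs(r.content, 'lxml')
        # ... extract the article text as in the code below ...
        time.sleep(random.uniform(1, 3))   # a polite delay also reduces the chance of blocks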

If you have to resort to selenium, at least you already have the list of all the article links, which you can loop over with selenium and .get.

import requests
from bs4 import BeautifulSoup as bs

url = 'https://teonite.com/blog/page/{}/index.html'
all_links = []

headers = {
    'Accept' : 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'User-Agent' : 'Mozilla/5.0'
}
with requests.Session() as s:
    r = s.get('https://teonite.com/blog/')
    soup = bs(r.content, 'lxml')
    article_links = ['https://teonite.com' + item['href'][2:] for item in soup.select('.post-content a')]
    all_links.append(article_links)
    num_pages = int(soup.select_one('.page-number').text.split('/')[1])

    for page in range(2, num_pages + 1):
        r = s.get(url.format(page))
        soup = bs(r.content, 'lxml')
        article_links = ['https://teonite.com' + item['href'][2:] for item in soup.select('.post-content a')]
        all_links.append(article_links)

    all_links = [item for i in all_links for item in i]

    for article in all_links:
        #print(article)
        r = s.get(article, headers = headers)
        soup = bs(r.content, 'lxml')
        [t.extract() for t in soup(['style', 'script', '[document]', 'head', 'title'])]
        visible_text = soup.getText()   # taken from https://stackoverflow.com/a/19760007/6241235 @nmgeek
        # here I think you need to consider IP rotation/User-Agent changing
        try:
            print(soup.select_one('.post-title').text)
        except:
            print(article)
            print(soup.select_one('h1').text)
            break
        # do something with text

Adding selenium does seem to solve the problem of requests being blocked:

import requests
from bs4 import BeautifulSoup as bs
from selenium import webdriver

url = 'https://teonite.com/blog/page/{}/index.html'
all_links = []

with requests.Session() as s:
    r = s.get('https://teonite.com/blog/')
    soup = bs(r.content, 'lxml')
    article_links = ['https://teonite.com' + item['href'][2:] for item in soup.select('.post-content a')]
    all_links.append(article_links)
    num_pages = int(soup.select_one('.page-number').text.split('/')[1])

    for page in range(2, num_pages + 1):
        r = s.get(url.format(page))
        soup = bs(r.content, 'lxml')
        article_links = ['https://teonite.com' + item['href'][2:] for item in soup.select('.post-content a')]
        all_links.append(article_links)

all_links = [item for i in all_links for item in i]

d = webdriver.Chrome()

for article in all_links:
    d.get(article)
    soup = bs(d.page_source, 'lxml')
    [t.extract() for t in soup(['style', 'script', '[document]', 'head', 'title'])]
    visible_text = soup.getText()   # taken from https://stackoverflow.com/a/19760007/6241235 @nmgeek

    try:
        print(soup.select_one('.post-title').text)
    except:
        print(article)
        print(soup.select_one('h1').text)
        break #for debugging
    # do something with text
d.quit()
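Since the question asks for the data to end up in a database, here is a minimal sketch of persisting each article with sqlite3 (the articles.db file, the table schema and the save_article helper are my own assumptions, not part of the original answer); it would slot in where the "# do something with text" comments are:

import sqlite3

conn = sqlite3.connect('articles.db')
conn.execute(
    'CREATE TABLE IF NOT EXISTS articles (url TEXT PRIMARY KEY, title TEXT, body TEXT)'
)

def save_article(url, title, visible_text):
    # INSERT OR REPLACE keeps the script re-runnable without duplicate rows
    conn.execute(
        'INSERT OR REPLACE INTO articles (url, title, body) VALUES (?, ?, ?)',
        (url, title, visible_text),
    )
    conn.commit()

# Inside either article loop, replacing "# do something with text":
# save_article(article, soup.select_one('.post-title').text, visible_text)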

Regarding "python - Find the URLs of sub-pages containing articles and collect data from them", we found a similar question on Stack Overflow: https://stackoverflow.com/questions/56330793/
