python - 如何将类方法的输出获取到全局变量中

标签 python web-scraping pyqt global local

我正在尝试从一个使用 JavaScript 的网页获取一些数据。我的代码会生成多个链接并逐一解析它们,解析的输出是列表。我在 here 的帮助下编写了这段代码,但它是在一个类的内部生成列表的。我想将列表项插入到 sqlite 表中,因此想把这个局部列表变成全局的。我曾尝试创建一个全局列表,把它放入类中,然后向它追加内容并返回它;也曾尝试通过 processCurrentPage 方法将它们直接插入数据库;还尝试过在该类下创建一个列表并通过 Webpage.list 访问它。但是这些方法都不起作用。下面是我的尝试之一,它不是最好的——只是一个例子。我已经尝试过很多类似的做法。你能建议一个好的处理方法吗?

P.S: 我是 Python 新手,研究了整整两天,阅读了所有类文档,但找不到方法。

import sys
from PyQt5 import QtCore, QtWidgets, QtWebEngineWidgets
import requests
from bs4 import BeautifulSoup
import bs4 as bs

class WebPage(QtWebEngineWidgets.QWebEnginePage):
    """Web-engine page that loads a series of URLs in turn and collects text.

    Each time a load finishes, the page HTML is requested and handed to
    processCurrentPage, which adds the extracted lines to ``alldatas`` and
    then moves on to the next URL.
    """
    # Class-level list. The ``+=`` in processCurrentPage extends this same
    # list object in place, so it is shared rather than per-instance.
    alldatas=[]
    def __init__(self):
        """Connect the loadFinished signal to handleLoadFinished."""
        super(WebPage, self).__init__()
        self.loadFinished.connect(self.handleLoadFinished)

    def start(self, urls):
        """Store an iterator over *urls* and request the first one."""
        self._urls = iter(urls)
        # fetchNext is declared as a property below, so this bare attribute
        # access (no call parentheses) is what triggers the first load.
        self.fetchNext

    @property
    def fetchNext(self):
        """Load the next URL from the stored iterator.

        Evaluates to True when a load was requested and to False when the
        iterator is exhausted. Note that reading this property has the side
        effect of starting a page load.
        """
        try:
            url = next(self._urls)
        except StopIteration:
            return False
        else:
            self.load(QtCore.QUrl(url))
        return True
    def processCurrentPage(self, html):
        """Callback given to toHtml: extract lines from *html*, then continue.

        Takes the text of the first ``div`` with class ``tablo_dual_board``,
        splits it into lines and adds them to ``alldatas``. Afterwards the
        next URL is requested; when none is left the application is asked
        to quit.
        """
        # NOTE: url is assigned here but not used anywhere below.
        url = self.url().toString()
        # do stuff with html...
        soup = bs.BeautifulSoup(html, 'html.parser')
        data = soup.find('div', class_='tablo_dual_board')
        # NOTE(review): there is no check for a missing div; if find() yields
        # None the next line raises AttributeError - confirm.
        data1 = data.text
        data2 = data1.splitlines()
        self.alldatas+=data2

        # Reading the property both tests for and starts the next load.
        if not self.fetchNext:
            QtWidgets.qApp.quit()


    def handleLoadFinished(self):
        """Slot for loadFinished: request the HTML and pass it to processCurrentPage."""
        self.toHtml(self.processCurrentPage)

    def javaScriptConsoleMessage(self, QWebEnginePage_JavaScriptConsoleMessageLevel, p_str, p_int, p_str_1):
        """Override that ignores its arguments and does nothing."""
        # disable javascript error output
        pass

if __name__ == '__main__':

    # generate some test urls

    # Download the listing page and parse it with BeautifulSoup.
    onexurl = "https://1xbahis1.com/en/live/Football/"
    r = requests.get(onexurl)
    soup = BeautifulSoup(r.content, "html.parser")
    # NOTE: income is assigned but never used below.
    income = soup.find_all("ul", {"id":"games_content"})
    links = soup.find_all("a", {"class": "c-events__name"})

    # Build one absolute URL per matching anchor from its href attribute.
    urls = []
    for matchlink in links:
        urls.append("https://1xbahis1.com/en/"+(matchlink.get("href")))


    app = QtWidgets.QApplication(sys.argv)
    webpage = WebPage()
    webpage.start(urls)
    # NOTE(review): this print runs before app.exec_() starts the event loop,
    # so presumably no page has been processed yet and the list printed here
    # is still empty - confirm.
    print(webpage.alldatas)
    sys.exit(app.exec_())

最佳答案

下面是您脚本的一个修改版本,可以实现您想要的效果。每处理完一个 url,都会调用一次 scrape_page 函数,它会把数据添加到全局的 records 列表中。在抓取完所有页面后,process_records 函数会被调用一次。您可以使用该函数将记录添加到您的数据库中。

import sys
import requests
from bs4 import BeautifulSoup
from PyQt5 import QtCore, QtWidgets, QtWebEngineWidgets

# Module-level accumulator: scrape_page appends one list of text lines for
# each page in which the board element is found.
records = []

def scrape_page(url, html):
    """Pull the ``tablo_dual_board`` text out of one page and store it.

    Args:
        url: Address of the page; only used for the progress message.
        html: Markup of the page as a string.

    When the page contains a ``div`` with class ``tablo_dual_board``, its
    text is split into lines and appended to the global ``records`` list as
    a single entry. Otherwise an error message is printed and nothing is
    stored.
    """
    print('scrape page:', url)
    board = BeautifulSoup(html, 'html.parser').find(
        'div', class_='tablo_dual_board')
    if board is None:
        print('error: could not find tablo_dual_board')
        return
    lines = board.text.splitlines()
    records.append(lines)

def process_records():
    """Handle the collected records; currently only reports how many exist.

    This is the place to write the entries of the global ``records`` list
    to a database.
    """
    # add record to database ...
    total = len(records)
    print('process records:', total)

def generate_urls():
    """Return the absolute URLs of the matches linked from the listing page.

    Downloads the live football listing, collects every ``a`` element with
    class ``c-events__name`` and prefixes its ``href`` with the site's
    ``/en/`` base address.

    Returns:
        A list of URL strings, empty when no matching link is found.
    """
    onexurl = "https://1xbahis1.com/en/live/Football/"
    reply = requests.get(onexurl)
    soup = BeautifulSoup(reply.content, "html.parser")
    links = soup.find_all("a", {"class": "c-events__name"})
    hrefs = (matchlink.get("href") for matchlink in links)
    # An anchor without an href attribute yields None, which cannot be
    # concatenated to the base address, so such anchors are skipped.
    return ["https://1xbahis1.com/en/" + href
            for href in hrefs if href is not None]

class WebPage(QtWebEngineWidgets.QWebEnginePage):
    """Page that loads a sequence of URLs one after another and scrapes each.

    Every finished load hands the page HTML to ``scrape_page``. Once the
    URL iterator is exhausted, ``process_records`` is called once and the
    application is asked to quit.
    """

    def __init__(self):
        """Connect ``loadFinished`` so every completed load gets processed."""
        super(WebPage, self).__init__()
        self.loadFinished.connect(self.handleLoadFinished)

    def start(self, urls):
        """Begin loading *urls* (any iterable of URL strings) one at a time."""
        self._urls = iter(urls)
        if not self.fetchNext():
            # Nothing to load: loadFinished will never fire, so without this
            # the records would never be processed and the application would
            # never quit. The call is deferred through the event loop
            # because start() may run before the loop has been started.
            QtCore.QTimer.singleShot(0, self._finish)

    def fetchNext(self):
        """Load the next URL.

        Returns:
            True if a load was requested, False if no URL is left.
        """
        try:
            url = next(self._urls)
        except StopIteration:
            return False
        self.load(QtCore.QUrl(url))
        return True

    def processCurrentPage(self, html):
        """Callback for ``toHtml``: scrape *html*, then continue or finish."""
        scrape_page(self.url().toString(), html)
        if not self.fetchNext():
            self._finish()

    def _finish(self):
        """Process the collected records once and ask the application to quit."""
        process_records()
        QtWidgets.qApp.quit()

    def handleLoadFinished(self):
        """Slot for ``loadFinished``: request the HTML of the loaded page."""
        self.toHtml(self.processCurrentPage)

    def javaScriptConsoleMessage(self, QWebEnginePage_JavaScriptConsoleMessageLevel, p_str, p_int, p_str_1):
        """Override that ignores its arguments and does nothing."""
        # disable javascript error output
        pass

if __name__ == '__main__':
    # Create the application, then the page, start the scrape and hand
    # control to the event loop; its return value becomes the exit status.
    application = QtWidgets.QApplication(sys.argv)
    page = WebPage()
    page.start(generate_urls())
    exit_status = application.exec_()
    sys.exit(exit_status)

关于python - 如何将类方法的输出获取到全局变量中,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/47441374/

相关文章:

python - python 中的 selenium webdriver。有第三方库吗?

python - 使用 Selenium 进行网页抓取

python - 从动态页面检索所有汽车链接

javascript - 用 R + V8 抓取内华达州议会/参议院网站

python - 如何使 QPushButton 成为加载按钮?

python - 为 skimage 中的 regionprops 添加额外的属性

python - for语句(循环计算优先级)*不确定这是不是正确的术语

Python 拒绝多次遍历文件中的行

python - 在 python 中编写模块化代码

Qt 绘图应用程序