python - PyQt5加载网页内容时返回None值

标签 python pyqt5

我正在尝试获取网页中某一部分的内容,该部分的数据由 JavaScript 动态加载。我在这里找到了一些代码并对其做了修改,但运行脚本时得到的结果是 None

这是代码

import bs4 as bs
import sys
import urllib.request
from PyQt5.QtWebEngineWidgets import QWebEnginePage
from PyQt5.QtWidgets import QApplication
from PyQt5.QtCore import QUrl
from pprint import pprint

class Page(QWebEnginePage):
    """Load a URL in a QWebEnginePage and store the page's HTML in ``html``.

    Constructing the object blocks: it creates the QApplication, starts the
    load and runs the event loop until the HTML has been delivered to
    ``Callable``, which quits the loop.

    Note that the HTML is requested as soon as ``loadFinished`` fires, so
    elements that scripts add to the DOM later will not be part of it.
    """

    def __init__(self, url):
        """Start loading *url* and block until its HTML has been captured."""
        # The application object is created first and kept on the instance
        # so that Callable() can stop its event loop later.
        self.app = QApplication(sys.argv)
        QWebEnginePage.__init__(self)
        self.html = ''
        self.loadFinished.connect(self._on_load_finished)
        self.load(QUrl(url))
        self.app.exec_()

    def _on_load_finished(self):
        """Request the page HTML once the load has finished."""
        # toHtml() is asynchronous: it returns None and hands the markup to
        # the callback. Its return value must not be stored in self.html,
        # otherwise the attribute is None until the callback runs.
        self.toHtml(self.Callable)
        print('Load finished')

    def Callable(self, html_str):
        """Receive the HTML from toHtml(), store it and stop the event loop."""
        self.html = html_str
        self.app.quit()

def main():
    """Fetch the fix-central page and pretty-print its data table element."""
    url = 'https://www.ibm.com/support/fixcentral/swg/selectFixes?parent=IBM%20Security&product=ibm/Information+Management/InfoSphere+Guardium&release=10.0&platform=Linux&function=all'
    rendered = Page(url)
    parsed = bs.BeautifulSoup(rendered.html, 'html.parser')
    # find() returns None when no <table id="DataTables_Table_0"> is present.
    table = parsed.find('table', {'id': 'DataTables_Table_0'})
    pprint(table)


if __name__ == '__main__':
    main()

这是输出

Load finished
None

最佳答案

loadFinished 信号仅表示页面本身已加载完成,但在此之后仍可能创建更多 DOM 元素。ID 为“DataTables_Table_0”的元素正是这种情况:它是在页面加载完成后稍晚才被创建的。

一个可能的解决方案是注入(inject)一个脚本来检查该元素是否存在,并发出通知以便获取 HTML。

import sys
from functools import cached_property

from PyQt5 import QtCore, QtWidgets, QtWebEngineWidgets, QtWebChannel

from pprint import pprint
import bs4 as bs


def get_webchannel_source():
    """Return the text of the ``:/qtwebchannel/qwebchannel.js`` Qt resource.

    An empty string is returned when the resource cannot be opened for
    reading.
    """
    resource = QtCore.QFile(":/qtwebchannel/qwebchannel.js")
    opened = resource.open(QtCore.QIODevice.ReadOnly)
    if opened:
        raw = resource.readAll()
        resource.close()
        # readAll() yields a QByteArray; data() exposes its bytes for decoding.
        return raw.data().decode()
    return ""


class Manager(QtCore.QObject):
    """Load a page in a QWebEngineView and capture its HTML on request.

    The object registers itself on a QWebChannel under the name
    ``"manager"``. A script installed with :meth:`set_script` can therefore
    call ``manager.save_html()`` from inside the page; that stores the
    current HTML in :attr:`html` and stops the event loop started by
    :meth:`start`.

    Args:
        offline: When true, use a freshly constructed ``QWebEngineProfile``;
            otherwise use ``QWebEngineProfile.defaultProfile()``.
        visible: When false, the view is flagged ``WA_DontShowOnScreen``
            before it is shown.
        parent: Optional parent ``QObject``.
    """

    def __init__(self, *, offline=True, visible=False, parent=None):
        super().__init__(parent)
        self._html = ""
        self._is_finished = False
        # Touch the cached property so the application object exists before
        # any profile, view or page is created below.
        self.app
        self._profile = (
            QtWebEngineWidgets.QWebEngineProfile()
            if offline
            else QtWebEngineWidgets.QWebEngineProfile.defaultProfile()
        )
        self.view.resize(640, 480)
        if not visible:
            self.view.setAttribute(QtCore.Qt.WA_DontShowOnScreen, True)
        self.view.show()
        # Expose this object to page scripts as ``channel.objects.manager``.
        self.webchannel.registerObject("manager", self)
        self.view.page().setWebChannel(self.webchannel)

    @cached_property
    def app(self):
        """Return the application object, creating it only if none exists.

        Reusing ``QApplication.instance()`` lets several managers coexist
        and lets the class be used inside an already running Qt program,
        where constructing a second ``QApplication`` would fail.
        """
        existing = QtWidgets.QApplication.instance()
        if existing is not None:
            return existing
        return QtWidgets.QApplication(sys.argv)

    @property
    def profile(self):
        """The ``QWebEngineProfile`` used by the page and by injected scripts."""
        return self._profile

    @cached_property
    def view(self):
        """The ``QWebEngineView``, with a page bound to :attr:`profile`."""
        view = QtWebEngineWidgets.QWebEngineView()
        page = QtWebEngineWidgets.QWebEnginePage(self.profile, self)
        view.setPage(page)
        return view

    @cached_property
    def webchannel(self):
        """The ``QWebChannel`` through which page scripts reach this object."""
        return QtWebChannel.QWebChannel(self)

    @property
    def html(self):
        """HTML captured by the last :meth:`save_html` call ("" until then)."""
        return self._html

    def set_script(self, script):
        """Install *script* on the profile, prefixed with ``qwebchannel.js``.

        The combined source is injected at ``DocumentReady`` into the main
        world, so *script* can use ``QWebChannel`` directly.
        """
        qscript = QtWebEngineWidgets.QWebEngineScript()
        qscript.setName("qscript")
        qscript.setSourceCode(get_webchannel_source() + "\n" + script)
        qscript.setInjectionPoint(QtWebEngineWidgets.QWebEngineScript.DocumentReady)
        qscript.setWorldId(QtWebEngineWidgets.QWebEngineScript.MainWorld)
        self.profile.scripts().insert(qscript)

    def start(self, url):
        """Load *url* and block in the event loop until the HTML is captured.

        The capture state is reset first so the manager can be started more
        than once; without the reset a second run would ignore
        ``save_html()`` and never leave the event loop.
        """
        self._html = ""
        self._is_finished = False
        self.view.load(QtCore.QUrl.fromUserInput(url))
        self.app.exec_()

    @QtCore.pyqtSlot()
    def save_html(self):
        """Slot called from the page: request the current HTML once per run."""
        if self._is_finished:
            return
        # Set the flag before the asynchronous request so that repeated calls
        # from the page cannot queue a second capture.
        self._is_finished = True
        self.view.page().toHtml(self.html_callable)

    def html_callable(self, html):
        """Receive the HTML from ``toHtml()``, store it and quit the loop."""
        self._html = html
        self.app.quit()


# JavaScript injected into the page (see Manager.set_script, which prepends
# qwebchannel.js so that QWebChannel is defined).
#
# wait_qt() polls every 100 ms until the global `qt` object exists, then opens
# a QWebChannel over qt.webChannelTransport and stores the exported "manager"
# object. find_element() then polls every 100 ms until an element with id
# 'DataTables_Table_0' is present, and calls manager.save_html() once it is.
JS = """
var manager = null;

function find_element() {
  var e = document.getElementById('DataTables_Table_0');
  console.log("try verify", e, manager);
  if (e != null && manager != null) {
    console.log(e)
    manager.save_html()
  } else {
    setTimeout(find_element, 100);
  }
}

(function wait_qt() {
  if (typeof qt != 'undefined') {
    console.log("Qt loaded");
    new QWebChannel(qt.webChannelTransport, function (channel) {
      manager = channel.objects.manager;
      find_element();
    });
  } else {
    setTimeout(wait_qt, 100);
  }
})();
"""


def main():
    """Render the fix-central page and pretty-print its data table element."""
    target_url = "https://www.ibm.com/support/fixcentral/swg/selectFixes?parent=IBM%20Security&product=ibm/Information+Management/InfoSphere+Guardium&release=10.0&platform=Linux&function=all"

    scraper = Manager()
    scraper.set_script(JS)
    # Blocks until the injected script reports that the table exists.
    scraper.start(target_url)

    document = bs.BeautifulSoup(scraper.html, "html.parser")
    table = document.find("table", {"id": "DataTables_Table_0"})
    pprint(table)


if __name__ == "__main__":
    main()

关于python - PyQt5加载网页内容时返回None值,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/63944827/

相关文章:

python - 如何绘制 1000 像素的图?

python - 如何修复 "No module named 'pyarrow._plasma'"?

python - 绘制实时传感器数据时,PyQtGraph 停止更新并卡住

python - 为什么使用setWindowFlags后子对话框无法显示?

python - QLineEdit 中的 title() 问题?

python - matplotlib rc colorcycle参数与散点图

python - 对每一行的 pandas 列进行排序

python - 使用 Python Nmap 在网络中连接移动信息

python - PyQt:如何将我的脚本转换为 GUI?

python - 如何过滤子项,QTreeView使用QSortFilterProxyModel