我正在尝试获取网页中某一部分的内容。该部分的数据由 JavaScript 动态加载。我在这里找到了一些代码并做了修改,但运行脚本时,得到的结果却是 None。
这是代码
import bs4 as bs
import sys
import urllib.request
from PyQt5.QtWebEngineWidgets import QWebEnginePage
from PyQt5.QtWidgets import QApplication
from PyQt5.QtCore import QUrl
from pprint import pprint
class Page(QWebEnginePage):
    """Load a URL in a QWebEnginePage and keep the HTML it reports.

    Constructing an instance starts a QApplication event loop and only
    returns once ``Callable`` has received the page's HTML and quit the
    loop. The captured markup is then available as ``self.html``.

    Note: the HTML is requested as soon as ``loadFinished`` fires, so
    elements that scripts add to the DOM afterwards are not included.
    """

    def __init__(self, url):
        """Create the application, start loading *url* and run the event loop.

        Args:
            url: Address of the page to load; wrapped in a ``QUrl``.
        """
        # The application object is created first, before the page itself
        # is initialised, exactly as in the original ordering.
        self.app = QApplication(sys.argv)
        super().__init__()
        self.html = ''
        self.loadFinished.connect(self._on_load_finished)
        self.load(QUrl(url))
        # Blocks here until Callable() calls self.app.quit().
        self.app.exec_()

    def _on_load_finished(self):
        """Ask the page for its HTML once the load has finished."""
        # toHtml() is asynchronous: it returns nothing and delivers the
        # markup to the callback later, so its return value must not be
        # stored in self.html (doing so would overwrite it with None).
        self.toHtml(self.Callable)
        print('Load finished')

    def Callable(self, html_str):
        """Store the HTML delivered by ``toHtml`` and stop the event loop.

        Args:
            html_str: The page markup passed in by ``toHtml``.
        """
        self.html = html_str
        self.app.quit()
def main():
    """Render the Fix Central page with ``Page`` and print the results table.

    Looks up the ``<table>`` whose id is ``DataTables_Table_0`` in the HTML
    captured by ``Page`` and pretty-prints whatever ``find`` returns
    (``None`` when the table is not in the captured markup).
    """
    url = 'https://www.ibm.com/support/fixcentral/swg/selectFixes?parent=IBM%20Security&product=ibm/Information+Management/InfoSphere+Guardium&release=10.0&platform=Linux&function=all'
    rendered = Page(url)
    markup = bs.BeautifulSoup(rendered.html, 'html.parser')
    table = markup.find('table', {'id' : 'DataTables_Table_0'})
    pprint(table)


# Run the scraper only when this file is executed as a script.
if __name__ == '__main__':
    main()
这是输出
Load finished
None
最佳答案
loadFinished 信号只表示页面本身已加载完成,但此后页面中的脚本仍可能继续创建 DOM 元素。ID 为“DataTables_Table_0”的元素正是这种情况:它是在页面加载完成之后不久才由 JavaScript 创建的,因此在 loadFinished 触发时获取的 HTML 中还不存在该元素,find 自然返回 None。
一个可能的解决方案是注入(inject)一个脚本来检查该元素是否存在,并发出通知以便获取 HTML。
import sys
from functools import cached_property
from PyQt5 import QtCore, QtWidgets, QtWebEngineWidgets, QtWebChannel
from pprint import pprint
import bs4 as bs
def get_webchannel_source():
    """Return the text of the ``:/qtwebchannel/qwebchannel.js`` Qt resource.

    Returns:
        The decoded contents of the resource, or an empty string when the
        resource cannot be opened for reading.
    """
    resource = QtCore.QFile(":/qtwebchannel/qwebchannel.js")
    opened = resource.open(QtCore.QIODevice.ReadOnly)
    if opened:
        raw = resource.readAll()
        resource.close()
        # readAll() yields a byte-array object; .data() exposes its bytes.
        return raw.data().decode()
    return ""
class Manager(QtCore.QObject):
    """Own a web view and expose a slot that page scripts can call.

    The instance registers itself on a QWebChannel under the name
    ``"manager"``. A script installed with ``set_script`` can therefore
    call ``save_html()`` from inside the page; that captures the page's
    HTML into ``html`` and quits the event loop started by ``start``.
    """

    def __init__(self, *, offline=True, visible=False, parent=None):
        """Build the application, profile, view and web channel.

        Args:
            offline: When true, a freshly constructed ``QWebEngineProfile``
                is used; otherwise the default profile is used.
            visible: When false, the view gets the ``WA_DontShowOnScreen``
                attribute before it is shown.
            parent: Optional QObject parent passed to ``QObject.__init__``.
        """
        super().__init__(parent)
        self._html = ""
        self._is_finished = False

        # Touch the cached property so the QApplication is created before
        # the profile and the view below.
        self.app

        if offline:
            self._profile = QtWebEngineWidgets.QWebEngineProfile()
        else:
            self._profile = QtWebEngineWidgets.QWebEngineProfile.defaultProfile()

        browser = self.view
        browser.resize(640, 480)
        if not visible:
            browser.setAttribute(QtCore.Qt.WA_DontShowOnScreen, True)
        browser.show()

        channel = self.webchannel
        channel.registerObject("manager", self)
        browser.page().setWebChannel(channel)

    @cached_property
    def app(self):
        """The QApplication, created from ``sys.argv`` on first access."""
        application = QtWidgets.QApplication(sys.argv)
        return application

    @property
    def profile(self):
        """The QWebEngineProfile chosen in ``__init__``."""
        return self._profile

    @cached_property
    def view(self):
        """The QWebEngineView, created on first access with its own page.

        The page is built on ``self.profile`` and parented to this object.
        """
        widget = QtWebEngineWidgets.QWebEngineView()
        widget.setPage(QtWebEngineWidgets.QWebEnginePage(self.profile, self))
        return widget

    @cached_property
    def webchannel(self):
        """The QWebChannel (parented to this object), created on first access."""
        return QtWebChannel.QWebChannel(self)

    @property
    def html(self):
        """HTML captured by ``save_html``; empty until the callback has run."""
        return self._html

    def set_script(self, script):
        """Install *script* on the profile, prefixed with qwebchannel.js.

        Args:
            script: JavaScript source appended, after a newline, to the text
                returned by ``get_webchannel_source()``.
        """
        source = "\n".join((get_webchannel_source(), script))
        injected = QtWebEngineWidgets.QWebEngineScript()
        injected.setName("qscript")
        injected.setSourceCode(source)
        injected.setInjectionPoint(QtWebEngineWidgets.QWebEngineScript.DocumentReady)
        injected.setWorldId(QtWebEngineWidgets.QWebEngineScript.MainWorld)
        self.profile.scripts().insert(injected)

    def start(self, url):
        """Load *url* in the view and run the event loop until it is quit.

        Args:
            url: Address to load; converted with ``QUrl.fromUserInput``.
        """
        target = QtCore.QUrl.fromUserInput(url)
        self.view.load(target)
        self.app.exec_()

    @QtCore.pyqtSlot()
    def save_html(self):
        """Request the page's HTML; only the first call does anything."""
        if self._is_finished:
            return
        self.view.page().toHtml(self.html_callable)
        self._is_finished = True

    def html_callable(self, html):
        """Receive the HTML from ``toHtml``, store it and quit the event loop.

        Args:
            html: The page markup passed in by ``toHtml``.
        """
        self._html = html
        self.app.quit()
# JavaScript source handed to Manager.set_script() by main().
# wait_qt() re-schedules itself every 100 ms until the global `qt` object is
# defined, then opens a QWebChannel on qt.webChannelTransport and keeps the
# object published as "manager". find_element() then re-schedules itself every
# 100 ms until an element with id 'DataTables_Table_0' exists (and the manager
# is set), at which point it calls manager.save_html().
JS = """
var manager = null;
function find_element() {
var e = document.getElementById('DataTables_Table_0');
console.log("try verify", e, manager);
if (e != null && manager != null) {
console.log(e)
manager.save_html()
} else {
setTimeout(find_element, 100);
}
}
(function wait_qt() {
if (typeof qt != 'undefined') {
console.log("Qt loaded");
new QWebChannel(qt.webChannelTransport, function (channel) {
manager = channel.objects.manager;
find_element();
});
} else {
setTimeout(wait_qt, 100);
}
})();
"""
def main():
    """Render the Fix Central page via ``Manager`` and print the results table.

    Installs the ``JS`` watcher script, loads the page, and once the event
    loop has finished pretty-prints the ``<table>`` whose id is
    ``DataTables_Table_0`` from the captured HTML.
    """
    target = "https://www.ibm.com/support/fixcentral/swg/selectFixes?parent=IBM%20Security&product=ibm/Information+Management/InfoSphere+Guardium&release=10.0&platform=Linux&function=all"

    scraper = Manager()
    scraper.set_script(JS)
    scraper.start(target)

    parsed = bs.BeautifulSoup(scraper.html, "html.parser")
    table = parsed.find("table", {"id": "DataTables_Table_0"})
    pprint(table)


# Run the scraper only when this file is executed as a script.
if __name__ == "__main__":
    main()
关于python - PyQt5加载网页内容时返回None值,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/63944827/