我正在使用 PyQt 完全加载一个页面(包括 JS)并使用 Beautiful Soup 获取它的内容。在第一次迭代时工作正常,但之后,它崩溃了。我对 Python 了解不多,对 PyQt 了解更少,因此非常欢迎任何帮助。
类借自here .
from PyQt4.QtCore import QUrl, SIGNAL
from PyQt4.QtGui import QApplication
from PyQt4.QtWebKit import QWebPage
from bs4 import BeautifulSoup
from bs4.dammit import UnicodeDammit
import sys
import signal
class Render(QWebPage):
def __init__(self, url):
self.app = QApplication(sys.argv)
QWebPage.__init__(self)
self.html = None
signal.signal(signal.SIGINT, signal.SIG_DFL)
self.connect(self, SIGNAL('loadFinished(bool)'), self._finished_loading)
self.mainFrame().load(QUrl(url))
self.app.exec_()
def _finished_loading(self, result):
self.html = self.mainFrame().toHtml()
self.soup = BeautifulSoup(UnicodeDammit(self.html).unicode_markup)
self.app.quit()
###################################################################
l = ["http://www.google.com/?q=a", "http://www.google.com/?q=b", "http://www.google.com/?q=c"]
for page in l:
soup = Render(page).soup
print("# soup done: " + page)
最佳答案
该示例崩溃是因为 RenderPage
类试图为它尝试加载的每个 url 创建一个新的 QApplication
和事件循环。
相反,只应创建一个QApplication
,并且QWebPage
子类应在处理完每个页面后加载一个新的url,而不是使用for 循环。
这是一个重写的例子,它应该做你想做的事:
import sys, signal
from bs4 import BeautifulSoup
from bs4.dammit import UnicodeDammit
from PyQt4 import QtCore, QtGui, QtWebKit
class WebPage(QtWebKit.QWebPage):
def __init__(self):
QtWebKit.QWebPage.__init__(self)
self.mainFrame().loadFinished.connect(self.handleLoadFinished)
def process(self, items):
self._items = iter(items)
self.fetchNext()
def fetchNext(self):
try:
self._url, self._func = next(self._items)
self.mainFrame().load(QtCore.QUrl(self._url))
except StopIteration:
return False
return True
def handleLoadFinished(self):
self._func(self._url, self.mainFrame().toHtml())
if not self.fetchNext():
print('# processing complete')
QtGui.qApp.quit()
def funcA(url, html):
print('# processing:', url)
# soup = BeautifulSoup(UnicodeDammit(html).unicode_markup)
# do stuff with soup...
def funcB(url, html):
print('# processing:', url)
# soup = BeautifulSoup(UnicodeDammit(html).unicode_markup)
# do stuff with soup...
if __name__ == '__main__':
items = [
('http://stackoverflow.com', funcA),
('http://google.com', funcB),
]
signal.signal(signal.SIGINT, signal.SIG_DFL)
print('Press Ctrl+C to quit\n')
app = QtGui.QApplication(sys.argv)
webpage = WebPage()
webpage.process(items)
sys.exit(app.exec_())
关于python - PyQt 类不适用于第二次使用,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/21909907/