Reputation: 4291
I want to get the HTML code of a web page using PyQt5.
However, the program keeps running and does not return values.
What is the reason, and how can I solve this problem?
Thanks a lot.
import sys
from bs4 import BeautifulSoup
from PyQt5 import QtCore, QtWidgets, QtWebEngineWidgets
class WebPage(QtWebEngineWidgets.QWebEnginePage):
    """Page that loads a list of URLs one after another and parses each one.

    After ``start()`` has been called, the parsed documents are collected in
    ``soup_dict``, keyed by the URL string of the page they came from.
    """

    def __init__(self):
        super(WebPage, self).__init__()
        # Every completed load triggers retrieval of the page's HTML.
        self.loadFinished.connect(self.handleLoadFinished)

    def start(self, urls):
        """Reset the results and begin loading the first URL in *urls*."""
        self.soup_dict = {}
        self._urls = iter(urls)
        self.fetchNext()

    def fetchNext(self):
        """Load the next pending URL.

        Returns True if a load was started, False if no URLs remain.
        """
        try:
            url = next(self._urls)
        except StopIteration:
            return False
        else:
            self.load(QtCore.QUrl(url))
        return True

    def processCurrentPage(self, html):
        """Parse *html* with BeautifulSoup and store it under the current URL.

        When there is no further URL to fetch, the application is asked to
        quit so that the event loop started by the caller returns.
        """
        url = self.url().toString()
        soup = BeautifulSoup(html, 'lxml')
        self.soup_dict[url] = soup
        if not self.fetchNext():
            QtWidgets.qApp.quit()

    def handleLoadFinished(self):
        """Slot for ``loadFinished``: request the HTML of the loaded page."""
        # The HTML is delivered to processCurrentPage as a callback argument.
        self.toHtml(self.processCurrentPage)
if __name__ == '__main__':
    urls = ['https://chejiahao.autohome.com.cn/info/6870956#pvareaid=6826274']
    app = QtWidgets.QApplication(sys.argv)
    webpage = WebPage()
    webpage.start(urls)
    # Runs the event loop; it only returns once WebPage.processCurrentPage
    # has handled the last URL and called qApp.quit().
    ret = app.exec_()
    # Results are read only after the event loop has finished.
    soup_dict = webpage.soup_dict
Upvotes: 0
Views: 187
Reputation: 243897
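From what I have tested it seems that the website has several measures to avoid scraping. For example, it appears (1) to check that the page is actually being rendered in a view, and (2) to reject clients that make too many requests in a short time.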
For the first case you can use a QWebEngineView that is not rendered on the screen, and for the second case I recommend scraping at reasonable intervals (for example, every half hour) or using a proxy.
import sys
from bs4 import BeautifulSoup
from PyQt5 import QtCore, QtWidgets, QtWebEngineWidgets
class WebPage(QtWebEngineWidgets.QWebEnginePage):
    """Page that loads a list of URLs one after another and parses each one.

    The page is created on its own ``QWebEngineProfile`` instance. After
    ``start()`` has been called, the parsed documents are collected in
    ``soup_dict``, keyed by the URL string of the page they came from.
    """

    def __init__(self, parent=None):
        # The profile must exist before the base-class initializer runs
        # because it is passed to it; keeping it as an attribute keeps a
        # Python reference to it for as long as the page object lives.
        self.profile = QtWebEngineWidgets.QWebEngineProfile()
        super(WebPage, self).__init__(self.profile, parent)
        # Every completed load triggers retrieval of the page's HTML.
        self.loadFinished.connect(self.handleLoadFinished)

    def start(self, urls):
        """Reset the results and begin loading the first URL in *urls*."""
        self.soup_dict = {}
        self._urls = iter(urls)
        self.fetchNext()

    def fetchNext(self):
        """Load the next pending URL.

        Returns True if a load was started, False if no URLs remain.
        """
        try:
            url = next(self._urls)
        except StopIteration:
            return False
        else:
            self.load(QtCore.QUrl(url))
        return True

    def processCurrentPage(self, html):
        """Parse *html* with BeautifulSoup and store it under the current URL.

        When there is no further URL to fetch, the application is asked to
        quit so that the event loop started by the caller returns.
        """
        url = self.url().toString()
        soup = BeautifulSoup(html, "lxml")
        self.soup_dict[url] = soup
        if not self.fetchNext():
            QtWidgets.qApp.quit()

    def handleLoadFinished(self):
        """Slot for ``loadFinished``: request the HTML of the loaded page."""
        # The HTML is delivered to processCurrentPage as a callback argument.
        self.toHtml(self.processCurrentPage)
if __name__ == "__main__":
    urls = ["https://chejiahao.autohome.com.cn/info/6870956#pvareaid=6826274"]
    app = QtWidgets.QApplication(sys.argv)
    webpage = WebPage()
    webpage.start(urls)
    # Attach the page to a view that is shown but flagged not to appear on
    # screen, so the page has a widget to render into without a visible window.
    view = QtWebEngineWidgets.QWebEngineView()
    view.setAttribute(QtCore.Qt.WA_DontShowOnScreen, True)
    view.setPage(webpage)
    view.resize(640, 480)
    view.show()
    # Runs the event loop; it only returns once WebPage.processCurrentPage
    # has handled the last URL and called qApp.quit().
    ret = app.exec_()
    soup_dict = webpage.soup_dict
    print(soup_dict)
Upvotes: 3