Unable to get html with PyQt5

Question

I want to get the html code of a web page using PyQt5.

However, the program keeps running and does not return values.

What is the reason, and how can I solve this problem?

Thanks a lot.

import sys
from bs4 import BeautifulSoup
from PyQt5 import QtCore, QtWidgets, QtWebEngineWidgets


class WebPage(QtWebEngineWidgets.QWebEnginePage):
    """Web page that loads a list of URLs one at a time and parses each one.

    After every finished load the page's HTML is requested, parsed with
    BeautifulSoup and stored in ``soup_dict`` under the page URL. Once no
    URLs remain, the application is asked to quit.
    """

    def __init__(self):
        super(WebPage, self).__init__()
        # Defined up front so ``soup_dict`` can be read even if start() was
        # never called.
        self.soup_dict = {}
        self._urls = iter(())
        self.loadFinished.connect(self.handleLoadFinished)

    def start(self, urls):
        """Reset the results and begin loading *urls* in order.

        If *urls* is empty, nothing is loaded and ``loadFinished`` is never
        emitted, so nothing would ever call ``quit()``. In that case the
        quit is queued on the event loop so that ``exec_()`` returns
        instead of blocking forever.
        """
        self.soup_dict = {}
        self._urls = iter(urls)
        if not self.fetchNext():
            # A zero-delay timer fires once the event loop is running; a
            # direct quit() before exec_() would have no effect.
            QtCore.QTimer.singleShot(0, QtWidgets.qApp.quit)

    def fetchNext(self):
        """Start loading the next URL.

        Returns True if a load was started, False if the URLs are exhausted.
        """
        try:
            url = next(self._urls)
        except StopIteration:
            return False
        self.load(QtCore.QUrl(url))
        return True

    def processCurrentPage(self, html):
        """Callback for ``toHtml``: parse *html* and move on to the next URL."""
        url = self.url().toString()
        soup = BeautifulSoup(html, 'lxml')
        self.soup_dict[url] = soup

        # No more URLs to load: leave the event loop.
        if not self.fetchNext():
            QtWidgets.qApp.quit()

    def handleLoadFinished(self):
        """Slot for ``loadFinished``: request the HTML of the loaded page."""
        # toHtml() delivers its result through the callback passed to it.
        self.toHtml(self.processCurrentPage)
      

# Script entry point: load every URL with WebPage and collect the parsed pages.
if __name__ == '__main__':

    # Pages to download; WebPage fetches them one after another.
    urls = ['https://chejiahao.autohome.com.cn/info/6870956#pvareaid=6826274']

    # The QApplication is created before the page object.
    app = QtWidgets.QApplication(sys.argv)
    webpage = WebPage()
    webpage.start(urls)
    # Blocks in the Qt event loop until WebPage calls qApp.quit().
    ret = app.exec_()
    # Only reached after the event loop has exited.
    soup_dict = webpage.soup_dict

eyllanesc · Accepted Answer

From what I have tested, it seems that the website takes several measures to prevent scraping. For example:

It verifies that the user is accessing the page through a GUI.
Each time the user accesses the page, the loading time increases, up to a time threshold.

For the first case you can use a QWebEngineView that is not rendered on the screen, and for the second case I recommend scraping at reasonable intervals (for example, every half hour) or using a proxy.

import sys

from bs4 import BeautifulSoup
from PyQt5 import QtCore, QtWidgets, QtWebEngineWidgets


class WebPage(QtWebEngineWidgets.QWebEnginePage):
    """Web page that loads a list of URLs one at a time and parses each one.

    The page is created on its own ``QWebEngineProfile``. After every
    finished load the page's HTML is requested, parsed with BeautifulSoup
    and stored in ``soup_dict`` under the page URL. Once no URLs remain,
    the application is asked to quit.
    """

    def __init__(self, parent=None):
        # The profile must exist before the base constructor runs because
        # it is passed to it.
        # NOTE(review): presumably stored on self to keep the profile alive
        # as long as the page; this attribute also shadows the inherited
        # profile() accessor - confirm no caller relies on that method.
        self.profile = QtWebEngineWidgets.QWebEngineProfile()
        super(WebPage, self).__init__(self.profile, parent)
        # Defined up front so ``soup_dict`` can be read even if start() was
        # never called.
        self.soup_dict = {}
        self._urls = iter(())
        self.loadFinished.connect(self.handleLoadFinished)

    def start(self, urls):
        """Reset the results and begin loading *urls* in order.

        If *urls* is empty, nothing is loaded and ``loadFinished`` is never
        emitted, so nothing would ever call ``quit()``. In that case the
        quit is queued on the event loop so that ``exec_()`` returns
        instead of blocking forever.
        """
        self.soup_dict = {}
        self._urls = iter(urls)
        if not self.fetchNext():
            # A zero-delay timer fires once the event loop is running; a
            # direct quit() before exec_() would have no effect.
            QtCore.QTimer.singleShot(0, QtWidgets.qApp.quit)

    def fetchNext(self):
        """Start loading the next URL.

        Returns True if a load was started, False if the URLs are exhausted.
        """
        try:
            url = next(self._urls)
        except StopIteration:
            return False
        self.load(QtCore.QUrl(url))
        return True

    def processCurrentPage(self, html):
        """Callback for ``toHtml``: parse *html* and move on to the next URL."""
        url = self.url().toString()
        soup = BeautifulSoup(html, "lxml")
        self.soup_dict[url] = soup

        # No more URLs to load: leave the event loop.
        if not self.fetchNext():
            QtWidgets.qApp.quit()

    def handleLoadFinished(self):
        """Slot for ``loadFinished``: request the HTML of the loaded page."""
        # toHtml() delivers its result through the callback passed to it.
        self.toHtml(self.processCurrentPage)


# Script entry point: load every URL through a view that is not rendered
# on screen, then print the parsed pages.
if __name__ == "__main__":

    # Pages to download; WebPage fetches them one after another.
    urls = ["https://chejiahao.autohome.com.cn/info/6870956#pvareaid=6826274"]

    # The QApplication is created before the page object.
    app = QtWidgets.QApplication(sys.argv)
    webpage = WebPage()
    webpage.start(urls)

    # Attach the page to a view so it is accessed through a GUI widget;
    # WA_DontShowOnScreen keeps that widget from being rendered on screen
    # even though show() is called.
    view = QtWebEngineWidgets.QWebEngineView()
    view.setAttribute(QtCore.Qt.WA_DontShowOnScreen, True)
    view.setPage(webpage)
    view.resize(640, 480)
    view.show()

    # Blocks in the Qt event loop until WebPage calls qApp.quit().
    ret = app.exec_()
    # Only reached after the event loop has exited.
    soup_dict = webpage.soup_dict
    print(soup_dict)

Unable to get html with PyQt5

Answers (1)

Related Questions