Reputation: 344
I'm attempting to use PyQt4 to do some web scraping, but the site I'm attempting to scrape keeps thinking I'm a mobile device, and is not presenting the dataset available to a desktop or laptop (even though I'm using a Mozilla/5.0 user agent).
To try to find out why, I'm setting my URL to "whatsmyuseragent.com". I notice that it reports that, although my Screen Resolution is 1920px x 1080px, my Browser Window Size is 0px x 0px. Could this be the problem?
Here is my code below. Any suggestions on what I need to change to convince the site I'm scraping to believe I'm a desktop or laptop (rather than a mobile) would be appreciated. Thanks.
import sys
from PyQt4 import QtGui
from PyQt4.QtGui import QApplication
from PyQt4.QtCore import QUrl
from PyQt4.QtWebKit import QWebPage
from PyQt4.QtNetwork import QNetworkRequest
import bs4 as bs
import urllib.request
class Client(QWebPage):
    """Off-screen WebKit page that loads a single URL and waits for it.

    Constructing an instance starts a Qt event loop and only returns once
    the page's ``loadFinished`` signal has fired; the rendered document can
    then be read back through ``mainFrame().toHtml()``.

    No viewport size is configured on the page in this version.
    """

    def __init__(self, url):
        """Load *url* with a ``Mozilla/5.0`` User-Agent header and block."""
        # The application object is created before the page itself.
        self.app = QApplication(sys.argv)
        super(Client, self).__init__()

        # Build the request up front, carrying the custom User-Agent header.
        request = QNetworkRequest(QUrl(url))
        request.setRawHeader("User-Agent", 'Mozilla/5.0')
        self.request = request

        # Leave the event loop as soon as the page reports it has loaded.
        self.loadFinished.connect(self.on_page_load)
        self.mainFrame().load(request)

        # Blocks here until on_page_load() calls quit().
        self.app.exec_()

    def on_page_load(self):
        """Slot for ``loadFinished``: stop the event loop started in __init__."""
        self.app.quit()
# Diagnostic page that echoes back what the server sees about the client.
url = 'http://www.whatsmyuseragent.com'

# Constructing the client blocks until the page has finished loading.
client_response = Client(url)

# Pull the rendered markup out of the page's main frame.
main_frame = client_response.mainFrame()
source = main_frame.toHtml()

# Parse with lxml and dump the whole document, indented, to stdout.
soup = bs.BeautifulSoup(source, 'lxml')
pretty = soup.prettify()
print(pretty)
Upvotes: 0
Views: 661
Reputation: 2035
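Try setting the viewport size explicitly. A newly created QWebPage has a viewport of 0 x 0 by default, which is what the site is reporting as your browser window size: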
import sys
import re
from PyQt4 import QtGui
from PyQt4.QtGui import QApplication
from PyQt4.QtCore import QUrl, QSize
from PyQt4.QtWebKit import QWebPage
from PyQt4.QtNetwork import QNetworkRequest
import bs4 as bs
class Client(QWebPage):
    """Off-screen WebKit page that loads a single URL and waits for it.

    Constructing an instance runs the Qt event loop and only returns once
    the page's ``loadFinished`` signal has fired; the rendered document can
    then be read back through ``mainFrame().toHtml()``.

    The page is given an explicit 640 x 480 viewport before loading.
    """

    def __init__(self, url):
        """Load *url* with a ``Mozilla/5.0`` User-Agent header and block.

        An existing ``QApplication`` is reused when there is one, so more
        than one ``Client`` can be created in the same process; otherwise a
        new application object is created from ``sys.argv``.
        """
        # Qt allows only one application object per process, so reuse it
        # rather than unconditionally constructing a second one.
        app = QApplication.instance()
        if app is None:
            app = QApplication(sys.argv)
        self.app = app
        QWebPage.__init__(self)

        # good ol' size: give the page a non-empty viewport before loading.
        size = QSize(640, 480)
        self.setViewportSize(size)

        # Leave the event loop as soon as the page reports it has loaded.
        self.loadFinished.connect(self.on_page_load)

        self.request = QNetworkRequest()
        self.request.setUrl(QUrl(url))
        self.request.setRawHeader("User-Agent", 'Mozilla/5.0')
        self.mainFrame().load(self.request)

        # Blocks here until on_page_load() calls quit().
        self.app.exec_()

    def on_page_load(self):
        """Slot for ``loadFinished``: stop the event loop started in __init__."""
        self.app.quit()
# Diagnostic page that echoes back what the server sees about the client.
url = 'http://www.whatsmyuseragent.com'

# Constructing the client blocks until the page has finished loading.
client_response = Client(url)
source = client_response.mainFrame().toHtml()
soup = bs.BeautifulSoup(source, 'lxml')

# some meat from the soup: the text of the element with class
# "browser-window", with every run of whitespace collapsed to one space.
# The pattern is a raw string so that "\s" reaches the regex engine as
# written instead of being treated as an (invalid) string escape.
print(re.sub(r'\s+', ' ', soup.find(class_='browser-window').text))
This produces the following for me:
625 px x 465 px
Upvotes: 1