Reputation: 6504
Problem Description
I am currently working on a project that requires me to browse to a URL and take a screenshot of the webpage.
After looking through various resources, I found 3 ways to do so. I will describe all 3 methods I am currently using.
Method - 1 : PhantomJS
from selenium import webdriver
import time
import sys
# Method 1: time how long Selenium's PhantomJS driver takes to load a page
# and save a screenshot. The host to capture is taken from the first
# command-line argument and is fetched over plain http.
# print(...) with a single argument behaves the same on Python 2 and 3.
print('Without Headless')
_start = time.time()
br = webdriver.PhantomJS()
try:
    br.get('http://' + sys.argv[1])
    br.save_screenshot('screenshot-phantom.png')
finally:
    # The original wrote `br.quit` without parentheses, which only looks the
    # method up and never calls it, so the PhantomJS process was left
    # running. Calling it in `finally` also cleans up when the load fails.
    br.quit()
_end = time.time()
print('Total time for non-headless {}'.format(_end - _start))
Method-2 : Headless Browser
import sys
import time

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Method 2: time how long headless Chrome takes to load a page and save a
# screenshot. The host to capture is taken from the first command-line
# argument and is fetched over plain http.
# print(...) with a single argument behaves the same on Python 2 and 3.
print('Headless')
_start = time.time()
options = Options()
options.add_argument("--headless")  # Runs Chrome in headless mode.
options.add_argument('--no-sandbox')  # Bypass OS security model
options.add_argument('start-maximized')
options.add_argument('disable-infobars')
options.add_argument("--disable-extensions")
# NOTE(review): `chrome_options=` and `executable_path=` are the keyword names
# used by older Selenium releases; confirm against the installed version.
driver = webdriver.Chrome(chrome_options=options, executable_path='/usr/bin/chromedriver')
try:
    driver.get('http://' + sys.argv[1])
    driver.save_screenshot('screenshot-headless.png')
finally:
    # Always shut the browser down, even when the page load or the
    # screenshot raises, so no Chrome/chromedriver process is left behind.
    driver.quit()
_end = time.time()
print('Total time for headless {}'.format(_end - _start))
Method - 3 :PyQT
import argparse
import sys
import logging
import sys
import time
import os
import urlparse
from selenium import webdriver
from PyQt4.QtCore import *
from PyQt4.QtGui import *
from PyQt4.QtWebKit import *
class Screenshot(QWebView):
    """QWebView that loads a URL and saves the rendered page as an image.

    The instance owns its own QApplication and drives the Qt event loop
    manually (see ``wait_load``) instead of calling ``app.exec_()``.
    """

    def __init__(self):
        # The QApplication is created first: Qt requires an application
        # object to exist before any widget is constructed.
        self.app = QApplication(sys.argv)
        QWebView.__init__(self)
        self._loaded = False
        self.loadFinished.connect(self._loadFinished)

    def capture(self, url, output_file):
        """Load *url* and save a full-page render to ``data/<output_file>``.

        The ``data`` directory is resolved next to this source file. Any
        failure is logged rather than raised, so a caller processing many
        URLs keeps going.

        :param url: address to capture; ``http://`` is prepended when it
            does not already start with ``http://`` or ``https://``.
        :param output_file: file name of the image inside ``data``.
        """
        _logger.info('Received url {}'.format(url))
        _start = time.time()
        try:
            # The previous check compared url[0:3] with 'http' and url[0:4]
            # with 'https'; slices of that length can never equal those
            # strings, so every URL (including complete https:// ones) got
            # 'http://' prepended.
            if not url.startswith(('http://', 'https://')):
                url = 'http://' + url
            self.url = url
            self.load(QUrl(url))
            self.wait_load(url)
            # set to webpage size
            frame = self.page().mainFrame()
            self.page().setViewportSize(frame.contentsSize())
            # render image
            image = QImage(self.page().viewportSize(), QImage.Format_ARGB32)
            painter = QPainter(image)
            try:
                frame.render(painter)
            finally:
                # Release the painter even if rendering raises.
                painter.end()
            target = os.path.join(
                os.path.dirname(os.path.realpath(__file__)), 'data', output_file)
            _logger.info('Saving screenshot {} for {}'.format(output_file, url))
            # QImage.save reports failure through its boolean return value
            # (e.g. the 'data' directory does not exist); surface that
            # instead of silently producing no file.
            if not image.save(target):
                raise IOError('could not write {}'.format(target))
        except Exception as e:
            _logger.error('Error in capturing screenshot {} - {}'.format(url, e))
        _end = time.time()
        _logger.info('Time took for processing url {} - {}'.format(url, _end - _start))

    def wait_load(self, url, delay=1, retry_count=60):
        """Pump Qt events until the page has loaded or the retries run out.

        :param url: used for log messages only.
        :param delay: seconds to sleep between polls.
        :param retry_count: maximum number of polls.
        :returns: True if ``loadFinished`` fired, False if the wait expired.
        """
        # process app events until page loaded
        while not self._loaded and retry_count:
            _logger.info('wait_load for url {} retry_count {}'.format(url, retry_count))
            self.app.processEvents()
            time.sleep(delay)
            retry_count -= 1
        loaded = self._loaded
        if not loaded:
            # Only report expiry when the page really did not finish loading;
            # previously this was logged after every wait.
            _logger.info('wait_load for url {} expired'.format(url))
        # Reset the flag so the next capture() waits for its own page.
        self._loaded = False
        return loaded

    def _loadFinished(self, result):
        """Slot for ``loadFinished``; marks the current page as loaded.

        *result* is received from the signal but not inspected here.
        """
        self._loaded = True
Issue Faced:
All 3 of these methods get stuck on one error or another when I use them. One such issue is described here: Error Question on Stackoverflow. So, out of these 3 methods of taking a screenshot of a webpage in Python, which is efficient and will work in a large-scale deployment?
Upvotes: 5
Views: 9140
Reputation: 76
Taken from https://gist.github.com/fabtho/13e4a2e7cfbfde671b8fa81bbe9359fb and rewritten in Python 3
This method will technically work but it will not look good, as many websites will have cookie acceptance pop-ups that will appear in every screenshot, so depending on which website you use, you may wish to remove these first using selenium before beginning the screenshotting process.
from selenium import webdriver
from PIL import Image
from io import BytesIO
# Capture a whole page by scrolling through it one viewport at a time, taking
# a screenshot at each stop and stitching the slices vertically with PIL.
# NOTE(review): offsets mix CSS pixels (scroll position) with screenshot
# pixels (slice height); this assumes a device pixel ratio of 1 - confirm on
# HiDPI displays.
verbose = 1
browser = webdriver.Chrome(executable_path='C:/yourpath/chromedriver.exe')
try:
    browser.get('http://stackoverflow.com/questions/37906704/taking-a-whole-page-screenshot-with-selenium-marionette-in-python')
    # from here http://stackoverflow.com/questions/1145850/how-to-get-height-of-entire-document-with-javascript
    js = 'return Math.max( document.body.scrollHeight, document.body.offsetHeight, document.documentElement.clientHeight, document.documentElement.scrollHeight, document.documentElement.offsetHeight);'
    scrollheight = browser.execute_script(js)
    if verbose > 0:
        print(scrollheight)
    slices = []
    offset = 0
    while offset < scrollheight:
        if verbose > 0:
            print(offset)
        browser.execute_script("window.scrollTo(0, %s);" % offset)
        img = Image.open(BytesIO(browser.get_screenshot_as_png()))
        # Near the bottom the browser cannot scroll as far as requested, so
        # the top of this screenshot repeats rows that are already captured.
        # Ask where the page really is and drop the repeated rows.
        scrolled_to = int(browser.execute_script('return window.pageYOffset;'))
        overlap = offset - scrolled_to
        if overlap >= img.size[1]:
            # The view did not move past what is already captured (e.g. the
            # page does not scroll); stop instead of stitching duplicates.
            break
        if overlap > 0:
            img = img.crop((0, overlap, img.size[0], img.size[1]))
        offset += img.size[1]
        slices.append(img)
        if verbose > 0:
            # Debug aid: keep each raw viewport capture for inspection.
            browser.get_screenshot_as_file('%s/screen_%s.png' % ('/tmp', offset))
            print(scrollheight)
    if slices:
        # `offset` is now the summed height of all (cropped) slices.
        screenshot = Image.new('RGB', (slices[0].size[0], offset))
        offset = 0
        for img in slices:
            screenshot.paste(img, (0, offset))
            offset += img.size[1]
        screenshot.save('screenshot.png')
    else:
        # A page that reports zero height yields no slices; the original
        # failed here with IndexError on slices[0].
        print('Nothing to capture: page height is 0')
finally:
    # Close the browser even if a step above raised.
    browser.quit()
This is an alternative method that might help also.
import time

from selenium import webdriver
from selenium.webdriver.common.by import By

# Take a full-page screenshot with headless Chrome by resizing the window to
# the document's scroll size and capturing the <body> element.

#Webdriver variables to open browser in headless mode
options = webdriver.ChromeOptions()
# Passing the Chrome switch directly does not depend on the `headless`
# property of the options object.
options.add_argument("--headless")
#Sample URL. Google here used for ease of example
url = "https://www.google.com"
driver = webdriver.Chrome(options=options)


def scroll_dimension(axis):
    """Return the document's scroll size along *axis* ('Width' or 'Height')."""
    return driver.execute_script('return document.body.parentNode.scroll' + axis)


try:
    driver.get(url)
    #Give a certain amount of time for the page to load
    time.sleep(7)
    #Bypass cookie. The cookie button is here identified with an ID. Modify as necessary for your needs
    try:
        driver.find_element(By.CSS_SELECTOR, '#L2AGLb').click()
        print("Bypassed stage one of cookie consent") #Add additional stages if necessary for various cookie consent forms
    except Exception:
        # Best effort: a missing or unclickable consent button must not stop
        # the capture. `Exception` (rather than a bare except) still lets
        # KeyboardInterrupt and SystemExit propagate.
        print("Couldn't find cookie consent")
    time.sleep(10)
    #Take picture
    driver.set_window_size(scroll_dimension('Width'), scroll_dimension('Height')) # May need manual adjustment
    driver.find_element(By.TAG_NAME, 'body').screenshot("output_image.png")
finally:
    #Quit driver, even if a step above raised
    driver.quit()
Upvotes: 1