Andreas Köberle
Andreas Köberle

Reputation: 110972

How to download images from a site with phantomjs

I want to save some images from a site. At the moment I can get the paths to the images, but I have no idea how to fetch and save the images themselves with PhantomJS.

# Collects the `src` of every image matching '.rotate img' inside the page
# context and logs each one; actually saving the images is still to be done.
findRotationTeaserImages = ->
  paths = page.evaluate ->
    jQuery('.rotate img').map(-> return this.src).get()

  for path, i in paths
    console.log(path)
    # save the image

Upvotes: 7

Views: 14586

Answers (5)

Ziping Sun
Ziping Sun

Reputation: 61

I ran into a lot of trouble when using the render method. Luckily, I finally came up with two better solutions. Here is the code I used in my project. The first solution has trouble updating the cookies, so it does not work well when fetching a captcha image. Both methods cause a new HTTP request, but with a few modifications the second one can omit that request.

The first one fetches the cookies from PhantomJS and makes a new HTTP request using the request library. The second one uses base64 to pass the image.

 async download(download_url, stream) {
    logger.profile(`download(download_url='${download_url}')`);
    let orig_url = await this.page.property('url');
    download_url = url.resolve(orig_url, download_url);
    let cookies = await this.page.property('cookies');
    let jar = request.jar();
    for (let cookie of cookies) {
        if (cookie.name !== undefined) {
            cookie.key = cookie.name;
            delete cookie.name;
        }
        if (cookie.httponly !== undefined) {
            cookie.httpOnly = cookie.httponly;
            delete cookie.httponly;
        }
        if (cookie.expires !== undefined)
            cookie.expires = new Date(cookie.expires);
        jar.setCookie(new Cookie(cookie), download_url, {ignoreError: true});
    }
    let req = request({
        url: download_url,
        jar: jar,
        headers: {
            'User-Agent': this.user_agent,
            'Referer': orig_url
        }
    });
    await new Promise((resolve, reject) => {
        req.pipe(stream)
            .on('close', resolve)
            .on('error', reject);
    });
    // Due to this issue https://github.com/ariya/phantomjs/issues/13409, we cannot set cookies back
    // to browser. It is said to be redesigned, but till now (Mar 31 2017), no change has been made.
    /*await Promise.all([
        new Promise((resolve, reject) => {
            req.on('response', () => {
                jar._jar.store.getAllCookies((err, cookies) => {
                    if (err) {
                        reject(err);
                        return;
                    }
                    cookies = cookies.map(x => x.toJSON());
                    for (let cookie of cookies) {
                        if (cookie.key !== undefined) {
                            cookie.name = cookie.key;
                            delete cookie.key;
                        }
                        if (cookie.httpOnly !== undefined) {
                            cookie.httponly = cookie.httpOnly;
                            delete cookie.httpOnly;
                        }
                        if (cookie.expires instanceof Date) {
                            cookie.expires = cookie.expires.toGMTString();
                            cookie.expiry = cookie.expires.toTime();
                        }
                        else if (cookie.expires == Infinity)
                            delete cookie.expires;
                        delete cookie.lastAccessed;
                        delete cookie.creation;
                        delete cookie.hostOnly;
                    }
                    this.page.property('cookies', cookies).then(resolve).catch(reject);
                });
            }).on('error', reject);
        }),
        new Promise((resolve, reject) => {
            req.pipe(fs.createWriteStream(save_path))
                .on('close', resolve)
                .on('error', reject);
        })
    ]);*/
    logger.profile(`download(download_url='${download_url}')`);
}
async download_image(download_url, stream) {
    logger.profile(`download_image(download_url='${download_url}')`);
    await Promise.all([
        new Promise((resolve, reject) => {
            this.client.once('donwload image', data => {
                if (data.err)
                    reject(err);
                else
                    stream.write(Buffer.from(data.data, 'base64'), resolve);

            });
        }),
        this.page.evaluate(function (url) {
            var img = new Image(), callback = function (err, data) {
                callPhantom({
                    event: 'donwload image',
                    data: {
                        err: err && err.message,
                        data: data
                    }
                });
            };
            img.onload = function () {
                var canvas = document.createElement("canvas");
                canvas.width = img.width;
                canvas.height = img.height;
                canvas.getContext("2d").drawImage(img, 0, 0);
                callback(null, canvas.toDataURL("image/png").replace(/^data:image\/(png|jpg);base64,/, ""));
            };
            img.onerror = function () {
                callback(new Error('Failed to fetch image.'));
            };
            img.src = url;
        }, download_url)
    ]);
    logger.profile(`download_image(download_url='${download_url}')`);
}

Upvotes: 0

TheZver
TheZver

Reputation: 1582

In case image dimensions are known:



    var webPage = require('webpage');

    /**
     * Download an image with known dimensions by opening it as a page and
     * rendering the clipped area to a file.
     *
     * The page is always closed, even if rendering or the callback throws.
     *
     * @param src   Image source
     * @param dest  Destination full path
     * @param width Image width
     * @param height    Image height
     * @param timeout   Operation timeout (ms), applied as the resource timeout
     * @param cbk   Callback (optional), called as cbk(success, cbkParam)
     * @param cbkParam  Parameter to pass back to the callback (optional)
     */
    function downloadImg(src, dest, width, height, timeout, cbk, cbkParam) {
        var page = webPage.create();

        page.settings.resourceTimeout = timeout; // resource loading timeout (ms)
        page.settings.webSecurityEnabled = false; // disable web security
        page.settings.XSSAuditingEnabled = false; // disable XSS auditing

        page.open(src, function (status) {
            // Missing images sometimes come back as a text body from the
            // server, so any plain text counts as a failure.
            var success = status === 'success' && !page.plainText;

            try {
                if (success) {
                    page.clipRect = {
                        top: 0,
                        left: 0,
                        width: width,
                        height: height
                    };
                    page.render(dest);
                }

                if (typeof cbk === 'function') {
                    cbk(success, cbkParam);
                }
            } finally {
                // Release the page even when render() or the callback throws.
                page.close();
            }
        });
    }


Upvotes: 0

Tom
Tom

Reputation: 450

I know this is an old question, but you can do this pretty simply by storing the dimensions and location of each image on the page in an object, then setting the PhantomJS page.clipRect so that the page.render() method renders only the area where the image is. Here is an example, scraping multiple images from http://dribbble.com/ :

// Scrapes every '.dribbble-img img' from the dribbble front page by recording
// each image's position and size, then rendering just that rectangle to
// images/<index>.png. Written in ES5 because it runs under PhantomJS.
var page = require('webpage').create();

page.open('http://dribbble.com/', function (status) {

    // Bail out when the page fails to load; otherwise the script would wait
    // on includeJs and never reach phantom.exit().
    if (status !== 'success') {
        console.log('Unable to load http://dribbble.com/');
        phantom.exit(1);
        return;
    }

    page.includeJs('//ajax.googleapis.com/ajax/libs/jquery/1.10.2/jquery.min.js', function () {

        // Runs inside the page: collect one clip rectangle per image.
        var images = page.evaluate(function () {
            return $('.dribbble-img img').map(function () {
                var $img = $(this);
                var offset = $img.offset();
                return {
                    top: offset.top,
                    left: offset.left,
                    width: $img.width(),
                    height: $img.height()
                };
            }).get();
        }) || []; // fall back to an empty list if evaluate() yields nothing usable

        images.forEach(function (clipRect, index) {
            page.clipRect = clipRect;
            page.render('images/' + index + '.png');
        });

        phantom.exit();
    });
});

Upvotes: 19

Alon Bar David
Alon Bar David

Reputation: 1727

There is now another way to do this.

// Redraws an image from the page onto a canvas, returns it as base64 PNG data
// and writes the decoded bytes to file.png. Assumes `page` is an already
// loaded PhantomJS webpage.
var fs = require("fs");
// The function passed to page.evaluate runs inside the page and cannot see
// variables from this script, so the element is looked up there via a selector
// passed in as an argument. "img" picks the first image; adjust as needed.
var imageBase64 = page.evaluate(function (selector) {
  var img = document.querySelector(selector);
  if (!img) {
    return null;
  }
  var canvas = document.createElement("canvas");
  // drawImage(img, 0, 0) paints at the image's intrinsic size, so size the
  // canvas from the natural dimensions (falling back to the layout size).
  canvas.width = img.naturalWidth || img.width;
  canvas.height = img.naturalHeight || img.height;
  var ctx = canvas.getContext("2d");
  ctx.drawImage(img, 0, 0);
  // Strip the "data:image/png;base64," prefix and keep only the payload.
  return canvas.toDataURL("image/png").split(",")[1];
}, "img");
// Only write a file when an image was actually found and encoded.
if (imageBase64) {
  fs.write("file.png", atob(imageBase64), 'wb');
}

Upvotes: 9

Andreas Köberle
Andreas Köberle

Reputation: 110972

I solved this by starting a child process running a node script that downloads the images:

phantomJs script:

# Collects the `src` of every image matching '.rotate img' inside the page
# context and hands the list to a node child process that downloads them;
# PhantomJS exits once that process has finished.
findRotationTeaserImages = ->
  paths = page.evaluate ->
    jQuery('.rotate img').map(-> return this.src).get()

  # Build the argument list directly instead of joining and re-splitting on
  # spaces: that produced a stray '' argument when no images were found and
  # would split any path containing a space.
  args = ['loadRotationTeaser.js'].concat(paths)

  child_process.execFile("node", args, null, (err, stdout, stderr) ->
    phantom.exit()
  )

nodeJs script

http = require('http-get');

# Every command-line argument after the script name is an image URL.
# slice (not splice) leaves process.argv itself untouched.
args = process.argv.slice(2)

# Download each URL to public/images/rotationTeaser/img<i>.jpeg.
for path, i in args
  http.get path, 'public/images/rotationTeaser/img' + i + '.jpeg', (error, result) ->
    # Report failed downloads instead of silently ignoring them.
    console.error(error) if error

Upvotes: 5

Related Questions