aze45sq6d
aze45sq6d

Reputation: 925

scrapy javascript redirect page

I am able to authenticate a login session using the following code.

However, after logging in, the site redirects me to the homepage using JavaScript. When I run open_in_browser(response), the browser is directed to /nl/home.aspx instead of XXX.be/nl/home.aspx.

I am pretty new to scrapy and I am probably missing something crucial, but I am not sure what I am doing wrong.

class XXXSpider(scrapy.Spider):
    """Submit the sign-in form, then dump whatever page comes back."""

    name = 'XXX'
    allowed_domains = ['XXX.be']
    start_urls = ['XXX.be/nl/signin.aspx']

    def parse(self, response):
        """Build the login request from the form found in ``response``."""
        # Only the two credential fields are supplied explicitly.
        credentials = {
            'ctl00$MainContent$UserNameText': 'XXXXX',
            'ctl00$MainContent$PasswordText': 'XXXXX',
        }
        login_request = FormRequest.from_response(
            response,
            formdata=credentials,
            callback=self.after_login,
        )
        return login_request

    def after_login(self, response):
        """Log the raw post-login body and open the response in a browser."""
        page_body = response.body
        self.log(page_body)
        open_in_browser(response)

Output of self.log(response.body) (ideally this would be an HTML page). I've looked into Splash to handle the JavaScript, but I've been unable to follow the redirect to the correct page.

b '\r\n\r\n<!doctype html>\r\n<html lang="nl" class="popup">\r\n<head><meta charset="utf-8" /><meta http-equiv="x-ua-compatible" content="ie=edge" /><script type="text/javascript">window.NREUM||(NREUM={});NREUM.info = {"beacon":"bam.nr-data.net","errorBeacon":"bam.nr-data.net","licenseKey":"b77bb8d6f4","applicationID":"1127348","transactionName":"b1ZWYxBWWUcDBRBZWVYcdWQyGERdBQgNXhhZQERP","queueTime":0,"applicationTime":8,"agent":"","atts":"QxFVFVhMFVoQSBBCX0h6UBVYFQUNIFNcdg4PBQAOIHR1dlNEGRwUTREOTEBjRVEMAhdkWUgeeFgFXllgGxYBEgwaflVFCVJDXQwBRhwUbEFRWQZEY1sSSyhfUVFdd1gGUhUOQBUXQF8MBRZKHw=="}</script><script type="text/javascript">(window.NREUM||(NREUM={})).loader_config={licenseKey:"b77bb8d6f4",applicationID:"1127348"};window.NREUM||(NREUM={}),__nr_require=function(e,n,t){function r(t){if(!n[t]){var i=n[t]={exports:{}};e[t][0].call(i.exports,function(n){var i=e[t][1][n];return r(i||n)},i,i.exports)}return n[t].exports}if("function"==typeof __nr_require)return __nr_require;for(var i=0;i<t.length;i++)r(t[i]);return r}({1:[function(e,n,t){function r(){}function i(e,n,t){return function(){return o(e,[u.now()].concat(f(arguments)),n?null:this,t),n?void 0:this}}var o=e("handle"),a=e(4),f=e(5),c=e("ee").get("tracer"),u=e("loader"),s=NREUM;"undefined"==typeof window.newrelic&&(newrelic=s);var p=["setPageViewName","setCustomAttribute","setErrorHandler","finished","addToTrace","inlineHit","addRelease"],l="api-",d=l+"ixn-";a(p,function(e,n){s[n]=i(l+n,!0,"api")}),s.addPageAction=i(l+"addPageAction",!0),s.setCurrentRouteName=i(l+"routeName",!0),n.exports=newrelic,s.interaction=function(){return(new r).get()};var m=r.prototype={createTracer:function(e,n){var t={},r=this,i="function"==typeof n;return o(d+"tracer",[u.now(),e,t],r),function(){if(c.emit((i?"":"no-")+"fn-start",[u.now(),r,i],t),i)try{return n.apply(this,arguments)}catch(e){throw 
c.emit("fn-err",[arguments,this,e],t),e}finally{c.emit("fn-end",[u.now()],t)}}}};a("actionText,setName,setAttribute,save,ignore,onEnd,getContext,end,get".split(","),function(e,n){m[n]=i(d+n)}),newrelic.noticeError=function(e,n){"string"==typeof e&&(e=new 
Error(e)), o("err", [e, u.now(), !1, n])
}
}, {}], 2: [function(e, n, t) {
    function r(e, n) {
        var t = e.getEntries();
        t.forEach(function(e) {
            "first-paint" === e.name ? c("timing", ["fp", Math.floor(e.startTime)]) : "first-contentful-paint" === e.name && c("timing", ["fcp", Math.floor(e.startTime)])
        })
    }

    function i(e, n) {
        var t = e.getEntries();
        t.length > 0 && c("lcp", [t[t.length - 1]])
    }

    function o(e) {
        if (e instanceof s && !l) {
            var n, t = Math.round(e.timeStamp);
            n = t > 1e12 ? Date.now() - t : u.now() - t, l = !0, c("timing", ["fi", t, {
                type: e.type,
                fid: n
            }])
        }
    }
    if (!("init" in NREUM && "page_view_timing" in NREUM.init && "enabled" in NREUM.init.page_view_timing && NREUM.init.page_view_timing.enabled === !1)) {
        var a, f, c = e("handle"),
            u = e("loader"),
            s = NREUM.o.EV;
        if ("PerformanceObserver" in window && "function" == typeof window.PerformanceObserver) {
            a = new PerformanceObserver(r), f = new PerformanceObserver(i);
            try {
                a.observe({
                    entryTypes: ["paint"]
                }), f.observe({
                    entryTypes: ["largest-contentful-paint"]
                })
            } catch (p) {}
        }
        if ("addEventListener" in document) {
            var l = !1,
                d = ["click", "keydown", "mousedown", "pointerdown", "touchstart"];
            d.forEach(function(e) {
                document.addEventListener(e, o, !1)
            })
        }
    }
}, {}], 3: [function(e, n, t) {
    function r(e, n) {
        if (!i) return !1;
        if (e !== i) return !1;
        if (!n) return !0;
        if (!o) return !1;
        for (var t = o.split("."), r = n.split("."), a = 0; a < r.length; a++)
            if (r[a] !== t[a]) return !1;
        return !0
    }
    var i = null,
        o = null,
        a = /Version\\/ (\\S + )\\ s + Safari / ;
    if (navigator.userAgent) {
        var f = navigator.userAgent,
            c = f.match(a);
        c && f.indexOf("Chrome") === -1 && f.indexOf("Chromium") === -1 && (i = "Safari", o = c[1])
    }
    n.exports = {
        agent: i,
        version: o,
        match: r
    }
}, {}], 4: [function(e, n, t) {
    function r(e, n) {
        var t = [],
            r = "",
            o = 0;
        for (r in e) i.call(e, r) && (t[o] = n(r, e[r]), o += 1);
        return t
    }
    var i = Object.prototype.hasOwnProperty;
    n.exports = r
}, {}], 5: [function(e, n, t) {
    function r(e, n, t) {
        n || (n = 0), "undefined" == typeof t && (t = e ? e.length : 0);
        for (var r = -1, i = t - n || 0, o = Array(i < 0 ? 0 : i); ++r < i;) o[r] = e[n + r];
        return o
    }
    n.exports = r
}, {}], 6: [function(e, n, t) {
    n.exports = {
        exists: "undefined" != typeof window.performance && window.performance.timing && "undefined" != typeof window.performance.timing.navigationStart
    }
}, {}], ee: [function(e, n, t) {
    function r() {}

    function i(e) {
        function n(e) {
            return e && e instanceof r ? e : e ? c(e, f, o) : o()
        }

        function t(t, r, i, o) {
            if (!l.aborted || o) {
                e && e(t, r, i);
                for (var a = n(i), f = v(t), c = f.length, u = 0; u < c; u++) f[u].apply(a, r);
                var p = s[y[t]];
                return p && p.push([b, t, r, a]), a
            }
        }

        function d(e, n) {
            h[e] = v(e).concat(n)
        }

        function m(e, n) {
            var t = h[e];
            if (t)
                for (var r = 0; r < t.length; r++) t[r] === n && t.splice(r, 1)
        }

        function v(e) {
            return h[e] || []
        }

        function g(e) {
            return p[e] = p[e] || i(t)
        }

        function w(e, n) {
            u(e, function(e, t) {
                n = n || "feature", y[t] = n, n in s || (s[n] = [])
            })
        }
        var h = {},
            y = {},
            b = {
                on: d,
                addEventListener: d,
                removeEventListener: m,
                emit: t,
                get: g,
                listeners: v,
                context: n,
                buffer: w,
                abort: a,
                aborted: !1
            };
        return b
    }

    function o() {
        return new r
    }

    function a() {
        (s.api || s.feature) && (l.aborted = !0, s = l.backlog = {})
    }
    var f = "nr@context",
        c = e("gos"),
        u = e(4),
        s = {},
        p = {},
        l = n.exports = i();
    l.backlog = s
}, {}], gos: [function(e, n, t) {
    function r(e, n, t) {
        if (i.call(e, n)) return e[n];
        var r = t();
        if (Object.defineProperty && Object.keys) try {
            return Object.defineProperty(e, n, {
                value: r,
                writable: !0,
                enumerable: !1
            }), r
        } catch (o) {}
        return e[n] = r, r
    }
    var i = Object.prototype.hasOwnProperty;
    n.exports = r
}, {}], handle: [function(e, n, t) {
    function r(e, n, t, r) {
        i.buffer([e], r), i.emit(e, n, t)
    }
    var i = e("ee").get("handle");
    n.exports = r, r.ee = i
}, {}], id: [function(e, n, t) {
    function r(e) {
        var n = typeof e;
        return !e || "object" !== n && "function" !== n ? -1 : e === window ? 0 : a(e, o, function() {
            return i++
        })
    }
    var i = 1,
        o = "nr@id",
        a = e("gos");
    n.exports = r
}, {}], loader: [function(e, n, t) {
    function r() {
        if (!x++) {
            var e = E.info = NREUM.info,
                n = d.getElementsByTagName("script")[0];
            if (setTimeout(s.abort, 3e4), !(e && e.licenseKey && e.applicationID && n)) return s.abort();
            u(y, function(n, t) {
                e[n] || (e[n] = t)
            }), c("mark", ["onload", a() + E.offset], null, "api");
            var t = d.createElement("script");
            t.src = "https://" + e.agent, n.parentNode.insertBefore(t, n)
        }
    }

    function i() {
        "complete" === d.readyState && o()
    }

    function o() {
        c("mark", ["domContent", a() + E.offset], null, "api")
    }

    function a() {
        return O.exists && performance.now ? Math.round(performance.now()) : (f = Math.max((new Date).getTime(), f)) - E.offset
    }
    var f = (new Date).getTime(),
        c = e("handle"),
        u = e(4),
        s = e("ee"),
        p = e(3),
        l = window,
        d = l.document,
        m = "addEventListener",
        v = "attachEvent",
        g = l.XMLHttpRequest,
        w = g && g.prototype;
    NREUM.o = {
        ST: setTimeout,
        SI: l.setImmediate,
        CT: clearTimeout,
        XHR: g,
        REQ: l.Request,
        EV: l.Event,
        PR: l.Promise,
        MO: l.MutationObserver
    };
    var h = "" + location,
        y = {
            beacon: "bam.nr-data.net",
            errorBeacon: "bam.nr-data.net",
            agent: "js-agent.newrelic.com/nr-1167.min.js"
        },
        b = g && w && w[m] && !/CriOS/.test(navigator.userAgent),
        E = n.exports = {
            offset: f,
            now: a,
            origin: h,
            features: {},
            xhrWrappable: b,
            userAgent: p
        };
    e(1), e(2), d[m] ? (d[m]("DOMContentLoaded", o, !1), l[m]("load", r, !1)) : (d[v]("onreadystatechange", i), l[v]("onload", r)), c("mark", ["firstbyte", f], null, "api");
    var x = 0,
        O = e(6)
}, {}], "wrap-function": [function(e, n, t) {
    function r(e) {
        return !(e && e instanceof Function && e.apply && !e[a])
    }
    var i = e("ee"),
        o = e(5),
        a = "nr@original",
        f = Object.prototype.hasOwnProperty,
        c = !1;
    n.exports = function(e, n) {
        function t(e, n, t, i) {
            function nrWrapper() {
                var r, a, f, c;
                try {
                    a = this, r = o(arguments), f = "function" == typeof t ? t(r, a) : t || {}
                } catch (u) {
                    l([u, "", [r, a, i], f])
                }
                s(n + "start", [r, a, i], f);
                try {
                    return c = e.apply(a, r)
                } catch (p) {
                    throw s(n + "err", [r, a, p], f), p
                } finally {
                    s(n + "end", [r, a, c], f)
                }
            }
            return r(e) ? e : (n || (n = ""), nrWrapper[a] = e, p(e, nrWrapper), nrWrapper)
        }

        function u(e, n, i, o) {
            i || (i = "");
            var a, f, c, u = "-" === i.charAt(0);
            for (c = 0; c < n.length; c++) f = n[c], a = e[f], r(a) || (e[f] = t(a, u ? f + i : i, o, f))
        }

        function s(t, r, i) {
            if (!c || n) {
                var o = c;
                c = !0;
                try {
                    e.emit(t, r, i, n)
                } catch (a) {
                    l([a, t, r, i])
                }
                c = o
            }
        }

        function p(e, n) {
            if (Object.defineProperty && Object.keys) try {
                var t = Object.keys(e);
                return t.forEach(function(t) {
                    Object.defineProperty(n, t, {
                        get: function() {
                            return e[t]
                        },
                        set: function(n) {
                            return e[t] = n, n
                        }
                    })
                }), n
            } catch (r) {
                l([r])
            }
            for (var i in e) f.call(e, i) && (n[i] = e[i]);
            return n
        }

        function l(n) {
            try {
                e.emit("internal-error", n)
            } catch (t) {}
        }
        return
        e || (e = i), t.inPlace = u, t.flag = a, t
    }
}, {}]
}, {}, ["loader"]); < /script><meta name="viewport" content="width=device-width, initial-scale=1.0" / > < meta name = "content-language"
content = "nl" / > < meta name = "adhese_location"
content = "_nl_top100k_other_" / > < title > \r\ n\ tTrends Top\ r\ n < /title><meta name="author" content="Developed by Natch for Roularta Business Information" / > < link href = "https://fonts.googleapis.com/css?family=Roboto&amp;display=swap"
rel = "stylesheet" / > < link href = "/style/core?v=-yOH0Sz6o2VJZHvzdEhDFznPCZdutE-dhAVbJgS1mJE1"
rel = "stylesheet" / > \r\ n < link rel = "stylesheet"
href = "https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.10.2/css/all.min.css"
integrity = "sha256-zmfNZmXoNWBMemUOo1XUGFfc0ihGGLYdgtJS3KCr/l0="
crossorigin = "anonymous" / > \r\ n < script src = "https://cdnjs.cloudflare.com/ajax/libs/jquery/3.4.1/jquery.min.js"
integrity = "sha256-CSXorXvZcTkaix6Yvo6HppcZGetbYMGWSFlBw8HfCJo="
crossorigin = "anonymous" > < /script>\r\n    <script type="text/javascript
">\r\n        !window.jQuery && document.write(unescape(\'%3Cscript src=" / scripts / jquery - 3.4.1. min.js "%3E%3C/script%3E\'))\r\n    </script>\r\n    \r\n\r\n<link rel="
apple - touch - icon " sizes="
180 x180 " href=" / img / favicons / apple - touch - icon.png ">\r\n<link rel="
icon " type="
image / png " sizes="
32 x32 " href=" / img / favicons / favicon - 32 x32.png ">\r\n<link rel="
icon " type="
image / png " sizes="
16 x16 " href=" / img / favicons / favicon - 16 x16.png ">\r\n<link rel="
manifest " href=" / img / favicons / site.webmanifest ">\r\n<link rel="
mask - icon " href=" / img / favicons / safari - pinned - tab.svg " color="
#5bbad5">\r\n<link rel= "shortcut icon"
href = "/img/favicons/favicon.ico" > \r\ n < meta name = "apple-mobile-web-app-title"
content = "Trends Top" > \r\ n < meta name = "application-name"
content = "Trends Top" > \r\ n < meta name = "msapplication-TileColor"
content = "#da532c" > \r\ n < meta name = "msapplication-config"
content = "/img/favicons/browserconfig.xml" > \r\ n < meta name = "theme-color"
content = "#ffffff" > \r\ n < base target = "_top" > \r\ n < meta name = "robots"
content = "noindex" / > \r\ n < style > \r\ n.label - secondary {\
    r\ n font - weight: normal;\
    r\ n padding - top: 6 px;\
    r\ n margin - right: 15 px;\
    r\ n
    float: right;\
    r\ n
}\
r\ n\ r\ n.label - secondary a {\
    r\ n font - weight: normal;\
    r\ n
}\
r\ n\ r\ n.extra - links {\
    r\ n
    text - align: right;\
    r\ n padding - right: 5 px;\
    r\ n margin - top: -2 px;\
    r\ n
}\
r\ n < /style>\r\n</head > \r\ n < body > \r\ n\ r\ n < div
class = "container" > \r\ n\ r\ n\ r\ n < div id = "InpageTitle"
class = "page-header" > \r\ n < h1 > \r\ n Inloggen < /h1>\r\n    </div > \r\ n\ r\ n\ r\ n\ r\ n\ r\ n\ r\ n\ r\ n\ r\ n < div id = "LoginSuccessPanel" > \r\ n\ t\ r\ n < p > \r\ n Paswoord ok.Even geduld.\r\ n < /p>\r\n    \r\n</div > \r\ n\ r\ n\ r\ n < /div>\r\n    <script src="/bundles / core ? v = mpa - _ZQEPT4Bz_k5Me - nNGLzFTwhQxL9 - hAxx6mNU_M1 "></script>\r\n\r\n    \r\n        \r\n<script type="
text / javascript ">\r\n<!--//--><![CDATA[//><!--\r\n    var pp_gemius_identifier = \'ndo1lDLvFYWs_HhuygUZRaRu7O8uwGx1.xgqnRlSTQn.H7\';\r\n    var pp_gemius_extraparameters = new Array(\'lan=NL\', \'key=trendstop\');\r\n\r\n    // lines below shouldn\'t be edited\r\n    (function (d, t) { try { var gt = d.createElement(t), s = d.getElementsByTagName(t)[0], l = \'http\' + ((location.protocol == \'https:\') ? \'s\' : \'\'); gt.setAttribute(\'async\', \'async\'); gt.setAttribute(\'defer\', \'defer\'); gt.src = l + \'://gabe.hit.gemius.pl/xgemius.js\'; s.parentNode.insertBefore(gt, s); } catch (e) { } })(document, \'script\');\r\n    //--><!]]>\r\n</script>\r\n\r\n        \r\n<script type="
text / javascript ">\r\n    var _gaq = _gaq || [];\r\n    _gaq.push([\'_setVar\', \'Customer:sspi46\']);_gaq.push([\'_setCustomVar\', 1, \'marketing\', \'sspi46\', 3]);_gaq.push([\'_setCustomVar\', 2, \'LoginType\', \'marketing\', 3]);_gaq.push([\'_setCustomVar\', 3, \'LoginCode\', \'sspi46\', 3]);\r\n    _gaq.push([\'_setAccount\', \'UA-343384-1\']); _gaq.push([\'_setDomainName\', \'none\']); _gaq.push([\'_setAllowHash\', false]); _gaq.push([\'_setAllowLinker\', true]); _gaq.push([\'_trackPageview\']); _gaq.push([\'_trackPageLoadTime\']);\r\n    _gaq.push([\'rt._setAccount\', \'UA-8272409-1\']); _gaq.push([\'rt._setDomainName\', \'none\']); _gaq.push([\'rt._setAllowHash\', false]); _gaq.push([\'rt._setAllowLinker\', true]); _gaq.push([\'rt._trackPageview\']);\r\n    _gaq.push([\'mt._setAccount\', \'UA-11504259-2\']); _gaq.push([\'mt._setDomainName\', \'none\']); _gaq.push([\'mt._setAllowHash\', false]); _gaq.push([\'mt._setAllowLinker\', true]); _gaq.push([\'mt._trackPageview\']);\r\n    (function () {\r\n        var ga = document.createElement(\'script\'); ga.type = \'text/javascript\'; ga.async = true;\r\n        ga.src = \'https://ssl.google-analytics.com/ga.js\';\r\n        var s = document.getElementsByTagName(\'script\')[0]; s.parentNode.insertBefore(ga, s);\r\n    })();\r\n</script>\r\n\r\n    \r\n    \r\n    <script>\r\n        $(function () {\r\n            if (navigator.userAgent.search("
Chrome ") >= 0 || navigator.userAgent.search("
Safari ") >= 0) {\r\n                $(\'#PasswordText\').attr(\'readonly\', true);\r\n\r\n                $("#
PasswordText ").mouseenter(function () {\r\n                    $(\'#PasswordText\').attr(\'readonly\', false);\r\n                });\r\n\r\n                $("#
PasswordText ").focus(function () {\r\n                    $(\'#PasswordText\').attr(\'readonly\', false);\r\n                });\r\n            }\r\n        });\r\n\r\n        function RedirectParent(url) {\r\n            var w = parent  || window;\
r\ n w.$("body").css("cursor", "progress");\
r\ n w.$(\'<div class="ajax-inprogress"></div>\').hide().prependTo(\'body\').fadeIn(200);\r\n\r\n            if (!url || url.length == 0) {\r\n                if (typeof w.returnUrl != \'undefined\') {\r\n                    w.document.location.href = w.returnUrl;\r\n                    return;\r\n                }\r\n\r\n                // without the hash/fragment\r\n                w.document.location.href = w.document.location.origin + w.document.location.pathname;\r\n                return;\r\n            }\r\n\r\n            w.document.location.href = url;\r\n        }\r\n    </script>\r\n    <script>RedirectParent(\'/nl/home.aspx\')</script>\r\n\r\n</body>\r\n</html>\r\n'

Upvotes: 0

Views: 615

Answers (1)

mdaniel
mdaniel

Reputation: 33158

As best we can tell with the limited information, it looks like it just does a bunch of tracking stuff, and then sets the browser's location to /nl/home.aspx, so while we can't prove it's correct, a reasonable first step is just to mimic that behavior and see how far it gets you:

def parse(self, response):
    """Submit the sign-in form and hand the result to ``after_login0``."""
    # Only the two credential fields are supplied explicitly.
    credentials = {
        'ctl00$MainContent$UserNameText': 'XXXXX',
        'ctl00$MainContent$PasswordText': 'XXXXX',
    }
    login_request = FormRequest.from_response(
        response,
        formdata=credentials,
        callback=self.after_login0,
    )
    return login_request

def after_login0(self, response):
    """Request the home page by hand, mimicking the page's JavaScript redirect."""
    # The path is the one the login page passes to RedirectParent(...).
    home_request = response.follow("/nl/home.aspx", callback=self.after_login)
    yield home_request

def after_login(self, response):
    """Log the raw body of the page reached after the manual redirect."""
    page_body = response.body
    self.log(page_body)

If you wanted a little future-proofing, you could extract that redirect path out of the response.body in after_login0, but for the purposes of just testing the theory, hard-coding the path seems fine.

Upvotes: 1

Related Questions