Reputation: 925
I am able to authenticate a login session using the following code.
However, after logging in, the site redirects to the homepage using JavaScript. When running open_in_browser(response), the browser is directed to /nl/home.aspx instead of XXX.be/nl/home.aspx.
I am pretty new to scrapy and I am probably missing something crucial, but I am not sure what I am doing wrong.
class XXXSpider(scrapy.Spider):
    """Spider that submits the sign-in form and inspects the login response."""

    name = 'XXX'
    allowed_domains = ['XXX.be']
    # NOTE(review): Scrapy request URLs must include a scheme (e.g.
    # "https://"); the placeholder below looks like it was anonymised
    # without one -- confirm against the real URL.
    start_urls = ['XXX.be/nl/signin.aspx']

    def parse(self, response):
        """Fill in the login form from the fetched page and submit it.

        The resulting response is handed to ``after_login``.
        """
        return FormRequest.from_response(
            response,
            formdata={
                'ctl00$MainContent$UserNameText': 'XXXXX',
                'ctl00$MainContent$PasswordText': 'XXXXX',
            },
            callback=self.after_login,
        )

    def after_login(self, response):
        """Log the raw body of the login response and open it in a browser."""
        self.log(response.body)
        open_in_browser(response)
Output of self.log(response.body) (ideally this would be an HTML page):
I've looked into Splash to handle the JavaScript, but I've been unable to follow the redirect to the correct page.
b '\r\n\r\n<!doctype html>\r\n<html lang="nl" class="popup">\r\n<head><meta charset="utf-8" /><meta http-equiv="x-ua-compatible" content="ie=edge" /><script type="text/javascript">window.NREUM||(NREUM={});NREUM.info = {"beacon":"bam.nr-data.net","errorBeacon":"bam.nr-data.net","licenseKey":"b77bb8d6f4","applicationID":"1127348","transactionName":"b1ZWYxBWWUcDBRBZWVYcdWQyGERdBQgNXhhZQERP","queueTime":0,"applicationTime":8,"agent":"","atts":"QxFVFVhMFVoQSBBCX0h6UBVYFQUNIFNcdg4PBQAOIHR1dlNEGRwUTREOTEBjRVEMAhdkWUgeeFgFXllgGxYBEgwaflVFCVJDXQwBRhwUbEFRWQZEY1sSSyhfUVFdd1gGUhUOQBUXQF8MBRZKHw=="}</script><script type="text/javascript">(window.NREUM||(NREUM={})).loader_config={licenseKey:"b77bb8d6f4",applicationID:"1127348"};window.NREUM||(NREUM={}),__nr_require=function(e,n,t){function r(t){if(!n[t]){var i=n[t]={exports:{}};e[t][0].call(i.exports,function(n){var i=e[t][1][n];return r(i||n)},i,i.exports)}return n[t].exports}if("function"==typeof __nr_require)return __nr_require;for(var i=0;i<t.length;i++)r(t[i]);return r}({1:[function(e,n,t){function r(){}function i(e,n,t){return function(){return o(e,[u.now()].concat(f(arguments)),n?null:this,t),n?void 0:this}}var o=e("handle"),a=e(4),f=e(5),c=e("ee").get("tracer"),u=e("loader"),s=NREUM;"undefined"==typeof window.newrelic&&(newrelic=s);var p=["setPageViewName","setCustomAttribute","setErrorHandler","finished","addToTrace","inlineHit","addRelease"],l="api-",d=l+"ixn-";a(p,function(e,n){s[n]=i(l+n,!0,"api")}),s.addPageAction=i(l+"addPageAction",!0),s.setCurrentRouteName=i(l+"routeName",!0),n.exports=newrelic,s.interaction=function(){return(new r).get()};var m=r.prototype={createTracer:function(e,n){var t={},r=this,i="function"==typeof n;return o(d+"tracer",[u.now(),e,t],r),function(){if(c.emit((i?"":"no-")+"fn-start",[u.now(),r,i],t),i)try{return n.apply(this,arguments)}catch(e){throw 
c.emit("fn-err",[arguments,this,e],t),e}finally{c.emit("fn-end",[u.now()],t)}}}};a("actionText,setName,setAttribute,save,ignore,onEnd,getContext,end,get".split(","),function(e,n){m[n]=i(d+n)}),newrelic.noticeError=function(e,n){"string"==typeof e&&(e=new
Error(e)), o("err", [e, u.now(), !1, n])
}
}, {}], 2: [function(e, n, t) {
function r(e, n) {
var t = e.getEntries();
t.forEach(function(e) {
"first-paint" === e.name ? c("timing", ["fp", Math.floor(e.startTime)]) : "first-contentful-paint" === e.name && c("timing", ["fcp", Math.floor(e.startTime)])
})
}
function i(e, n) {
var t = e.getEntries();
t.length > 0 && c("lcp", [t[t.length - 1]])
}
function o(e) {
if (e instanceof s && !l) {
var n, t = Math.round(e.timeStamp);
n = t > 1e12 ? Date.now() - t : u.now() - t, l = !0, c("timing", ["fi", t, {
type: e.type,
fid: n
}])
}
}
if (!("init" in NREUM && "page_view_timing" in NREUM.init && "enabled" in NREUM.init.page_view_timing && NREUM.init.page_view_timing.enabled === !1)) {
var a, f, c = e("handle"),
u = e("loader"),
s = NREUM.o.EV;
if ("PerformanceObserver" in window && "function" == typeof window.PerformanceObserver) {
a = new PerformanceObserver(r), f = new PerformanceObserver(i);
try {
a.observe({
entryTypes: ["paint"]
}), f.observe({
entryTypes: ["largest-contentful-paint"]
})
} catch (p) {}
}
if ("addEventListener" in document) {
var l = !1,
d = ["click", "keydown", "mousedown", "pointerdown", "touchstart"];
d.forEach(function(e) {
document.addEventListener(e, o, !1)
})
}
}
}, {}], 3: [function(e, n, t) {
function r(e, n) {
if (!i) return !1;
if (e !== i) return !1;
if (!n) return !0;
if (!o) return !1;
for (var t = o.split("."), r = n.split("."), a = 0; a < r.length; a++)
if (r[a] !== t[a]) return !1;
return !0
}
var i = null,
o = null,
a = /Version\\/ (\\S + )\\ s + Safari / ;
if (navigator.userAgent) {
var f = navigator.userAgent,
c = f.match(a);
c && f.indexOf("Chrome") === -1 && f.indexOf("Chromium") === -1 && (i = "Safari", o = c[1])
}
n.exports = {
agent: i,
version: o,
match: r
}
}, {}], 4: [function(e, n, t) {
function r(e, n) {
var t = [],
r = "",
o = 0;
for (r in e) i.call(e, r) && (t[o] = n(r, e[r]), o += 1);
return t
}
var i = Object.prototype.hasOwnProperty;
n.exports = r
}, {}], 5: [function(e, n, t) {
function r(e, n, t) {
n || (n = 0), "undefined" == typeof t && (t = e ? e.length : 0);
for (var r = -1, i = t - n || 0, o = Array(i < 0 ? 0 : i); ++r < i;) o[r] = e[n + r];
return o
}
n.exports = r
}, {}], 6: [function(e, n, t) {
n.exports = {
exists: "undefined" != typeof window.performance && window.performance.timing && "undefined" != typeof window.performance.timing.navigationStart
}
}, {}], ee: [function(e, n, t) {
function r() {}
function i(e) {
function n(e) {
return e && e instanceof r ? e : e ? c(e, f, o) : o()
}
function t(t, r, i, o) {
if (!l.aborted || o) {
e && e(t, r, i);
for (var a = n(i), f = v(t), c = f.length, u = 0; u < c; u++) f[u].apply(a, r);
var p = s[y[t]];
return p && p.push([b, t, r, a]), a
}
}
function d(e, n) {
h[e] = v(e).concat(n)
}
function m(e, n) {
var t = h[e];
if (t)
for (var r = 0; r < t.length; r++) t[r] === n && t.splice(r, 1)
}
function v(e) {
return h[e] || []
}
function g(e) {
return p[e] = p[e] || i(t)
}
function w(e, n) {
u(e, function(e, t) {
n = n || "feature", y[t] = n, n in s || (s[n] = [])
})
}
var h = {},
y = {},
b = {
on: d,
addEventListener: d,
removeEventListener: m,
emit: t,
get: g,
listeners: v,
context: n,
buffer: w,
abort: a,
aborted: !1
};
return b
}
function o() {
return new r
}
function a() {
(s.api || s.feature) && (l.aborted = !0, s = l.backlog = {})
}
var f = "nr@context",
c = e("gos"),
u = e(4),
s = {},
p = {},
l = n.exports = i();
l.backlog = s
}, {}], gos: [function(e, n, t) {
function r(e, n, t) {
if (i.call(e, n)) return e[n];
var r = t();
if (Object.defineProperty && Object.keys) try {
return Object.defineProperty(e, n, {
value: r,
writable: !0,
enumerable: !1
}), r
} catch (o) {}
return e[n] = r, r
}
var i = Object.prototype.hasOwnProperty;
n.exports = r
}, {}], handle: [function(e, n, t) {
function r(e, n, t, r) {
i.buffer([e], r), i.emit(e, n, t)
}
var i = e("ee").get("handle");
n.exports = r, r.ee = i
}, {}], id: [function(e, n, t) {
function r(e) {
var n = typeof e;
return !e || "object" !== n && "function" !== n ? -1 : e === window ? 0 : a(e, o, function() {
return i++
})
}
var i = 1,
o = "nr@id",
a = e("gos");
n.exports = r
}, {}], loader: [function(e, n, t) {
function r() {
if (!x++) {
var e = E.info = NREUM.info,
n = d.getElementsByTagName("script")[0];
if (setTimeout(s.abort, 3e4), !(e && e.licenseKey && e.applicationID && n)) return s.abort();
u(y, function(n, t) {
e[n] || (e[n] = t)
}), c("mark", ["onload", a() + E.offset], null, "api");
var t = d.createElement("script");
t.src = "https://" + e.agent, n.parentNode.insertBefore(t, n)
}
}
function i() {
"complete" === d.readyState && o()
}
function o() {
c("mark", ["domContent", a() + E.offset], null, "api")
}
function a() {
return O.exists && performance.now ? Math.round(performance.now()) : (f = Math.max((new Date).getTime(), f)) - E.offset
}
var f = (new Date).getTime(),
c = e("handle"),
u = e(4),
s = e("ee"),
p = e(3),
l = window,
d = l.document,
m = "addEventListener",
v = "attachEvent",
g = l.XMLHttpRequest,
w = g && g.prototype;
NREUM.o = {
ST: setTimeout,
SI: l.setImmediate,
CT: clearTimeout,
XHR: g,
REQ: l.Request,
EV: l.Event,
PR: l.Promise,
MO: l.MutationObserver
};
var h = "" + location,
y = {
beacon: "bam.nr-data.net",
errorBeacon: "bam.nr-data.net",
agent: "js-agent.newrelic.com/nr-1167.min.js"
},
b = g && w && w[m] && !/CriOS/.test(navigator.userAgent),
E = n.exports = {
offset: f,
now: a,
origin: h,
features: {},
xhrWrappable: b,
userAgent: p
};
e(1), e(2), d[m] ? (d[m]("DOMContentLoaded", o, !1), l[m]("load", r, !1)) : (d[v]("onreadystatechange", i), l[v]("onload", r)), c("mark", ["firstbyte", f], null, "api");
var x = 0,
O = e(6)
}, {}], "wrap-function": [function(e, n, t) {
function r(e) {
return !(e && e instanceof Function && e.apply && !e[a])
}
var i = e("ee"),
o = e(5),
a = "nr@original",
f = Object.prototype.hasOwnProperty,
c = !1;
n.exports = function(e, n) {
function t(e, n, t, i) {
function nrWrapper() {
var r, a, f, c;
try {
a = this, r = o(arguments), f = "function" == typeof t ? t(r, a) : t || {}
} catch (u) {
l([u, "", [r, a, i], f])
}
s(n + "start", [r, a, i], f);
try {
return c = e.apply(a, r)
} catch (p) {
throw s(n + "err", [r, a, p], f), p
} finally {
s(n + "end", [r, a, c], f)
}
}
return r(e) ? e : (n || (n = ""), nrWrapper[a] = e, p(e, nrWrapper), nrWrapper)
}
function u(e, n, i, o) {
i || (i = "");
var a, f, c, u = "-" === i.charAt(0);
for (c = 0; c < n.length; c++) f = n[c], a = e[f], r(a) || (e[f] = t(a, u ? f + i : i, o, f))
}
function s(t, r, i) {
if (!c || n) {
var o = c;
c = !0;
try {
e.emit(t, r, i, n)
} catch (a) {
l([a, t, r, i])
}
c = o
}
}
function p(e, n) {
if (Object.defineProperty && Object.keys) try {
var t = Object.keys(e);
return t.forEach(function(t) {
Object.defineProperty(n, t, {
get: function() {
return e[t]
},
set: function(n) {
return e[t] = n, n
}
})
}), n
} catch (r) {
l([r])
}
for (var i in e) f.call(e, i) && (n[i] = e[i]);
return n
}
function l(n) {
try {
e.emit("internal-error", n)
} catch (t) {}
}
return
e || (e = i), t.inPlace = u, t.flag = a, t
}
}, {}]
}, {}, ["loader"]); < /script><meta name="viewport" content="width=device-width, initial-scale=1.0" / > < meta name = "content-language"
content = "nl" / > < meta name = "adhese_location"
content = "_nl_top100k_other_" / > < title > \r\ n\ tTrends Top\ r\ n < /title><meta name="author" content="Developed by Natch for Roularta Business Information" / > < link href = "https://fonts.googleapis.com/css?family=Roboto&display=swap"
rel = "stylesheet" / > < link href = "/style/core?v=-yOH0Sz6o2VJZHvzdEhDFznPCZdutE-dhAVbJgS1mJE1"
rel = "stylesheet" / > \r\ n < link rel = "stylesheet"
href = "https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.10.2/css/all.min.css"
integrity = "sha256-zmfNZmXoNWBMemUOo1XUGFfc0ihGGLYdgtJS3KCr/l0="
crossorigin = "anonymous" / > \r\ n < script src = "https://cdnjs.cloudflare.com/ajax/libs/jquery/3.4.1/jquery.min.js"
integrity = "sha256-CSXorXvZcTkaix6Yvo6HppcZGetbYMGWSFlBw8HfCJo="
crossorigin = "anonymous" > < /script>\r\n <script type="text/javascript
">\r\n !window.jQuery && document.write(unescape(\'%3Cscript src=" / scripts / jquery - 3.4.1. min.js "%3E%3C/script%3E\'))\r\n </script>\r\n \r\n\r\n<link rel="
apple - touch - icon " sizes="
180 x180 " href=" / img / favicons / apple - touch - icon.png ">\r\n<link rel="
icon " type="
image / png " sizes="
32 x32 " href=" / img / favicons / favicon - 32 x32.png ">\r\n<link rel="
icon " type="
image / png " sizes="
16 x16 " href=" / img / favicons / favicon - 16 x16.png ">\r\n<link rel="
manifest " href=" / img / favicons / site.webmanifest ">\r\n<link rel="
mask - icon " href=" / img / favicons / safari - pinned - tab.svg " color="
#5bbad5">\r\n<link rel= "shortcut icon"
href = "/img/favicons/favicon.ico" > \r\ n < meta name = "apple-mobile-web-app-title"
content = "Trends Top" > \r\ n < meta name = "application-name"
content = "Trends Top" > \r\ n < meta name = "msapplication-TileColor"
content = "#da532c" > \r\ n < meta name = "msapplication-config"
content = "/img/favicons/browserconfig.xml" > \r\ n < meta name = "theme-color"
content = "#ffffff" > \r\ n < base target = "_top" > \r\ n < meta name = "robots"
content = "noindex" / > \r\ n < style > \r\ n.label - secondary {\
r\ n font - weight: normal;\
r\ n padding - top: 6 px;\
r\ n margin - right: 15 px;\
r\ n
float: right;\
r\ n
}\
r\ n\ r\ n.label - secondary a {\
r\ n font - weight: normal;\
r\ n
}\
r\ n\ r\ n.extra - links {\
r\ n
text - align: right;\
r\ n padding - right: 5 px;\
r\ n margin - top: -2 px;\
r\ n
}\
r\ n < /style>\r\n</head > \r\ n < body > \r\ n\ r\ n < div
class = "container" > \r\ n\ r\ n\ r\ n < div id = "InpageTitle"
class = "page-header" > \r\ n < h1 > \r\ n Inloggen < /h1>\r\n </div > \r\ n\ r\ n\ r\ n\ r\ n\ r\ n\ r\ n\ r\ n\ r\ n < div id = "LoginSuccessPanel" > \r\ n\ t\ r\ n < p > \r\ n Paswoord ok.Even geduld.\r\ n < /p>\r\n \r\n</div > \r\ n\ r\ n\ r\ n < /div>\r\n <script src="/bundles / core ? v = mpa - _ZQEPT4Bz_k5Me - nNGLzFTwhQxL9 - hAxx6mNU_M1 "></script>\r\n\r\n \r\n \r\n<script type="
text / javascript ">\r\n<!--//--><![CDATA[//><!--\r\n var pp_gemius_identifier = \'ndo1lDLvFYWs_HhuygUZRaRu7O8uwGx1.xgqnRlSTQn.H7\';\r\n var pp_gemius_extraparameters = new Array(\'lan=NL\', \'key=trendstop\');\r\n\r\n // lines below shouldn\'t be edited\r\n (function (d, t) { try { var gt = d.createElement(t), s = d.getElementsByTagName(t)[0], l = \'http\' + ((location.protocol == \'https:\') ? \'s\' : \'\'); gt.setAttribute(\'async\', \'async\'); gt.setAttribute(\'defer\', \'defer\'); gt.src = l + \'://gabe.hit.gemius.pl/xgemius.js\'; s.parentNode.insertBefore(gt, s); } catch (e) { } })(document, \'script\');\r\n //--><!]]>\r\n</script>\r\n\r\n \r\n<script type="
text / javascript ">\r\n var _gaq = _gaq || [];\r\n _gaq.push([\'_setVar\', \'Customer:sspi46\']);_gaq.push([\'_setCustomVar\', 1, \'marketing\', \'sspi46\', 3]);_gaq.push([\'_setCustomVar\', 2, \'LoginType\', \'marketing\', 3]);_gaq.push([\'_setCustomVar\', 3, \'LoginCode\', \'sspi46\', 3]);\r\n _gaq.push([\'_setAccount\', \'UA-343384-1\']); _gaq.push([\'_setDomainName\', \'none\']); _gaq.push([\'_setAllowHash\', false]); _gaq.push([\'_setAllowLinker\', true]); _gaq.push([\'_trackPageview\']); _gaq.push([\'_trackPageLoadTime\']);\r\n _gaq.push([\'rt._setAccount\', \'UA-8272409-1\']); _gaq.push([\'rt._setDomainName\', \'none\']); _gaq.push([\'rt._setAllowHash\', false]); _gaq.push([\'rt._setAllowLinker\', true]); _gaq.push([\'rt._trackPageview\']);\r\n _gaq.push([\'mt._setAccount\', \'UA-11504259-2\']); _gaq.push([\'mt._setDomainName\', \'none\']); _gaq.push([\'mt._setAllowHash\', false]); _gaq.push([\'mt._setAllowLinker\', true]); _gaq.push([\'mt._trackPageview\']);\r\n (function () {\r\n var ga = document.createElement(\'script\'); ga.type = \'text/javascript\'; ga.async = true;\r\n ga.src = \'https://ssl.google-analytics.com/ga.js\';\r\n var s = document.getElementsByTagName(\'script\')[0]; s.parentNode.insertBefore(ga, s);\r\n })();\r\n</script>\r\n\r\n \r\n \r\n <script>\r\n $(function () {\r\n if (navigator.userAgent.search("
Chrome ") >= 0 || navigator.userAgent.search("
Safari ") >= 0) {\r\n $(\'#PasswordText\').attr(\'readonly\', true);\r\n\r\n $("#
PasswordText ").mouseenter(function () {\r\n $(\'#PasswordText\').attr(\'readonly\', false);\r\n });\r\n\r\n $("#
PasswordText ").focus(function () {\r\n $(\'#PasswordText\').attr(\'readonly\', false);\r\n });\r\n }\r\n });\r\n\r\n function RedirectParent(url) {\r\n var w = parent || window;\
r\ n w.$("body").css("cursor", "progress");\
r\ n w.$(\'<div class="ajax-inprogress"></div>\').hide().prependTo(\'body\').fadeIn(200);\r\n\r\n if (!url || url.length == 0) {\r\n if (typeof w.returnUrl != \'undefined\') {\r\n w.document.location.href = w.returnUrl;\r\n return;\r\n }\r\n\r\n // without the hash/fragment\r\n w.document.location.href = w.document.location.origin + w.document.location.pathname;\r\n return;\r\n }\r\n\r\n w.document.location.href = url;\r\n }\r\n </script>\r\n <script>RedirectParent(\'/nl/home.aspx\')</script>\r\n\r\n</body>\r\n</html>\r\n'
Upvotes: 0
Views: 615
Reputation: 33158
As best we can tell with the limited information, it looks like it just does a bunch of tracking stuff and then sets the browser's location to /nl/home.aspx, so while we can't prove it's correct, a reasonable first step is just to mimic that behavior and see how far it gets you:
def parse(self, response):
    """Fill in the login form from the fetched page and submit it.

    The resulting response is handed to ``after_login0``.
    """
    return FormRequest.from_response(
        response,
        formdata={
            'ctl00$MainContent$UserNameText': 'XXXXX',
            'ctl00$MainContent$PasswordText': 'XXXXX',
        },
        callback=self.after_login0,
    )
def after_login0(self, response):
    """Request the home page that the login response redirects to.

    The login page performs its redirect from a script
    (``RedirectParent('/nl/home.aspx')``), which a plain Scrapy download
    does not run, so the same navigation is issued explicitly here.
    """
    # The path is hard-coded to test the theory; it could instead be
    # extracted from response.body.
    yield response.follow("/nl/home.aspx", callback=self.after_login)
def after_login(self, response):
    """Log the raw body of the response received by this callback."""
    self.log(response.body)
If you wanted a little future-proofing, you could extract that redirect path out of the response.body in after_login0, but for the purposes of just testing the theory, hard-coding the path seems fine.
Upvotes: 1