Reputation: 2064
When the page is loaded by headless Playwright + proxy, it works perfectly, every time.
When the page is loaded by scrapy-playwright, also headless and with the same proxy, it raises Timeout errors and the HTML content of the page object looks as if JS were disabled. Other domains work with scrapy-playwright without a problem.
I'm trying to load the page below with scrapy-playwright; however, it keeps raising Timeout exceptions. So I wrote an MRE in plain Playwright, and it successfully loads the page.
If I check the content of the page when the timeout is raised, I get the HTML of a page that is not getting rendered at all, asking to enable JS:
<!DOCTYPE html><html style="" class=" adownload no-applicationcache blobconstructor blob-constructor borderimage borderradius boxshadow boxsizing canvas canvastext checked classlist contenteditable no-contentsecuritypolicy no-contextmenu cors cssanimations csscalc csscolumns cssfilters cssgradients cssmask csspointerevents no-cssreflections cssremunit cssresize csstransforms3d csstransforms csstransitions cssvhunit cssvmaxunit cssvminunit cssvwunit dataset details deviceorientation displaytable display-table draganddrop fileinput filereader no-filesystem flexbox fullscreen geolocation getusermedia hashchange history hsla indexeddb inlinesvg json lastchild localstorage mathml mediaqueries meter multiplebgs notification objectfit object-fit opacity pagevisibility performance postmessage progressbar no-regions requestanimationframe raf rgba ruby scriptasync scriptdefer sharedworkers siblinggeneral smil no-strictmode no-stylescoped supports svg svgfilters textshadow no-time no-touchevents typedarrays userselect webaudio webgl websockets no-websqldatabase webworkers datalistelem video datauri svgasimg no-csshyphens"><head>\n<meta http-equiv="Pragma" content="no-cache">\n<meta http-equiv="Expires" content="-1">\n<meta http-equiv="CacheControl" content="no-cache">\n<meta http-equiv="Content-Type" content="text/html; charset=utf-8">\n<link rel="shortcut icon" href="data:;base64,iVBORw0KGgo=">\n\n<script type="text/javascript">\n(function(){\nwindow["bobcmn"] = "10111110101010200000005200000005200000006200000001254047669200000096200000000200000002300000000300000000300000006/TSPD/300000008TSPD_10130000000cTSPD_101_DID300000005https3000000b00821df8b06ab2000403b56c35c2a566711ae86a0540ec06085987bafb13c4f6ff50c290b8817631708a53aa37a0a28005ad630686158d96f8a68420d501cd849bbe9225e4b3c2a59471abb33f0767803e9e24ce82c1584f2300000002TS200000000200000000";\n\nwindow["failureConfig"] = 
"524f6f70732e2e2e2e736f6d657468696e672077656e742077726f6e672e2e2e2e20796f757220737570706f72742069642069733a2025444f534c372e6368616c6c656e67652e737570706f72745f6964252e143134303331333636303235333339313535303337062f545350442f171800";window.xEg=!!window.xEg;try{(function(){(function OL(){var z=!1;function s(z){for(var s=0;z--;)s+=I(document.documentElement,null);return s}function I(z,s){var l="vi";s=s||new J;return ZL(z,function(z){z.setAttribute("data-"+l,s.Z$());return I(z,s)},null)}function J(){this.lZ=1;this.zz=0;this.ol=this.lZ;this.Jo=null;this.Z$=function(){this.Jo=this.zz+this.ol;if(!isFinite(this.Jo))return this.reset(),this.Z$();this.zz=this.ol;this.ol=this.Jo;this.Jo=null;return this.ol};this.reset=function(){this.lZ++;this.zz=0;this.ol=this.lZ}}var l=!1;\nfunction LL(z,s){var I=document.createElement(z);s=s||document.body;s.appendChild(I);I&&I.style&&(I.style.display="none")}function oL(s,I){I=I||s;var J="|";function LL(z){z=z.split(J);var s=[];for(var I=0;I<z.length;++I){var l="",oL=z[I].split(",");for(var sL=0;sL<oL.length;++sL)l+=oL[sL][sL];s.push(l)}return s}var oL=0,ZL="datalist,details,embed,figure,hrimg,strong,article,formaddress|audio,blockquote,area,source,input|canvas,form,link,tbase,option,details,article";ZL.split(J);ZL=LL(ZL);ZL=new RegExp(ZL.join(J),\n"g");while(ZL.exec(s))ZL=new RegExp((""+new Date)[8],"g"),z&&(l=!0),++oL;return I(oL&&1)}function ZL(z,s,I){(I=I||l)&&LL("div",z);z=z.children;var J=0;for(var oL in z){I=z[oL];try{I instanceof HTMLElement&&(s(I),++J)}catch(ZL){}}return J}oL(OL,s)})();var zL=77;try{var SL,iL,jL=O(179)?1:0;for(var Lo=(O(293),0);Lo<iL;++Lo)jL+=O(831)?3:1;SL=jL;window.Ll===SL&&(window.Ll=++SL)}catch(zo){window.Ll=SL}var Zo=!0;function Z(L,z){L+=z;return L.toString(36)}\nfunction io(L){var z=46;!L||document[S(z,164,151,161,151,144,151,154,151,162,167,129,162,143,162,147)]&&document[S(z,164,151,161,151,144,151,154,151,162,167,129,162,143,162,147)]!==Z(68616527620,z)||(Zo=!1);return Zo}function _(L){var 
z=arguments.length,s=[],I=1;while(I<z)s[I-1]=arguments[I++]-L;return String.fromCharCode.apply(String,s)}function jo(){}io(window[jo[S(zL,187,174,186,178)]]===jo);io(typeof ie9rgb4!==_(zL,179,194,187,176,193,182,188,187));\nio(RegExp("\\x3c")[Z(1372128,zL)](function(){return"\\x3c"})&!RegExp(Z(42812,zL))[_(zL,193,178,192,193)](function(){return"\'x3\'+\'d\';"}));\nvar Jo=window[S(zL,174,193,193,174,176,181,146,195,178,187,193)]||RegExp(S(zL,186,188,175,182,201,174,187,177,191,188,182,177),Z(-59,zL))[Z(1372128,zL)](window["\\x6e\\x61vi\\x67a\\x74\\x6f\\x72"]["\\x75\\x73e\\x72A\\x67\\x65\\x6et"]),LO=+new Date+(O(823)?6E5:796558),oO,ZO,sO,SO=window[_(zL,192,178,193,161,182,186,178,188,194,193)],_O=Jo?O(710)?3E4:17796:O(382)?6E3:8380;\ndocument[S(zL,174,177,177,146,195,178,187,193,153,182,192,193,178,187,178,191)]&&document[S(zL,174,177,177,146,195,178,187,193,153,182,192,193,178,187,178,191)](_(zL,195,182,192,182,175,182,185,182,193,198,176,181,174,187,180,178),function(L){var z=62;document[S(z,180,167,177,167,160,167,170,167,178,183,145,178,159,178,163)]&&(document[S(z,180,167,177,167,160,167,170,167,178,183,145,178,159,178,163)]===_(z,166,167,162,162,163,172)&&L[_(z,167,177,146,176,179,177,178,163,162)]?sO=!0:document[_(z,180,\n167,177,167,160,167,170,167,178,183,145,178,159,178,163)]===Z(68616527604,z)&&(oO=+new Date,sO=!1,iO()))});function iO(){if(!document[S(33,146,150,134,147,154,116,134,141,134,132,149,144,147)])return!0;var L=+new Date;if(L>LO&&(O(239)?6E5:861172)>L-oO)return io(!1);var z=io(ZO&&!sO&&oO+_O<L);oO=L;ZO||(ZO=!0,SO(function(){ZO=!1},O(67)?1:0));return z}iO();var lO=[O(609)?17795081:23657822,O(959)?2147483647:27611931586,O(656)?1558153217:1536529909];\nfunction Lz(L){var z=52;L=typeof L===Z(1743045624,z)?L:L[S(z,168,163,135,168,166,157,162,155)](O(942)?34:36);var s=window[L];if(!s||!s[_(z,168,163,135,168,166,157,162,155)])return;var I=""+s;window[L]=function(L,z){ZO=!1;return s(L,z)};window[L][S(z,168,163,135,168,166,157,162,155)]=function(){return 
I}}for(var Oz=(O(206),0);Oz<lO[_(zL,185,178,187,180,193,181)];++Oz)Lz(lO[Oz]);io(!1!==window[S(zL,197,146,180)]);window.zJ=window.zJ||{};window.zJ.iZ="083d4956fd018000820ff31a1e95228c25ab1215a5be8e34271fe201bb7934e1197a4432d0146870b4459dacfbf23ddb5bd42a71c8651e07aaf8e6f722cedf63108d8f81e28b4fbdfc370b52b62c462e5ae24d1b58a68be5492d751bd200ff878afe2e5be04961ff09b99c17ef7723a06485b449117b5da3d233e0ceea7e8f11f0e14e41d50e41d1";\nfunction S(L){var z=arguments.length,s=[];for(var I=1;I<z;++I)s.push(arguments[I]-L);return String.fromCharCode.apply(String,s)}function Zz(L){var z=+new Date,s;!document[S(90,203,207,191,204,211,173,191,198,191,189,206,201,204,155,198,198)]||z>LO&&(O(404)?6E5:710795)>z-oO?s=io(!1):(s=io(ZO&&!sO&&oO+_O<z),oO=z,ZO||(ZO=!0,SO(function(){ZO=!1},O(819)?1:0)));return!(arguments[L]^s)}function O(L){return 924>L}(function sz(z){return z?0:sz(z)*sz(z)})(!0);})();}catch(x){}finally{ie9rgb4=void(0);};function ie9rgb4(a,b){return a>>b>>0};\n\n})();\n\n</script>\n\n<script type="text/javascript" src="/TSPD/0821df8b06ab20002f964fdcf5d1316fd5a886ae74ff674af5c2590daeda4a06311eeab330d3be34?type=10"></script>\n<noscript>Please enable JavaScript to view the page content.<br/>Your support ID is: 14031366025339155037.</noscript>\n</head><body>\n<form method="post" action="" enctype="multipart/form-data"><input type="hidden" name="_pd" value=""></form></body></html>'
Just to highlight from the HTML above:
<noscript>Please enable JavaScript to view the page content.<br/>Your support ID is: 14031366025339155037.</noscript>\n
- scrapy-playwright is properly installed, with its download handlers, etc. Other pages from other netlocs work with no problem.
- I set PLAYWRIGHT_MAX_PAGES_PER_CONTEXT and PLAYWRIGHT_MAX_CONTEXTS to 1 in order to debug; no improvement.

The errors raised are:
playwright._impl._errors.TimeoutError: Page.wait_for_selector: Timeout 30000ms exceeded.
Call log:
waiting for locator("#ctl00_ContentPlaceholder1_ucProductDetail_fvProductDetail_lblSellPrice") to be visible
and
playwright._impl._errors.TimeoutError: Page.goto: Timeout 90000ms exceeded.
Call log:
navigating to "https://wbmason.com/ProductDetail.aspx?ItemDesc=Green-Mountain-Coffee-Breakfast-Blend-Coffee-K-Cup-Pods-24-BX&ItemID=GMT6520&uom=BX&COID=&SearchID=907879066&ii=1", waiting until "networkidle"
class ExampleSpider(Spider):
    """Minimal spider that renders a single product page through scrapy-playwright.

    Every request is routed through the scrapy-playwright download handler,
    using headless Firefox behind the proxy described by ``SERVER``,
    ``USERNAME`` and ``PASSWORD``. The request waits for the sell-price
    element so the response contains the JS-rendered page.
    """

    # Scrapy requires every spider to declare a name.
    name = "example"

    custom_settings = {
        "DOWNLOAD_HANDLERS": {
            "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
            "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
        },
        # The proxy is configured on the browser itself (launch options below),
        # so Scrapy's own proxy middleware is switched off.
        "HTTPPROXY_ENABLED": False,
        "PLAYWRIGHT_LAUNCH_OPTIONS": {
            "headless": True,
            "proxy": {
                "server": SERVER,
                "username": USERNAME,
                "password": PASSWORD,
            },
        },
        "PLAYWRIGHT_BROWSER_TYPE": "firefox",
        # Playwright timeouts are expressed in milliseconds: 90_000 ms == 90 s.
        "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT": 90_000,
        # By default scrapy-playwright replaces the browser's request headers
        # with the ones from the Scrapy request (including Scrapy's
        # User-Agent); plain Playwright sends the browser's own headers.
        # ``None`` keeps the browser's headers, matching the standalone
        # Playwright run.
        # NOTE(review): presumably this header mismatch is what makes the
        # site serve its "enable JavaScript" challenge page -- confirm
        # against the target site.
        "PLAYWRIGHT_PROCESS_REQUEST_HEADERS": None,
        # Single context / single page: kept at 1 to simplify debugging.
        "PLAYWRIGHT_MAX_PAGES_PER_CONTEXT": 1,
        "PLAYWRIGHT_MAX_CONTEXTS": 1,
    }

    start_urls = ["https://wbmason.com/ProductDetail.aspx?ItemDesc=Green-Mountain-Coffee-Breakfast-Blend-Coffee-K-Cup-Pods-24-BX&ItemID=GMT6520&uom=BX&COID=&SearchID=907879066&ii=1"]

    def start_requests(self):
        """Yield one Playwright-rendered request per entry in ``start_urls``.

        Each request navigates with ``wait_until="networkidle"`` and then
        waits for the sell-price label to become visible.
        """
        for url in self.start_urls:
            yield Request(
                url,
                dont_filter=True,
                meta={
                    "playwright": True,
                    "playwright_page_methods": [
                        # PageMethod("wait_for_timeout", 5000),  # This also didn't help
                        PageMethod("wait_for_selector", "#ctl00_ContentPlaceholder1_ucProductDetail_fvProductDetail_lblSellPrice"),
                    ],
                    "playwright_page_goto_kwargs": {
                        "wait_until": "networkidle",
                    },
                    # "playwright_include_page": True,  # This also didn't help
                },
            )
async with async_playwright() as p:
url = "ttps://wbmason.com/ProductDetail.aspx?ItemDesc=Green-Mountain-Coffee-Breakfast-Blend-Coffee-K-Cup-Pods-24-BX&ItemID=GMT6520&uom=BX&COID=&SearchID=907879066&ii=1"
browser = await p.firefox.launch(
headless=True,
proxy={
"server": SERVER,
"username": USERNAME,
"password": PASSWORD
},
)
page = await browser.new_page()
await page.goto(url, wait_until="networkidle")
await page.wait_for_selector("#ctl00_ContentPlaceholder1_ucProductDetail_fvProductDetail_lblSellPrice")
content = await page.content()
with open("content.html", "w") as f:
f.write(content)
The problem I'm trying to solve is, obviously, how to load the page with scrapy-playwright. However, I'd also love to know how this is different from the standalone Playwright approach, which works on the first try.
Upvotes: 0
Views: 201