Reputation: 143
I am using Scrapy and Splash to extract data, and I am looking for a way to follow pagination that is powered by JavaScript. The URL does not change; it stays the same no matter which page you are on.
<li class="btn-next"><a href="javascript:ctrl.set_pageReload(2)">Next</a></li>
I have tried using a Lua script with Splash to click on the element, but this does not work:
"""function main(splash)
local url = splash.args.url
assert(splash:go(url))
assert(splash:wait(1))
assert(splash:runjs('document.getElementsByClassName("btn-next")[0].click()'))
assert(splash:wait(0.75))
-- return result as a JSON object
return {html = splash:html()}
end """
def parse(self, response):
    section = response.css('li.li-result')
    for item in section:
        yield {
            'manufacturer' : item.css('span.brand::text').extract_first(),
            'model' : item.css('span.sub-title::text').extract_first(),
            'engine_size' : item.css('span.nowrap::text').extract_first(),
            'model_type' : item.css('span span.nowrap::text').extract_first(),
            'old_price' : item.css('li.li-result p.old-prix span::text').extract_first(),
            'price' : item.css('li.li-result p.prix::text').extract_first(),
            'consumption' : item.css('li.li-result div.desc::text').extract_first(),
            'date' : item.css('p.btn-publication::text').extract_first(),
            'fuel_type' : item.css('div.bc-info div.upper::text').extract_first(),
            'mileage' : item.css('li.li-result div.bc-info ul div::text')[1].extract(),
            'year' : item.css('li.li-result div.bc-info ul div::text')[2].extract(),
            'transmission_type' : item.css('li.li-result div.bc-info ul div::text')[3].extract(),
            'add_number' : item.css('li.li-result div.bc-info ul div::text')[4].extract(),
        }
    next_page = response.css('li.btn-next').extract_first()  # pagination
    if next_page != 0:
        print(response)
        yield(SplashRequest(next_page, self.parse,
                            endpoint='execute',
                            cache_args=['lua_source'],
                            args={'lua_source': script},
                            ))
Is it even possible to do it this way? I appreciate any help.
Upvotes: 3
Views: 865
Reputation: 43083
First, two issues with the Lua script:

- .btn-next is a non-interactive li element, so clicking on it does nothing.
- The 0.75-second wait after the click is too short for the next page to finish loading.

To fix that:

- Click on the a child element instead (see the alternative below).
- Increase the wait time.

function main(splash)
    local url = splash.args.url
    assert(splash:go(url))
    assert(splash:wait(1))
    -- assert(splash:runjs('document.getElementsByClassName("btn-next")[0].click()')) -- Change this
    assert(splash:runjs('document.getElementsByClassName("btn-next")[0].children[0].click()')) -- to this
    -- assert(splash:wait(0.75)) -- Change this
    assert(splash:wait(1.5)) -- to this
    -- return result as a JSON object
    return {html = splash:html()}
end
(The above shows that JavaScript navigation to page 2 works in Splash, but more work is needed to scrape the subsequent pages with scrapy-splash.)
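As a side note, the requests below assume the usual scrapy-splash wiring in settings.py, as described in the scrapy-splash README (adjust SPLASH_URL to wherever your Splash instance is listening):

# settings.py - standard scrapy-splash configuration from its README
SPLASH_URL = 'http://localhost:8050'

DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}

SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}

DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'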
Next, two issues with the parse method:

- next_page is the HTML string of the li element, so it cannot be passed as the url argument to SplashRequest.
- next_page may be None, but never 0.

To fix that, in the solution below:

- Pass response.url and response.text to the script, and then call splash:set_content() to restore the state in the next request.
- Pass dont_filter=True to skip the duplicate check for url.
- Check next_page is not None instead.

script = """function main(splash)
    local url = splash.args.url
    local content = splash.args.content
    assert(splash:set_content(content, "text/html; charset=utf-8", url))
    assert(splash:runjs('document.getElementsByClassName("btn-next")[0].children[0].click()'))
    assert(splash:wait(2))
    return {html = splash:html()}
end"""
def parse(self, response, **kwargs):
    section = response.css('li.li-result')
    for item in section:
        yield {
            'manufacturer': item.css('span.brand::text').extract_first(),
            'model': item.css('span.sub-title::text').extract_first(),
            'engine_size': item.css('span.nowrap::text').extract_first(),
            'model_type': item.css('span span.nowrap::text').extract_first(),
            'old_price': item.css('li.li-result p.old-prix span::text').extract_first(),
            'price': item.css('li.li-result p.prix::text').extract_first(),
            'consumption': item.css('li.li-result div.desc::text').extract_first(),
            'date': item.css('p.btn-publication::text').extract_first(),
            'fuel_type': item.css('div.bc-info div.upper::text').extract_first(),
            'mileage': item.css('li.li-result div.bc-info ul div::text')[1].extract(),
            'year': item.css('li.li-result div.bc-info ul div::text')[2].extract(),
            'transmission_type': item.css('li.li-result div.bc-info ul div::text')[3].extract(),
            'add_number': item.css('li.li-result div.bc-info ul div::text')[4].extract(),
        }
    next_page = response.css('li.btn-next').extract_first()
    # print(next_page)
    if next_page is not None:
        yield SplashRequest(
            response.url,
            self.parse,
            endpoint='execute',
            args={
                'lua_source': script,
                'content': response.text,
            },
            cache_args=['lua_source'],
            dont_filter=True,
        )
(Comment out the for item in section: block and uncomment print(next_page) to easily verify the solution.)
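For completeness, this assumes the first page is also fetched through Splash, so that response.text is already the rendered HTML. A minimal sketch of how the initial request could be issued (the spider name and start URL are placeholders, not taken from the question):

import scrapy
from scrapy_splash import SplashRequest

class CarsSpider(scrapy.Spider):
    name = 'cars'  # placeholder spider name
    start_url = 'https://www.example.com/search'  # placeholder listing URL

    def start_requests(self):
        # The first page only needs to be rendered, so the default
        # render.html endpoint with a short wait is sufficient here.
        yield SplashRequest(self.start_url, self.parse, args={'wait': 1})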
An alternative to clicking the button is to call the function directly:
-- assert(splash:runjs('document.getElementsByClassName("btn-next")[0].children[0].click()'))
assert(splash:runjs('ctrl.set_pageReload(ctrl.context.cur_page + 1)'))
An alternative to hardcoding a possibly insufficient wait time is to set a flag on the current page, periodically check until it disappears (which means the next page has loaded), and then wait for the jQuery ready callback (optionally hardcode an initial wait time):
assert(splash:runjs('window.notReloaded = 1'))
-- assert(splash:wait(2)) -- Optional initial wait time
local exit = false
while (exit == false)
do
    result, error = splash:wait_for_resume([[
        function main(splash) {
            window.notReloaded ? splash.error() : splash.resume();
        }
    ]])
    if result then
        exit = true
    else
        splash:wait(0.2) -- Adjust resolution as desired
    end
end
assert(splash:wait_for_resume([[
    function main(splash) {
        $(() => splash.resume());
    }
]]))
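Putting the pieces together, here is a sketch of how this adaptive wait might slot into the pagination script above, replacing the fixed assert(splash:wait(2)); the exact ordering of the flag and the reload trigger is my reading of the approach rather than something stated explicitly above:

script = """function main(splash)
    local url = splash.args.url
    local content = splash.args.content
    assert(splash:set_content(content, "text/html; charset=utf-8", url))
    -- Flag the current page; the flag disappears once the next page replaces it
    assert(splash:runjs('window.notReloaded = 1'))
    -- Trigger the reload (clicking the a element would work here as well)
    assert(splash:runjs('ctrl.set_pageReload(ctrl.context.cur_page + 1)'))
    -- Poll until the flag is gone, i.e. the next page has loaded
    local exit = false
    while (exit == false)
    do
        result, error = splash:wait_for_resume([[
            function main(splash) {
                window.notReloaded ? splash.error() : splash.resume();
            }
        ]])
        if result then
            exit = true
        else
            splash:wait(0.2)
        end
    end
    -- Then wait for the jQuery ready callback on the new page
    assert(splash:wait_for_resume([[
        function main(splash) {
            $(() => splash.resume());
        }
    ]]))
    return {html = splash:html()}
end"""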
Upvotes: 1