Scrapy Selenium: Why pagination is not working for scrapy-selenium?

Question

I am trying to get data using scrapy-selenium but there is some issue with the pagination. I have tried my level best to use different selectors and methods but nothing changes. It can only able to scrape the 1st page. I have also checked the other solutions but still, I am unable to make it work. Looking forward to experts' advice.

Source: https://www.gumtree.com/property-for-sale/london

import scrapy
from urllib.parse import urljoin
from scrapy_selenium import SeleniumRequest
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from shutil import which
from selenium.webdriver.common.by import By
import time
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager


class Basic2Spider(scrapy.Spider):
    name = 'basic2'
    
    def start_requests(self):
        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
        driver.set_window_size(1920, 1080)
        
        driver.get("https://www.gumtree.com/property-for-sale/london")
        time.sleep(2)

        property_xpath = driver.find_elements(By.XPATH, "(//article[@class='listing-maxi']/a)[position()>=2 and position()<30]")

        for detail in property_xpath:
            href= detail.get_attribute('href')
            time.sleep(2)
            yield SeleniumRequest(
            url = href,
            )
            
        driver.quit()
        return super().start_requests()

    def parse(self, response):
        yield {
            'Title': response.xpath("//div[@class='css-w50tn5 e1pt9h6u11']/h1/text()").get(),
            'Price': response.xpath("//h3[@itemprop='price']/text()").get(),
            'Add Posted': response.xpath("//*[@id='content']/div[1]/div/main/div[5]/section/div[1]/dl[1]/dd/text()").get(),
            'Links': response.url
            }

        next_page = response.xpath("//li[@class='pagination-currentpage']/following-sibling::li[1]/a/text()").get()
        if next_page:
            abs_url = f'https://www.gumtree.com/property-for-sale/london/page{next_page}'
            yield SeleniumRequest(
                url= abs_url,
                wait_time=5,
                callback=self.parse
        )

Md. Fazlul Hoque · Accepted Answer

Your code seem to be correct but getting tcp ip block. I also tried alternative way where code is correct and pagination is working and this type of pagination is two times faster than others but gives me sometimes strange result and sometimes getting ip block.

import scrapy
from scrapy import Selector
from scrapy_selenium import SeleniumRequest



class Basic2Spider(scrapy.Spider):
    name = 'basic2'

    responses = []
    
    def start_requests(self):
        url='https://www.gumtree.com/property-for-sale/london/page{page}'
        for page in range(1,6):
            print(page)
            yield SeleniumRequest(
                url=url.format(page=page),
                callback=self.parse,
                 wait_time=5
                )

    def parse(self, response):
       
        driver = response.meta['driver']
        intial_page = driver.page_source
        self.responses.append(intial_page)
        

        for resp in self.responses:
            r = Selector(text=resp)

            property_xpath = r.xpath("(//article[@class='listing-maxi']/a)[position()>=2 and position()<30]")

            for detail in property_xpath:
        
                yield {
                    'Title': detail.xpath('.//*[@class="listing-title"]/text()').get().strip(),
                    'Price': detail.xpath('.//*[@class="listing-price"]/strong/text()').get(),
                    'Add Posted': detail.xpath('.//*[@class="listing-posted-date txt-sub"]/span//text()').getall()[2].strip(),
                    'Links': response.url
                    }

Scrapy Selenium: Why pagination is not working for scrapy-selenium?

Answers (1)

Related Questions