user21195292

Reputation:

Why does the XPath output keep changing?

The problem I am facing is weird and has wasted a lot of my time.

In theory, this should return the link:

next_page = response.xpath('//ul[@class="pagination justify-content-center"]/li[6]/a/@href').get()

But the output I got is http://moviful.com/browseNone.

the "None" part should contain link "?page=2"

So after wasting hours, I finally tried this:

next_page = response.xpath('//ul[@class="pagination justify-content-center"]/li[6]/a/@href').getall()

The output I got is http://moviful.com/browse['?page=2']. Where are the [ ] coming from? If you check

'//ul[@class="pagination justify-content-center"]/li[6]/a/@href' on the page, there are no brackets, only ?page=2.

So I tried this:

next_page = response.xpath('//ul[@class="pagination justify-content-center"]/li[6]/a/@href').extract()

Same output: http://moviful.com/browse['?page=2'].
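
Reproducing this outside Scrapy: .getall() (and its older alias .extract()) returns a Python list, and the f-string formats the whole list, brackets and quotes included:

value = ['?page=2']  # what .getall() / .extract() return
print(f"http://moviful.com/browse{value}")  # prints http://moviful.com/browse['?page=2']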

Finally, I tried .get() again:

next_page = response.xpath('//ul[@class="pagination justify-content-center"]/li[6]/a/@href').get()

This time I got data from page 2. Why?

What happened to http://moviful.com/browseNone, and why am I getting data this time?

But the spider didn't move on to the 3rd page; it just stops at the 2nd. Also, why does the value of "li" keep changing on every page?

import scrapy
from scraper_api import ScraperAPIClient

client = ScraperAPIClient('API-KEY-HIDDEN')


class DbmoviesSpider(scrapy.Spider):
    name = "browse2"
    allowed_domains = ["moviful.com"]

    def start_requests(self):
        links=["https://moviful.com/browse?page=1"]

        for link in links:
                yield scrapy.Request(client.scrapyGet(url=link,render=True),callback=self.parse)

    def parse(self, response):

        movies = response.xpath('//div[@class="video-grid d-flex flex-wrap justify-content-around"]/div/div/div')
        
        for movie in movies:

            link = movie.xpath('.//a/@href').get()

            full_links = f"http://moviful.com{link}"
            price = movie.xpath('.//a/text()').get()

            yield {'Movie Links':full_links,
                  'Price':price}


        # .getall() returns a list, so the f-string below embeds the list's repr
        next_page = response.xpath('//ul[@class="pagination justify-content-center"]/li[6]/a/@href').getall()

        link = f"http://moviful.com/browse{next_page}"

        print(link)

        if next_page:
            yield scrapy.Request(url=link,callback=self.parse)

Upvotes: -2

Views: 42

Answers (1)

skyriver

Reputation: 104

import scrapy
from scraper_api import ScraperAPIClient

client = ScraperAPIClient('API-KEY-HIDDEN')


class DbmoviesSpider(scrapy.Spider):
    name = "browse2"
    allowed_domains = ["moviful.com"]

    url = "https://moviful.com/browse?page={i}"

    def start_requests(self):
        for i in range(1, 210): # there are 209 pages on website
            yield scrapy.Request(client.scrapyGet(url=self.url.format(i=i), render=True), callback=self.parse)


    def parse(self, response):

        movies = response.xpath('//div[@class="video-grid d-flex flex-wrap justify-content-around"]/div/div/div')
        
        for movie in movies:

            link = movie.xpath('.//a/@href').get()

            full_links = f"http://moviful.com{link}"
            price = movie.xpath('.//a/text()').get()

            yield {
                'Movie Links':full_links,
                'Price':price
                }
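
Hardcoding range(1, 210) works as long as the page count stays at 209. If you would rather follow the pagination link itself, a sketch like the following should work at the end of parse(); it assumes the next-page arrow is the last <li> in the pagination list (the li index most likely shifts between pages as the previous/next arrows appear and disappear, which would also explain why a fixed li[6] sometimes came back as None), so check that against the page markup:

        next_page = response.xpath('//ul[@class="pagination justify-content-center"]/li[last()]/a/@href').get()

        if next_page:
            # build the absolute URL by hand: response.url points at the ScraperAPI proxy,
            # so response.urljoin() would resolve "?page=2" against the wrong host
            next_url = f"https://moviful.com/browse{next_page}"
            yield scrapy.Request(client.scrapyGet(url=next_url, render=True), callback=self.parse)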

Upvotes: 0
