Reputation: 22440
I've written a scraper in Python scrapy in combination with selenium to scrape some titles
from a website. The css selectors
defined within my scraper is flawless. I wish my scraper to keep on clicking on the next page and parse the information embedded in each page. It is doing fine for the first page but when it comes to play the role for selenium part the scraper keeps clicking on the same link over and over again.
As this is my first time to work with selenium along with scrapy, I don't have any idea to move on successfully. Any fix will be highly appreciated.
If I try like this then it works smoothly (there is nothing wrong with selectors):
class IncomeTaxSpider(scrapy.Spider):
name = "taxspider"
start_urls = [
'https://www.incometaxindia.gov.in/Pages/utilities/exempted-institutions.aspx',
]
def __init__(self):
self.driver = webdriver.Chrome()
self.wait = WebDriverWait(self.driver, 10)
def parse(self,response):
self.driver.get(response.url)
while True:
for elem in self.wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR,"h1.faqsno-heading"))):
name = elem.find_element_by_css_selector("div[id^='arrowex']").text
print(name)
try:
self.wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "input[id$='_imgbtnNext']"))).click()
self.wait.until(EC.staleness_of(elem))
except TimeoutException:break
But my intention is to make my script run this way:
import scrapy
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
class IncomeTaxSpider(scrapy.Spider):
name = "taxspider"
start_urls = [
'https://www.incometaxindia.gov.in/Pages/utilities/exempted-institutions.aspx',
]
def __init__(self):
self.driver = webdriver.Chrome()
self.wait = WebDriverWait(self.driver, 10)
def click_nextpage(self,link):
self.driver.get(link)
elem = self.wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "div[id^='arrowex']")))
#It keeeps clicking on the same link over and over again
self.wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "input[id$='_imgbtnNext']"))).click()
self.wait.until(EC.staleness_of(elem))
def parse(self,response):
while True:
for item in response.css("h1.faqsno-heading"):
name = item.css("div[id^='arrowex']::text").extract_first()
yield {"Name": name}
try:
self.click_nextpage(response.url) #initiate the method to do the clicking
except TimeoutException:break
These are the titles visible on that landing page (to let you know what I'm after):
INDIA INCLUSION FOUNDATION
INDIAN WILDLIFE CONSERVATION TRUST
VATSALYA URBAN AND RURAL DEVELOPMENT TRUST
I'm not willing to get the data from that site, so any alternative approach other than what I've tried above is useless to me. My only intention is to have any solution related to the way I tried in my second approach.
Upvotes: 2
Views: 623
Reputation: 146520
Your initial code was almost correct with one key piece missing from it. You were using the same response object always. The response object needs to be from the latest page source.
Also you were browsing the link again and again in click next page which was resetting it to page 1 every time. That is why you get page 1 and 2 (max). You need to get the url only once in the parse stage and then let the next page click to happen
Below is final code working fine
class IncomeTaxSpider(scrapy.Spider):
name = "taxspider"
start_urls = [
'https://www.incometaxindia.gov.in/Pages/utilities/exempted-institutions.aspx',
]
def __init__(self):
self.driver = webdriver.Chrome()
self.wait = WebDriverWait(self.driver, 10)
def click_nextpage(self,link):
# self.driver.get(link)
elem = self.wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "div[id^='arrowex']")))
#It keeeps clicking on the same link over and over again
self.wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "input[id$='_imgbtnNext']"))).click()
self.wait.until(EC.staleness_of(elem))
def parse(self, response):
self.driver.get(response.url)
while True:
for item in response.css("h1.faqsno-heading"):
name = item.css("div[id^='arrowex']::text").extract_first()
yield {"Name": name}
try:
self.click_nextpage(response.url) #initiate the method to do the clicking
response = response.replace(body=self.driver.page_source)
except TimeoutException:break
After that change it works perfect
Upvotes: 1
Reputation: 939
import scrapy
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from scrapy.crawler import CrawlerProcess
class IncomeTaxSpider(scrapy.Spider):
name = "taxspider"
start_urls = [
'https://www.incometaxindia.gov.in/Pages/utilities/exempted-institutions.aspx',
]
def __init__(self):
self.driver = webdriver.Chrome()
self.wait = WebDriverWait(self.driver, 10)
link = 'https://www.incometaxindia.gov.in/Pages/utilities/exempted-institutions.aspx'
self.driver.get(link)
def click_nextpage(self):
elem = self.wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "div[id^='arrowex']")))
#It keeeps clicking on the same link over and over again
self.wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "input[id$='_imgbtnNext']"))).click()
self.wait.until(EC.staleness_of(elem))
time.sleep(4)
def parse(self,response):
while True:
for item in response.css("h1.faqsno-heading"):
name = item.css("div[id^='arrowex']::text").extract_first()
yield {"Name": name}
try:
self.click_nextpage() #initiate the method to do the clicking
except TimeoutException:break
process = CrawlerProcess()
process.crawl(IncomeTaxSpider)
process.start()
Upvotes: 1
Reputation: 595
Create a self.page_num or something.
def parse(self,response):
self.pages = self.driver.find_element_by_css_selector("#ctl00_SPWebPartManager1_g_d6877ff2_42a8_4804_8802_6d49230dae8a_ctl00_totalRecordsDiv.act_search_footer span")
self.pages = int(self.pages.split('of ')[1].split(']')[0])
self.page_num = 1
while self.page_num <= self.pages:
for item in response.css("h1.faqsno-heading"):
name = item.css("div[id^='arrowex']::text").extract_first()
yield {"Name": name}
try:
self.click_nextpage(response.url) #initiate the method to do the clicking
except TimeoutException:break
def click_nextpage(self,link):
self.driver.get(link)
elem = self.wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "div[id^='arrowex']")))
page_link = 'ctl00_SPWebPartManager1_g_d6877ff2_42a8_4804_8802_6d49230dae8a_ctl00_lnkBtn_' + str(self.page_num)
self.page_num = self.page_num + 1
self.wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "input[id$='_imgbtnNext']"))).click()
self.wait.until(EC.staleness_of(elem))
Upvotes: 0
Reputation: 198
Whenever the page gets loaded using the 'Next Page' arrow (using Selenium) it gets reset back to Page '1'. Not sure about the reason for this (may be the java script) Hence changed the approach to use the input field to enter the page number needed and hitting ENTER key to navigate.
Here is the modified code. Hope this may be useful for you.
import scrapy
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.keys import Keys
class IncomeTaxSpider(scrapy.Spider):
name = "taxspider"
start_urls = [
'https://www.incometaxindia.gov.in/Pages/utilities/exempted-institutions.aspx',
]
def __init__(self):
self.driver = webdriver.Firefox()
self.wait = WebDriverWait(self.driver, 10)
def click_nextpage(self,link, number):
self.driver.get(link)
elem = self.wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "div[id^='arrowex']")))
#It keeeps clicking on the same link over and over again
inputElement = self.driver.find_element_by_xpath("//input[@id='ctl00_SPWebPartManager1_g_d6877ff2_42a8_4804_8802_6d49230dae8a_ctl00_txtPageNumber']")
inputElement.clear()
inputElement.send_keys(number)
inputElement.send_keys(Keys.ENTER)
self.wait.until(EC.staleness_of(elem))
def parse(self,response):
number = 1
while number < 10412: #Website shows it has 10411 pages.
for item in response.css("h1.faqsno-heading"):
name = item.css("div[id^='arrowex']::text").extract_first()
yield {"Name": name}
print (name)
try:
number += 1
self.click_nextpage(response.url, number) #initiate the method to do the clicking
except TimeoutException:break
Upvotes: 0
Reputation: 52665
In case you need pure Selenium solution:
driver.get("https://www.incometaxindia.gov.in/Pages/utilities/exempted-institutions.aspx")
while True:
for item in wait(driver, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "div[id^='arrowex']"))):
print(item.text)
try:
driver.find_element_by_xpath("//input[@text='Next' and not(contains(@class, 'disabledImageButton'))]").click()
except NoSuchElementException:
break
Upvotes: 1