Reputation: 58
I'm trying to develop a web-scraping project targeting a website called Startup India, which you can use to connect with startups. I apply some search filters, and then for each startup in the filtered results I need to click its card, open its profile page, and scrape the data there. However, I can't scrape that data because I'm not able to capture the response for the profile pages on Startup India.
import scrapy
from selenium import webdriver
import os
import logging
class ProductSpider(scrapy.Spider):
    """Scrape startup profiles from the Startup India search results.

    The results page is JavaScript-rendered, so it is loaded in a
    Selenium-driven Chrome browser and each result card is clicked.
    """

    name = "product_spider"
    # allowed_domains entries must be bare domain names; a full URL here
    # makes Scrapy's offsite middleware filter out every request.
    allowed_domains = ['www.startupindia.gov.in']
    start_urls = ['https://www.startupindia.gov.in/content/sih/en/search.html?industries=sih:industry/advertising&states=sih:location/india/andhra-pradesh&stages=Prototype&roles=Startup&page=0']

    def __init__(self, *args, **kwargs):
        # Let scrapy.Spider finish its own initialisation first.
        super().__init__(*args, **kwargs)
        # NOTE(review): the positional executable-path argument was removed
        # in Selenium 4 (use a Service object there); path is machine-specific.
        self.driver = webdriver.Chrome(
            "C:/Users/RAJ/PycharmProjects/WebCrawler/WebCrawler/WebCrawler/spiders/chromedriver.exe")

    def closed(self, reason):
        """Called by Scrapy when the spider finishes; release the browser."""
        self.driver.quit()

    def parse(self, response):
        """Open the search page in Chrome and click every result card."""
        self.driver.get(response.url)
        # find_elements(<by>, <selector>) works on Selenium 3.x and 4.x;
        # the find_elements_by_xpath helper was removed in Selenium 4.
        cards = self.driver.find_elements(
            "xpath", "//*[@id='persona-results']//a[@class='img-wrap']")
        logging.info(cards)
        for card in cards:  # renamed: 'next' shadowed the builtin
            try:
                logging.info(card.click())
                logging.info(response.url)
                # get the data and write it to scrapy items
            except Exception:
                # The original bare 'except: print("Yolo")' hid every
                # failure; log the traceback so click errors are visible.
                logging.exception("failed to open startup profile")
Any working code would be appreciated.
Upvotes: 0
Views: 330
Reputation: 761
I have set up a Scrapy project and run `scrapy crawl product_spider`,
and it gives the URL of the new tab opened after clicking on an element.
import scrapy
from selenium import webdriver
import os
import logging
from selenium.webdriver.chrome.options import Options as ChromeOptions
CHROME_DRIVER_UBUNTU_PATH = "your chrome driver path"
class ProductSpider(scrapy.Spider):
    """Spider that drives (optionally headless) Chrome over the Startup
    India search page and logs the URL of each startup-profile tab."""

    name = "product_spider"
    # allowed_domains entries must be bare domain names, not URLs;
    # otherwise Scrapy's offsite middleware drops every request.
    allowed_domains = ['www.startupindia.gov.in']
    start_urls = [
        'https://www.startupindia.gov.in/content/sih/en/search.html?industries=sih:industry/advertising&states=sih:location/india/andhra-pradesh&stages=Prototype&roles=Startup&page=0']

    def __init__(self, *args, **kwargs):
        # Let scrapy.Spider perform its own initialisation first.
        super().__init__(*args, **kwargs)
        opts = ChromeOptions()
        opts.add_argument("--headless")  # for headless browser it's not necessary
        # 'chrome_options' is deprecated; 'options' is accepted by both
        # Selenium 3 and 4. NOTE(review): 'executable_path' itself was
        # removed in Selenium 4 — use a Service object there.
        self.driver = webdriver.Chrome(
            executable_path=CHROME_DRIVER_UBUNTU_PATH, options=opts)

    def closed(self, reason):
        """Called by Scrapy on shutdown; quit Chrome instead of leaking it."""
        self.driver.quit()

    def parse(self, response):
        """Click each result card, log the URL of the tab it opens, then
        switch back to the results tab."""
        self.driver.get(response.url)
        # find_elements(<by>, <selector>) works on Selenium 3.x and 4.x.
        links = self.driver.find_elements(
            "xpath", "//*[@id='persona-results']//a[@class='img-wrap']")
        # enumerate replaces next.index(i): O(1) per iteration and correct
        # even when two WebElements compare equal; tab 0 is the results page.
        for tab_no, link in enumerate(links, start=1):
            try:
                link.click()  # opens the profile in a new tab
                self.driver.switch_to.window(self.driver.window_handles[tab_no])
                logging.info(self.driver.current_url)
                self.driver.switch_to.window(self.driver.window_handles[0])
                # get the data and write it to scrapy items
            except Exception:
                # Log the full traceback instead of bare print(e).
                logging.exception("failed to scrape profile tab")
Upvotes: 1