Reputation: 55
I want to load all of the URLs in a loop and scrape data from each one, but I cannot get it working. Does anyone have ideas on how to handle multiple URLs in Scrapy?
Here is my code:
import scrapy
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from shutil import which
from scrapy.selector import Selector
import time
from scrapy.http import Request
class TrSpider(scrapy.Spider):
    name = 'tr'
    allowed_domains = ['trc.com']
    start_urls = [
'https://www.google.com/travel/things-to-do?g2lb=2502548,2503806,4258168,4270442,4306835,4317915,4328159,4371334,4401769,4419364,4463263,4463666,4464463,4482194,4482438,4486153,4270859,4284970,4291517&hl=en-PK&gl=pk&un=1&dest_mid=/m/0rtv&dest_state_type=main&sa=X&ved=2ahUKEwjvk6PXqLXuAhWN2BQKHZs1BNsQhdIBegQIARAW',
'https://www.google.com/travel/things-to-do?g2lb=2502548,2503806,4258168,4270442,4306835,4317915,4328159,4371334,4401769,4419364,4463263,4463666,4464463,4482194,4482438,4486153,4270859,4284970,4291517&hl=en-PK&gl=pk&un=1&dest_mid=/m/012gg6&dest_state_type=main&sa=X&ved=2ahUKEwjvk6PXqLXuAhWN2BQKHZs1BNsQhdIBegQIARAc',
'https://www.google.com/travel/things-to-do?g2lb=2502548,2503806,4258168,4270442,4306835,4317915,4328159,4371334,4401769,4419364,4463263,4463666,4464463,4482194,4482438,4486153,4270859,4284970,4291517&hl=en-PK&gl=pk&un=1&dest_mid=/m/027f5q&dest_state_type=main&sa=X&ved=2ahUKEwjvk6PXqLXuAhWN2BQKHZs1BNsQhdIBegQIARAi',
'https://www.google.com/travel/things-to-do?g2lb=2502548,2503806,4258168,4270442,4306835,4317915,4328159,4371334,4401769,4419364,4463263,4463666,4464463,4482194,4482438,4486153,4270859,4284970,4291517&hl=en-PK&gl=pk&un=1&dest_mid=/m/02d4_q&dest_state_type=main&sa=X&ved=2ahUKEwjvk6PXqLXuAhWN2BQKHZs1BNsQhdIBegQIARAo',
'https://www.google.com/travel/things-to-do?g2lb=2502548,2503806,4258168,4270442,4306835,4317915,4328159,4371334,4401769,4419364,4463263,4463666,4464463,4482194,4482438,4486153,4270859,4284970,4291517&hl=en-PK&gl=pk&un=1&dest_mid=/m/0b1nv7&dest_state_type=main&sa=X&ved=2ahUKEwjvk6PXqLXuAhWN2BQKHZs1BNsQhdIBegQIARAu',
'https://www.google.com/travel/things-to-do?g2lb=2502548,2503806,4258168,4270442,4306835,4317915,4328159,4371334,4401769,4419364,4463263,4463666,4464463,4482194,4482438,4486153,4270859,4284970,4291517&hl=en-PK&gl=pk&un=1&dest_mid=/m/04m7fh&dest_state_type=main&sa=X&ved=2ahUKEwjvk6PXqLXuAhWN2BQKHZs1BNsQhdIBegQIARA0',
'https://www.google.com/travel/things-to-do?g2lb=2502548,2503806,4258168,4270442,4306835,4317915,4328159,4371334,4401769,4419364,4463263,4463666,4464463,4482194,4482438,4486153,4270859,4284970,4291517&hl=en-PK&gl=pk&un=1&dest_mid=/m/0b95m1&dest_state_type=main&sa=X&ved=2ahUKEwjvk6PXqLXuAhWN2BQKHZs1BNsQhdIBegQIARA6',
'https://www.google.com/travel/things-to-do?g2lb=2502548,2503806,4258168,4270442,4306835,4317915,4328159,4371334,4401769,4419364,4463263,4463666,4464463,4482194,4482438,4486153,4270859,4284970,4291517&hl=en-PK&gl=pk&un=1&dest_mid=/m/0b95tg&dest_state_type=main&sa=X&ved=2ahUKEwjvk6PXqLXuAhWN2BQKHZs1BNsQhdIBegQIARBA',
'https://www.google.com/travel/things-to-do?g2lb=2502548,2503806,4258168,4270442,4306835,4317915,4328159,4371334,4401769,4419364,4463263,4463666,4464463,4482194,4482438,4486153,4270859,4284970,4291517&hl=en-PK&gl=pk&un=1&dest_mid=/m/02p77j&dest_state_type=main&sa=X&ved=2ahUKEwjvk6PXqLXuAhWN2BQKHZs1BNsQhdIBegQIARBG']
    def __init__(self):
        chrome_option = Options()
        chrome_option.add_argument('--headless')
        chrome_path = which('chromedriver')
        driver = webdriver.Chrome(executable_path=chrome_path)
        for a in self.start_urls:
            driver.get(a)
            driver.find_element_by_xpath('//button[@class="VfPpkd-LgbsSe VfPpkd-LgbsSe-OWXEXe-INsAgc VfPpkd-LgbsSe-OWXEXe-dgl2Hf Rj2Mlf OLiIxf PDpWxe J3Eqid"]').click()
            time.sleep(1)
            self.html = driver.page_source

    def parse(self, response):
        ab = response.url
        resp = Selector(text=self.html)
        for a in resp.xpath('//div[@class="GwjAi "]'):
            yield {
                'Name': a.xpath('./div[1]/div/text()').get(),
                'Rating': a.xpath('./div[2]/span/span/span[1]/text()').get(),
                'Number of reviews': a.xpath('./div[2]/span/span/span[2]/text()').get(),
                'Discription': a.xpath('./div[3]/text()').extract(),
                'Url': ab
            }
If I run it without the loop it works, but it does not work with multiple links. Thanks
Upvotes: 0
Views: 236
Reputation: 22440
The main problem is that your __init__ visits every URL up front and keeps only the last page's source in self.html, so every call to parse() scrapes the same page. Move the Selenium work into start_requests() and attach each page's rendered HTML to its own request instead. Try the following:
import scrapy
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from scrapy.selector import Selector
from selenium.common.exceptions import TimeoutException
class TrSpider(scrapy.Spider):
    name = 'tr'
    start_urls = [
'https://www.google.com/travel/things-to-do?g2lb=2502548,2503806,4258168,4270442,4306835,4317915,4328159,4371334,4401769,4419364,4463263,4463666,4464463,4482194,4482438,4486153,4270859,4284970,4291517&hl=en-PK&gl=pk&un=1&dest_mid=/m/0rtv&dest_state_type=main&sa=X&ved=2ahUKEwjvk6PXqLXuAhWN2BQKHZs1BNsQhdIBegQIARAW',
'https://www.google.com/travel/things-to-do?g2lb=2502548,2503806,4258168,4270442,4306835,4317915,4328159,4371334,4401769,4419364,4463263,4463666,4464463,4482194,4482438,4486153,4270859,4284970,4291517&hl=en-PK&gl=pk&un=1&dest_mid=/m/012gg6&dest_state_type=main&sa=X&ved=2ahUKEwjvk6PXqLXuAhWN2BQKHZs1BNsQhdIBegQIARAc',
'https://www.google.com/travel/things-to-do?g2lb=2502548,2503806,4258168,4270442,4306835,4317915,4328159,4371334,4401769,4419364,4463263,4463666,4464463,4482194,4482438,4486153,4270859,4284970,4291517&hl=en-PK&gl=pk&un=1&dest_mid=/m/027f5q&dest_state_type=main&sa=X&ved=2ahUKEwjvk6PXqLXuAhWN2BQKHZs1BNsQhdIBegQIARAi',
'https://www.google.com/travel/things-to-do?g2lb=2502548,2503806,4258168,4270442,4306835,4317915,4328159,4371334,4401769,4419364,4463263,4463666,4464463,4482194,4482438,4486153,4270859,4284970,4291517&hl=en-PK&gl=pk&un=1&dest_mid=/m/02d4_q&dest_state_type=main&sa=X&ved=2ahUKEwjvk6PXqLXuAhWN2BQKHZs1BNsQhdIBegQIARAo',
'https://www.google.com/travel/things-to-do?g2lb=2502548,2503806,4258168,4270442,4306835,4317915,4328159,4371334,4401769,4419364,4463263,4463666,4464463,4482194,4482438,4486153,4270859,4284970,4291517&hl=en-PK&gl=pk&un=1&dest_mid=/m/0b1nv7&dest_state_type=main&sa=X&ved=2ahUKEwjvk6PXqLXuAhWN2BQKHZs1BNsQhdIBegQIARAu',
'https://www.google.com/travel/things-to-do?g2lb=2502548,2503806,4258168,4270442,4306835,4317915,4328159,4371334,4401769,4419364,4463263,4463666,4464463,4482194,4482438,4486153,4270859,4284970,4291517&hl=en-PK&gl=pk&un=1&dest_mid=/m/04m7fh&dest_state_type=main&sa=X&ved=2ahUKEwjvk6PXqLXuAhWN2BQKHZs1BNsQhdIBegQIARA0',
'https://www.google.com/travel/things-to-do?g2lb=2502548,2503806,4258168,4270442,4306835,4317915,4328159,4371334,4401769,4419364,4463263,4463666,4464463,4482194,4482438,4486153,4270859,4284970,4291517&hl=en-PK&gl=pk&un=1&dest_mid=/m/0b95m1&dest_state_type=main&sa=X&ved=2ahUKEwjvk6PXqLXuAhWN2BQKHZs1BNsQhdIBegQIARA6',
'https://www.google.com/travel/things-to-do?g2lb=2502548,2503806,4258168,4270442,4306835,4317915,4328159,4371334,4401769,4419364,4463263,4463666,4464463,4482194,4482438,4486153,4270859,4284970,4291517&hl=en-PK&gl=pk&un=1&dest_mid=/m/0b95tg&dest_state_type=main&sa=X&ved=2ahUKEwjvk6PXqLXuAhWN2BQKHZs1BNsQhdIBegQIARBA',
'https://www.google.com/travel/things-to-do?g2lb=2502548,2503806,4258168,4270442,4306835,4317915,4328159,4371334,4401769,4419364,4463263,4463666,4464463,4482194,4482438,4486153,4270859,4284970,4291517&hl=en-PK&gl=pk&un=1&dest_mid=/m/02p77j&dest_state_type=main&sa=X&ved=2ahUKEwjvk6PXqLXuAhWN2BQKHZs1BNsQhdIBegQIARBG'
    ]
    def start_requests(self):
        driver = webdriver.Chrome()
        wait = WebDriverWait(driver, 10)
        for item_link in self.start_urls:
            driver.get(item_link)
            try:
                # Click "See all top sights" and wait for the old button to go stale,
                # i.e. for the page to finish updating
                button = wait.until(EC.presence_of_element_located((By.XPATH, "//span[contains(.,'See all top sights')]")))
                driver.execute_script("arguments[0].click();", button)
                wait.until(EC.staleness_of(button))
            except TimeoutException:
                pass
            # Grab the rendered HTML for this URL and pass it to parse() via meta
            htmlelements = driver.page_source
            yield scrapy.Request(item_link, meta={"htmlelements": htmlelements})
    def parse(self, response):
        ab = response.url
        resp = Selector(text=response.meta.get("htmlelements"))
        for a in resp.xpath('//div[starts-with(@class,"GwjAi")]'):
            yield {
                'Name': a.xpath('./div[1]/div/text()').get(),
                'Rating': a.xpath('./div[2]/span/span/span[1]/text()').get(),
                'Number of reviews': a.xpath('./div[2]/span/span/span[2]/text()').get(),
                'Discription': a.xpath('./div[3]/text()').extract(),
                'Url': ab
            }
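Each request carries its own rendered HTML in meta, so parse() always works on the page that belongs to response.url. One thing the snippet above leaves out is closing the browser when the crawl finishes. A minimal sketch of how that could be done, assuming you keep the driver on the spider rather than in a local variable (self.driver and self.wait are my naming, not part of the code above):

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Assumption: store the driver on the spider so start_requests() can reuse it
        self.driver = webdriver.Chrome()
        self.wait = WebDriverWait(self.driver, 10)

    def closed(self, reason):
        # Scrapy calls closed() when the spider finishes; quit the browser here
        self.driver.quit()

With that in place, start_requests() would use self.driver and self.wait instead of creating its own.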
Upvotes: 1