Reputation: 15
I'm trying to scrape a dynamic website and I need Selenium.
The links that I want to scrape only open if I click on that specific element. They are opened by jQuery, so my only option is to click on them, because there is no href attribute or anything else that would give me a URL.
My approach is this one:
# -*- coding: utf-8 -*-
import scrapy
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from scrapy.selector import Selector
from scrapy_selenium import SeleniumRequest


class AnofmSpider(scrapy.Spider):

    name = 'anofm'

    def start_requests(self):
        yield SeleniumRequest(
            url='https://www.anofm.ro/lmvw.html?agentie=Covasna&categ=3&subcateg=1',
            callback=self.parse
        )

    def parse(self, response):
        driver = response.meta['driver']
        try:
            element = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.ID, "tableRepeat2"))
            )
        finally:
            html = driver.page_source
            response_obj = Selector(text=html)
            links = response_obj.xpath("//tbody[@id='tableRepeat2']")
            for link in links:
                driver.execute_script("arguments[0].click();", link)
                yield {
                    'Ocupatia': response_obj.xpath("//div[@id='print']/p/text()[1]")
                }
but it won't work.
On the line where I want to click on that element, I get this error:
TypeError: Object of type Selector is not JSON serializable
I kind of understand this error, but I have no idea how to solve it. I somehow need to transform that object from a Selector into something clickable.
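As far as I can tell, the mismatch looks roughly like this (just an illustration of the two object types, with throwaway variable names, not a fix I am sure about): execute_script() sends its arguments over the JSON wire protocol, so it only accepts JSON-serializable values and Selenium WebElement handles, while Scrapy's xpath() returns Selector objects that Selenium knows nothing about.

# Scrapy Selector - parsed from a static HTML string; execute_script() cannot serialize it
selector_node = response_obj.xpath("//tbody[@id='tableRepeat2']")[0]

# Selenium WebElement - a live handle in the browser; this is what execute_script() accepts
web_element = driver.find_element(By.XPATH, "//tbody[@id='tableRepeat2']")
driver.execute_script("arguments[0].click();", web_element)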
I checked online for solutions and also the docs, but I couldn't find anything useful.
Can anybody help me better understand this error and how I should fix it?
Thanks.
Upvotes: 0
Views: 2715
Reputation: 142744
You mix Scrapy objects with Selenium functions, and this causes the problem. I don't know how to convert between the two kinds of objects, but I would simply use only Selenium for this:
finally:
    links = driver.find_elements_by_xpath("//tbody[@id='tableRepeat2']/tr")
    print('len(links):', len(links))

    for link in links:
        # doesn't work for me - even with scrollIntoView()
        #driver.execute_script("arguments[0].scrollIntoView();", link)
        #link.click()

        # open information
        driver.execute_script("arguments[0].click();", link)

        # javascript may need some time to display it
        time.sleep(1)

        # get data
        ocupatia = driver.find_element_by_xpath(".//div[@id='print']/p").text
        ocupatia = ocupatia.split('\n', 1)[0]         # first line
        ocupatia = ocupatia.split(':', 1)[1].strip()  # text after first `:`
        print('Ocupatia -->', ocupatia)

        # close information
        driver.find_element_by_xpath('//button[text()="Inchide"]').click()

        yield {
            'Ocupatia': ocupatia
        }
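As a side note, the fixed time.sleep(1) is only the simplest option; if it turns out to be flaky, an explicit wait on the details panel should also work. A minimal sketch, assuming the div with id print is filled once the row's details are loaded:

# wait until the details panel is visible instead of sleeping a fixed time
WebDriverWait(driver, 10).until(
    EC.visibility_of_element_located((By.XPATH, "//div[@id='print']/p"))
)
ocupatia = driver.find_element_by_xpath("//div[@id='print']/p").text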
Full working code. You can put it all in one file and run python script.py without creating a Scrapy project. You have to change SELENIUM_DRIVER_EXECUTABLE_PATH to the correct path.
import scrapy
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from scrapy.selector import Selector
from scrapy_selenium import SeleniumRequest
import time


class AnofmSpider(scrapy.Spider):

    name = 'anofm'

    def start_requests(self):
        yield SeleniumRequest(
            url='https://www.anofm.ro/lmvw.html?agentie=Covasna&categ=3&subcateg=1',
            #callback=self.parse
        )

    def parse(self, response):
        driver = response.meta['driver']

        try:
            print("try")
            element = WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((By.XPATH, "//tbody[@id='tableRepeat2']/tr/td"))
            )
        finally:
            print("finally")

            links = driver.find_elements_by_xpath("//tbody[@id='tableRepeat2']/tr")
            print('len(links):', len(links))

            for link in links:
                #driver.execute_script("arguments[0].scrollIntoView();", link)
                #link.click()

                # open information
                driver.execute_script("arguments[0].click();", link)

                # javascript may need some time to display it
                time.sleep(1)

                # get data
                ocupatia = driver.find_element_by_xpath(".//div[@id='print']/p").text
                ocupatia = ocupatia.split('\n', 1)[0]         # first line
                ocupatia = ocupatia.split(':', 1)[1].strip()  # text after first `:`
                print('Ocupatia -->', ocupatia)

                # close information
                driver.find_element_by_xpath('//button[text()="Inchide"]').click()

                yield {
                    'Ocupatia': ocupatia
                }


# --- run without project and save in `output.csv` ---

from scrapy.crawler import CrawlerProcess

c = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0',
    # save in file CSV, JSON or XML
    'FEEDS': {'output.csv': {'format': 'csv'}},  # new in Scrapy 2.1
    'DOWNLOADER_MIDDLEWARES': {'scrapy_selenium.SeleniumMiddleware': 800},
    'SELENIUM_DRIVER_NAME': 'firefox',
    'SELENIUM_DRIVER_EXECUTABLE_PATH': '/home/furas/bin/geckodriver',
    'SELENIUM_DRIVER_ARGUMENTS': [],  # ['-headless']
})
c.crawl(AnofmSpider)
c.start()
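If you prefer Chrome over Firefox, only the driver settings should need to change. A sketch, where the chromedriver path is just an example and has to match your system:

'SELENIUM_DRIVER_NAME': 'chrome',
'SELENIUM_DRIVER_EXECUTABLE_PATH': '/usr/local/bin/chromedriver',  # example path
'SELENIUM_DRIVER_ARGUMENTS': [],  # ['--headless']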
Upvotes: 1
Reputation: 16187
Actually, the data is also generated by an API call that returns a JSON response, so you can easily scrape it from the API. Here is the working solution along with pagination. Each page contains 8 items, and there are 32 items in total.
CODE:
import scrapy
import json


class AnofmSpider(scrapy.Spider):

    name = 'anofm'

    def start_requests(self):
        yield scrapy.Request(
            url='https://www.anofm.ro/dmxConnect/api/oferte_bos/oferte_bos_query2L_Test.php?offset=8&cauta=&select=Covasna&limit=8&localitate=',
            method='GET',
            callback=self.parse,
            meta={'limit': 8}
        )

    def parse(self, response):
        resp = json.loads(response.body)
        hits = resp.get('lmv').get('data')
        for h in hits:
            yield {
                'Ocupatia': h.get('OCCUPATION')
            }

        total_limit = resp.get('lmv').get('total')
        next_limit = response.meta['limit'] + 8
        if next_limit <= total_limit:
            yield scrapy.Request(
                url=f'https://www.anofm.ro/dmxConnect/api/oferte_bos/oferte_bos_query2L_Test.php?offset=8&cauta=&select=Covasna&limit={next_limit}&localitate=',
                method='GET',
                callback=self.parse,
                meta={'limit': next_limit}
            )
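Before running the spider you can sanity-check the endpoint and the field names by hand. A quick sketch with requests, assuming the API is still live and returns the same lmv/data/OCCUPATION structure used above:

import requests

url = ('https://www.anofm.ro/dmxConnect/api/oferte_bos/oferte_bos_query2L_Test.php'
       '?offset=8&cauta=&select=Covasna&limit=8&localitate=')
data = requests.get(url).json()

print(data['lmv']['total'])           # total number of offers
for item in data['lmv']['data']:      # first page of 8 items
    print(item['OCCUPATION'])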
Upvotes: 1