Reputation: 27
I am trying to scrape company information from a company database. I have a list of companies in a text file, which I would want selenium to enter into the website's search and scrape the needed info one by one.
My problem is that for some reason it only enters the last name on the list for some reason. How would I be able to tell python to scrape the first company name on the list, then the next one and so on?
My code is the following:
# -*- coding: utf-8 -*-
# from typing_extensions import Self
from lib2to3.pgen2 import driver
import scrapy
from scrapy.selector import Selector
# from scrapy_selenium import SeleniumRequest
from time import sleep
from turtle import delay
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from shutil import which
count = 0
file = open ("cegek.txt", "r")
lines = file.readlines()
for line in lines:
count += 1
# # cegek = "1000 Út Kft."
class HtSpiderSeleniumceg(scrapy.Spider):
name = 'kamara'
allowed_domains = ["wwww.ceginfo.hu/"]
start_urls = [
'https://www.ceginfo.hu'
]
def __init__(self):
chrome_options = Options()
# chrome_options.add_argument("--headless")
#get login page
driver = webdriver.Chrome(executable_path="./chromedriver", options=chrome_options)
driver.get("https://www.ceginfo.hu/")
driver.find_element_by_xpath("//input[@type='search']").send_keys(line)
sleep(2)
driver.find_element_by_xpath("//input[@type='search']").send_keys(u'\ue007')
self.html = driver.page_source
driver.close()
#scrape needed info
def parse(self, response):
resp = Selector(text=self.html)
for ceg in resp.xpath("(//div[contains(@class, 'd-flex flex-column flex-sm-row justify-content-between align-items-center')])[1]"):
yield {
'cegnev': ceg.xpath("(//h2[contains(@class,'s-title heading')])[1]/text()").get(),
'adoszam': ceg.xpath("(.//span[@class='text-uppercase c-border me-lg-3'])[1]/text()").get(),
'cegjegy': ceg.xpath("(.//span[@class='c-border'])[1]/text()").get()
}
This is the exact format the company names list is in:
SZIMIKRON Ipari Kft.
Tigra Computer- és Irodatechnikai Kft.
Tradeland Kft.
Török László EV Török Kulcsszervíz
Tungsram Operations Kft.
Tutti Élelmiszeripari Kft.
Water and Soil Kft.
Webkey Development Kft.
ZDMnet
With some help, now the first name from the list is searched, but the spider does not scrape and breaks due to the error:
selenium.common.exceptions.StaleElementReferenceException: Message: stale element reference: element is not attached to the page document
(Session info: chrome=100.0.4896.60)
This is my new code, at the bottom the # out part shows the supposed solution to my new problem, but I do not know how to implement it, I tried placing it in different places but it did not work. Also, I am unsure what the 'your_element' part is referring to, this solution was proposed in this thread: StaleElementReferenceException on Python Selenium
# -*- coding: utf-8 -*-
# from typing_extensions import Self
from lib2to3.pgen2 import driver
import scrapy
from scrapy.selector import Selector
# from scrapy_selenium import SeleniumRequest
from time import sleep
from turtle import delay
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.chrome.options import Options
from shutil import which
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException
# # cegek = "1000 Út Kft."
class HtSpiderSeleniumceg(scrapy.Spider):
name = 'kamara'
allowed_domains = ["wwww.ceginfo.hu/"]
start_urls = [
'https://www.ceginfo.hu'
]
global names_to_search
names_to_search = open("cegek.txt", "r").readlines()
def __init__(self):
chrome_options = Options()
# chrome_options.add_argument("--headless")
self.driver = webdriver.Chrome(executable_path="./chromedriver", options=chrome_options)
self.driver.get("https://ceginfo.hu/ceglista/cegek")
sleep(2)
self.start_urls = [self.driver.current_url]
sleep(2)
global this_driver
this_driver = self.driver.find_element_by_xpath("//input[@type='search']")
this_driver.send_keys(names_to_search[0])
sleep(2)
this_driver.send_keys(u'\ue007')
sleep(5)
def parse(self, response):
self.driver.get(response.url)
print('this_driver')
print(this_driver)
print('names_to_search')
print(names_to_search)
for names in names_to_search:
print('searching this names:')
print(names)
resp = Selector(text=self.driver.page_source)
sleep(5)
for ceg in resp.xpath("(//p[@class='mb-3 m-sm-0 meta d-flex flex-column flex-lg-row me-auto'])[1]"):
yield {
'cegnev': ceg.xpath("(//h2[contains(@class,'s-title heading')])[1]/text()").get(),
'adoszam': ceg.xpath("(.//span[@class='text-uppercase c-border me-lg-3'])[1]/text()").get(),
'cegjegy': ceg.xpath("(.//span[@class='c-border'])[1]/text()").get()
}
try:
print(this_driver)
this_driver.send_keys(names)
# driver.find_element_by_xpath("//input[@type='search']").send_keys(line)
sleep(2)
this_driver.send_keys(u'\ue007')
except:
print('exception - do not break')
self.driver.close()
# my_element_id = "(//p[@class='mb-3 m-sm-0 meta d-flex flex-column flex-lg-row me-auto'])[1]"
# ignored_exceptions=(NoSuchElementException,StaleElementReferenceException,)
# your_element = WebDriverWait(self.driver, 20,ignored_exceptions=ignored_exceptions)\
# .until(expected_conditions.presence_of_element_located((By.XPATH, my_element_id)))
Upvotes: 0
Views: 441
Reputation: 1526
I can't completely replicate your code without installing Selenium, web driver, etc, but this is how you would implement the solution.
Write a function to read names from cegek.txt and append to a list:
names_to_search = []
def get_names_to_search():
# open file to read
file = open ("cegek.txt", "r")
# read lines in file
lines = file.readlines()
# loop through file and append names to list
for line in lines:
names_to_search.append(line.strip())
# The names_to_search list will contain:
['SZIMIKRON Ipari Kft.', 'Tigra Computer- és Irodatechnikai Kft.', 'Tradeland Kft.', 'Török László EV Török Kulcsszervíz', 'Tungsram Operations Kft.', 'Tutti Élelmiszeripari Kft.', 'Water and Soil Kft.', 'Webkey Development Kft.', 'ZDMnet']
Loop through names_to_search
and pass each name to driver.find_element_by_xpath("//input[@type='search']").send_keys(name)
for name in names_to_search:
driver.find_element_by_xpath("//input[@type='search']").send_keys(name)
Upvotes: 1