How to scrape company names from a text file one by one, my code only scrapes the last company name on the list

Question

I am trying to scrape company information from a company database. I have a list of companies in a text file, which I would want selenium to enter into the website's search and scrape the needed info one by one.

My problem is that for some reason it only enters the last name on the list for some reason. How would I be able to tell python to scrape the first company name on the list, then the next one and so on?

My code is the following:

# -*- coding: utf-8 -*-
# from typing_extensions import Self
from lib2to3.pgen2 import driver
import scrapy
from scrapy.selector import Selector
# from scrapy_selenium import SeleniumRequest
from time import sleep
from turtle import delay
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from shutil import which

count = 0
file = open ("cegek.txt", "r")
lines = file.readlines()
for line in lines:
    count += 1

# # cegek = "1000 Út Kft."

class HtSpiderSeleniumceg(scrapy.Spider):
    name = 'kamara'
    allowed_domains = ["wwww.ceginfo.hu/"]
    start_urls = [
        'https://www.ceginfo.hu'
    ]


    def __init__(self):
        chrome_options = Options()
        # chrome_options.add_argument("--headless")

        #get login page

        
        driver = webdriver.Chrome(executable_path="./chromedriver", options=chrome_options)
        driver.get("https://www.ceginfo.hu/")

        driver.find_element_by_xpath("//input[@type='search']").send_keys(line)
        sleep(2)
        driver.find_element_by_xpath("//input[@type='search']").send_keys(u'\ue007')
        
        self.html = driver.page_source
        driver.close()

 #scrape needed info
    def parse(self, response):
        resp = Selector(text=self.html)
        for ceg in resp.xpath("(//div[contains(@class, 'd-flex flex-column flex-sm-row justify-content-between align-items-center')])[1]"):
            yield {
                'cegnev': ceg.xpath("(//h2[contains(@class,'s-title heading')])[1]/text()").get(),
                'adoszam': ceg.xpath("(.//span[@class='text-uppercase c-border me-lg-3'])[1]/text()").get(),
                'cegjegy': ceg.xpath("(.//span[@class='c-border'])[1]/text()").get()
            }

This is the exact format the company names list is in:

SZIMIKRON Ipari Kft.
Tigra Computer- és Irodatechnikai Kft.
Tradeland Kft.
Török László EV Török Kulcsszervíz
Tungsram Operations Kft.
Tutti Élelmiszeripari Kft.
Water and Soil Kft.
Webkey Development Kft.
ZDMnet

With some help, now the first name from the list is searched, but the spider does not scrape and breaks due to the error:

selenium.common.exceptions.StaleElementReferenceException: Message: stale element reference: element is not attached to the page document
  (Session info: chrome=100.0.4896.60)

This is my new code, at the bottom the # out part shows the supposed solution to my new problem, but I do not know how to implement it, I tried placing it in different places but it did not work. Also, I am unsure what the 'your_element' part is referring to, this solution was proposed in this thread: StaleElementReferenceException on Python Selenium

# -*- coding: utf-8 -*-
# from typing_extensions import Self
from lib2to3.pgen2 import driver
import scrapy
from scrapy.selector import Selector
# from scrapy_selenium import SeleniumRequest
from time import sleep
from turtle import delay
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.chrome.options import Options
from shutil import which
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException

# # cegek = "1000 Út Kft."

class HtSpiderSeleniumceg(scrapy.Spider):

    name = 'kamara'
    allowed_domains = ["wwww.ceginfo.hu/"]
    start_urls = [
        'https://www.ceginfo.hu'
    ]

    global names_to_search
    names_to_search = open("cegek.txt", "r").readlines()

    def __init__(self):
        chrome_options = Options()
        # chrome_options.add_argument("--headless")

        self.driver = webdriver.Chrome(executable_path="./chromedriver", options=chrome_options)
        self.driver.get("https://ceginfo.hu/ceglista/cegek")
        sleep(2)
        
        self.start_urls = [self.driver.current_url]
        sleep(2)

        global this_driver
        this_driver = self.driver.find_element_by_xpath("//input[@type='search']")
        this_driver.send_keys(names_to_search[0])
        sleep(2)
        this_driver.send_keys(u'\ue007')
        sleep(5)


    def parse(self, response):
        self.driver.get(response.url)
        print('this_driver')
        print(this_driver)
        print('names_to_search')
        print(names_to_search)
        
        for names in names_to_search:
            print('searching this names:')
            print(names)
            resp = Selector(text=self.driver.page_source)
            sleep(5)
            for ceg in resp.xpath("(//p[@class='mb-3 m-sm-0 meta d-flex flex-column flex-lg-row me-auto'])[1]"):
                yield {
                    'cegnev': ceg.xpath("(//h2[contains(@class,'s-title heading')])[1]/text()").get(),
                    'adoszam': ceg.xpath("(.//span[@class='text-uppercase c-border me-lg-3'])[1]/text()").get(),
                    'cegjegy': ceg.xpath("(.//span[@class='c-border'])[1]/text()").get()
                }
            try:
                print(this_driver)
                this_driver.send_keys(names)
                # driver.find_element_by_xpath("//input[@type='search']").send_keys(line)
                sleep(2)
                this_driver.send_keys(u'\ue007')
            except:
                print('exception - do not break')
        self.driver.close()



        # my_element_id = "(//p[@class='mb-3 m-sm-0 meta d-flex flex-column flex-lg-row me-auto'])[1]"
        # ignored_exceptions=(NoSuchElementException,StaleElementReferenceException,)
        # your_element = WebDriverWait(self.driver, 20,ignored_exceptions=ignored_exceptions)\
        #                         .until(expected_conditions.presence_of_element_located((By.XPATH, my_element_id)))

Captain Caveman · Accepted Answer

I can't completely replicate your code without installing Selenium, web driver, etc, but this is how you would implement the solution.

Write a function to read names from cegek.txt and append to a list:

names_to_search = []

def get_names_to_search():
    # open file to read
    file = open ("cegek.txt", "r")
    # read lines in file
    lines = file.readlines()
    # loop through file and append names to list
    for line in lines:
        names_to_search.append(line.strip())

# The names_to_search list will contain:

['SZIMIKRON Ipari Kft.', 'Tigra Computer- és Irodatechnikai Kft.', 'Tradeland Kft.', 'Török László EV Török Kulcsszervíz', 'Tungsram Operations Kft.', 'Tutti Élelmiszeripari Kft.', 'Water and Soil Kft.', 'Webkey Development Kft.', 'ZDMnet']

Loop through names_to_search and pass each name to driver.find_element_by_xpath("//input[@type='search']").send_keys(name)

for name in names_to_search:
    driver.find_element_by_xpath("//input[@type='search']").send_keys(name)

How to scrape company names from a text file one by one, my code only scrapes the last company name on the list

Answers (1)

Related Questions