Reputation: 306
I have a webpage that displays some products. This webpage has around 50 products, and when I click on "load more", more products are displayed. I want to extract information for all of them, and I have written code for this. The problem, however, is that the program proceeds with retrieving information without waiting for the button to be clicked. I have tried changing the time.sleep values to very high values, but to no avail. Is there some other expression I could include to make the rest of the code wait until the button has been clicked?
from selenium import webdriver
import time
from bs4 import BeautifulSoup
import requests
from selenium.webdriver.support.wait import WebDriverWait
import selenium.webdriver.support.expected_conditions as ec
from selenium.webdriver.common.by import By
import xlsxwriter
driver = webdriver.Chrome(executable_path=r"C:\Users\Home\Desktop\chromedriver.exe")
driver.get("https://justnebulizers.com/collections/nebulizer-accessories")
soup = BeautifulSoup(driver.page_source, 'html.parser')
time.sleep(5)
#wait = WebDriverWait(driver, 10)
#wait.until(ec.element_to_be_clickable((By.XPATH,"//a[@class='load-more__btn action_button continue-button']")))
button= driver.find_element_by_xpath(("//a[@class='load-more__btn action_button continue-button']"))
button.click()
#wait.until(ec.invisibility_of_element_located((By.XPATH,"//a[@class='load-more__btn action_button continue-button']")))
time.sleep(10)
#WebDriverWait(driver, 10).until(ec.invisibility_of_element_located((By.XPATH, "//a[@class='load-more__btn action_button continue-button']")))
def cpap_spider(url):
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    for link in soup.findAll("a", {"class": "product-info__caption"}):
        href = "https://www.justnebulizers.com" + link.get("href")
        #title = link.string
        each_item(href)
        print(href)
        #print(title)

def each_item(item_url):
    global cols_names, row_i
    source_code = requests.get(item_url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, 'html.parser')
    table = soup.find("table", {"class": "tab_table"})
    if table:
        table_rows = table.find_all('tr')
    else:
        row_i += 1
        return
    for row in table_rows:
        cols = row.find_all('td')
        for ele in range(0, len(cols)):
            temp = cols[ele].text.strip()
            if temp:
                # Here if you want then you can remove unwanted characters like : from temp
                # For example "Actual Weight:" -> "Actual Weight"
                if temp[-1:] == ":":
                    temp = temp[:-1]
                # Name of column
                if ele == 0:
                    try:
                        cols_names_i = cols_names.index(temp)
                    except:
                        cols_names.append(temp)
                        cols_names_i = len(cols_names) - 1
                        worksheet.write(0, cols_names_i + 1, temp)
                    continue
                worksheet.write(row_i, cols_names_i + 1, temp)
    row_i += 1
cols_names=[]
cols_names_i = 0
row_i = 1
workbook = xlsxwriter.Workbook('respiratory_care.xlsx')
worksheet = workbook.add_worksheet()
worksheet.write(0, 0, "href")
cpap_spider("https://justnebulizers.com/collections/nebulizer-accessories")
#each_item("https://www.1800cpap.com/viva-nasal-cpap-mask-by-3b-medical")
workbook.close()
Upvotes: 0
Views: 44
Reputation: 394
The code is working just fine but you need to soup the source again with...
soup = BeautifulSoup(driver.page_source, 'html.parser')
...after you click the button to get the new items. I think that is why it looks like it is running without waiting.
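For example, a minimal sketch that reuses the XPath and class names from your question:

# Click "load more", wait, then RE-CREATE the soup so it contains the newly loaded products.
button = driver.find_element_by_xpath("//a[@class='load-more__btn action_button continue-button']")
button.click()
time.sleep(10)
soup = BeautifulSoup(driver.page_source, 'html.parser')
for link in soup.findAll("a", {"class": "product-info__caption"}):
    print("https://www.justnebulizers.com" + link.get("href"))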
Selenium also provides explicit wait methods you can use to ensure a condition is met before proceeding: https://selenium-python.readthedocs.io/waits.html#explicit-waits
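For instance, an explicit wait on the same button looks roughly like this (a sketch built from the imports already present in your code):

from selenium.webdriver.support.wait import WebDriverWait
import selenium.webdriver.support.expected_conditions as ec
from selenium.webdriver.common.by import By

wait = WebDriverWait(driver, 10)
# Wait until the "load more" button is present and clickable, then click it.
button = wait.until(ec.element_to_be_clickable(
    (By.XPATH, "//a[@class='load-more__btn action_button continue-button']")))
button.click()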
Also you may want to try Scrapy for crawling: https://pypi.org/project/Scrapy/
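If you try Scrapy, a bare-bones spider for the same collection page could look like this (a sketch only; note that Scrapy by itself does not execute the JavaScript behind the "load more" button, so it will only see the products in the initial HTML):

import scrapy

class NebulizerSpider(scrapy.Spider):
    name = "nebulizers"
    start_urls = ["https://justnebulizers.com/collections/nebulizer-accessories"]

    def parse(self, response):
        # Same product links your code extracts with BeautifulSoup.
        for href in response.css("a.product-info__caption::attr(href)").getall():
            yield {"href": response.urljoin(href)}

You can run it with something like scrapy runspider nebulizer_spider.py -o products.json.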
Update: Try this:
Replace
soup = BeautifulSoup(driver.page_source, 'html.parser')
with
soup = BeautifulSoup(driver.find_element_by_tag_name('html').get_attribute('innerHTML'), 'html.parser')
Credits: https://stackoverflow.com/a/43565160/4289062
Upvotes: 1