Reputation: 29
I am working on a scraping project using selenium in python. My goal is to be able to make one .csv file with data from all of the different profiles that I scrape. Currently, my script will scrape one profile, then move on to the next, but it will not remember the data from the first profile after it moves on. Please let me know how I can fix this, and get all the data from every profile into my csv file. Thank you!
Here is my current code:
"""Scrape BCSP directory profiles with Selenium and write every profile to one CSV.

Bug fixed: the original reopened data.csv in 'w' (truncate) mode inside the
per-profile loop, so each profile overwrote the previous one, and it wrote the
same pre-joined string into every column. Rows are now collected in a list and
written exactly once, as proper field lists, after all profiles are visited.
"""
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from time import sleep
from selenium.common.exceptions import NoSuchElementException
import csv

driver = webdriver.Chrome("/Users/nzalle/Downloads/chromedriver")
driver.get("https://directory.bcsp.org/")

count = int(input("Number of Pages to Scrape: "))
body = driver.find_element_by_xpath("//body")
profile_count = driver.find_elements_by_xpath("//div[@align='right']/a")

# Keep scrolling to the bottom until enough profile links have lazy-loaded.
while len(profile_count) < count:
    body.send_keys(Keys.END)
    sleep(1)
    profile_count = driver.find_elements_by_xpath("//div[@align='right']/a")


def scrape_field(xpath):
    """Return the text of the element at *xpath*, or "N/A" if it is absent."""
    try:
        return driver.find_element_by_xpath(xpath).text
    except NoSuchElementException:
        return "N/A"


rows = []  # one field-list per profile; persisted in a single pass at the end
for link in profile_count:
    href = link.get_attribute('href')
    driver.execute_script("window.open('');")           # open a new tab
    driver.switch_to.window(driver.window_handles[1])   # focus the new tab
    driver.get(href)

    # Scrape Code
    name = scrape_field('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[1]/div[2]/div')
    issued_by = "Board of Certified Safety Professionals"
    certification_number = scrape_field('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[1]/td[3]/div[2]')
    certified_since = scrape_field('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[3]/td[1]/div[2]')
    recertification_cycle_or_expiration = scrape_field('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[3]/td[3]/div[2]')
    accredited_by = scrape_field('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[5]/td[3]/div[2]/a')
    expires = scrape_field('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[5]/td[1]/div[2]')

    rows.append([name, issued_by, certification_number, certified_since,
                 recertification_cycle_or_expiration, expires, accredited_by])

    driver.close()                                      # close the profile tab
    driver.switch_to.window(driver.window_handles[0])   # back to the listing tab

# Write the header and every collected row exactly once.
with open('data.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["Name", "Issued By", "Certification Number", "Certified Since",
                     "Recertification Cycle/Expiration", "Expires", "Accredited By"])
    writer.writerows(rows)

driver.close()
Upvotes: 0
Views: 46
Reputation: 3856
Here, `labels` holds all the column names you can expect in your csv. You can just go through the websites, update `curr_data`, and then append it to `data` — so `curr_data` is a dict and `data` is a list of dict(s). After that, you can use pandas to create a data frame out of it and then export that to csv.
# Every column name expected in the output csv.
labels = ["Name", "Issued By", "Certification Number", "Certified Since", "Recertification Cycle/Expiration",
          "Expires", "Accredited By"]
websites = ['first', 'second', 'third']

data = []  # one dict per website, accumulated across the whole run
for current_site in websites:
    # Start each record with its site plus every label pre-filled empty.
    curr_data = {'site': current_site}
    curr_data.update((column, '') for column in labels)
    # update data for your websites here
    # Eg :
    # curr_data["Name"] = 'Kuldeep'
    data.append(curr_data)

import pandas as pd

# A list of uniform dicts converts directly into a DataFrame, one row each.
df = pd.DataFrame(data)
csv_location = 'myfile.csv'
df.to_csv(csv_location)
Upvotes: 1