Reputation: 360
I tried to scrape a website with multiple pages (pages 1–48) and write the output to a CSV file, but the CSV that is created contains some duplicate rows. I do not know whether set() is applicable in this case. I am new to Python.
import requests
from bs4 import BeautifulSoup
import csv


def _find_text(soup, finder):
    """Apply *finder* to *soup* and return the stripped tag text.

    Returns the string "None" when any tag in the chain is missing
    (a missing tag makes ``.find`` return ``None``, so the chained
    access raises ``AttributeError``).
    """
    try:
        return finder(soup).text.strip()
    except AttributeError:
        return "None"


# newline='' is the csv-module requirement (avoids blank lines on Windows);
# the 'with' block guarantees the file is closed even if a request fails.
with open('Company_Info.csv', 'w', encoding='utf-8', newline='') as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(['COMPANY NAME', 'WEBSITE', 'ADDRESS', 'EMAIL'])
    seen = set()  # rows already written — drops the duplicate companies
    for page in range(1, 49):  # listing pages 1..48
        res = requests.get('https://website.com/org?page=' + str(page) + '&sort=default')
        soup = BeautifulSoup(res.text, 'lxml')
        for link in soup.select('.ol-Item_name>a', href=True):
            # [15:] strips the leading path prefix from the listing href.
            res = requests.get('https://website.com/org/' + link['href'][15:])
            detail = BeautifulSoup(res.text, 'lxml')
            row = (
                _find_text(detail, lambda s: s.find('div', class_='op-About_body').find('h1', class_='op-About_name')),
                _find_text(detail, lambda s: s.find('div', class_='pl-3').find('section', class_='op-Section').find('a')),
                _find_text(detail, lambda s: s.find('div', class_='pl-3').find('h2', itemprop='address')),
                _find_text(detail, lambda s: s.find('span', itemprop='email')),
            )
            if row not in seen:  # write each unique company exactly once
                seen.add(row)
                csv_writer.writerow(row)
Upvotes: 0
Views: 109
Reputation: 82755
This is one approach, using a set to skip rows that have already been written. Example:
import requests
from bs4 import BeautifulSoup
import csv

# newline='' is required by the csv module (prevents blank lines on Windows);
# the 'with' block closes the file even if a request raises mid-scrape.
with open('Company_Info.csv', 'w', encoding='utf-8', newline='') as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(['COMPANY NAME', 'WEBSITE', 'ADDRESS', 'EMAIL'])
    seen = set()  # tuples of rows already written, used to skip duplicates
    for page in range(1, 49):  # listing pages 1..48
        res = requests.get('https://website.com/org?page=' + str(page) + '&sort=default')
        soup = BeautifulSoup(res.text, 'lxml')
        for b in soup.select('.ol-Item_name>a', href=True):
            # [15:] strips the leading path prefix from the listing href.
            res = requests.get('https://website.com/org/' + b['href'][15:])
            soup1 = BeautifulSoup(res.text, 'lxml')
            # A missing tag makes .find() return None, so the chained access
            # raises AttributeError — catch exactly that, not Exception.
            try:
                company_name = soup1.find('div', class_='op-About_body').find('h1', class_='op-About_name').text.strip()
            except AttributeError:
                company_name = "None"
            try:
                company_website = soup1.find('div', class_='pl-3').find('section', class_='op-Section').find('a').text.strip()
            except AttributeError:
                company_website = "None"
            try:
                company_address = soup1.find('div', class_='pl-3').find('h2', itemprop='address').text.strip()
            except AttributeError:
                company_address = "None"
            try:
                company_email = soup1.find('span', itemprop='email').text.strip()
            except AttributeError:
                company_email = "None"
            data = (company_name, company_website, company_address, company_email)
            if data not in seen:  # write each unique row exactly once
                csv_writer.writerow(data)
                seen.add(data)
Upvotes: 1