MDB
MDB

Reputation: 360

Duplicates in output(csv) from web scraping

I tried to scrape a website with multiple pages (pages 1-48) and write the results to a CSV file, but the resulting CSV contains some duplicate rows. I do not know whether set() is applicable here. I am new to Python.

import requests
from bs4 import BeautifulSoup
import csv


def _field_text(soup, finder):
    """Apply *finder* to *soup* and return the element's stripped text.

    Returns the string "None" when any element in the lookup chain is
    missing (``find`` returns ``None``, so the chained access raises
    AttributeError). Network or other unexpected errors propagate.
    """
    try:
        return finder(soup).text.strip()
    except AttributeError:
        return "None"


# newline='' is required by the csv module; without it the writer emits
# blank rows on Windows. 'with' guarantees the file is closed even if a
# request fails partway through.
with open('Company_Info.csv', 'w', encoding='utf-8', newline='') as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(['COMPANY NAME', 'WEBSITE', 'ADDRESS', 'EMAIL'])

    # Pages 1-48 inclusive; the loop variable replaces the manual counter.
    for page in range(1, 49):
        res = requests.get('https://website.com/org?page=' + str(page) + '&sort=default')
        soup = BeautifulSoup(res.text, 'lxml')
        site = soup.select('.ol-Item_name>a', href=True)

        for link in site:
            # link['href'][15:] drops a fixed-length URL prefix —
            # assumes every href starts with the same 15 characters; TODO confirm.
            res = requests.get('https://website.com/org/' + link['href'][15:])
            detail = BeautifulSoup(res.text, 'lxml')

            company_name = _field_text(
                detail,
                lambda s: s.find('div', class_='op-About_body').find('h1', class_='op-About_name'))
            company_website = _field_text(
                detail,
                lambda s: s.find('div', class_='pl-3').find('section', class_='op-Section').find('a'))
            company_address = _field_text(
                detail,
                lambda s: s.find('div', class_='pl-3').find('h2', itemprop='address'))
            company_email = _field_text(
                detail,
                lambda s: s.find('span', itemprop='email'))

            csv_writer.writerow([company_name, company_website, company_address, company_email])

Upvotes: 0

Views: 109

Answers (1)

Rakesh
Rakesh

Reputation: 82755

This is one approach using a set.

Ex:

import requests
from bs4 import BeautifulSoup
import csv


def _field_text(soup, finder):
    """Apply *finder* to *soup* and return the element's stripped text.

    Returns the string "None" when any element in the lookup chain is
    missing (``find`` returns ``None``, so the chained access raises
    AttributeError). Network or other unexpected errors propagate.
    """
    try:
        return finder(soup).text.strip()
    except AttributeError:
        return "None"


# newline='' is required by the csv module; without it the writer emits
# blank rows on Windows. 'with' guarantees the file is closed even if a
# request fails partway through.
with open('Company_Info.csv', 'w', encoding='utf-8', newline='') as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(['COMPANY NAME', 'WEBSITE', 'ADDRESS', 'EMAIL'])

    # Rows already written; tuples are hashable, so set membership is O(1).
    seen = set()

    # Pages 1-48 inclusive; the loop variable replaces the manual counter.
    for page in range(1, 49):
        res = requests.get('https://website.com/org?page=' + str(page) + '&sort=default')
        soup = BeautifulSoup(res.text, 'lxml')
        site = soup.select('.ol-Item_name>a', href=True)

        for link in site:
            # link['href'][15:] drops a fixed-length URL prefix —
            # assumes every href starts with the same 15 characters; TODO confirm.
            res = requests.get('https://website.com/org/' + link['href'][15:])
            detail = BeautifulSoup(res.text, 'lxml')

            company_name = _field_text(
                detail,
                lambda s: s.find('div', class_='op-About_body').find('h1', class_='op-About_name'))
            company_website = _field_text(
                detail,
                lambda s: s.find('div', class_='pl-3').find('section', class_='op-Section').find('a'))
            company_address = _field_text(
                detail,
                lambda s: s.find('div', class_='pl-3').find('h2', itemprop='address'))
            company_email = _field_text(
                detail,
                lambda s: s.find('span', itemprop='email'))

            data = (company_name, company_website, company_address, company_email)
            # Write each unique record exactly once.
            if data not in seen:
                csv_writer.writerow(data)
                seen.add(data)

Upvotes: 1

Related Questions