Reputation: 75
This is my first time using BeautifulSoup
and I am attempting to scrap store location data from a local convenience store.
However I'm running into some issues on trying to remove empty lines when data is being passed into a CSV file, I've tried .replace('\n','')
and .strip()
both did not worked.
Also I'm having problems with splitting data that is scraped and contained in the same sibling method.
I've added the script below:
from bs4 import BeautifulSoup
from requests import get
import urllib.request
import sched, time
import csv
url = 'http://www.cheers.com.sg/web/store_location.jsp'
response = get(url)
soup = BeautifulSoup(response.text, 'html.parser')
#print (soup.prettify())
#open a file for writing
location_data = open('data/soupdata.csv', 'w', newline='')
#create the csv writer object
csvwriter = csv.writer(location_data)
cheers = soup.find('div' , id="store_container")
count = 0
#Loop for Header tags
for paragraph in cheers.find_all('b'):
header1 = paragraph.text.replace(':' , '')
header2 = paragraph.find_next('b').text.replace(':' , '')
header3 = paragraph.find_next_siblings('b')[1].text.replace(':' , '')
if count == 0:
csvwriter.writerow([header1, header2, header3])
count += 1
break
for paragraph in cheers.find_all('br'):
brnext = paragraph.next_sibling.strip()
brnext1 = paragraph.next_sibling
test1 = brnext1.next_sibling.next_sibling
print(test1)
csvwriter.writerow([brnext, test1])
location_data.close()
Sample of output generated:
Sample of what output should look like:
How can I achieve this?
Thanks in advance.
Upvotes: 2
Views: 1454
Reputation: 22440
To make it slightly organized, you can try like the following. I've used .select() instead of .find_all().
import csv
from bs4 import BeautifulSoup
import requests
url = 'http://www.cheers.com.sg/web/store_location.jsp'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
with open("output.csv","w",newline="") as infile:
writer = csv.writer(infile)
writer.writerow(["Address","Telephone","Store hours"])
for items in soup.select("#store_container .store_col"):
addr = items.select_one("b").next_sibling.next_sibling
tel = items.select_one("b:nth-of-type(2)").next_sibling
store = items.select_one("b:nth-of-type(3)").next_sibling
writer.writerow([addr,tel,store])
Upvotes: 1
Reputation: 12840
You just need to change the way of extracting address, tel and store hours
import csv
from bs4 import BeautifulSoup
from requests import get
url = 'http://www.cheers.com.sg/web/store_location.jsp'
response = get(url)
soup = BeautifulSoup(response.text, 'html.parser')
# print (soup.prettify())
# open a file for writing
location_data = open('data/soupdata.csv', 'w', newline='')
# create the csv writer object
csvwriter = csv.writer(location_data)
cheers = soup.find('div', id="store_container")
count = 0
# Loop for Header tags
for paragraph in cheers.find_all('b'):
header1 = paragraph.text.replace(':', '')
header2 = paragraph.find_next('b').text.replace(':', '')
header3 = paragraph.find_next_siblings('b')[1].text.replace(':', '')
if count == 0:
csvwriter.writerow([header1, header2, header3])
count += 1
break
for paragraph in cheers.find_all('div'):
label = paragraph.find_all('b')
if len(label) == 3:
print(label)
address = label[0].next_sibling.next_sibling
tel = label[1].next_sibling
hours = label[2].next_sibling
csvwriter.writerow([address, tel, hours])
location_data.close()
Upvotes: 0