Reputation: 31
I'm a newbie with Python and BeautifulSoup. I would like to scrape multiple pages into a CSV, but when I try to store these 3 links, only the last one ends up in the CSV.
How can I fix my issue?
## importing bs4, requests, fake_useragent and csv modules
from bs4 import BeautifulSoup
import requests
from fake_useragent import UserAgent
import csv

## create a list with the URLs
urls = [
    'https://www.scansante.fr/applications/casemix_ghm_cmd/submit?snatnav=&typrgp=etab&annee=2019&type=ghm&base=0&typreg=noreg2016&noreg=99&finess=750300360&editable_length=10',
    'https://www.scansante.fr/applications/casemix_ghm_cmd/submit?snatnav=&typrgp=etab&annee=2019&type=ghm&base=0&typreg=noreg2016&noreg=99&finess=030780118&editable_length=10',
    'https://www.scansante.fr/applications/casemix_ghm_cmd/submit?snatnav=&typrgp=etab&annee=2019&type=ghm&base=0&typreg=noreg2016&noreg=99&finess=620103432&editable_length=10'
]

## initializing the UserAgent object
user_agent = UserAgent()

## starting the loop
for url in urls:
    ## getting the response from the page using the get method of the requests module
    page = requests.get(url, headers={"user-agent": user_agent.chrome})
    ## storing the content of the page in a variable
    html = page.content
    ## creating a BeautifulSoup object
    soup = BeautifulSoup(html, "html.parser")
    table = soup.findAll("table", {"class": "table"})[0]
    rows = table.findAll("tr")

with open("test.csv", "wt+", newline="") as f:
    writer = csv.writer(f)
    for row in rows:
        csv_row = []
        for cell in row.findAll(["td", "th"]):
            csv_row.append(cell.get_text())
        writer.writerow(csv_row)
Thanks a lot!
Upvotes: 2
Views: 75
Reputation: 4482
To simplify reading the rows, you could also give pandas a shot:
import requests
from bs4 import BeautifulSoup
import pandas as pd

urls = [
    'https://www.scansante.fr/applications/casemix_ghm_cmd/submit?snatnav=&typrgp=etab&annee=2019&type=ghm&base=0&typreg=noreg2016&noreg=99&finess=750300360&editable_length=10',
    'https://www.scansante.fr/applications/casemix_ghm_cmd/submit?snatnav=&typrgp=etab&annee=2019&type=ghm&base=0&typreg=noreg2016&noreg=99&finess=030780118&editable_length=10',
    'https://www.scansante.fr/applications/casemix_ghm_cmd/submit?snatnav=&typrgp=etab&annee=2019&type=ghm&base=0&typreg=noreg2016&noreg=99&finess=620103432&editable_length=10'
]

headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:77.0) Gecko/20100101 Firefox/77.0'}

all_data = []
for url in urls:
    page = requests.get(url, headers=headers)
    soup = BeautifulSoup(page.content, "html.parser")
    table = soup.findAll("table", {"class": "table"})[0]
    df_table = pd.read_html(str(table))[0]
    # add a column with additional info
    df_table['hit'] = soup.find("span", {"class": "c"}).text.strip()
    # store the table in a list of tables
    all_data.append(df_table)

# concat the tables and export them to csv
pd.concat(all_data).to_csv('test.csv', index=False)
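One small caveat if you run this on a recent pandas (2.1 or later, if I remember right): passing a literal HTML string to pd.read_html is deprecated there, and it emits a FutureWarning. A minimal sketch of the workaround, wrapping the string in a StringIO:

from io import StringIO

# newer pandas versions warn on literal HTML strings;
# wrapping the markup in StringIO keeps the call future-proof
df_table = pd.read_html(StringIO(str(table)))[0]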
Upvotes: 2
Reputation: 195438
In your code, the rows variable is overwritten on every loop iteration and never accumulated anywhere, so you only write the values from your last URL to the CSV file. This example will write the values from all three URLs:
import csv
import requests
from bs4 import BeautifulSoup

urls = [
    'https://www.scansante.fr/applications/casemix_ghm_cmd/submit?snatnav=&typrgp=etab&annee=2019&type=ghm&base=0&typreg=noreg2016&noreg=99&finess=750300360&editable_length=10',
    'https://www.scansante.fr/applications/casemix_ghm_cmd/submit?snatnav=&typrgp=etab&annee=2019&type=ghm&base=0&typreg=noreg2016&noreg=99&finess=030780118&editable_length=10',
    'https://www.scansante.fr/applications/casemix_ghm_cmd/submit?snatnav=&typrgp=etab&annee=2019&type=ghm&base=0&typreg=noreg2016&noreg=99&finess=620103432&editable_length=10'
]

headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:77.0) Gecko/20100101 Firefox/77.0'}

all_data = []
for url in urls:
    page = requests.get(url, headers=headers)
    soup = BeautifulSoup(page.content, "html.parser")
    table = soup.findAll("table", {"class": "table"})[0]
    # here I store all rows to list `all_data`
    for row in table.findAll('tr'):
        tds = [cell.get_text(strip=True, separator=' ') for cell in row.findAll(["td", "th"])]
        all_data.append(tds)
        print(*tds)

# write list `all_data` to CSV
with open("test.csv", "wt+", newline="") as f:
    writer = csv.writer(f)
    for row in all_data:
        writer.writerow(row)
This writes test.csv with the rows from all three URLs (the original answer shows a LibreOffice screenshot of the result).
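As a side note, the final write-out loop can be collapsed: csv.writer has a writerows method that takes the whole list in one call, and plain "w" mode is enough when you only write. A minimal sketch:

# equivalent write-out in a single call; "w" mode suffices for writing
with open("test.csv", "w", newline="") as f:
    csv.writer(f).writerows(all_data)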
Upvotes: 1