Reputation: 37
I have read many posts about chaining URL requests, but I am still confused and can't fix this by myself. I am trying to extract some information from a web page and then follow each link (`<a href>`) on that page to a second page, where the further information I want is stored.
from bs4 import BeautifulSoup
import requests
from csv import reader, writer, DictWriter, DictReader
# Asker's attempt: fetch the food-group index page, walk the rows of its
# result table, and try to request a second page for each group.
# NOTE(review): the indentation of this snippet was lost, so the nesting of the
# statements below cannot be recovered from the text alone.
source = requests.get("http://www.bda-ieo.it/test/Group.aspx?Lan=Ita")
soup = BeautifulSoup(source.text, "html.parser")
# Holds the text of the most recent "testobold" row (used as the section title).
titolo_sezione = ""
table_row = ""
with open("genere.txt", "w", newline="") as txt_file:
headers = ["GRUPPO MERCEOLOGICO", "CODICE MERCEOLOGICO", "ALIMENTO"]
# Semicolon-delimited writer; only the header row is written. Nothing in this
# snippet calls csv_writer.writerow(), the results are only printed.
csv_writer = DictWriter(txt_file, fieldnames=headers, delimiter=';')
csv_writer.writeheader()
for table_row in soup.find("table", id="tblResult").find_all("tr"):
className = ""
# When the row has a class attribute, take the last entry of its class list.
if table_row.get("class"):
className = table_row.get("class").pop()
if className == "testobold":
titolo_sezione = table_row.text
if className == "testonormale":
for cds in table_row.find_all("td"):
# NOTE(review): Tag.get() reads an *attribute* of the <td>; it does not find a
# child <a> tag, so this is presumably None for every cell.
url = cds.get("a")
# NOTE(review): "+ url" sits inside the string literal, so the `url` variable is
# never used and the same literal address is requested on every iteration.
urls = requests.get("http://www.bda-ieo.it/test/Groupfood.aspx?Lan=Ita + url")
dage = BeautifulSoup(urls.text, "html.parser")
alimenti = ""
# NOTE(review): this iterates the parsed document object itself rather than the
# rows of a table found inside it.
for alimenti in dage:
# Unpacking requires find_all("td") to return exactly two cells.
id_alimento, destra = alimenti.find_all("td")
codice = id_alimento.text
nome = destra.text
href = destra.a.get("href")
# NOTE(review): `nome` was assigned from destra.text above, so `nome.text`
# looks like an attribute access on a string - confirm.
print(f'{titolo_sezione}; {id_alimento.text}; {nome.text}')
The request stored in the variable `urls` doesn't open any further page. Can somebody help me understand why? I am stuck on that.
Thank you, Mass
Upvotes: 0
Views: 21
Reputation: 28630
You need to re-work some of the logic in there, as well as read up a bit about string formatting. I made notes of where I made changes, and I'm not sure what exactly you are looking for as an output, but this may get you going.
from bs4 import BeautifulSoup
import requests
from csv import reader, writer, DictWriter, DictReader
from urllib.parse import urljoin

# Scrape every food group listed on the index page, follow each group's link,
# and record one "group; code; food name" line per food found there.

# Index page listing the food groups; the links on it are relative to this URL.
GROUP_INDEX_URL = "http://www.bda-ieo.it/test/Group.aspx?Lan=Ita"
# Seconds to wait on each HTTP request so a stalled server cannot hang the script.
REQUEST_TIMEOUT = 30

source = requests.get(GROUP_INDEX_URL, timeout=REQUEST_TIMEOUT)
soup = BeautifulSoup(source.text, "html.parser")

headers = ["GRUPPO MERCEOLOGICO", "CODICE MERCEOLOGICO", "ALIMENTO"]
titolo_sezione = ""

# Guard against the index page not containing the result table at all.
group_table = soup.find("table", id="tblResult")
group_rows = group_table.find_all("tr") if group_table else []

with open("c:/test/genere.txt", "w", newline="") as txt_file:
    csv_writer = DictWriter(txt_file, fieldnames=headers, delimiter=';')
    csv_writer.writeheader()

    for table_row in group_rows:
        # `class` comes back as a list; read its last entry without popping it,
        # so the parsed tag is left untouched.
        classes = table_row.get("class") or [""]
        className = classes[-1]

        if className == "testobold":
            # A bold row starts a new section: remember its title.
            titolo_sezione = table_row.text.strip()

        if className == "testonormale":
            # The hrefs are on the <a> tags inside the <td> cells, so search
            # for <a> tags that actually have an href.
            for cds in table_row.find_all("a", href=True):
                # Build the second-level URL from the stored href; urljoin also
                # copes with absolute or root-relative links.
                url = urljoin(GROUP_INDEX_URL, cds["href"])
                urls = requests.get(url, timeout=REQUEST_TIMEOUT)
                # Parse the response of the group page.
                dage = BeautifulSoup(urls.text, "html.parser")
                # Find the result table in this second page.
                dageTbl = dage.find("table", id="tblResult")
                if not dageTbl:
                    continue

                # Only rows with the "testonormale" class describe a food.
                for alimenti in dageTbl.find_all("tr", {"class": "testonormale"}):
                    cells = alimenti.find_all("td")
                    if len(cells) != 2:
                        # Not a "code | name" row; skip it instead of failing
                        # on the unpacking below.
                        continue
                    id_alimento, destra = cells
                    codice = id_alimento.text.strip()
                    nome = destra.text.strip()  # strip() removes surrounding whitespace

                    # Store the row in the file as well as printing it; the
                    # column names match the header written above.
                    csv_writer.writerow({
                        "GRUPPO MERCEOLOGICO": titolo_sezione,
                        "CODICE MERCEOLOGICO": codice,
                        "ALIMENTO": nome,
                    })
                    print(f'{titolo_sezione}; {codice}; {nome}')
Output:
PATATE; 381; PATATE
PATATE; 50399; PATATE DOLCI
PATATE; 380; PATATE NOVELLE
PATATE; 3002; PATATE, FECOLA
PATATE; 100219; PATATE, POLVERE ISTANTANEA
PATATE; 382; PATATINE IN SACCHETTO
PATATE; 18; TAPIOCA
VEGETALI; 303; ASPARAGI DI BOSCO
VEGETALI; 304; ASPARAGI DI CAMPO
VEGETALI; 305; ASPARAGI DI SERRA
VEGETALI; 700484; ASPARAGI IN SCATOLA
VEGETALI; 8035; GERMOGLI DI ERBA MEDICA
...
Upvotes: 1