Reputation: 43
from urllib.request import urlopen
from bs4 import BeautifulSoup
import csv
html = urlopen(
    "https://www.accessdata.fda.gov/scripts/drugshortages/default.cfm")
bsObj = BeautifulSoup(html, "lxml")

table = bsObj.find('table', id="cont")
rows = table.findAll("tr")

links = [a['href'] for a in table.find_all('a', href=True) if a.text]
new_links = []
for link in links:
    new_links.append(("https://www.accessdata.fda.gov/scripts/drugshortages/" + link).replace(" ", "%20"))

href_rows = []
for link in new_links:
    link = link.replace("®", "%C2%AE")
    html = urlopen(link)
    bsObj_href = BeautifulSoup(html, "lxml")
    #bsObj_href = BeautifulSoup(html.decode('utf-8', 'ignore'))
    div_href = bsObj_href.find("div", {"id": "accordion"})
    href_rows.append(div_href.findAll("tr"))

csvFile = open("drug_shortage.csv", 'wt', newline='')
writer = csv.writer(csvFile)
try:
    for row in rows:
        csvRow = []
        for cell in row.findAll(['td', 'th']):
            csvRow.append(cell.get_text())
        writer.writerow(csvRow)
finally:
    csvFile.close()
Hello, so I created the two sets of rows like that. If you go to this website https://www.accessdata.fda.gov/scripts/drugshortages/default.cfm, the table has a drug name column and a status column, and when you click a drug name you find four more columns. I would like to combine them (based on drug name), so each row would be drug name, status, Presentation, Availability and Estimated Shortage Duration, Related Information, Shortage Reason (per FDASIA). But the current code only generates the first two (drug name, status). I tried

for row in rows, rows_href:

but then I get AttributeError: ResultSet object has no attribute 'findAll'. I get the same error for

for row in rows_href:

Any suggestion for how to generate the output I want?
Upvotes: 1
Views: 94
Reputation: 142641
Your code is too chaotic.
You get all rows, then all links, and then you try to get all the other information, but this way you can't control which values get joined into a row. The biggest problem comes when some row has no data on its subpage: all your data shifts one row up.

You should get all rows from the table on the main page and then use a for-loop to work with every row separately: read the link only for this row, get the data from the subpage only for this row, etc., and put all the data for this row into a list as a sublist [name, status, link, presentation, availability, related, reason]. Only after that do you move on and work with the data for the next row.
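As for the AttributeError in the question: find_all()/findAll() returns a ResultSet, which behaves like a plain list of Tag objects, so it has no findAll of its own; you call findAll on each element, not on the whole list. And `for row in rows, rows_href:` loops over a tuple of two ResultSets, so each `row` is a whole ResultSet, which raises the same error. A minimal sketch of the difference:

from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen("https://www.accessdata.fda.gov/scripts/drugshortages/default.cfm")
soup = BeautifulSoup(html, "lxml")

rows = soup.find('table', {'id': 'cont'}).find_all('tr')  # ResultSet (list-like)
#rows.find_all('td')            # AttributeError: ResultSet has no attribute 'find_all'
cells = rows[1].find_all('td')  # works: find_all() on a single Tag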
BTW: because a subpage may have many rows, I create many rows in data with the same name and status but with different other values:

[name, status, values from first row on subpage]
[name, status, values from second row on subpage]
[name, status, values from third row on subpage]
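In other words, every main-page row fans out into one output row per subpage row. A toy illustration of that shape with made-up values (the drug name and columns here are hypothetical):

main = ["Amoxicillin", "Resolved"]                  # values from the main page
subpage = [["tablet", "available"],                 # values from the subpage rows
           ["oral suspension", "backordered"]]

data = [main + sub for sub in subpage]
# [['Amoxicillin', 'Resolved', 'tablet', 'available'],
#  ['Amoxicillin', 'Resolved', 'oral suspension', 'backordered']]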
from urllib.request import urlopen
from bs4 import BeautifulSoup
import csv

html = urlopen("https://www.accessdata.fda.gov/scripts/drugshortages/default.cfm")
bsObj = BeautifulSoup(html, "lxml")

# list for all rows with all values
data = []

# get table on main page
table = bsObj.find('table', {'id': 'cont'})

# work with every row separately
for row in table.find_all("tr")[1:]:  # use `[1:]` to skip header

    # get columns only in this row
    cols = row.find_all('td')

    # get name and url from first column
    link = cols[0].find('a', href=True)
    name = link.text.strip()
    url = link['href']
    url = "https://www.accessdata.fda.gov/scripts/drugshortages/" + url
    url = url.replace(" ", "%20").replace("®", "%C2%AE")
    print('name:', name)
    print('url:', url)

    # get status from second column
    status = cols[1].text.strip()
    print('status:', status)

    # subpage
    html = urlopen(url)
    bsObj_href = BeautifulSoup(html, "lxml")
    subtable = bsObj_href.find("table")

    if not subtable:
        data.append([name, status, url, '', '', '', ''])
        print('---')
    else:
        for subrow in subtable.find_all('tr')[1:]:  # use `[1:]` to skip header
            #print(subrow)
            subcols = subrow.find_all('td')
            presentation = subcols[0].text.strip()
            availability = subcols[1].text.strip()
            related = subcols[2].text.strip()
            reason = subcols[3].text.strip()
            data.append([name, status, url, presentation, availability, related, reason])
            print(presentation, availability, related, reason)
            print('---')

    print('----------')

with open("drug_shortage.csv", 'wt', newline='') as csvfile:
    writer = csv.writer(csvfile)
    # write header - one row - using `writerow` without `s` at the end
    #writer.writerow(['Name', 'Status', 'Link', 'Presentation', 'Availability', 'Related', 'Reason'])
    # write data - many rows - using `writerows` with `s` at the end
    writer.writerows(data)
# no need to close the file because `with` does it
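As a side note, the two chained .replace() calls only cover spaces and the ® character. urllib.parse.quote() can percent-encode every unsafe character in the href in one step; a sketch of that variant (assuming the hrefs are relative links whose /, ?, = and & must stay intact):

from urllib.parse import quote

BASE = "https://www.accessdata.fda.gov/scripts/drugshortages/"

def full_url(href):
    # percent-encode unsafe characters but keep the URL structure intact
    # (space becomes %20 and "®" becomes %C2%AE, same as the manual replaces)
    return BASE + quote(href, safe="/?=&")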
Upvotes: 2