Reputation:
I am using Python and Beautiful Soup to parse this web page: https://rpi.sodexomyway.com/dining-choices/res/sage.html. In the "On the Menu" section I want to get the URL of the first link.
Here is the code I am using:
monthly_urls = soup.findAll('div',{'id':'accordion_23473'})[0]('a',href=True)
Right now it is getting the second a tag, and I'm not sure why; I would think it would at least get both, but it's only getting the second one.
I would either like to change the code so it gets the first tag, or find a way to search by the text of the a tag and get the link that way.
For the second part I just meant that if, for example, the a tag were
<a>new tag</a>
I would want to be able to search for "new tag".
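Something like this rough sketch is what I am after (just to show the two options; soup is the BeautifulSoup object from the full code below, and I'm not sure find('a', text=...) is even the right call):
import re

# option 1: just take the first link in the accordion by index
links = soup.findAll('div', {'id': 'accordion_23473'})[0]('a', href=True)
first_url = links[0]['href']

# option 2: find the link by its text instead of its position
link = soup.find('a', text=re.compile('new tag'))
url_by_text = link['href']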
EDIT:
Full code below. I need the current week's link, so I either need to get the first link from the "On the Menu" section or search by date for that link.
# Created by Spencer Fontein on 5/28/14.
# Copyright (c) 2014 Spencer Fontein. All rights reserved.
# coding: utf-8
import pprint
from lxml import etree
import cgi
from bs4 import BeautifulSoup
import datetime
import urllib2
import cookielib
import re
#where to send the file at the end
output_path = ""#"/home/spencerf/public_html/rpi/"
def Get_website_text(url):
    # url for website
    base_url = url
    # file for storing cookies
    cookie_file = 'mfp.cookies'
    # set up a cookie jar to store cookies
    cj = cookielib.MozillaCookieJar(cookie_file)
    # set up opener to handle cookies, redirects etc
    opener = urllib2.build_opener(
        urllib2.HTTPRedirectHandler(),
        urllib2.HTTPHandler(debuglevel=0),
        urllib2.HTTPSHandler(debuglevel=0),
        urllib2.HTTPCookieProcessor(cj)
    )
    # pretend we're a web browser and not a python script
    opener.addheaders = [('User-agent',
                          ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_7) '
                           'AppleWebKit/535.1 (KHTML, like Gecko) '
                           'Chrome/13.0.782.13 Safari/535.1'))]
    # open the front page of the website to set
    # and save initial cookies
    response = opener.open(base_url)
    web_text = response.read()
    response.close()
    return web_text

# get union menus
def getUnionMenuUrls(soup):
    monthly_urls = soup.findAll('div',{'id':'accordion_23473'})[0]('a',href=True)
    #print soup.findAll('div',{'id':'accordion_23473'})[0]('a',href=True)
    #print soup.find(text=re.compile('9/22/2014 - 9/28/2014'))
    menu_urls = []
    url = "https://rpi.sodexomyway.com"
    for tag in monthly_urls:
        if ".htm" in tag['href']:
            # strip everything except a usable file name out of the link text
            name = str(tag.text)
            name = name.replace("Click ",'').replace('For ','').replace('Menu ','').replace('of ','').replace('Week ','').replace('Here ','').replace('Of ','')
            name = name.replace('January ','').replace('February ','').replace('March ','').replace('April ','').replace('May ','')
            name = name.replace('June ','').replace('July ','').replace('August ','').replace('September ','')
            name = name.replace('October ','').replace('November ','').replace('December ','')
            name = name.replace('1','').replace("2", '').replace("3", '').replace("4", '')
            name = name.replace('5','').replace("6", '').replace("7", '').replace("8", '')
            name = name.replace('9','').replace("0", '').replace('-','')
            name = name.replace('\n','rpi_russell_sage_menu').replace('/','')
            name = name.replace('!','').replace(', ','').replace(' ','').replace('College','')
            newurl = url + tag['href']
            menu_urls.append([name,newurl])
    return menu_urls

def get_xml(url):
    tag_stack = []
    output_lines = []
    # strip non-breaking-space entities before parsing
    html = urllib2.urlopen(url).read().replace('&nbsp;', "")
    xml = etree.HTML(html)
    open_tag(tag_stack, output_lines, "menu", "")
    days = xml.xpath('//td[@class="dayouter"]')
    # make the xml for each day
    for day in days:
        day_name = day.xpath('./a/@name')[0]
        safe_open_tag(tag_stack, output_lines, "day", "menu", day_name)
        dayinner_trs = day.xpath('.//table[@class="dayinner"]//tr')
        for dayinner_tr in dayinner_trs:
            # change meal
            if dayinner_tr.xpath('./td[@class="mealname"]'):
                meal_name = dayinner_tr.xpath('./td[@class="mealname"]/text()')[0]
                safe_open_tag(tag_stack, output_lines, "meal", "day", meal_name)
            # change counter
            if dayinner_tr.xpath('./td[@class="station"]/text()'):
                counter_name = dayinner_tr.xpath('./td[@class="station"]/text()')[0]
                safe_open_tag(tag_stack, output_lines, "counter", "meal", counter_name)
            # change dish
            if dayinner_tr.xpath('./td[@class="menuitem"]'):
                item_name = "".join(dayinner_tr.xpath('./td[@class="menuitem"]/div//text()')).strip()
                safe_open_tag(tag_stack, output_lines, "dish", "counter", "")
                output_lines.append("<name>%s</name>" % cgi.escape(item_name))
    close_tags(tag_stack, output_lines, "")
    output_string = '\n'.join([line.encode('utf-8') for line in output_lines])
    return output_string

# close the tags up to the parent of last tag in tag_stack
def close_tags(tag_stack, output_lines, parent_tag):
    while tag_stack and tag_stack[-1] != parent_tag:
        top = tag_stack.pop()
        output_lines.append(' ' * len(tag_stack) + '</%s>' % top)

# open the new_tag using the suitable style based on name_property
def open_tag(tag_stack, output_lines, new_tag, name_property):
    if name_property:
        output_lines.append(' ' * len(tag_stack) + '<%s name="%s">' % (new_tag, name_property))
    else:
        output_lines.append(' ' * len(tag_stack) + '<%s>' % new_tag)
    tag_stack.append(new_tag)

# check if the new_tag parent is in the stack, if not it'll add the parent
def safe_open_tag(tag_stack, output_lines, new_tag, parent_tag, name_property):
    if parent_tag not in tag_stack:
        output_lines.append(' ' * len(tag_stack) + '<%s>' % parent_tag)
        tag_stack.append(parent_tag)
    else:
        close_tags(tag_stack, output_lines, parent_tag)
    open_tag(tag_stack, output_lines, new_tag, name_property)
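
# quick illustration of what the three helpers above build (not part of the scraper,
# just for reference; the tag names here are made up):
#
#   stack, lines = [], []
#   open_tag(stack, lines, "menu", "")
#   safe_open_tag(stack, lines, "day", "menu", "monday")
#   safe_open_tag(stack, lines, "meal", "day", "breakfast")
#   safe_open_tag(stack, lines, "day", "menu", "tuesday")   # closes meal and day first
#   close_tags(stack, lines, "")
#
#   '\n'.join(lines) then gives:
#   <menu>
#    <day name="monday">
#     <meal name="breakfast">
#     </meal>
#    </day>
#    <day name="tuesday">
#    </day>
#   </menu>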
# sample use of get_xml function
if __name__ == "__main__":
    base_url_u = "https://rpi.sodexomyway.com/dining-choices/res/sage.html"
    htmltext_u = Get_website_text(base_url_u)
    soup_u = BeautifulSoup(htmltext_u)
    menu_url_list = getUnionMenuUrls(soup_u)
    for menu in menu_url_list:
        if '.htm' in menu[1]:
            ofname = str(menu[0].replace(" ","A")) + ".xml"
            output_file = output_path + ofname
            open(output_file, "w").write(get_xml(menu[1]))
        else:
            print menu[0], ":", menu[1], "is not valid html."
EDIT 2:
date function
def getCurrentWeekMenu(date1, date2):
    now = datetime.datetime.now()
    monthstr = "January,February,March,April,May,June,July,August,September,October,November,December"
    months = monthstr.split(',')
    d = dict(zip(months, range(1, 13)))
    menu_1_month = d[str(date1[0])]
    menu_2_month = d[str(date2[0])]
    # compare day numbers as integers, not strings
    menu_1_day = int(date1[1][:-2])
    menu_2_day = int(date2[1][:-2])
    if menu_1_day > menu_2_day:
        if now.day >= menu_1_day:
            menu = 1
        else:
            menu = 2
    else:
        if now.day >= menu_2_day:
            menu = 2
        elif now.month > menu_1_month:
            menu = 2
        else:
            menu = 1
    return menu - 1
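For example (the values here are just made up to show the input format, a month name plus a day string like "22nd"):
date1 = ["September", "22nd"]   # hypothetical example input
date2 = ["September", "29th"]
print getCurrentWeekMenu(date1, date2)   # prints 0 for the first menu link, 1 for the second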
Upvotes: 0
Views: 1110
Reputation: 71
I have no problems running your code:
from BeautifulSoup import BeautifulSoup
import requests
response = requests.get('https://rpi.sodexomyway.com/dining-choices/res/sage.html')
soup = BeautifulSoup(response.text)
#output of your code
print soup.findAll('div',{'id':'accordion_23473'})[0]('a',href=True)
>>> [<a href="#">On the Menu</a>,
<a href="/images/WeeklyMenuRSDH%209-22-14_tcm1068-29436.htm" target="_blank">
9/22/2014 - 9/28/2014</a>,
<a href="/images/WeeklyMenuRSDH%209-29-14_tcm1068-29441.htm" target="_blank">
9/29/2014 - 10/5/2014</a>,
<a href="#">Hours of Operation</a>]
# now get the href
url = dict(soup.findAll('div',{'id':'accordion_23473'})[0]('a',href=True)[1].attrs)['href']
# output
u'/images/WeeklyMenuRSDH%209-22-14_tcm1068-29436.htm'
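As a side note, the elements in that list are Tag objects, so you can also read the href by indexing the tag directly instead of building a dict from attrs:
links = soup.findAll('div',{'id':'accordion_23473'})[0]('a',href=True)
url = links[1]['href']
# u'/images/WeeklyMenuRSDH%209-22-14_tcm1068-29436.htm'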
Answer to the second part of the question:
import re
soup.find(text=re.compile('new tag'))
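That returns the matching text node; to get from there to the link itself, step up to the node's parent tag and read its href, for example:
text_node = soup.find(text=re.compile('9/22/2014'))
link = text_node.parent        # the <a> tag wrapping the matched text
print link['href']
# u'/images/WeeklyMenuRSDH%209-22-14_tcm1068-29436.htm'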
UPDATE - adding current week filter
def getUnionMenuUrls(soup):
    monthly_urls = soup.findAll('div',{'id':'accordion_23473'})[0]('a',href=True)[1:3] # cut extra links
    today = datetime.datetime.today() # get today's date
    url = "https://rpi.sodexomyway.com"
    for tag in monthly_urls:
        if ".htm" in tag['href']:
            name = str(tag.text)
            datestrings = name.split(' - ') # split string into the list of date strings
            # strip() guards against leading/trailing whitespace in the link text
            date_range = [datetime.datetime.strptime(d.strip(), '%m/%d/%Y') for d in datestrings] # convert datestrings to datetime objects
            if date_range[0] <= today <= date_range[1]: # check if today is in that range
                return url + tag['href']
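With this version the function returns a single URL (or None if today falls outside both date ranges), so your main block would shrink to something like this (the output file name is just an example):
if __name__ == "__main__":
    htmltext_u = Get_website_text("https://rpi.sodexomyway.com/dining-choices/res/sage.html")
    soup_u = BeautifulSoup(htmltext_u)
    current_url = getUnionMenuUrls(soup_u)
    if current_url:
        # "rpi_russell_sage_menu.xml" is an example name, use whatever you need
        open(output_path + "rpi_russell_sage_menu.xml", "w").write(get_xml(current_url))
    else:
        print "no menu link covers today's date"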
Upvotes: 1