Reputation: 21
My code below gets the street address for each gym, but there is an error in the spacing of the output for the hours that the gym is open. Any ideas of where I went wrong?
import urlparse
from bs4 import BeautifulSoup
from bs4 import Tag
import requests
import time
import csv
# Scrape every Planet Fitness gym page linked from the sitemap and write one
# CSV row per gym: page URL, the five address fields, then the club hours.
sitemap = 'https://www.planetfitness.com/sitemap'
sitemap_content = requests.get(sitemap).content
soup = BeautifulSoup(sitemap_content, 'html.parser')
atags = soup.select('td[class~=club-title] > a[href^="/gyms"]')
links = [atag.get('href') for atag in atags]

# CSS classes of the address <span>s, in the column order written to the CSV.
address_classes = [
    'address-line1',
    'locality',
    'administrative-area',
    'postal-code',
    'country',
]

with open('gyms.csv', 'w') as gf:
    gymwriter = csv.writer(gf)
    for link in links:
        gymurl = urlparse.urljoin(sitemap, link)
        sitemap_content = requests.get(gymurl).content
        soup = BeautifulSoup(sitemap_content, 'html.parser')
        gymrow = [gymurl]

        for address_class in address_classes:
            spans = soup.select(
                'p[class~=address] > span[class~=%s]' % address_class)
            # Write an empty cell instead of raising IndexError when a page
            # lacks one of the address parts, so the columns stay aligned.
            gymrow.append(spans[0].text if spans else '')

        # The hours live in the first tag that follows the
        # <strong>Club Hours</strong> label.
        hours = ''
        for strong in soup.select('div > strong'):
            if strong.text.strip() != 'Club Hours':
                continue
            for sibling in strong.next_siblings:
                if isinstance(sibling, Tag):
                    # .text keeps the markup's raw line breaks and padding,
                    # which is what garbles the spacing. stripped_strings
                    # yields each text fragment already trimmed, so joining
                    # them gives single-space-separated hours.
                    hours = ' '.join(sibling.stripped_strings)
                    break
            break  # only the first "Club Hours" label is used
        # Always append (possibly empty) so every row has the same width.
        gymrow.append(hours)

        print(gymrow)
        gymwriter.writerow(gymrow)
        time.sleep(3)  # pause between requests to avoid hammering the site
Thank you for your help!
Upvotes: 0
Views: 125
Reputation: 2226
This will give you a dict whose keys are the `itemprop` attributes and whose values are the address item text. I use Python 3 in Colab, so I changed some of the imports, but the last few lines of the loop should get you what you want:
import requests
import urllib
from bs4 import BeautifulSoup
# Fetch the sitemap and collect the href of the link in each club-title cell.
res = requests.get("https://www.planetfitness.com/sitemap").content
soup = BeautifulSoup(res, 'html.parser')
tds = soup.find_all('td', {'class': 'club-title'})
links = [td.find('a')['href'] for td in tds]
keywords = ['gyms']

# One dict per gym page, mapping each span's itemprop to its text.
gym_data = []
for link in links:
    # Only follow links whose href contains one of the keywords.
    if not any(keyword in link for keyword in keywords):
        continue
    req = urllib.parse.urljoin('https://www.planetfitness.com/', link)
    res = requests.get(req).content
    # Name the parser explicitly, matching the sitemap parse above.
    site = BeautifulSoup(res, 'html.parser')
    ps = site.find('p', class_='address')
    if ps is None:
        # find() returns None when the page has no address paragraph.
        continue
    address_dict = {p['itemprop']: p.text for p in ps.findAll('span')}
    # The original appended the undefined name `address` (NameError).
    gym_data.append(address_dict)
Output (you probably don't need all these keys):
[{'streetAddress': '5850 US Hwy 431', 'addressLocality': 'Albertville', 'addressRegion': 'AL', 'postalCode': '35950', 'addressCountry': 'United States'}
{'streetAddress': '987 Market Place', 'addressLocality': 'Alexander City', 'addressRegion': 'AL', 'postalCode': '35010', 'addressCountry': 'United States'}
{'streetAddress': '528 W Town Plaza', 'addressLocality': 'Bessemer', 'addressRegion': 'AL', 'postalCode': '35020', 'addressCountry': 'United States'}
{'streetAddress': '4500 Montevallo Rd', 'addressLocality': 'Birmingham', 'addressRegion': 'AL', 'postalCode': '35210', 'addressCountry': 'United States'}
{'streetAddress': '140 Wildwood Pkwy', 'addressLocality': 'Birmingham', 'addressRegion': 'AL', 'postalCode': '35209', 'addressCountry': 'United States'}
{'streetAddress': '168 Inverness Plaza', 'addressLocality': 'Birmingham', 'addressRegion': 'AL', 'postalCode': '35242', 'addressCountry': 'United States'}
{'streetAddress': '9118 Parkway E', 'addressLocality': 'Birmingham', 'addressRegion': 'AL', 'postalCode': '35206', 'addressCountry': 'United States'}
{'streetAddress': '1727 2nd Ave SW', 'addressLocality': 'Cullman', 'addressRegion': 'AL', 'postalCode': '35055', 'addressCountry': 'United States'}
{'streetAddress': '29685 Renaissance Blvd', 'addressLocality': 'Daphne', 'addressRegion': 'AL', 'postalCode': '36526', 'addressCountry': 'United States'}
{'streetAddress': '809 Beltline Road SW Suite B', 'addressLocality': 'Decatur', 'addressRegion': 'AL', 'postalCode': '35601', 'addressCountry': 'United States'}
{'streetAddress': '3121 Ross Clark Circle', 'addressLocality': 'Dothan', 'addressRegion': 'AL', 'postalCode': '36303', 'addressCountry': 'United States'}
{'streetAddress': '913 Rucker Blvd', 'addressLocality': 'Enterprise', 'addressRegion': 'AL', 'postalCode': '36330', 'addressCountry': 'United States'}
...
]
Upvotes: -1
Reputation: 5202
Here is working code for what you are trying to do:
>>>res1 = requests.get(urljoin('https://www.planetfitness.com/', link)).content # ie, the one of the url is 'https://www.planetfitness.com/gyms/albertville-al'
>>>soup1 = BeautifulSoup(res1, 'html.parser')
>>>ps = soup1.find('p',class_ = 'address')
>>>ps
<p class="address" itemprop="address" itemscope="" itemtype="http://schema.org/PostalAddress"><span class="address-line1" itemprop="streetAddress">5850 US Hwy 431</span><br/>
<span class="locality" itemprop="addressLocality">Albertville</span>, <span class="administrative-area" itemprop="addressRegion">AL</span> <span class="postal-code" itemprop="postalCode">35950</span><br/>
<span class="country" itemprop="addressCountry">United States</span></p>
>>>address1 = [p['itemprop'] for p in ps.findAll('span')]
>>>address1
['streetAddress', 'addressLocality', 'addressRegion', 'postalCode', 'addressCountry']
Upvotes: -1