using beautiful soup 4 to scrape weather data (site is coded in javascript)

Question

I am trying to scrape some weather data from wunderground.com using BeautifulSoup 4. I found a tutorial on how to do this, but it assumes the data is present in the page's static HTML source. That was true of Wunderground.com when the tutorial was made; however, the site now renders its content with JavaScript.

I was able to obtain the code and adapt it to my specific data retrieval needs, but I am stuck on how to get it to read the JavaScript-rendered content instead of static HTML. Can anyone help with this?

The code is below and I sourced it from kiengiv from SAS Business Analytics on youtube.

from bs4 import BeautifulSoup
import urllib3, csv, os, datetime, urllib3.request, re, sys

# Scrape daily weather summaries from wunderground.com into a CSV file.
#
# NOTE(review): according to the question text, the site now builds its tables
# with JavaScript. urllib3 only downloads the static HTML, so
# soup.find_all("tr") may find no rows at all, in which case every column is
# written as "N/A". Solving that needs a JavaScript-capable fetch (or a data
# API), which is outside what this script does.

BASE_URL = "https://www.wunderground.com/history/daily/us/ma/cambridge/KBOS/"

# The original script wrote to a `file` object that was never opened; this
# path is a placeholder -- adjust as needed.
OUTPUT_PATH = "weather_data.csv"

# Written for every value that is absent from the page or shown as "-".
MISSING = "N/A"

# Row labels read from the page, in the order they are written to the CSV.
# "Growing Degree Days" was parsed by the original script but never written,
# so it is not collected here.
ROW_LABELS = (
    "Day Average Temp",
    "High Temp",
    "Low Temp",
    "Heating Degree Days",
    "Dew Point",
    "Precipitation",
    "Sea Level Pressure",
    "Max Wind Speed",
    "Visibility",
)


def daily_dates(first_year, stop_year):
    """Yield every date from Jan 1 of first_year up to Dec 31 of stop_year - 1.

    Uses datetime arithmetic, so month lengths and leap years (including
    century years such as 1900) are handled without hand-written rules.
    """
    day = datetime.date(first_year, 1, 1)
    stop = datetime.date(stop_year, 1, 1)
    one_day = datetime.timedelta(days=1)
    while day < stop:
        yield day
        day += one_day


def format_date(day):
    """Return the date as "YYYY/M/D" with no zero padding, e.g. "2016/1/5"."""
    return "{}/{}/{}".format(day.year, day.month, day.day)


def history_url(the_date):
    """Return the page URL for a date string produced by format_date.

    NOTE(review): the URL is assembled exactly as in the original script,
    which glues "date.html" straight onto the day number
    (".../KBOS/2016/1/5date.html"). That looks malformed -- confirm the
    site's current URL scheme before relying on it.
    """
    return BASE_URL + the_date + "date.html"


def clean_value(text):
    """Return the stripped cell text, or MISSING when it is empty or "-"."""
    value = text.strip()
    if value in ("", "-"):
        return MISSING
    return value


def parse_daily_summary(soup):
    """Collect the ROW_LABELS values from the table rows of a parsed page.

    Returns a dict mapping each label in ROW_LABELS to its cell text, or to
    MISSING when the row is absent, empty, or shown as "-".
    """
    summary = dict.fromkeys(ROW_LABELS, MISSING)
    for row in soup.find_all("tr"):
        row_text = row.text.strip().replace("\n", "")
        # Column-heading rows carry no data.
        if row_text.startswith("Actual") or row_text.endswith("Record"):
            continue
        # Nothing of interest follows the sunrise/sunset heading.
        if row_text.endswith("RiseSet"):
            break
        cells = row.find_all("td")
        if len(cells) < 2:
            # Rows without a label cell and a value cell cannot be parsed.
            continue
        label = cells[0].text.strip()
        if label not in summary:
            continue
        # The original looked up an inner element with find(attrs={""}),
        # which is not a valid attribute filter. Which element held the bare
        # number is not known here, so the whole cell text is used; it may
        # include the unit.
        value = clean_value(cells[1].text)
        if value == MISSING:
            # Keep any earlier value. The original skipped "Precipitation" /
            # "Sea Level Pressure" rows with an empty value (presumably
            # section headings that reuse the label).
            continue
        summary[label] = value
        if label == "Visibility":
            # Last field of interest, as in the original script.
            break
    return summary


def main():
    """Fetch each day from 2016 through 2018 and append one CSV row per day."""
    # One pool for the whole run so connections to the host are reused.
    http = urllib3.PoolManager()
    # ASCII with errors="ignore" mirrors the original, which dropped any
    # non-ASCII characters when writing.
    with open(OUTPUT_PATH, "w", encoding="ascii", errors="ignore",
              newline="") as out_file:
        writer = csv.writer(out_file, lineterminator="\n")
        for day in daily_dates(2016, 2019):
            the_date = format_date(day)
            response = http.request("GET", history_url(the_date))
            # BeautifulSoup needs the response body, not the response object.
            soup = BeautifulSoup(response.data, "html.parser")
            summary = parse_daily_summary(soup)

            # The last column is the "Events" slot from the original layout;
            # nothing on the page is parsed for it, so it is always MISSING.
            # The stray empty column after Dew Point has been dropped.
            row = [the_date] + [summary[label] for label in ROW_LABELS] + [MISSING]
            writer.writerow(row)

            # Printed to help with debugging and tracking progress.
            print(",".join(row))


if __name__ == "__main__":
    main()

using beautiful soup 4 to scrape weather data (site is coded in javascript)

Answers (1)

Related Questions