Mampenda
Mampenda

Reputation: 671

How to collect specific data from HTML using Selenium Python

I am trying to create a weather forecast by scraping web pages. (My previous question)

My code:

import time
import requests
from selenium import webdriver
from bs4 import BeautifulSoup
from keyboard import press_and_release



def weather_forecast2():
    """Prompt for place names and download the yr.no page found for each.

    Loops until interrupted. For every place entered, the name is typed into
    the search box on https://www.yr.no/nb, Enter is pressed, and the URL the
    browser ends up on is fetched with ``requests`` and parsed with
    BeautifulSoup. The browser is closed when the loop is left for any
    reason (e.g. Ctrl+C at the prompt or an error during a search).
    """
    print('Hello, I can search up the weather for you.')

    URL = 'https://www.yr.no/nb'

    # Use a single browser for the whole session. Opening a new one for every
    # search left each earlier window and its driver process running.
    driver = webdriver.Edge()  # Open Microsoft Edge
    try:
        while True:
            inp = input('Where shall I search? Enter a place :').capitalize()
            print('Alright, checking the weather in ' + inp + '...')

            # Search for a place
            driver.get(URL)  # Load the start page
            element = driver.find_element_by_id("søk")  # Search input box
            element.send_keys(inp)  # Type the place name
            press_and_release('enter')  # Press Enter to submit the search

            # NOTE(review): the URL is read right after the key press;
            # confirm the browser has finished navigating by this point.
            cURL = driver.current_url  # URL of the page the search led to

            # Find data
            driver.get(cURL)  # Reload the result page in the browser
            r = requests.get(cURL)  # Fetch the same page over plain HTTP
            print(r.content)  # Output the raw HTML of the page
            # Parsed document; the temperature extraction would start here.
            soup = BeautifulSoup(r.content, 'html5lib')
    finally:
        driver.quit()

I want to collect the temperatures from the page. I know that the XPaths to the elements I'm looking for are

//*[@id="dailyWeatherListItem0"]/div[2]/div[1]/span[2]/span[1]/text()
//*[@id="dailyWeatherListItem0"]/div[2]/div[1]/span[2]/span[3]/text()
//*[@id="dailyWeatherListItem1"]/div[2]/div[1]/span[2]/span[1]/text()
//*[@id="dailyWeatherListItem1"]/div[2]/div[1]/span[2]/span[3]/text()
//*[@id="dailyWeatherListItem2"]/div[2]/div[1]/span[2]/span[1]/text()
//*[@id="dailyWeatherListItem2"]/div[2]/div[1]/span[2]/span[3]/text()
//*[@id="dailyWeatherListItem3"]/div[2]/div[1]/span[2]/span[1]/text()
//*[@id="dailyWeatherListItem3"]/div[2]/div[1]/span[2]/span[3]/text()

//etc...

Basically I want to collect the following two elements nine times:

//*[@id="dailyWeatherListItem{NUMBER0-8}"]/div[2]/div[1]/span[2]/span[1]/text()
//*[@id="dailyWeatherListItem{NUMBER0-8}"]/div[2]/div[1]/span[2]/span[3]/text()

How can I use driver.find_element_by_xpath to do this? Or is there a more efficient function?

Upvotes: 0

Views: 187

Answers (1)

QHarr
QHarr

Reputation: 84455

Assuming you can correctly retrieve the URL, you can use it as the referer header, and use the location id contained in it, to call the API that actually returns the forecasts. I don't have your definition for press_and_release, so the code was tested without it.

import requests, re
from selenium import webdriver

# url = 'https://www.yr.no/nb/v%C3%A6rvarsel/daglig-tabell/2-6058560/Canada/Ontario/London'

def get_forecast(url: str) -> object:
    """Return the forecast JSON for the location embedded in a yr.no URL.

    Args:
        url: A forecast page URL that contains a
            ``daglig-tabell/<location id>/`` segment, for example
            ``https://www.yr.no/nb/v%C3%A6rvarsel/daglig-tabell/2-6058560/Canada/Ontario/London``.
            It is also sent as the ``referer`` header of the API request.

    Returns:
        The decoded JSON body returned by the forecast API.

    Raises:
        ValueError: If ``url`` has no ``daglig-tabell/<location id>/`` segment.
    """
    match = re.search(r'daglig-tabell/(.*?)/', url)
    if match is None:
        # Fail with a clear message instead of AttributeError on None.group.
        raise ValueError(
            f"no 'daglig-tabell/<location id>/' segment found in URL: {url!r}"
        )
    location_id = match.group(1)

    headers = {'user-agent': 'Mozilla/5.0', 'referer': url}
    response = requests.get(
        f'https://www.yr.no/api/v0/locations/{location_id}/forecast',
        headers=headers,
    )
    return response.json()


def get_forecast_url():
    """Prompt for place names and print the forecast found for each one.

    Loops until interrupted. For every place entered, the search field on
    https://www.yr.no/nb is opened, the name is typed in, Enter is pressed,
    and the URL the browser ends up on is passed to ``get_forecast``, whose
    result is printed. The browser is closed when the loop is left for any
    reason (e.g. Ctrl+C at the prompt or an error during a search).
    """
    print('Hello, I can search up the weather for you.')

    URL = 'https://www.yr.no/nb'

    driver = webdriver.Chrome()  # Chrome here; the question used Edge

    # try/finally guarantees the browser is closed. A quit() placed after
    # the endless loop could never be reached.
    try:
        while True:
            inp = input('Where shall I search? Enter a place :').capitalize()
            print('Alright, checking the weather in ' + inp + '...')

            # Search for a place
            driver.get(URL)  # Load the start page
            # Open the search field
            driver.find_element_by_id("page-header__search-button").click()
            # Find the search input box
            element = driver.find_element_by_id("page-header__search-input")
            element.send_keys(inp)  # Type the place name
            press_and_release('enter')  # Press Enter to submit the search

            # NOTE(review): the URL is read right after the key press;
            # confirm the browser has finished navigating by this point.
            cURL = driver.current_url  # URL of the page the search led to
            print(get_forecast(cURL))
    finally:
        driver.quit()

Upvotes: 2

Related Questions