Reputation: 97
I am trying to get the content of a table on a website using selenium. It seems the website is set up in a rather complex manner. I can't find any element, class or content to use in the find_element_by_...
functions.
If anyone has idea how to get the content of the second table starting with header Staffel
, Nr.
, Datum
, ...
, Ergebnis
, Bem.
it would be a big help for me. I tried a lot (starting with urllib2, ...). Principally the following scripts works - loading the site and looping through high level containers. But I am not sure how to get the mentioned table content.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# League schedule page; the table is rendered client-side by JavaScript,
# so its rows only exist after the page's scripts have run.
the_url = 'https://www.hvw-online.org/spielbetrieb/ergebnissetabellen/#/league?ogId=3&lId=37133&allGames=1'

driver = webdriver.Chrome()
driver.get(the_url)

# Explicitly wait for the dynamically rendered content instead of querying
# immediately after get() returns.
wait = WebDriverWait(driver, 10)
elem_high = wait.until(
    EC.presence_of_all_elements_located((By.CLASS_NAME, 'container'))
)
for e in elem_high:
    print(e)

# 'row.game' is a compound class name; By.CLASS_NAME accepts only a single
# class, so use a CSS selector that requires both classes on one element.
elem_deep = driver.find_elements(By.CSS_SELECTOR, '.row.game')

driver.close()
Any ideas or comments are welcome. Thanks.
Upvotes: 0
Views: 433
Reputation: 12255
To get the rows you have to wait for the page to load using WebDriverWait
; you can find the details here:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# League page; the schedule table is filled in by JavaScript, so an explicit
# wait is required before the rows can be located.
the_url = 'https://www.hvw-online.org/spielbetrieb/ergebnissetabellen/#/league?ogId=3&lId=37133&allGames=1'

driver = webdriver.Chrome()
wait = WebDriverWait(driver, 10)
driver.get(the_url)

# Block until every row of the schedule table is visible.
elem_deep = wait.until(
    EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "table.schedule tbody > tr"))
)
for e in elem_deep:
    print(e.text)
    # Link in the last column.  find_element_by_css_selector() was removed in
    # Selenium 4; use find_element(By.CSS_SELECTOR, ...) to stay consistent
    # with the locator style already used above.
    href = e.find_element(By.CSS_SELECTOR, "a[ng-if='row.game.sGID']").get_attribute("href")
    print(href)
But a better solution is to use the requests
package to get all the information from the website. The code below is an example of how you can scrape much faster and more easily:
import requests

# The page's front end fetches the schedule as JSON from this backend
# endpoint, so we can skip the browser entirely and query it directly.
url = 'https://spo.handball4all.de/service/if_g_json.php?ca=1&cl=37133&cmd=ps&og=3'

# Use a timeout so a stalled connection cannot hang the script forever, and
# fail fast on an HTTP error status instead of crashing later on bad JSON.
response = requests.get(url, timeout=10)
response.raise_for_status()
payload = response.json()

futureGames = payload[0]["content"]["futureGames"]["games"]
for game in futureGames:
    print(game["gHomeTeam"])
    print(game["gGuestTeam"])
    # Link in the last column (the public match report for the game).
    print(f"http://spo.handball4all.de/misc/sboPublicReports.php?sGID={game['sGID']}")

# You can use the example of one game's data below to get everything you need:
# {
#     'gID': '2799428',
#     'sGID': '671616',
#     'gNo': '61330',
#     'live': False,
#     'gToken': '',
#     'gAppid': '',
#     'gDate': '30.09.18',
#     'gWDay': 'So',
#     'gTime': '14:00',
#     'gGymnasiumID': '303',
#     'gGymnasiumNo': '6037',
#     'gGymnasiumName': 'Sporthalle beim Sportzentrum',
#     'gGymnasiumPostal': '71229',
#     'gGymnasiumTown': 'Leonberg',
#     'gGymnasiumStreet': 'Steinstraße 18',
#     'gHomeTeam': 'SV Leonb/Elt',
#     'gGuestTeam': 'JSG Echaz-Erms 2',
#     'gHomeGoals': '33',
#     'gGuestGoals': '20',
#     'gHomeGoals_1': '19',
#     'gGuestGoals_1': '7',
#     'gHomePoints': '2',
#     'gGuestPoints': '0',
#     'gComment': ' ',
#     'gGroupsortTxt': ' ',
#     'gReferee': ' '
# }
Upvotes: 2
Reputation: 84475
You can use css class selector of
.schedule
That is:
table = driver.find_element_by_css_selector(".schedule")
You may need a wait beforehand.
Then loop over the content:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd

driver = webdriver.Chrome()
url = 'https://www.hvw-online.org/spielbetrieb/ergebnissetabellen/#/league?ogId=3&lId=37133&allGames=1'
driver.get(url)

# Wait for the JavaScript-rendered schedule table to appear in the DOM.
table = WebDriverWait(driver, 5).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, '.schedule'))
)

# find_elements_by_css_selector() was removed in Selenium 4; use the
# find_elements(By.CSS_SELECTOR, ...) form that matches the imports above.
headers = [elem.text for elem in driver.find_elements(By.CSS_SELECTOR, '.schedule th')]

# Skip the header row (index 0) by slicing instead of tracking a manual
# counter; each remaining <tr> becomes one list of its cell texts.
results = [
    [td.text for td in row.find_elements(By.CSS_SELECTOR, 'td')]
    for row in table.find_elements(By.CSS_SELECTOR, 'tr')[1:]
]

df = pd.DataFrame(results, columns=headers)
print(df)
driver.quit()
Upvotes: 0