How to remove specific class in a row from table data using BeautifulSoup

I am trying to scrape data row by row from a table.
However, in some cells the value appears twice, in two spans with different classes ("show-for-medium-up" and "hide-for-medium-up"), and the two texts are concatenated, resulting in a duplicated number. For example, if the first number is 10.837 and the second number is 10.84, the resulting text for the cell is 10.83710.84. I would like to remove the second number.
How can I remove only the text belonging to the last class, "hide-for-medium-up"?
See my code:

from bs4 import BeautifulSoup
import requests
import pandas as pd
import re

# Fetch the track-log page and parse it.
url = "https://uk.flightaware.com/live/flight/AZA202/history/20210224/0856Z/LIRF/EGLL/tracklog"
soup = BeautifulSoup(requests.get(url).text, "lxml")

# How many matching tables does the page contain?
flt_tables = soup.find_all("table", attrs={"class": "prettyTable fullWidth"})
print("N. tables: ", len(flt_tables))

# Work on the first table only; split it into its 'tr' rows.
table1 = flt_tables[0]
table_rows = table1.find_all("tr")

# First row is the header, everything after it is data.
header, table_data = table_rows[0], table_rows[1:]

# Header labels: text of each 'th' with trailing newlines stripped.
headers = [th.text.rstrip("\n") for th in header.find_all("th")]
print(headers)

# Cell texts for every data row, one inner list per 'tr'.
# The pattern drops \xa0, newlines and commas (thousands separators)
# from the text of each 'td'.
cleanup = re.compile("(\xa0)|(\n)|,")
all_rows = [
    [cleanup.sub("", cell.text) for cell in table_row.find_all("td")]
    for table_row in table_data
]

print(all_rows[5])

The outputs of print(headers) and print(all_rows[5]) are, respectively:

['Time (JST)JST', 'LatitudeLat', 'LongitudeLon', 'CourseDir', 'kts', 'mph', 'meters', 'Rate', 'Reporting Facility']

['Wed 17:50:3405:50PM', '42.247342.25', '10.891910.89', '← 297°', '449', '517', '7,9717971', ' ', ' FlightAware ADS-B (LIRG) ']

The part to remove is the duplicated trailing text in each cell (shown in bold in the original post), e.g. the '42.25' in '42.247342.25'.

The 'tr' html code for the first data row:

<tr class="smallrow1 flight_event">
<td align="left" colspan="6"><div style="display: inline; "><span class="show-for-medium-up">Wed 17:50:34</span><span class="hide-for-medium-up">05:50PM</span> <strong>   Departure (FCO)<span class="show-for-medium-up"> @ Wednesday 09:50:34 <span class="tz">CET</span> </span></strong></div></td>
<td align="left" class="show-for-medium-up-table" colspan="1"><span class="show-for-medium-up"></span></td>
<td class="show-for-medium-up"></td>
<td align="left" class="show-for-large-up-table"><span class="show-for-large-up"><img height="12" src="https://e0.flightcdn.com/images/live/fasource.gif" width="12"/> <a href="/adsb/">FlightAware ADS-B</a>  (LIRG) </span></td>
</tr>

Upvotes: 1

Views: 1025

Answers (1)

braulio
braulio

Reputation: 571

Using the class attributes gets you a step further:

# Show the class list of the direct children of every cell in data row 5.
for cell in table_data[5].find_all("td"):
    try:
        # .children is the current name for the legacy childGenerator().
        for child in cell.children:
            print(child.attrs["class"])
    except (AttributeError, KeyError):
        # AttributeError: the child has no .attrs (e.g. a plain text node).
        # KeyError: the child is a tag without a "class" attribute.
        # Either way the loop over this cell's children stops here and the
        # whole cell is reported instead.
        print("no attribute \"class\" {:}".format(cell))

outputs

['show-for-medium-up']
['hide-for-medium-up']
['show-for-medium-up']
['hide-for-medium-up']
['show-for-medium-up']
['hide-for-medium-up']
no attribute "class" <td align="right"><span>← 297°</span></td>
no attribute "class" <td align="right">449</td>
no attribute "class" <td align="right" class="show-for-medium-up-table">517</td>
['show-for-medium-up']
['hide-for-medium-up']
['show-for-medium-up']
['show-for-large-up']

This way you can filter out the 'hide-for-medium-up' class. Depending on what you want to do with the data, you will still need to do a bit of clean-up because, as shown above, not every tag has a class attribute.

Here is the full code:

from bs4 import BeautifulSoup
import requests
import pandas as pd
import re

# Fetch the track-log page and parse it.
url = "https://uk.flightaware.com/live/flight/AZA202/history/20210224/0856Z/LIRF/EGLL/tracklog"
html_content = requests.get(url).text
soup = BeautifulSoup(html_content, "lxml")

# Number of tables:
flt_tables = soup.find_all("table", attrs={"class": "prettyTable fullWidth"})
print("N. tables: ", len(flt_tables))

# Scraping first table - headers only
table1 = flt_tables[0]
# Data row by row ('tr' -> row)
table_rows = table1.find_all("tr")

header = table_rows[0]  # header
table_data = table_rows[1:]  # table data (excluding header)

# Text of each 'th', with trailing newlines removed.
headers = [item.text.rstrip("\n") for item in header.find_all("th")]
print(headers)

# Scraping table data ('td'): for every cell, collect the text of its direct
# children, skipping those whose first class is "hide-for-medium-up".
all_rows = []
for table_row in table_data:  # a row at a time
    row = []
    for row_item in table_row.find_all("td"):  # loop in 'td' elements
        _row_text = []
        try:
            # .children is the current name for the legacy childGenerator().
            for tag in row_item.children:
                if tag.attrs["class"][0] != "hide-for-medium-up":
                    _row_text.append(tag.text)
        except (AttributeError, KeyError, IndexError):
            # AttributeError: child without .attrs (e.g. a plain text node).
            # KeyError / IndexError: tag with no (or an empty) class list.
            # NOTE: the first such child ends the loop for this cell and is
            # stored as-is, so a class-less tag keeps its markup once it is
            # converted with str() below, and any later siblings are ignored.
            _row_text.append(tag)
        row.append(_row_text)
    all_rows.append(row)


# Example clean-up of '\xa0', '\n' and ','; other clean-ups can be done in
# similar loops. Every collected piece becomes its own entry in the flat row.
rows = []
for row in all_rows:
    _row = []
    for i in row:
        if isinstance(i, list):
            for e in i:
                aa = re.sub("(\xa0)|(\n)|,", "", str(e))
                _row.append(aa)
    rows.append(_row)

print(rows[5])

Output

N. tables:  1
['Time (CET)CET', 'LatitudeLat', 'LongitudeLon', 'CourseDir', 'kts', 'mph', 'meters', 'Rate', 'Reporting Facility']
['Wed 09:50:34', '42.2473', '10.8919', '<span>← 297°</span>', '449', '517', '7971', '', ' FlightAware ADS-B  (LIRG) ']

Upvotes: 1

Related Questions