Reputation: 29064
I am trying to scrape an HTML table using Python. There are many tables in the HTML page, but I want to scrape only one particular table. I am using Beautiful Soup to do this web scraping.
My code looks like this:
# Download the history page and parse it with the 'html.parser' backend.
response = get("http://uobgoldprice.com/history/2018/September/10/")
soup = BeautifulSoup(response.content, 'html.parser')

# Walk every <tr> in the document (rows of all tables) and print a row only
# when its complete text is exactly the product name.
for row in soup.select('tr'):
    row_text = row.text
    if row_text == "ARGOR CAST BAR":
        print(row_text)
I would like only the table that reads "Rate as at Monday, 10 September 2018".
How do I go about doing that?
Upvotes: 3
Views: 1925
Reputation: 19
I believe this code will help you. If you want the complete running project, visit "html to pdf Web scraping".
import logging
import math
import json
from flask import jsonify, abort, make_response
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
import pydf
from constants import Constants
from response import Response
class SeleniumCrawler(object):
    """Scrape a paginated ``kbn-table`` with headless Chrome and save it as a PDF."""

    def get_page(self, url):
        """Load *url*, gather the table HTML from each page, and write ``out.pdf``.

        The method opens *url* in headless Chrome, waits for an element with
        class ``kbn-table``, reads its inner HTML, clicks through the
        remaining pages, and renders the concatenated markup to ``out.pdf``
        in the current working directory via ``pydf``.

        Args:
            url: Address of the page to open in the browser.

        Returns:
            The success payload produced by ``Response.get_response`` after a
            JSON encode/decode round trip.

        On any exception the error is logged and ``flask.abort`` is called
        with a JSON error response. The browser is always shut down,
        whether the scrape succeeds or fails.
        """
        response = Response()
        browser = None
        try:
            # Initialize the Chrome driver
            print("Initilized the chrome driver")
            chrome_options = webdriver.ChromeOptions()
            chrome_options.add_argument('--no-sandbox')
            chrome_options.add_argument('--window-size=1420,1080')
            chrome_options.add_argument('--headless')
            chrome_options.add_argument('--disable-gpu')
            browser = webdriver.Chrome(options=chrome_options)

            # Open the requested URL
            browser.get(url)
            # NOTE(review): WebDriverWait takes its timeout in seconds, so this
            # waits up to 10000 s -- confirm milliseconds were not intended.
            delay = 10000

            # Wait until the table element is present in the DOM
            print("wait till specific classes appears")
            WebDriverWait(browser, delay).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'kbn-table'))
            )
            parts = [
                browser.find_element(By.CLASS_NAME, "kbn-table").get_attribute('innerHTML')
            ]

            # Work out how many pages exist and loop over them
            print("calculate number of pages exists and loop them")
            # The third whitespace-separated token of the toolbar text is read
            # as the total count (thousands separators removed).
            toolbar_text = browser.find_element(By.CLASS_NAME, "kuiToolBarText").text
            total = toolbar_text.split(" ")[2].replace(",", "")
            # Presumably 50 rows are shown per page -- TODO confirm.
            # NOTE(review): the "- 1" combined with range(1, pages) may skip
            # the final page; verify against the real pager.
            pages = math.ceil(int(total) / 50) - 1
            print("pages found {}".format(pages))
            for _ in range(1, pages):
                browser.execute_script("document.getElementsByClassName('kuiButton')[1].click()")
                chunk = (
                    browser.find_element(By.CLASS_NAME, "kbn-table")
                    .get_attribute('innerHTML')
                    .replace("<tbody>", "")
                )
                parts.append(chunk)
            body = "".join(parts)

            # Wrap the rows in table tags and generate the PDF
            print("apply table tags and generate pdf")
            pdf = pydf.generate_pdf("<table>" + body + "</table>")
            with open('out.pdf', 'wb') as f:
                f.write(pdf)

            return json.loads(json.dumps((response.get_response(Constants.SUCCESS, Constants.SUCCESS))))
        except Exception as e:
            logging.exception(e)
            return abort(make_response(jsonify(response.get_response(Constants.SERVER_ERROR, Constants.SERVER_ERROR)), response.get_code(Constants.SERVER_ERROR)))
        finally:
            # Always release the browser process, even when the scrape fails.
            if browser is not None:
                browser.quit()
Upvotes: 0
Reputation: 3372
from collections import defaultdict
import requests
from bs4 import BeautifulSoup
def get_page_html(url, timeout=30):
    """Fetch *url* and return the response body as text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on an unresponsive
            host; pass ``None`` to restore that behaviour.

    Returns:
        The decoded response body.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within *timeout*.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.text
def parse_last_table(html):
    """Group the rows of the last ``<table>`` in *html* by description.

    The first two ``<tr>`` elements of the table are skipped. Every
    remaining row that has exactly five ``<td>`` cells is read as
    (description, currency, unit, bank_sells, bank_buys); rows with any
    other cell count are ignored. A row whose description cell is empty is
    filed under the description of the most recently accepted row.

    Args:
        html: Markup of the page, parsed with the ``lxml`` backend.

    Returns:
        A ``defaultdict(list)`` mapping each description to a list of dicts
        with the keys ``currency``, ``unit``, ``bank_sells`` and
        ``bank_buys``.
    """
    fields = ('currency', 'unit', 'bank_sells', 'bank_buys')
    grouped = defaultdict(list)
    last_description = None

    tables = BeautifulSoup(html, 'lxml').find_all('table')
    data_rows = tables[-1].find_all('tr')[2:]
    for row in data_rows:
        cells = [cell.text.strip() for cell in row.find_all('td')]
        if len(cells) != 5:
            # Blank/empty row (or any row that is not a 5-column data row).
            continue
        # An empty first cell means the row continues the previous group.
        description = cells[0] or last_description
        grouped[description].append(dict(zip(fields, cells[1:])))
        last_description = description
    return grouped
Output:
>>> url = 'http://uobgoldprice.com/history/2018/September/10/'
>>> page_html = get_page_html(url)
>>> result = parse_last_table(page_html)
>>> import json; print(json.dumps(result, indent=2))
{
"ARGOR CAST BAR": [
{
"currency": "SGD",
"unit": "100 GM",
"bank_sells": "5,369.00 (+4.00)",
"bank_buys": "5,291.00 (+3.00)"
}
],
"CAST BARS": [
{
"currency": "SGD",
"unit": "1 KILOBAR",
"bank_sells": "53,201.00 (+36.00)",
"bank_buys": "52,933.00 (+36.00)"
}
],
"GOLD CERTIFICATE": [
{
"currency": "SGD",
"unit": "1 KILOCERT",
"bank_sells": "53,201.00 (+36.00)",
"bank_buys": "52,933.00 (+36.00)"
}
],
"GOLD SAVINGS A/C": [
{
"currency": "SGD",
"unit": "1 GM",
"bank_sells": "53.20 (+0.04)",
"bank_buys": "52.94 (+0.04)"
}
],
"GOLD BULLION COINS": [
{
"currency": "SGD",
"unit": "1/20 OZ(GNC,SLC &GML)",
"bank_sells": "131.00",
"bank_buys": "81.00"
},
{
"currency": "SGD",
"unit": "1/10 OZ",
"bank_sells": "211.00 (+1.00)",
"bank_buys": "163.00"
},
{
"currency": "SGD",
"unit": "1/4 OZ",
"bank_sells": "465.00",
"bank_buys": "410.00"
},
{
"currency": "SGD",
"unit": "1/2 OZ",
"bank_sells": "904.00 (+1.00)",
"bank_buys": "822.00 (+1.00)"
},
{
"currency": "SGD",
"unit": "1 OZ",
"bank_sells": "1,726.00 (+1.00)",
"bank_buys": "1,645.00 (+1.00)"
}
],
"PAMP GOLD BARS": [
{
"currency": "SGD",
"unit": "1/2 OZ",
"bank_sells": "876.00",
"bank_buys": "821.00 (+1.00)"
},
{
"currency": "SGD",
"unit": "1 GM",
"bank_sells": "82.00",
"bank_buys": "50.00"
},
{
"currency": "SGD",
"unit": "1 OZ",
"bank_sells": "1,711.00 (+1.00)",
"bank_buys": "1,644.00 (+1.00)"
},
{
"currency": "SGD",
"unit": "2.5 GM",
"bank_sells": "182.00",
"bank_buys": "130.00"
},
{
"currency": "SGD",
"unit": "5 GM",
"bank_sells": "322.00",
"bank_buys": "262.00"
},
{
"currency": "SGD",
"unit": "10 GM",
"bank_sells": "597.00 (+1.00)",
"bank_buys": "527.00 (+1.00)"
},
{
"currency": "SGD",
"unit": "20 GM",
"bank_sells": "1,132.00 (+1.00)",
"bank_buys": "1,056.00 (+1.00)"
},
{
"currency": "SGD",
"unit": "50 GM",
"bank_sells": "2,746.00 (+2.00)",
"bank_buys": "2,644.00 (+2.00)"
},
{
"currency": "SGD",
"unit": "100 GM",
"bank_sells": "5,414.00 (+3.00)",
"bank_buys": "5,291.00 (+3.00)"
}
],
"SILVER PASSBOOK ACCOUNT": [
{
"currency": "SGD",
"unit": "1 OZ",
"bank_sells": "19.86 (+0.09)",
"bank_buys": "19.30 (+0.09)"
}
]
}
Upvotes: 1
Reputation: 61910
You need to find the element that contains the text, and then its parent that is a table:
import re
import requests
from bs4 import BeautifulSoup
# Fetch the historical rates page and fail fast on an HTTP error status
# instead of parsing an error page.
page = requests.get("http://uobgoldprice.com/history/2018/September/10/")
page.raise_for_status()
html = BeautifulSoup(page.content, 'html.parser')

# Locate the text node holding the heading, then climb to its enclosing table.
element = html.find(string=re.compile('Rate as at Monday, 10 September 2018'))
if element is None:
    # find() returns None when nothing matches; stop with a clear message
    # rather than an AttributeError on the next line.
    raise SystemExit('Heading "Rate as at Monday, 10 September 2018" not found on the page')
print(element.find_parent('table'))
Upvotes: 2