Reputation:
I wrote a script to scrape the Santander website.
The scraping seems to work, except that I get incorrect results, and when I run the code twice in a row the results change.
How can I make the scraping more robust? The strange thing is that when I run the code and check the results one by one, it seems to work fine.
def hw_santander_scrape(Amount, Duration):
    # Imports the original snippet relied on but never declared
    import pandas as pd
    from datetime import datetime as DT
    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.common.keys import Keys
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    chrome_options.add_argument('--start-maximized')
    chrome_options.add_argument('--window-size=10000,5000')  # Chrome expects "width,height"
    # Use a distinct name: assigning to "webdriver" shadowed the selenium module
    driver = webdriver.Chrome(options=chrome_options)
    maintenant = DT.now()
    period = str(maintenant.day) + '_' + str(maintenant.month) + '_' + str(maintenant.year)
    print('Start Scraping')
    ################################################ Santander ###############################################
    Santander = pd.DataFrame({
        'Project': "reforma vivienda",
        'Period': period,
        'Monthly repayment': [0],
        'TIN': [0],
        'TAE': [0],
        'Total repayment': [0],
        'Initial amount': [0],
        'Duration': [0]
    })
    # Working frame with the same columns, filled row by row below
    project = Santander.copy()
    url = 'https://simuladores.bancosantander.es/SantanderES/loansimulatorweb.aspx?por=webpublica&prv=publico&m=300&cta=1&ls=0#/t0'
    driver.get(url)
    # Amounts are floats with three decimals (13.000 stands for 13000 EUR):
    # "{:.3f}" formatting reproduces the Spanish thousands separator.
    Max_amount = 90.000
    Min_amount = 3.000
    for i in range(len(Amount)):
        Simulated_amount = Amount[i]
        if Simulated_amount > Max_amount or Simulated_amount < Min_amount:
            continue  # outside the simulator's accepted range
        amount = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "#amount")))
        amount.clear()
        amount.send_keys("{:.3f}".format(Simulated_amount))
        # Wait until every jQuery AJAX call triggered by the new amount has finished
        WebDriverWait(driver, 30).until(
            lambda d: d.execute_script('return jQuery.active') == 0)
        for j in range(len(Duration)):
            Simulated_duration = round(int(Duration[j]))
            Max_duration = 96
            Min_duration = 12
            if Simulated_duration > Max_duration or Simulated_duration < Min_duration:
                continue
            term = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "#term")))
            term.clear()
            term.send_keys("{}".format(Simulated_duration))
            term.send_keys(Keys.TAB)  # tab out of the field to trigger the recalculation
            driver.save_screenshot('screenshot_santander.png')
            project.loc[j, 'Project'] = "reforma vivienda"
            project.loc[j, 'Initial amount'] = float("{:.3f}".format(Amount[i]).replace('.', ''))
            project.loc[j, 'Duration'] = Simulated_duration
            project.loc[j, 'Period'] = str(maintenant.day) + '/' + str(maintenant.month) + '/' + str(maintenant.year)
            # find_element_by_css_selector is gone in Selenium 4; use find_element(By...)
            project.loc[j, 'Monthly repayment'] = driver.find_element(By.CSS_SELECTOR, '.r1 span').text.replace(' €', '').replace(',', '.')
            project.loc[j, 'TIN'] = float(driver.find_element(By.CSS_SELECTOR, '.r3 span').text[6:10].replace(',', '.'))
            project.loc[j, 'TAE'] = float(driver.find_element(By.CSS_SELECTOR, '.r3 span').text[13:17].replace(',', '.'))
            project.loc[j, 'Total repayment'] = float(driver.find_element(By.CSS_SELECTOR, '.r7 span').text.replace(' €', '').replace('.', '').replace(',', '.'))
        # DataFrame.append was removed from recent pandas; pd.concat is the replacement
        Santander = pd.concat([Santander, project], ignore_index=True)
    Santander = Santander.loc[Santander.TIN != 0, :]
    Santander.to_csv('Santander_{}.csv'.format(period), index=False)
    print('End Scraping')
To run the code:
Amount = [13.000, 14.000, 15.000, 30.000, 45.000, 60.000]
Duration = [12, 15, 24, 36, 48, 60, 72, 84, 96]
hw_santander_scrape(Amount, Duration)
Upvotes: 1
Views: 685
Reputation: 54984
That data comes from an XHR request, so just use requests to post your values and parse the response with json.loads.
Use your browser's network tab to see what the request looks like.
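A minimal sketch of that approach, assuming a hypothetical endpoint and payload; the real URL, field names and response shape must be copied from the request shown in the network tab:
import requests

# Placeholder endpoint: substitute the XHR URL captured in the network tab.
xhr_url = 'https://simuladores.bancosantander.es/SantanderES/<endpoint-from-network-tab>'

# These field names are assumptions; send exactly what the browser sends.
payload = {'amount': 13000, 'term': 24}

response = requests.post(xhr_url, data=payload, timeout=10)
response.raise_for_status()

result = response.json()  # equivalent to json.loads(response.text)
print(result)
Each (amount, duration) pair then costs one plain HTTP request instead of a browser round trip, which removes the timing issues.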
Upvotes: 2
Reputation: 5011
This is my time to shine!
I'm currently working on a financial data aggregator that was facing this exact same problem.
It collects data from about a dozen websites and organizes it into a JSON object that is then used by a Flask site to display the data.
The data is scraped from websites that have several subdirectories with similar content but different selectors.
As you can imagine, with a framework like selenium this becomes very complex, so the only solution is to dumb it down.
Simplicity is key, so I removed every dependency except for the BeautifulSoup and requests libraries.
Then I created three classes and a function for each filter:
from bs4 import BeautifulSoup


class GET:
    @staticmethod
    def text(soup, selector, index=0):
        # Return the stripped text of the index-th element matching the
        # selector, or None when the page has fewer matches than that.
        selected = soup.select(selector)
        if len(selected) > index:
            return selected[index].text.strip()


class Parse:
    @staticmethod
    def common(soup, selector):
        return GET.text(soup, selector, index=5)


class Routes:
    def main(self):
        data = {}
        if self.is_dir_1:
            data["name"] = GET.text(self.soup, "div")
            data["title-data"] = Parse.common(self.soup, "p > div:nth-child(1)")
        elif self.is_dir_2:
            data["name"] = GET.text(self.soup, "p", index=2)
            data["title-data"] = Parse.common(self.soup, "p > div:nth-child(5)")
        return data


def filter_name(url: str, response: str, filter_type: str):
    if hasattr(Routes, filter_type):
        # Build a lightweight object holding the flags and the soup, then
        # pass it as "self" to the route function selected by filter_type.
        return getattr(Routes, filter_type)(to_object({
            "is_dir_1": bool("/sub_dir_1/" in url),
            "is_dir_2": bool("/sub_dir_2/" in url),  # the original tested sub_dir_1 twice
            "soup": BeautifulSoup(response, "lxml")  # "html" was undefined; use the response argument
        }))
    return {}
Using the requests library I made the request that got the data, then passed the URL, response text and filter_type to the filter_name function.
In filter_name, the filter_type argument selects the target route function and hands it the "soup"; each element is selected and its data extracted there.
In the target route function, an if condition determines the subdirectory and assigns the text to a data object, which is returned once everything is complete.
This method is very simple and has kept my code DRY; it even allows for optional key: value pairs.
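For illustration, a hypothetical call tying the pieces together (the URL and page layout are made up):
import requests

url = 'https://example.com/sub_dir_1/page'  # made-up URL for the sketch
response = requests.get(url, timeout=10)

# "main" is the route method defined on Routes above.
data = filter_name(url, response.text, "main")
print(data)  # e.g. {'name': ..., 'title-data': ...}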
Here is the code for the to_object helper class:
class to_object(object):
    def __init__(self, dictionary):
        self.__dict__ = dictionary
This converts dictionaries to objects, so instead of always having to write self["soup"] you can write self.soup.
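A quick illustration (the record variable is just for the example):
record = to_object({"soup": "parsed html", "is_dir_1": True})
print(record.soup)      # attribute access instead of record["soup"]
print(record.is_dir_1)  # True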
You really need to standardize the type of indentation you use because your script raises the following error:
Traceback (most recent call last):
File "", line 84
Amount = [13.000, 14.000, 15.000, 30.000, 45.000, 60.000]
^
IndentationError: unindent does not match any outer indentation level
I hope this helps, good luck.
Upvotes: 1