Reputation: 1
I am extracting data from booking.com. My script uses Selenium to gather the data, creates a provisional CSV with the appropriate timestamp, and then appends it to the final database, which is also a CSV. I would like to get new data every hour, even when I'm offline, and append it to the final database, but I don't know how to do it. I am new to web scraping, and my script currently runs in Jupyter. Any help would be greatly appreciated.
I'm using macOS Big Sur.
This is my code:
from datetime import date

from selenium.webdriver import Firefox
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def prepare_driver(url):
    '''Returns a Firefox Webdriver pointed at the given URL.'''
    options = Options()
    # options.add_argument('-headless')
    driver = Firefox(executable_path='/Users/andreazavala/Downloads/geckodriver',
                     options=options)
    driver.get(url)
    # Wait until the destination search box is present
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, 'ss')))
    return driver

def fill_form(driver, search_argument):
    '''Fills in the search form and submits it.'''
    search_field = driver.find_element_by_id('ss')
    search_field.send_keys(search_argument)
    # Look for today's date
    driver.find_element_by_class_name('xp__dates-inner').click()
    slcpath = "td[data-date='" + str(date.today()) + "']"
    driver.find_element_by_css_selector(slcpath).click()
    # We look for the search button and click it
    driver.find_element_by_class_name('sb-searchbox__button').click()
    # Wait for the result titles to load
    WebDriverWait(driver, timeout=10).until(
        EC.presence_of_all_elements_located((By.CLASS_NAME, 'sr-hotel__title')))

# 'domain' is the Booking.com search URL, defined elsewhere in the notebook
driver = prepare_driver(domain)
fill_form(driver, 'City Name')

url_iter = driver.current_url
accommodation_urls = list()
accommodation_urls.append(url_iter)

with open('urls.txt', 'w') as f:
    for item in accommodation_urls:
        f.write("%s\n" % item)
from selectorlib import Extractor
import requests
from time import sleep
import csv

# Create an Extractor by reading from the YAML file
e = Extractor.from_yaml_file('booking.yml')

def scrape(url):
    headers = {
        'Connection': 'keep-alive',
        'Pragma': 'no-cache',
        'Cache-Control': 'no-cache',
        'DNT': '1',
        'Upgrade-Insecure-Requests': '1',
        # You may want to change the user agent if you get blocked
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.113 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Referer': 'https://www.booking.com/index.en-gb.html',
        'Accept-Language': 'en-GB,en-US;q=0.9,en;q=0.8',
    }
    # Download the page using requests
    print("Downloading %s" % url)
    r = requests.get(url, headers=headers)
    # Pass the page HTML to the extractor and return the structured data
    return e.extract(r.text, base_url=url)
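For reference, booking.yml holds the selectorlib selector definitions that e.extract applies to the page. The same extractor can also be built inline with Extractor.from_yaml_string; the sketch below uses placeholder CSS selectors rather than my real file, since Booking.com's markup changes often:

from selectorlib import Extractor

# Placeholder selector definitions -- the CSS classes here are assumptions
# and must be matched against the live page markup.
yaml_string = """
hotels:
    css: 'div.sr_property_block'
    multiple: true
    type: Text
    children:
        name:
            css: 'span.sr-hotel__name'
            type: Text
        price:
            css: 'div.bui-price-display__value'
            type: Text
"""
e = Extractor.from_yaml_string(yaml_string)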
with open("urls.txt",'r') as urllist, open('data.csv','w') as outfile:
fieldnames = [
"name",
"location",
"price",
"price_for",
"room_type",
"beds",
"rating",
"rating_title",
"number_of_ratings",
"url"
]
writer = csv.DictWriter(outfile, fieldnames=fieldnames,quoting=csv.QUOTE_ALL)
writer.writeheader()
for url in urllist.readlines():
data = scrape(url)
if data:
for h in data['hotels']:
writer.writerow(h)
import pandas as pd

data = pd.read_csv("data.csv")
data.insert(0, 'TimeStamp', pd.to_datetime('today').replace(microsecond=0))
data.to_csv('Tarifa.csv', mode='a', header=False)

# Note: reset_index(inplace=True) returns None, so don't assign its result
df_results = pd.read_csv('Tarifa.csv', index_col=0).reset_index(drop=True)
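One thing worth noting on the append step: header=False is passed on every run, so Tarifa.csv never gets a header row. A small sketch of how the header could be written only on the very first run (the os.path.exists check is my untested addition):

import os
import pandas as pd

data = pd.read_csv("data.csv")
data.insert(0, 'TimeStamp', pd.to_datetime('today').replace(microsecond=0))

# Write the header row only if the final database does not exist yet,
# so later reads with pd.read_csv see proper column names.
first_write = not os.path.exists('Tarifa.csv')
data.to_csv('Tarifa.csv', mode='a', header=first_write)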
Upvotes: 0
Views: 3089
Reputation: 754
A system-level approach would be to rely on crontab.
Type crontab -e in the console.
Inside, put 0 0-23 * * * /path/to/script/app.py
That runs the script every hour, every day.
Save it by pressing escape (esc), then type :wq. That saves the new cron job and quits the editor.
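For that line to work, app.py has to be executable and name its interpreter, and because cron starts in $HOME with a minimal environment, relative paths inside the script should be made absolute. A rough sketch of the top of app.py (the paths are placeholders):

#!/usr/bin/env python3
# app.py -- cron runs this file directly, so it needs the shebang above
# plus: chmod +x /path/to/script/app.py
import os

# Anchor data files to the script's own directory, since cron does not
# start in the project folder.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
URLS_FILE = os.path.join(BASE_DIR, 'urls.txt')
DATA_FILE = os.path.join(BASE_DIR, 'data.csv')
FINAL_DB = os.path.join(BASE_DIR, 'Tarifa.csv')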
Upvotes: 2
Reputation: 79
Here is an approach you could use!
Import schedule & time, then wrap your script in a main function to call once per hour.
import time
import schedule
def runs_my_script():
    function1()
    function2()
    and_so_on()
Then at the bottom add this:
if __name__ == "__main__":
schedule.every().hour.do(runs_my_script) # sets the function to run once per hour
while True: # loops and runs the scheduled job indefinitely
schedule.run_pending()
time.sleep(1)
It's not elegant, but it gets the base job done and can be expanded on to fit your needs :)
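One gotcha: schedule waits a full hour before the first run. If you want an immediate first pass as well, call the function once before entering the loop:

if __name__ == "__main__":
    runs_my_script()  # run once right away; the scheduled job fires an hour later
    schedule.every().hour.do(runs_my_script)
    while True:
        schedule.run_pending()
        time.sleep(1)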
Upvotes: 2