How to reduce scraping time when using Requests-HTML?

I currently use Requests-HTML version 0.10.0 and Selenium 3.141.0. My project is to scrape the ratings of all articles on this website: https://openreview.net/group?id=ICLR.cc/2021/Conference. To open each page of the website (it has 53 pages and each page lists 50 articles), I use Selenium. Then, to open the articles on each page, I use Requests-HTML. My question is how to reduce the time it takes to open each article and get its rating. Right now I call await r_inside.html.arender(sleep=5, timeout=100), i.e. a sleep of 5 seconds and a timeout of 100 seconds. When I reduce the sleep to 0.5 seconds, I get an error because the page does not have enough time to render before I try to scrape it. However, if I keep the sleep at 5 seconds, it takes 6 to 13 hours to scrape all 2600 articles. On top of that, after waiting 13 hours the scrape does finish, but the code uses 88 GB of RAM, which is a problem because I need to send this code to other people who will not have enough RAM to run it. My goal is to reduce both the scraping time and the RAM usage. Below is the code I use.

import csv
import time

from requests_html import AsyncHTMLSession
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from tqdm import tqdm

link = 'https://openreview.net/group?id=ICLR.cc/2021/Conference'
id_list = []
keyword_list = []
abstract_list = []
title_list = []
driver = webdriver.Chrome('./requests_html/chromedriver.exe')
driver.get('https://openreview.net/group?id=ICLR.cc/2021/Conference')

cond = EC.presence_of_element_located((By.XPATH, '//*[@id="all-submissions"]/nav/ul/li[13]/a'))
WebDriverWait(driver, 10).until(cond)


for page in tqdm(range(1, 54)):
    text = ''
    elems = driver.find_elements_by_xpath('//*[@id="all-submissions"]/ul/li')
    for i, elem in enumerate(elems):
        try:
            # parse title
            title = elem.find_element_by_xpath('./h4/a[1]')
            link = title.get_attribute('href')
            paper_id = link.split('=')[-1]
            title = title.text.strip().replace('\t', ' ').replace('\n', ' ')
            # show details
            elem.find_element_by_xpath('./a').click()
            time.sleep(0.2)

            # parse keywords & abstract
            items = elem.find_elements_by_xpath('.//li')
            keyword = ''.join([x.text for x in items if 'Keywords' in x.text])
            abstract = ''.join([x.text for x in items if 'Abstract' in x.text])
            keyword = keyword.strip().replace('\t', ' ').replace('\n', ' ').replace('Keywords: ', '')
            abstract = abstract.strip().replace('\t', ' ').replace('\n', ' ').replace('Abstract: ', '')
            text += paper_id+'\t'+title+'\t'+link+'\t'+keyword+'\t'+abstract+'\n'
            title_list.append(title)
            id_list.append(paper_id)
            keyword_list.append(keyword)
            abstract_list.append(abstract)
        except Exception as e:
            print(f'page {page}, # {i}:', e)
            continue



    # next page
    try:
        driver.find_element_by_xpath('//*[@id="all-submissions"]/nav/ul/li[13]/a').click()
        time.sleep(2) # NOTE: increase sleep time if needed
    except:
        print('no next page, exit.')
        break

csv_file = open('./requests_html/bb_website_scrap.csv', 'w', encoding="utf-8", newline='')  # newline='' avoids blank rows on Windows
csv_writer = csv.writer(csv_file)
csv_writer.writerow(['Title','Keyword','Abstract','Link','Total Number of Reviews','Average Rating','Average Confidence'])
n = 0
for item in range(len(id_list)):
     title = title_list[item]
     keyword = keyword_list[item]
     abstract = abstract_list[item]
     id = id_list[item]

     link_pdf = f'https://openreview.net/forum?id={id}'
     print(id)

     asession_inside = AsyncHTMLSession()
     r_inside = await asession_inside.get(link_pdf)
     print(type(r_inside))
     await r_inside.html.arender(sleep = 5, timeout=100)

     test_rating = r_inside.html.find('div.comment-level-odd div.note_contents span.note_content_value')
     print(len(test_rating))
     # keep only the values whose text starts with a number, i.e. the rating
     # and confidence scores; the other note_content_value spans are review text
     check_list = {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10'}
     total_rating_confidence = []
     total_rating = []
     total_confidence = []
     for t in range(len(test_rating)):
          if any(test_rating[t].text.split(':')[0] in s for s in check_list):
               total_rating_confidence.append(test_rating[t].text.split(':')[0])

     # ratings and confidences alternate in the filtered list
     for r in range(len(total_rating_confidence)):
          if (r % 2 == 0):
               total_rating.append(int(total_rating_confidence[r]))
          else:
               total_confidence.append(int(total_rating_confidence[r]))

     average_rating = sum(total_rating) / len(total_rating)
     average_confidence = sum(total_confidence) / len(total_confidence)
     csv_writer.writerow([title, keyword, abstract, link_pdf,len(total_rating),average_rating,average_confidence])
     n = n + 1
     print(f'Order {n}')
csv_file.close()

Upvotes: 0

Views: 461

Answers (1)

Dylan Lacey

Reputation: 91

I'm no Python expert (in fact, I'm a rank beginner), but the simple answer is better parallelism and session management.

The useful answer is a bit more complicated.

You're creating a new AsyncHTMLSession (and its headless Chromium) for every article and leaving it around, which is likely what's hoovering up all your RAM. If you call asession_inside.close() when you're done with a session, you may see an improvement in RAM usage.
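
Something like this is roughly what I mean (untested, reusing the variable names from your code, and relying on top-level await the same way your original snippet does): one AsyncHTMLSession created before the loop and closed after it. If I'm reading the requests_html source right, close() on AsyncHTMLSession is a coroutine, so it has to be awaited.

from requests_html import AsyncHTMLSession

asession_inside = AsyncHTMLSession()   # create one session, before the loop

for paper_id in id_list:
    link_pdf = f'https://openreview.net/forum?id={paper_id}'
    r_inside = await asession_inside.get(link_pdf)
    await r_inside.html.arender(sleep=5, timeout=100)
    # ... parse the ratings exactly as in your original loop ...

await asession_inside.close()          # shuts down the headless Chromium process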

As far as I can tell, you're doing everything serially: you fetch each page and extract the article data one at a time, and then you query each article one at a time as well.

You're using arender to fetch each article asynchronously, but you're awaiting it inside a standard for loop. As far as I understand, that means you're not getting any advantage from async; you're still processing each page one at a time (which explains your long processing time).

I'd suggest using asyncio to turn the for loop into a parallel version of itself, as suggested in this article. Make sure you set a task limit so that you don't try to load all the articles at once; that will also help with your RAM usage.
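
For example (untested; MAX_TASKS, fetch_ratings and scrape_all are just names I've made up for this sketch, not anything from requests_html or your code), a bounded version using asyncio.Semaphore and asyncio.gather might look something like this:

import asyncio
from requests_html import AsyncHTMLSession

MAX_TASKS = 5   # illustrative limit; higher is faster but uses more RAM/CPU

async def fetch_ratings(asession, sem, paper_id):
    # hold the semaphore while rendering, so at most MAX_TASKS pages render at once
    url = f'https://openreview.net/forum?id={paper_id}'
    async with sem:
        r = await asession.get(url)
        await r.html.arender(sleep=5, timeout=100)
    ratings = r.html.find('div.comment-level-odd div.note_contents span.note_content_value')
    return paper_id, [x.text for x in ratings]

async def scrape_all(ids):
    asession = AsyncHTMLSession()      # one shared session (and one Chromium) for all requests
    sem = asyncio.Semaphore(MAX_TASKS)
    try:
        return await asyncio.gather(*(fetch_ratings(asession, sem, i) for i in ids))
    finally:
        await asession.close()         # release the headless Chromium when done

# In a notebook: results = await scrape_all(id_list)
# In a plain script: results = asyncio.run(scrape_all(id_list))

The try/finally makes sure the shared Chromium process gets closed even if one of the fetches raises, and the semaphore is what keeps memory bounded: only MAX_TASKS pages are being rendered at any moment.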

Upvotes: 2
