Chris Moore

Reputation: 49

How do you loop through a web page for content using Python Selenium

I am quite new to Selenium (Python). I have scraped some data off a website exactly the way I wanted, but the code only pulls the first 10 records: it never moves on to collect the rest of the content by looping through the other pages. Would you happen to know why the script fails to open the subsequent pages? Any help would be greatly appreciated. I suspect the block marked """This is for navigation to next page""" is the incorrect area.

Code:

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import NoSuchWindowException

import time
from openpyxl import Workbook


"""This is to collect links that associate with all the profiles present in Allen Overy website"""
def get_links(driver, target, upper_datetime_str, lower_datetime_str):

    """This part allows me to filter on date so I'm not having to pull back all the data each time it's run"""
    if upper_datetime_str == '' and lower_datetime_str == '':
        time_constrain = 0
    else:
        time_constrain = 1
        upper_datetime = time.strptime(upper_datetime_str, '%d/%m/%Y')
        lower_datetime = time.strptime(lower_datetime_str, '%d/%m/%Y')

    """This is to search for news that present in Freshfields website"""
    """Go to page that contains news list"""
    driver.get(target)

    isbreak = 0
    list_links = []
    while True:
        try:
            """Get links that associate to news in each page"""
            list_ppl_link = driver.find_elements_by_xpath('//div[@class = "srch-Title3"]')
            for item in list_ppl_link:
                rel_date = item.find_elements_by_xpath('//div[@class = "srch-Metadata2"]')
                if time_constrain == 0:
                    rel_link = item.find_element_by_tag_name('a').get_attribute('href')
                    list_links.append(rel_link)
                else:
                    input_datetime = time.strptime(rel_date, '%d %B %Y')
                    if input_datetime < lower_datetime:
                        isbreak = 1
                        break
                    elif input_datetime >= lower_datetime and input_datetime <= upper_datetime:
                        rel_link = item.find_element_by_tag_name('a').get_attribute('href')
                        list_links.append(rel_link)

            """This is for navigation to next page"""
            next_b = driver.find_element_by_xpath('//div[@class="srch-Page srch-Page-bg"]')
            if next_b.get_attribute('class') == 'srch-Page-img':
                next_b.click()
            else:
                break

            if isbreak == 1:
                break

        except KeyboardInterrupt:
            break
        except NoSuchWindowException:
            break
        except:
            raise

    return list_links

def get_news_content(driver, link):
    driver.get(link)

    try:
        rels_date = driver.find_element_by_class_name('ao-rteElement-H4').text
    except NoSuchElementException:
        rels_date = ''

    try:
        headline = driver.find_element_by_class_name('ao-rteElement-H1').text
    except NoSuchElementException:
        headline = ''

    try:
        content1 = driver.find_element_by_class_name('ao-rteElement-introText').text
    except NoSuchElementException:
        content1 = ''
    try:
        content2 = driver.find_element_by_id('ctl00_PlaceHolderMain_main__ControlWrapper_ExtendedRichHtmlField').text
    except NoSuchElementException:
        content2 = ''
    content = '\n'.join([content1, content2]).strip()

    return {'news_date': rels_date,
            'news_content': content,
            'news_headline': headline}

def extract_data(adict):

    return [adict.get('news_date', ''),
            adict.get('news_headline', ''),
            adict.get('news_content', '')]


if name == "main": """Highlight the file variables such as file name and headers for columns with a date stamp of===""" printout = time.strftime('%y%m%d_%H%M%S', time.localtime()) + '_allenovery_news.xlsx'
header = ['Firm Name','Date', 'Headline Title', 'News Content']

wb = Workbook()
ws = wb.active
ws.append(header)
log = open('test.txt', 'w')

"""Identify target link where the data is stored"""
target = 'http://www.allenovery.com/search/Pages/results.aspx?k=*&v1=-write&s=NewsAndDeals&r=aolanguage%3d%22AQdFbmdsaXNoCmFvbGFuZ3VhZ2UBAV4BJA%3d%3d%22'

"""Engage Chrome Driver"""
chromeOptions = webdriver.ChromeOptions()
prefs = {"profile.managed_default_content_settings.images":2}
chromeOptions.add_experimental_option("prefs",prefs)
driver = webdriver.Chrome(chrome_options=chromeOptions)
driver.set_page_load_timeout = 120
driver.maximize_window()

"""Time format : dd/mm/yyyy"""
"""Select your timeframe below should you wish. Otherwise leave fields blank '' """
upper_datetime_str = ''
lower_datetime_str = ''

print('Collecting news links')
list_ppls = get_links(driver, target, upper_datetime_str, lower_datetime_str)    
driver.quit()


"""Engage Chrome Driver"""
driver = webdriver.Chrome(chrome_options=chromeOptions)
driver.set_page_load_timeout = 120
driver.maximize_window()


total_link = len(list_ppls)
idx = 0
while idx < total_link:
    try:
        print(idx + 1, 'in', total_link, list_ppls[idx])

        """Append client name to data"""
        ws.append(['Allen and Overy'] + extract_data(get_news_content(driver, list_ppls[idx])))          
        idx += 1  
        if not(idx%100):
            wb.save(printout) 
            driver.quit()
            time.sleep(10)
            driver = webdriver.Chrome(chrome_options=chromeOptions)
            driver.set_page_load_timeout = 120
            driver.maximize_window()

    except KeyboardInterrupt:
        break
    except NoSuchWindowException:
        break
    except:
        driver.quit()
        time.sleep(10)
        driver = webdriver.Chrome(chrome_options=chromeOptions)
        driver.set_page_load_timeout = 120
        driver.maximize_window()
        continue

wb.save(printout)        
log.close()
driver.quit()

Upvotes: 1

Views: 3318

Answers (1)

Andersson

Reputation: 52665

You're trying to handle the wrong element (driver.find_element_by_xpath('//div[@class="srch-Page srch-Page-bg"]') is not what you actually need). Try implementing the code below:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import time

try:
    time.sleep(1)
    WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.ID, "SRP_NextImg"))).click()
except TimeoutException:
    break

This should allow you to get the next page for as long as the "Next" button (>) is available.
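For context, here is a minimal sketch of how that wait might replace the navigation block inside the question's while loop. It keeps the Selenium 3 find_elements_by_* calls and the "srch-Title3" XPath from the question, and assumes the SRP_NextImg id from the snippet above is rendered on every results page (the date filtering is omitted to keep the navigation logic visible):

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
# Results URL taken from the question.
driver.get('http://www.allenovery.com/search/Pages/results.aspx?k=*&v1=-write&s=NewsAndDeals&r=aolanguage%3d%22AQdFbmdsaXNoCmFvbGFuZ3VhZ2UBAV4BJA%3d%3d%22')

list_links = []
while True:
    # Collect the article links rendered on the current results page.
    for item in driver.find_elements_by_xpath('//div[@class = "srch-Title3"]'):
        list_links.append(item.find_element_by_tag_name('a').get_attribute('href'))

    try:
        # Wait up to 20 seconds for the "Next" (>) control and click it;
        # on the last page the wait times out and the loop ends.
        WebDriverWait(driver, 20).until(
            EC.element_to_be_clickable((By.ID, "SRP_NextImg"))).click()
    except TimeoutException:
        break

print(len(list_links), 'links collected')
driver.quit()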

Upvotes: 0
