Reputation: 49
I am quite new to using Selenium (Python). I have just scraped some data off a website exactly the way I wanted, but the code only pulls the first 10 records; it never loops through the remaining pages. Would you happen to know why the script fails to open the subsequent pages? Any help would be greatly appreciated. I suspect the problem is in the part marked """This is for navigation to next page""".
Code:
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import NoSuchWindowException
import time
from openpyxl import Workbook
"""This is to collect links that associate with all the profiles present in Allen Overy website""" def get_links(driver, target, upper_datetime_str, lower_datetime_str):
"""This part allows me to filter on date so I'm not having to pull back all the data each time it's run"""
if upper_datetime_str == '' and lower_datetime_str == '':
time_constrain = 0
else:
time_constrain = 1
upper_datetime = time.strptime(upper_datetime_str,'%d/%m/%Y')
lower_datetime = time.strptime(lower_datetime_str,'%d/%m/%Y')
"""This is to search for news that present in Freshfields website"""
"""Go to page that contains news list"""
driver.get(target)
    isbreak = 0
    list_links = []
    while True:
        try:
            """Get the links associated with the news items on each page"""
            list_ppl_link = driver.find_elements_by_xpath('//div[@class = "srch-Title3"]')
            for item in list_ppl_link:
                # the date displayed with each search result
                rel_date = item.find_element_by_xpath('.//div[@class = "srch-Metadata2"]').text
                if time_constrain == 0:
                    rel_link = item.find_element_by_tag_name('a').get_attribute('href')
                    list_links.append(rel_link)
                else:
                    input_datetime = time.strptime(rel_date, '%d %B %Y')
                    if input_datetime < lower_datetime:
                        isbreak = 1
                        break
                    elif lower_datetime <= input_datetime <= upper_datetime:
                        rel_link = item.find_element_by_tag_name('a').get_attribute('href')
                        list_links.append(rel_link)
"""This is for navigation to next page"""
next_b = driver.find_element_by_xpath('//div[@class="srch-Page srch-Page-bg"]')
if next_b.get_attribute('class') == 'srch-Page-img':
next_b.click()
else:
break
if isbreak == 1:
break
except KeyboardInterrupt:
break
except NoSuchWindowException:
break
except:
raise
return list_links
def get_news_content(driver, link):
    driver.get(link)
    try:
        rels_date = driver.find_element_by_class_name('ao-rteElement-H4').text
    except NoSuchElementException:
        rels_date = ''
    try:
        headline = driver.find_element_by_class_name('ao-rteElement-H1').text
    except NoSuchElementException:
        headline = ''
    try:
        content1 = driver.find_element_by_class_name('ao-rteElement-introText').text
    except NoSuchElementException:
        content1 = ''
    try:
        content2 = driver.find_element_by_id('ctl00_PlaceHolderMain_main__ControlWrapper_ExtendedRichHtmlField').text
    except NoSuchElementException:
        content2 = ''
    content = '\n'.join([content1, content2]).strip()
    return {'news_date': rels_date,
            'news_content': content,
            'news_headline': headline}
def extract_data(adict):
    return [adict.get('news_date', ''),
            adict.get('news_headline', ''),
            adict.get('news_content', '')]
if name == "main":
"""Highlight the file variables such as file name and headers for columns with a date stamp of==="""
printout = time.strftime('%y%m%d_%H%M%S', time.localtime()) + '_allenovery_news.xlsx'
header = ['Firm Name','Date', 'Headline Title', 'News Content']
wb = Workbook()
ws = wb.active
ws.append(header)
log = open('test.txt', 'w')
"""Identify target link where the data is stored"""
target = 'http://www.allenovery.com/search/Pages/results.aspx?k=*&v1=-write&s=NewsAndDeals&r=aolanguage%3d%22AQdFbmdsaXNoCmFvbGFuZ3VhZ2UBAV4BJA%3d%3d%22'
"""Engage Chrome Driver"""
chromeOptions = webdriver.ChromeOptions()
prefs = {"profile.managed_default_content_settings.images":2}
chromeOptions.add_experimental_option("prefs",prefs)
driver = webdriver.Chrome(chrome_options=chromeOptions)
driver.set_page_load_timeout = 120
driver.maximize_window()
"""Time format : dd/mm/yyyy"""
"""Select your timeframe below should you wish. Otherwise leave fields blank '' """
upper_datetime_str = ''
lower_datetime_str = ''
    print('Collecting news links')
    list_ppls = get_links(driver, target, upper_datetime_str, lower_datetime_str)
    driver.quit()
    """Restart the Chrome driver"""
    driver = webdriver.Chrome(chrome_options=chromeOptions)
    driver.set_page_load_timeout(120)
    driver.maximize_window()
    total_link = len(list_ppls)
    idx = 0
    while idx < total_link:
        try:
            print(idx + 1, 'in', total_link, list_ppls[idx])
            """Append the firm name to the data"""
            ws.append(['Allen and Overy'] + extract_data(get_news_content(driver, list_ppls[idx])))
            idx += 1
            """Save periodically and restart the driver every 100 records"""
            if not (idx % 100):
                wb.save(printout)
                driver.quit()
                time.sleep(10)
                driver = webdriver.Chrome(chrome_options=chromeOptions)
                driver.set_page_load_timeout(120)
                driver.maximize_window()
        except KeyboardInterrupt:
            break
        except NoSuchWindowException:
            break
        except:
            driver.quit()
            time.sleep(10)
            driver = webdriver.Chrome(chrome_options=chromeOptions)
            driver.set_page_load_timeout(120)
            driver.maximize_window()
            continue
    wb.save(printout)
    log.close()
    driver.quit()
Upvotes: 1
Views: 3318
Reputation: 52665
You're trying to handle the wrong element (driver.find_element_by_xpath('//div[@class="srch-Page srch-Page-bg"]') is not what you actually need). Try to implement the code below:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import time

try:
    time.sleep(1)
    WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.ID, "SRP_NextImg"))).click()
except TimeoutException:
    break
This should allow you to keep getting the next page for as long as the "Next" button (">") is available.
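For context, here is a minimal sketch of how that snippet could slot into the pagination part of the while loop in your get_links function, replacing the srch-Page block. The SRP_NextImg locator comes from the snippet above; the link-collection body is elided and everything else follows your original structure:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import time

list_links = []
while True:
    # ... collect the links on the current results page as before ...
    """This is for navigation to next page"""
    try:
        time.sleep(1)
        # wait up to 20 seconds for the "Next" arrow to become clickable, then click it
        WebDriverWait(driver, 20).until(
            EC.element_to_be_clickable((By.ID, "SRP_NextImg"))).click()
    except TimeoutException:
        # no clickable "Next" button, so this was the last results page
        break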
Upvotes: 0