Reputation: 67
I have a list of Google News links from the Google RSS feed, and I would like to get the full text of those articles. I use the BeautifulSoup library to scrape the data; however, it seems that Google redirects to a cookie consent page first. What is the best way to handle the consent step? Is there a way to bypass the consent page, or to programmatically accept/reject it?
The example below returns the text of the consent page rather than the article text.
# Import libraries
import requests
from bs4 import BeautifulSoup
google_rss_url = 'https://news.google.com/rss/articles/CBMimwFBVV95cUxNVmJMNUdiamVCNkJSb1E4NVU0SlBFQUNneXpEaHFuRUJpN3lwRXFNNGdRalpITmFUQUh4Z3lsOVZ4ekFSdWVwVEljVUJOT241S1g2dmRmd3NnRmJjamU4TVFFdUVXd0N2MGVPTUdxb0RVZ2xQbUlkS1Y3eEhKbmdBN2hSUHNzS2ZucjlKQl84SW13ZVpXYlZXRnRSZw?oc=5'
# Send a GET request to the URL
headers = {'User-Agent': 'Mozilla/5.0'}
response = requests.get(google_rss_url, headers=headers)
# Parse the response content with BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')
# Find the main content of the article
paragraphs = soup.find_all('p')
# Join the text from all paragraphs
article_text = '\n'.join([para.get_text() for para in paragraphs])
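One rough check (an addition here, not part of the original snippet, continuing from the code above) to confirm the request really landed on the consent interstitial is to look at the final URL after redirects:
# Inspect where the redirects ended up (continuing from the snippet above)
print(response.url)
if response.url.startswith('https://consent.google.com/'):
    print('Landed on the consent page instead of the article')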
Upvotes: 2
Views: 558
Reputation: 1
I had the same issue with the consent page when accessing Google Maps. I managed to get around it by sending the appropriate cookies with the request.
In Chrome, open DevTools and go to the Application tab, then Storage > Cookies; there you will find the Google cookies stored under https://www.google.com/.
After passing the cookies named NID and SOCS with the request, I managed to get the actual page.
import requests

cookies = {
    'NID': 'my_cookie_value',
    'SOCS': 'my_cookie_value',
}
response = requests.get('my_url', cookies=cookies)
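Applied to the Google News link from the question, a minimal sketch of the same idea might look like this (the cookie values are placeholders to copy from your own browser, and I have not verified that these two cookies alone are enough for news.google.com):
import requests
from bs4 import BeautifulSoup

# Placeholder values: copy the real NID and SOCS values from your own browser
cookies = {
    'NID': 'my_cookie_value',
    'SOCS': 'my_cookie_value',
}
headers = {'User-Agent': 'Mozilla/5.0'}
google_rss_url = 'https://news.google.com/rss/articles/...'  # the RSS link from the question
response = requests.get(google_rss_url, headers=headers, cookies=cookies)
print(response.url)  # check whether we still end up on consent.google.com
soup = BeautifulSoup(response.content, 'html.parser')
print(soup.title.get_text() if soup.title else 'no <title> found')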
Upvotes: 0
Reputation: 1906
I still can't replicate the consent page issue, even with incognito mode, a VPN, or a different browser. But even if you bypass the consent page, you'll get a redirect page that requires JavaScript execution.
Here's how to bypass all of that and get the final article URL:
import requests
from urllib.parse import urlencode, urlparse
import json
google_rss_url = 'https://news.google.com/rss/articles/CBMimwFBVV95cUxNVmJMNUdiamVCNkJSb1E4NVU0SlBFQUNneXpEaHFuRUJpN3lwRXFNNGdRalpITmFUQUh4Z3lsOVZ4ekFSdWVwVEljVUJOT241S1g2dmRmd3NnRmJjamU4TVFFdUVXd0N2MGVPTUdxb0RVZ2xQbUlkS1Y3eEhKbmdBN2hSUHNzS2ZucjlKQl84SW13ZVpXYlZXRnRSZw?oc=5'
# parse guid from url, or directly get it from rss <guid>
guid = urlparse(google_rss_url).path.replace('/rss/articles/', '')
param = '["garturlreq",[["en-US","US",["FINANCE_TOP_INDICES","WEB_TEST_1_0_0"],null,null,1,1,"US:en",null,null,null,null,null,null,null,0,5],"en-US","US",true,[2,4,8],1,true,"661099999",0,0,null,0],{guid}]'
payload = urlencode({
    'f.req': [[['Fbv4je', param.format(guid=guid), 'null', 'generic']]]
})
headers = {
    'content-type': 'application/x-www-form-urlencoded;charset=UTF-8',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36',
}
url = "https://news.google.com/_/DotsSplashUi/data/batchexecute"
response = requests.post(url, headers=headers, data=payload)
array_string = json.loads(response.text.replace(")]}'", ""))[0][2]
article_url = json.loads(array_string)[1]
print(article_url)
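From there, fetching the article body can be done as in the question, for example with requests and BeautifulSoup (a sketch, not part of the original answer; how much text the <p> tags capture depends on the publisher's markup and any paywall or JavaScript rendering):
from bs4 import BeautifulSoup

# Continue from the snippet above: article_url is the resolved publisher URL
article_response = requests.get(article_url, headers={'User-Agent': 'Mozilla/5.0'})
article_soup = BeautifulSoup(article_response.content, 'html.parser')
article_text = '\n'.join(p.get_text() for p in article_soup.find_all('p'))
print(article_text[:500])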
Upvotes: 2
Reputation: 67
The best solution I have managed to find so far is to use the Selenium library to accept the cookies, wait to be redirected to the source website, and extract the source URL. Then I use the newspaper library to extract the article content. My code is below.
# Import libraries
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from newspaper import Article, Config
# Set up driver
# Path to your ChromeDriver
driver_path = 'C:\\WebDrivers\\chromedriver-win64\\chromedriver.exe'
# Set up Chrome options
options = Options()
options.add_argument("--headless") # Run in headless mode
# Initialize the WebDriver
service = Service(driver_path)
driver = webdriver.Chrome(service=service, options=options)
# Get source url
google_rss_url = 'https://news.google.com/rss/articles/CBMimwFBVV95cUxNVmJMNUdiamVCNkJSb1E4NVU0SlBFQUNneXpEaHFuRUJpN3lwRXFNNGdRalpITmFUQUh4Z3lsOVZ4ekFSdWVwVEljVUJOT241S1g2dmRmd3NnRmJjamU4TVFFdUVXd0N2MGVPTUdxb0RVZ2xQbUlkS1Y3eEhKbmdBN2hSUHNzS2ZucjlKQl84SW13ZVpXYlZXRnRSZw?oc=5'
max_wait_time = 5
page_data = dict()
try:
    # Open the URL
    driver.get(google_rss_url)
    wait = WebDriverWait(driver, max_wait_time)
    # Check if the current URL is the consent page
    current_url = driver.current_url
    # print(f'Current url: {current_url}')
    if current_url.startswith("https://consent.google.com/"):
        print('Accepting cookie consent')
        try:
            # Wait for the "Accept All" button to be clickable and click it
            accept_all_button = wait.until(EC.element_to_be_clickable((By.XPATH, '//button[.//span[text()="Accept all"]]')))
            if accept_all_button:
                accept_all_button.click()
                print('Accept All button clicked')
        except Exception:
            print('No Accept All button found')
    # Wait until the URL changes to something that does not start with "https://news.google.com/"
    print("Redirecting to source website")
    wait.until(lambda driver: not driver.current_url.startswith("https://news.google.com/")
               and not driver.current_url.startswith("https://consent.google.com/"))
    # Wait for the article page to be fully loaded
    wait.until(EC.presence_of_element_located((By.TAG_NAME, 'body')))
    # Get page URL
    page_data['source_url'] = driver.current_url
except Exception as e:
    print(f"An error occurred getting source url for {google_rss_url}: {e}")
driver.quit()
# Print source url
print(page_data['source_url'])
# Get article content using the newspaper3k library
# Url
url = page_data['source_url']
# Config user agent
user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
config = Config()
config.browser_user_agent = user_agent
article_data = dict()
try:
    # Use newspaper3k to download and parse the article
    article = Article(url, config=config)
    article.download()
    article.parse()
    # Save data
    article_data['article_title'] = article.title
    article_data['article_text'] = article.text
except Exception as e:
    print(f"Error extracting article from {url}: {e}")
# Check output
print(article_data)
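If you need to process a whole feed of RSS links, one way to reuse a single headless Chrome session is to wrap the same Selenium steps in a helper. This is only a sketch; the function name resolve_source_url, the 10-second timeout, and the list handling are my own choices, not something Selenium or newspaper3k provides:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options

def resolve_source_url(driver, rss_url, max_wait_time=10):
    # Open the Google News RSS link, accept the consent page if it appears,
    # and return the final publisher URL
    driver.get(rss_url)
    wait = WebDriverWait(driver, max_wait_time)
    if driver.current_url.startswith("https://consent.google.com/"):
        button = wait.until(EC.element_to_be_clickable(
            (By.XPATH, '//button[.//span[text()="Accept all"]]')))
        button.click()
    wait.until(lambda d: not d.current_url.startswith("https://news.google.com/")
               and not d.current_url.startswith("https://consent.google.com/"))
    return driver.current_url

driver_path = 'C:\\WebDrivers\\chromedriver-win64\\chromedriver.exe'  # same path as above
options = Options()
options.add_argument("--headless")
driver = webdriver.Chrome(service=Service(driver_path), options=options)
try:
    rss_links = [google_rss_url]  # extend with the rest of the feed
    source_urls = [resolve_source_url(driver, link) for link in rss_links]
finally:
    driver.quit()
print(source_urls)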
Let me know if anyone finds a more straightforward solution.
Upvotes: 0