Reputation: 35
Below is the code, when I run the script, it starts over from the 1st page while running. The aim is to get the post title, date, and body from each page and then click next at the bottom of each page to start the process again.
Here are the includes: import requests import csv import urllib.parse as urlparse from urllib.parse import parse_qs from bs4 import BeautifulSoup from selenium import webdriver import time
browser = webdriver.Chrome('/Users/Xander/desktop/scraper/chromedriver')
URL = "https://www.jancox.com/jans-daily-news"
browser.get(URL)
URL_PAG = None
PAG = None
# Function Definition
def scrapeP(r):
count = 0
soup = BeautifulSoup(r.content, 'html5lib') # If this line causes an error, run 'pip install html5lib' or install html5lib
quotes = []
table = soup.find('div', attrs = {'class':'main-content'})
for post in table.findAll('div', attrs = {'class':'post'}):
quote = {}
quote['title'] = post.h1.text
print(quote['date_published'])
doc = browser.find_elements_by_xpath('/html/body/div/div/div[2]/div/div[1]/div/div[2]/nav/div/ul/li[2]/a')[0]
time.sleep(2)
doc.click()
URL_PAG = browser.current_url
count += 1
PAG = True
time.sleep(10)
print(count, ' - ', URL_PAG)
if(count % 10 == 0):
filename = 'post.csv'
with open(filename, 'a+', newline='') as f:
w = csv.DictWriter(f,['title', 'post', 'date_published'])
w.writeheader()
for quote in quotes:
w.writerow(quote)
quote.clear()
while True:
if(PAG == True):
browser.get(URL_PAG)
r = requests.get(URL_PAG)
print(URL_PAG)
scrapeP(r)
else:
browser.get(URL)
r = requests.get(URL)
scrapeP(r)
Upvotes: 0
Views: 76
Reputation: 35
The issue was that I had the function to click next and print in the for loop. I needed to allow the for loop to read the entire page before clicking next.
for post in table.findAll('div', attrs = {'class':'post'}):
quote = {}
quote['title'] = post.h1.text
quote['date_published'] = post.time.text
quote['post'] = post.div.text.strip()
print(quote['date_published'])
quotes.append(quote)
time.sleep(2)
doc = browser.find_elements_by_xpath('/html/body/div/div/div[2]/div/div[1]/div/div[2]/nav/div/ul/li[2]/a')[0]
doc.click()
time.sleep(2)
URL_PAG = browser.current_url
count += 1
PAG = True
time.sleep(2)
filename = 'post.csv'
with open(filename, 'a+', newline='') as f:
w = csv.DictWriter(f,['title', 'post', 'date_published'])
w.writeheader()
for quote in quotes:
w.writerow(quote)
quote.clear()
f.close()
Upvotes: 0
Reputation: 1699
The loop never starts. The get request that you see is the one fired at line 3
PAG is None and you change its value inside your scrapeP function, so you never change its value outside scrapeP function's scope.
define
global pag
inside your scrapeP function to change its value from within a scrapeP
>>> PAG = None
>>> def scrapeP():
... global PAG
... PAG = True
...
>>> scrapeP()
>>> PAG
True
Upvotes: 1