Reputation: 2227
import requests
from bs4 import BeautifulSoup
def getdata(url, timeout=30):
    """Download *url* and return the page parsed as a BeautifulSoup tree.

    The request goes through the module-level ``requests.Session`` named
    ``s`` and sends a desktop-browser ``User-Agent`` header.

    Args:
        url: Absolute URL of the page to fetch.
        timeout: Seconds to wait for the server before ``requests`` gives
            up. Without a timeout a stalled connection blocks forever.

    Returns:
        A ``BeautifulSoup`` object built with the ``html.parser`` backend.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 11_5_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36"
    }
    # The HTTP status is deliberately not checked: whatever body comes back
    # is parsed, and the caller decides what to do when nothing is found.
    r = s.get(url, headers=headers, timeout=timeout).text
    return BeautifulSoup(r, 'html.parser')
def getnextpage(soup):
    """Return the absolute URL of the next results page, or ``None``.

    Looks for the pagination container ``div`` and, inside it, the "next"
    anchor, then prefixes the anchor's ``href`` with the site root.

    Args:
        soup: Parsed results page (any object exposing a bs4-style
            ``find(name, attrs)`` method).

    Returns:
        The next page URL as a string, or ``None`` when the container, the
        anchor, or its ``href`` attribute is missing. In that case
        "All pages scraped" is printed.
    """
    try:
        div = soup.find('div', {'class': 'a-section a-spacing-large a-spacing-top-large a-text-center s-pagination-container'})
        url_a = div.find('a', {'class': 's-pagination-item s-pagination-next s-pagination-button s-pagination-separator'})['href']
    except (AttributeError, TypeError, KeyError):
        # AttributeError: container div not found (``div`` is None).
        # TypeError: next anchor not found (subscripting None).
        # KeyError: anchor present but without an ``href`` attribute.
        # NOTE(review): both lookups depend on exact class strings, so a page
        # served with different pagination markup also ends up here.
        print("All pages scraped")
        return None
    return 'https://www.amazon.in' + url_a
# Shared HTTP session; getdata() reads this module-level name.
s = requests.Session()

# First results page; each iteration replaces it with the next page's URL.
url = 'https://www.amazon.in/s?k=apple&i=electronics&ref=nb_sb_noss_2'

while True:
    soup = getdata(url)
    # getdeals(soup)  # per-page processing hook; not defined in this snippet
    url = getnextpage(soup)
    if url is None:
        # No next link was found: stop instead of requesting ``None``.
        break
    print(url)
I am scraping Amazon. However, the `a` element for the next page is sometimes found and sometimes not, and I cannot understand why. If it is found, the script keeps going through all the pages; otherwise it fails on the very first page. How can I correct this?
Upvotes: 1
Views: 250
Reputation: 195428
It seems that Amazon returns two different versions of the HTML page. I've modified the `getnextpage()` function to try different elements when looking for the next-page URL:
import requests
from bs4 import BeautifulSoup
def getdata(url, timeout=30):
    """Fetch *url* and return its HTML parsed into a BeautifulSoup tree.

    Uses the module-level ``requests.Session`` named ``s`` and sends a
    desktop-browser ``User-Agent`` header with the request.

    Args:
        url: Absolute URL of the page to download.
        timeout: Seconds to wait for the server before ``requests`` raises
            a timeout error, so a stalled connection cannot hang the run.

    Returns:
        A ``BeautifulSoup`` object built with the ``html.parser`` backend.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 11_5_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36"
    }
    # The response status is not inspected; the body is parsed as-is and the
    # caller handles pages that lack the expected elements.
    r = s.get(url, headers=headers, timeout=timeout).text
    return BeautifulSoup(r, "html.parser")
def getnextpage(soup):
    """Return the absolute URL of the next results page, or ``None``.

    Two CSS selectors are tried in order, because the page may come back in
    more than one HTML layout: ``li.a-last a`` first, then
    ``.s-pagination-next``.

    Args:
        soup: Parsed results page (any object exposing a bs4-style
            ``select_one(selector)`` method).

    Returns:
        The site root joined with the matched element's ``href``, or
        ``None`` when neither selector matches or the match has no
        ``href``.
    """
    next_ = soup.select_one("li.a-last a")
    if next_ is None:
        next_ = soup.select_one(".s-pagination-next")
    if next_ is None:
        return None

    # ``.s-pagination-next`` is a class-only selector, so it can match an
    # element that is not a link. Use .get() so a missing ``href`` ends the
    # crawl cleanly instead of raising KeyError.
    # NOTE(review): presumably the last page renders a disabled, non-anchor
    # "next" control - confirm against live markup.
    href = next_.get("href")
    if not href:
        return None
    return "https://www.amazon.in" + href
# Shared HTTP session; getdata() reads this module-level name.
s = requests.Session()

# First results page; each iteration replaces it with the next page's URL.
url = "https://www.amazon.in/s?k=apple&i=electronics&ref=nb_sb_noss_2"

while True:
    soup = getdata(url)
    # getdeals(soup)  # per-page processing hook; not defined in this snippet
    url = getnextpage(soup)
    if not url:
        # getnextpage() found no further page: the crawl is finished.
        break
    print(url)
Prints:
https://www.amazon.in/s?k=apple&i=electronics&page=2&qid=1630604251&ref=sr_pg_1
https://www.amazon.in/s?k=apple&i=electronics&page=3&qid=1630604252&ref=sr_pg_2
https://www.amazon.in/s?k=apple&i=electronics&page=4&qid=1630604253&ref=sr_pg_3
...
Upvotes: 1