Reputation: 694
I am totally new to web scraping. How can I scrape a website whose URL doesn't change with the page number? Take this website, for example: https://www.bseindia.com/corporates/Forth_Results.aspx. The URL stays the same no matter which page of the table you are on, which is exactly the situation I'm asking about. How can this be done using Beautiful Soup in Python?
Upvotes: 1
Views: 1513
Reputation: 195553
This script will go through all the pages of the table:
import requests
from bs4 import BeautifulSoup
url = 'https://www.bseindia.com/corporates/Forth_Results.aspx'
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0'}
soup = BeautifulSoup(requests.get(url, headers=headers).content, 'html.parser')
page = 1
while True:
    print(page)

    rows = soup.select('.TTRow')
    if not rows:
        break

    # print some data to the screen:
    for tr in rows:
        print(tr.get_text(strip=True, separator=' '))

    # to get the next page, you have to do a POST request with the correct data;
    # the data is located in the <input name="..." value="..."> tags
    d = {}
    for i in soup.select('input'):
        d[i['name']] = i.get('value', '')

    # some parameters need to be deleted:
    if 'ctl00$ContentPlaceHolder1$btnSubmit' in d:
        del d['ctl00$ContentPlaceHolder1$btnSubmit']

    # set the correct page:
    page += 1
    d['__EVENTTARGET'] = 'ctl00$ContentPlaceHolder1$gvData'
    d['__EVENTARGUMENT'] = 'Page${}'.format(page)

    soup = BeautifulSoup(requests.post(url, headers=headers, data=d).content, 'html.parser')
Prints:
1
500002 ABB 23 Jul 2020
531082 ALANKIT 23 Jul 2020
535916 ALSL 23 Jul 2020
526662 ARENTERP 23 Jul 2020
500215 ATFL 23 Jul 2020
540611 AUBANK 23 Jul 2020
532523 BIOCON 23 Jul 2020
533167 COROENGG 23 Jul 2020
532839 DISHTV 23 Jul 2020
500150 FOSECOIND 23 Jul 2020
507488 GMBREW 23 Jul 2020
532855 HARYNACAP 23 Jul 2020
541729 HDFCAMC 23 Jul 2020
524342 INDOBORAX 23 Jul 2020
522183 ITL 23 Jul 2020
534623 JUPITERIN 23 Jul 2020
533192 KCPSUGIND 23 Jul 2020
542753 MAHAANIMP 23 Jul 2020
532525 MAHABANK 23 Jul 2020
523754 MAHEPC 23 Jul 2020
531680 MAYUR 23 Jul 2020
526299 MPHASIS 23 Jul 2020
532416 NEXTMEDIA 23 Jul 2020
502294 NILACHAL 23 Jul 2020
538772 NIYOGIN 23 Jul 2020
2
530805 OIVL 23 Jul 2020
538742 PANACHE 23 Jul 2020
531879 PIONDIST 23 Jul 2020
540173 PNBHOUSING 23 Jul 2020
533178 PRADIP 23 Jul 2020
...and so on.
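The reason this works is that the page is an ASP.NET WebForms page: the paging state lives in hidden form fields rather than in the URL, and every "next page" request has to echo those fields back in a POST. If you want to see which hidden fields the loop is re-posting, here is a minimal standalone sketch (the exact field names depend on the page, but WebForms sites typically include __VIEWSTATE, __VIEWSTATEGENERATOR and __EVENTVALIDATION):

import requests
from bs4 import BeautifulSoup

url = 'https://www.bseindia.com/corporates/Forth_Results.aspx'
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0'}

soup = BeautifulSoup(requests.get(url, headers=headers).content, 'html.parser')

# list the hidden inputs the server expects to get back on every page change
for inp in soup.select('input[type="hidden"]'):
    print(inp.get('name'), '->', len(inp.get('value', '')), 'characters')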
EDIT: To save the data as a CSV file, you can use this:
import requests
import pandas as pd
from bs4 import BeautifulSoup
url = 'https://www.bseindia.com/corporates/Forth_Results.aspx'
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0'}
soup = BeautifulSoup(requests.get(url, headers=headers).content, 'html.parser')
page = 1
all_data = []
while True:
    print(page)

    rows = soup.select('.TTRow')
    if not rows:
        break

    # collect the row data:
    for tr in rows:
        row = tr.get_text(strip=True, separator='|').split('|')
        all_data.append(row)

    # to get the next page, you have to do a POST request with the correct data;
    # the data is located in the <input name="..." value="..."> tags
    d = {}
    for i in soup.select('input'):
        d[i['name']] = i.get('value', '')

    # some parameters need to be deleted:
    if 'ctl00$ContentPlaceHolder1$btnSubmit' in d:
        del d['ctl00$ContentPlaceHolder1$btnSubmit']

    # set the correct page:
    page += 1
    d['__EVENTTARGET'] = 'ctl00$ContentPlaceHolder1$gvData'
    d['__EVENTARGUMENT'] = 'Page${}'.format(page)

    soup = BeautifulSoup(requests.post(url, headers=headers, data=d).content, 'html.parser')
df = pd.DataFrame(all_data)
print(df)
df.to_csv('data.csv')
Produces data.csv (screenshot from LibreOffice omitted).
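Since all_data is a plain list of lists, the CSV only has numbered columns. If you also want named columns and parsed dates, a small follow-up sketch could look like the one below; the column labels are only a guess based on the printed output above, not names taken from the site:

import pandas as pd

# rows in the same shape the scraping loop above produces
all_data = [
    ['500002', 'ABB', '23 Jul 2020'],
    ['531082', 'ALANKIT', '23 Jul 2020'],
]

# the column labels are assumed, adjust them to whatever the table actually shows
df = pd.DataFrame(all_data, columns=['Security Code', 'Security Name', 'Result Date'])
df['Result Date'] = pd.to_datetime(df['Result Date'], format='%d %b %Y')
df.to_csv('data.csv', index=False)
print(df)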
Upvotes: 1