I need to scrape all the table data from the Rajya Sabha debates website. However, instead of scraping from the paginated URL it builds on each iteration, the code keeps scraping the original table, page by page.
from selenium import webdriver
import chromedriver_binary
import os
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import csv
import time
import lxml

url = 'https://rsdebate.nic.in/simple-search?query=climate+change&sort_by=dc.identifier.sessionnumber_sort&order=asc&rpp=100&etal=0&start=0'
#url_call = f"https://rsdebate.nic.in/simple-search?query=climate+change&sort_by=dc.identifier.sessionnumber_sort&order=asc&rpp=100&etal=0&start={i}"
page = requests.get(url)
soup = BeautifulSoup(page.text, 'lxml')
table1 = soup.find('table', id='sam_table')

headers = []
for a in table1.find_all('th'):
    title = a.text
    headers.append(title)

rsdata = pd.DataFrame(columns=headers)
rsdata.to_csv('rs_debate_data.csv', mode='a', index=False)

# Create a for loop to fill rajya sabha data
for k in range(0, 96):
    url_call = f"https://rsdebate.nic.in/simple-search?query=climate+change&sort_by=dc.identifier.sessionnumber_sort&order=asc&rpp=100&etal=0&start={k}"
    page = requests.get(url_call)
    for j in table1.find_all('tr')[1:]:
        row_data = j.find_all('td')
        row = [i.text for i in row_data]
        length = len(rsdata)
        rsdata.loc[length] = row
    rsdata.to_csv('rs_debate_data.csv', mode='a', index=False, header=False)
    print(k)

# Export to csv
# Try to read csv
#rs_data = pd.read_csv('rs_debate_data.csv')
I was trying to scrape only the rows related to the keyword "climate change" in the Debate Title column of the table.
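For illustration only, once the scraped rows are in the CSV, keeping just the rows whose debate title mentions climate change could look like the sketch below. The column name 'Debate Title' is an assumption about the table's header and may need to match whatever the site actually uses.

import pandas as pd

rs_data = pd.read_csv('rs_debate_data.csv')
# Assumption: the header cell for the debate-title column is literally "Debate Title".
mask = rs_data['Debate Title'].str.contains('climate change', case=False, na=False)
climate_rows = rs_data[mask]
climate_rows.to_csv('rs_debate_climate_only.csv', index=False)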
This loop does a find_all() on the original table1, not on the page it just fetched:

for k in range(0, 96):
    url_call = "..."
    page = requests.get(url_call)
    for j in table1.find_all('tr')[1:]:

table1 was parsed once from the first page before the loop, so every iteration re-reads the same rows. Each response needs to be parsed into a new BeautifulSoup object, and the table located in that freshly parsed page, before iterating over its rows.
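A minimal sketch of the corrected loop, assuming the results table keeps the id sam_table on every page. Note also that with rpp=100 the start parameter most likely has to advance in steps of 100; that stepping is an assumption about the site's paging, not something confirmed by the original code.

import requests
import pandas as pd
from bs4 import BeautifulSoup

BASE = ("https://rsdebate.nic.in/simple-search?query=climate+change"
        "&sort_by=dc.identifier.sessionnumber_sort&order=asc&rpp=100&etal=0&start={}")

headers = None
rows = []
# Assumption: with rpp=100 results per page, 'start' steps by 100 to move page by page.
for start in range(0, 9600, 100):
    page = requests.get(BASE.format(start))
    soup = BeautifulSoup(page.text, 'lxml')      # parse the response that was just fetched
    table = soup.find('table', id='sam_table')   # locate the table in *this* page
    if table is None:                            # no table means we are past the last page
        break
    if headers is None:
        headers = [th.text.strip() for th in table.find_all('th')]
    for tr in table.find_all('tr')[1:]:
        rows.append([td.text.strip() for td in tr.find_all('td')])

rsdata = pd.DataFrame(rows, columns=headers)
rsdata.to_csv('rs_debate_data.csv', index=False)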