Reputation: 69
I want to scrape the table data from http://5000best.com/websites/
The table content is paginated across several pages and is loaded dynamically. I want to scrape the table data for each category. I could scrape the table manually for each category, but that is not what I want.
Please look at it and suggest an approach. I am able to build the links for each category, i.e. http://5000best.com/websites/Movies/, http://5000best.com/websites/Games/, etc., but I am not sure how to go further and navigate through the paginated table for each category. After building all the links, I need to extract the table data using them.
Edit: I am using requests and BeautifulSoup4.
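This is roughly how I build the category links at the moment (Movies and Games are just example categories; the real list would come from the page itself):
base_url = "http://5000best.com/websites/"

# Example categories only; the full list is on the page
for category in ["Movies", "Games"]:
    print(base_url + category + "/")  # e.g. http://5000best.com/websites/Movies/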
Upvotes: 1
Views: 2507
Reputation: 69
I came up with this approach to scrape the tables for each category:
# ------------ Hemant Sah --------------------
# Importing libraries
import math
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup

final_list = []


def make_soup(url):
    """Fetch a URL and return a parsed BeautifulSoup object (None on error)."""
    try:
        html = requests.get(url)
        html.raise_for_status()  # raise HTTPError for 4xx/5xx responses
    except requests.exceptions.RequestException as e:
        print(e)
        return None
    return BeautifulSoup(html.text, 'lxml')


def get_categories_from_soup(soup):
    """Collect the category labels, e.g. 'Movies (1000)', from the sidebar divs."""
    total_list = []
    for class_name in ("sca2", "sca2_a", "sca2_b", "sca2_c"):
        for item in soup.find_all('div', {"class": class_name}):
            total_list.append(item.text)
    total_list.remove("All (5000)")
    total_list.remove("Porn (201)")
    return total_list


def make_url(total_list, url):
    """Build one URL per page of every category.

    Each label looks like 'Movies (1000)'; the table shows 100 rows per page,
    so a category with N entries has ceil(N / 100) pages.
    """
    path, page_num, counts, modified_links, new_links = [], [], [], [], []
    for category in total_list:
        path.extend(re.findall(r'^\w+', category))      # category name
        counts.extend(re.findall(r'[0-9]+', category))  # entry count
    for c in counts:
        page_num.append(math.ceil(int(c) / 100))
    for p in path:
        modified_links.append(url + p + "/")
    for w, p in zip(modified_links, page_num):
        for n in range(1, p + 1):
            new_links.append(w + str(n))
    return new_links


def fetch_table_data(links):
    """Scrape every row of the #ttable table on each page into one DataFrame."""
    for link in links:
        soup = make_soup(link)
        if soup is None:
            continue
        my_table = soup.find('table', {'id': 'ttable'})
        if my_table is None:
            continue
        for tr in my_table.find_all('tr'):
            cells = tr.find_all('td')
            if not cells:
                continue  # skip header rows
            final_list.append([cell.text for cell in cells])
    df = pd.DataFrame(
        final_list,
        columns=["Rank", "Score", "Category", "Audience", "URL", "Links", "blank", "Desc"],
    )
    df = df.drop("blank", axis=1)  # the seventh column is always empty
    return df
    # df.to_csv('final_data.csv')


def main():
    url = "http://5000best.com/websites/"
    soup = make_soup(url)
    if soup is None:
        return
    total_list = get_categories_from_soup(soup)
    links = make_url(total_list, url)
    dataframe = fetch_table_data(links)
    print(dataframe)


if __name__ == "__main__":
    main()
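To persist the result, the commented-out to_csv line can be enabled on the returned frame, e.g. inside main() (index=False just omits the pandas row index):
dataframe.to_csv('final_data.csv', index=False)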
Upvotes: 0
Reputation: 10666
Simple Scrapy spider:
import scrapy


class Best500Spider(scrapy.Spider):
    name = "best5000"
    start_urls = ['http://5000best.com/websites/1']

    def parse(self, response):
        # Every row of the main table becomes one scraped record
        for row in response.xpath('//table[@id="ttable"]//tr'):
            record = {}
            record["Rank"] = row.xpath('./td[1]/text()').get()
            record["Score"] = row.xpath('./td[2]/text()').get()
            record["Category"] = row.xpath('string(./td[3])').get()
            record["URL"] = row.xpath('string(./td[5])').get()
            yield record

        # The pager marks the current page with class "pagen0"; the first
        # following sibling span holds the link to the next page (if any)
        next_page_url = response.xpath(
            '//div[@id="dpages"]/span[@class="pagen0"]/following-sibling::span[1]/a/@href'
        ).get()
        if next_page_url:
            yield scrapy.Request(
                url=response.urljoin(next_page_url),
                callback=self.parse,
            )
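Saved to a file (the name best5000_spider.py is arbitrary), the spider can be run standalone and the records exported to CSV with Scrapy's runspider command:
scrapy runspider best5000_spider.py -o best5000.csv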
Upvotes: 2
Reputation: 125
I looked at the site; to move to another page, just append /pageNumber to the end of the link.
For example, http://5000best.com/websites/50 will get you page 50.
You can use this tool to get Python requests code for one page and then add a loop: https://curl.trillworks.com/
Just put "curl http://5000best.com/websites/50" there and adapt the generated code.
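A minimal sketch of that loop with requests and BeautifulSoup (breaking when the page no longer contains the #ttable table is an assumption about how the site behaves past the last page):
import requests
from bs4 import BeautifulSoup

page = 1
while True:
    resp = requests.get(f"http://5000best.com/websites/{page}")
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "lxml")
    table = soup.find("table", {"id": "ttable"})
    if table is None:  # assumption: no table means we are past the last page
        break
    for tr in table.find_all("tr"):
        print([td.text.strip() for td in tr.find_all("td")])
    page += 1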
Upvotes: 1