Reno

Reputation: 1139

Export Web Scraped Table to Excel

I am having trouble getting pandas to export some web scraped data in the format I want.

I want to visit each URL in URLs, pull the various elements from that page, and put them into an Excel spreadsheet under the column names specified. Then I want to visit the next URL in URLs and put its data on the next row of the sheet, so that I end up with an Excel sheet with 6 columns and three rows of data, one row per plant (each plant is on a separate URL).

Currently I get ValueError: Length mismatch: Expected axis has 18 elements, new values have 6 elements, because the records for each plant are being placed horizontally next to each other rather than on a new row, so pandas cannot match the six column names (a short schematic after the code below shows what happens).

Can someone help, please? Thanks.

import csv
import pandas as pd
from pandas import ExcelWriter
from pandas import ExcelFile
import numpy as np
from urllib2 import urlopen
import bs4
from bs4 import BeautifulSoup


URLs = ["http://adbioresources.org/map/ajax-single/27881",
"http://adbioresources.org/map/ajax-single/27967",
"http://adbioresources.org/map/ajax-single/27880"]

mylist = []

for plant in URLs:
    soup = BeautifulSoup(urlopen(plant),'lxml')

    table = soup.find_all('td')
    for td in table:
        mylist.append(td.text)

    heading2 = soup.find_all('h2')
    for h2 in heading2:
        mylist.append(h2.text)

    para = soup.find_all('p')   
    for p in para:
        mylist.append(p.text)

df = pd.DataFrame(mylist)
transposed_df = df.T
transposed_df.columns = ['Status','Type','Capacity','Feedstock','Address1','Address2']
writer = ExcelWriter('Pandas-Example.xlsx')
transposed_df.to_excel(writer,'Sheet1',index=False)
writer.save()
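
To make the mismatch concrete: the loop appends every field from every page into one flat list, so after three URLs mylist holds 18 items; pd.DataFrame(mylist).T is then a single row with 18 columns, and six column names cannot be assigned to it. A rough schematic (the field values here are invented placeholders, not real scraped data):

# what the loop above effectively produces: one flat list of 3 x 6 = 18 items
mylist = ['Operational', 'AD', '1 MW', 'Food waste', 'Addr1 A', 'Addr2 A',   # plant 1
          'Operational', 'AD', '2 MW', 'Crops', 'Addr1 B', 'Addr2 B',        # plant 2
          'Planned', 'AD', '0.5 MW', 'Slurry', 'Addr1 C', 'Addr2 C']         # plant 3

df = pd.DataFrame(mylist)    # shape (18, 1)
transposed_df = df.T         # shape (1, 18) -> assigning 6 column names raises the ValueError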

Upvotes: 1

Views: 1680

Answers (2)

Farhan Hai Khan

Reputation: 808

If I understand correctly, what you are trying to achieve is to take multiple URLs, extract the tables from each webpage, and then write each table to its own Excel sheet.

The code below should pull that off; I ran it on Google Colaboratory. Hope it helps!
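
In case it helps to see the gist first: pandas.read_html pulls every <table> element on a page into a list of DataFrames, and each DataFrame can then be written to its own sheet through a single ExcelWriter. A minimal sketch of that idea (the sheet names are made up and the example URL is just the Wikipedia districts page from the list below; the full script adds filtering, sheet-name cleanup and saving the raw HTML):

import pandas as pd

urls = ["https://en.wikipedia.org/wiki/List_of_districts_in_India"]

with pd.ExcelWriter("tables.xlsx", engine="xlsxwriter") as writer:
    for i, url in enumerate(urls):
        for j, table in enumerate(pd.read_html(url)):   # one DataFrame per <table> found on the page
            table.to_excel(writer, sheet_name=f"url{i}_table{j}", index=False)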

# Writing to Excel (.xlsx) requires XlsxWriter; install it first if it is not already available.
try:
    import xlsxwriter
except ModuleNotFoundError:
    print("xlsxwriter is not installed!!")
    get_ipython().system("pip install xlsxwriter")


# to scrape a table from a webpage
from urllib.parse import urlparse, urlsplit
import requests
import pandas as pd
import os


urls=["https://www.macrotrends.net/countries/IND/india/gdp-growth-rate",
          "http://www.inwea.org/wind-energy-in-india/wind-power-potential",
          "https://en.wikipedia.org/wiki/List_of_districts_in_India",
          "https://en.wikipedia.org/wiki/List_of_Indian_people_by_net_worth",
          "https://en.wikipedia.org/wiki/States_and_union_territories_of_India",
          "https://en.wikipedia.org/wiki/List_of_governors-general_of_India",
          "https://en.wikipedia.org/wiki/List_of_Indian_independence_activists",
          "https://en.wikipedia.org/wiki/List_of_Indian_Grammy_Award_winners_and_nominees",
          "https://en.wikipedia.org/wiki/List_of_Indian_Academy_Award_winners_and_nominees",
          "https://en.wikipedia.org/wiki/List_of_highest-grossing_Indian_films"
          ]

EXCEL_SAVE_PATH = "Scraped.xlsx"

URL_SHEET_MAPPING = {}
print(len(urls), "Urls Found")


# clean up the sheet name: replace _ and - with spaces, apply title case, then remove the spaces
def modify_name(my_str):
    replaced = my_str.replace("_", " ").replace("-", " ")
    return replaced.title().replace(" ", "")


# strip prefix from the start of word when it is present
def remove_prefix(word, prefix):
    return word[word.startswith(prefix) and len(prefix):]

# get all tables from a url
def get_dataframes(url):
    print(f"url = {url}")
    os.makedirs("html_saved_pages", exist_ok = True)
    html = requests.get(url).content
    # print(f"html = {html}")
    save_path = remove_prefix(url, "https://www.").replace(".", "_").replace("/", "__")
    with open(f"html_saved_pages/{save_path}.html", "wb") as fh:
        fh.write(html)
    # NOTE : The hapag llyod & bic pages were saved manually, scraping failed on it.
    df_list = pd.read_html(html)
    # print(len(df_list)," Dataframes Returned")
    return df_list


# keep only DataFrames that have more than min_rows rows
def filter_dfs(dfs_list, min_rows=10):
    new_dfs_list = []
    for each_df in dfs_list:
        if len(each_df) > min_rows:
            new_dfs_list.append(each_df)
    return new_dfs_list


# to avoid InvalidWorksheetName: Excel worksheet name 'StatesAndUnionTerritoriesOfIndia1' must be <= 31 chars.
def crop_name(name, thres=29):
    if len(name) < thres:
        return name
    else:
        return name[:thres]


# to get first n elements from list only
def crop_list(lst, thres=29):
    if len(lst) < thres:
        return lst
    else:
        return lst[:thres]


# converts urls to dataframes to excel sheets
# get_max: the maximum number of tables to keep from each url
# min_rows: the minimum number of rows a table must have to be written to the excel sheet
# crop_name_thres: some pages produce very long sheet names, which Excel rejects,
#                  so sheet names are cropped to this length


def urls_to_excel(urls, excel_path=None, get_max=10, min_rows=0, crop_name_thres=29):
    # fall back to the module-level EXCEL_SAVE_PATH when no explicit path is given
    excel_path = EXCEL_SAVE_PATH if excel_path is None else excel_path
    writer = pd.ExcelWriter(excel_path, engine="xlsxwriter")
    i = 0
    for url in urls:
        URL_SHEET_MAPPING[url] = {}
        parsed = urlsplit(url)
        sheet_name = parsed.path.split("/")[-1]
        mod_sheet_name = crop_name(modify_name(sheet_name), thres=crop_name_thres)

        dfs_list = get_dataframes(url)
        filtered_dfs_list = filter_dfs(dfs_list, min_rows=min_rows)
        filtered_dfs_list = crop_list(filtered_dfs_list, thres=get_max)
        for idx, each_df in enumerate(filtered_dfs_list):
            sheet_name=str(i) + "_" + mod_sheet_name
            print("Parsing Excel Sheet ", " : ", sheet_name)
            URL_SHEET_MAPPING[url][idx] = sheet_name
            i += 1
            each_df.to_excel(writer, sheet_name=sheet_name, index=True)
    # writer.save()
    writer.close()


urls_to_excel(urls,get_max=1,min_rows=10)

Output I got :


    XlsxWriter is not installed!!
    Collecting XlsxWriter
      Downloading https://files.pythonhosted.org/packages/2b/98/17875723b6814fc4d0fc03f0997ee00de2dbd78cf195e2ec3f2c9c789d40/XlsxWriter-1.3.3-py2.py3-none-any.whl (144kB)
         |████████████████████████████████| 153kB 9.4MB/s
    Installing collected packages: XlsxWriter
    Successfully installed XlsxWriter-1.3.3
    10 Urls Found
    Parsing Excel Sheet   :  0GdpGrowthRate
    Parsing Excel Sheet   :  1WindPowerPotential
    Parsing Excel Sheet   :  2ListOfDistrictsInIndia
    Parsing Excel Sheet   :  3ListOfIndianPeopleByNetWorth
    Parsing Excel Sheet   :  4StatesAndUnionTerritoriesOfIn
    Parsing Excel Sheet   :  5ListOfGovernorsGeneralOfIndia
    Parsing Excel Sheet   :  6ListOfIndianIndependenceActiv
    Parsing Excel Sheet   :  7ListOfIndianGrammyAwardWinner
    Parsing Excel Sheet   :  8ListOfIndianAcademyAwardWinne
    Parsing Excel Sheet   :  9ListOfHighestGrossingIndianFi

Upvotes: 0

Reno

Reputation: 1139

# Build one sub-list of values per URL and append it to masterlist,
# so the final DataFrame ends up with one row per plant and six columns.
masterlist = []
i = 0
for plant in URLs:
    sublist = []

    soup = BeautifulSoup(urlopen(plant),'lxml')

    table = soup.find_all('td')
    for td in table:
        sublist.append(td.text)

    heading2 = soup.find_all('h2')
    for h2 in heading2:
        sublist.append(h2.text)

    para = soup.find_all('p')   
    for p in para:
        sublist.append(p.text)
    masterlist.append(sublist)

    i = i + 1
    print(i)    # progress counter

df = pd.DataFrame(masterlist)
df.columns = ['Status','Type','Capacity','Feedstock','Address1','Address2']
writer = ExcelWriter('Pandas-Example.xlsx')
df.to_excel(writer,'Sheet1',index=False)
writer.save()
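
A side note in case you are on Python 3 rather than Python 2: urllib2 only exists in Python 2 (it was split into urllib.request), and print becomes a function, but the rest of the answer stays the same. A minimal sketch of the adjusted skeleton, reusing the URLs list from the question:

from urllib.request import urlopen      # Python 3 replacement for urllib2.urlopen
from bs4 import BeautifulSoup

URLs = ["http://adbioresources.org/map/ajax-single/27881",
        "http://adbioresources.org/map/ajax-single/27967",
        "http://adbioresources.org/map/ajax-single/27880"]

for i, plant in enumerate(URLs, start=1):
    soup = BeautifulSoup(urlopen(plant), 'lxml')
    print(i)                             # progress counter, as in the loop above

(On recent pandas versions writer.close() replaces writer.save(), as in the other answer.)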

Upvotes: 1
