user7157075

Reputation:

Writing to an Excel File With Python

I am doing some web scraping with BeautifulSoup and Selenium, and I want to write my data to an Excel file.

# coding: utf-8

import os
import time
from datetime import datetime, timedelta

import requests
import bs4
from selenium import webdriver

initialpage = 'https://www.boxofficemojo.com/yearly/chart/?yr=2017&p=.htm'
res = requests.get(initialpage, timeout=None)
soup = bs4.BeautifulSoup(res.text, 'html.parser')



pages = []
# each chart page link appears twice in the table footer, so keep only the first half
pagelinks = soup.select('a[href^="/yearly/chart/?page"]')

for i in range(len(pagelinks) // 2):
    href = pagelinks[i].get('href')  # .get('href') already unescapes &amp;
    pages.append('https://www.boxofficemojo.com' + href)



pages.insert(0, initialpage)
date_dic = {}
movie_links = []
titles = []
Domestic_Gross_Arr = []
Genre_Arr = []
Release_Date_Arr = []
Theaters_Arr = []
Budget_Arr = []
Views_Arr = []
Edits_Arr = []
Editors_Arr = []


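# visit every chart page, collecting each movie's detail-page link and theater count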
for page in pages:
    movie_count = 0
    res1 = requests.get(page)
    souppage = bs4.BeautifulSoup(res1.text, 'html.parser')
    for j in souppage.select('tr > td > b > font > a'):
        link = j.get("href")[7:].split("&")
        str1 = "".join(link)
        final = "https://www.boxofficemojo.com/movies" + str1
        if "/?id" in final:
            movie_links.append(final)
            movie_count += 1

    # the chart's header row has bgcolor #dcdcdc; walk its siblings to read
    # the theater count (fifth cell) of each movie row beneath it
    number_of_theaters = souppage.find("tr", bgcolor="#dcdcdc")
    for k in range(movie_count):
        Theaters_Arr.append(number_of_theaters.next_sibling.contents[4].text)
        number_of_theaters = number_of_theaters.next_sibling



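# scrape each movie's detail page and save its poster into movie_pictures/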
k = 0
path = os.path.join(os.getcwd(), 'movie_pictures')
os.makedirs(path)
os.chdir(path)
while k < 2:  # only the first two movies for now
    j = movie_links[k]
    try:
        res1 = requests.get(j)
        soup1 = bs4.BeautifulSoup(res1.text, 'html.parser')

        c = soup1.select('td[width="35%"]')
        d = soup1.select('div[class="mp_box_content"]')

        genre = soup1.select('td[valign="top"]')[5].select('b')
        image = soup1.select('img')[6].get('src')
        budget = soup1.select('tr > td > b')
        domestic = str(c[0].select('b'))[4:-5]

        release = soup1.nobr.a
        title = soup1.select('title')[0].getText()[:-25]
        print("-----------------------------------------")
        print("Title: " + title)
        titles.append(title)
        print("Domestic Gross: " + domestic)
        Domestic_Gross_Arr.append(domestic)
        print("Genre: " + genre[0].getText())
        Genre_Arr.append(genre[0].getText())
        print("Release Date: " + release.contents[0])
        Release_Date_Arr.append(release.contents[0])
        print("Production Budget: " + budget[5].getText())
        Budget_Arr.append(budget[5].getText())

        # parse a date like "December 15, 2017" into a datetime
        year1 = str(release.contents[0])[-4:]
        a, b = str(release.contents[0]).split(",")
        month1, day1 = a.split(" ")
        datez = year1 + month1 + day1
        new_date = datetime.strptime(datez, "%Y%B%d")
        date_dic[title] = new_date

        # stream the poster image to disk
        with open('pic' + str(k) + '.png', 'wb') as handle:
            response = requests.get(image, stream=True)

            if not response.ok:
                print(response)

            for block in response.iter_content(1024):
                if not block:
                    break
                handle.write(block)  # write inside the loop, otherwise only the last block is kept
    except Exception:
        print("Error Occurred, Page Or Data Not Available")
    k += 1



def subtract_one_month(t):
    # step back one day at a time until we land in the previous month,
    # on the same day of the month or earlier
    one_day = timedelta(days=1)
    one_month_earlier = t - one_day
    while one_month_earlier.month == t.month or one_month_earlier.day > t.day:
        one_month_earlier -= one_day
    return one_month_earlier.strftime("%Y-%m-%d")

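# look up each movie's Wikipedia traffic for the month before release
# via the Wikimedia pageviews tool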
number_of_errors=0
browser = webdriver.Chrome("/Users/Gokce/Downloads/chromedriver")
browser.maximize_window() 
browser.implicitly_wait(20)
for i in titles:
    try:
        release_date = date_dic[i]
        # turn the title into a Wikipedia-style page name
        i = i.replace(' ', '_')
        i = i.replace("2017", "2017_film")


        end_date = release_date.strftime('%Y-%m-%d')
        start_date = subtract_one_month(release_date)
        url = "https://tools.wmflabs.org/pageviews/?project=en.wikipedia.org&platform=all-access&agent=user&start="+ start_date +"&end="+ end_date + "&pages=" + i


        browser.get(url)
        page_views_count = browser.find_element_by_css_selector(" .summary-column--container .legend-block--pageviews .linear-legend--counts:first-child span.pull-right ")
        page_edits_count = browser.find_element_by_css_selector(" .summary-column--container .legend-block--revisions .linear-legend--counts:first-child span.pull-right ")
        page_editors_count = browser.find_element_by_css_selector(" .summary-column--container .legend-block--revisions .legend-block--body .linear-legend--counts:nth-child(2) span.pull-right ")
        print (i)

        print ("Number of Page Views: " +page_views_count.text)
        Views_Arr.append(page_views_count.text)
        print ("Number of Edits: " +page_edits_count.text)
        Edits_Arr.append(page_edits_count.text)
        print ("Number of Editors: " +page_editors_count.text)
        Editors_Arr.append(page_editors_count.text)
    except Exception:
        print("Error Occurred for this page: " + str(i))
        number_of_errors += 1
        Views_Arr.append(-1)
        Edits_Arr.append(-1)
        Editors_Arr.append(-1)


time.sleep(5)
browser.quit()



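# finally, write everything to an Excel file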
import xlsxwriter
os.chdir("/home")
workbook = xlsxwriter.Workbook('WebScraping.xlsx')
worksheet = workbook.add_worksheet()

# header row (nine columns, matching the nine writes per data row below)
worksheet.write(0, 0, 'Title')
worksheet.write(0, 1, 'Genre')
worksheet.write(0, 2, 'Production Budget')
worksheet.write(0, 3, 'Domestic Gross')
worksheet.write(0, 4, 'Release Date')
worksheet.write(0, 5, 'Number of Theaters')
worksheet.write(0, 6, 'Number of Wikipedia Page Views')
worksheet.write(0, 7, 'Number of Wikipedia Edits')
worksheet.write(0, 8, 'Number of Wikipedia Editors')

row = 1
for i in range(len(titles)):
    worksheet.write(row, 0, titles[i])
    worksheet.write(row, 1, Genre_Arr[i])
    worksheet.write(row, 2, Budget_Arr[i])
    worksheet.write(row, 3, Domestic_Gross_Arr[i])
    worksheet.write(row, 4, Release_Date_Arr[i])
    worksheet.write(row, 5, Theaters_Arr[i])
    worksheet.write(row, 6, Views_Arr[i])
    worksheet.write(row, 7, Edits_Arr[i])
    worksheet.write(row, 8, Editors_Arr[i])
    row += 1


workbook.close()

The code works up until the import xlsxwriter section; then I get this error:

---------------------------------------------------------------------------
IOError                                   Traceback (most recent call last)
<ipython-input-9-c99eea52d475> in <module>()
     27 
     28 
---> 29 workbook.close()

/Users/Gokce/anaconda2/lib/python2.7/site-packages/xlsxwriter/workbook.pyc in close(self)
    309         if not self.fileclosed:
    310             self.fileclosed = 1
--> 311             self._store_workbook()
    312 
    313     def set_size(self, width, height):

/Users/Gokce/anaconda2/lib/python2.7/site-packages/xlsxwriter/workbook.pyc in _store_workbook(self)
    638 
    639         xlsx_file = ZipFile(self.filename, "w", compression=ZIP_DEFLATED,
--> 640                             allowZip64=self.allow_zip64)
    641 
    642         # Add XML sub-files to the Zip file with their Excel filename.

/Users/Gokce/anaconda2/lib/python2.7/zipfile.pyc in __init__(self, file, mode, compression, allowZip64)
    754             modeDict = {'r' : 'rb', 'w': 'wb', 'a' : 'r+b'}
    755             try:
--> 756                 self.fp = open(file, modeDict[mode])
    757             except IOError:
    758                 if mode == 'a':

IOError: [Errno 45] Operation not supported: 'WebScraping.xlsx' 

What might be the problem? If I cut off the last part and run it in a new IDLE session with fake data, it works, but it does not work in the main IDLE session. So I believe the problem must be somewhere in the earlier part.

Upvotes: 0

Views: 1112

Answers (2)

Sean

Reputation: 670

I used openpyxl a few years ago to automate some Excel spreadsheets. You can use code like the following:

import openpyxl
# you could create a template spreadsheet that already contains the header
# row written by the worksheet.write() calls in the question
wb = openpyxl.load_workbook('Template.xlsx')
ws = wb.create_sheet('my_sheet')  # create_sheet() returns the new worksheet
row = 1                           # openpyxl rows and columns are 1-indexed
for i in range(len(titles)):
    ws.cell(row=row, column=1, value=titles[i])
    ws.cell(row=row, column=2, value=Genre_Arr[i])
    ws.cell(row=row, column=3, value=Budget_Arr[i])
    ws.cell(row=row, column=4, value=Domestic_Gross_Arr[i])
    ws.cell(row=row, column=5, value=Release_Date_Arr[i])
    ws.cell(row=row, column=6, value=Theaters_Arr[i])
    ws.cell(row=row, column=7, value=Views_Arr[i])
    ws.cell(row=row, column=8, value=Edits_Arr[i])
    ws.cell(row=row, column=9, value=Editors_Arr[i])
    row += 1
wb.save('my_file.xlsx')
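
If you'd rather not keep a separate Template.xlsx around, here's a minimal sketch of the same idea that builds the workbook and header row in code instead (the header names are taken from the question; my_file.xlsx is just an example name):

import openpyxl

wb = openpyxl.Workbook()  # a fresh workbook, no template needed
ws = wb.active
ws.title = 'my_sheet'

# write the header row; ws.append() adds one row and moves down automatically
ws.append(['Title', 'Genre', 'Production Budget', 'Domestic Gross',
           'Release Date', 'Number of Theaters',
           'Number of Wikipedia Page Views', 'Number of Wikipedia Edits',
           'Number of Wikipedia Editors'])

# one row per movie; zip() keeps the parallel lists aligned
for row_data in zip(titles, Genre_Arr, Budget_Arr, Domestic_Gross_Arr,
                    Release_Date_Arr, Theaters_Arr, Views_Arr,
                    Edits_Arr, Editors_Arr):
    ws.append(list(row_data))

wb.save('my_file.xlsx')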

Upvotes: 0

miken32

Reputation: 42760

The error is triggered when the code tries to write the file. Confirm that you have write permissions to that directory, and that the file doesn't already exist. It's unlikely that you have access to /home.
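For example, something like this quick check (the home directory is just an example of a location you can normally write to):

import os
import xlsxwriter

target = os.path.expanduser('~')   # e.g. /Users/Gokce, normally writable
print(os.access(target, os.W_OK))  # True means you can create files here

# point xlsxwriter at that directory instead of /home
workbook = xlsxwriter.Workbook(os.path.join(target, 'WebScraping.xlsx'))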

Upvotes: 1
