Reputation:
I am doing some web scraping with BeautifulSoup and Selenium, and I want to write my data to an Excel file.
# coding: utf-8
import requests
import bs4
from datetime import datetime
import re
import os
import urllib
import urllib2
from bs4 import BeautifulSoup
from selenium import webdriver
import time
initialpage = 'https://www.boxofficemojo.com/yearly/chart/?yr=2017&p=.htm'
res = requests.get(initialpage, timeout=None)
soup = bs4.BeautifulSoup(res.text, 'html.parser')
pages = []
pagelinks = soup.select('a[href^="/yearly/chart/?page"]')
for i in range(int(len(pagelinks) / 2)):
    pages.append(str(pagelinks[i])[9:-14])
    pages[i] = pages[i].replace("amp;", "")
    pages[i] = "https://www.boxofficemojo.com" + pages[i]
    pages[i] = pages[i][:-1]
pages.insert(0, initialpage)
date_dic = {}
movie_links = []
titles = []
Domestic_Gross_Arr = []
Genre_Arr = []
Release_Date_Arr = []
Theaters_Arr = []
Budget_Arr = []
Views_Arr = []
Edits_Arr = []
Editors_Arr = []
for i in range(int(len(pagelinks) / 2 + 1)):
    movie_count = 0
    res1 = requests.get(pages[i])
    souppage = bs4.BeautifulSoup(res1.text, 'html.parser')
    for j in souppage.select('tr > td > b > font > a'):
        link = j.get("href")[7:].split("&")
        str1 = "".join(link)
        final = "https://www.boxofficemojo.com/movies" + str1
        if "/?id" in final:
            movie_links.append(final)
            movie_count += 1
    number_of_theaters = souppage.find("tr", bgcolor="#dcdcdc")
    for k in range(movie_count):
        # print(number_of_theaters.next_sibling.contents[4].text)
        Theaters_Arr.append(number_of_theaters.next_sibling.contents[4].text)
        number_of_theaters = number_of_theaters.next_sibling
k=0
path = os.getcwd()
path = path + '/movie_pictures'
os.makedirs(path)
os.chdir(path)
while k < 2:
    j = movie_links[k]
    try:
        res1 = requests.get(j)
        soup1 = bs4.BeautifulSoup(res1.text, 'html.parser')
        c = soup1.select('td[width="35%"]')
        d = soup1.select('div[class="mp_box_content"]')
        genre = soup1.select('td[valign="top"]')[5].select('b')
        image = soup1.select('img')[6].get('src')
        budget = soup1.select('tr > td > b')
        domestic = str(c[0].select('b'))[4:-5]
        release = soup1.nobr.a
        title = soup1.select('title')[0].getText()[:-25]
        print("-----------------------------------------")
        print("Title: " + title)
        titles.append(title)
        print("Domestic Gross: " + domestic)
        Domestic_Gross_Arr.append(domestic)
        print("Genre: " + genre[0].getText())
        Genre_Arr.append(genre[0].getText())
        print("Release Date: " + release.contents[0])
        Release_Date_Arr.append(release.contents[0])
        print("Production Budget: " + budget[5].getText())
        Budget_Arr.append(budget[5].getText())
        # parse a release date like "June 9, 2017" into a datetime
        year1 = str(release.contents[0])[-4:]
        a, b = str(release.contents[0]).split(",")
        month1, day1 = a.split(" ")
        datez = year1 + month1 + day1
        new_date = datetime.strptime(datez, "%Y%B%d")
        date_dic[title] = new_date
        # download the poster image in 1 KB chunks
        with open('pic' + str(k) + '.png', 'wb') as handle:
            response = requests.get(image, stream=True)
            if not response.ok:
                print(response)
            for block in response.iter_content(1024):
                if not block:
                    break
                handle.write(block)
    except:
        print("Error Occurred, Page Or Data Not Available")
    k += 1
def subtract_one_month(t):
    import datetime
    one_day = datetime.timedelta(days=1)
    one_month_earlier = t - one_day
    while one_month_earlier.month == t.month or one_month_earlier.day > t.day:
        one_month_earlier -= one_day
    year = str(one_month_earlier)[:4]
    day = str(one_month_earlier)[8:10]
    month = str(one_month_earlier)[5:7]
    newdate = year + "-" + month + "-" + day
    return newdate
number_of_errors=0
browser = webdriver.Chrome("/Users/Gokce/Downloads/chromedriver")
browser.maximize_window()
browser.implicitly_wait(20)
for i in titles:
    try:
        release_date = date_dic[i]
        i = i.replace(' ', '_')
        i = i.replace("2017", "2017_film")
        #end = datetime.strptime(release_date, '%B %d, %Y')
        end_date = release_date.strftime('%Y-%m-%d')
        start_date = subtract_one_month(release_date)
        url = "https://tools.wmflabs.org/pageviews/?project=en.wikipedia.org&platform=all-access&agent=user&start=" + start_date + "&end=" + end_date + "&pages=" + i
        browser.get(url)
        page_views_count = browser.find_element_by_css_selector(".summary-column--container .legend-block--pageviews .linear-legend--counts:first-child span.pull-right")
        page_edits_count = browser.find_element_by_css_selector(".summary-column--container .legend-block--revisions .linear-legend--counts:first-child span.pull-right")
        page_editors_count = browser.find_element_by_css_selector(".summary-column--container .legend-block--revisions .legend-block--body .linear-legend--counts:nth-child(2) span.pull-right")
        print(i)
        print("Number of Page Views: " + page_views_count.text)
        Views_Arr.append(page_views_count.text)
        print("Number of Edits: " + page_edits_count.text)
        Edits_Arr.append(page_edits_count.text)
        print("Number of Editors: " + page_editors_count.text)
        Editors_Arr.append(page_editors_count.text)
    except:
        print("Error Occurred for this page: " + str(i))
        number_of_errors += 1
        Views_Arr.append(-1)
        Edits_Arr.append(-1)
        Editors_Arr.append(-1)
    time.sleep(5)
browser.quit()
import xlsxwriter
os.chdir("/home")
workbook = xlsxwriter.Workbook('WebScraping.xlsx')
worksheet = workbook.add_worksheet()
worksheet.write(0,0, "Hello")
worksheet.write(0,1, 'Genre')
worksheet.write(0,2, 'Production Budget')
worksheet.write(0,3, 'Domestic Gross')
worksheet.write(0,4, 'Release Date')
worksheet.write(0,5, 'Number of Wikipedia Page Views')
worksheet.write(0,6, 'Number of Wikipedia Edits')
worksheet.write(0,7, 'Number of Wikipedia Editors')
row = 1
for i in range(len(titles)):
    worksheet.write(row, 0, titles[i])
    worksheet.write(row, 1, Genre_Arr[i])
    worksheet.write(row, 2, Budget_Arr[i])
    worksheet.write(row, 3, Domestic_Gross_Arr[i])
    worksheet.write(row, 4, Release_Date_Arr[i])
    worksheet.write(row, 5, Theaters_Arr[i])
    worksheet.write(row, 6, Views_Arr[i])
    worksheet.write(row, 7, Edits_Arr[i])
    worksheet.write(row, 8, Editors_Arr[i])
    row += 1
workbook.close()
The code works until the import xlsxwriter part; then I get this error:
---------------------------------------------------------------------------
IOError Traceback (most recent call last)
<ipython-input-9-c99eea52d475> in <module>()
27
28
---> 29 workbook.close()
/Users/Gokce/anaconda2/lib/python2.7/site-packages/xlsxwriter/workbook.pyc in close(self)
309 if not self.fileclosed:
310 self.fileclosed = 1
--> 311 self._store_workbook()
312
313 def set_size(self, width, height):
/Users/Gokce/anaconda2/lib/python2.7/site-packages/xlsxwriter/workbook.pyc in _store_workbook(self)
638
639 xlsx_file = ZipFile(self.filename, "w", compression=ZIP_DEFLATED,
--> 640 allowZip64=self.allow_zip64)
641
642 # Add XML sub-files to the Zip file with their Excel filename.
/Users/Gokce/anaconda2/lib/python2.7/zipfile.pyc in __init__(self, file, mode, compression, allowZip64)
754 modeDict = {'r' : 'rb', 'w': 'wb', 'a' : 'r+b'}
755 try:
--> 756 self.fp = open(file, modeDict[mode])
757 except IOError:
758 if mode == 'a':
IOError: [Errno 45] Operation not supported: 'WebScraping.xlsx'
What might be the problem? If I cut off the last part and run it in a new IDLE session with fake data, it works, but it does not work in the main IDLE session. So I believe the problem must be somewhere in the earlier part.
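To narrow it down, one check (a minimal diagnostic sketch, not something the script already does) is to print the working directory and whether it is writable right before creating the workbook, since the script calls os.chdir() twice along the way:

import os

# where has the script chdir'd to by this point, and can we write there?
print(os.getcwd())
print(os.access(os.getcwd(), os.W_OK))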
Upvotes: 0
Views: 1112
Reputation: 670
I used openpyxl a few years ago to automate some Excel spreadsheets. You can use code like the following:

import openpyxl

# you could create a template spreadsheet that already contains the
# header row written by the worksheet.write() calls in your question
wb = openpyxl.load_workbook('Template.xlsx')
wb.create_sheet('my_sheet')
ws = wb['my_sheet']
row = 2  # leave row 1 for the headers
for i in range(len(titles)):
    ws.cell(row=row, column=1, value=titles[i])
    ws.cell(row=row, column=2, value=Genre_Arr[i])
    ws.cell(row=row, column=3, value=Budget_Arr[i])
    ws.cell(row=row, column=4, value=Domestic_Gross_Arr[i])
    ws.cell(row=row, column=5, value=Release_Date_Arr[i])
    ws.cell(row=row, column=6, value=Theaters_Arr[i])
    ws.cell(row=row, column=7, value=Views_Arr[i])
    ws.cell(row=row, column=8, value=Edits_Arr[i])
    ws.cell(row=row, column=9, value=Editors_Arr[i])
    row += 1
wb.save('my_file.xlsx')
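One caveat with this sketch: openpyxl indexes rows and columns from 1, not 0, so column=0 would raise a ValueError; the cells above therefore start at column=1, and the data starts at row=2 so the headers keep row 1. Also, wb['my_sheet'] is the current way to look up a sheet by name; get_sheet_by_name() has been deprecated.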
Upvotes: 0
Reputation: 42760
The error is triggered when the code tries to write the file. Confirm that you have write permission for that directory, and that the file doesn't already exist. It's unlikely that you have access to /home.
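If you need a directory that you can definitely write to, one option (a minimal sketch, assuming your own home directory is writable, which is the usual case) is to build the output path with os.path.expanduser instead of calling os.chdir at all:

import os
import xlsxwriter

# save into the user's home directory rather than /home
out_path = os.path.join(os.path.expanduser("~"), "WebScraping.xlsx")
workbook = xlsxwriter.Workbook(out_path)
worksheet = workbook.add_worksheet()
worksheet.write(0, 0, "Title")
workbook.close()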
Upvotes: 1