Reputation: 855
I am trying to download 10Ks (annual report of public companies) from EDGAR. I am running the code below (used it from the textbook, don't understand much of it), but keep getting the following error: (I downloaded 'master.idx' files that are described in the code before running the code below).
HTTPError: HTTP Error 403: Forbidden
Can you please help me to resolve it?
import urllib.request
import shutil
import os
import re
from pathlib import Path
def get_files(start_year:int, end_year:int,
reform:str,
inddirect:str, odirect:str):
"""
Downloads SEC filings for specific companies
start_year -> First Year to download
end_year -> Last Year to download
reform -> Regex to specify forms to be downloaded
inddirect -> Directory containing index files
odirect -> Directory the filings will be downloaded to
"""
print('Downloading Filings')
# Regex to identify the form to download.
re_formtype = re.compile(reform, re.IGNORECASE)
# Regex to extract file name information
# from a line
re_fullfilename = re.compile(r"\|(edgar/data.*\/([\d-]+\.txt))", re.IGNORECASE)
#loop through the index files based on year
for year in range(start_year, end_year+1):
#check whether the directory exists and create one
# if it does not.
download_path = os.path.join(odirect, str(year))
if not os.path.exists(download_path):
os.makedirs(download_path)
for qtr in range(1,5):
#name of index file to be read.
dl_file = os.path.join(inddirect, 'master' + str(year) + str(qtr) + '.idx')
# check to see if the index file exists.
if not os.access(dl_file, os.R_OK):
# Download the index file if it does not
# already exist
url='https://www.sec.gov/Archives/edgar/full-index/' + str(year) + '/' + 'QTR' + str(qtr) + '/master.idx'
# download the file defined as url and
# download to the file defined a dl_fle.
urllib.request.urlretrieve(url, dl_file)
# open the index file
with open(dl_file, 'r') as f:
# set a counter called count to 1. Note
# that the counter will only be incremented
# after it downloads a file.
count=1
# loop through each line in the index file,
# assigning to a variable called line
for line in f:
# Only download a file if the counter
# is less than 5.
# Remove this if statement if you want
# to download all the files for the
# time period
if count<5:
# Check to see if the the line
# matches the form type
rematch=re.search(re_formtype,line)
#If there is a match then download
# the filing
if rematch:
# The following line searches
# for filename information.
# The first grouping will
# contain the location and
# filename of the file to be
# downloaded. The second
# grouping will contain just
# the filename o
matches = re.search(re_fullfilename, line)
if matches:
# Construct the url to for
# retrieving the filing
url = str('https://www.sec.gov/Archives/') + str(matches.group(1))
# Create the filename to
# download the file to.
outfile = os.path.join(download_path,
str(matches.group(2)))
# Check to make sure the
# file hasn't already
# been downloaded
if not (os.path.isfile(outfile) and os.access(outfile, os.R_OK)):
# Print the name of the
# file to be downloaded.
print("Downloading:"+str(outfile),end='\n')
#downlaod the file
urllib.request.urlretrieve(url, outfile)
count += 1
print('Downloading of Filings Complete',end='\n')
return
# Specify, in regular expression format, the filing
# you are looking for. Following is the for 10-k.
reform='(\|10-?k(sb|sb40|405)?\s*\|)'
# Specify location of the index files.
inddirect = os.path.join(Path.home(), 'edgar', 'indexfiles')
# Specify where to download filings to
odirect = os.path.join(Path.home(), 'edgar', '10K')
# Execute the get filings function
get_files(2018, 2019, reform, inddirect, odirect)
Upvotes: 0
Views: 882
Reputation: 183
Replace your line:
urllib.request.urlretrieve(url, dl_file)
with these line, as you are missing the user-agent
opener = urllib.request.build_opener()
opener.addheaders = [('User-Agent', 'MyApp/1.0')]
urllib.request.install_opener(opener)
urllib.request.urlretrieve(url, dl_file)
Upvotes: 2