Reputation: 19
I need to download all the 8-K filings in the SEC EDGAR database (approximately 3,500 companies). Does anyone know how to do it with software or code?
I tried sec-edgar-downloader (https://pypi.org/project/sec-edgar-downloader) and it is a very good package, but it only lets me download the 8-K filings of one company at a time.
I also have the code below, but I don't do programming, so I don't understand it very well. Does this code do what I described, and how do I use it?
Thank you in advance.
import pandas as pd
import gc
import glob
import datetime
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import os, csv, time
from bs4 import BeautifulSoup as bs
import re
import sys
#import edgar  # you only need this and the next line the first time you download the index
#edgar.download_index(path_sec, 2000)  # ... where '2000' is the first year of the period from which you want the data
# This function provides a connection object that is more efficient (it retries failed requests)
def requests_retry_session(
        retries=3,
        backoff_factor=0.3,
        status_forcelist=(500, 502, 503, 504),
        session=None,):
    session = session or requests.Session()
    retry = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,
        status_forcelist=status_forcelist,
    )
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session
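# Note (an addition, not in the original code): SEC EDGAR now rejects automated
# requests that do not declare a User-Agent, so you will likely also need
# something like
#     session.headers.update({'User-Agent': 'Your Name your.email@example.com'})
# before using the session; the name and address above are placeholders.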
def creates_df(tsv_folder, file_type, st_year=2009, lst_year=datetime.datetime.today().year):
    '''This function creates a file with the SEC urls necessary for your work.
    st_year must be in YYYY format; the default is 2009. lst_year defaults to the current year.
    tsv_folder is the full path of the folder where your TSV index files are.
    file_type is the SEC file type you want to get, e.g. '8-K' or 'DEFM14A', always between quotes.
    The output CSV file is written to your current directory.'''
    last_year = lst_year
    path_edgar = tsv_folder
    typefile = file_type
    start_year = st_year
    destination = os.getcwd()
    print(f'Saving files to {destination}.')
    list_files = []
    write_cols = True
    # Keep only the index files whose year (encoded in the file name) is in range
    for file in glob.glob(path_edgar + '*.tsv'):
        if int(start_year) <= int(file[-13:-9]) <= int(last_year):
            list_files.append(file)
    for file_sec in list_files:
        try:
            print(f'Trying to read {file_sec}.')
            x = pd.read_csv(file_sec, sep='|', dtype=str,
                            names=['cik', 'firm_name', 'file_type', 'report_date',
                                   'file_url_txt', 'file_url_html'])
            print('Done. Processing...')
            x = x[x['file_type'] == typefile]
            # Left-pad each CIK with zeros to the standard 10 digits
            x['cik'] = x['cik'].str.zfill(10)
            print('Writing...')
            x.to_csv(destination + '/sec_dataframe.csv', header=write_cols, mode='a', index=False)
            write_cols = False
        except Exception as ex:
            print('Can\'t read this file: ' + str(file_sec))
            print('Python returned this message: ' + str(type(ex).__name__), str(ex.args) + '.')
def id_8k(path_to_file, item8k):
    '''This function identifies the 8-K filings that contain the wanted item.
    It assumes you have a CSV file extracted by the function creates_df. You need to
    provide the path to this file as the first parameter and the 8-K item as the second.
    The function reads 100,000 rows at a time from the file and processes the results.'''
    for chunk in pd.read_csv(path_to_file, chunksize=100000, dtype=str, parse_dates=['report_date']):
        # Keep only filings from 2019 onwards
        recent = chunk.assign(
            keep=[1 if dt.date().year >= 2019 else 0 for dt in chunk.report_date]).query("keep == 1")
        for row, col in recent.iterrows():
            try:
                r = requests_retry_session().get('https://www.sec.gov/Archives/' + col['file_url_html'])
            except Exception as ex:  # the exception must be bound to 'ex' for the lines below to work
                print(str(type(ex).__name__), str(ex.args))
                with open(os.getcwd() + '/errors.csv', 'a') as csvfile:
                    writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
                    writer.writerow([str(col['file_url_html']), str(type(ex).__name__), str(ex.args)])
                continue
            soup = bs(r.content, 'lxml')
            print('Got soup object from: ', str(col['file_url_html']), str(col['cik']))
            if soup.text and str(item8k) in soup.text.lower():
                try:
                    r = requests_retry_session().get('https://www.sec.gov/Archives/' + col['file_url_txt'])
                except Exception as ex:
                    print(str(type(ex).__name__), str(ex.args))
                    with open(os.getcwd() + '/errors.csv', 'a') as csvfile:
                        writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
                        writer.writerow([str(col['file_url_html']), str(type(ex).__name__), str(ex.args)])
                    continue
                soup = bs(r.content, 'lxml')
                print('Got your filing item from: ', str(col['file_url_txt']), str(col['cik']))
                try:
                    # Build the output file name: <cik>_<firm>_<date>_8K_item_<item>.html
                    out_name = (os.getcwd() + '/' + str(col['cik']) + '_' +
                                re.sub(r'[\\/]+', '', str(col['firm_name'])) + '_' +
                                str(col['report_date'].date()) + '_8K_item_' + str(item8k) + '.html')
                    with open(out_name, 'a') as file:
                        file.write(soup.prettify())
                    print('html file is done. Name: ', out_name)
                except Exception as ex:
                    print(str(type(ex).__name__), str(ex.args))
                    with open(os.getcwd() + '/errors.csv', 'a') as csvfile:
                        writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
                        writer.writerow([str(col['file_url_html']), str(type(ex).__name__), str(ex.args)])
                    continue
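Judging from the comments in the code, it seems intended to be run roughly like this (a rough sketch only; the index folder path is hypothetical, and the edgar package from PyPI is only needed once, to build the quarterly index files):
import edgar  # pip install python-edgar; only needed the first time, to build the index

path_sec = '/home/me/edgar_index/'    # hypothetical folder for the quarterly .tsv index files
edgar.download_index(path_sec, 2000)  # recent versions also expect a user-agent string

# Build sec_dataframe.csv (in the current directory) listing every 8-K filing URL
creates_df(path_sec, '8-K', st_year=2019)

# Download every 8-K whose text mentions a given item, e.g. item 5.02
id_8k(os.getcwd() + '/sec_dataframe.csv', '5.02')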
Upvotes: 1
Views: 3655
Reputation: 11
Create a list of your company names (or symbols, or CIKs). If you have a list in Excel, convert it to a CSV and do:
import csv

companies = []
with open('/Path', newline='', encoding='utf-8-sig') as f:
    for row in csv.reader(f):
        companies.append(row[0])
Then, rifle through that list to grab the files:
from sec_edgar_downloader import Downloader

dl = Downloader(Path)
for company in companies:
    dl.get("File Type", company)
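Note that the Downloader constructor arguments depend on which version of sec-edgar-downloader you have installed: older releases take just a download folder path, while recent releases also expect a company name and email address, since the SEC requires a declared User-Agent for automated requests. For 8-K filings, the call would be dl.get("8-K", company).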
Upvotes: 1