Reputation: 19
I need to download all the 8-K filings in the SEC EDGAR database (approximately 3,500 companies). Does anyone know how to do it with software or code?
I tried sec-edgar-downloader (https://pypi.org/project/sec-edgar-downloader) and it is a very good package, but it only lets me download the 8-K filings of one company at a time.
I also have the code below, but I don't do programming, so I don't understand it very well. Does this code do what I described, and how do I use it?
Thank you in advance.
import pandas as pd
import gc
import glob
import datetime
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import os, csv, time
from bs4 import BeautifulSoup as bs
import re
import sys
#import edgar  # you only need this and the next line the first time you download the index
#edgar.download_index(path_sec, 2000)  # ... where '2000' is the first year of the period from which you want the data
# This function provides a connection object that is more efficient (it retries failed requests)
def requests_retry_session(
        retries=3,
        backoff_factor=0.3,
        status_forcelist=(500, 502, 503, 504),
        session=None,):
    session = session or requests.Session()
    retry = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,
        status_forcelist=status_forcelist,
    )
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session
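# Note (an addition, not in the original code): SEC EDGAR now rejects automated
# requests that do not declare a User-Agent, so you will likely also need
# something like
#     session.headers.update({'User-Agent': 'Your Name your.email@example.com'})
# before using the session; the name and address above are placeholders.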
def creates_df(tsv_folder, file_type, st_year=2009, lst_year=datetime.datetime.today().year):
    '''This function creates a file with the SEC urls necessary for your work.
    st_year must be in YYYY format; the default is 2009. lst_year defaults to the current year.
    tsv_folder is the full path of the folder where your TSV index files are.
    file_type is the SEC file type you want to get, e.g. '8-K' or 'DEFM14A', always between quotes.
    The output CSV file is written to your current directory.'''
    last_year = lst_year
    path_edgar = tsv_folder
    typefile = file_type
    start_year = st_year
    destination = os.getcwd()
    print(f'Saving files to {destination}.')
    list_files = []
    write_cols = True
    # Keep only the index files whose year (encoded in the file name) is in range
    for file in glob.glob(path_edgar + '*.tsv'):
        if int(start_year) <= int(file[-13:-9]) <= int(last_year):
            list_files.append(file)
    for file_sec in list_files:
        try:
            print(f'Trying to read {file_sec}.')
            x = pd.read_csv(file_sec, sep='|', dtype=str,
                            names=['cik', 'firm_name', 'file_type', 'report_date',
                                   'file_url_txt', 'file_url_html'])
            print('Done. Processing...')
            x = x[x['file_type'] == typefile]
            # Left-pad each CIK with zeros to the standard 10 digits
            x['cik'] = x['cik'].str.zfill(10)
            print('Writing...')
            x.to_csv(destination + '/sec_dataframe.csv', header=write_cols, mode='a', index=False)
            write_cols = False
        except Exception as ex:
            print('Can\'t read this file: ' + str(file_sec))
            print('Python returned this message: ' + str(type(ex).__name__), str(ex.args) + '.')
def id_8k(path_to_file, item8k):
    '''This function identifies the 8-K filings that contain the wanted item.
    It assumes you have a CSV file extracted by the function creates_df. You need to
    provide the path to this file as the first parameter and the 8-K item as the second.
    The function reads 100,000 rows at a time from the file and processes the results.'''
    for chunk in pd.read_csv(path_to_file, chunksize=100000, dtype=str, parse_dates=['report_date']):
        # Keep only filings from 2019 onwards
        recent = chunk.assign(
            keep=[1 if dt.date().year >= 2019 else 0 for dt in chunk.report_date]).query("keep == 1")
        for row, col in recent.iterrows():
            try:
                r = requests_retry_session().get('https://www.sec.gov/Archives/' + col['file_url_html'])
            except Exception as ex:  # the exception must be bound to 'ex' for the lines below to work
                print(str(type(ex).__name__), str(ex.args))
                with open(os.getcwd() + '/errors.csv', 'a') as csvfile:
                    writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
                    writer.writerow([str(col['file_url_html']), str(type(ex).__name__), str(ex.args)])
                continue
            soup = bs(r.content, 'lxml')
            print('Got soup object from: ', str(col['file_url_html']), str(col['cik']))
            if soup.text and str(item8k) in soup.text.lower():
                try:
                    r = requests_retry_session().get('https://www.sec.gov/Archives/' + col['file_url_txt'])
                except Exception as ex:
                    print(str(type(ex).__name__), str(ex.args))
                    with open(os.getcwd() + '/errors.csv', 'a') as csvfile:
                        writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
                        writer.writerow([str(col['file_url_html']), str(type(ex).__name__), str(ex.args)])
                    continue
                soup = bs(r.content, 'lxml')
                print('Got your filing item from: ', str(col['file_url_txt']), str(col['cik']))
                try:
                    # Build the output file name: <cik>_<firm>_<date>_8K_item_<item>.html
                    out_name = (os.getcwd() + '/' + str(col['cik']) + '_' +
                                re.sub(r'[\\/]+', '', str(col['firm_name'])) + '_' +
                                str(col['report_date'].date()) + '_8K_item_' + str(item8k) + '.html')
                    with open(out_name, 'a') as file:
                        file.write(soup.prettify())
                    print('html file is done. Name: ', out_name)
                except Exception as ex:
                    print(str(type(ex).__name__), str(ex.args))
                    with open(os.getcwd() + '/errors.csv', 'a') as csvfile:
                        writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
                        writer.writerow([str(col['file_url_html']), str(type(ex).__name__), str(ex.args)])
                    continue
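Judging from the comments in the code, it seems intended to be run roughly like this (a rough sketch only; the index folder path is hypothetical, and the edgar package from PyPI is only needed once, to build the quarterly index files):
import edgar  # pip install python-edgar; only needed the first time, to build the index

path_sec = '/home/me/edgar_index/'    # hypothetical folder for the quarterly .tsv index files
edgar.download_index(path_sec, 2000)  # recent versions also expect a user-agent string

# Build sec_dataframe.csv (in the current directory) listing every 8-K filing URL
creates_df(path_sec, '8-K', st_year=2019)

# Download every 8-K whose text mentions a given item, e.g. item 5.02
id_8k(os.getcwd() + '/sec_dataframe.csv', '5.02')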
Upvotes: 1
Views: 3655
Reputation: 11
Create a list of your company names (or symbols, or CIKs). If you have a list in Excel, convert it to a CSV and do:
import csv

companies = []
with open('/Path', newline='', encoding='utf-8-sig') as f:
    for row in csv.reader(f):
        companies.append(row[0])
Then, rifle through that list to grab the files:
from sec_edgar_downloader import Downloader

dl = Downloader(Path)
for company in companies:
    dl.get("File Type", company)
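Note that the Downloader constructor arguments depend on which version of sec-edgar-downloader you have installed: older releases take just a download folder path, while recent releases also expect a company name and email address, since the SEC requires a declared User-Agent for automated requests. For 8-K filings, the call would be dl.get("8-K", company).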
Upvotes: 1