whu-textual-analysis/lectures/programming/solutions/Problem_2_SEC_Filings_Part2_Download.py

# -*- coding: utf-8 -*-
"""
Created on Wed Jul 29 11:07:10 2015

@author: Alexander Hillert, Goethe Uni Frankfurt
"""

directory="C:/Lehre/Textual Analysis/Programming/Files/"

# We need the urllib package
import urllib.request
# To automatically create folders we need the os-module (OS: Operating System)
import os


# Define a user agent
# Information on user agents are from https://docs.python.org/3/howto/urllib2.html:
# "Some websites dislike being browsed by programs, or send different versions
# to different browsers. By default urllib identifies itself as Python-urllib/x.y
# (where x and y are the major and minor version numbers of the Python release,
# e.g. Python-urllib/2.5), which may confuse the site, or just plain not work.
# The way a browser identifies itself is through the User-Agent header.
opener = urllib.request.build_opener()

# The SEC recently rejected requests from Python-urllib/x.y user agent (see above)
# To still automatically download files, you have different options.
# I have listed three examples below but there are many more:
# For a comprehensive list see, e.g.:
# https://developers.whatismybrowser.com/useragents/explore/software_type_specific/web-browser/
#opener.addheaders = [('User-agent', 'Mozilla')]
#opener.addheaders = [('User-agent', 'Chrome')]
opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)')]
urllib.request.install_opener(opener)


# Open the csv file from part 1 of the problem
input_file=open(directory+'SEC_Filings_Output.csv','r')
input_text=input_file.read()

# Split the Input File in separate lines
input_text_line=input_text.split("\n")
# sometimes you have empty lines after a split command.
# You can remove them using the following command
while input_text_line.count("")>0:
    input_text_line.remove("")

# Create a subfolder in which the 10-K filings are saved.
# When you download a large number of filings I recommend using subfolders for
# each year or even for each year-month combination.
# The option "exist_ok=True" makes sure that you do not get an error if the
# folder already exists.
os.makedirs(directory+"10-Ks/", exist_ok=True)
    
# Loop over all lines of the csv file 
#for i in range(1,len(input_text_line)):
# To avoid having to download hundreds of files when we discuss the solution
# the loop stops at 20. (Remember the upper bound is not included.)
for i in range(1,21):
    
    # split the line into the five variables
    variables=input_text_line[i].split(";")
    # We only need the cik and the link.
    # The cik is the 3rd variable. However, the numbering of lists starts
    # at zero -> 2nd item of the list "variables"
    # The link is the 5th variable -> 4th item of the list "variables"
    cik=variables[2]
    #cik=cik.replace(" ","")
    cik=cik.strip()
    link=variables[4]
    #link=link.replace(" ","")
    link=link.strip()
    
    # Find the filename
    # The link consistes of differnt parts:
    # For example: edgar/data/1000753/0000950129-98-001035.txt
    link_parts=link.split("/")
    # 1st part: edgar
    # 2nd part: data
    # 3rd part: cik
    # 4th part: file name -> 3rd item of the set
    filename=link_parts[3]
    ###########################################################################
    ############################ WARNING ######################################
    # The filename does NOT uniquely identify the SEC filings as different firms (CIKs)
    # may use the same filename. Thus, when you only use the filename files
    # might be overwritten. To avoid this problem you need to have a unique name. 
    # Combining CIK and filename results in a unique identifier, as the 
    # filename appears only once per firm (CIK).
    # -> use the combination of CIK and filename: cik_filename
    ###########################################################################        
    urllib.request.urlretrieve("http://www.sec.gov/Archives/"+link,\
    directory+"10-Ks/"+cik+"_"+filename)
    
input_file.close()
print("DONE")
Add programming files - add the code files provided by the instructor - the programming/files folder with the data files is NOT included here due to its size - add a .gitignore file to exclude the data files' folder 2022-08-05 00:05:05 +02:00			`# -- coding: utf-8 --`
			`"""`
			`Created on Wed Jul 29 11:07:10 2015`

			`@author: Alexander Hillert, Goethe Uni Frankfurt`
			`"""`

			`directory="C:/Lehre/Textual Analysis/Programming/Files/"`

			`# We need the urllib package`
			`import urllib.request`
			`# To automatically create folders we need the os-module (OS: Operating System)`
			`import os`


			`# Define a user agent`
			`# Information on user agents are from https://docs.python.org/3/howto/urllib2.html:`
			`# "Some websites dislike being browsed by programs, or send different versions`
			`# to different browsers. By default urllib identifies itself as Python-urllib/x.y`
			`# (where x and y are the major and minor version numbers of the Python release,`
			`# e.g. Python-urllib/2.5), which may confuse the site, or just plain not work.`
			`# The way a browser identifies itself is through the User-Agent header.`
			`opener = urllib.request.build_opener()`

			`# The SEC recently rejected requests from Python-urllib/x.y user agent (see above)`
			`# To still automatically download files, you have different options.`
			`# I have listed three examples below but there are many more:`
			`# For a comprehensive list see, e.g.:`
			`# https://developers.whatismybrowser.com/useragents/explore/software_type_specific/web-browser/`
			`#opener.addheaders = [('User-agent', 'Mozilla')]`
			`#opener.addheaders = [('User-agent', 'Chrome')]`
			`opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)')]`
			`urllib.request.install_opener(opener)`


			`# Open the csv file from part 1 of the problem`
			`input_file=open(directory+'SEC_Filings_Output.csv','r')`
			`input_text=input_file.read()`

			`# Split the Input File in separate lines`
			`input_text_line=input_text.split("\n")`
			`# sometimes you have empty lines after a split command.`
			`# You can remove them using the following command`
			`while input_text_line.count("")>0:`
			`input_text_line.remove("")`

			`# Create a subfolder in which the 10-K filings are saved.`
			`# When you download a large number of filings I recommend using subfolders for`
			`# each year or even for each year-month combination.`
			`# The option "exist_ok=True" makes sure that you do not get an error if the`
			`# folder already exists.`
			`os.makedirs(directory+"10-Ks/", exist_ok=True)`

			`# Loop over all lines of the csv file`
			`#for i in range(1,len(input_text_line)):`
			`# To avoid having to download hundreds of files when we discuss the solution`
			`# the loop stops at 20. (Remember the upper bound is not included.)`
			`for i in range(1,21):`

			`# split the line into the five variables`
			`variables=input_text_line[i].split(";")`
			`# We only need the cik and the link.`
			`# The cik is the 3rd variable. However, the numbering of lists starts`
			`# at zero -> 2nd item of the list "variables"`
			`# The link is the 5th variable -> 4th item of the list "variables"`
			`cik=variables[2]`
			`#cik=cik.replace(" ","")`
			`cik=cik.strip()`
			`link=variables[4]`
			`#link=link.replace(" ","")`
			`link=link.strip()`

			`# Find the filename`
			`# The link consistes of differnt parts:`
			`# For example: edgar/data/1000753/0000950129-98-001035.txt`
			`link_parts=link.split("/")`
			`# 1st part: edgar`
			`# 2nd part: data`
			`# 3rd part: cik`
			`# 4th part: file name -> 3rd item of the set`
			`filename=link_parts[3]`
			`###########################################################################`
			`############################ WARNING ######################################`
			`# The filename does NOT uniquely identify the SEC filings as different firms (CIKs)`
			`# may use the same filename. Thus, when you only use the filename files`
			`# might be overwritten. To avoid this problem you need to have a unique name.`
			`# Combining CIK and filename results in a unique identifier, as the`
			`# filename appears only once per firm (CIK).`
			`# -> use the combination of CIK and filename: cik_filename`
			`###########################################################################`
			`urllib.request.urlretrieve("http://www.sec.gov/Archives/"+link,\`
			`directory+"10-Ks/"+cik+"_"+filename)`

			`input_file.close()`
			`print("DONE")`