Add programming files

- add the code files provided by the instructor - the programming/files folder with the data files is NOT included here due to its size - add a .gitignore file to exclude the data files' folder
2022-08-05 00:05:05 +02:00 · 2022-08-05 00:05:05 +02:00 · a37c87d9c8
commit a37c87d9c8
parent 65aae9d4f9
38 changed files with 6416 additions and 0 deletions
--- a/lectures/programming/solutions/Problem_2_SEC_Filings_Part2_Download.py
+++ b/lectures/programming/solutions/Problem_2_SEC_Filings_Part2_Download.py
@ -0,0 +1,95 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Jul 29 11:07:10 2015
+
+@author: Alexander Hillert, Goethe Uni Frankfurt
+"""
+
+directory="C:/Lehre/Textual Analysis/Programming/Files/"
+
+# We need the urllib package
+import urllib.request
+# To automatically create folders we need the os-module (OS: Operating System)
+import os
+
+
+# Define a user agent and install it so that later urllib.request calls send it.
+# Information on user agents is from https://docs.python.org/3/howto/urllib2.html:
+# "Some websites dislike being browsed by programs, or send different versions
+# to different browsers. By default urllib identifies itself as Python-urllib/x.y
+# (where x and y are the major and minor version numbers of the Python release,
+# e.g. Python-urllib/2.5), which may confuse the site, or just plain not work.
+# The way a browser identifies itself is through the User-Agent header."
+opener = urllib.request.build_opener()
+
+# The SEC has rejected requests from the default Python-urllib/x.y user agent (see above).
+# To still download files automatically, you have different options.
+# Three examples are listed below, but there are many more.
+# For a comprehensive list see, e.g.:
+# https://developers.whatismybrowser.com/useragents/explore/software_type_specific/web-browser/
+#opener.addheaders = [('User-agent', 'Mozilla')]
+#opener.addheaders = [('User-agent', 'Chrome')]
+opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)')]
+urllib.request.install_opener(opener)
+
+
+# Open the csv file from part 1 of the problem.
+# The "with" statement closes the file right after it has been read, so it is
+# not left open while the filings are downloaded or when an error occurs.
+with open(directory+'SEC_Filings_Output.csv','r') as input_file:
+    input_text=input_file.read()
+
+# Split the input file into separate lines.
+# Sometimes you have empty lines after a split command.
+# The list comprehension keeps only the non-empty lines.
+input_text_line=[line for line in input_text.split("\n") if line!=""]
+
+# Create a subfolder in which the 10-K filings are saved.
+# When you download a large number of filings I recommend using subfolders for
+# each year or even for each year-month combination.
+# The option "exist_ok=True" makes sure that you do not get an error if the
+# folder already exists.
+os.makedirs(directory+"10-Ks/", exist_ok=True)
+
+# Loop over the lines of the csv file. The loop starts at index 1, i.e., the
+# first line (index 0) is skipped -- presumably it holds the variable names.
+#for i in range(1,len(input_text_line)):
+# To avoid having to download hundreds of files when we discuss the solution
+# the loop stops at 20. (Remember the upper bound is not included.)
+# min() makes sure that the loop does not run past the end of the list
+# (IndexError) when the csv file contains fewer than 20 filings.
+for i in range(1,min(21,len(input_text_line))):
+
+    # Split the line into the five variables.
+    variables=input_text_line[i].split(";")
+    # We only need the cik and the link.
+    # The cik is the 3rd variable. However, the numbering of lists starts
+    # at zero -> index 2 of the list "variables".
+    # The link is the 5th variable -> index 4 of the list "variables".
+    cik=variables[2]
+    #cik=cik.replace(" ","")
+    cik=cik.strip()
+    link=variables[4]
+    #link=link.replace(" ","")
+    link=link.strip()
+
+    # Find the filename.
+    # The link consists of different parts.
+    # For example: edgar/data/1000753/0000950129-98-001035.txt
+    link_parts=link.split("/")
+    # 1st part: edgar
+    # 2nd part: data
+    # 3rd part: cik
+    # 4th part: file name -> index 3 of the list "link_parts"
+    filename=link_parts[3]
+    ###########################################################################
+    ############################ WARNING ######################################
+    # The filename does NOT uniquely identify the SEC filings as different firms (CIKs)
+    # may use the same filename. Thus, when you only use the filename, files
+    # might be overwritten. To avoid this problem you need to have a unique name.
+    # Combining CIK and filename results in a unique identifier, as the
+    # filename appears only once per firm (CIK).
+    # -> use the combination of CIK and filename: cik_filename
+    ###########################################################################
+    # The SEC serves EDGAR via https; use it directly instead of relying on a
+    # redirect from http.
+    urllib.request.urlretrieve("https://www.sec.gov/Archives/"+link,\
+    directory+"10-Ks/"+cik+"_"+filename)
+
+print("DONE")