Add programming files
- Add the code files provided by the instructor.
- The programming/files folder with the data files is NOT included here due to its size.
- Add a .gitignore file to exclude the data files' folder.
This commit is contained in:
parent
65aae9d4f9
commit
a37c87d9c8
38 changed files with 6416 additions and 0 deletions
|
|
@ -0,0 +1,95 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Wed Jul 29 11:07:10 2015
|
||||
|
||||
@author: Alexander Hillert, Goethe Uni Frankfurt
|
||||
"""
|
||||
|
||||
# To automatically create folders we need the os-module (OS: Operating System)
import os
# We need the urllib package to download the filings
import urllib.request

# Folder that contains the csv file from part 1 of the problem. The downloaded
# filings are saved in the subfolder "10-Ks/" of this folder.
directory = "C:/Lehre/Textual Analysis/Programming/Files/"

# Root of the EDGAR archive. The links in the csv file are relative to it,
# for example: edgar/data/1000753/0000950129-98-001035.txt
# HTTPS is used directly so that each download does not depend on being
# redirected from http:// first.
SEC_ARCHIVE_URL = "https://www.sec.gov/Archives/"

# To avoid having to download hundreds of files when we discuss the solution,
# only the first 20 filings are downloaded. Set this to None to download all
# filings listed in the csv file.
MAX_FILINGS = 20

# Define a user agent
# Information on user agents are from https://docs.python.org/3/howto/urllib2.html:
# "Some websites dislike being browsed by programs, or send different versions
# to different browsers. By default urllib identifies itself as Python-urllib/x.y
# (where x and y are the major and minor version numbers of the Python release,
# e.g. Python-urllib/2.5), which may confuse the site, or just plain not work.
# The way a browser identifies itself is through the User-Agent header."
#
# The SEC recently rejected requests from the Python-urllib/x.y user agent.
# To still automatically download files, you have different options, e.g.
# 'Mozilla', 'Chrome' or the full string used below. For a comprehensive list see:
# https://developers.whatismybrowser.com/useragents/explore/software_type_specific/web-browser/
# NOTE(review): the SEC may additionally expect contact information in the
# user agent - confirm against the current EDGAR access rules.
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'


def install_user_agent(user_agent=USER_AGENT):
    """Make every later urllib.request call send *user_agent* as User-agent."""
    opener = urllib.request.build_opener()
    opener.addheaders = [('User-agent', user_agent)]
    urllib.request.install_opener(opener)


def split_nonempty_lines(text):
    """Split *text* at line breaks and drop the empty lines.

    Sometimes you have empty lines after a split command (e.g., after the
    final line break of a file). Only lines that are exactly "" are removed;
    the order of the remaining lines is unchanged.
    """
    return [line for line in text.split("\n") if line != ""]


def select_data_lines(lines, max_filings=MAX_FILINGS):
    """Return the data lines of the csv file, skipping the header line.

    lines       -- all non-empty lines of the csv file; lines[0] is the header
    max_filings -- upper limit for the number of returned lines, or None for
                   no limit

    Slicing (instead of a fixed index range) means that a csv file with fewer
    than *max_filings* data lines simply yields fewer lines instead of
    raising an IndexError.
    """
    if max_filings is None:
        return lines[1:]
    return lines[1:1 + max_filings]


def parse_filing_line(line):
    """Return (cik, link) of one ";"-separated line of the csv file.

    We only need the cik and the link.
    The cik is the 3rd variable. However, the numbering of lists starts
    at zero -> item 2 of the list "variables".
    The link is the 5th variable -> item 4 of the list "variables".
    Surrounding whitespace is stripped from both values.

    Raises IndexError if the line has fewer than five variables.
    """
    variables = line.split(";")
    return variables[2].strip(), variables[4].strip()


def build_local_name(cik, link):
    """Return the name under which the filing behind *link* is saved.

    The link consists of different parts:
    For example: edgar/data/1000753/0000950129-98-001035.txt
    1st part: edgar
    2nd part: data
    3rd part: cik
    4th part: file name -> item 3 of the list of parts

    ############################ WARNING ######################################
    The filename does NOT uniquely identify the SEC filings as different firms
    (CIKs) may use the same filename. Thus, when you only use the filename,
    files might be overwritten. Combining CIK and filename results in a unique
    identifier, as the filename appears only once per firm (CIK).
    -> use the combination of CIK and filename: cik_filename
    ###########################################################################
    """
    filename = link.split("/")[3]
    return cik + "_" + filename


def download_filings(lines, target_directory, max_filings=MAX_FILINGS):
    """Download the filings listed in *lines* into *target_directory*.

    lines            -- non-empty lines of the csv file including the header
    target_directory -- folder (ending with "/") in which the files are saved
    max_filings      -- number of filings to download, None for all

    When you download a large number of filings I recommend using subfolders
    for each year or even for each year-month combination.
    """
    # The option "exist_ok=True" makes sure that you do not get an error if
    # the folder already exists.
    os.makedirs(target_directory, exist_ok=True)

    for line in select_data_lines(lines, max_filings):
        cik, link = parse_filing_line(line)
        urllib.request.urlretrieve(SEC_ARCHIVE_URL + link,
                                   target_directory + build_local_name(cik, link))


def main():
    """Read the csv file from part 1 and download the listed 10-K filings."""
    install_user_agent()

    # Open the csv file from part 1 of the problem. The with-statement closes
    # the file right after reading, even if an error occurs.
    with open(directory + 'SEC_Filings_Output.csv', 'r') as input_file:
        input_text = input_file.read()

    # Split the input file in separate lines and save the 10-K filings in a
    # subfolder.
    input_text_line = split_nonempty_lines(input_text)
    download_filings(input_text_line, directory + "10-Ks/")

    print("DONE")


if __name__ == "__main__":
    main()
|
||||
Loading…
Add table
Add a link
Reference in a new issue