Alexander Hess
a37c87d9c8
- add the code files provided by the instructor - the programming/files folder with the data files is NOT included here due to its size - add a .gitignore file to exclude the data files' folder
72 lines
No EOL
3 KiB
Python
72 lines
No EOL
3 KiB
Python
# -*- coding: utf-8 -*-
|
|
"""
|
|
Created on Wed Jul 29 09:21:46 2015
|
|
|
|
@author: Alexander Hillert, Goethe Uni Frankfurt
|
|
"""
|
|
import re
|
|
# Please adjust the directory to your machine.
|
|
directory="C:/Lehre/Textual Analysis/Programming/Files/"
|
|
|
|
# Open the txt file with the SEC filings
|
|
sec_filings_file=open(directory+'formidx_1998Q1.txt','r')
|
|
sec_filings_text=sec_filings_file.read()
|
|
|
|
# Create output file
|
|
output_file=open(directory+'SEC_Filings_Output.csv','w')
|
|
|
|
# Create first line with variable names
|
|
# I use semicolons as separator in csv files. You can also use any other symbol.
|
|
# However, you should make sure that the separator is not part of the data/text
|
|
# you write to the file.
|
|
# For example, it would be problematic if you use comma as separator and have
|
|
# company names like "AMERICAN HEALTHCORP, INC." or "AMERICAN FUNDING, INC."
|
|
output_file.write("Form_Type;Company_Name;CIK;Filing_Date;Link\n")
|
|
|
|
# Split the Input File in separate line
|
|
sec_filings_line=sec_filings_text.split("\n")
|
|
|
|
# Loop over all lines
|
|
for i in range(len(sec_filings_line)):
|
|
# Does the line refer to a form 10-K file?
|
|
# As pointed out by Loughran and McDonald (2011), many firms mislabelled
|
|
# their 10-K filings as 10-K405 filings. Thus, I included these filings
|
|
# as well.
|
|
# The condition below excludes amendments to 10-Ks ("10-K/A" and "10-K405/A").
|
|
# Depending on the research question at hand one could include amendments as well.
|
|
# Also, 10KSB (small businesses) could also be included.
|
|
|
|
match_10k=re.search("\A10-K( |405 )",sec_filings_line[i])
|
|
if match_10k:
|
|
|
|
#if sec_filings_line[i].startswith("10-K ")==1 or sec_filings_line[i].startswith("10-K405 ")==1:
|
|
# Split the line such that the information can be saved in separate
|
|
# variables
|
|
# Each information item has a fixed length in the overview files of the
|
|
# SEC.
|
|
# Filing type: position 1 to 12
|
|
# Remember Python starts counting at 0 and does not include the upper bound
|
|
filing_type=sec_filings_line[i][:12]
|
|
# Company name: position 13 to 74
|
|
company_name=sec_filings_line[i][12:74]
|
|
# CIK: position 75 to 86
|
|
cik=sec_filings_line[i][74:86]
|
|
# Filing date: position 87 to 98
|
|
filing_date=sec_filings_line[i][86:98]
|
|
# Link: position 99 to end of line
|
|
link=sec_filings_line[i][98:]
|
|
|
|
# Is the 10-K filed between March 10 and March 20?
|
|
# The filing date is in the format "YYYY-MM-DD" (e.g. "1998-03-31")
|
|
filing_day=filing_date[8:10]
|
|
filing_month=filing_date[5:7]
|
|
# Is the Filing Month March?
|
|
if int(filing_month)==3 and int(filing_day)>=10 and int(filing_day)<=20:
|
|
# The filing meets the conditions -->
|
|
# Write output to the csv file
|
|
output_file.write(filing_type+";"+company_name+";"+cik+";"+filing_date+";"+link+"\n")
|
|
|
|
sec_filings_file.close()
|
|
output_file.close()
|
|
|
|
print("DONE") |