
Add programming files

- add the code files provided by the instructor
- the programming/files folder with the data files is NOT included
  here due to its size
- add a .gitignore file to exclude the data files' folder
Alexander Hess 2022-08-05 00:05:05 +02:00
commit a37c87d9c8
Signed by: alexander
GPG key ID: 344EA5AB10D868E0
38 changed files with 6416 additions and 0 deletions


@ -0,0 +1,76 @@
# -*- coding: utf-8 -*-
"""
Created on Wed Apr 13 22:43:32 2016
@author: Alexander Hillert, Goethe University Frankfurt
"""
import re
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# Open the dictionary
file_word_list=open(directory+'Complex_Words.txt','r',encoding="utf-8")
word_list=file_word_list.read()
word_list=word_list.lower()
complex_words=word_list.split()
# Open the csv file containing the list of the 200 10-Ks
input_file=open(directory+'10-K_Sample_2011Q1_Input.csv','r',encoding="utf-8")
input_text=input_file.read()
# Create output file
output_file=open(directory+'10-K_Sample_2011Q1_Output_Complex_Tone.csv','w',encoding="utf-8")
output_file.write('CIK;Filename;Number_Words;Number_Complex_Words;Percent_Complex_Words\n')
# Split the input file in separate lines
input_text_line=input_text.split("\n")
# In general, there can be empty lines in the input file. The following command
# deletes these lines.
while input_text_line.count("")>0:
input_text_line.remove("")
# Loop over all lines
for i in range(1,len(input_text_line)):
print(str(i))
# split the line into the two variables
variables=input_text_line[i].split(";")
# We need the CIK and the filename
cik=variables[0]
filename=variables[1]
filename=filename.replace('.txt','')
# Open the ith 10-K in the list
input_file_10_k=open(directory+'10-K_Sample/'+cik+"_"+filename+'_clean.txt','r',\
encoding='ascii',errors='ignore')
input_text_10_k=input_file_10_k.read()
# Use lower case letters
text=input_text_10_k.lower()
# Split the text in words to determine the total number of words
list_of_words=re.split('\W{1,}', text)
# to make sure that empty list elements do not bias the word count, we delete them.
while list_of_words.count("")>0:
list_of_words.remove("")
# Determine total number of words
word_count=len(list_of_words)
# Reset the number of complex words to zero
complex_count=0
# For each complex word, count the number of occurrences
for i in range(len(complex_words)):
complex_count=complex_count+list_of_words.count(complex_words[i])
# Write cik, file name, total number of words, and number of complex words to output file
output_file.write(cik+';'+filename+'_clean.txt;'+str(word_count)+';'\
+str(complex_count)+';'+str(complex_count/word_count)+'\n')
# Close filings
input_file_10_k.close()
print("Finished")
output_file.close()
input_file.close()
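###########################################################################
# Optional illustration (not part of the original solution): the counting
# loop above calls list.count() once per dictionary word, which re-scans the
# whole document each time. Below is a sketch of an alternative based on
# collections.Counter; the function name count_complex is made up for this
# example.
from collections import Counter

def count_complex(words, complex_word_list):
    # Count every token once, then add up the counts of the dictionary words.
    token_counts = Counter(words)
    return sum(token_counts[w] for w in complex_word_list)

# Example: count_complex(['the','transaction','is','material'],
# ['transaction','material']) returns 2.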


@ -0,0 +1,53 @@
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 29 11:07:10 2015
@author: Alexander Hillert, Goethe University Frankfurt
"""
# To determine file size we need the OS package
import os
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# Open the csv file containing the list of the 200 10-Ks
input_file=open(directory+'10-K_Sample_2011Q1_Input.csv','r',encoding="utf-8")
input_text=input_file.read()
# Create output file
output_file=open(directory+'10-K_Sample_2011Q1_File_Size.csv','w',encoding="utf-8")
output_file.write('CIK;Filename;File_size_gross;File_size_net\n')
# Split the input file in separate lines
input_text_line=input_text.split("\n")
# In general, there can be empty lines in the input file. The following command
# deletes these lines.
while input_text_line.count("")>0:
input_text_line.remove("")
# Loop over all lines
for i in range(1,len(input_text_line)):
print(str(i))
# split the line into the two variables
variables=input_text_line[i].split(";")
# We need the CIK and the filename
cik=variables[0]
filename=variables[1]
filename=filename.replace('.txt','')
# File size of the complete submission file (gross file size)
# You have to divide the result by 1024 to get the size in kilobyte
# The file size will be affected by html code and exhibits.
size_gross=os.path.getsize(directory+'10-K_Sample/'+cik+"_"+filename+'.txt')/1024
# File size of the main text file (net file size)
# You have to divide the result by 1024 to get the size in kilobyte
size_net=os.path.getsize(directory+'10-K_Sample/'+cik+"_"+filename+'_clean.txt')/1024
output_file.write(cik+';'+filename+';'+str(size_gross)+';'+str(size_net)+'\n')
print("Finished")
output_file.close()
input_file.close()
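###########################################################################
# Minimal illustration (not part of the original solution): os.path.getsize()
# returns the size in bytes, so dividing by 1024 converts it to kilobytes.
# As an example, we reuse the input csv file that was opened above.
example_size_kb=os.path.getsize(directory+'10-K_Sample_2011Q1_Input.csv')/1024
print("Size of the input csv in KB: "+str(example_size_kb))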


@ -0,0 +1,167 @@
# -*- coding: utf-8 -*-
"""
Created on Tue Jul 11 09:19:54 2017
@author: Alexander Hillert, Goethe University Frankfurt
"""
# We need regular expressions and counters (->collections)
import re
import collections
# for the bigram part, the sentence tokenizer is helpful
from nltk.tokenize import sent_tokenize
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# Open the csv file containing the list of the 10-Ks from MSFT, KO, and 3M.
input_file=open(directory+'list_10-K_filings_textual_similarity.csv','r',encoding="utf-8")
input_text=input_file.read()
# Split the input file in separate lines
input_text_line=input_text.split("\n")
# In general, there can be empty lines in the input file. The following command
# deletes these lines.
while input_text_line.count("")>0:
input_text_line.remove("")
# Create an empty counter variable
words_counter=collections.Counter()
# variable is needed only for an alternative solution
words_counter1=collections.Counter()
# counter for the extra task
bigram_counter=collections.Counter()
# Loop over all lines
for i in range(1,len(input_text_line)):
print(str(i))
# split the line into the eight variables
variables=input_text_line[i].split(";")
# We need the CIK (1st column) and the filename (8th column)
cik=variables[0]
filename_parts=re.split('/',variables[7])
filename=filename_parts[3].replace('.txt','')
# Open the ith 10-K in the list; remember to specify the encoding
# The files are available in the zip file "10-K_Textual_Similarity_edited.zip".
input_file_10_k=open(directory+'10-K_Textual_Similarity/'+cik+'_'+\
filename+'_edited.txt', 'r', encoding='ascii', errors='ignore')
# if the command above does not work (error like "file not found" or "directory not found")
# please use the following command:
#input_file_10_k=open(directory+'10-K_Textual_Similarity_edited/'+cik+'_'+filename+'_edited.txt','r',encoding='ascii',errors='ignore')
# read the content from the file
input_text_10_k=input_file_10_k.read()
# use lower case only so that it does not matter whether a word is at
# the beginning of a sentence ("The") or within a sentence ("the").
# Please note that this can be problematic, e.g. "US" -> United States vs.
# us (personal pronoun)
input_text_10_k_lower=input_text_10_k.lower()
# Split text into words
list_of_words=re.split('\W{1,}',input_text_10_k_lower)
# There can be empty ("") list elements -> remove them
while list_of_words.count("")>0:
list_of_words.remove("")
# optional commands to remove words that only contain "_"
'''
for word in list_of_words:
if re.sub("[a-zA-Z]","",word)!="":
#if word.count("_")>0:
list_of_words.remove(word)
'''
# Add the words to our counter
words_counter=words_counter+collections.Counter(list_of_words)
# alternative solution
words_counter1.update(list_of_words)
#############################################
# optional part for the extra task on bigrams
#############################################
# create an empty list for the bigrams
bigram_list=[]
# split the text into sentences
list_of_sentences=sent_tokenize(input_text_10_k)
# create the BIGRAM IN EACH SENTENCE
for sentence in list_of_sentences:
# make the sentence lower case
sentence_lower=sentence.lower()
# split the sentence into words
list_of_words=re.split("\W{1,}",sentence_lower)
# remove empty elements
while list_of_words.count("")>0:
list_of_words.remove("")
#print("these are the words of the sentence:\n"+str(list_of_words))
# go over all potential two word combinations in the sentence.
for word_number in range(0,len(list_of_words)-1):
bigram_list.append(list_of_words[word_number]+' '+list_of_words[word_number+1])
bigram_counter=bigram_counter+collections.Counter(bigram_list)
# end of extra task
# Close the 10-K filing
input_file_10_k.close()
input_file.close()
######################
# Top 100 single words
######################
# Open the csv file containing the 100 most frequently used words
output_file=open(directory+'Problem_12_100_most_frequent_words.csv','w',encoding="utf-8")
output_file.write("rank;word;count\n")
# Get the 100 most frequent words
top_100_words=words_counter.most_common(100)
# for the alternative solution
#top_100_words=words_counter1.most_common(100)
# Write the 100 most frequent words to the csv file.
# Remember Python starts counting at 0, while humans start at 1.
# So, the most frequent word (rank 1 in human counting) is element 0 for Python.
# Consequently, to get a consistent table, we must use the value i for the rank
# but access element i-1.
for i in range(1,101):
output_file.write(str(i)+";"+str(top_100_words[i-1][0])+";"+\
str(top_100_words[i-1][1])+"\n")
# Close the csv file
output_file.close()
######################
# Extra task
# Top 100 bigrams
######################
# Open the csv file containing the 100 most frequently used BIGRAMS
output_file_bigram=open(directory+'Problem_12_100_most_frequent_bigrams.csv','w',encoding="utf-8")
output_file_bigram.write("rank;word;count\n")
# Get the 100 most frequent bigrams
top_100_bigrams=bigram_counter.most_common(100)
# Write the 100 most frequent bigrams to the csv file -> same approach as for the single words.
for i in range(1,101):
output_file_bigram.write(str(i)+";"+str(top_100_bigrams[i-1][0])+";"+\
str(top_100_bigrams[i-1][1])+"\n")
# Close the csv file
output_file_bigram.close()
print("Task done!")


@ -0,0 +1,96 @@
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 29 11:07:10 2015
@author: Alexander Hillert, Goethe University Frankfurt
"""
# We need regular expressions, tokenize (to identify words), and stemming.
import re
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# Open the csv file containing the list of the 10-Ks from MSFT, KO, and 3M.
input_file=open(directory+'list_10-K_filings_textual_similarity.csv','r',encoding="utf-8")
input_text=input_file.read()
# Split the input file into separate lines
input_text_line=input_text.split("\n")
# In general, there can be empty lines in the input file. The following command
# deletes these lines.
while input_text_line.count("")>0:
input_text_line.remove("")
# Loop over all lines
#for i in range(1,len(input_text_line)):
# for illustration filings 1 to 3 only
for i in range(1,4):
print(str(i))
# split the line into the eight variables
variables=input_text_line[i].split(";")
# We need the CIK (1st column) and the filename (8th column)
cik=variables[0]
filename_parts=re.split('/',variables[7])
filename=filename_parts[3].replace('.txt','')
# Open the ith 10-K in the list; remember to specify the encoding
input_file_10_k=open(directory+'10-K_Textual_Similarity/'+cik+'_'+filename\
+'_edited.txt', 'r', encoding='ascii', errors='ignore')
# Get the text of the 10-K
input_text_10_k=input_file_10_k.read()
# We need to tokenize the text because stemming works only on a word-by-word basis.
# Stemming an entire document without splitting it into words does not work!
# The problem is that \n gets lost in this process --> we cannot easily
# recreate the document.
# Idea: replace \n by \n plus some indicator that there was a line break.
# Here, I choose "LINEBREAKMARK"
input_text_10_k=input_text_10_k.replace("\n","\nLINEBREAKMARK ")
# Split text into words
# There are two alternatives.
# Alternative 1 (our standard approach):
#word_list=re.split("\W{1,}",input_text_10_k.lower())
# Alternative 2 (keeps symbols like ,;.):
word_list=word_tokenize(input_text_10_k.lower())
# Stem the text
text_stemmed=''
for word in word_list:
# The following two cases are designed to improve the formatting of the
# output file. It is not needed for the subsequent analyses.
# Case 1: 'word' is not an actual word but a symbol. -> there should
# be no whitespace between the previous words and this symbol.
# \A and \Z indicate the beginning and end of string -> the 'word' is just
# the symbol but not a combination of letters and symbols.
if re.search("\A[\.\?!,:;']{1,}\Z",word):
text_stemmed=text_stemmed+word
# Case 2: the word is an actual word -> have a whitespace included.
else:
text_stemmed=text_stemmed+" "+PorterStemmer().stem(word)
# The simple solution (without restoring the formatting of the text) is:
#text_stemmed=text_stemmed+" "+PorterStemmer().stem(word)
# To recreate the text, we need to replace the line break indicators by \n
# Because of the stemming "LINEBREAKMARK" becomes "linebreakmark".
text_stemmed=text_stemmed.replace("linebreakmark","\n")
# Open the output file for the stemmed text
output_file_10_k=open(directory+'10-K_Textual_Similarity/'+cik+'_'+filename\
+'_stemmed.txt', 'w', encoding='ascii', errors='ignore')
output_file_10_k.write(text_stemmed)
output_file_10_k.close()
input_file_10_k.close()
input_file.close()
print("Task done!")


@ -0,0 +1,287 @@
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 29 11:07:10 2015
@author: Alexander Hillert, Goethe University Frankfurt
"""
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from collections import Counter
ps=PorterStemmer()
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# Open the csv file containing the list of the 10-Ks from MSFT, KO, and 3M
input_file=open(directory+'list_10-K_filings_textual_similarity.csv','r',encoding="utf-8")
input_text=input_file.read()
# Split the input file in separate lines
input_text_line=input_text.split("\n")
# In general, there can be empty lines in the input file. The following command
# deletes these lines
while input_text_line.count("")>0:
input_text_line.remove("")
# Open the output csv file in which we write the similarities
output_file=open(directory+'list_10-K_filings_textual_similarity_jaccard.csv','w',encoding="utf-8")
# Write variable names to first line
output_file.write(input_text_line[0]+';Jaccard;Jaccard_own_stop_words;\
Jaccard_NLTK_stop_words;Jaccard_stemmed;Jaccard_stemmed_own_stop_words;\
Jaccard_stemmed_NLTK_stop_words\n')
# Read own stop word list
# This list has been created by manually selecting words from the csv-file
# 100_most_frequent_words.csv, which is created by the Python program
# "Problem_12_Most_Frequent_Words.py".
# Simply delete words you consider to be meaningless and that are frequently
# used.
stop_word_file=open(directory+'Stop_Word_List_Alexander.csv','r',encoding="utf-8")
stop_word_text=stop_word_file.read()
stop_word_line=stop_word_text.split("\n")
stop_word_line.remove("")
own_stop_words=[""]
for i in range(1,len(stop_word_line)):
stop_word=stop_word_line[i].split(";")[1]
own_stop_words.append(stop_word)
own_stop_words.remove("")
print("This is the list of my stop words:")
print(own_stop_words)
# Read NLTK stop word list
NLTK_stop_words=set(stopwords.words("english"))
print("This is the list of NLTK stop words:")
print(NLTK_stop_words)
# Set default values for the variables.
# This is not required. However, if you don't do it, Spyder will flag the line
# jaccard_similarity=jaccard(word_list_edited,word_list_old_edited)
# as incorrect because word_list_old_edited is not yet defined at that point in
# the program code. In this specific example, this does not cause an error, as
# we do not enter the if condition when i=1 -> the "old" variables are always
# assigned before their first use in the jaccard() calls.
word_list_old_edited=[]
word_list_edited=[]
word_list_old_NLTK_filtered=""
word_list_old_own_filtered=""
word_list_old_edited_stemmed=""
word_list_old_own_filtered_stemmed=""
word_list_old_NLTK_filtered_stemmed=""
#######################################################
# Define a function that computes Jaccard similarity
# As we need these operations several times, it is
# helpful to use a function.
######################################################
# beginning of the function
def jaccard(text1,text2):
counter1=Counter(text1)
counter2=Counter(text2)
intersection=counter1 & counter2
union=counter1 | counter2
return len(intersection)/len(union)
# end of the function
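# Quick sanity check of the function (illustration only, not part of the
# original solution): with multisets, the intersection takes the minimum
# count per word and the union the maximum count. The two made-up token
# lists below share 2 of 4 distinct words -> Jaccard similarity of 0.5.
print("Example Jaccard similarity: "+str(jaccard(["the","cat","sat"],["the","cat","ran"])))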
# Loop over all lines
for i in range(1,len(input_text_line)):
print(str(i))
# split the line into the eight variables
variables=input_text_line[i].split(";")
# We need the CIK (1st column) and the filename (8th column)
cik=variables[0]
filename_parts=re.split('/',variables[7])
filename=filename_parts[3].replace('.txt','')
# Write the information from the input file to the output file
# we do not add a line break at the end, as we must append the similarity
# score first.
output_file.write(input_text_line[i])
# Open the ith 10-K; remember to specify the encoding
input_file_10_k=open(directory+'10-K_Textual_Similarity/'+cik+'_'+filename+\
'_edited.txt', 'r', encoding='ascii', errors='ignore')
input_text_10_k=input_file_10_k.read()
# check whether the previous entry of the list is from the same firm
permco=input_text_line[i].split(";")[1]
permco_old=input_text_line[i-1].split(";")[1]
# Split text into words
word_list_edited=word_tokenize(input_text_10_k.lower())
############################################
# Sub Task 1: Jaccard for the _edited.txt
############################################
# compute Jaccard similarity if the previous filing is from the same firm
if permco==permco_old:
# the command calls the jaccard function that we have defined above.
# in the function, text1=word_list_edited and text2=word_list_old_edited.
jaccard_similarity=jaccard(word_list_edited,word_list_old_edited)
output_file.write(";"+str(jaccard_similarity))
else:
# The previous filing is not from the same firm -> cannot compute Jaccard similarity
output_file.write(";")
# Save the current word vector to a separate variable for the comparison of the next report.
word_list_old_edited=word_list_edited
############################################
# Sub Task 2: Jaccard for the _edited.txt
# AND REMOVE STOP WORDS - OWN LIST
############################################
# remove stop words using personal stop word list
word_list_own_filtered=[]
for word in word_list_edited:
if word not in own_stop_words:
word_list_own_filtered.append(word)
# compute Jaccard similarity if the previous filing is from the same firm
if permco==permco_old:
jaccard_similarity=jaccard(word_list_own_filtered,\
word_list_old_own_filtered)
output_file.write(";"+str(jaccard_similarity))
else:
# The previous filing is not from the same firm -> cannot compute Jaccard similarity
output_file.write(";")
# Save the current word vector to a separate variable for the comparison of the next report.
word_list_old_own_filtered=word_list_own_filtered
############################################
# Sub Task 3: Jaccard for the _edited.txt
# AND REMOVE STOP WORDS - NLTK LIST
############################################
# remove stop words using NLTK stop word list
word_list_NLTK_filtered=[]
for word in word_list_edited:
if word not in NLTK_stop_words:
word_list_NLTK_filtered.append(word)
# compute Jaccard similarity if the previous filing is from the same firm
if permco==permco_old:
jaccard_similarity=jaccard(word_list_NLTK_filtered,\
word_list_old_NLTK_filtered)
output_file.write(";"+str(jaccard_similarity))
else:
# The previous filing is not from the same firm -> cannot compute Jaccard similarity
output_file.write(";")
# Save the current word vector to a separate variable for the comparison of the next report.
word_list_old_NLTK_filtered=word_list_NLTK_filtered
############################################
# Sub Task 4: Jaccard for the _stemmed.txt
############################################
# Create stemmed text
word_list_edited_stemmed=[]
for word in word_list_edited:
word_list_edited_stemmed.append(ps.stem(word))
# compute Jaccard similarity if the previous filing is from the same firm
if permco==permco_old:
jaccard_similarity=jaccard(word_list_edited_stemmed,word_list_old_edited_stemmed)
output_file.write(";"+str(jaccard_similarity))
else:
# The previous filing is not from the same firm -> cannot compute Jaccard similarity
output_file.write(";")
# Save the current word vector to a separate variable for the comparison of the next report.
word_list_old_edited_stemmed=word_list_edited_stemmed
############################################
# Sub Task 5: Jaccard for the _stemmed.txt
# AND REMOVE STOP WORDS - OWN LIST
############################################
# Caution: in general, it is not clear whether you should first stem or
# first remove stop words.
# However, in this specific case, you should remove the stop words first
# and then stem, as your stop word list is based on the inflected text.
# remove stop words using personal stop word list
word_list_own_filtered=[]
for word in word_list_edited:
if word not in own_stop_words:
word_list_own_filtered.append(word)
# Create stemmed text
word_list_own_filtered_stemmed=[]
for word in word_list_own_filtered:
word_list_own_filtered_stemmed.append(ps.stem(word))
# compute Jaccard similarity if the previous filing is from the same firm
if permco==permco_old:
jaccard_similarity=jaccard(word_list_own_filtered_stemmed,\
word_list_old_own_filtered_stemmed)
output_file.write(";"+str(jaccard_similarity))
else:
# The previous filing is not from the same firm -> cannot compute Jaccard similarity
output_file.write(";")
# Save the current word vector to a separate variable for the comparison of the next report.
word_list_old_own_filtered_stemmed=word_list_own_filtered_stemmed
############################################
# Sub Task 6: Jaccard for the _stemmed.txt
# AND REMOVE STOP WORDS - NLTK LIST
############################################
# Caution: it is not clear whether you should first stem or first remove
# stop words. However, the NLTK stop word list seems to be based on inflected
# text, e.g. the word "having" is included. "Having" would be stemmed to "have".
# Hence, the stop word list itself does not appear to be stemmed, and you
# should remove the stop words first and then stem.
# remove stop words using NLTK stop word list
word_list_NLTK_filtered=[]
for word in word_list_edited:
if word not in NLTK_stop_words:
word_list_NLTK_filtered.append(word)
# Create stemmed text
word_list_NLTK_filtered_stemmed=[]
for word in word_list_NLTK_filtered:
word_list_NLTK_filtered_stemmed.append(ps.stem(word))
# compute Jaccard similarity if the previous filing is from the same firm
if permco==permco_old:
jaccard_similarity=jaccard(word_list_NLTK_filtered_stemmed,\
word_list_old_NLTK_filtered_stemmed)
output_file.write(";"+str(jaccard_similarity))
else:
# The previous filing is not from the same firm -> cannot compute Jaccard similarity
output_file.write(";")
# Save the current word vector to a separate variable for the comparison of the next report.
word_list_old_NLTK_filtered_stemmed=word_list_NLTK_filtered_stemmed
# Write line break to output file
output_file.write("\n")
# Close 10-K filing
input_file_10_k.close()
input_file.close()
output_file.close()
stop_word_file.close()
print("Task done!")


@ -0,0 +1,161 @@
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 21 09:38:32 2022
@author: Alexander Hillert, Goethe University Frankfurt
"""
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
# adjust the directory to your folder
directory="C:/Lehre/Machine Learning/Data/"
# import the data for this problem
# NOTE: IT MIGHT TAKE 3 TO 5 MINUTES TO OPEN THE DATA
data_frame=pd.read_csv(directory+"form_10-Ks_machine_learning_2007_2008_all_variables_v1.csv",sep=";")
# The rows of the data are the Form 10-K filings. Each line is one filing.
# The columns are the variables. After some identifying information,
# you find the word frequencies, i.e., how often a word (e.g., "the") shows up
# in a 10-K (e.g., 100 times)
# WARNING: THE DATA SET IS TOO LARGE TO BE DISPLAYED -> Variable Explorer
# and Console will crash.
# However, you can pick a small subset of the data and look at it.
# It lists all columns (variables) and the first three observations.
data_frame_example=data_frame.head(3)
# you can click on this variable in the variable explorer without Spyder crashing.
# To see the variables included in the data use the following command
data_frame_column_names=data_frame.columns
# you can click on this variable in the variable explorer without Spyder crashing.
# This variable shows all column/variable names in a vector.
# split the data set into the training and testing data
# we use the filings from year 2007 as training data
data_frame_train=data_frame[data_frame.year==2007]
# and the filings from year 2008 as testing data
data_frame_test=data_frame[data_frame.year==2008]
# put the cumulative abnormal return around the filing date into a new variable.
# we follow Loughran and McDonald (2011) and use the CAR from t to t+4.
# training data
filing_car_train=data_frame_train["excess_ret_t0_t4"]
# testing data
filing_car_test=data_frame_test["excess_ret_t0_t4"]
# so far, you have absolute word counts. For example, "loss" is found 5 times.
# As the length of the 10-Ks can be different, we scale by the number of words
# in the 10-K.
document_length_train=data_frame_train["number_of_words"]
document_length_test=data_frame_test["number_of_words"]
# the word frequencies are our independent variables -> restrict the data frame
# to those variables and drop all variables that are not needed
data_frame_train=data_frame_train.drop(columns=["cik","year","month","link","filing_type","filing_date","excess_ret_t0_t4","number_of_words"])
data_frame_test=data_frame_test.drop(columns=["cik","year","month","link","filing_type","filing_date","excess_ret_t0_t4","number_of_words"])
# compute relative frequencies, i.e., divide the absolute word count by document length
data_frame_train=data_frame_train.div(document_length_train, axis=0)
data_frame_test=data_frame_test.div(document_length_test, axis=0)
# standardize the data frames
# training data
data_frame_train_mean=np.mean(data_frame_train,axis=0)
data_frame_train_sd=np.std(data_frame_train, axis=0, ddof=1)
data_frame_train_standardized=(data_frame_train-data_frame_train_mean)/data_frame_train_sd
# testing data
data_frame_test_mean=np.mean(data_frame_test,axis=0)
data_frame_test_sd=np.std(data_frame_test, axis=0, ddof=1)
data_frame_test_standardized=(data_frame_test-data_frame_test_mean)/data_frame_test_sd
# There can be missing values in the standardized variables.
# They arise if the word count for a specific word is always zero in the training
# or in the testing data. In this case, the standard deviation is zero ->
# division by zero -> NaN.
# We replace these missing values by zero.
# training data
data_frame_train_standardized=data_frame_train_standardized.fillna(0)
# testing data
data_frame_test_standardized=data_frame_test_standardized.fillna(0)
##########################
# Ridge regression
##########################
print("\nRidge regression - Using cross-validation\n")
# Regress the CARs on the word frequencies using Ridge regressions with cross-validation.
# In this regression, we use the training data.
# We use five-fold cross-validation.
# Recommendation for initial alphas/lambdas: 100000, 150000, and 200000
# The optimal alpha is at around 140000.
regression_Ridge_cv=RidgeCV(alphas=[135000,137000,140000,143000,145000], fit_intercept=True,cv=5).fit(data_frame_train_standardized,filing_car_train)
# get the optimal lambda
alpha_optimal_cv=regression_Ridge_cv.alpha_
print("The optimal alpha is "+str(alpha_optimal_cv))
# what is the R2 in the training and testing data?
print("The R2 in the training data is: "+str(regression_Ridge_cv.score(data_frame_train_standardized,filing_car_train)))
print("The R2 in the testing data is: "+str(regression_Ridge_cv.score(data_frame_test_standardized,filing_car_test)))
# Mean squared error using the cross-validated model
# predict y in the full training sample
filing_car_train_predicted_Ridge=regression_Ridge_cv.predict(data_frame_train_standardized)
# predict y in the testing sample
filing_car_test_predicted_Ridge=regression_Ridge_cv.predict(data_frame_test_standardized)
# Determine the MSE
print("The MSE in the full training data is: "+str(mean_squared_error(filing_car_train, filing_car_train_predicted_Ridge)))
print("The MSE in the testing data is: "+str(mean_squared_error(filing_car_test, filing_car_test_predicted_Ridge)))
######################
# LASSO regression
######################
print("\nLASSO regression - Using cross-validation\n")
# Regress the CARs on the word frequencies using LASSO regressions with cross-validation.
# In this regression, we use the training data.
# We use five-fold cross-validation.
# Recommendation for initial alphas/lambdas: 0.5, 1, and 1.5
# The optimal alpha is at around 0.86.
regression_Lasso_cv=LassoCV(alphas=[0.85,0.86,0.87,0.88,0.89], fit_intercept=True,cv=5).fit(data_frame_train_standardized,filing_car_train)
# get the optimal lambda
alpha_optimal_cv=regression_Lasso_cv.alpha_
print("The optimal alpha is "+str(alpha_optimal_cv))
# get the R2 in the training data
print("The R2 in the training data is: "+str(regression_Lasso_cv.score(data_frame_train_standardized,filing_car_train)))
# ... and testing data
print("The R2 in the testing data is: "+str(regression_Lasso_cv.score(data_frame_test_standardized,filing_car_test)))
# Mean squared error using the cross-validated model
# predict y in the full training sample
filing_car_train_predicted_Lasso=regression_Lasso_cv.predict(data_frame_train_standardized)
# predict y in the testing sample
filing_car_test_predicted_Lasso=regression_Lasso_cv.predict(data_frame_test_standardized)
# Determine the MSE
print("The MSE in the full training data is: "+str(mean_squared_error(filing_car_train, filing_car_train_predicted_Lasso)))
print("The MSE in the testing data is: "+str(mean_squared_error(filing_car_test, filing_car_test_predicted_Lasso)))
############################################################
# Compare the betas from the Ridge and the LASSO regressions
############################################################
output_file=open(directory+"comparison_coefficients_Ridge_LASSO_10-Ks.csv","w",encoding="utf-8")
output_file.write("index;word;coefficient_Ridge;coefficient_LASSO\n")
# get the list of coefficients
for i in range (0,len(data_frame_train.columns)):
output_file.write(str(i)+';'+data_frame_train.columns[i]+';'+str(regression_Ridge_cv.coef_[i])+';'+str(regression_Lasso_cv.coef_[i])+'\n')
output_file.close()
print("Completed!")


@ -0,0 +1,121 @@
# -*- coding: utf-8 -*-
"""
Created on Mon Nov 13 21:40:57 2017
@author: Alexander Hillert, Goethe University Frankfurt
"""
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# Task 1: Open and print
# Open the Txt-file
print("\nTask 1 starts here!\n")
input_file=open(directory+'Fun_with_Python.txt','r')
input_text=input_file.read()
# Alternative with one command
input_text=open(directory+'Fun_with_Python.txt','r').read()
print(input_text)
# Task 2: Write text to output file
# Create file 'More_fun_with_Python.txt'
print("\nTask 2 starts here!\n")
output_file=open(directory+'More_fun_with_Python.txt','w')
output_file.write("Hallo\n")
output_file.write(input_text)
output_file.close()
# Task 3: loop
print("\nTask 3 starts here!\n")
# Alternative 1: While loop
i = 1
while i<=10:
print('Iteration Number: '+str(i))
i=i+1
# Example of a nested loop
j=1
while j<3:
print('Hallo')
j=j+1
# Alternative 2: For loop
for i in range(0,10):
print('Iteration Number: '+str(i))
# there is also a shorter notation: if there is no lower bound it is assumed to be zero
for i in range(10):
print('Iteration Number: '+str(i))
# Task 4: Print text line by line
# Print text line by line
print("\nTask 4 starts here!\n")
line_of_text=input_text.split('\n')
i=0
while i<len(line_of_text):
print("Line "+str(i+1)+": "+line_of_text[i])
i=i+1
# First alternative using a for loop
for i in range(0,len(line_of_text)):
print("Line "+str(i+1)+": "+line_of_text[i])
# Second alternative
# for ... in -> for each element of the list do ...
# line can be any name; it refers to the elements of the list
i=1
for line in line_of_text:
print("Line "+str(i)+": "+line)
i=i+1
# Task 5: count 'good'
# Count how often the word 'good' appears in the text
print("\nTask 5 starts here!\n")
number_good=input_text.count('good')
print(number_good)
# you can write the command in a shorter format
print(input_text.count('good'))
# Task 6a
# Print lines with the word 'good'
print("\nTask 6a starts here!\n")
for i in range(len(line_of_text)):
if line_of_text[i].count('good')>=1:
print(line_of_text[i])
# Task 7
# Print lines that start with the word 'This'
print("\nTask 7 starts here!\n")
print("\n'This' with a capital T.\n")
for i in range(len(line_of_text)):
if line_of_text[i].startswith('This')>=1:
print(line_of_text[i])
print("\n'this' with a lower case t.\n")
for i in range(len(line_of_text)):
if line_of_text[i].startswith('this')>=1:
print(line_of_text[i])
print("Yes, the command is case sensitive (2 vs. 0 matches)!")
# Task 8
# Replace the word 'good' by 'excellent'
print("\nTask 8 starts here!\n")
new_text=input_text.replace("good","excellent")
print(new_text)
# For illustration only
print("\nFor illustration only\n")
for i in range(len(line_of_text)):
new_line_of_text=line_of_text[i].replace('good','excellent')
# print the new line IF there is a change.
if not new_line_of_text==line_of_text[i]:
print(new_line_of_text)
input_file.close()
output_file.close()
print("DONE")


@ -0,0 +1,72 @@
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 29 09:21:46 2015
@author: Alexander Hillert, Goethe Uni Frankfurt
"""
import re
# Please adjust the directory to your machine.
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# Open the txt file with the SEC filings
sec_filings_file=open(directory+'formidx_1998Q1.txt','r')
sec_filings_text=sec_filings_file.read()
# Create output file
output_file=open(directory+'SEC_Filings_Output.csv','w')
# Create first line with variable names
# I use semicolons as separator in csv files. You can also use any other symbol.
# However, you should make sure that the separator is not part of the data/text
# you write to the file.
# For example, it would be problematic if you use comma as separator and have
# company names like "AMERICAN HEALTHCORP, INC." or "AMERICAN FUNDING, INC."
output_file.write("Form_Type;Company_Name;CIK;Filing_Date;Link\n")
# Split the input file into separate lines
sec_filings_line=sec_filings_text.split("\n")
# Loop over all lines
for i in range(len(sec_filings_line)):
# Does the line refer to a form 10-K file?
# As pointed out by Loughran and McDonald (2011), many firms mislabelled
# their 10-K filings as 10-K405 filings. Thus, I included these filings
# as well.
# The condition below excludes amendments to 10-Ks ("10-K/A" and "10-K405/A").
# Depending on the research question at hand one could include amendments as well.
# 10KSB filings (small businesses) could also be included.
match_10k=re.search("\A10-K( |405 )",sec_filings_line[i])
if match_10k:
#if sec_filings_line[i].startswith("10-K ")==1 or sec_filings_line[i].startswith("10-K405 ")==1:
# Split the line such that the information can be saved in separate
# variables
# Each information item has a fixed length in the overview files of the
# SEC.
# Filing type: position 1 to 12
# Remember Python starts counting at 0 and does not include the upper bound
filing_type=sec_filings_line[i][:12]
# Company name: position 13 to 74
company_name=sec_filings_line[i][12:74]
# CIK: position 75 to 86
cik=sec_filings_line[i][74:86]
# Filing date: position 87 to 98
filing_date=sec_filings_line[i][86:98]
# Link: position 99 to end of line
link=sec_filings_line[i][98:]
# Is the 10-K filed between March 10 and March 20?
# The filing date is in the format "YYYY-MM-DD" (e.g. "1998-03-31")
filing_day=filing_date[8:10]
filing_month=filing_date[5:7]
# Is the Filing Month March?
if int(filing_month)==3 and int(filing_day)>=10 and int(filing_day)<=20:
# The filing meets the conditions -->
# Write output to the csv file
output_file.write(filing_type+";"+company_name+";"+cik+";"+filing_date+";"+link+"\n")
sec_filings_file.close()
output_file.close()
print("DONE")


@ -0,0 +1,95 @@
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 29 11:07:10 2015
@author: Alexander Hillert, Goethe Uni Frankfurt
"""
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# We need the urllib package
import urllib.request
# To automatically create folders we need the os-module (OS: Operating System)
import os
# Define a user agent
# Information on user agents are from https://docs.python.org/3/howto/urllib2.html:
# "Some websites dislike being browsed by programs, or send different versions
# to different browsers. By default urllib identifies itself as Python-urllib/x.y
# (where x and y are the major and minor version numbers of the Python release,
# e.g. Python-urllib/2.5), which may confuse the site, or just plain not work.
# The way a browser identifies itself is through the User-Agent header."
opener = urllib.request.build_opener()
# The SEC recently rejected requests from Python-urllib/x.y user agent (see above)
# To still automatically download files, you have different options.
# I have listed three examples below but there are many more:
# For a comprehensive list see, e.g.:
# https://developers.whatismybrowser.com/useragents/explore/software_type_specific/web-browser/
#opener.addheaders = [('User-agent', 'Mozilla')]
#opener.addheaders = [('User-agent', 'Chrome')]
opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)')]
urllib.request.install_opener(opener)
# Open the csv file from part 1 of the problem
input_file=open(directory+'SEC_Filings_Output.csv','r')
input_text=input_file.read()
# Split the Input File in separate lines
input_text_line=input_text.split("\n")
# sometimes you have empty lines after a split command.
# You can remove them using the following command
while input_text_line.count("")>0:
input_text_line.remove("")
# Create a subfolder in which the 10-K filings are saved.
# When you download a large number of filings I recommend using subfolders for
# each year or even for each year-month combination.
# The option "exist_ok=True" makes sure that you do not get an error if the
# folder already exists.
os.makedirs(directory+"10-Ks/", exist_ok=True)
# Loop over all lines of the csv file
#for i in range(1,len(input_text_line)):
# To avoid having to download hundreds of files when we discuss the solution
# the loop stops at 20. (Remember the upper bound is not included.)
for i in range(1,21):
# split the line into the five variables
variables=input_text_line[i].split(";")
# We only need the cik and the link.
# The cik is the 3rd variable. However, the numbering of lists starts
# at zero -> 2nd item of the list "variables"
# The link is the 5th variable -> 4th item of the list "variables"
cik=variables[2]
#cik=cik.replace(" ","")
cik=cik.strip()
link=variables[4]
#link=link.replace(" ","")
link=link.strip()
# Find the filename
# The link consists of different parts:
# For example: edgar/data/1000753/0000950129-98-001035.txt
link_parts=link.split("/")
# 1st part: edgar
# 2nd part: data
# 3rd part: cik
# 4th part: file name -> index 3 of the list
filename=link_parts[3]
###########################################################################
############################ WARNING ######################################
# The filename does NOT uniquely identify the SEC filings as different firms (CIKs)
# may use the same filename. Thus, when you only use the filename files
# might be overwritten. To avoid this problem you need to have a unique name.
# Combining CIK and filename results in a unique identifier, as the
# filename appears only once per firm (CIK).
# -> use the combination of CIK and filename: cik_filename
###########################################################################
urllib.request.urlretrieve("http://www.sec.gov/Archives/"+link,\
directory+"10-Ks/"+cik+"_"+filename)
input_file.close()
print("DONE")


@ -0,0 +1,144 @@
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 12 15:50:22 2016
@author: Alexander Hillert, Goethe University Frankfurt
"""
# Import regular expressions and BeautifulSoup
import re
from bs4 import BeautifulSoup
# Please adjust the directory to your machine.
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# Open the document
input_file=open(directory+'Exercise_4_Application_Regular_Expressions.txt','r',encoding="utf-8")
input_text=input_file.read()
#######################
# Task 1: remove tables
#######################
# Approach
# We search for tables until we find no more html tags that indicate the
# beginning of a table.
# Search for the start html-tag <TABLE>
table_match=re.search('<TABLE>', input_text)
print("This is the result of the re.search command:")
print(table_match)
while table_match:
# When we have identified a match, i.e. the start of a table, we save
# the position of the beginning of the table in the variable "start_table"
table_start_match=re.search('<TABLE>', input_text)
start_table=table_start_match.start()
# Next, we search for the corresponding html tag that indicates the end of
# the table and save the end position to the variable "end_table"
table_end_match=re.search('</TABLE>', input_text)
end_table=table_end_match.end()
# We can print the text between the start and end html tag to check whether
# the table has been identified correctly.
print("The text below is a table!\n"+input_text[start_table:end_table]+"\n")
# the text between the beginning and end of the html tags is the part which
# we would like to delete.
# Consequently, we keep the text before the beginning of the table as well
# as the text after the ending of the table.
input_text=input_text[:start_table]+input_text[end_table:]
# Next, we need to check whether there is another table in the rest of the
# text.
table_match=re.search('<TABLE>', input_text)
# As long as "table_match" exists, i.e. we regex result in a match, the loop
# will continue.
#########################
# Task 2: remove Exhibits
#########################
# Exhibits have the following structure
# <DOCUMENT>
# <TYPE>EX...
# ...
# </DOCUMENT>
exhibit_match=re.search('<TYPE>EX', input_text)
while exhibit_match:
exhibit_start_match=re.search('<TYPE>EX', input_text)
start_exhibit=exhibit_start_match.start()
# As the exhibits are at the end of the 10-K filing it would not be
# necessary to include an end position. We could also drop the entire text
# after "<TYPE>EX"
# It is important that we search for the </DOCUMENT> only after the exhibit
# started. Otherwise, we could get the end of the main document.
exhibit_end_match=re.search('</DOCUMENT>', input_text[start_exhibit:])
end_exhibit=start_exhibit+exhibit_end_match.end()
# Print the identified text to check whether the exhibit has been identified
# correctly
print("The text below is an exhibit!\n"+input_text[start_exhibit:end_exhibit]+"\n")
input_text=input_text[:start_exhibit]+input_text[end_exhibit:]
# Check whether there are further exhibits
exhibit_match=re.search('<TYPE>EX', input_text)
##########################
# Task 3: remove html code
##########################
# Alternative 1: remove html code without Beautiful Soup
text=re.sub('<[^>]{1,}>', '', input_text)
# This regex searches for a "<" followed by at least one character that must not
# equal > and is completed by >.
# You might have thought about using the following command
#text=re.sub('<.{1,}>', '', input_text)
# However, this command has a problem, as it would delete the following line
# entirely: <page> This is some text that should remain <page>
# The .{1,} would match 'page> This is some text that should remain <page', as
# regexes are greedy. The [^>]{1,} avoids this problem because it cannot match >.
# Consequently, in the example only the two "<page>" would be deleted.
# You can verify this by using regex101.com (remember to check "Python" in the
# left menu of the webpage)
# Alternative 2: remove html code using Beautiful Soup
html_text=BeautifulSoup(input_text, 'html.parser')
text=html_text.get_text()
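# Small demonstration of the greediness point above (illustration only):
# the two substitutions act on a made-up example line and leave the
# variable "text" untouched.
greedy_example=re.sub('<.{1,}>','','<page> This is some text that should remain <page>')
bracket_example=re.sub('<[^>]{1,}>','','<page> This is some text that should remain <page>')
print("Greedy pattern leaves: '"+greedy_example+"'")
print("Bracket pattern leaves: '"+bracket_example+"'")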
########################
# Task 4: delete numbers
########################
# Alternative 1 - removing numbers step by step
# remove commas in numbers, e.g., 1,000 or 12,345,678 or 123,456,789,123,123
text=re.sub('[0-9]{1,3},([0-9]{3},){0,}[0-9]{3}','',text)
# remove dots in numbers, e.g., 34.56 or 12,345.678 (-> previous command leaves .678)
text=re.sub('[0-9]{0,}\.[0-9]{1,}','',text)
# remove the remaining numbers without commas and dots
text=re.sub('[0-9]','',text)
# Alternative 2 - removing numbers using a single regex
text=re.sub('[0-9]{1,}(,[0-9]{3}){0,}(\.[0-9]{1,}){0,1}','',text)
# Alternative 3 - removing numbers step by step but start with commas and dots
# 1. remove comma incl. the surrounding numbers
text=re.sub("[0-9],[0-9]","",text)
# 2. remove dots incl. the surrounding numbers
text=re.sub("[0-9]\.[0-9]","",text)
# 3. remove any remaining number
text=re.sub("[0-9]","",text)
########################
# Task 5: delete symbols
########################
# When analyzing tone, symbols do not matter, as they are not considered to be
# words and thus do not bias the total word count.
# However, for training purposes this task is included in the problem.
# There is no well defined list of which symbols should be deleted. So, you
# can add further symbols.
text=re.sub('\(|\)|\[|\]|\$|§|%|\*|/|·|-',' ',text)
text=re.sub('[^a-zA-Z \.,\!\?\n]','',text)
# Open the output file for the pure text
output_file=open(directory+'Exercise_4_Application_Regular_Expressions_clean.txt','w',encoding="utf-8")
output_file.write(text)
input_file.close()
output_file.close()
print("DONE")


@ -0,0 +1,209 @@
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 12 15:50:22 2016
@author: Alexander Hillert, Goethe University Frankfurt
"""
import re
from bs4 import BeautifulSoup
# Please adjust the directory to your machine.
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# Open the 10-K
input_file=open(directory+'0000950130-98-001359.txt','r',encoding='ascii',errors='ignore')
input_text=input_file.read()
################################
# Remove tables
# Same approach as in Problem 4
################################
# Sometimes it is helpful to print the text parts that are deleted. In this
# example, we will print the first two tables that we delete.
i=1
table_match=re.search('<TABLE>', input_text)
while table_match:
# Search for the beginning of the table
table_start_match=re.search('<TABLE>', input_text)
start_table=table_start_match.start()
# search for the end of the table
table_end_match=re.search('</TABLE>', input_text)
end_table=table_end_match.end()
# The if condition and the printing are just for illustrative purposes.
# The commands display the first two tables that are removed from the text.
if i<=2:
print("This is the "+str(i)+". Table in the 10-K.\n"+input_text[start_table:end_table]+"\n")
i=i+1
# remove the table
input_text=input_text[:start_table]+input_text[end_table:]
# check whether there are further tables
table_match=re.search('<TABLE>', input_text)
################################
# Remove exhibits
# Same approach as in Problem 4
################################
# Exhibits have the following structure
# <DOCUMENT>
# <TYPE>EX...
# ...
# </DOCUMENT>
# Sometimes it is helpful to print the text parts that are deleted. In this
# example, we will print the first exhibit that we delete.
i=1
exhibit_match=re.search('<TYPE>EX', input_text)
while exhibit_match:
# Search for the beginning of the exhibit
exhibit_start_match=re.search('<TYPE>EX', input_text)
start_exhibit=exhibit_start_match.start()
# Search for the end of the exhibit
# CAUTION: search only in the text after the beginning of the exhibit, as
# </DOCUMENT> also appears earlier (e.g. end of main document)
exhibit_end_match=re.search('</DOCUMENT>', input_text[start_exhibit:])
end_exhibit=start_exhibit+exhibit_end_match.end()
if i<=1:
print("This is the "+str(i)+". Exhibit in the 10-K.\n"+input_text[start_exhibit:end_exhibit]+"\n")
i=i+1
# remove exhibit
input_text=input_text[:start_exhibit]+input_text[end_exhibit:]
exhibit_match=re.search('<TYPE>EX', input_text)
##################
# Remove html code
##################
html_text=BeautifulSoup(input_text, 'html.parser')
text=html_text.get_text()
############################
# Remove the Document Header
############################
# There are different ways to define the start of the main part of the text.
# In general, you should delete all text that is uninformative for your analysis.
# Alternative 1:
# Search for Table of Contents. To not mistakenly match a reference to the
# table of contents somewhere in the text, we require a linebreak before and after.
# When the "Table of Contents" is centered, there will be whitespaces or tabs
# before and potentially also after
header_match=re.search('(?i)\n[\t ]{0,}table[\t ]of[\t ]contents[\t ]{0,}\n', text)
# Alternative 2:
# Search for Documents incorporated by reference.
header_match=re.search('\n[\t ]{0,}DOCUMENTS[\t ]INCORPORATED[\t ]BY[\t ]REFERENCE[\t ]{0,}\n', text)
if header_match:
# Drop the document header and keep only the rest of the text after the header.
text=text[header_match.end():]
#################################################
# Delete the text in "PART IV"
# This procedure is optional. Look at "Part IV" and decide whether you favor
# the approach. I think that the part should be dropped, as it is just a list
# of exhibits and some mandatory text required by the SEC [indicated by the
# capital letters in the "SIGNATURES" section].
#################################################
'''
# Alternative 1: go over all matches but keep only the last one
for match in re.finditer('\s{2,}PART IV\s{0,}\n', text):
print("Hallo")
# match now contains the last match
# Delete the text after the last match
text=text[:match.start()]
# Alternative 2: save the positions of all matches (more general approach)
# to use alternative 2, you have to comment out Alternative 1!
# Otherwise line 104 will create a problem when you execute Alternative 2.
list_start_matches=[]
list_end_matches=[]
for match in re.finditer('\s{2,}PART IV\s{0,}\n', text):
print(match)
list_start_matches.append(match.start())
list_end_matches.append(match.end())
# Position of last match
print(list_start_matches[len(list_start_matches)-1])
print(list_end_matches[len(list_start_matches)-1])
# Alternative 3: manual coding using a loop of re.searches
# create a copy of the text that we can edit
text_check_part_IV=text
part_IV_match=re.search('\s{2,}PART IV\s{0,}\n', text_check_part_IV)
# create two lists that we can use to save the start and end positions
# of the Part IV matches
list_start_matches_v2=[]
list_end_matches_v2=[]
# variable to save the position of the last match in the overall text
end_position_previous_match=0
while part_IV_match:
start_position_match=end_position_previous_match+part_IV_match.start()
end_position_match=end_position_previous_match+part_IV_match.end()
list_start_matches_v2.append(start_position_match)
list_end_matches_v2.append(end_position_match)
# update the information on the end of the last match
end_position_previous_match=end_position_previous_match+part_IV_match.end()
text_check_part_IV=text_check_part_IV[part_IV_match.end():]
part_IV_match=re.search('\s{2,}PART IV\s{0,}\n', text_check_part_IV)
# when you compare list_end_matches to list_end_matches_v2, you see that the two
# approaches yield the same result.
# To double check that the approaches have the same results, you could
# replace the Regex in lines 112, 124, and 142 by "\s{2,}PART [A-Z]{1,3}\s{0,}\n".
# In that case you have more matches and so you can better check that the
# two approaches have identical outcomes.
'''
'''
# Delete the text after the last match
text=text[:list_start_matches[len(list_start_matches)-1]]
'''
# Delete item numbers
# This is optional. It removes "Item 1.", "ITEM 1.", "Item 10.", "Item 7A."
text=re.sub('(?i)Item [0-9]{1,}A{0,1}(\s|\.|:|\n)','',text)
# Delete numbers
text=re.sub('[0-9]{1,}(,[0-9]{3}){0,}(\.[0-9]{1,}){0,1}','',text)
# Alternative stepwise procedure to delete numbers
# remove commas in numbers, e.g., 1,000 or 12,345,678
text=re.sub('[0-9]{1,3},([0-9]{3},){0,}[0-9]{3}','',text)
# remove dots in numbers, e.g., 34.56 or 12,345.678 (-> previous command leaves .678)
text=re.sub('[0-9]{0,}\.[0-9]{1,}','',text)
# remove the remaining numbers without commas and dots
text=re.sub('[0-9]','',text)
# Hyphens can be used to indicate that the word is continued in the next
# line. For example, "Micro-\nsoft" (\n is the line feed).
# Delete hyphens that are followed by a line feed.
text=re.sub('-\n','',text)
# Replace symbols by a whitespace.
# Extra whitespaces are not a problem.
text=re.sub('\(|\)|\[|\]|\$|§|%|\*|/|·|-',' ',text)
# Delete dots and commas that are not part of sentences, i.e. commas and dots
# that are preceded by a line break (potentially also whitespaces and tabs)
# and that are followed by a line break (again, there may
# also be whitespaces and tabs).
text=re.sub('\n[\t ]{0,}(\.|,){1,}[\t ]{0,}\n','\n',text)
# Drop single-character words
# One can argue whether one should implement this procedure. Loughran and
# McDonald argue in one of their papers in favor of it.
# To make sure that there is just one letter, we require a non-word character
# (\W) before and after. We use a positive lookbehind and a positive lookahead
# for this so that the surrounding characters do not get deleted as well.
text=re.sub('(?<=\W)[A-Za-z](?=\W)',' ',text)
# Open the output file for the pure text
output_file=open(directory+'0000950130-98-001359_clean.txt','w',encoding='ascii',errors='ignore')
output_file.write(text)
input_file.close()
output_file.close()
print("COMPLETED.")


@ -0,0 +1,356 @@
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 29 11:07:10 2015
@author: Alexander Hillert, Goethe University Frankfurt
"""
import re
from bs4 import BeautifulSoup
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# Open the csv file containing the list of the 200 10-Ks
input_file=open(directory+'10-K_Sample_2011Q1_Input.csv','r')
input_text=input_file.read()
# Split the input file in separate lines
input_text_line=input_text.split("\n")
# In general, there can be empty lines in the input file. The following command
# deletes these lines
while input_text_line.count("")>0:
input_text_line.remove("")
print("The input file contains "+str(len(input_text_line)-1)+" non-empty lines with data.")
# We subtract 1 from the length, as the first line contains the variable names but no data.
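###########################################################################
# Optional sketch (not part of the original solution): the table, exhibit,
# XML, ZIP, GRAPHIC, COVER, and PDF removal blocks further below all follow
# the same pattern (find a start tag, find the matching end tag, cut out the
# span). They could be collapsed into one helper such as the hypothetical
# function below; it is shown for illustration only and is not called here.
def remove_sections(document_text, start_tag, end_tag):
    # Repeatedly cut out everything between start_tag and the next end_tag.
    match=re.search(start_tag, document_text)
    while match:
        start_position=match.start()
        end_match=re.search(end_tag, document_text[start_position:])
        if end_match is None:
            # no closing tag -> stop to avoid an endless loop
            break
        end_position=start_position+end_match.end()
        document_text=document_text[:start_position]+document_text[end_position:]
        match=re.search(start_tag, document_text)
    return document_text
# Example usage (hypothetical): input_text_10_k=remove_sections(input_text_10_k,'<TABLE>','</TABLE>')
###########################################################################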
# Loop over all lines
for i in range(1,len(input_text_line)):
# To see the progress of your program you can print the number of iteration.
print(str(i))
# split the lines of the CSV-file into the two variables
variables=input_text_line[i].split(";")
# We need the CIK and the filename to open the file
cik=variables[0]
filename=variables[1]
# Open the ith 10-K in the list
input_file_10_k=open(directory+'10-K_Sample/'+cik+'_'+filename,'r',encoding='ascii',errors='ignore')
input_text_10_k=input_file_10_k.read()
# the new file name should be "old_name_clean" -> we have to replace ".txt"
# by "_clean.txt"
filename=filename.replace('.txt','_clean.txt')
# Remove tables
variable=re.search('<TABLE>', input_text_10_k)
while variable:
variable=re.search('<TABLE>', input_text_10_k)
start_table=variable.start()
variable=re.search('</TABLE>', input_text_10_k)
end_table=variable.end()
input_text_10_k=input_text_10_k[:(start_table)]+input_text_10_k[(end_table):]
variable=re.search('<TABLE>', input_text_10_k)
####################### Begin of exhibits removal #########################
# Exhibits have the following structure
# <DOCUMENT>
# <TYPE>EX...
# ...
# </DOCUMENT>
# In recent years, there are also exhibits with <TYPE>EXCEL
# -> as we search for "<TYPE>EX", the loop will delete <TYPE>EXCEL exhibits, too.
variable=re.search('<TYPE>EX', input_text_10_k)
while variable:
variable=re.search('<TYPE>EX', input_text_10_k)
start_exhibit=variable.start()
variable=re.search('</DOCUMENT>', input_text_10_k[start_exhibit:])
end_exhibit=start_exhibit+variable.end()
input_text_10_k=input_text_10_k[:(start_exhibit)]+input_text_10_k[(end_exhibit):]
variable=re.search('<TYPE>EX', input_text_10_k)
# In recent years, there are also XML exhibits.
# CAUTION: These are <TYPE>XML and not <TYPE>EX -> need separate cleaning
# Remove XML-Exhibits, which have the following structure
# <DOCUMENT>
# <TYPE>XML
# ...
# </DOCUMENT>
variable=re.search('<TYPE>XML', input_text_10_k)
while variable:
variable=re.search('<TYPE>XML', input_text_10_k)
start_exhibit=variable.start()
variable=re.search('</DOCUMENT>', input_text_10_k[start_exhibit:])
end_exhibit=start_exhibit+variable.end()
input_text_10_k=input_text_10_k[:start_exhibit]+input_text_10_k[end_exhibit:]
variable=re.search('<TYPE>XML', input_text_10_k)
# Furthermore, in recent years there are also ZIP exhibits.
# CAUTION: These are <TYPE>ZIP and not <TYPE>EX -> need separate cleaning
# Remove ZIP-Exhibits, which have the following structure
# <DOCUMENT>
# <TYPE>ZIP
# ...
# </DOCUMENT>
variable=re.search('<TYPE>ZIP', input_text_10_k)
while variable:
variable=re.search('<TYPE>ZIP', input_text_10_k)
start_exhibit=variable.start()
variable=re.search('</DOCUMENT>', input_text_10_k[start_exhibit:])
end_exhibit=start_exhibit+variable.end()
input_text_10_k=input_text_10_k[:start_exhibit]+input_text_10_k[end_exhibit:]
variable=re.search('<TYPE>ZIP', input_text_10_k)
# In addition, there are many GRAPHIC exhibits.
# CAUTION: These are <TYPE>GRAPHIC and not <TYPE>EX -> need separate cleaning
# Remove GRAPHIC-Exhibits, which have the following structure
# <DOCUMENT>
# <TYPE>GRAPHIC
# ...
# </DOCUMENT>
variable=re.search('<TYPE>GRAPHIC', input_text_10_k)
while variable:
variable=re.search('<TYPE>GRAPHIC', input_text_10_k)
start_exhibit=variable.start()
variable=re.search('</DOCUMENT>', input_text_10_k[start_exhibit:])
end_exhibit=start_exhibit+variable.end()
input_text_10_k=input_text_10_k[:start_exhibit]+input_text_10_k[end_exhibit:]
variable=re.search('<TYPE>GRAPHIC', input_text_10_k)
# Furthermore, there can also be COVER exhibits.
# CAUTION: These are <TYPE>COVER and not <TYPE>EX -> need separate cleaning
# Remove COVER-Exhibits, which have the following structure
# <DOCUMENT>
# <TYPE>COVER
# ...
# </DOCUMENT>
variable=re.search('<TYPE>COVER', input_text_10_k)
while variable:
variable=re.search('<TYPE>COVER', input_text_10_k)
start_exhibit=variable.start()
variable=re.search('</DOCUMENT>', input_text_10_k[start_exhibit:])
end_exhibit=start_exhibit+variable.end()
input_text_10_k=input_text_10_k[:start_exhibit]+input_text_10_k[end_exhibit:]
variable=re.search('<TYPE>COVER', input_text_10_k)
# Furthermore, there can also be PDF files attached.
# These attachments caused BeautifulSoup to crash on some computers.
# Remove PDFs
variable=re.search('<PDF>', input_text_10_k)
while variable:
variable=re.search('<PDF>', input_text_10_k)
start_pdf=variable.start()
variable=re.search('</PDF>', input_text_10_k[start_pdf:])
end_pdf=start_pdf+variable.end()
input_text_10_k=input_text_10_k[:(start_pdf)]+input_text_10_k[(end_pdf):]
variable=re.search('<PDF>', input_text_10_k)
######################## End of exhibits removal ##########################
# Remove Document Header - PART 1
# This condition should work for all 10-K filings as the tags "<SEC-HEADER>"
# and "</SEC-HEADER>" are mandatory for all filings.
variable=re.search('</SEC-HEADER>', input_text_10_k)
if variable:
input_text_10_k=input_text_10_k[variable.end():]
# In some filings, firms do not use line feeds \n but <div> and </div>
# instead to indicate the start and the end of sentences.
# "Dieses allgemeine Element bewirkt nichts weiter als dass es in einer
# neuen Zeile des Fließtextes beginnt."
# see https://wiki.selfhtml.org/wiki/HTML/Textstrukturierung/div
# and
# "The <div> tag defines a division or a section in an HTML document.
# By default, browsers always place a line break before and after the <div> element."
# See: https://www.w3schools.com/tags/tag_div.asp
# It is important to replace <div> and </div> by linefeeds because otherwise
# the entire text will be in a single line and the subsequent commands do
# not work properly.
input_text_10_k=input_text_10_k.replace("<div>", "\n")
input_text_10_k=input_text_10_k.replace("</div>", "\n")
# Remove html code
html_text=BeautifulSoup(input_text_10_k, 'html.parser')
text=html_text.get_text()
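# Rough illustration of the last two steps (comment only, made-up snippet):
# snippet = '<body><div>Item 1.</div><div>Business</div></body>'
# Without the replace() calls, BeautifulSoup(snippet,'html.parser').get_text()
# returns 'Item 1.Business', i.e. both headings end up in one line.
# After replacing <div> and </div> by '\n', get_text() yields
# '\nItem 1.\n\nBusiness\n', so the headings are on separate lines.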
# To get an idea of what the commands below are doing, it is helpful to
# write the current version of the text to a file and then compare it to the
# final file.
filename2=filename.replace('_clean.txt','_without_HtmlTablesExhibits.txt')
# Open the output file for the text without html code and without tables+exhibits
output_file_10_k=open(directory+'10-K_Sample/'+cik+'_'+filename2,'w',encoding='ascii',errors='ignore')
output_file_10_k.write(text)
output_file_10_k.close()
# Remove the Document Header - PART II
# The above command to remove the header ("</SEC-HEADER>") does not capture
# the entire header -> we need to delete further parts at the top of the filing.
# WARNING: The filters below may be specific to this sample of 10-Ks.
# Some firms have line breaks instead of whitespaces -> use "[ \n]" and not just " ".
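# Illustration of why "[ \n]" is needed (comment only, made-up string):
# re.search('(?i)\n {0,}DOCUMENTS[ \n]INCORPORATED[ \n]BY[ \n]REFERENCE {0,}\n',
#           '\nDocuments\nIncorporated by Reference\n')
# still matches because every separator may be either a blank or a line break,
# whereas a pattern with plain blanks would miss the line break after "Documents".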
variable=re.search('(?i)\n {0,}DOCUMENTS[ \n]INCORPORATED[ \n]BY[ \n]REFERENCE {0,}\n', text)
if variable:
text=text[variable.end():]
else:
variable=re.search('(?i)\n {0,}table of contents {0,}\n', text)
if variable:
text=text[variable.end():]
else:
variable=re.search('(?i)\n {0,}Indicate the number of shares outstanding\.{1,}', text)
if variable:
text=text[variable.end():]
else:
variable=re.search('(?i)may be deemed “forwardlooking statements”\.{1,}', text)
if variable:
text=text[variable.end():]
else:
variable=re.search('\nPART\.{1,}', text)
if variable:
text=text[variable.end():]
# Delete Item numbers
text=re.sub('(?i)Item {1,}[0-9]{1,}(A|B){0,1}(\s|\.|:|\n)','',text)
# Delete Part numbers
text=re.sub('(?i)Part (1|2|3|4|III|II|I|IV)','',text)
# Delete numbers:
text=re.sub('[0-9]{1,}(,[0-9]{3}){0,}(\.[0-9]{1,}){0,1}','',text)
# File names, e.g., exhibit.pdf or picture.jpg, should be removed
text=re.sub("[ |\n]\S{1,}\.(pdf|htm|html|doc|jpg|txt|xml)(?=[ \n\.\?!])", "", text)
# URLs --> remove internet addresses
text=re.sub("http:/{0,2}", "", text)
text=re.sub("www\..{1,}\.[a-z]{2,4}(?=[ \n\.\?!])", "", text)
# In Part 4 of the programming chapter, we will determine the number of
# words per sentence. To be able to use the same underlying sample,
# we need to implement further corrections. These changes do not affect
# the percentage of negative/positive/etc. words.
# --> Only relevant for determining the number of sentences
# The text contains dots that do not indicate the end of a sentence.
# E.g., "Inc." and "St."
# The hyphen in the patterns below is allowed because abbreviations can be
# preceded by one, e.g., in "non-U.S.".
# Replace or remove specific abbreviations
# This list is incomplete. In a research project you should spend more time
# on editing the data.
text=re.sub("(?i)(-|\s|\A|,)Inc\.", " Inc", text)
text=re.sub("(?i)(-|\s|\A|,)Corp\.", " Corp", text)
text=re.sub("(?i)(-|\s|\A|,)Ltd\.", " Ltd", text)
text=re.sub("(?i)(-|\s|\A|,)Co\.", " Co", text)
text=re.sub("(?i)(-|\s|\A|,)S\.A\.", " SA", text)
text=re.sub("(?i)(-|\s|\A|,)U\.S\.", " US", text)
text=re.sub("(?i)(-|\s|\A|,)Ms\.", " Ms", text)
text=re.sub("(?i)(-|\s|\A|,)Mr\.", " Mr", text)
text=re.sub("(?i)(-|\s|\A|,)No\.", " Number", text)
text=re.sub("(?i)(-|\s|\A|,)v\.s\.", " vs", text)
text=re.sub("(?i)(-|\s|\A|,)St\.", " ", text)
text=re.sub("(?i)(-|\s|\A|,)Jr\.", " ", text)
text=re.sub("(?i)(\s|\A|,)Jan\.", " January", text)
text=re.sub("(?i)(\s|\A|,)Feb\.", " February", text)
text=re.sub("(?i)(\s|\A|,)Mar\.", " March", text)
text=re.sub("(?i)(\s|\A|,)Apr\.", " April", text)
text=re.sub("(?i)(\s|\A|,)May\.", " May", text)
text=re.sub("(?i)(\s|\A|,)Jun\.", " June", text)
text=re.sub("(?i)(\s|\A|,)Jul\.", " July", text)
text=re.sub("(?i)(\s|\A|,)Aug\.", " August", text)
text=re.sub("(?i)(\s|\A|,)Sep\.", " September", text)
text=re.sub("(?i)(\s|\A|,)Oct\.", " October", text)
text=re.sub("(?i)(\s|\A|,)Nov\.", " November", text)
text=re.sub("(?i)(\s|\A|,)Dec\.", " December", text)
# The sequence capital letter -> dot -> capital letter -> dot indicates an abbreviation
# Three repetitions of capital letter and dot are also common in filings.
# We need to check for three instances first.
text=re.sub("( |\n|,)[A-Z]\.[A-Z]\.[A-Z]\.", " ", text)
# now check for two instances
text=re.sub("( |\n|,)[A-Z]\.[A-Z]\.", " ", text)
# Dots after a single letter can indicate a middle name (e.g., Paul J. Smith)
# or an abbreviation --> also delete these.
text=re.sub("( |\n|,)[A-Z]\.", "", text)
# Hyphens can be used to indicate that the word is continued in the next
# line. For example, "Micro-\nsoft" (\n is the line feed).
# Replace hyphens followed by a line feed by a hyphen without line feed
text=re.sub('-\n','-',text)
# Delete the minus/hyphens
# "Short-term" -> "shortterm"
text=re.sub('-','',text)
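# Illustration using the example from the comment above (comment only):
# re.sub('-\n', '-', 'Micro-\nsoft') -> 'Micro-soft'
# re.sub('-', '', 'Micro-soft')      -> 'Microsoft'
# and, analogously, 'short-term' becomes 'shortterm'.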
# --> Only relevant for determining the number of sentences
# Delete dots and commas that are not part of sentences, i.e. commas and dots
# that are preceded by whitespace or line break and that are followed by
# whitespace or line break.
text=re.sub('\n(\.|,)\n','\n',text)
text=re.sub(' (\.|,) ',' ',text)
# Delete single character words
# One can argue whether one should implement this procedure. Loughran and
# McDonald argue in one of their papers in favor of it.
# To make sure that there is just one letter, we require a non-word character
# (\W) before and after it. We use a positive lookbehind and a positive
# lookahead for this so that the surrounding non-word characters do not get
# deleted as well.
text=re.sub('(?i)(?<=\W)[a-z](?=\W)',' ',text)
# There are sentences that are in upper case letters. However, these are not
# "real" sentences. Examples: "RESTRICTIONS ON TRANSFER OF NOTE."
# or "THIS NOTE AND THE RIGHTS AND OBLIGATIONS EVIDENCED HEREBY ARE
# SUBORDINATED TO THE PRIOR PAYMENT OF CERTAIN OBLIGATIONS [...]"
# We save the edited text in a new variable
text_edited=text
# Split text in sentences
list_sentences=re.split('\.|!|\?', text)
# iterate the list of all sentences
for j in range(0,len(list_sentences)):
# Determine the number of upper case letters
upper_letters=len(re.findall('[A-Z]',list_sentences[j]))
# Determine the number of all letters
total_letters=len(re.findall('[A-Za-z]',list_sentences[j]))
# If there is at least one letter calculate the fraction of upper case letters
if total_letters>0:
ratio=upper_letters/total_letters
# If the fraction of upper case letters is larger than 0.9 delete
# the sentence from the text.
if ratio>0.9:
text_edited=text_edited.replace(list_sentences[j]+'.','')
text_edited=text_edited.replace(list_sentences[j]+'!','')
text_edited=text_edited.replace(list_sentences[j]+'?','')
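# Brief illustration of the filter above (comment only, made-up sentences):
# In 'RESTRICTIONS ON TRANSFER OF NOTE' every letter is upper case, so the
# ratio equals 1.0 > 0.9 and the sentence is dropped from text_edited.
# In 'The notes are subordinated to senior debt' only the first letter is
# upper case, the ratio is far below 0.9, and the sentence is kept.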
# --> Only relevant for determining the number of sentences
# There are a few cases where a dot follows a dot or where a linefeed
# separates two dots. --> delete the second dot.
text_edited=text_edited.replace('..','.')
text_edited=text_edited.replace('.\n.','.')
# The following commands do not influence the subsequent textual analysis.
# The only purpose is to display the output in a nicer format.
# Replace lines that contain only whitespaces by a line feed.
text_edited=re.sub('\n {1,}\n','\n',text_edited)
# Replace multiple line feeds by one line feed.
text_edited=re.sub('\n{2,}','\n',text_edited)
# Open the output file for the pure text
output_file_10_k=open(directory+'10-K_Sample/'+cik+'_'+filename,'w',encoding='ascii',errors='ignore')
output_file_10_k.write(text_edited)
output_file_10_k.close()
input_file_10_k.close()
input_file.close()

View file

@ -0,0 +1,114 @@
# -*- coding: utf-8 -*-
"""
Created on Wed Apr 13 22:43:32 2016
@author: Alexander Hillert, Goethe University Frankfurt
"""
import re
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# Open the dictionary
# The dictionary has been obtained from Bill McDonald's webpage
# http://www3.nd.edu/~mcdonald/Word_Lists.html
# --> LoughranMcDonald_MasterDictionary_2014.xlsx
# --> select negative words and copy them to a txt file
file_word_list=open(directory+'LMD_Neg.txt','r',encoding="utf-8")
word_list=file_word_list.read()
# The LMD words are all in upper case
word_list=word_list.lower()
negative_words=word_list.split('\n')
# Open the csv file containing the list of the 200 10-Ks
input_file=open(directory+'10-K_Sample_2011Q1_Input.csv','r',encoding="utf-8")
input_text=input_file.read()
# Split the input file in separate lines
input_text_line=input_text.split("\n")
# In general, there can be empty lines in the input file. The following command
# deletes these lines.
while input_text_line.count("")>0:
input_text_line.remove("")
# Create output file
output_file=open(directory+'10-K_Sample_2011Q1_Output_Negative_Tone.csv','w',encoding="utf-8")
# Write variable names to the first line of the output file
output_file.write('CIK;Filename;Number_Words;Number_Negative_Words;\
Percentage_Negative_Words\n')
# Loop over all lines of the csv file
for i in range(1,len(input_text_line)):
#for i in range(1,10):
# If the execution of your scripts takes some time, printing the loop iterator
# gives you an impression of the overall progress made.
print(str(i))
# split the line into the two variables
variables=input_text_line[i].split(";")
# We need the CIK (1st column) and the filename (2nd column)
cik=variables[0]
filename=variables[1]
# modify file name to open the edited files
filename=filename.replace('.txt','')
# Open the ith 10-Ks in the list
input_file_10_k=open(directory+'10-K_Sample/'+cik+'_'+filename+'_clean.txt','r',\
encoding='ascii',errors='ignore')
# if the command above does not work (error like "file not found" or "directory not found")
# please use the following command:
#input_file_10_k=open(directory+'10-K_Sample_clean/'+cik+'_'+filename+'_clean.txt','r',encoding='ascii',errors='ignore')
input_text_10_k=input_file_10_k.read()
# Use lower case letters, too
# It is important that the formatting (lower case vs. upper case) of the word list
# and the document is identical. Remember that documents typically contain both
# lower and upper case letters -> modify the text.
text=input_text_10_k.lower()
# Split the text in single words to determine the total number of words
# \W is a non-word character: "Matches any character which is not a Unicode
# word character." (Python documentation)
# this is equivalent to [^a-zA-Z0-9_], i.e. no lower case letters, no upper
# case letters, no numbers, and no underscore.
list_of_words=re.split('\W{1,}', text)
# to make sure that empty list elements do not bias the word count, we delete them.
while list_of_words.count("")>0:
list_of_words.remove("")
# It is important that you treat multiple "\W" as one. Otherwise you are left
# with elements in the list that are not actual words.
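# Example of why consecutive \W characters must be treated as one (comment only):
# re.split('\W{1,}', 'loss, net of taxes.') -> ['loss', 'net', 'of', 'taxes', '']
# re.split('\W',     'loss, net of taxes.') -> ['loss', '', 'net', 'of', 'taxes', '']
# With '\W' alone, the comma followed by a blank produces an extra empty string,
# which would bias the word count if it were not removed.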
# Determine the total number of words
word_count=len(list_of_words)
# Reset the number of negative words to zero
negative_count=0
# For each negative word, count the number of occurrences
for j in range(len(negative_words)):
# the command "list_of_words.count(negative_words[i])" only matches if there
# is exact overlap between the ith negative word and the words in the list.
# For example the following two commands:
# list_of_words=["abandon","abandoned","abandonment"]
# list_of_words.count("abandon")
# yields 1 match
# In contrast,
# text_of_words="abandon abandoned abandonment"
# text_of_words.count("abandon")
# yields 3. Thus, you have to split the text into individual words!!!
negative_count=negative_count+list_of_words.count(negative_words[j])
# Get the percentage of negative words
percentage_negative=negative_count/word_count
# Write cik, file name, total number of words, number of negative words,
# and the percentage of negative words to output file.
output_file.write(cik+';'+filename+'_clean.txt;'+str(word_count)+';'\
+str(negative_count)+';'+str(percentage_negative)+'\n')
# Close filings
input_file_10_k.close()
print("Finished")
output_file.close()
input_file.close()

View file

@ -0,0 +1,130 @@
# -*- coding: utf-8 -*-
"""
Created on Wed Apr 13 22:43:32 2016
@author: Alexander Hillert
"""
import re
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# Open the dictionary
# The dictionary is obtained from Bill McDonald's webpage
# http://www3.nd.edu/~mcdonald/Word_Lists.html
# --> LoughranMcDonald_MasterDictionary_2014.xlsx
# --> select positive words and copy them to a txt file
file_word_list=open(directory+'LMD_Pos.txt','r',encoding="utf-8")
word_list=file_word_list.read()
word_list=word_list.lower()
positive_words=word_list.split()
# Open the csv file containing the list of the 200 10-Ks
input_file=open(directory+'10-K_Sample_2011Q1_Input.csv','r',encoding="utf-8")
input_text=input_file.read()
# Split the input file into separate lines
input_text_line=input_text.split("\n")
# In general, there can be empty lines in the input file. The following command
# deletes these lines.
while input_text_line.count("")>0:
input_text_line.remove("")
# Create output file
output_file=open(directory+'10-K_Sample_2011Q1_Output_Positive_Tone.csv','w',encoding="utf-8")
# Write variable names to the first line of the output file
output_file.write('CIK;Filename;Number_Words;Number_Pos_Words;Number_Pos_Words_adj;'\
+'Percent_Pos_Words;Percent_Pos_Words_adj\n')
# Iterate over the list of the 200 10-K filings.
# Empty lines have already been removed above; start at 1 to skip the line
# with the variable names.
#for i in range(1,len(input_text_line)):
for i in range(1,20): # For illustration only
# If the execution of your scripts takes some time, printing the iterator
# gives you an impression of the overall progress
print(str(i))
# split the line into the two variables
variables=input_text_line[i].split(";")
# We need the CIK (1st column) and the filename (2nd column)
cik=variables[0]
filename=variables[1]
# modify file name to open the edited files
filename=filename.replace('.txt','')
# Open the ith 10-K in the list
input_file_10_k=open(directory+'/10-K_Sample/'+cik+"_"+filename+'_clean.txt','r',\
encoding='ascii',errors='ignore')
# if the command above does not work (error like "file not found" or "directory not found")
# please use the following command:
#input_file_10_k=open(directory+'10-K_Sample_clean/'+cik+'_'+filename+'_clean.txt','r',encoding='ascii',errors='ignore')
input_text_10_k=input_file_10_k.read()
# Use lower case letters, too
# It is important that the formatting (lower case vs. upper case) of the word list
# and the document is identical. Remember that documents typically contain both
# lower and upper case letters -> modify the text.
text=input_text_10_k.lower()
# Split the text in single words to determine the total number of words
list_of_words=re.split('\W{1,}', text)
# to make sure that empty list elements do not bias the word count, we delete them.
while list_of_words.count("")>0:
list_of_words.remove("")
# Determine total number of words
word_count=len(list_of_words)
# Reset the number of positive words and positive words adj. for negations to zero
positive_count=0
positive_count_adj=0
# For each positive word, count the number of occurrences
for j in range(len(positive_words)):
# standard count operation without controlling for negations
positive_words_found=list_of_words.count(positive_words[j])
# Loughran and McDonald (2011, JF, p.44): "We account for simple negation
# only for Fin-Pos words. Simple negation is taken to be observations
# of one of six words (no, not, none, neither, never, nobody) occurring
# within three words preceding a positive word."
# When we have identified positive words we need to search for negations
while positive_words_found>0:
# identify the position of the matched positive word in the list of all words
position_of_word=list_of_words.index(positive_words[j])
# identify the three words before the positive word and add them to a list
# the \ continues the statement on the next line
list_negation=[list_of_words[max(0,position_of_word-3)],\
list_of_words[max(0,position_of_word-2)],list_of_words[max(0,position_of_word-1)]]
# check whether one of the three words in list_negation is a negation
negation_found=list_negation.count('no')+list_negation.count('not')+\
list_negation.count('none')+list_negation.count('neither')+\
list_negation.count('never')+list_negation.count('nobody')
if negation_found==0:
# no negation
positive_count_adj=positive_count_adj+1
positive_count=positive_count+1
else:
# negation
positive_count=positive_count+1
# delete the matched positive word from the list of words
list_of_words[position_of_word]=''
# check whether there are further matches of the jth positive word
positive_words_found=list_of_words.count(positive_words[j])
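# Illustration of the negation adjustment (comment only; made-up sentence and
# assuming that "improve" is on the positive word list):
# list_of_words = ['the', 'outlook', 'is', 'not', 'expected', 'to', 'improve']
# The three words preceding 'improve' are ['not', 'expected', 'to'], which
# contain 'not', so positive_count increases by one while positive_count_adj
# does not.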
# Write cik, file name, total number of words, and number of positive
# and adjusted positive words to the output file
output_file.write(cik+';'+filename+'_clean.txt;'+str(word_count)+';'+\
str(positive_count)+';'+str(positive_count_adj)+';'+str(positive_count/word_count)+\
';'+str(positive_count_adj/word_count)+'\n')
# Close filings
input_file_10_k.close()
print("Finished")
output_file.close()
input_file.close()

View file

@ -0,0 +1,111 @@
# -*- coding: utf-8 -*-
"""
Created on Wed Apr 13 22:43:32 2016
@author: Alexander Hillert, Goethe University Frankfurt
"""
# We split the text into words and sentences using regular expressions
import re
# For comparison, we also include the NLTK tokenizer
from nltk.tokenize import sent_tokenize
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# Open the csv file containing the list of the 200 10-Ks
input_file=open(directory+'10-K_Sample_2011Q1_Input.csv','r',encoding="utf-8")
input_text=input_file.read()
# Create output file
output_file=open(directory+'10-K_Sample_2011Q1_Output_WPS.csv','w',encoding="utf-8")
# Write variable names to the first line of the output file
output_file.write('CIK;Filename;Number_Words;Number_of_Sentences;'\
'Number_of_Sentences_1;Number_of_Sentences_2;Number_of_Sentences_false;'\
'Number_of_Sentences_NLTK;WPS;WPS_1;WPS_2;WPS_false;WPS_NLTK\n')
# Split the input file into separate lines
input_text_line=input_text.split("\n")
# In general, there can be empty lines in the input file. The following command
# deletes these lines.
while input_text_line.count("")>0:
input_text_line.remove("")
# Loop over all lines
for i in range(1,len(input_text_line)):
print(str(i))
# split the line into the two variables
variables=input_text_line[i].split(";")
# We need the CIK and the filename
cik=variables[0]
filename=variables[1]
filename=filename.replace('.txt','')
# Open the ith 10-K in the list
input_file_10_k=open(directory+'10-K_Sample/'+cik+"_"+filename+'_clean.txt','r',\
encoding='ascii',errors='ignore')
text=input_file_10_k.read()
# Determine number of sentences and number of words
# Split the text in words to determine the total number of words
list_of_words=re.split('\W{1,}', text)
# to make sure that empty list elements do not bias the word count, we delete them.
while list_of_words.count("")>0:
list_of_words.remove("")
# Determine total number of words
word_count=len(list_of_words)
# Split the text by symbols that indicate the end of a sentence
# to determine the total number of sentences
list_of_sentences=re.split('[\.!\?]{1,}', text)
while list_of_sentences.count("")>0:
list_of_sentences.remove("")
# Alternative 1:
list_of_sentences_1=re.split('(?:\.|!|\?){1,}', text)
while list_of_sentences_1.count("")>0:
list_of_sentences_1.remove("")
# Alternative 2:
list_of_sentences_2=re.split('\.{1,}|!{1,}|\?{1,}', text)
while list_of_sentences_2.count("")>0:
list_of_sentences_2.remove("")
# Incorrect approach:
# re.split splits the string by the occurrences of the pattern.
# If capturing parentheses, i.e. (), are used in pattern, then the text
# of all groups in the pattern are also returned as part of the resulting list.
# See https://docs.python.org/3/library/re.html#re.split for details
list_of_sentences_false=re.split('(\.|!|\?){1,}', text)
while list_of_sentences_false.count("")>0:
list_of_sentences_false.remove("")
# For comparison, we also include the NLTK tokenizer
list_of_sentences_nltk=sent_tokenize(text)
# Determine total number of sentences
sentence_count=len(list_of_sentences)
sentence_count_1=len(list_of_sentences_1)
sentence_count_2=len(list_of_sentences_2)
sentence_count_false=len(list_of_sentences_false)
sentence_count_nltk=len(list_of_sentences_nltk)
# Ratio of # of words over # of sentences
wps=word_count/sentence_count
wps_1=word_count/sentence_count_1
wps_2=word_count/sentence_count_2
wps_false=word_count/sentence_count_false
wps_nltk=word_count/sentence_count_nltk
# Write cik, file name, total number of words, total number of sentences,
# and WPS to the output file
output_file.write(cik+';'+filename+'_clean.txt;'+str(word_count)+';'+\
str(sentence_count)+';'+str(sentence_count_1)+';'+str(sentence_count_2)+';'+\
str(sentence_count_false)+';'+str(sentence_count_nltk)+';'+str(wps)+';'+\
str(wps_1)+';'+str(wps_2)+';'+str(wps_false)+';'+str(wps_nltk)+'\n')
# Close filing
input_file_10_k.close()
print("Finished")
output_file.close()
input_file.close()