Add programming files
- add the code files provided by the instructor
- the programming/files folder with the data files is NOT included here due to its size
- add a .gitignore file to exclude the data files' folder
This commit is contained in:
parent
65aae9d4f9
commit
a37c87d9c8
38 changed files with 6416 additions and 0 deletions
76
lectures/programming/solutions/Problem_10_Complex_Words.py
Normal file
|
|
@@ -0,0 +1,76 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Wed Apr 13 22:43:32 2016
|
||||
|
||||
@author: Alexander Hillert, Goethe University Frankfurt
|
||||
"""
|
||||
|
||||
import re
|
||||
|
||||
directory="C:/Lehre/Textual Analysis/Programming/Files/"
|
||||
|
||||
# Open the dictionary
|
||||
file_word_list=open(directory+'Complex_Words.txt','r',encoding="utf-8")
|
||||
word_list=file_word_list.read()
|
||||
word_list=word_list.lower()
|
||||
complex_words=word_list.split()
|
||||
|
||||
# Open the csv file containing the list of the 200 10-Ks
|
||||
input_file=open(directory+'10-K_Sample_2011Q1_Input.csv','r',encoding="utf-8")
|
||||
input_text=input_file.read()
|
||||
|
||||
# Create output file
|
||||
output_file=open(directory+'10-K_Sample_2011Q1_Output_Complex_Tone.csv','w',encoding="utf-8")
|
||||
output_file.write('CIK;Filename;Number_Words;Number_Complex_Words;Percent_Complex_Words\n')
|
||||
|
||||
# Split the input file in separate lines
|
||||
input_text_line=input_text.split("\n")
|
||||
|
||||
# In general, there can be empty lines in the input file. The following command
|
||||
# deletes these lines.
|
||||
while input_text_line.count("")>0:
|
||||
input_text_line.remove("")
|
||||
|
||||
# Loop over all lines
|
||||
for i in range(1,len(input_text_line)):
|
||||
print(str(i))
|
||||
# split the line into the two variables
|
||||
variables=input_text_line[i].split(";")
|
||||
# We need the CIK and the filename
|
||||
cik=variables[0]
|
||||
filename=variables[1]
|
||||
filename=filename.replace('.txt','')
|
||||
|
||||
# Open the ith 10-K in the list
|
||||
input_file_10_k=open(directory+'10-K_Sample/'+cik+"_"+filename+'_clean.txt','r',\
|
||||
encoding='ascii',errors='ignore')
|
||||
input_text_10_k=input_file_10_k.read()
|
||||
|
||||
# Use lower case letters
|
||||
text=input_text_10_k.lower()
|
||||
|
||||
# Split the text in words to determine the total number of words
|
||||
list_of_words=re.split(r'\W{1,}', text)
|
||||
# to make sure that empty list elements do not bias the word count, we delete them.
|
||||
while list_of_words.count("")>0:
|
||||
list_of_words.remove("")
|
||||
|
||||
# Determine total number of words
|
||||
word_count=len(list_of_words)
|
||||
|
||||
# Reset the number of complex words to zero
|
||||
complex_count=0
|
||||
# For each complex word, count the number of occurrences
|
||||
for j in range(len(complex_words)):
|
||||
complex_count=complex_count+list_of_words.count(complex_words[j])
|
||||
|
||||
# Write cik, file name, total number of words, and number of complex words to output file
|
||||
output_file.write(cik+';'+filename+'_clean.txt;'+str(word_count)+';'\
|
||||
+str(complex_count)+';'+str(complex_count/word_count)+'\n')
|
||||
|
||||
# Close filings
|
||||
input_file_10_k.close()
|
||||
|
||||
print("Finished")
|
||||
output_file.close()
|
||||
input_file.close()
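# Editor's note: illustrative sketch only (not part of the original solution).
# The counting loop above scans the word list once per dictionary entry;
# collections.Counter gives the same counts in a single pass over the words.
import collections
example_words=["the","acquisition","of","the","acquisition","failed"]
example_complex=["acquisition","litigation"]
example_counts=collections.Counter(example_words)
print("Example complex word count: "+str(sum(example_counts[w] for w in example_complex)))  # -> 2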
|
||||
|
|
@@ -0,0 +1,53 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Wed Jul 29 11:07:10 2015
|
||||
|
||||
|
||||
@author: Alexander Hillert, Goethe University Frankfurt
|
||||
"""
|
||||
|
||||
# To determine file size we need the OS package
|
||||
import os
|
||||
|
||||
directory="C:/Lehre/Textual Analysis/Programming/Files/"
|
||||
|
||||
# Open the csv file containing the list of the 200 10-Ks
|
||||
input_file=open(directory+'10-K_Sample_2011Q1_Input.csv','r',encoding="utf-8")
|
||||
input_text=input_file.read()
|
||||
|
||||
# Create output file
|
||||
output_file=open(directory+'10-K_Sample_2011Q1_File_Size.csv','w',encoding="utf-8")
|
||||
output_file.write('CIK;Filename;File_size_gross;File_size_net\n')
|
||||
|
||||
# Split the input file in separate lines
|
||||
input_text_line=input_text.split("\n")
|
||||
|
||||
# In general, there can be empty lines in the input file. The following command
|
||||
# deletes these lines.
|
||||
while input_text_line.count("")>0:
|
||||
input_text_line.remove("")
|
||||
|
||||
# Loop over all lines
|
||||
for i in range(1,len(input_text_line)):
|
||||
print(str(i))
|
||||
# split the line into the two variables
|
||||
variables=input_text_line[i].split(";")
|
||||
# We need the CIK and the filename
|
||||
cik=variables[0]
|
||||
filename=variables[1]
|
||||
filename=filename.replace('.txt','')
|
||||
|
||||
# File size of the complete submission file (gross file size)
|
||||
# You have to divide the result by 1024 to get the size in kilobytes
|
||||
# The file size will be affected by html code and exhibits.
|
||||
size_gross=os.path.getsize(directory+'10-K_Sample/'+cik+"_"+filename+'.txt')/1024
|
||||
|
||||
# File size of the main text file (net file size)
|
||||
# You have to divide the result by 1024 to get the size in kilobytes
|
||||
size_net=os.path.getsize(directory+'10-K_Sample/'+cik+"_"+filename+'_clean.txt')/1024
|
||||
|
||||
output_file.write(cik+';'+filename+';'+str(size_gross)+';'+str(size_net)+'\n')
|
||||
|
||||
print("Finished")
|
||||
output_file.close()
|
||||
input_file.close()
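# Editor's note: minimal illustration (not part of the original solution) of
# os.path.getsize, which returns the size in bytes; dividing by 1024 gives kilobytes.
# The demo file below is written to the working directory purely for this example.
demo_file=open("size_demo.txt","w",encoding="utf-8")
demo_file.write("x"*2048)
demo_file.close()
print("Demo file size in KB: "+str(os.path.getsize("size_demo.txt")/1024))  # -> 2.0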
|
||||
167
lectures/programming/solutions/Problem_12_Most_Frequent_Words.py
Normal file
|
|
@@ -0,0 +1,167 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Tue Jul 11 09:19:54 2017
|
||||
|
||||
@author: Alexander Hillert, Goethe University Frankfurt
|
||||
"""
|
||||
|
||||
# We need regular expressions and counters (->collections)
|
||||
import re
|
||||
import collections
|
||||
# for the bigram part, the sentence tokenizer is helpful
|
||||
from nltk.tokenize import sent_tokenize
|
||||
|
||||
directory="C:/Lehre/Textual Analysis/Programming/Files/"
|
||||
|
||||
# Open the csv file containing the list of the 10-Ks from MSFT, KO, and 3M.
|
||||
input_file=open(directory+'list_10-K_filings_textual_similarity.csv','r',encoding="utf-8")
|
||||
input_text=input_file.read()
|
||||
|
||||
# Split the input file in separate lines
|
||||
input_text_line=input_text.split("\n")
|
||||
|
||||
# In general, there can be empty lines in the input file. The following command
|
||||
# deletes these lines.
|
||||
while input_text_line.count("")>0:
|
||||
input_text_line.remove("")
|
||||
|
||||
# Create an empty counter variable
|
||||
words_counter=collections.Counter()
|
||||
# variable is needed only for an alternative solution
|
||||
words_counter1=collections.Counter()
|
||||
|
||||
# counter for the extra task
|
||||
bigram_counter=collections.Counter()
|
||||
|
||||
|
||||
# Loop over all lines
|
||||
for i in range(1,len(input_text_line)):
|
||||
print(str(i))
|
||||
# split the line into the eight variables
|
||||
variables=input_text_line[i].split(";")
|
||||
# We need the CIK (1st column) and the filename (8th column)
|
||||
cik=variables[0]
|
||||
filename_parts=re.split('/',variables[7])
|
||||
filename=filename_parts[3].replace('.txt','')
|
||||
|
||||
# Open the ith 10-K in the list; remember to specify the encoding
|
||||
# The files are available in the zip file "10-K_Textual_Similarity_edited.zip".
|
||||
input_file_10_k=open(directory+'10-K_Textual_Similarity/'+cik+'_'+\
|
||||
filename+'_edited.txt', 'r', encoding='ascii', errors='ignore')
|
||||
# if the command above does not work (error like "file not found" or "directory not found")
|
||||
# please use the following command:
|
||||
#input_file_10_k=open(directory+'10-K_Textual_Similarity_edited/'+cik+'_'+filename+'_edited.txt','r',encoding='ascii',errors='ignore')
|
||||
|
||||
# read the content from the file
|
||||
input_text_10_k=input_file_10_k.read()
|
||||
|
||||
# use lower case only so that it does not matter whether a word is at
|
||||
# the beginning of a sentence ("The") or within a sentence ("the").
|
||||
# Please note that this can be problematic, e.g. "US" -> United States vs.
|
||||
# us (personal pronoun)
|
||||
input_text_10_k_lower=input_text_10_k.lower()
|
||||
|
||||
# Split text into words
|
||||
list_of_words=re.split(r'\W{1,}',input_text_10_k_lower)
|
||||
# There can be empty ("") list elements -> remove them
|
||||
while list_of_words.count("")>0:
|
||||
list_of_words.remove("")
|
||||
|
||||
# optional commands to remove words that only contain "_"
|
||||
'''
|
||||
for word in list_of_words:
|
||||
if re.sub("[a-zA-Z]","",word)!="":
|
||||
#if word.count("_")>0:
|
||||
list_of_words.remove(word)
|
||||
'''
|
||||
|
||||
# Add the words to our counter
|
||||
words_counter=words_counter+collections.Counter(list_of_words)
|
||||
# alternative solution
|
||||
words_counter1.update(list_of_words)
|
||||
|
||||
|
||||
#############################################
|
||||
# optional part for the extra task on bigrams
|
||||
#############################################
|
||||
|
||||
# create an empty list for the bigrams
|
||||
bigram_list=[]
|
||||
|
||||
# split the text into sentences
|
||||
list_of_sentences=sent_tokenize(input_text_10_k)
|
||||
|
||||
# create the bigrams within each sentence
|
||||
for sentence in list_of_sentences:
|
||||
|
||||
# make the sentence lower case
|
||||
sentence_lower=sentence.lower()
|
||||
|
||||
# split the sentence into words
|
||||
list_of_words=re.split(r"\W{1,}",sentence_lower)
|
||||
|
||||
# remove empty elements
|
||||
while list_of_words.count("")>0:
|
||||
list_of_words.remove("")
|
||||
|
||||
#print("these are the words of the sentence:\n"+str(list_of_words))
|
||||
|
||||
# go over all potential two word combinations in the sentence.
|
||||
for word_number in range(0,len(list_of_words)-1):
|
||||
bigram_list.append(list_of_words[word_number]+' '+list_of_words[word_number+1])
|
||||
|
||||
bigram_counter=bigram_counter+collections.Counter(bigram_list)
|
||||
# end of extra task
|
||||
|
||||
|
||||
# Close the 10-K filing
|
||||
input_file_10_k.close()
|
||||
|
||||
input_file.close()
|
||||
|
||||
######################
|
||||
# Top 100 single words
|
||||
######################
|
||||
# Open the csv file containing the 100 most frequently used words
|
||||
output_file=open(directory+'Problem_12_100_most_frequent_words.csv','w',encoding="utf-8")
|
||||
output_file.write("rank;word;count\n")
|
||||
|
||||
# Get the 100 most frequent words
|
||||
top_100_words=words_counter.most_common(100)
|
||||
# for the alternative solution
|
||||
#top_100_words=words_counter1.most_common(100)
|
||||
|
||||
# Write the 100 most frequent words to the csv file.
|
||||
# Remember Python starts counting at 0, while humans start at 1.
|
||||
# So, the most frequent word (rank 1 in human counting) is element 0 for Python.
|
||||
# Consequently, to get a consistent table, we must use the value i for the rank
|
||||
# but call the element i-1.
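# Editor's note: tiny illustrative check of the rank/index offset (not part of
# the original solution); Counter("aabbbc") counts the characters of a toy string.
example_counter=collections.Counter("aabbbc")
print(example_counter.most_common(2))  # -> [('b', 3), ('a', 2)]: rank i is list element i-1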
|
||||
for i in range(1,101):
|
||||
output_file.write(str(i)+";"+str(top_100_words[i-1][0])+";"+\
|
||||
str(top_100_words[i-1][1])+"\n")
|
||||
|
||||
# Close the csv file
|
||||
output_file.close()
|
||||
|
||||
|
||||
######################
|
||||
# Extra task
|
||||
# Top 100 bigrams
|
||||
######################
|
||||
# Open the csv file containing the 100 most frequently used BIGRAMS
|
||||
output_file_bigram=open(directory+'Problem_12_100_most_frequent_bigrams.csv','w',encoding="utf-8")
|
||||
output_file_bigram.write("rank;word;count\n")
|
||||
|
||||
# Get the 100 most frequent words
|
||||
top_100_bigrams=bigram_counter.most_common(100)
|
||||
|
||||
# Write the 100 most frequent bigrams to the csv file -> same approach as for the single words.
|
||||
for i in range(1,101):
|
||||
output_file_bigram.write(str(i)+";"+str(top_100_bigrams[i-1][0])+";"+\
|
||||
str(top_100_bigrams[i-1][1])+"\n")
|
||||
|
||||
# Close the csv file
|
||||
output_file_bigram.close()
|
||||
|
||||
|
||||
print("Task done!")
|
||||
96
lectures/programming/solutions/Problem_13_Stemming.py
Normal file
|
|
@@ -0,0 +1,96 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Wed Jul 29 11:07:10 2015
|
||||
|
||||
@author: Alexander Hillert, Goethe University Frankfurt
|
||||
"""
|
||||
|
||||
# We need regular expressions, tokenization (to identify words), and stemming.
|
||||
import re
|
||||
from nltk.tokenize import word_tokenize
|
||||
from nltk.stem import PorterStemmer
|
||||
|
||||
|
||||
directory="C:/Lehre/Textual Analysis/Programming/Files/"
|
||||
|
||||
# Open the csv file containing the list of the 10-Ks from MSFT, KO, and 3M.
|
||||
input_file=open(directory+'list_10-K_filings_textual_similarity.csv','r',encoding="utf-8")
|
||||
input_text=input_file.read()
|
||||
|
||||
# Split the Input File in separate lines
|
||||
input_text_line=input_text.split("\n")
|
||||
|
||||
# In general, there can be empty lines in the input file. The following command
|
||||
# deletes these lines.
|
||||
while input_text_line.count("")>0:
|
||||
input_text_line.remove("")
|
||||
|
||||
# Loop over all lines
|
||||
#for i in range(1,len(input_text_line)):
|
||||
# for illustration filings 1 to 3 only
|
||||
for i in range(1,4):
|
||||
print(str(i))
|
||||
# split the line into the eight variables
|
||||
variables=input_text_line[i].split(";")
|
||||
# We need the CIK (1st column) and the filename (8th column)
|
||||
cik=variables[0]
|
||||
filename_parts=re.split('/',variables[7])
|
||||
filename=filename_parts[3].replace('.txt','')
|
||||
|
||||
# Open the ith 10-K in the list; remember to specify the encoding
|
||||
input_file_10_k=open(directory+'10-K_Textual_Similarity/'+cik+'_'+filename\
|
||||
+'_edited.txt', 'r', encoding='ascii', errors='ignore')
|
||||
# Get the text of the 10-K
|
||||
input_text_10_k=input_file_10_k.read()
|
||||
|
||||
# We need to tokenize the text because stem only works on a word by word basis.
|
||||
# Stemming an entire document without splitting into words does not work!
|
||||
# The problem is that \n gets lost in this process --> we cannot easily
|
||||
# recreate the document.
|
||||
# Idea: replace \n by \n followed by an indicator that there was a line break.
|
||||
# Here, I choose "LINEBREAKMARK"
|
||||
input_text_10_k=input_text_10_k.replace("\n","\nLINEBREAKMARK ")
|
||||
|
||||
# Split text into words
|
||||
# There are two alternatives.
|
||||
# Alternative 1 (our standard approach):
|
||||
#word_list=re.split("\W{1,}",input_text_10_k.lower())
|
||||
# Alternative 2 (keeps symbols like ,;.):
|
||||
word_list=word_tokenize(input_text_10_k.lower())
|
||||
|
||||
|
||||
# Stem the text
|
||||
text_stemmed=''
|
||||
for word in word_list:
|
||||
# The following two cases are designed to improve the formatting of the
|
||||
# output file. It is not needed for the subsequent analyses.
|
||||
|
||||
# Case 1: 'word' is not an actual word but a symbol. -> there should
|
||||
# be no whitespace between the previous words and this symbol.
|
||||
# \A and \Z indicate the beginning and end of string -> the 'word' is just
|
||||
# the symbol but not a combination of letters and symbols.
|
||||
|
||||
if re.search(r"\A[\.\?!,:;']{1,}\Z",word):
|
||||
text_stemmed=text_stemmed+word
|
||||
# Case 2: the word is an actual word -> have a whitespace included.
|
||||
else:
|
||||
text_stemmed=text_stemmed+" "+PorterStemmer().stem(word)
|
||||
|
||||
# The simple solution (without restoring the formatting of the text) is:
|
||||
#text_stemmed=text_stemmed+" "+PorterStemmer().stem(word)
|
||||
|
||||
|
||||
# To recreate the text, we need to replace the line break indicators by \n
|
||||
# Because of the lower-casing and stemming, "LINEBREAKMARK" becomes "linebreakmark".
|
||||
text_stemmed=text_stemmed.replace("linebreakmark","\n")
|
||||
|
||||
|
||||
# Open the output file for the stemmed text
|
||||
output_file_10_k=open(directory+'10-K_Textual_Similarity/'+cik+'_'+filename\
|
||||
+'_stemmed.txt', 'w', encoding='ascii', errors='ignore')
|
||||
output_file_10_k.write(text_stemmed)
|
||||
output_file_10_k.close()
|
||||
input_file_10_k.close()
|
||||
|
||||
input_file.close()
|
||||
print("Task done!")
|
||||
287
lectures/programming/solutions/Problem_14_Jaccard_Similarity.py
Normal file
|
|
@@ -0,0 +1,287 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Wed Jul 29 11:07:10 2015
|
||||
|
||||
@author: Alexander Hillert, Goethe University Frankfurt
|
||||
"""
|
||||
|
||||
import re
|
||||
from nltk.tokenize import word_tokenize
|
||||
from nltk.corpus import stopwords
|
||||
from nltk.stem import PorterStemmer
|
||||
from collections import Counter
|
||||
|
||||
|
||||
ps=PorterStemmer()
|
||||
|
||||
directory="C:/Lehre/Textual Analysis/Programming/Files/"
|
||||
|
||||
# Open the csv file containing the list of the 10-Ks from MSFT, KO, and 3M
|
||||
input_file=open(directory+'list_10-K_filings_textual_similarity.csv','r',encoding="utf-8")
|
||||
input_text=input_file.read()
|
||||
|
||||
# Split the input file in separate lines
|
||||
input_text_line=input_text.split("\n")
|
||||
|
||||
# In general, there can be empty lines in the input file. The following command
|
||||
# deletes these lines
|
||||
while input_text_line.count("")>0:
|
||||
input_text_line.remove("")
|
||||
|
||||
# Open the output csv file in which we write the similarities
|
||||
output_file=open(directory+'list_10-K_filings_textual_similarity_jaccard.csv','w',encoding="utf-8")
|
||||
# Write variable names to first line
|
||||
output_file.write(input_text_line[0]+';Jaccard;Jaccard_own_stop_words;\
|
||||
Jaccard_NLTK_stop_words;Jaccard_stemmed;Jaccard_stemmed_own_stop_words;\
|
||||
Jaccard_stemmed_NLTK_stop_words\n')
|
||||
|
||||
# Read own stop word list
|
||||
# This list has been created by manually selecting words from the csv-file
|
||||
# 100_most_frequent_words.csv, which is created by the Python program
|
||||
# "Problem_12_Most_Frequent_Words.py".
|
||||
# Simply delete words you consider to be meaningless and that are frequently
|
||||
# used.
|
||||
stop_word_file=open(directory+'Stop_Word_List_Alexander.csv','r',encoding="utf-8")
|
||||
stop_word_text=stop_word_file.read()
|
||||
stop_word_line=stop_word_text.split("\n")
|
||||
stop_word_line.remove("")
|
||||
own_stop_words=[""]
|
||||
for i in range(1,len(stop_word_line)):
|
||||
stop_word=stop_word_line[i].split(";")[1]
|
||||
own_stop_words.append(stop_word)
|
||||
|
||||
own_stop_words.remove("")
|
||||
print("This is the list of my stop words:")
|
||||
print(own_stop_words)
|
||||
|
||||
# Read NLTK stop word list
|
||||
NLTK_stop_words=set(stopwords.words("english"))
|
||||
print("This is the list of NLTK stop words:")
|
||||
print(NLTK_stop_words)
|
||||
|
||||
# set default values for variables
|
||||
# It is not required. However, if you don't do it Spyder will suggest that line
|
||||
# jaccard_similarity=jaccard(word_list_edited,word_list_old_edited)
|
||||
# is incorrect, as word_list_old_edited is not yet defined at this point in the program
|
||||
# code. In this specific example, this will not cause an error, as we do not enter
|
||||
# the if condition when i=1.
|
||||
word_list_old_edited=[]
|
||||
word_list_edited=[]
|
||||
word_list_old_NLTK_filtered=""
|
||||
word_list_old_own_filtered=""
|
||||
word_list_old_edited_stemmed=""
|
||||
word_list_old_own_filtered_stemmed=""
|
||||
word_list_old_NLTK_filtered_stemmed=""
|
||||
|
||||
#######################################################
|
||||
# Define a function that computes Jaccard similarity
|
||||
# As we need these operations several times, it is
|
||||
# helpful to use a function.
|
||||
######################################################
|
||||
# beginning of the function
|
||||
def jaccard(text1,text2):
|
||||
counter1=Counter(text1)
|
||||
counter2=Counter(text2)
|
||||
|
||||
intersection=counter1 & counter2
|
||||
union=counter1 | counter2
|
||||
|
||||
return len(intersection)/len(union)
|
||||
# end of the function
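# Editor's note: quick illustrative check of the function above (not part of the
# original solution). len() of a Counter is the number of distinct keys, so the
# function returns the Jaccard similarity of the sets of unique tokens:
# here intersection {'b'} and union {'a','b','c'} -> 1/3.
print("Jaccard example: "+str(jaccard(["a","b","b"],["b","b","c"])))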
|
||||
|
||||
|
||||
# Loop over all lines
|
||||
for i in range(1,len(input_text_line)):
|
||||
print(str(i))
|
||||
# split the line into the eight variables
|
||||
variables=input_text_line[i].split(";")
|
||||
# We need the CIK (1st column) and the filename (8th column)
|
||||
cik=variables[0]
|
||||
filename_parts=re.split('/',variables[7])
|
||||
filename=filename_parts[3].replace('.txt','')
|
||||
|
||||
# Write the information from the input file to the output file
|
||||
# we do not add a line break at the end, as we must append the similarity
|
||||
# score first.
|
||||
output_file.write(input_text_line[i])
|
||||
|
||||
# Open the ith 10-K; remember to specify the encoding
|
||||
input_file_10_k=open(directory+'10-K_Textual_Similarity/'+cik+'_'+filename+\
|
||||
'_edited.txt', 'r', encoding='ascii', errors='ignore')
|
||||
input_text_10_k=input_file_10_k.read()
|
||||
|
||||
# check whether the previous entry of the list is from the same firm
|
||||
permco=input_text_line[i].split(";")[1]
|
||||
permco_old=input_text_line[i-1].split(";")[1]
|
||||
|
||||
# Split text into words
|
||||
word_list_edited=word_tokenize(input_text_10_k.lower())
|
||||
|
||||
|
||||
############################################
|
||||
# Sub Task 1: Jaccard for the _edited.txt
|
||||
############################################
|
||||
# compute Jaccard similarity if the previous filing is from the same firm
|
||||
if permco==permco_old:
|
||||
# the command calls the jaccard function that we have defined above.
|
||||
# in the function, text1=word_list_edited and text2=word_list_old_edited.
|
||||
jaccard_similarity=jaccard(word_list_edited,word_list_old_edited)
|
||||
|
||||
output_file.write(";"+str(jaccard_similarity))
|
||||
else:
|
||||
# The previous filing is not from the same firm -> cannot compute Jaccard similarity
|
||||
output_file.write(";")
|
||||
|
||||
# Save the current word vector to a separate variable for the comparison of the next report.
|
||||
word_list_old_edited=word_list_edited
|
||||
|
||||
|
||||
############################################
|
||||
# Sub Task 2: Jaccard for the _edited.txt
|
||||
# AND REMOVE STOP WORDS - OWN LIST
|
||||
############################################
|
||||
# remove stop words using personal stop word list
|
||||
word_list_own_filtered=[]
|
||||
for word in word_list_edited:
|
||||
if word not in own_stop_words:
|
||||
word_list_own_filtered.append(word)
|
||||
|
||||
# compute Jaccard similarity if the previous filing is from the same firm
|
||||
if permco==permco_old:
|
||||
jaccard_similarity=jaccard(word_list_own_filtered,\
|
||||
word_list_old_own_filtered)
|
||||
|
||||
output_file.write(";"+str(jaccard_similarity))
|
||||
else:
|
||||
# The previous filing is not from the same firm -> cannot compute Jaccard similarity
|
||||
output_file.write(";")
|
||||
|
||||
# Save the current word vector to a separate variable for the comparison of the next report.
|
||||
word_list_old_own_filtered=word_list_own_filtered
|
||||
|
||||
|
||||
############################################
|
||||
# Sub Task 3: Jaccard for the _edited.txt
|
||||
# AND REMOVE STOP WORDS - NLTK LIST
|
||||
############################################
|
||||
# remove stop words using NLTK stop word list
|
||||
word_list_NLTK_filtered=[]
|
||||
for word in word_list_edited:
|
||||
if word not in NLTK_stop_words:
|
||||
word_list_NLTK_filtered.append(word)
|
||||
|
||||
# compute Jaccard similarity if the previous filing is from the same firm
|
||||
if permco==permco_old:
|
||||
jaccard_similarity=jaccard(word_list_NLTK_filtered,\
|
||||
word_list_old_NLTK_filtered)
|
||||
|
||||
output_file.write(";"+str(jaccard_similarity))
|
||||
else:
|
||||
# The previous filing is not from the same firm -> cannot compute Jaccard similarity
|
||||
output_file.write(";")
|
||||
|
||||
# Save the current word vector to a separate variable for the comparison of the next report.
|
||||
word_list_old_NLTK_filtered=word_list_NLTK_filtered
|
||||
|
||||
|
||||
############################################
|
||||
# Sub Task 4: Jaccard for the _stemmed.txt
|
||||
############################################
|
||||
# Create stemmed text
|
||||
word_list_edited_stemmed=[]
|
||||
for word in word_list_edited:
|
||||
word_list_edited_stemmed.append(ps.stem(word))
|
||||
|
||||
# compute Jaccard similarity if the previous filing is from the same firm
|
||||
if permco==permco_old:
|
||||
jaccard_similarity=jaccard(word_list_edited_stemmed,word_list_old_edited_stemmed)
|
||||
|
||||
output_file.write(";"+str(jaccard_similarity))
|
||||
else:
|
||||
# The previous filing is not from the same firm -> cannot compute Jaccard similarity
|
||||
output_file.write(";")
|
||||
|
||||
# Save the current word vector to a separate variable for the comparison of the next report.
|
||||
word_list_old_edited_stemmed=word_list_edited_stemmed
|
||||
|
||||
|
||||
############################################
|
||||
# Sub Task 5: Jaccard for the _stemmed.txt
|
||||
# AND REMOVE STOP WORDS - OWN LIST
|
||||
############################################
|
||||
# Caution; in general, it is not clear whether you should first stem or
|
||||
# first remove stop words.
|
||||
# However, in this specific case, you should remove the stop words first
|
||||
# and then stem, as your stop word list is based on the inflected text.
|
||||
|
||||
# remove stop words using personal stop word list
|
||||
word_list_own_filtered=[]
|
||||
for word in word_list_edited:
|
||||
if word not in own_stop_words:
|
||||
word_list_own_filtered.append(word)
|
||||
|
||||
# Create stemmed text
|
||||
word_list_own_filtered_stemmed=[]
|
||||
for word in word_list_own_filtered:
|
||||
word_list_own_filtered_stemmed.append(ps.stem(word))
|
||||
|
||||
# compute Jaccard similarity if the previous filing is from the same firm
|
||||
if permco==permco_old:
|
||||
jaccard_similarity=jaccard(word_list_own_filtered_stemmed,\
|
||||
word_list_old_own_filtered_stemmed)
|
||||
|
||||
output_file.write(";"+str(jaccard_similarity))
|
||||
else:
|
||||
# The previous filing is not from the same firm -> cannot compute Jaccard similarity
|
||||
output_file.write(";")
|
||||
|
||||
# Save the current word vector to a separate variable for the comparison of the next report.
|
||||
word_list_old_own_filtered_stemmed=word_list_own_filtered_stemmed
|
||||
|
||||
|
||||
############################################
|
||||
# Sub Task 6: Jaccard for the _stemmed.txt
|
||||
# AND REMOVE STOP WORDS - NLTK LIST
|
||||
############################################
|
||||
# Caution; it is not clear whether you should first stem or first remove
|
||||
# stop words. However, the NLTK stop word list seems to be based on inflected
|
||||
# text, e.g. the word "having" is included. "Having" would be stemmed to "have".
|
||||
# Thus, the stop word list does not seem to be stemmed.
|
||||
# Thus, you should remove the stop words first and then stem.
|
||||
|
||||
# remove stop words using NLTK stop word list
|
||||
word_list_NLTK_filtered=[]
|
||||
for word in word_list_edited:
|
||||
if word not in NLTK_stop_words:
|
||||
word_list_NLTK_filtered.append(word)
|
||||
|
||||
# Create stemmed text
|
||||
word_list_NLTK_filtered_stemmed=[]
|
||||
for word in word_list_NLTK_filtered:
|
||||
word_list_NLTK_filtered_stemmed.append(ps.stem(word))
|
||||
|
||||
# compute Jaccard similarity if the previous filing is from the same firm
|
||||
if permco==permco_old:
|
||||
jaccard_similarity=jaccard(word_list_NLTK_filtered_stemmed,\
|
||||
word_list_old_NLTK_filtered_stemmed)
|
||||
|
||||
output_file.write(";"+str(jaccard_similarity))
|
||||
else:
|
||||
# The previous filing is not from the same firm -> cannot compute Jaccard similarity
|
||||
output_file.write(";")
|
||||
|
||||
# Save the current word vector to a separate variable for the comparison of the next report.
|
||||
word_list_old_NLTK_filtered_stemmed=word_list_NLTK_filtered_stemmed
|
||||
|
||||
|
||||
# Write line break to output file
|
||||
output_file.write("\n")
|
||||
|
||||
# Close 10-K filing
|
||||
input_file_10_k.close()
|
||||
|
||||
input_file.close()
|
||||
output_file.close()
|
||||
stop_word_file.close()
|
||||
print("Task done!")
|
||||
|
||||
|
|
@@ -0,0 +1,161 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Mon Mar 21 09:38:32 2022
|
||||
|
||||
@author: Alexander Hillert, Goethe University Frankfurt
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from sklearn.metrics import mean_squared_error
|
||||
from sklearn.linear_model import RidgeCV
|
||||
from sklearn.linear_model import LassoCV
|
||||
|
||||
|
||||
# adjust the directory to your folder
|
||||
directory="C:/Lehre/Machine Learning/Data/"
|
||||
|
||||
|
||||
# import the data for this problem
|
||||
# NOTE: IT MIGHT TAKE 3 TO 5 MINUTES TO OPEN THE DATA
|
||||
data_frame=pd.read_csv(directory+"form_10-Ks_machine_learning_2007_2008_all_variables_v1.csv",sep=";")
|
||||
# The rows of the data are the Form 10-K filings. Each line is one filing.
|
||||
# The columns are the variables. After some identifying information,
|
||||
# you find the word frequencies, i.e., how often a word (e.g., "the") shows up
|
||||
# in a 10-K (e.g., 100 times)
|
||||
|
||||
|
||||
# WARNING: THE DATA SET IS TOO LARGE TO BE DISPLAYED -> Variable Explorer
|
||||
# and Console will crash.
|
||||
# However, you can pick a small subset of the data and look at it.
|
||||
# It lists all columns (= variables) and the first three observations.
|
||||
data_frame_example=data_frame.head(3)
|
||||
# you can click on this variable in the variable explorer without Spyder crashing.
|
||||
|
||||
# To see the variables included in the data use the following command
|
||||
data_frame_column_names=data_frame.columns
|
||||
# you can click on this variable in the variable explorer without Spyder crashing.
|
||||
# This variable shows all column/variable names in a vector.
|
||||
|
||||
# split the data set into the training and testing data
|
||||
# we use the filings from year 2007 as training data
|
||||
data_frame_train=data_frame[data_frame.year==2007]
|
||||
# and the filings from year 2008 as testing data
|
||||
data_frame_test=data_frame[data_frame.year==2008]
|
||||
|
||||
# put the cumulative abnormal return around the filing date into a new variable.
|
||||
# we follow Loughran and McDonald (2011) and use the CAR from t to t+4.
|
||||
# training data
|
||||
filing_car_train=data_frame_train["excess_ret_t0_t4"]
|
||||
# testing data
|
||||
filing_car_test=data_frame_test["excess_ret_t0_t4"]
|
||||
|
||||
# so far, you have absolute word counts. For example, "loss" is found 5 times.
|
||||
# As the length of the 10-Ks can be different, we scale by the number of words
|
||||
# in the 10-K.
|
||||
document_length_train=data_frame_train["number_of_words"]
|
||||
document_length_test=data_frame_test["number_of_words"]
|
||||
|
||||
|
||||
# the word frequencies are our independent variables -> restrict the data frame
|
||||
# to those variables and drop all variables that are not needed
|
||||
data_frame_train=data_frame_train.drop(columns=["cik","year","month","link","filing_type","filing_date","excess_ret_t0_t4","number_of_words"])
|
||||
data_frame_test=data_frame_test.drop(columns=["cik","year","month","link","filing_type","filing_date","excess_ret_t0_t4","number_of_words"])
|
||||
|
||||
# compute relative frequencies, i.e., divide the absolute word count by document length
|
||||
data_frame_train=data_frame_train.div(document_length_train, axis=0)
|
||||
data_frame_test=data_frame_test.div(document_length_test, axis=0)
|
||||
|
||||
# standardize the data frames
|
||||
# training data
|
||||
data_frame_train_mean=np.mean(data_frame_train,axis=0)
|
||||
data_frame_train_sd=np.std(data_frame_train, axis=0, ddof=1)
|
||||
data_frame_train_standardized=(data_frame_train-data_frame_train_mean)/data_frame_train_sd
|
||||
# testing data
|
||||
data_frame_test_mean=np.mean(data_frame_test,axis=0)
|
||||
data_frame_test_sd=np.std(data_frame_test, axis=0, ddof=1)
|
||||
data_frame_test_standardized=(data_frame_test-data_frame_test_mean)/data_frame_test_sd
|
||||
|
||||
|
||||
# There can be missing values in the standardized variables.
|
||||
# They arise if the word count for a specific word is always zero in the training
|
||||
# or in the testing data. In this case, the standard deviation is zero ->
|
||||
# division by zero -> NaN.
|
||||
# We replace these missing values by zero.
|
||||
# training data
|
||||
data_frame_train_standardized=data_frame_train_standardized.fillna(0)
|
||||
# testing data
|
||||
data_frame_test_standardized=data_frame_test_standardized.fillna(0)
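# Editor's note: toy illustration (not part of the original solution) of the
# standardization and the NaN handling above: a column that is constant in the
# sample has a standard deviation of zero, which produces NaN after the division.
example_df=pd.DataFrame({"loss":[1.0,2.0,3.0],"gain":[2.0,2.0,2.0]})
example_standardized=(example_df-np.mean(example_df,axis=0))/np.std(example_df,axis=0,ddof=1)
print(example_standardized.fillna(0))  # the constant "gain" column becomes all zeros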
|
||||
|
||||
##########################
|
||||
# Ridge regression
|
||||
##########################
|
||||
print("\nRidge regression - Using cross-validation\n")
|
||||
# Regress the CARs on the word frequencies using Ridge regressions with cross-validation.
|
||||
# In this regression, we use the training data.
|
||||
# We use five-fold cross-validation.
|
||||
# Recommendation for initial alphas/lambdas: 100000, 150000, and 200000
|
||||
# The optimal alpha is at around 140000.
|
||||
regression_Ridge_cv=RidgeCV(alphas=[135000,137000,140000,143000,145000], fit_intercept=True,cv=5).fit(data_frame_train_standardized,filing_car_train)
|
||||
|
||||
# get the optimal lambda
|
||||
alpha_optimal_cv=regression_Ridge_cv.alpha_
|
||||
print("The optimal alpha is "+str(alpha_optimal_cv))
|
||||
|
||||
# what is the R2 in the training and testing data?
|
||||
print("The R2 in the training data is: "+str(regression_Ridge_cv.score(data_frame_train_standardized,filing_car_train)))
|
||||
print("The R2 in the testing data is: "+str(regression_Ridge_cv.score(data_frame_test_standardized,filing_car_test)))
|
||||
|
||||
# Mean squared error using the cross-validated model
|
||||
# predict y in the full training sample
|
||||
filing_car_train_predicted_Ridge=regression_Ridge_cv.predict(data_frame_train_standardized)
|
||||
# predict y in the testing sample
|
||||
filing_car_test_predicted_Ridge=regression_Ridge_cv.predict(data_frame_test_standardized)
|
||||
# Determine the MSE
|
||||
print("The MSE in the full training data is: "+str(mean_squared_error(filing_car_train, filing_car_train_predicted_Ridge)))
|
||||
print("The MSE in the testing data is: "+str(mean_squared_error(filing_car_test, filing_car_test_predicted_Ridge)))
|
||||
|
||||
|
||||
######################
|
||||
# LASSO regression
|
||||
######################
|
||||
print("\nLASSO regression - Using cross-validation\n")
|
||||
# Regress the CARs on the word frequencies using LASSO regressions with cross-validation.
|
||||
# In this regression, we use the training data.
|
||||
# We use five-fold cross-validation.
|
||||
# Recommendation for initial alphas/lambdas: 0.5, 1, and 1.5
|
||||
# The optimal alpha is at around 0.86.
|
||||
regression_Lasso_cv=LassoCV(alphas=[0.85,0.86,0.87,0.88,0.89], fit_intercept=True,cv=5).fit(data_frame_train_standardized,filing_car_train)
|
||||
|
||||
# get the optimal lambda
|
||||
alpha_optimal_cv=regression_Lasso_cv.alpha_
|
||||
print("The optimal alpha is "+str(alpha_optimal_cv))
|
||||
|
||||
# get the R2 in the training data
|
||||
print("The R2 in the training data is: "+str(regression_Lasso_cv.score(data_frame_train_standardized,filing_car_train)))
|
||||
# ... and testing data
|
||||
print("The R2 in the testing data is: "+str(regression_Lasso_cv.score(data_frame_test_standardized,filing_car_test)))
|
||||
|
||||
# Mean squared error using the cross-validated model
|
||||
# predict y in the full training sample
|
||||
filing_car_train_predicted_Lasso=regression_Lasso_cv.predict(data_frame_train_standardized)
|
||||
# predict y in the testing sample
|
||||
filing_car_test_predicted_Lasso=regression_Lasso_cv.predict(data_frame_test_standardized)
|
||||
# Determine the MSE
|
||||
print("The MSE in the full training data is: "+str(mean_squared_error(filing_car_train, filing_car_train_predicted_Lasso)))
|
||||
print("The MSE in the testing data is: "+str(mean_squared_error(filing_car_test, filing_car_test_predicted_Lasso)))
|
||||
|
||||
|
||||
############################################################
|
||||
# Compare the betas from the Ridge and the LASSO regressions
|
||||
############################################################
|
||||
output_file=open(directory+"comparison_coefficients_Ridge_LASSO_10-Ks.csv","w",encoding="utf-8")
|
||||
output_file.write("index;word;coefficient_Ridge;coefficient_LASSO\n")
|
||||
|
||||
# get the list of coefficients
|
||||
for i in range (0,len(data_frame_train.columns)):
|
||||
output_file.write(str(i)+';'+data_frame_train.columns[i]+';'+str(regression_Ridge_cv.coef_[i])+';'+str(regression_Lasso_cv.coef_[i])+'\n')
|
||||
|
||||
output_file.close()
|
||||
|
||||
print("Completed!")
|
||||
121
lectures/programming/solutions/Problem_1_Fun_with_Python.py
Normal file
|
|
@@ -0,0 +1,121 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Mon Nov 13 21:40:57 2017
|
||||
|
||||
@author: Alexander Hillert, Goethe University Frankfurt
|
||||
"""
|
||||
|
||||
directory="C:/Lehre/Textual Analysis/Programming/Files/"
|
||||
|
||||
# Task 1: Open and print
|
||||
# Open the Txt-file
|
||||
print("\nTask 1 starts here!\n")
|
||||
input_file=open(directory+'Fun_with_Python.txt','r')
|
||||
input_text=input_file.read()
|
||||
# Alternative with one command
|
||||
input_text=open(directory+'Fun_with_Python.txt','r').read()
|
||||
|
||||
print(input_text)
|
||||
|
||||
# Task 2: Write text to output file
|
||||
# Create file 'More_fun_with_Python.txt'
|
||||
print("\nTask 2 starts here!\n")
|
||||
output_file=open(directory+'More_fun_with_Python.txt','w')
|
||||
output_file.write("Hallo\n")
|
||||
output_file.write(input_text)
|
||||
output_file.close()
|
||||
|
||||
# Task 3: loop
|
||||
print("\nTask 3 starts here!\n")
|
||||
# Alternative 1: While loop
|
||||
i = 1
|
||||
while i<=10:
|
||||
print('Iteration Number: '+str(i))
|
||||
i=i+1
|
||||
# Example of a nested loop
|
||||
j=1
|
||||
while j<3:
|
||||
print('Hallo')
|
||||
j=j+1
|
||||
|
||||
# Alternative 2: For loop
|
||||
for i in range(0,10):
|
||||
print('Iteration Number: '+str(i))
|
||||
# there is also a shorter notation: if there is no lower bound it is assumed to be zero
|
||||
for i in range(10):
|
||||
print('Iteration Number: '+str(i))
|
||||
|
||||
|
||||
# Task 4: Print text line by line
|
||||
# Print text line by line
|
||||
print("\nTask 4 starts here!\n")
|
||||
line_of_text=input_text.split('\n')
|
||||
i=0
|
||||
while i<len(line_of_text):
|
||||
print("Line "+str(i+1)+": "+line_of_text[i])
|
||||
i=i+1
|
||||
|
||||
# First alternative using a for loop
|
||||
for i in range(0,len(line_of_text)):
|
||||
print("Line "+str(i+1)+": "+line_of_text[i])
|
||||
|
||||
|
||||
# Second alternative
|
||||
# for ... in -> for each element of the list do ...
|
||||
# line can be any name; it refers to the elements of the list
|
||||
i=1
|
||||
for line in line_of_text:
|
||||
print("Line "+str(i)+": "+line)
|
||||
i=i+1
|
||||
|
||||
|
||||
# Task 5: count 'good'
|
||||
# Count how often the word 'good' appears in the text
|
||||
print("\nTask 5 starts here!\n")
|
||||
number_good=input_text.count('good')
|
||||
print(number_good)
|
||||
# you can write the command in a shorter format
|
||||
print(input_text.count('good'))
|
||||
|
||||
# Task 6a
|
||||
# Print lines with the word 'good'
|
||||
print("\nTask 6a starts here!\n")
|
||||
for i in range(len(line_of_text)):
|
||||
if line_of_text[i].count('good')>=1:
|
||||
print(line_of_text[i])
|
||||
|
||||
|
||||
# Task 7
|
||||
# Print lines that start with the word 'This'
|
||||
print("\nTask 7 starts here!\n")
|
||||
print("\n'This' with a capital T.\n")
|
||||
for i in range(len(line_of_text)):
|
||||
if line_of_text[i].startswith('This')>=1:
|
||||
print(line_of_text[i])
|
||||
|
||||
print("\n'this' with a lower case t.\n")
|
||||
for i in range(len(line_of_text)):
|
||||
if line_of_text[i].startswith('this')>=1:
|
||||
print(line_of_text[i])
|
||||
|
||||
print("Yes, the command is case sensitive (2 vs. 0 matches)!")
|
||||
|
||||
|
||||
# Task 8
|
||||
# Replace the word 'good' by 'excellent'
|
||||
print("\nTask 8 starts here!\n")
|
||||
new_text=input_text.replace("good","excellent")
|
||||
print(new_text)
|
||||
|
||||
# For illustration only
|
||||
print("\nFor illustation only\n")
|
||||
for i in range(len(line_of_text)):
|
||||
new_line_of_text=line_of_text[i].replace('good','excellent')
|
||||
# print the new line IF there is a change.
|
||||
if not new_line_of_text==line_of_text[i]:
|
||||
print(new_line_of_text)
|
||||
|
||||
input_file.close()
|
||||
output_file.close()
|
||||
|
||||
print("DONE")
|
||||
|
|
@@ -0,0 +1,72 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Wed Jul 29 09:21:46 2015
|
||||
|
||||
@author: Alexander Hillert, Goethe Uni Frankfurt
|
||||
"""
|
||||
import re
|
||||
# Please adjust the directory to your machine.
|
||||
directory="C:/Lehre/Textual Analysis/Programming/Files/"
|
||||
|
||||
# Open the txt file with the SEC filings
|
||||
sec_filings_file=open(directory+'formidx_1998Q1.txt','r')
|
||||
sec_filings_text=sec_filings_file.read()
|
||||
|
||||
# Create output file
|
||||
output_file=open(directory+'SEC_Filings_Output.csv','w')
|
||||
|
||||
# Create first line with variable names
|
||||
# I use semicolons as separator in csv files. You can also use any other symbol.
|
||||
# However, you should make sure that the separator is not part of the data/text
|
||||
# you write to the file.
|
||||
# For example, it would be problematic if you use comma as separator and have
|
||||
# company names like "AMERICAN HEALTHCORP, INC." or "AMERICAN FUNDING, INC."
|
||||
output_file.write("Form_Type;Company_Name;CIK;Filing_Date;Link\n")
|
||||
|
||||
# Split the input file into separate lines
|
||||
sec_filings_line=sec_filings_text.split("\n")
|
||||
|
||||
# Loop over all lines
|
||||
for i in range(len(sec_filings_line)):
|
||||
# Does the line refer to a form 10-K file?
|
||||
# As pointed out by Loughran and McDonald (2011), many firms mislabelled
|
||||
# their 10-K filings as 10-K405 filings. Thus, I included these filings
|
||||
# as well.
|
||||
# The condition below excludes amendments to 10-Ks ("10-K/A" and "10-K405/A").
|
||||
# Depending on the research question at hand one could include amendments as well.
|
||||
# Also, 10KSB (small businesses) could also be included.
|
||||
|
||||
match_10k=re.search(r"\A10-K( |405 )",sec_filings_line[i])
|
||||
if match_10k:
|
||||
|
||||
#if sec_filings_line[i].startswith("10-K ")==1 or sec_filings_line[i].startswith("10-K405 ")==1:
|
||||
# Split the line such that the information can be saved in separate
|
||||
# variables
|
||||
# Each information item has a fixed length in the overview files of the
|
||||
# SEC.
|
||||
# Filing type: position 1 to 12
|
||||
# Remember Python starts counting at 0 and does not include the upper bound
|
||||
filing_type=sec_filings_line[i][:12]
|
||||
# Company name: position 13 to 74
|
||||
company_name=sec_filings_line[i][12:74]
|
||||
# CIK: position 75 to 86
|
||||
cik=sec_filings_line[i][74:86]
|
||||
# Filing date: position 87 to 98
|
||||
filing_date=sec_filings_line[i][86:98]
|
||||
# Link: position 99 to end of line
|
||||
link=sec_filings_line[i][98:]
|
||||
|
||||
# Is the 10-K filed between March 10 and March 20?
|
||||
# The filing date is in the format "YYYY-MM-DD" (e.g. "1998-03-31")
|
||||
filing_day=filing_date[8:10]
|
||||
filing_month=filing_date[5:7]
|
||||
# Is the Filing Month March?
|
||||
if int(filing_month)==3 and int(filing_day)>=10 and int(filing_day)<=20:
|
||||
# The filing meets the conditions -->
|
||||
# Write output to the csv file
|
||||
output_file.write(filing_type+";"+company_name+";"+cik+";"+filing_date+";"+link+"\n")
|
||||
|
||||
sec_filings_file.close()
|
||||
output_file.close()
|
||||
|
||||
print("DONE")
|
||||
|
|
@@ -0,0 +1,95 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Wed Jul 29 11:07:10 2015
|
||||
|
||||
@author: Alexander Hillert, Goethe Uni Frankfurt
|
||||
"""
|
||||
|
||||
directory="C:/Lehre/Textual Analysis/Programming/Files/"
|
||||
|
||||
# We need the urllib package
|
||||
import urllib.request
|
||||
# To automatically create folders we need the os-module (OS: Operating System)
|
||||
import os
|
||||
|
||||
|
||||
# Define a user agent
|
||||
# Information on user agents is from https://docs.python.org/3/howto/urllib2.html:
|
||||
# "Some websites dislike being browsed by programs, or send different versions
|
||||
# to different browsers. By default urllib identifies itself as Python-urllib/x.y
|
||||
# (where x and y are the major and minor version numbers of the Python release,
|
||||
# e.g. Python-urllib/2.5), which may confuse the site, or just plain not work.
|
||||
# The way a browser identifies itself is through the User-Agent header.
|
||||
opener = urllib.request.build_opener()
|
||||
|
||||
# The SEC recently rejected requests from Python-urllib/x.y user agent (see above)
|
||||
# To still automatically download files, you have different options.
|
||||
# I have listed three examples below but there are many more:
|
||||
# For a comprehensive list see, e.g.:
|
||||
# https://developers.whatismybrowser.com/useragents/explore/software_type_specific/web-browser/
|
||||
#opener.addheaders = [('User-agent', 'Mozilla')]
|
||||
#opener.addheaders = [('User-agent', 'Chrome')]
|
||||
opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)')]
|
||||
urllib.request.install_opener(opener)
|
||||
|
||||
|
||||
# Open the csv file from part 1 of the problem
|
||||
input_file=open(directory+'SEC_Filings_Output.csv','r')
|
||||
input_text=input_file.read()
|
||||
|
||||
# Split the Input File in separate lines
|
||||
input_text_line=input_text.split("\n")
|
||||
# sometimes you have empty lines after a split command.
|
||||
# You can remove them using the following command
|
||||
while input_text_line.count("")>0:
|
||||
input_text_line.remove("")
|
||||
|
||||
# Create a subfolder in which the 10-K filings are saved.
|
||||
# When you download a large number of filings I recommend using subfolders for
|
||||
# each year or even for each year-month combination.
|
||||
# The option "exist_ok=True" makes sure that you do not get an error if the
|
||||
# folder already exists.
|
||||
os.makedirs(directory+"10-Ks/", exist_ok=True)
|
||||
|
||||
# Loop over all lines of the csv file
|
||||
#for i in range(1,len(input_text_line)):
|
||||
# To avoid having to download hundreds of files when we discuss the solution
|
||||
# the loop stops at 20. (Remember the upper bound is not included.)
|
||||
for i in range(1,21):
|
||||
|
||||
# split the line into the five variables
|
||||
variables=input_text_line[i].split(";")
|
||||
# We only need the cik and the link.
|
||||
# The cik is the 3rd variable. However, the numbering of lists starts
|
||||
# at zero -> 2nd item of the list "variables"
|
||||
# The link is the 5th variable -> 4th item of the list "variables"
|
||||
cik=variables[2]
|
||||
#cik=cik.replace(" ","")
|
||||
cik=cik.strip()
|
||||
link=variables[4]
|
||||
#link=link.replace(" ","")
|
||||
link=link.strip()
|
||||
|
||||
# Find the filename
|
||||
# The link consists of different parts:
|
||||
# For example: edgar/data/1000753/0000950129-98-001035.txt
|
||||
link_parts=link.split("/")
|
||||
# 1st part: edgar
|
||||
# 2nd part: data
|
||||
# 3rd part: cik
|
||||
# 4th part: file name -> 3rd item of the list
|
||||
filename=link_parts[3]
|
||||
###########################################################################
|
||||
############################ WARNING ######################################
|
||||
# The filename does NOT uniquely identify the SEC filings as different firms (CIKs)
|
||||
# may use the same filename. Thus, when you only use the filename files
|
||||
# might be overwritten. To avoid this problem you need to have a unique name.
|
||||
# Combining CIK and filename results in a unique identifier, as the
|
||||
# filename appears only once per firm (CIK).
|
||||
# -> use the combination of CIK and filename: cik_filename
|
||||
###########################################################################
|
||||
urllib.request.urlretrieve("http://www.sec.gov/Archives/"+link,\
|
||||
directory+"10-Ks/"+cik+"_"+filename)
|
||||
|
||||
input_file.close()
|
||||
print("DONE")
|
||||
|
|
@@ -0,0 +1,144 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Tue Apr 12 15:50:22 2016
|
||||
|
||||
@author: Alexander Hillert, Goethe University Frankfurt
|
||||
"""
|
||||
|
||||
# Import regular expressions and BeautifulSoup
|
||||
import re
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
# Please adjust the directory to your machine.
|
||||
directory="C:/Lehre/Textual Analysis/Programming/Files/"
|
||||
|
||||
# Open the document
|
||||
input_file=open(directory+'Exercise_4_Application_Regular_Expressions.txt','r',encoding="utf-8")
|
||||
input_text=input_file.read()
|
||||
|
||||
#######################
|
||||
# Task 1: remove tables
|
||||
#######################
|
||||
# Approach
|
||||
# We search for tables until we find no more html tags that indicate the
|
||||
# beginning of a table.
|
||||
# Search for the start html-tag <TABLE>
|
||||
table_match=re.search('<TABLE>', input_text)
|
||||
print("This is the result of the re.search command:")
|
||||
print(table_match)
|
||||
while table_match:
|
||||
# When we have identified a match, i.e. the start of a table, we save
|
||||
# the position of the beginning of the table in the variable "start_table"
|
||||
table_start_match=re.search('<TABLE>', input_text)
|
||||
start_table=table_start_match.start()
|
||||
# Next, we search for the corresponding html tag that indicates the end of
|
||||
# the table and save the end position to the variable "end_table"
|
||||
table_end_match=re.search('</TABLE>', input_text)
|
||||
end_table=table_end_match.end()
|
||||
|
||||
# We can print the text between the start and end html tag to check whether
|
||||
# the table has been identified correctly.
|
||||
print("The text below is a table!\n"+input_text[start_table:end_table]+"\n")
|
||||
|
||||
# the text between the beginning and end of the html tags is the part which
|
||||
# we would like to delete.
|
||||
# Consequently, we keep the text before the beginning of the table as well
|
||||
# as the text after the ending of the table.
|
||||
input_text=input_text[:start_table]+input_text[end_table:]
|
||||
# Next, we need to check whether there is another table in the rest of the
|
||||
# text.
|
||||
table_match=re.search('<TABLE>', input_text)
|
||||
# As long as "table_match" exists, i.e. we regex result in a match, the loop
|
||||
# will continue.
|
||||
|
||||
#########################
|
||||
# Task 2: remove Exhibits
|
||||
#########################
|
||||
# Exhibits have the following structure
|
||||
# <DOCUMENT>
|
||||
# <TYPE>EX...
|
||||
# ...
|
||||
# </DOCUMENT>
|
||||
exhibit_match=re.search('<TYPE>EX', input_text)
|
||||
while exhibit_match:
|
||||
exhibit_start_match=re.search('<TYPE>EX', input_text)
|
||||
start_exhibit=exhibit_start_match.start()
|
||||
# As the exhibits are at the end of the 10-K filing it would not be
|
||||
# necessary to include an end position. We could also drop the entire text
|
||||
# after "<TYPE>EX"
|
||||
# It is important that we search for the </DOCUMENT> only after the exhibit
|
||||
# started. Otherwise, we could get the end of the main document.
|
||||
exhibit_end_match=re.search('</DOCUMENT>', input_text[start_exhibit:])
|
||||
end_exhibit=start_exhibit+exhibit_end_match.end()
|
||||
# Print the identified text to check whether the exhibit has been identified
|
||||
# correctly
|
||||
print("The text below is an exhibit!\n"+input_text[start_exhibit:end_exhibit]+"\n")
|
||||
|
||||
input_text=input_text[:start_exhibit]+input_text[end_exhibit:]
|
||||
# Check whether there are further exhibits
|
||||
exhibit_match=re.search('<TYPE>EX', input_text)
|
||||
|
||||
##########################
|
||||
# Task 3: remove html code
|
||||
##########################
|
||||
# Alternative 1: remove html code without Beautiful Soup
|
||||
text=re.sub('<[^>]{1,}>', '', input_text)
|
||||
# This regex searches for a "<" followed by at least one character that must not
|
||||
# equal > and is completed by >.
|
||||
# You might have thought about using the following command
|
||||
#text=re.sub('<.{1,}>', '', input_text)
|
||||
# However, this command has a problem, as it would delete the following line
|
||||
# entirely: <page> This is some text that should remain <page>
|
||||
# The .{1,} would match 'page> This is some text that should remain <page', as
|
||||
# regexes are greedy. The [^>]{1,} avoids this problem by not allowing > to be matched.
|
||||
# Consequently, in the example only the two "<page>" would be deleted.
|
||||
# You can verify this by using regex101.com (remember to check "Python" in the
|
||||
# left menu of the webpage)
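# Editor's note: quick check (not part of the original solution) of the
# greediness point made above, using the example line from the comment.
example_line="<page> This is some text that should remain <page>"
print(re.sub(r'<.{1,}>','',example_line))     # greedy match deletes everything -> ''
print(re.sub(r'<[^>]{1,}>','',example_line))  # deletes only the two <page> tags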
|
||||
|
||||
# Alternative 2: remove html code using Beautiful Soup
|
||||
html_text=BeautifulSoup(input_text, 'html.parser')
|
||||
text=html_text.get_text()
|
||||
|
||||
########################
|
||||
# Task 4: delete numbers
|
||||
########################
|
||||
# Alternative 1 - removing numbers step by step
|
||||
# remove commas in numbers, e.g., 1,000 or 12,345,678 or 123,456,789,123,123
|
||||
text=re.sub('[0-9]{1,3},([0-9]{3},){0,}[0-9]{3}','',text)
|
||||
# remove dots in numbers, e.g., 34.56 or 12,345.678 (-> previous command leaves .678)
|
||||
text=re.sub(r'[0-9]{0,}\.[0-9]{1,}','',text)
|
||||
# remove the remaining numbers without commas and dots
|
||||
text=re.sub('[0-9]','',text)
|
||||
|
||||
# Alternative 2 - removing numbers using a single regex
|
||||
text=re.sub(r'[0-9]{1,}(,[0-9]{3}){0,}(\.[0-9]{1,}){0,1}','',text)
|
||||
|
||||
# Alternative 3 - removing numbers step by step but start with commas and dots
|
||||
# 1. remove comma incl. the surrounding numbers
|
||||
text=re.sub("[0-9],[0-9]","",text)
|
||||
# 2. remove dots incl. the surrounding numbers
|
||||
text=re.sub("[0-9]\.[0-9]","",text)
|
||||
# 3. remove any remaining number
|
||||
text=re.sub("[0-9]","",text)
|
||||
|
||||
|
||||
########################
|
||||
# Task 5: delete symbols
|
||||
########################
|
||||
# When analyzing tone, symbols do not matter, as they are not considered to be
|
||||
# words and thus do not bias the total word count.
|
||||
# However, for training purposes this task is included in the problem.
|
||||
# There is no well defined list of which symbols should be deleted. So, you
|
||||
# can add further symbols.
|
||||
text=re.sub(r'\(|\)|\[|\]|\$|§|%|\*|/|·|-',' ',text)
|
||||
text=re.sub('[^a-zA-Z \.,\!\?\n]','',text)
|
||||
|
||||
# Open the output file for the pure text
|
||||
output_file=open(directory+'Exercise_4_Application_Regular_Expressions_clean.txt','w',encoding="utf-8")
|
||||
output_file.write(text)
|
||||
|
||||
input_file.close()
|
||||
output_file.close()
|
||||
|
||||
print("DONE")
|
||||
|
||||
209
lectures/programming/solutions/Problem_5_Clean_SEC_Filing.py
Normal file
|
|
@@ -0,0 +1,209 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Tue Apr 12 15:50:22 2016
|
||||
|
||||
@author: Alexander Hillert, Goethe University Frankfurt
|
||||
"""
|
||||
import re
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
# Please adjust the directory to your machine.
|
||||
directory="C:/Lehre/Textual Analysis/Programming/Files/"
|
||||
|
||||
# Open the 10-K
|
||||
input_file=open(directory+'0000950130-98-001359.txt','r',encoding='ascii',errors='ignore')
|
||||
input_text=input_file.read()
|
||||
|
||||
################################
|
||||
# Remove tables
|
||||
# Same approach as in Problem 4
|
||||
################################
|
||||
# Sometimes it is helpful to print the text parts that are deleted. In this
|
||||
# example, we will print the first two tables that we delete.
|
||||
i=1
|
||||
table_match=re.search('<TABLE>', input_text)
|
||||
while table_match:
|
||||
# Search for the beginning of the table
|
||||
table_start_match=re.search('<TABLE>', input_text)
|
||||
start_table=table_start_match.start()
|
||||
# search for the end of the table
|
||||
table_end_match=re.search('</TABLE>', input_text)
|
||||
end_table=table_end_match.end()
|
||||
# The if condition and the printing are just for illustrative purposes.
|
||||
# The commands display the first two tables that are removed from the text.
|
||||
if i<=2:
|
||||
print("This is the "+str(i)+". Table in the 10-K.\n"+input_text[start_table:end_table]+"\n")
|
||||
i=i+1
|
||||
# remove the table
|
||||
input_text=input_text[:start_table]+input_text[end_table:]
|
||||
# check whether there are further tables
|
||||
table_match=re.search('<TABLE>', input_text)
|
||||
|
||||
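# --- Optional note (not part of the original solution) ---
# If every <TABLE> has a matching </TABLE>, the same result can be obtained with
# a single non-greedy substitution; (?s) lets '.' also match line feeds.
# demo_table_text is a made-up string used only for illustration.
demo_table_text="before<TABLE>\nrow 1\nrow 2\n</TABLE>after"
print(re.sub('(?s)<TABLE>.*?</TABLE>','',demo_table_text))   # prints "beforeafter"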
################################
|
||||
# Remove exhibits
|
||||
# Same approach as in Problem 4
|
||||
################################
|
||||
# Exhibits have the following structure
|
||||
# <DOCUMENT>
|
||||
# <TYPE>EX...
|
||||
# ...
|
||||
# </DOCUMENT>
|
||||
# Sometimes it is helpful to print the text parts that are deleted. In this
|
||||
# example, we will print the first exhibit that we delete.
|
||||
i=1
|
||||
exhibit_match=re.search('<TYPE>EX', input_text)
|
||||
while exhibit_match:
|
||||
# Search for the beginning of the exhibit
|
||||
exhibit_start_match=re.search('<TYPE>EX', input_text)
|
||||
start_exhibit=exhibit_start_match.start()
|
||||
# Search for the end of the exhibit
|
||||
# CAUTION: search only in the text after the beginning of the exhibit, as
|
||||
# </DOCUMENT> also appears earlier (e.g. end of main document)
|
||||
exhibit_end_match=re.search('</DOCUMENT>', input_text[start_exhibit:])
|
||||
end_exhibit=start_exhibit+exhibit_end_match.end()
|
||||
if i<=1:
|
||||
print("This is the "+str(i)+". Exhibit in the 10-K.\n"+input_text[start_exhibit:end_exhibit]+"\n")
|
||||
i=i+1
|
||||
# remove exhibit
|
||||
input_text=input_text[:start_exhibit]+input_text[end_exhibit:]
|
||||
exhibit_match=re.search('<TYPE>EX', input_text)
|
||||
|
||||
##################
|
||||
# Remove html code
|
||||
##################
|
||||
html_text=BeautifulSoup(input_text, 'html.parser')
|
||||
text=html_text.get_text()
|
||||
|
||||
############################
|
||||
# Remove the Document Header
|
||||
############################
|
||||
# There are different possibilities for how to define the start of the main part of the text.
|
||||
# In general, you should delete all text that is uninformative for your analysis.
|
||||
# Alternative 1:
|
||||
# Search for "Table of Contents". To avoid mistakenly matching a reference to the
|
||||
# table of contents somewhere in the text, we require a linebreak before and after.
|
||||
# When the "Table of Contents" is centered, there will be whitespaces or tabs
|
||||
# before and potentially also after it.
|
||||
header_match=re.search('(?i)\n[\t ]{0,}table[\t ]of[\t ]contents[\t ]{0,}\n', text)
|
||||
# Alternative 2:
|
||||
# Search for "Documents incorporated by reference".
# Note: as the script stands, this second search overwrites the match from
# Alternative 1, so only Alternative 2 takes effect; comment out one of the two.
|
||||
header_match=re.search('\n[\t ]{0,}DOCUMENTS[\t ]INCORPORATED[\t ]BY[\t ]REFERENCE[\t ]{0,}\n', text)
|
||||
if header_match:
|
||||
# Drop the document header and keep only the rest of the text after the header.
|
||||
text=text[header_match.end():]
|
||||
|
||||
#################################################
|
||||
# Delete the text in "PART IV"
|
||||
# This procedure is optional. Look at "Part IV" and decide whether you favor
|
||||
# the approach. I think that the part should be dropped, as it is just a list
|
||||
# of exhibits and some mandatory text required by the SEC [indicated by the
|
||||
# capital letters in the "SIGNATURES" section].
|
||||
#################################################
|
||||
|
||||
'''
|
||||
# Alternative 1: go over all matches but keep only the last one
|
||||
for match in re.finditer('\s{2,}PART IV\s{0,}\n', text):
|
||||
print("Hallo")
|
||||
# match now contains the last match
|
||||
# Delete the text after the last match
|
||||
text=text[:match.start()]
|
||||
|
||||
|
||||
# Alternative 2: save the positions of all matches (more general approach)
|
||||
# to use alternative 2, you have to comment out Alternative 1!
|
||||
# Otherwise line 104 will create a problem when you execute Alternative 2.
|
||||
list_start_matches=[]
|
||||
list_end_matches=[]
|
||||
for match in re.finditer('\s{2,}PART IV\s{0,}\n', text):
|
||||
print(match)
|
||||
list_start_matches.append(match.start())
|
||||
list_end_matches.append(match.end())
|
||||
# Position of last match
|
||||
print(list_start_matches[len(list_start_matches)-1])
|
||||
print(list_end_matches[len(list_start_matches)-1])
|
||||
|
||||
|
||||
# Alternative 3: manual coding using a loop of re.searches
|
||||
# create a copy of the text that we can edit
|
||||
text_check_part_IV=text
|
||||
part_IV_match=re.search('\s{2,}PART IV\s{0,}\n', text_check_part_IV)
|
||||
# create two lists that we can use to save the start and end positions
|
||||
# of the Part IV matches
|
||||
list_start_matches_v2=[]
|
||||
list_end_matches_v2=[]
|
||||
# variable to save the position of the last match in the overall text
|
||||
end_position_previous_match=0
|
||||
while part_IV_match:
|
||||
start_position_match=end_position_previous_match+part_IV_match.start()
|
||||
end_position_match=end_position_previous_match+part_IV_match.end()
|
||||
|
||||
list_start_matches_v2.append(start_position_match)
|
||||
list_end_matches_v2.append(end_position_match)
|
||||
|
||||
# update the information on the end of the last match
|
||||
end_position_previous_match=end_position_previous_match+part_IV_match.end()
|
||||
|
||||
text_check_part_IV=text_check_part_IV[part_IV_match.end():]
|
||||
part_IV_match=re.search('\s{2,}PART IV\s{0,}\n', text_check_part_IV)
|
||||
|
||||
# when you compare list_end_matches to list_end_matches_v2, you see that the two
|
||||
# approaches yield the same result.
|
||||
# To double check that the approaches have the same results, you could
|
||||
# replace the Regex in lines 112, 124, and 142 by "\s{2,}PART [A-Z]{1,3}\s{0,}\n".
|
||||
# In that case you have more matches and so you can better check that the
|
||||
# two approaches have identical outcomes.
|
||||
'''
|
||||
|
||||
'''
|
||||
# Delete the text after the last match
|
||||
text=text[:list_start_matches[len(list_start_matches)-1]]
|
||||
'''
|
||||
|
||||
# Delete item numbers
|
||||
# This is optional. It removes "Item 1.", "ITEM 1.", "Item 10.", "Item 7A."
|
||||
text=re.sub('(?i)Item [0-9]{1,}A{0,1}(\s|\.|:|\n)','',text)
|
||||
|
||||
# Delete numbers
|
||||
text=re.sub('[0-9]{1,}(,[0-9]{3}){0,}(\.[0-9]{1,}){0,1}','',text)
|
||||
|
||||
# Alternative stepwise procedure to delete numbers
|
||||
# remove commas in numbers, e.g., 1,000 or 12,345,678
|
||||
text=re.sub('[0-9]{1,3},([0-9]{3},){0,}[0-9]{3}','',text)
|
||||
# remove dots in numbers, e.g., 34.56 or 12,345.678 (-> previous command leaves .678)
|
||||
text=re.sub('[0-9]{0,}\.[0-9]{1,}','',text)
|
||||
# remove the remaining numbers without commas and dots
|
||||
text=re.sub('[0-9]','',text)
|
||||
|
||||
|
||||
# Hyphens can be used to indicate that the word is continued in the next
|
||||
# line. For example, "Micro-\nsoft" (\n is the line feed).
|
||||
# Delete hyphens that are followed by a line feed.
|
||||
text=re.sub('-\n','',text)
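# --- Optional illustration (not part of the original solution) ---
# A quick check of the substitution above on a made-up string:
print(re.sub('-\n','','Micro-\nsoft'))   # prints "Microsoft"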
|
||||
|
||||
# Replace symbols by a whitespace.
|
||||
# Extra whitespaces are not a problem.
|
||||
text=re.sub('\(|\)|\[|\]|\$|§|%|\*|/|·|-',' ',text)
|
||||
|
||||
# Delete dots and commas that are not part of sentences, i.e. commas and dots
|
||||
# that are preceded by a line break (potentially also whitespaces and tabs)
|
||||
# and that are followed by a line break (again, there may
|
||||
# also be whitespaces and tabs).
|
||||
text=re.sub('\n[\t ]{0,}(\.|,){1,}[\t ]{0,}\n','\n',text)
|
||||
|
||||
# Drop single-character words
|
||||
# One can argue whether one should implement this procedure. Loughran and
|
||||
# McDonald argue in one of their papers in favor of it.
|
||||
# To make sure that there is just one letter, we require that there is a
|
||||
# non-word character (\W) before and after. We use a positive lookbehind and a
|
||||
# positive lookahead to ensure that the neighboring non-word characters
|
||||
# do not get deleted as well.
|
||||
text=re.sub('(?<=\W)[A-Za-z](?=\W)',' ',text)
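# --- Optional illustration (not part of the original solution; made-up string) ---
# Only the single letters become blanks; the lookbehind/lookahead keep the
# surrounding characters, and the longer word is untouched.
print(re.sub('(?<=\W)[A-Za-z](?=\W)',' ',' a b word '))   # prints "     word "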
|
||||
|
||||
|
||||
# Open the output file for the pure text
|
||||
output_file=open(directory+'0000950130-98-001359_clean.txt','w',encoding='ascii',errors='ignore')
|
||||
output_file.write(text)
|
||||
|
||||
input_file.close()
|
||||
output_file.close()
|
||||
print("COMPLETED.")
|
||||
356
lectures/programming/solutions/Problem_6_Clean_10-K_Sample.py
Normal file
|
|
@ -0,0 +1,356 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Wed Jul 29 11:07:10 2015
|
||||
|
||||
@author: Alexander Hillert, Goethe University Frankfurt
|
||||
"""
|
||||
|
||||
import re
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
directory="C:/Lehre/Textual Analysis/Programming/Files/"
|
||||
|
||||
# Open the csv file containing the list of the 200 10-Ks
|
||||
input_file=open(directory+'10-K_Sample_2011Q1_Input.csv','r')
|
||||
input_text=input_file.read()
|
||||
|
||||
# Split the input file in separate lines
|
||||
input_text_line=input_text.split("\n")
|
||||
|
||||
# In general, there can be empty lines in the input file. The following command
|
||||
# deletes these lines
|
||||
while input_text_line.count("")>0:
|
||||
input_text_line.remove("")
|
||||
|
||||
print("The input file contains "+str(len(input_text_line)-1)+" non-empty lines with data.")
|
||||
# We subtract 1 from the length, as the first line contains the variable names but not data.
|
||||
|
||||
# Loop over all lines
|
||||
for i in range(1,len(input_text_line)):
|
||||
# To see the progress of your program you can print the number of iteration.
|
||||
print(str(i))
|
||||
|
||||
# split the lines of the CSV-file into the two variables
|
||||
variables=input_text_line[i].split(";")
|
||||
# We need the CIK and the filename to open the file
|
||||
cik=variables[0]
|
||||
filename=variables[1]
|
||||
|
||||
# Open the ith 10-K in the list
|
||||
input_file_10_k=open(directory+'10-K_Sample/'+cik+'_'+filename,'r',encoding='ascii',errors='ignore')
|
||||
input_text_10_k=input_file_10_k.read()
|
||||
|
||||
# the new file name should be "old_name_clean" -> we have to replace ".txt"
|
||||
# by "_clean.txt"
|
||||
filename=filename.replace('.txt','_clean.txt')
|
||||
|
||||
# Remove tables
|
||||
variable=re.search('<TABLE>', input_text_10_k)
|
||||
while variable:
|
||||
variable=re.search('<TABLE>', input_text_10_k)
|
||||
start_table=variable.start()
|
||||
variable=re.search('</TABLE>', input_text_10_k)
|
||||
end_table=variable.end()
|
||||
input_text_10_k=input_text_10_k[:(start_table)]+input_text_10_k[(end_table):]
|
||||
variable=re.search('<TABLE>', input_text_10_k)
|
||||
|
||||
|
||||
####################### Begin of exhibits removal #########################
|
||||
# Exhibits have the following structure
|
||||
# <DOCUMENT>
|
||||
# <TYPE>EX...
|
||||
# ...
|
||||
# </DOCUMENT>
|
||||
# In recent years, there are also exhibits with <TYPE>EXCEL
|
||||
# -> as we search for "<TYPE>EX", the loop will delete <TYPE>EXCEL exhibits, too.
|
||||
variable=re.search('<TYPE>EX', input_text_10_k)
|
||||
while variable:
|
||||
variable=re.search('<TYPE>EX', input_text_10_k)
|
||||
start_exhibit=variable.start()
|
||||
variable=re.search('</DOCUMENT>', input_text_10_k[start_exhibit:])
|
||||
end_exhibit=start_exhibit+variable.end()
|
||||
input_text_10_k=input_text_10_k[:(start_exhibit)]+input_text_10_k[(end_exhibit):]
|
||||
variable=re.search('<TYPE>EX', input_text_10_k)
|
||||
|
||||
# In recent years, there are also XML exhibits.
|
||||
# CAUTION: These are <TYPE>XML and not <TYPE>EX -> need separate cleaning
|
||||
# Remove XML-Exhibits, which have the following structure
|
||||
# <DOCUMENT>
|
||||
# <TYPE>XML
|
||||
# ...
|
||||
# </DOCUMENT>
|
||||
variable=re.search('<TYPE>XML', input_text_10_k)
|
||||
while variable:
|
||||
variable=re.search('<TYPE>XML', input_text_10_k)
|
||||
start_exhibit=variable.start()
|
||||
variable=re.search('</DOCUMENT>', input_text_10_k[start_exhibit:])
|
||||
end_exhibit=start_exhibit+variable.end()
|
||||
input_text_10_k=input_text_10_k[:start_exhibit]+input_text_10_k[end_exhibit:]
|
||||
variable=re.search('<TYPE>XML', input_text_10_k)
|
||||
|
||||
# Furthermore, in recent years there are also ZIP exhibits.
|
||||
# CAUTION: These are <TYPE>ZIP and not <TYPE>EX -> need separate cleaning
|
||||
# Remove ZIP-Exhibits, which have the following structure
|
||||
# <DOCUMENT>
|
||||
# <TYPE>ZIP
|
||||
# ...
|
||||
# </DOCUMENT>
|
||||
variable=re.search('<TYPE>ZIP', input_text_10_k)
|
||||
while variable:
|
||||
variable=re.search('<TYPE>ZIP', input_text_10_k)
|
||||
start_exhibit=variable.start()
|
||||
variable=re.search('</DOCUMENT>', input_text_10_k[start_exhibit:])
|
||||
end_exhibit=start_exhibit+variable.end()
|
||||
input_text_10_k=input_text_10_k[:start_exhibit]+input_text_10_k[end_exhibit:]
|
||||
variable=re.search('<TYPE>ZIP', input_text_10_k)
|
||||
|
||||
# In addition, there are many GRAPHIC exhibits.
|
||||
# CAUTION: These are <TYPE>GRAPHIC and not <TYPE>EX -> need separate cleaning
|
||||
# Remove GRAPHIC-Exhibits, which have the following structure
|
||||
# <DOCUMENT>
|
||||
# <TYPE>GRAPHIC
|
||||
# ...
|
||||
# </DOCUMENT>
|
||||
variable=re.search('<TYPE>GRAPHIC', input_text_10_k)
|
||||
while variable:
|
||||
variable=re.search('<TYPE>GRAPHIC', input_text_10_k)
|
||||
start_exhibit=variable.start()
|
||||
variable=re.search('</DOCUMENT>', input_text_10_k[start_exhibit:])
|
||||
end_exhibit=start_exhibit+variable.end()
|
||||
input_text_10_k=input_text_10_k[:start_exhibit]+input_text_10_k[end_exhibit:]
|
||||
variable=re.search('<TYPE>GRAPHIC', input_text_10_k)
|
||||
|
||||
# Furthermore, there can also be COVER exhibits.
|
||||
# CAUTION: These are <TYPE>COVER and not <TYPE>EX -> need separate cleaning
|
||||
# Remove COVER-Exhibits, which have the following structure
|
||||
# <DOCUMENT>
|
||||
# <TYPE>COVER
|
||||
# ...
|
||||
# </DOCUMENT>
|
||||
variable=re.search('<TYPE>COVER', input_text_10_k)
|
||||
while variable:
|
||||
variable=re.search('<TYPE>COVER', input_text_10_k)
|
||||
start_exhibit=variable.start()
|
||||
variable=re.search('</DOCUMENT>', input_text_10_k[start_exhibit:])
|
||||
end_exhibit=start_exhibit+variable.end()
|
||||
input_text_10_k=input_text_10_k[:start_exhibit]+input_text_10_k[end_exhibit:]
|
||||
variable=re.search('<TYPE>COVER', input_text_10_k)
|
||||
|
||||
# Furthermore, there can also be PDF files attached.
|
||||
# These attachments caused BeautifulSoup to crash on some computers.
|
||||
# Remove PDFs
|
||||
variable=re.search('<PDF>', input_text_10_k)
|
||||
while variable:
|
||||
variable=re.search('<PDF>', input_text_10_k)
|
||||
start_pdf=variable.start()
|
||||
variable=re.search('</PDF>', input_text_10_k[start_pdf:])
|
||||
end_pdf=start_pdf+variable.end()
|
||||
input_text_10_k=input_text_10_k[:(start_pdf)]+input_text_10_k[(end_pdf):]
|
||||
variable=re.search('<PDF>', input_text_10_k)
|
||||
|
||||
######################## End of exhibits removal ##########################
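# --- Optional refactoring sketch (not part of the original solution) ---
# The removal loops above all share the same structure. A helper such as the one
# below (shown only as an idea; it would normally be defined once near the top of
# the script) removes every block from a given '<TYPE>...' tag to the next '</DOCUMENT>'.
def remove_typed_blocks(document_text, type_tag):
    # repeatedly cut the text between type_tag and the following </DOCUMENT>
    match=re.search(type_tag, document_text)
    while match:
        start=match.start()
        end_match=re.search('</DOCUMENT>', document_text[start:])
        document_text=document_text[:start]+document_text[start+end_match.end():]
        match=re.search(type_tag, document_text)
    return document_text
# Hypothetical usage: input_text_10_k=remove_typed_blocks(input_text_10_k, '<TYPE>GRAPHIC')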
|
||||
|
||||
# Remove Document Header - PART 1
|
||||
# This condition should work for all 10-K filings as the html tags "<SEC-HEADER>"
|
||||
# and "</SEC-HEADER>" are mandatory for all filings.
|
||||
variable=re.search('</SEC-HEADER>', input_text_10_k)
|
||||
if variable:
|
||||
input_text_10_k=input_text_10_k[variable.end():]
|
||||
|
||||
|
||||
# In some filings, firms do not use line feeds \n but <div> and </div>
|
||||
# instead to indicate the start and the end of sentences.
|
||||
# "Dieses allgemeine Element bewirkt nichts weiter als dass es in einer
|
||||
# neuen Zeile des Fließtextes beginnt."
|
||||
# see https://wiki.selfhtml.org/wiki/HTML/Textstrukturierung/div
|
||||
# and
|
||||
# "The <div> tag defines a division or a section in an HTML document.
|
||||
# By default, browsers always place a line break before and after the <div> element."
|
||||
# See: https://www.w3schools.com/tags/tag_div.asp
|
||||
# It is important to replace <div> and </div> by linefeeds because otherwise
|
||||
# the entire text will be in a single line and the subsequent commands do
|
||||
# not work properly.
|
||||
input_text_10_k=input_text_10_k.replace("<div>", "\n")
|
||||
input_text_10_k=input_text_10_k.replace("</div>", "\n")
|
||||
|
||||
|
||||
# Remove html code
|
||||
html_text=BeautifulSoup(input_text_10_k, 'html.parser')
|
||||
text=html_text.get_text()
|
||||
|
||||
|
||||
# To get an idea of what the commands below are doing, it is helpful to
|
||||
# write the current version of the text to a file and then compare it to the
|
||||
# final file.
|
||||
filename2=filename.replace('_clean.txt','_without_HtmlTablesExhibits.txt')
|
||||
# Open the output file for the text without html code and without tables+exhibits
|
||||
output_file_10_k=open(directory+'10-K_Sample/'+cik+'_'+filename2,'w',encoding='ascii',errors='ignore')
|
||||
output_file_10_k.write(text)
|
||||
output_file_10_k.close()
|
||||
|
||||
|
||||
# Remove the Document Header - PART II
|
||||
# The above command to remove the header ("</SEC-HEADER>") does not capture
|
||||
# the entire header -> we need to delete further parts at the top of the filing.
|
||||
# WARNING: The filters below may be specific to this sample of 10-Ks.
|
||||
# Some firms have line breaks instead of whitespaces -> use "[ \n]" and not just " ".
|
||||
variable=re.search('(?i)\n {0,}DOCUMENTS[ \n]INCORPORATED[ \n]BY[ \n]REFERENCE {0,}\n', text)
|
||||
if variable:
|
||||
text=text[variable.end():]
|
||||
else:
|
||||
variable=re.search('(?i)\n {0,}table of contents {0,}\n', text)
|
||||
if variable:
|
||||
text=text[variable.end():]
|
||||
else:
|
||||
variable=re.search('(?i)\n {0,}Indicate the number of shares outstanding\.{1,}', text)
|
||||
if variable:
|
||||
text=text[variable.end():]
|
||||
else:
|
||||
variable=re.search('(?i)may be deemed “forwardlooking statements”\.{1,}', text)
|
||||
if variable:
|
||||
text=text[variable.end():]
|
||||
else:
|
||||
variable=re.search('\nPART\.{1,}', text)
|
||||
if variable:
|
||||
text=text[variable.end():]
|
||||
|
||||
|
||||
# Delete Item numbers
|
||||
text=re.sub('(?i)Item {1,}[0-9]{1,}(A|B){0,1}(\s|\.|:|\n)','',text)
|
||||
# Delete Part numbers
|
||||
text=re.sub('(?i)Part (1|2|3|4|III|II|I|IV)','',text)
|
||||
|
||||
# Delete numbers:
|
||||
text=re.sub('[0-9]{1,}(,[0-9]{3}){0,}(\.[0-9]{1,}){0,1}','',text)
|
||||
|
||||
# File names, e.g., exhibit.pdf or picture.jpeg, should be removed
|
||||
text=re.sub("[ |\n]\S{1,}\.(pdf|htm|html|doc|jpg|txt|xml)(?=[ \n\.\?!])", "", text)
|
||||
|
||||
# URLs --> remove internet addresses
|
||||
text=re.sub("http:/{0,2}", "", text)
|
||||
text=re.sub("www\..{1,}\.[a-z]{2,4}(?=[ \n\.\?!])", "", text)
|
||||
|
||||
|
||||
# In Part 4 of the programming chapter, we will determine the number of
|
||||
# words per sentence. To be able to use the same underlying sample,
|
||||
# we need to implement further corrections. These changes do not affect
|
||||
# the percentage of negative/positive/etc. words.
|
||||
# --> Only relevant for determining the number of sentences
|
||||
# The text contains dots that do not indicate the end of a sentence.
|
||||
# E.g., "Inc." and "St."
|
||||
# The preceding hyphen occurs, for example, in "non-U.S.".
|
||||
# Replace or remove specific abbreviations
|
||||
# This list is incomplete. In a research project you should spend more time
|
||||
# on editing the data.
|
||||
text=re.sub("(?i)(-|\s|\A|,)Inc\.", " Inc", text)
|
||||
text=re.sub("(?i)(-|\s|\A|,)Corp\.", " Corp", text)
|
||||
text=re.sub("(?i)(-|\s|\A|,)Ltd\.", " Ltd", text)
|
||||
text=re.sub("(?i)(-|\s|\A|,)Co\.", " Co", text)
|
||||
text=re.sub("(?i)(-|\s|\A|,)S\.A\.", " SA", text)
|
||||
text=re.sub("(?i)(-|\s|\A|,)U\.S\.", " US", text)
|
||||
text=re.sub("(?i)(-|\s|\A|,)Ms\.", " Ms", text)
|
||||
text=re.sub("(?i)(-|\s|\A|,)Mr\.", " Mr", text)
|
||||
text=re.sub("(?i)(-|\s|\A|,)No\.", " Number", text)
|
||||
text=re.sub("(?i)(-|\s|\A|,)v\.s\.", " vs", text)
|
||||
text=re.sub("(?i)(-|\s|\A|,)St\.", " ", text)
|
||||
text=re.sub("(?i)(-|\s|\A|,)Jr\.", " ", text)
|
||||
|
||||
text=re.sub("(?i)(\s|\A|,)Jan\.", " January", text)
|
||||
text=re.sub("(?i)(\s|\A|,)Feb\.", " February", text)
|
||||
text=re.sub("(?i)(\s|\A|,)Mar\.", " March", text)
|
||||
text=re.sub("(?i)(\s|\A|,)Apr\.", " April", text)
|
||||
text=re.sub("(?i)(\s|\A|,)May\.", " May", text)
|
||||
text=re.sub("(?i)(\s|\A|,)Jun\.", " June", text)
|
||||
text=re.sub("(?i)(\s|\A|,)Jul\.", " July", text)
|
||||
text=re.sub("(?i)(\s|\A|,)Aug\.", " August", text)
|
||||
text=re.sub("(?i)(\s|\A|,)Sep\.", " September", text)
|
||||
text=re.sub("(?i)(\s|\A|,)Oct\.", " October", text)
|
||||
text=re.sub("(?i)(\s|\A|,)Nov\.", " November", text)
|
||||
text=re.sub("(?i)(\s|\A|,)Dec\.", " December", text)
|
||||
|
||||
# The sequence capital letter -> dot -> capital letter -> dot indicates an abbreviation.
|
||||
# Three repetitions of capital letter and dot are also common in filings,
|
||||
# so we need to check for three instances first.
|
||||
text=re.sub("( |\n|,)[A-Z]\.[A-Z]\.[A-Z]\.", " ", text)
|
||||
# now check for two instances
|
||||
text=re.sub("( |\n|,)[A-Z]\.[A-Z]\.", " ", text)
|
||||
|
||||
# Dots after a single letter can indicate a middle name, e.g., Paul J. Smith,
|
||||
# or an abbreviation --> also delete these.
|
||||
text=re.sub("( |\n|,)[A-Z]\.", "", text)
|
||||
|
||||
|
||||
# Hyphens can be used to indicate that the word is continued in the next
|
||||
# line. For example, "Micro-\nsoft" (\n is the line feed).
|
||||
# Replace hyphens followed by a line feed by a hyphen without line feed
|
||||
text=re.sub('-\n','-',text)
|
||||
|
||||
# Delete the minus/hyphens
|
||||
# "Short-term" -> "shortterm"
|
||||
text=re.sub('-','',text)
|
||||
|
||||
|
||||
# --> Only relevant for determining the number of sentences
|
||||
# Delete dots and commas that are not part of sentences, i.e. commas and dots
|
||||
# that are preceded by whitespace or line break and that are followed by
|
||||
# whitespace or line break.
|
||||
text=re.sub('\n(\.|,)\n','\n',text)
|
||||
text=re.sub(' (\.|,) ',' ',text)
|
||||
|
||||
# Delete single character words
|
||||
# One can argue whether one should implement this procedure. Loughran and
|
||||
# McDonald argue in one of their papers in favor of it.
|
||||
# To make sure that there is just one letter, we require that there is a
|
||||
# non-word character (\W) before and after. We use a positive lookbehind and a
|
||||
# positive lookahead to ensure that the neighboring non-word characters
|
||||
# do not get deleted as well.
|
||||
text=re.sub('(?i)(?<=\W)[a-z](?=\W)',' ',text)
|
||||
|
||||
|
||||
# There are sentences that are in upper case letters. However, these are not
|
||||
# "real" sentences. Examples: "RESTRICTIONS ON TRANSFER OF NOTE."
|
||||
# or "THIS NOTE AND THE RIGHTS AND OBLIGATIONS EVIDENCED HEREBY ARE
|
||||
# SUBORDINATED TO THE PRIOR PAYMENT OF CERTAIN OBLIGATIONS [...]"
|
||||
# We save the edited text in a new variable
|
||||
text_edited=text
|
||||
# Split text in sentences
|
||||
list_sentences=re.split('\.|!|\?', text)
|
||||
# iterate the list of all sentences
|
||||
for j in range(0,len(list_sentences)):
|
||||
# Determine the number of upper case letters
|
||||
upper_letters=len(re.findall('[A-Z]',list_sentences[j]))
|
||||
# Determine the number of all letters
|
||||
total_letters=len(re.findall('[A-Za-z]',list_sentences[j]))
|
||||
# If there is at least one letter calculate the fraction of upper case letters
|
||||
if total_letters>0:
|
||||
ratio=upper_letters/total_letters
|
||||
# If the fraction of upper case letters is larger than 0.9 delete
|
||||
# the sentence from the text.
|
||||
if ratio>0.9:
|
||||
text_edited=text_edited.replace(list_sentences[j]+'.','')
|
||||
text_edited=text_edited.replace(list_sentences[j]+'!','')
|
||||
text_edited=text_edited.replace(list_sentences[j]+'?','')
|
||||
|
||||
|
||||
# --> Only relevant for determining the number of sentences
|
||||
# There are a few cases where a dot follows a dot or where a linefeed
|
||||
# separates two dots. --> delete the second dot.
|
||||
text_edited=text_edited.replace('..','.')
|
||||
text_edited=text_edited.replace('.\n.','.')
|
||||
|
||||
# The following commands do not influence the subsequent textual analysis.
|
||||
# The only purpose is to display the output in a nicer format.
|
||||
# Replace lines that contain only whitespaces by a line feed.
|
||||
text_edited=re.sub('\n {1,}\n','\n',text_edited)
|
||||
|
||||
# Replace multiple line feeds by one line feed.
|
||||
text_edited=re.sub('\n{2,}','\n',text_edited)
|
||||
|
||||
|
||||
# Open the output file for the pure text
|
||||
output_file_10_k=open(directory+'10-K_Sample/'+cik+'_'+filename,'w',encoding='ascii',errors='ignore')
|
||||
output_file_10_k.write(text_edited)
|
||||
output_file_10_k.close()
|
||||
input_file_10_k.close()
|
||||
|
||||
input_file.close()
|
||||
114
lectures/programming/solutions/Problem_7_Tone_Analysis.py
Normal file
|
|
@ -0,0 +1,114 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Wed Apr 13 22:43:32 2016
|
||||
|
||||
@author: Alexander Hillert, Goethe University Frankfurt
|
||||
"""
|
||||
|
||||
import re
|
||||
|
||||
directory="C:/Lehre/Textual Analysis/Programming/Files/"
|
||||
|
||||
# Open the dictionary
|
||||
# The dictionary has been obtained from Bill McDonald's webpage
|
||||
# http://www3.nd.edu/~mcdonald/Word_Lists.html
|
||||
# --> LoughranMcDonald_MasterDictionary_2014.xlsx
|
||||
# --> select negative words and copy them to a txt file
|
||||
file_word_list=open(directory+'LMD_Neg.txt','r',encoding="utf-8")
|
||||
word_list=file_word_list.read()
|
||||
# The LMD words are all in upper case
|
||||
word_list=word_list.lower()
|
||||
negative_words=word_list.split('\n')
|
||||
|
||||
# Open the csv file containing the list of the 200 10-Ks
|
||||
input_file=open(directory+'10-K_Sample_2011Q1_Input.csv','r',encoding="utf-8")
|
||||
input_text=input_file.read()
|
||||
|
||||
# Split the input file in separate lines
|
||||
input_text_line=input_text.split("\n")
|
||||
|
||||
# In general, there can be empty lines in the input file. The following command
|
||||
# deletes these lines.
|
||||
while input_text_line.count("")>0:
|
||||
input_text_line.remove("")
|
||||
|
||||
# Create output file
|
||||
output_file=open(directory+'10-K_Sample_2011Q1_Output_Negative_Tone.csv','w',encoding="utf-8")
|
||||
# Write variable names to the first line of the output file
|
||||
output_file.write('CIK;Filename;Number_Words;Number_Negative_Words;\
|
||||
Percentage_Negative_Words\n')
|
||||
|
||||
# Loop over all lines of the csv file
|
||||
for i in range(1,len(input_text_line)):
|
||||
#for i in range(1,10):
|
||||
# If the execution of your scripts takes some time, printing the loop iterator
|
||||
# gives you an impression of the overall progress made.
|
||||
print(str(i))
|
||||
|
||||
# split the line into the two variables
|
||||
variables=input_text_line[i].split(";")
|
||||
# We need the CIK (1st column) and the filename (2nd column)
|
||||
cik=variables[0]
|
||||
filename=variables[1]
|
||||
|
||||
# modify file name to open the edited files
|
||||
filename=filename.replace('.txt','')
|
||||
# Open the ith 10-Ks in the list
|
||||
input_file_10_k=open(directory+'10-K_Sample/'+cik+'_'+filename+'_clean.txt','r',\
|
||||
encoding='ascii',errors='ignore')
|
||||
# if the command above does not work (error like "file not found" or "directory not found")
|
||||
# please use the following command:
|
||||
#input_file_10_k=open(directory+'10-K_Sample_clean/'+cik+'_'+filename+'_clean.txt','r',encoding='ascii',errors='ignore')
|
||||
input_text_10_k=input_file_10_k.read()
|
||||
|
||||
# Use lower case letters, too
|
||||
# It is important that the formatting (lower case vs. upper case) of the word list
|
||||
# and the document is identical. Remember that you have typically lower and upper case
|
||||
# letters in documents -> modify text.
|
||||
text=input_text_10_k.lower()
|
||||
|
||||
# Split the text in single words to determine the total number of words
|
||||
# \W is a non-word character: "Matches any character which is not a Unicode
|
||||
# word character." (Python documentation)
|
||||
# this is equivalent to [^a-zA-Z0-9_], i.e. no lower case letters, no upper
|
||||
# case letters, no numbers, and no underscore.
|
||||
list_of_words=re.split('\W{1,}', text)
|
||||
# to make sure that empty list elements do not bias the word count, we delete them.
|
||||
while list_of_words.count("")>0:
|
||||
list_of_words.remove("")
|
||||
# It is important that you treat multiple "\W" as one. Otherwise you are left
|
||||
# with elements in the list that are not actual words.
|
||||
|
||||
# Determine the total number of words
|
||||
word_count=len(list_of_words)
|
||||
|
||||
# Reset the number of negative words to zero
|
||||
negative_count=0
|
||||
# For each negative word, count the number of occurrences
|
||||
for j in range(len(negative_words)):
|
||||
# the command "list_of_words.count(negative_words[j])" only matches if there
|
||||
# is exact overlap between the jth negative word and the words in the list.
|
||||
# For example the following two commands:
|
||||
# list_of_words=["abandon","abandoned","abandonment"]
|
||||
# list_of_words.count("abandon")
|
||||
# yields 1 match
|
||||
# In contrast,
|
||||
# text_of_words="abandon abandoned abandonment"
|
||||
# text_of_words.count("abandon")
|
||||
# yields 3. Thus, you have to split the text into individual words!!!
|
||||
negative_count=negative_count+list_of_words.count(negative_words[j])
|
||||
|
||||
# Get the percentage of negative words
|
||||
percentage_negative=negative_count/word_count
|
||||
|
||||
# Write cik, file name, total number of words, number of negative words,
|
||||
# and the percentage of negative words to output file.
|
||||
output_file.write(cik+';'+filename+'_clean.txt;'+str(word_count)+';'\
|
||||
+str(negative_count)+';'+str(percentage_negative)+'\n')
|
||||
|
||||
# Close filings
|
||||
input_file_10_k.close()
|
||||
|
||||
print("Finished")
|
||||
output_file.close()
|
||||
input_file.close()
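# --- Optional illustration (not part of the original solution; made-up example) ---
# The difference between counting list elements and counting substrings,
# as explained in the loop above:
demo_words=["abandon","abandoned","abandonment"]
print(demo_words.count("abandon"))                        # 1: only exact list elements count
print("abandon abandoned abandonment".count("abandon"))   # 3: substrings also count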
|
||||
|
|
@ -0,0 +1,130 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Wed Apr 13 22:43:32 2016
|
||||
|
||||
@author: Alexander Hillert
|
||||
"""
|
||||
|
||||
import re
|
||||
|
||||
directory="C:/Lehre/Textual Analysis/Programming/Files/"
|
||||
|
||||
# Open the dictionary
|
||||
# The dictionary is obtained from Bill McDonald's webpage
|
||||
# http://www3.nd.edu/~mcdonald/Word_Lists.html
|
||||
# --> LoughranMcDonald_MasterDictionary_2014.xlsx
|
||||
# --> select positive words and copy them to a txt file
|
||||
file_word_list=open(directory+'LMD_Pos.txt','r',encoding="utf-8")
|
||||
word_list=file_word_list.read()
|
||||
word_list=word_list.lower()
|
||||
positive_words=word_list.split()
|
||||
|
||||
# Open the csv file containing the list of the 200 10-Ks
|
||||
input_file=open(directory+'10-K_Sample_2011Q1_Input.csv','r',encoding="utf-8")
|
||||
input_text=input_file.read()
|
||||
|
||||
# Split the Input File in separate lines
|
||||
input_text_line=input_text.split("\n")
|
||||
|
||||
# In general, there can be empty lines in the input file. The following command
|
||||
# deletes these lines.
|
||||
while input_text_line.count("")>0:
|
||||
input_text_line.remove("")
|
||||
|
||||
# Create output file
|
||||
output_file=open(directory+'10-K_Sample_2011Q1_Output_Positive_Tone.csv','w',encoding="utf-8")
|
||||
# Write variable names to the first line of the output file
|
||||
output_file.write('CIK;Filename;Number_Words;Number_Pos_Words;Number_Pos_Words_adj;'\
|
||||
+'Percent_Pos_Words;Percent_Pos_Words_adj\n')
|
||||
|
||||
# Iterate the list of the 200 10-K filings
|
||||
# empty lines have already been removed --> loop over all remaining data lines
|
||||
#for i in range(1,len(input_text_line)):
|
||||
for i in range(1,20): # For illustration only
|
||||
# If the execution of your scripts takes some time, printing the iterator
|
||||
# gives you an impression of the overall progress
|
||||
print(str(i))
|
||||
|
||||
# split the line into the two variables
|
||||
variables=input_text_line[i].split(";")
|
||||
# We need the CIK (1st column) and the filename (2nd column)
|
||||
cik=variables[0]
|
||||
filename=variables[1]
|
||||
|
||||
# modify file name to open the edited files
|
||||
filename=filename.replace('.txt','')
|
||||
|
||||
# Open the ith 10-K in the list
|
||||
input_file_10_k=open(directory+'/10-K_Sample/'+cik+"_"+filename+'_clean.txt','r',\
|
||||
encoding='ascii',errors='ignore')
|
||||
# if the command above does not work (error like "file not found" or "directory not found")
|
||||
# please use the following command:
|
||||
#input_file_10_k=open(directory+'10-K_Sample_clean/'+cik+'_'+filename+'_clean.txt','r',encoding='ascii',errors='ignore')
|
||||
input_text_10_k=input_file_10_k.read()
|
||||
|
||||
# Use lower case letters, too
|
||||
# It is important that the formatting (lower case vs. upper case) of the word list
|
||||
# and the document are identical. Remember that you have typically lower and upper case
|
||||
# letters in documents -> modify text
|
||||
text=input_text_10_k.lower()
|
||||
|
||||
# Split the text in single words to determine the total number of words
|
||||
list_of_words=re.split('\W{1,}', text)
|
||||
# to make sure that empty list elements do not bias the word count, we delete them.
|
||||
while list_of_words.count("")>0:
|
||||
list_of_words.remove("")
|
||||
|
||||
# Determine total number of words
|
||||
word_count=len(list_of_words)
|
||||
|
||||
# Reset the number of positive words and positive words adj. for negations to zero
|
||||
positive_count=0
|
||||
positive_count_adj=0
|
||||
# For each positive word, count the number of occurrences
|
||||
for j in range(len(positive_words)):
|
||||
# standard count operation without controlling for negations
|
||||
positive_words_found=list_of_words.count(positive_words[j])
|
||||
|
||||
# Loughran and McDonald (2011, JF, p.44): "We account for simple negation
|
||||
# only for Fin-Pos words. Simple negation is taken to be observations
|
||||
# of one of six words (no, not, none, neither, never, nobody) occurring
|
||||
# within three words preceding a positive word."
|
||||
|
||||
# When we have identified positive words we need to search for negations
|
||||
while positive_words_found>0:
|
||||
# identify the position of the matched positive word in the list of all words
|
||||
position_of_word=list_of_words.index(positive_words[j])
|
||||
# identify the three words before the positive word and add them to a list
|
||||
# the \ is a line break
|
||||
list_negation=[list_of_words[max(0,position_of_word-3)],\
|
||||
list_of_words[max(0,position_of_word-2)],list_of_words[max(0,position_of_word-1)]]
|
||||
# check whether one of the three words in list_negation is a negation
|
||||
negation_found=list_negation.count('no')+list_negation.count('not')+\
|
||||
list_negation.count('none')+list_negation.count('neither')+\
|
||||
list_negation.count('never')+list_negation.count('nobody')
|
||||
|
||||
if negation_found==0:
|
||||
# no negation
|
||||
positive_count_adj=positive_count_adj+1
|
||||
positive_count=positive_count+1
|
||||
else:
|
||||
# negation
|
||||
positive_count=positive_count+1
|
||||
|
||||
# delete the matched positive words in the original document
|
||||
list_of_words[position_of_word]=''
|
||||
# check whether there are further matches of the jth positive word
|
||||
positive_words_found=list_of_words.count(positive_words[j])
|
||||
|
||||
# Write cik, file name, total number of words, and number of positive
|
||||
# and adjusted positive words to the output file
|
||||
output_file.write(cik+';'+filename+'_clean.txt;'+str(word_count)+';'+\
|
||||
str(positive_count)+';'+str(positive_count_adj)+';'+str(positive_count/word_count)+\
|
||||
';'+str(positive_count_adj/word_count)+'\n')
|
||||
|
||||
# Close filings
|
||||
input_file_10_k.close()
|
||||
|
||||
print("Finished")
|
||||
output_file.close()
|
||||
input_file.close()
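# --- Optional illustration (not part of the original solution; made-up example) ---
# The three-word negation window used above, applied to a toy word list:
demo_list=["results","were","not","good","this","year"]
position=demo_list.index("good")
window=demo_list[max(0,position-3):position]
negations=["no","not","none","neither","never","nobody"]
print(any(word in negations for word in window))   # True: a negation occurs within the three preceding words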
|
||||
111
lectures/programming/solutions/Problem_9_Words_per_Sentence.py
Normal file
|
|
@ -0,0 +1,111 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Wed Apr 13 22:43:32 2016
|
||||
|
||||
@author: Alexander Hillert, Goethe University Frankfurt
|
||||
"""
|
||||
|
||||
# We split the text into words and sentences using regular expressions
|
||||
import re
|
||||
# For comparison, we also include the NLTK tokenizer
|
||||
from nltk.tokenize import sent_tokenize
|
||||
|
||||
directory="C:/Lehre/Textual Analysis/Programming/Files/"
|
||||
|
||||
# Open the csv file containing the list of the 200 10-Ks
|
||||
input_file=open(directory+'10-K_Sample_2011Q1_Input.csv','r',encoding="utf-8")
|
||||
input_text=input_file.read()
|
||||
|
||||
# Create output file
|
||||
output_file=open(directory+'10-K_Sample_2011Q1_Output_WPS.csv','w',encoding="utf-8")
|
||||
# Write variable names to the first line of the output file
|
||||
output_file.write('CIK;Filename;Number_Words;Number_of_Sentences;'\
|
||||
'Number_of_Sentences_1;Number_of_Sentences_2;Number_of_Sentences_false;'\
|
||||
'Number_of_Sentences_NLTK;WPS;WPS_1;WPS_2;WPS_false;WPS_NLTK\n')
|
||||
|
||||
# Split the Input File in separate lines
|
||||
input_text_line=input_text.split("\n")
|
||||
|
||||
# In general, there can be empty lines in the input file. The following command
|
||||
# deletes these lines.
|
||||
while input_text_line.count("")>0:
|
||||
input_text_line.remove("")
|
||||
|
||||
# Loop over all lines
|
||||
for i in range(1,len(input_text_line)):
|
||||
print(str(i))
|
||||
# split the line into the two variables
|
||||
variables=input_text_line[i].split(";")
|
||||
# We need the CIK and the filename
|
||||
cik=variables[0]
|
||||
filename=variables[1]
|
||||
filename=filename.replace('.txt','')
|
||||
|
||||
# Open the ith 10-K in the list
|
||||
input_file_10_k=open(directory+'10-K_Sample/'+cik+"_"+filename+'_clean.txt','r',\
|
||||
encoding='ascii',errors='ignore')
|
||||
text=input_file_10_k.read()
|
||||
|
||||
# Determine number of sentences and number of words
|
||||
# Split the text in words to determine the total number of words
|
||||
list_of_words=re.split('\W{1,}', text)
|
||||
# to make sure that empty list elements do not bias the word count, we delete them.
|
||||
while list_of_words.count("")>0:
|
||||
list_of_words.remove("")
|
||||
# Determine total number of words
|
||||
word_count=len(list_of_words)
|
||||
|
||||
|
||||
# Split the text by symbols that indicate the end of a sentence
|
||||
# to determine the total number of sentences
|
||||
list_of_sentences=re.split('[\.!\?]{1,}', text)
|
||||
while list_of_sentences.count("")>0:
|
||||
list_of_sentences.remove("")
|
||||
# Alternative 1:
|
||||
list_of_sentences_1=re.split('(?:\.|!|\?){1,}', text)
|
||||
while list_of_sentences_1.count("")>0:
|
||||
list_of_sentences_1.remove("")
|
||||
# Alternative 2:
|
||||
list_of_sentences_2=re.split('\.{1,}|!{1,}|\?{1,}', text)
|
||||
while list_of_sentences_2.count("")>0:
|
||||
list_of_sentences_2.remove("")
|
||||
# Incorrect approach:
|
||||
# re.split splits the string by the occurrences of the pattern.
|
||||
# If capturing parentheses, i.e. (), are used in the pattern, then the text
|
||||
# of all groups in the pattern is also returned as part of the resulting list.
|
||||
# See https://docs.python.org/3/library/re.html#re.split for details
|
||||
list_of_sentences_false=re.split('(\.|!|\?){1,}', text)
|
||||
while list_of_sentences_false.count("")>0:
|
||||
list_of_sentences_false.remove("")
|
||||
|
||||
# For comparison, we also include the NLTK tokenizer
|
||||
list_of_sentences_nltk=sent_tokenize(text)
|
||||
|
||||
# Determine total number of sentences
|
||||
sentence_count=len(list_of_sentences)
|
||||
sentence_count_1=len(list_of_sentences_1)
|
||||
sentence_count_2=len(list_of_sentences_2)
|
||||
sentence_count_false=len(list_of_sentences_false)
|
||||
sentence_count_nltk=len(list_of_sentences_nltk)
|
||||
|
||||
# Ratio of # of words over # of sentences
|
||||
wps=word_count/sentence_count
|
||||
wps_1=word_count/sentence_count_1
|
||||
wps_2=word_count/sentence_count_2
|
||||
wps_false=word_count/sentence_count_false
|
||||
wps_nltk=word_count/sentence_count_nltk
|
||||
|
||||
# Write cik, file name, total number of words, total number of sentences,
|
||||
# and WPS to the output file
|
||||
output_file.write(cik+';'+filename+'_clean.txt;'+str(word_count)+';'+\
|
||||
str(sentence_count)+';'+str(sentence_count_1)+';'+str(sentence_count_2)+';'+\
|
||||
str(sentence_count_false)+';'+str(sentence_count_nltk)+';'+str(wps)+';'+\
|
||||
str(wps_1)+';'+str(wps_2)+';'+str(wps_false)+';'+str(wps_nltk)+'\n')
|
||||
|
||||
# Close filing
|
||||
input_file_10_k.close()
|
||||
|
||||
|
||||
print("Finished")
|
||||
output_file.close()
|
||||
input_file.close()
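# --- Optional illustration (not part of the original solution; made-up sentence) ---
# Why capturing parentheses distort the sentence count:
demo_sentences="First sentence. Second sentence! Third?"
print(re.split('(?:\.|!|\?){1,}', demo_sentences))   # delimiters are dropped
print(re.split('(\.|!|\?){1,}', demo_sentences))     # delimiters come back as extra list elements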
|
||||