1
0
Fork 0
whu-textual-analysis/lectures/programming/solutions/Problem_7_Tone_Analysis.py

115 lines
4.7 KiB
Python
Raw Normal View History

# -*- coding: utf-8 -*-
"""
Created on Wed Apr 13 22:43:32 2016
@author: Alexander Hillert, Goethe University Frankfurt
"""
import re
from collections import Counter
# Base folder; every file name used in this script is appended to this string.
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# Open the dictionary
# The dictionary has been obtained from Bill McDonald's webpage
# http://www3.nd.edu/~mcdonald/Word_Lists.html
# --> LoughranMcDonald_MasterDictionary_2014.xlsx
# --> select negative words and copy them to a txt file
# The "with" statement closes the file as soon as its content has been read,
# even if reading fails; the file is not needed afterwards.
with open(directory+'LMD_Neg.txt','r',encoding="utf-8") as file_word_list:
    word_list=file_word_list.read()
# The LMD words are all in upper case -> convert them to lower case so that
# they can be compared with lower-cased document text.
word_list=word_list.lower()
# One list element per line of the txt file.
negative_words=word_list.split('\n')
# Open the csv file containing the list of the 200 10-Ks
# The whole file is read at once inside the "with" block, which closes the
# file right after reading. Calling input_file.close() again later on is
# harmless, because closing an already closed file does nothing.
with open(directory+'10-K_Sample_2011Q1_Input.csv','r',encoding="utf-8") as input_file:
    input_text=input_file.read()
# Split the input file in separate lines.
# In general, there can be empty lines in the input file. The list
# comprehension keeps only the non-empty lines (one pass over the list instead
# of repeatedly searching for and removing "" elements).
input_text_line=[line for line in input_text.split("\n") if line]
# Create the output file; it is opened in write mode, i.e. an existing file
# with the same name is overwritten.
output_file=open(directory+'10-K_Sample_2011Q1_Output_Negative_Tone.csv','w',encoding="utf-8")
# The first line of the output file holds the variable names, separated by
# semicolons.
header_columns=['CIK','Filename','Number_Words','Number_Negative_Words',
                'Percentage_Negative_Words']
output_file.write(';'.join(header_columns)+'\n')
# Pattern used to split a text into single words. It is compiled once, because
# the same pattern is applied to every filing.
# \W is a non-word character: "Matches any character which is not a Unicode
# word character." (Python documentation)
# The filings are decoded as ASCII below, so here this is equivalent to
# [^a-zA-Z0-9_], i.e. no lower case letters, no upper case letters, no numbers,
# and no underscore.
# It is important that you treat multiple "\W" as one (the "+"). Otherwise you
# are left with elements in the list that are not actual words.
# The pattern is written as a raw string (r'...') so that Python does not try
# to interpret "\W" as an escape sequence of the string itself.
word_splitter = re.compile(r'\W+')

# Loop over all lines of the csv file. The first line (index 0) is skipped --
# presumably it contains the column names.
for i, input_line in enumerate(input_text_line[1:], start=1):
    # If the execution of your scripts takes some time, printing the loop
    # iterator gives you an impression of the overall progress made.
    print(i)

    # Split the line into its variables.
    variables = input_line.split(";")
    # We need the CIK (1st column) and the filename (2nd column)
    cik = variables[0]
    filename = variables[1]
    # Modify the file name to open the edited files
    filename = filename.replace('.txt', '')

    # Open the ith 10-K in the list. Characters that are not ASCII are dropped
    # (errors='ignore'). The "with" statement closes the filing even if
    # reading it raises an error.
    # If the command below does not work (error like "file not found" or
    # "directory not found"), the cleaned filings may be stored in the folder
    # '10-K_Sample_clean/' instead of '10-K_Sample/' -> adjust the folder name.
    filing_path = directory + '10-K_Sample/' + cik + '_' + filename + '_clean.txt'
    with open(filing_path, 'r', encoding='ascii', errors='ignore') as input_file_10_k:
        input_text_10_k = input_file_10_k.read()

    # Use lower case letters, too.
    # It is important that the formatting (lower case vs. upper case) of the
    # word list and the document is identical. Remember that you have typically
    # lower and upper case letters in documents -> modify text.
    text = input_text_10_k.lower()

    # Split the text in single words to determine the total number of words.
    # To make sure that empty list elements do not bias the word count, they
    # are left out (they can occur at the very beginning or end of the text).
    list_of_words = [word for word in word_splitter.split(text) if word]

    # Determine the total number of words
    word_count = len(list_of_words)

    # Count how often each distinct word occurs in the filing (one pass over
    # the document). Looking up a word that does not occur yields 0.
    word_frequencies = Counter(list_of_words)

    # For each negative word, add the number of its occurrences.
    # Only exact matches between a negative word and a word of the document
    # are counted. For example, in
    #   list_of_words=["abandon","abandoned","abandonment"]
    # the word "abandon" is matched 1 time.
    # In contrast, counting in the unsplit text
    #   text_of_words="abandon abandoned abandonment"
    #   text_of_words.count("abandon")
    # yields 3. Thus, you have to split the text to individual words!!!
    negative_count = sum(word_frequencies[word] for word in negative_words)

    # Get the percentage of negative words. The value is stored as a share
    # (negative words divided by total words), it is not multiplied by 100.
    # A filing without any words gets a share of 0.0 instead of stopping the
    # whole script with a division-by-zero error.
    if word_count > 0:
        percentage_negative = negative_count / word_count
    else:
        percentage_negative = 0.0

    # Write cik, file name, total number of words, number of negative words,
    # and the percentage of negative words to output file.
    output_fields = [cik, filename + '_clean.txt', str(word_count),
                     str(negative_count), str(percentage_negative)]
    output_file.write(';'.join(output_fields) + '\n')
# Tell the user that all filings have been processed, then close the output
# file (this makes sure everything is written to disk) and the input csv file.
print("Finished")
for open_handle in (output_file, input_file):
    open_handle.close()