Alexander Hess
a37c87d9c8
- add the code files provided by the instructor - the programming/files folder with the data files is NOT included here due to its size - add a .gitignore file to exclude the data files' folder
287 lines
11 KiB
Python
287 lines
11 KiB
Python
# -*- coding: utf-8 -*-
|
|
"""
|
|
Created on Wed Jul 29 11:07:10 2015
|
|
|
|
@author: Alexander Hillert, Goethe University Frankfurt
|
|
"""
|
|
|
|
import re
|
|
from nltk.tokenize import word_tokenize
|
|
from nltk.corpus import stopwords
|
|
from nltk.stem import PorterStemmer
|
|
from collections import Counter
|
|
|
|
|
|
# Porter stemmer instance that is shared by all stemming steps of the program.
ps=PorterStemmer()

# Folder that contains the input files and that receives the output file.
directory="C:/Lehre/Textual Analysis/Programming/Files/"

# Read the csv file containing the list of the 10-Ks from MSFT, KO, and 3M.
# The "with" statement closes the file even if reading fails. The name
# input_file stays bound afterwards; calling close() on it again is harmless.
with open(directory+'list_10-K_filings_textual_similarity.csv','r',encoding="utf-8") as input_file:
    input_text=input_file.read()

# Split the input file into separate lines.
# In general, there can be empty lines in the input file. They are dropped in
# a single pass instead of repeatedly searching the list for "".
input_text_line=[line for line in input_text.split("\n") if line!=""]
|
|
|
|
# Open the output csv file in which we write the similarities
|
|
# Create the output csv file in which the similarities are written.
output_file=open(directory+'list_10-K_filings_textual_similarity_jaccard.csv','w',encoding="utf-8")

# The first line holds the variable names: the header of the input file
# followed by one column name per similarity measure.
jaccard_columns=[
    'Jaccard',
    'Jaccard_own_stop_words',
    'Jaccard_NLTK_stop_words',
    'Jaccard_stemmed',
    'Jaccard_stemmed_own_stop_words',
    'Jaccard_stemmed_NLTK_stop_words',
]
output_file.write(input_text_line[0]+';'+';'.join(jaccard_columns)+'\n')
|
|
|
|
# Read own stop word list
|
|
# This list has been created by manually selecting words from the csv-file
|
|
# 100_most_frequent_words.csv, which is created by the Python program
|
|
# "Problem_12_Most_Frequent_Words.py".
|
|
# Simply delete words you consider to be meaningless and that are frequently
|
|
# used.
|
|
# Read the whole stop word file. The "with" statement closes the file even if
# reading fails. The name stop_word_file stays bound afterwards; calling
# close() on it again is harmless.
with open(directory+'Stop_Word_List_Alexander.csv','r',encoding="utf-8") as stop_word_file:
    stop_word_text=stop_word_file.read()

# Drop every empty line. list.remove("") would raise a ValueError if the file
# does not end with a line break and would remove only one empty line otherwise.
stop_word_line=[line for line in stop_word_text.split("\n") if line!=""]

# The first line is skipped; in every further line the stop word is the
# second of the semicolon-separated columns.
own_stop_words=[line.split(";")[1] for line in stop_word_line[1:]]

print("This is the list of my stop words:")
print(own_stop_words)
|
|
|
|
# Read NLTK stop word list
|
|
# Load NLTK's English stop word list and store it as a set.
english_stop_words=stopwords.words("english")
NLTK_stop_words=set(english_stop_words)

# Show the stop words; the headline and the set are printed on separate lines.
print("This is the list of NLTK stop words:",NLTK_stop_words,sep="\n")
|
|
|
|
# set default values for variables
|
|
# This is not required. However, if you don't do it, Spyder will suggest that the line
# jaccard_similarity=jaccard(word_list_edited,word_list_old_edited)
# is incorrect, as word_list_old_edited is not yet defined at that point in the program
# code. In this specific example, this will not cause an error, as we do not enter
# the if condition when i=1 -> the default values are never actually compared.
|
|
# Default values for the word lists of the current and of the previous filing.
# All of them hold lists of words later on, so every default is an empty list
# (instead of a mix of empty lists and empty strings).
word_list_old_edited=[]
word_list_edited=[]
word_list_old_NLTK_filtered=[]
word_list_old_own_filtered=[]
word_list_old_edited_stemmed=[]
word_list_old_own_filtered_stemmed=[]
word_list_old_NLTK_filtered_stemmed=[]
|
|
|
|
#######################################################
|
|
# Define a function that computes Jaccard similarity
|
|
# As we need these operations several times, it is
|
|
# helpful to use a function.
|
|
######################################################
|
|
# beginning of the function
|
|
def jaccard(text1,text2):
    """Return the Jaccard similarity of two tokenized texts.

    The similarity is the number of distinct words that occur in both texts
    divided by the number of distinct words that occur in at least one of
    them. How often a word occurs does not matter.

    Parameters:
        text1: iterable of hashable tokens (e.g. a list of words).
        text2: iterable of hashable tokens (e.g. a list of words).

    Returns:
        A float between 0.0 (no common word) and 1.0 (identical vocabulary).
        If both texts are empty, 1.0 is returned, because two empty
        vocabularies are identical; this also avoids a division by zero.
    """
    vocabulary1=set(text1)
    vocabulary2=set(text2)

    union=vocabulary1 | vocabulary2
    if not union:
        # Both texts are empty -> the ratio 0/0 is undefined.
        return 1.0

    intersection=vocabulary1 & vocabulary2
    return len(intersection)/len(union)
|
|
# end of the function
|
|
|
|
|
|
# Loop over all lines
|
|
# Membership tests against a set take constant time, which matters because
# every word of every filing is checked. own_stop_words itself is not changed.
own_stop_word_set=set(own_stop_words)

try:
    # Loop over all lines; line 0 is skipped.
    for i in range(1,len(input_text_line)):
        print(str(i))
        # split the line into the eight variables
        variables=input_text_line[i].split(";")
        # We need the CIK (1st column) and the filename (8th column)
        cik=variables[0]
        filename_parts=variables[7].split('/')
        filename=filename_parts[3].replace('.txt','')

        # Write the information from the input file to the output file.
        # We do not add a line break at the end, as we must append the
        # similarity scores first.
        output_file.write(input_text_line[i])

        # Read the ith 10-K; remember to specify the encoding.
        # The "with" statement closes the filing even if reading fails.
        with open(directory+'10-K_Textual_Similarity/'+cik+'_'+filename+
                  '_edited.txt','r',encoding='ascii',errors='ignore') as input_file_10_k:
            input_text_10_k=input_file_10_k.read()

        # check whether the previous entry of the list is from the same firm
        permco=variables[1]
        permco_old=input_text_line[i-1].split(";")[1]
        same_firm=permco==permco_old

        # Split text into words
        word_list_edited=word_tokenize(input_text_10_k.lower())

        # Remove stop words (own list / NLTK list). Each filtered list is
        # built once and reused for the stemmed variant below.
        word_list_own_filtered=[word for word in word_list_edited
                                if word not in own_stop_word_set]
        word_list_NLTK_filtered=[word for word in word_list_edited
                                 if word not in NLTK_stop_words]

        # Create the stemmed texts. The stop words are removed first and the
        # remaining words are stemmed afterwards, because the stop word lists
        # contain inflected (not stemmed) words.
        word_list_edited_stemmed=[ps.stem(word) for word in word_list_edited]
        word_list_own_filtered_stemmed=[ps.stem(word)
                                        for word in word_list_own_filtered]
        word_list_NLTK_filtered_stemmed=[ps.stem(word)
                                         for word in word_list_NLTK_filtered]

        # Sub Tasks 1 to 6, in the order of the columns of the output file:
        # edited text; own stop words removed; NLTK stop words removed;
        # stemmed; stemmed without own stop words; stemmed without NLTK
        # stop words.
        current_word_lists=(
            word_list_edited,
            word_list_own_filtered,
            word_list_NLTK_filtered,
            word_list_edited_stemmed,
            word_list_own_filtered_stemmed,
            word_list_NLTK_filtered_stemmed,
        )
        previous_word_lists=(
            word_list_old_edited,
            word_list_old_own_filtered,
            word_list_old_NLTK_filtered,
            word_list_old_edited_stemmed,
            word_list_old_own_filtered_stemmed,
            word_list_old_NLTK_filtered_stemmed,
        )
        for current_words,previous_words in zip(current_word_lists,previous_word_lists):
            if same_firm:
                # compute Jaccard similarity with the previous filing
                jaccard_similarity=jaccard(current_words,previous_words)
                output_file.write(";"+str(jaccard_similarity))
            else:
                # The previous filing is not from the same firm -> cannot
                # compute Jaccard similarity; the column stays empty.
                output_file.write(";")

        # Save the current word lists for the comparison with the next report.
        (word_list_old_edited,
         word_list_old_own_filtered,
         word_list_old_NLTK_filtered,
         word_list_old_edited_stemmed,
         word_list_old_own_filtered_stemmed,
         word_list_old_NLTK_filtered_stemmed)=current_word_lists

        # Write line break to output file
        output_file.write("\n")
finally:
    # Close the files even if a filing is missing or a line is malformed, so
    # that the results written so far are flushed to the output file.
    input_file.close()
    output_file.close()
    stop_word_file.close()

print("Task done!")
|
|
|