1
0
Fork 0
whu-textual-analysis/lectures/programming/solutions/Problem_14_Jaccard_Similarity.py
Alexander Hess a37c87d9c8
Add programming files
- add the code files provided by the instructor
- the programming/files folder with the data files is NOT included
  here due to its size
- add a .gitignore file to exclude the data files' folder
2022-08-05 00:06:58 +02:00

287 lines
11 KiB
Python

# -*- coding: utf-8 -*-
"""
Created on Wed Jul 29 11:07:10 2015
@author: Alexander Hillert, Goethe University Frankfurt
"""
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from collections import Counter
# Porter stemmer instance that is reused for every stemming step below.
ps = PorterStemmer()
# Folder that contains the filing list, the stop word list and the 10-K files.
directory = "C:/Lehre/Textual Analysis/Programming/Files/"

# Read the csv file containing the list of the 10-Ks from MSFT, KO, and 3M.
# The handle is released as soon as the content is in memory. The name
# input_file stays bound afterwards, so the input_file.close() call at the end
# of the script remains valid (closing an already closed file does nothing).
with open(directory + 'list_10-K_filings_textual_similarity.csv', 'r',
          encoding="utf-8") as input_file:
    input_text = input_file.read()

# Split the input file into separate lines.
# In general, there can be empty lines in the input file. They are dropped in
# a single pass instead of repeatedly calling count()/remove(), which rescans
# the whole list for every empty line.
input_text_line = [line for line in input_text.split("\n") if line != ""]

# Open the output csv file in which we write the similarities.
# It has to stay open here: the main loop appends one row per filing and the
# script closes the file at its very end.
output_file = open(
    directory + 'list_10-K_filings_textual_similarity_jaccard.csv',
    'w', encoding="utf-8")

# Write variable names to first line: the header of the input file followed by
# one column name per similarity measure.
output_file.write(
    input_text_line[0]
    + ';Jaccard;Jaccard_own_stop_words;Jaccard_NLTK_stop_words'
      ';Jaccard_stemmed;Jaccard_stemmed_own_stop_words'
      ';Jaccard_stemmed_NLTK_stop_words\n'
)
# Read own stop word list
# This list has been created by manually selecting words from the csv-file
# 100_most_frequent_words.csv, which is created by the Python program
# "Problem_12_Most_Frequent_Words.py".
# Simply delete words you consider to be meaningless and that are frequently
# used.
# The name stop_word_file stays bound after the with-block, so a later
# stop_word_file.close() is still valid (it is a no-op on a closed file).
with open(directory + 'Stop_Word_List_Alexander.csv', 'r',
          encoding="utf-8") as stop_word_file:
    stop_word_text = stop_word_file.read()

# Drop ALL empty lines. list.remove("") would raise a ValueError if the file
# has no empty line (e.g. no trailing line break) and would leave further
# empty lines in place, which then fail at split(";")[1] below.
stop_word_line = [line for line in stop_word_text.split("\n") if line != ""]

# The first line is skipped (presumably the header row -- confirm against the
# csv file); the stop word itself is taken from the second column of each line.
own_stop_words = [line.split(";")[1] for line in stop_word_line[1:]]
print("This is the list of my stop words:")
print(own_stop_words)

# Read NLTK stop word list; stored as a set for fast membership tests.
NLTK_stop_words = set(stopwords.words("english"))
print("This is the list of NLTK stop words:")
print(NLTK_stop_words)
# Set default values for the variables that hold the word lists of the
# current and of the previous filing.
# This is not strictly required. However, if you don't do it, Spyder will
# suggest that a line such as
#   jaccard_similarity=jaccard(word_list_edited,word_list_old_edited)
# is incorrect, as word_list_old_edited is not yet defined at that point in
# the program code. In this specific example, this would not cause an error:
# the similarity is only computed when the previous line of the input file
# belongs to the same firm, and for i=1 the previous line is the line with the
# variable names, so the defaults are not expected to be read.
# All defaults are empty lists, i.e. the same type as the word lists that
# replace them inside the main loop.
word_list_old_edited = []
word_list_edited = []
word_list_old_NLTK_filtered = []
word_list_old_own_filtered = []
word_list_old_edited_stemmed = []
word_list_old_own_filtered_stemmed = []
word_list_old_NLTK_filtered_stemmed = []
#######################################################
# Define a function that computes Jaccard similarity
# As we need these operations several times, it is
# helpful to use a function.
######################################################
# beginning of the function
def jaccard(text1, text2):
    """Return the Jaccard similarity of two collections of words.

    The similarity is the number of distinct words that occur in both inputs
    divided by the number of distinct words that occur in at least one of
    them. How often a word occurs within one input does not matter.

    Parameters:
        text1: iterable of hashable items (e.g. a list of words).
        text2: iterable of hashable items (e.g. a list of words).

    Returns:
        A float between 0.0 (no common word) and 1.0 (identical vocabulary).
        If both inputs are empty, 1.0 is returned: two empty texts have the
        same (empty) vocabulary, and this avoids a division by zero.
    """
    vocabulary1 = set(text1)
    vocabulary2 = set(text2)
    union = vocabulary1 | vocabulary2
    if not union:
        # Both inputs are empty -> the ratio 0/0 is undefined.
        return 1.0
    intersection = vocabulary1 & vocabulary2
    return len(intersection) / len(union)
# end of the function
def _similarity_cell(same_firm, current_words, previous_words):
    """Return one csv cell (including the leading ';') of the output row.

    Parameters:
        same_firm: True if the previous filing belongs to the same firm.
        current_words: word list of the current filing.
        previous_words: word list of the previous filing.

    Returns:
        ';' followed by the Jaccard similarity of the two word lists, or just
        ';' (an empty cell) if the previous filing is from a different firm
        and no similarity can be computed.
    """
    if same_firm:
        return ";" + str(jaccard(current_words, previous_words))
    return ";"


# Set copy of the personal stop word list: membership tests against a set do
# not have to scan the whole list for every single word of a filing.
own_stop_word_set = set(own_stop_words)

try:
    # Loop over all lines; line 0 holds the variable names and is skipped.
    for i in range(1, len(input_text_line)):
        print(str(i))
        # split the line into the eight variables
        variables = input_text_line[i].split(";")
        # We need the CIK (1st column) and the filename (8th column)
        cik = variables[0]
        filename_parts = re.split('/', variables[7])
        filename = filename_parts[3].replace('.txt', '')

        # check whether the previous entry of the list is from the same firm
        permco = variables[1]
        permco_old = input_text_line[i-1].split(";")[1]
        same_firm = permco == permco_old

        # Open the ith 10-K; remember to specify the encoding.
        # errors='ignore' silently drops every byte that is not ASCII.
        # The with-statement closes the filing even if reading fails.
        with open(directory + '10-K_Textual_Similarity/' + cik + '_'
                  + filename + '_edited.txt', 'r', encoding='ascii',
                  errors='ignore') as input_file_10_k:
            input_text_10_k = input_file_10_k.read()

        # Split text into words
        word_list_edited = word_tokenize(input_text_10_k.lower())

        # Remove stop words using the personal and the NLTK stop word list.
        # Both filtered lists are built once and used for the unstemmed AND
        # for the stemmed comparisons.
        word_list_own_filtered = [
            word for word in word_list_edited
            if word not in own_stop_word_set]
        word_list_NLTK_filtered = [
            word for word in word_list_edited
            if word not in NLTK_stop_words]

        # Create stemmed texts.
        # Caution; in general, it is not clear whether you should first stem
        # or first remove stop words. Here, the stop words are removed first
        # and the remaining words are stemmed afterwards: the personal list is
        # based on the inflected text, and the NLTK list seems to be based on
        # inflected text as well (e.g. it contains "having").
        # Every distinct word is stemmed only once; the three stemmed lists
        # look the result up instead of calling the stemmer again.
        stem_of = {word: ps.stem(word) for word in set(word_list_edited)}
        word_list_edited_stemmed = [
            stem_of[word] for word in word_list_edited]
        word_list_own_filtered_stemmed = [
            stem_of[word] for word in word_list_own_filtered]
        word_list_NLTK_filtered_stemmed = [
            stem_of[word] for word in word_list_NLTK_filtered]

        # Pairs of (current word list, word list of the previous filing) in
        # the order of the similarity columns of the header line:
        # Sub Task 1: plain text
        # Sub Task 2: own stop words removed
        # Sub Task 3: NLTK stop words removed
        # Sub Task 4: stemmed text
        # Sub Task 5: own stop words removed, then stemmed
        # Sub Task 6: NLTK stop words removed, then stemmed
        comparisons = [
            (word_list_edited, word_list_old_edited),
            (word_list_own_filtered, word_list_old_own_filtered),
            (word_list_NLTK_filtered, word_list_old_NLTK_filtered),
            (word_list_edited_stemmed, word_list_old_edited_stemmed),
            (word_list_own_filtered_stemmed,
             word_list_old_own_filtered_stemmed),
            (word_list_NLTK_filtered_stemmed,
             word_list_old_NLTK_filtered_stemmed),
        ]
        cells = [_similarity_cell(same_firm, current, previous)
                 for current, previous in comparisons]

        # Write the information from the input file, the six similarity cells
        # and the line break in one go, so that a failure while processing a
        # filing cannot leave a half-written row in the output file.
        output_file.write(input_text_line[i] + "".join(cells) + "\n")

        # Save the current word vectors to separate variables for the
        # comparison with the next report.
        word_list_old_edited = word_list_edited
        word_list_old_own_filtered = word_list_own_filtered
        word_list_old_NLTK_filtered = word_list_NLTK_filtered
        word_list_old_edited_stemmed = word_list_edited_stemmed
        word_list_old_own_filtered_stemmed = word_list_own_filtered_stemmed
        word_list_old_NLTK_filtered_stemmed = word_list_NLTK_filtered_stemmed
finally:
    # Close the files even if the loop stopped with an error, so that the rows
    # written so far are flushed to the output file.
    input_file.close()
    output_file.close()
    stop_word_file.close()
print("Task done!")