whu-textual-analysis/lectures/programming/solutions/Problem_14_Jaccard_Similarity.py

# -*- coding: utf-8 -*-
"""
Created on Wed Jul 29 11:07:10 2015

@author: Alexander Hillert, Goethe University Frankfurt
"""

import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from collections import Counter


# Porter stemmer instance used to create the stemmed word lists
ps=PorterStemmer()

# Folder that contains the csv files and the subfolder with the 10-Ks
directory="C:/Lehre/Textual Analysis/Programming/Files/"

# Open the csv file containing the list of the 10-Ks from MSFT, KO, and 3M.
# The with-statement closes the file as soon as its content has been read,
# even if reading fails. The name input_file stays defined afterwards, and
# calling input_file.close() again on the closed file is a harmless no-op.
with open(directory+'list_10-K_filings_textual_similarity.csv','r',encoding="utf-8") as input_file:
    input_text=input_file.read()

# Split the input file into separate lines.
# In general, there can be empty lines in the input file. They are dropped in
# the same pass instead of repeatedly searching the list and removing them.
input_text_line=[line for line in input_text.split("\n") if line!=""]

# Create the output csv file that will receive the similarity scores.
# It stays open while the filings are processed.
output_file=open(directory+'list_10-K_filings_textual_similarity_jaccard.csv','w',encoding="utf-8")
# The first line holds the variable names: the header of the input file
# followed by one column per similarity measure.
similarity_header=(';Jaccard;Jaccard_own_stop_words;'
                   'Jaccard_NLTK_stop_words;Jaccard_stemmed;Jaccard_stemmed_own_stop_words;'
                   'Jaccard_stemmed_NLTK_stop_words\n')
output_file.write(input_text_line[0]+similarity_header)

# Read own stop word list
# This list has been created by manually selecting words from the csv-file
# 100_most_frequent_words.csv, which is created by the Python program
# "Problem_12_Most_Frequent_Words.py".
# Simply delete words you consider to be meaningless and that are frequently
# used.
# The with-statement closes the file right after reading; the name
# stop_word_file stays defined, so closing it again later is a no-op.
with open(directory+'Stop_Word_List_Alexander.csv','r',encoding="utf-8") as stop_word_file:
    stop_word_text=stop_word_file.read()

# Drop all empty lines. list.remove("") would raise a ValueError if the file
# has no empty line (e.g., no line break at the end) and would delete only
# the first empty line if there are several.
stop_word_line=[line for line in stop_word_text.split("\n") if line!=""]

# The first line contains the variable names and is skipped.
# The stop word is stored in the second column of each remaining line.
own_stop_words=[line.split(";")[1] for line in stop_word_line[1:]]

print("This is the list of my stop words:")
print(own_stop_words)

# Read NLTK stop word list
NLTK_stop_words=set(stopwords.words("english"))
print("This is the list of NLTK stop words:")
print(NLTK_stop_words)

# Set default values for the variables that hold the word lists.
# This is not required. However, if you don't do it, Spyder will suggest that
# a line like
# jaccard_similarity=jaccard(word_list_edited,word_list_old_edited)
# is incorrect, as word_list_old_edited is not yet defined at that point in
# the program code. In this specific example, this will not cause an error,
# as we do not enter the if condition when i=1 (the previous line is the
# line with the variable names).
# All defaults are empty lists, because every variable holds a list of words
# later on.
word_list_old_edited=[]
word_list_edited=[]
word_list_old_NLTK_filtered=[]
word_list_old_own_filtered=[]
word_list_old_edited_stemmed=[]
word_list_old_own_filtered_stemmed=[]
word_list_old_NLTK_filtered_stemmed=[]

#######################################################
# Define a function that computes Jaccard similarity
# As we need these operations several times, it is
# helpful to use a function.
######################################################
# beginning of the function
def jaccard(text1,text2):
    """Return the Jaccard similarity of two collections of tokens.

    The similarity is the number of distinct tokens that occur in both
    collections divided by the number of distinct tokens that occur in at
    least one of them. How often a token occurs does not matter.

    Args:
        text1: iterable of hashable tokens, e.g., a list of words.
        text2: iterable of hashable tokens, e.g., a list of words.

    Returns:
        A float between 0.0 (no common token) and 1.0 (identical
        vocabularies). Two empty collections are treated as identical and
        yield 1.0 instead of raising a ZeroDivisionError.
    """
    # Only distinct tokens matter -> sets are sufficient
    vocabulary1=set(text1)
    vocabulary2=set(text2)

    union=vocabulary1 | vocabulary2
    if not union:
        # Both inputs are empty; avoid the division by zero.
        return 1.0
    intersection=vocabulary1 & vocabulary2

    return len(intersection)/len(union)
# end of the function


#######################################################
# Helper functions for the loop over all filings
#######################################################
def _stem_words(words):
    """Return a list with the Porter stem of every token in words."""
    return [ps.stem(word) for word in words]


def _similarity_field(same_firm,word_list,word_list_old):
    """Return the csv field for one similarity measure.

    Args:
        same_firm: True if the previous filing is from the same firm.
        word_list: word list of the current filing.
        word_list_old: word list of the previous filing.

    Returns:
        ";" followed by the Jaccard similarity if same_firm is True;
        otherwise only ";", i.e., the field stays empty because the
        similarity cannot be computed.
    """
    if same_firm:
        return ";"+str(jaccard(word_list,word_list_old))
    return ";"


# Looking up a word in a set is much faster than searching a list, and the
# lookup is done for every single word of every filing.
own_stop_word_set=set(own_stop_words)

try:
    # Loop over all lines (the first line contains the variable names)
    for i in range(1,len(input_text_line)):
        print(str(i))
        # split the line into the eight variables
        variables=input_text_line[i].split(";")
        # We need the CIK (1st column) and the filename (8th column)
        cik=variables[0]
        filename_parts=variables[7].split('/')
        filename=filename_parts[3].replace('.txt','')

        # Open the ith 10-K; remember to specify the encoding.
        # The with-statement closes the filing even if reading fails.
        with open(directory+'10-K_Textual_Similarity/'+cik+'_'+filename+
                  '_edited.txt','r',encoding='ascii',errors='ignore') as input_file_10_k:
            input_text_10_k=input_file_10_k.read()

        # check whether the previous entry of the list is from the same firm
        permco=variables[1]
        permco_old=input_text_line[i-1].split(";")[1]
        same_firm=permco==permco_old

        # Split text into words
        word_list_edited=word_tokenize(input_text_10_k.lower())

        # Collect the parts of the output line. The line starts with the
        # information from the input file; each sub task appends one field.
        output_fields=[input_text_line[i]]


        ############################################
        # Sub Task 1: Jaccard for the _edited.txt
        ############################################
        output_fields.append(_similarity_field(same_firm,word_list_edited,\
        word_list_old_edited))

        # Save the current word vector to a separate variable for the comparison of the next report.
        word_list_old_edited=word_list_edited


        ############################################
        # Sub Task 2: Jaccard for the _edited.txt
        # AND REMOVE STOP WORDS - OWN LIST
        ############################################
        # remove stop words using personal stop word list
        word_list_own_filtered=[word for word in word_list_edited
                                if word not in own_stop_word_set]

        output_fields.append(_similarity_field(same_firm,word_list_own_filtered,\
        word_list_old_own_filtered))

        # Save the current word vector to a separate variable for the comparison of the next report.
        word_list_old_own_filtered=word_list_own_filtered


        ############################################
        # Sub Task 3: Jaccard for the _edited.txt
        # AND REMOVE STOP WORDS - NLTK LIST
        ############################################
        # remove stop words using NLTK stop word list
        word_list_NLTK_filtered=[word for word in word_list_edited
                                 if word not in NLTK_stop_words]

        output_fields.append(_similarity_field(same_firm,word_list_NLTK_filtered,\
        word_list_old_NLTK_filtered))

        # Save the current word vector to a separate variable for the comparison of the next report.
        word_list_old_NLTK_filtered=word_list_NLTK_filtered


        ############################################
        # Sub Task 4: Jaccard for the _stemmed.txt
        ############################################
        # Create stemmed text
        word_list_edited_stemmed=_stem_words(word_list_edited)

        output_fields.append(_similarity_field(same_firm,word_list_edited_stemmed,\
        word_list_old_edited_stemmed))

        # Save the current word vector to a separate variable for the comparison of the next report.
        word_list_old_edited_stemmed=word_list_edited_stemmed


        ############################################
        # Sub Task 5: Jaccard for the _stemmed.txt
        # AND REMOVE STOP WORDS - OWN LIST
        ############################################
        # Caution; in general, it is not clear whether you should first stem or
        # first remove stop words.
        # However, in this specific case, you should remove the stop words first
        # and then stem, as your stop word list is based on the inflected text.

        # The stop words have already been removed in Sub Task 2, so that list
        # is reused and only stemmed here.
        word_list_own_filtered_stemmed=_stem_words(word_list_own_filtered)

        output_fields.append(_similarity_field(same_firm,word_list_own_filtered_stemmed,\
        word_list_old_own_filtered_stemmed))

        # Save the current word vector to a separate variable for the comparison of the next report.
        word_list_old_own_filtered_stemmed=word_list_own_filtered_stemmed


        ############################################
        # Sub Task 6: Jaccard for the _stemmed.txt
        # AND REMOVE STOP WORDS - NLTK LIST
        ############################################
        # Caution; it is not clear whether you should first stem or first remove
        # stop words. However, the NLTK stop word list seems to be based on inflected
        # text, e.g. the word "having" is included. "Having" would be stemmed to "have".
        # Thus, the stop list seems to be not stemmed.
        # Thus, you should remove the stop words first and then stem.

        # The stop words have already been removed in Sub Task 3, so that list
        # is reused and only stemmed here.
        word_list_NLTK_filtered_stemmed=_stem_words(word_list_NLTK_filtered)

        output_fields.append(_similarity_field(same_firm,word_list_NLTK_filtered_stemmed,\
        word_list_old_NLTK_filtered_stemmed))

        # Save the current word vector to a separate variable for the comparison of the next report.
        word_list_old_NLTK_filtered_stemmed=word_list_NLTK_filtered_stemmed


        # Write the complete line including the line break to the output file
        output_file.write("".join(output_fields)+"\n")
finally:
    # Close the files even if processing one of the filings failed
    input_file.close()
    output_file.close()
    stop_word_file.close()
print("Task done!")