whu-textual-analysis/lectures/programming/templates/Problem_14_Jaccard_Similarity_form.py

# -*- coding: utf-8 -*-
"""
Created on Wed Jul 29 11:07:10 2015

@author: Alexander Hillert, Goethe University Frankfurt
"""

# For the full task, we need a large set of packages:
# regular expression, stemming, stop words, tokenization, and counters.
import re
#from nltk.tokenize import word_tokenize # NOT needed for the base comparison
#from nltk.corpus import stopwords # NOT needed for the base comparison
#from nltk.stem import PorterStemmer # NOT needed for the base comparison
from collections import Counter


#ps=PorterStemmer() # NOT needed for the base comparison

# Please adjust the directory to your machine.
directory = "C:/Lehre/Textual Analysis/Programming/Files/"

# Purpose of this script: for every 10-K listed in the csv file, compute the
# Jaccard similarity between the filing's word set and the word set of the
# filing in the preceding csv row, provided both rows carry the same value in
# the second column (permco). The input rows are written to a new csv file
# with the similarity appended as an additional column.

# Read the csv file containing the list of the 10-Ks from MSFT, KO, and 3M.
# The "with" statement closes the file even if reading fails.
with open(directory + 'list_10-K_filings_textual_similarity.csv', 'r', encoding="utf-8") as input_file:
    input_text = input_file.read()

# Split the input file in separate lines. In general, there can be empty lines
# in the input file; they are dropped here in a single pass.
input_text_line = [line for line in input_text.split("\n") if line != ""]

# Without a header line there is nothing to process; stop with a clear message
# before the output file is created.
if not input_text_line:
    raise ValueError("The input csv file contains no lines: nothing to process.")

# set default values for variables
word_list_old_edited = []
word_list_edited = []
# For the first data row the "previous" row is the header line, exactly as if
# we looked at input_text_line[i-1] for i=1.
permco_old = input_text_line[0].split(";")[1]

# Open the output csv file in which we write the similarities
with open(directory + 'list_10-K_filings_textual_similarity_jaccard.csv', 'w', encoding="utf-8") as output_file:
    # Write variable names to first line
    output_file.write(input_text_line[0] + ';Jaccard\n')

    # Loop over all lines (index 0 is the header, so counting starts at 1)
    for i, line in enumerate(input_text_line[1:], start=1):
        print(str(i))
        # split the line into the eight variables
        variables = line.split(";")
        # We need the CIK (1st column), the permco (2nd column) and the
        # filename (8th column)
        cik = variables[0]
        permco = variables[1]
        filename_parts = re.split('/', variables[7])
        filename = filename_parts[3].replace('.txt', '')

        # Open the ith 10-K; remember to specify the encoding.
        # The file is closed automatically at the end of the "with" block.
        # if the command below does not work (error like "file not found" or "directory not found")
        # please use the following path instead:
        # directory+'10-K_Textual_Similarity/'+cik+'_'+filename+'_edited.txt'
        with open(directory + '10-K_Textual_Similarity_edited/' + cik + '_' + filename +
                  '_edited.txt', 'r', encoding='ascii', errors='ignore') as input_file_10_k:
            input_text_10_k = input_file_10_k.read()

        # Split text into words. The raw string (r"...") avoids the warning
        # about the invalid escape sequence "\W" in a normal string.
        # Note: re.split always returns at least one element (possibly an
        # empty string), so the word list is never empty.
        word_list_edited = re.split(r"\W{1,}", input_text_10_k.lower())
        # Alternative using tokenize
        #word_list_edited=word_tokenize(input_text_10_k.lower())

        ############################################
        # Sub Task 1: Jaccard for the _edited.txt
        ############################################
        # compute Jaccard similarity if the previous filing is from the same firm
        if permco == permco_old:
            counter_current_10k = Counter(word_list_edited)
            counter_previous_10k = Counter(word_list_old_edited)

            # "&" keeps the words that occur in both counters,
            # "|" keeps the words that occur in at least one of them.
            intersection = counter_current_10k & counter_previous_10k
            union = counter_current_10k | counter_previous_10k

            # number of distinct words in the intersection divided by the
            # number of distinct words in the union; the union contains all
            # words of the current filing and is therefore never empty.
            jaccard_similarity = len(intersection) / len(union)
            output_file.write(line + ";" + str(jaccard_similarity) + "\n")
        else:
            # The previous filing is not from the same firm -> cannot compute Jaccard similarity
            output_file.write(line + ";" + "\n")

        # Save the current word vector and firm identifier for the comparison
        # with the next report.
        word_list_old_edited = word_list_edited
        permco_old = permco

print("Task done!")
Add programming files - add the code files provided by the instructor - the programming/files folder with the data files is NOT included here due to its size - add a .gitignore file to exclude the data files' folder 2022-08-05 00:05:05 +02:00			`# -*- coding: utf-8 -*-`
			`"""`
			`Created on Wed Jul 29 11:07:10 2015`

			`@author: Alexander Hillert, Goethe University Frankfurt`
			`"""`

			`# For the full task, we need a large set of packages:`
			`# regular expression, stemming, stop words, tokenization, and counters.`
			`import re`
			`#from nltk.tokenize import word_tokenize # NOT needed for the base comparison`
			`#from nltk.corpus import stopwords # NOT needed for the base comparison`
			`#from nltk.stem import PorterStemmer # NOT needed for the base comparison`
			`from collections import Counter`


			`#ps=PorterStemmer() # NOT needed for the base comparison`

			# Please adjust the directory to your machine.
directory = "C:/Lehre/Textual Analysis/Programming/Files/"

# Purpose of this script: for every 10-K listed in the csv file, compute the
# Jaccard similarity between the filing's word set and the word set of the
# filing in the preceding csv row, provided both rows carry the same value in
# the second column (permco). The input rows are written to a new csv file
# with the similarity appended as an additional column.

# Read the csv file containing the list of the 10-Ks from MSFT, KO, and 3M.
# The "with" statement closes the file even if reading fails.
with open(directory + 'list_10-K_filings_textual_similarity.csv', 'r', encoding="utf-8") as input_file:
    input_text = input_file.read()

# Split the input file in separate lines. In general, there can be empty lines
# in the input file; they are dropped here in a single pass.
input_text_line = [line for line in input_text.split("\n") if line != ""]

# Without a header line there is nothing to process; stop with a clear message
# before the output file is created.
if not input_text_line:
    raise ValueError("The input csv file contains no lines: nothing to process.")

# set default values for variables
word_list_old_edited = []
word_list_edited = []
# For the first data row the "previous" row is the header line, exactly as if
# we looked at input_text_line[i-1] for i=1.
permco_old = input_text_line[0].split(";")[1]

# Open the output csv file in which we write the similarities
with open(directory + 'list_10-K_filings_textual_similarity_jaccard.csv', 'w', encoding="utf-8") as output_file:
    # Write variable names to first line
    output_file.write(input_text_line[0] + ';Jaccard\n')

    # Loop over all lines (index 0 is the header, so counting starts at 1)
    for i, line in enumerate(input_text_line[1:], start=1):
        print(str(i))
        # split the line into the eight variables
        variables = line.split(";")
        # We need the CIK (1st column), the permco (2nd column) and the
        # filename (8th column)
        cik = variables[0]
        permco = variables[1]
        filename_parts = re.split('/', variables[7])
        filename = filename_parts[3].replace('.txt', '')

        # Open the ith 10-K; remember to specify the encoding.
        # The file is closed automatically at the end of the "with" block.
        # if the command below does not work (error like "file not found" or "directory not found")
        # please use the following path instead:
        # directory+'10-K_Textual_Similarity/'+cik+'_'+filename+'_edited.txt'
        with open(directory + '10-K_Textual_Similarity_edited/' + cik + '_' + filename +
                  '_edited.txt', 'r', encoding='ascii', errors='ignore') as input_file_10_k:
            input_text_10_k = input_file_10_k.read()

        # Split text into words. The raw string (r"...") avoids the warning
        # about the invalid escape sequence "\W" in a normal string.
        # Note: re.split always returns at least one element (possibly an
        # empty string), so the word list is never empty.
        word_list_edited = re.split(r"\W{1,}", input_text_10_k.lower())
        # Alternative using tokenize
        #word_list_edited=word_tokenize(input_text_10_k.lower())

        ############################################
        # Sub Task 1: Jaccard for the _edited.txt
        ############################################
        # compute Jaccard similarity if the previous filing is from the same firm
        if permco == permco_old:
            counter_current_10k = Counter(word_list_edited)
            counter_previous_10k = Counter(word_list_old_edited)

            # "&" keeps the words that occur in both counters,
            # "|" keeps the words that occur in at least one of them.
            intersection = counter_current_10k & counter_previous_10k
            union = counter_current_10k | counter_previous_10k

            # number of distinct words in the intersection divided by the
            # number of distinct words in the union; the union contains all
            # words of the current filing and is therefore never empty.
            jaccard_similarity = len(intersection) / len(union)
            output_file.write(line + ";" + str(jaccard_similarity) + "\n")
        else:
            # The previous filing is not from the same firm -> cannot compute Jaccard similarity
            output_file.write(line + ";" + "\n")

        # Save the current word vector and firm identifier for the comparison
        # with the next report.
        word_list_old_edited = word_list_edited
        permco_old = permco

print("Task done!")