Alexander Hess
a37c87d9c8
- Add the code files provided by the instructor. The programming/files folder with the data files is NOT included here due to its size. A .gitignore file is added to exclude the data files' folder.
96 lines
No EOL
3.7 KiB
Python
96 lines
No EOL
3.7 KiB
Python
# -*- coding: utf-8 -*-
|
|
"""
|
|
Created on Wed Jul 29 11:07:10 2015
|
|
|
|
@author: Alexander Hillert, Goethe University Frankfurt
|
|
"""
|
|
|
|
# We need regular expressions, tokenize (to identify words), and stemming.
|
|
import re
|
|
from nltk.tokenize import word_tokenize
|
|
from nltk.stem import PorterStemmer
|
|
|
|
|
|
# Folder that contains the input csv file and the 10-K filings.
directory = "C:/Lehre/Textual Analysis/Programming/Files/"

# Open the csv file containing the list of the 10-Ks from MSFT, KO, and 3M.
input_file = open(directory + 'list_10-K_filings_textual_similarity.csv', 'r', encoding="utf-8")
input_text = input_file.read()

# Split the input file into separate lines.
input_text_line = input_text.split("\n")

# In general, there can be empty lines in the input file.
# Drop them in a single O(n) pass; the original
# while count("") > 0: remove("") pattern rescans the list on every
# removal and is quadratic in the number of lines.
input_text_line = [line for line in input_text_line if line != ""]
|
|
|
|
# Hoist loop invariants: create the stemmer and compile the punctuation
# pattern ONCE. The original built a new PorterStemmer for every single
# word and recompiled the regex on every iteration.
stemmer = PorterStemmer()
# \A and \Z anchor the pattern to the whole string -> the token must be
# punctuation only, not a combination of letters and symbols.
# (Raw string avoids the invalid-escape DeprecationWarning for \A.)
punctuation_re = re.compile(r"\A[\.\?!,:;']{1,}\Z")

# Loop over all lines
#for i in range(1,len(input_text_line)):
# for illustration filings 1 to 3 only
for i in range(1, 4):
    print(str(i))
    # Split the line into the eight variables.
    variables = input_text_line[i].split(";")
    # We need the CIK (1st column) and the filename (8th column).
    cik = variables[0]
    filename_parts = re.split('/', variables[7])
    filename = filename_parts[3].replace('.txt', '')

    # Open the ith 10-K in the list; remember to specify the encoding.
    # 'with' guarantees the file is closed even if an error occurs.
    with open(directory + '10-K_Textual_Similarity/' + cik + '_' + filename
              + '_edited.txt', 'r', encoding='ascii', errors='ignore') as input_file_10_k:
        # Get the text of the 10-K
        input_text_10_k = input_file_10_k.read()

    # We need to tokenize the text because stem only works on a word by
    # word basis; stemming an entire document without splitting into
    # words does not work! The problem is that \n gets lost in this
    # process --> we cannot easily recreate the document.
    # Idea: replace \n by \n plus an indicator that there was a line
    # break. Here, I choose "LINEBREAKMARK".
    input_text_10_k = input_text_10_k.replace("\n", "\nLINEBREAKMARK ")

    # Split text into words. There are two alternatives.
    # Alternative 1 (our standard approach):
    #word_list=re.split("\W{1,}",input_text_10_k.lower())
    # Alternative 2 (keeps symbols like ,;.):
    word_list = word_tokenize(input_text_10_k.lower())

    # Stem the text. Collect the pieces in a list and join once at the
    # end -- repeated string concatenation (s = s + part) is quadratic.
    stemmed_parts = []
    for word in word_list:
        # The following two cases improve the formatting of the output
        # file; they are not needed for the subsequent analyses.
        # Case 1: 'word' is not an actual word but a symbol -> there
        # should be no whitespace between the previous word and it.
        if punctuation_re.search(word):
            stemmed_parts.append(word)
        # Case 2: the word is an actual word -> have a whitespace included.
        else:
            stemmed_parts.append(" " + stemmer.stem(word))
    text_stemmed = "".join(stemmed_parts)
    # The simple solution (without restoring the formatting) would be to
    # always append " " + stemmer.stem(word).

    # To recreate the text, we need to replace the line break indicators
    # by \n. Because of the stemming (lower-casing), "LINEBREAKMARK"
    # becomes "linebreakmark".
    text_stemmed = text_stemmed.replace("linebreakmark", "\n")

    # Write the stemmed text next to the edited file.
    with open(directory + '10-K_Textual_Similarity/' + cik + '_' + filename
              + '_stemmed.txt', 'w', encoding='ascii', errors='ignore') as output_file_10_k:
        output_file_10_k.write(text_stemmed)

input_file.close()
print("Task done!")