# -*- coding: utf-8 -*-
"""
Created on Tue Jul 11 09:19:54 2017
@author: Alexander Hillert, Goethe University Frankfurt
"""
# We need regular expressions and counters (->collections)
import re
import collections
# for the bigram part, the sentence tokenizer is helpful
from nltk.tokenize import sent_tokenize
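# Note: sent_tokenize relies on NLTK's "punkt" sentence tokenizer data.
# If it has not been downloaded yet, a one-time setup (assuming internet
# access; not part of the original solution) should fix it:
#import nltk
#nltk.download('punkt')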
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# Open the csv file containing the list of the 10-Ks from MSFT, KO, and 3M.
input_file=open(directory+'list_10-K_filings_textual_similarity.csv','r',encoding="utf-8")
input_text=input_file.read()
# Split the input file in separate lines
input_text_line=input_text.split("\n")
# In general, there can be empty lines in the input file. The following loop
# deletes these lines.
while input_text_line.count("")>0:
    input_text_line.remove("")
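# (an equivalent one-line alternative would be a list comprehension that
# keeps only the non-empty lines:
#input_text_line=[line for line in input_text_line if line!=""]
# which avoids rescanning the list on every remove() call)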
# Create an empty counter variable
words_counter=collections.Counter()
# variable is needed only for an alternative solution
words_counter1=collections.Counter()
# counter for the extra task
bigram_counter=collections.Counter()
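# (a Counter is a dict subclass that maps elements to their counts;
# counters can be added with "+" or extended in place via update())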
# Loop over all lines; start at 1 to skip the header line of the csv file
for i in range(1,len(input_text_line)):
    print(str(i))
    # split the line into the eight variables
    variables=input_text_line[i].split(";")
    # We need the CIK (1st column) and the filename (8th column)
    cik=variables[0]
    filename_parts=re.split('/',variables[7])
    filename=filename_parts[3].replace('.txt','')
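    # Illustration (assuming the list file uses the usual EDGAR path format,
    # e.g., variables[7]='edgar/data/66740/0000066740-97-000004.txt'):
    # re.split yields ['edgar','data','66740','0000066740-97-000004.txt'],
    # so filename_parts[3] without '.txt' is '0000066740-97-000004'.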
    # Open the ith 10-K in the list; remember to specify the encoding.
    # The files are available in the zip file "10-K_Textual_Similarity_edited.zip".
    input_file_10_k=open(directory+'10-K_Textual_Similarity/'+cik+'_'+\
        filename+'_edited.txt', 'r', encoding='ascii', errors='ignore')
    # if the command above does not work (error like "file not found" or "directory not found")
    # please use the following command instead:
    #input_file_10_k=open(directory+'10-K_Textual_Similarity_edited/'+cik+'_'+filename+'_edited.txt','r',encoding='ascii',errors='ignore')
    # read the content from the file
    input_text_10_k=input_file_10_k.read()
    # use lower case only so that it does not matter whether a word is at
    # the beginning of a sentence ("The") or within a sentence ("the").
    # Please note that this can be problematic, e.g., "US" -> United States vs.
    # "us" (personal pronoun).
    input_text_10_k_lower=input_text_10_k.lower()
    # Split the text into words; use a raw string for the regular expression
    # to avoid an invalid escape sequence
    list_of_words=re.split(r'\W{1,}',input_text_10_k_lower)
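    # Illustration: re.split(r'\W{1,}',"the company's 10-k filing")
    # returns ['the', 'company', 's', '10', 'k', 'filing']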
    # There can be empty ("") list elements -> remove them
    while list_of_words.count("")>0:
        list_of_words.remove("")
    # optional commands to remove words that contain non-letter characters
    # (e.g., "_"); note the [:] copy, as removing elements from a list while
    # iterating over that same list would skip elements
    '''
    for word in list_of_words[:]:
        if re.sub("[a-zA-Z]","",word)!="":
        #if word.count("_")>0:
            list_of_words.remove(word)
    '''
    # Add the words to our counter
    words_counter=words_counter+collections.Counter(list_of_words)
    # alternative solution
    words_counter1.update(list_of_words)
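    # (update() is the more efficient variant: "+" creates a brand-new Counter
    # in every iteration of the loop, while update() adds the counts in place)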
    #############################################
    # optional part for the extra task on bigrams
    #############################################
    # create an empty list for the bigrams
    bigram_list=[]
    # split the text into sentences
    list_of_sentences=sent_tokenize(input_text_10_k)
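    # Illustration: sent_tokenize("Revenue grew. Costs fell.")
    # returns ['Revenue grew.', 'Costs fell.']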
    # create the bigrams SENTENCE BY SENTENCE so that no bigram spans a
    # sentence boundary
    for sentence in list_of_sentences:
        # make the sentence lower case
        sentence_lower=sentence.lower()
        # split the sentence into words; again, use a raw string
        list_of_words=re.split(r"\W{1,}",sentence_lower)
        # remove empty elements
        while list_of_words.count("")>0:
            list_of_words.remove("")
        #print("these are the words of the sentence:\n"+str(list_of_words))
        # go over all potential two-word combinations in the sentence
        for word_number in range(0,len(list_of_words)-1):
            bigram_list.append(list_of_words[word_number]+' '+list_of_words[word_number+1])
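        # Illustration: the word list ['net', 'income', 'increased'] yields
        # the bigrams 'net income' and 'income increased'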
    bigram_counter=bigram_counter+collections.Counter(bigram_list)
    # end of extra task
    # Close the 10-K filing
    input_file_10_k.close()
input_file.close()
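# (an alternative design would be "with open(...) as file:" blocks, which
# close files automatically even when an error occurs; here the files are
# closed explicitly instead)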
######################
# Top 100 single words
######################
# Open the csv file to which the 100 most frequently used words will be written
output_file=open(directory+'Problem_12_100_most_frequent_words.csv','w',encoding="utf-8")
output_file.write("rank;word;count\n")
# Get the 100 most frequent words
top_100_words=words_counter.most_common(100)
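# (most_common(n) returns a list of (element, count) tuples sorted from most
# to least frequent, e.g., [('the', 51234), ('of', 40987), ...] with purely
# illustrative counts)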
# for the alternative solution
#top_100_words=words_counter1.most_common(100)
# Write the 100 most frequent words to the csv file.
# Remember that Python starts counting at 0, while humans start at 1.
# So, the most frequent word (rank 1 in human counting) is element 0 for Python.
# Consequently, to get a consistent table, we must use the value i for the rank
# but access the element i-1.
for i in range(1,101):
    output_file.write(str(i)+";"+str(top_100_words[i-1][0])+";"+\
        str(top_100_words[i-1][1])+"\n")
# Close the csv file
output_file.close()
######################
# Extra task
# Top 100 bigrams
######################
# Open the csv file to which the 100 most frequently used BIGRAMS will be written
output_file_bigram=open(directory+'Problem_12_100_most_frequent_bigrams.csv','w',encoding="utf-8")
output_file_bigram.write("rank;word;count\n")
# Get the 100 most frequent words
top_100_bigrams=bigram_counter.most_common(100)
# Write the 100 most frequent bigrams to the csv file -> same approach as for the single words.
for i in range(1,101):
    output_file_bigram.write(str(i)+";"+str(top_100_bigrams[i-1][0])+";"+\
        str(top_100_bigrams[i-1][1])+"\n")
# Close the csv file
output_file_bigram.close()
print("Task done!")