Add programming files
- Add the code files provided by the instructor. The programming/files folder with the data files is NOT included here due to its size. A .gitignore file is added to exclude the data files' folder.
This commit is contained in:
parent
65aae9d4f9
commit
a37c87d9c8
38 changed files with 6416 additions and 0 deletions
167
lectures/programming/solutions/Problem_12_Most_Frequent_Words.py
Normal file
167
lectures/programming/solutions/Problem_12_Most_Frequent_Words.py
Normal file
|
|
@ -0,0 +1,167 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Tue Jul 11 09:19:54 2017
|
||||
|
||||
@author: Alexander Hillert, Goethe University Frankfurt
|
||||
"""
|
||||
|
||||
# We need regular expressions and counters (->collections)
|
||||
import re
|
||||
import collections
|
||||
# for the bigram part, the sentence tokenizer is helpful
|
||||
from nltk.tokenize import sent_tokenize
|
||||
|
||||
# Base folder that holds the course data files.
directory="C:/Lehre/Textual Analysis/Programming/Files/"

# Open the csv file containing the list of the 10-Ks from MSFT, KO, and 3M.
# NOTE: the handle is deliberately kept open; it is closed at the end of the
# main loop further below.
input_file=open(directory+'list_10-K_filings_textual_similarity.csv','r',encoding="utf-8")
input_text=input_file.read()

# Split the input file into separate lines and drop any empty lines in a
# single pass. (The original while/.count()/.remove() loop rescanned the
# list on every removal, which is O(n^2); this comprehension is O(n).)
input_text_line=[line for line in input_text.split("\n") if line!=""]
|
||||
|
||||
# Three fresh counters:
#   words_counter  - overall word frequencies
#   words_counter1 - only needed for an alternative solution shown below
#   bigram_counter - for the extra task on bigrams
words_counter, words_counter1, bigram_counter = (
    collections.Counter(),
    collections.Counter(),
    collections.Counter(),
)
|
||||
|
||||
|
||||
# Loop over all lines of the csv list (line 0 is the header, so start at 1).
for i in range(1,len(input_text_line)):
    print(str(i))
    # split the line into the eight variables
    variables=input_text_line[i].split(";")
    # We need the CIK (1st column) and the filename (8th column)
    cik=variables[0]
    filename_parts=re.split('/',variables[7])
    filename=filename_parts[3].replace('.txt','')

    # Open the ith 10-K in the list; remember to specify the encoding.
    # The files are available in the zip file "10-K_Textual_Similarity_edited.zip".
    # A with-statement guarantees the file is closed even if an error occurs
    # while processing it.
    # If the command below does not work (error like "file not found" or
    # "directory not found"), use the folder '10-K_Textual_Similarity_edited/'
    # instead of '10-K_Textual_Similarity/'.
    with open(directory+'10-K_Textual_Similarity/'+cik+'_'+
              filename+'_edited.txt','r',encoding='ascii',errors='ignore') as input_file_10_k:
        # read the content from the file
        input_text_10_k=input_file_10_k.read()

    # use lower case only so that it does not matter whether a word is at
    # the beginning of a sentence ("The") or within a sentence ("the").
    # Please note that this can be problematic, e.g. "US" -> United States vs.
    # us (personal pronoun)
    input_text_10_k_lower=input_text_10_k.lower()

    # Split the text into words and drop empty elements in one pass.
    # r'\W+' is a raw string: '\W' in a normal string is an invalid escape
    # sequence (DeprecationWarning in modern Python).
    list_of_words=[word for word in re.split(r'\W+',input_text_10_k_lower) if word]

    # Add the words to our counter. Counter.update() adds in place in
    # O(len(list_of_words)); the original 'counter = counter + Counter(...)'
    # copied the entire accumulated counter on every iteration, which is
    # quadratic over the whole run. All counts are positive, so the result
    # is identical.
    words_counter.update(list_of_words)
    # alternative solution (kept for reference; identical here)
    words_counter1.update(list_of_words)


    #############################################
    # optional part for the extra task on bigrams
    #############################################

    # create an empty list for the bigrams of this filing
    bigram_list=[]

    # split the text into sentences
    list_of_sentences=sent_tokenize(input_text_10_k)

    # create the BIGRAM IN EACH SENTENCE (bigrams never span two sentences)
    for sentence in list_of_sentences:
        # lower-case the sentence, split it into words, drop empty elements
        sentence_words=[word for word in re.split(r'\W+',sentence.lower()) if word]

        # go over all adjacent two-word combinations in the sentence
        for word_number in range(0,len(sentence_words)-1):
            bigram_list.append(sentence_words[word_number]+' '+sentence_words[word_number+1])

    bigram_counter.update(bigram_list)
    # end of extra task

input_file.close()
|
||||
|
||||
######################
# Top 100 single words
######################
# Open the csv file that will contain the 100 most frequently used words
output_file=open(directory+'Problem_12_100_most_frequent_words.csv','w',encoding="utf-8")
output_file.write("rank;word;count\n")

# Get the (up to) 100 most frequent words. most_common(100) returns fewer
# elements when the vocabulary has fewer than 100 distinct words.
top_100_words=words_counter.most_common(100)
# for the alternative solution
#top_100_words=words_counter1.most_common(100)

# Write the most frequent words to the csv file. Human ranks are 1-based,
# so enumerate(..., start=1) supplies the rank directly. Unlike the
# original fixed range(1,101) indexing, this cannot raise an IndexError
# when there are fewer than 100 distinct words.
for rank,(word,count) in enumerate(top_100_words,start=1):
    output_file.write(str(rank)+";"+str(word)+";"+str(count)+"\n")

# Close the csv file
output_file.close()
|
||||
|
||||
|
||||
######################
# Extra task
# Top 100 bigrams
######################
# Open the csv file that will contain the 100 most frequently used BIGRAMS
output_file_bigram=open(directory+'Problem_12_100_most_frequent_bigrams.csv','w',encoding="utf-8")
output_file_bigram.write("rank;word;count\n")

# Get the (up to) 100 most frequent bigrams; most_common(100) returns fewer
# elements when there are fewer than 100 distinct bigrams.
top_100_bigrams=bigram_counter.most_common(100)

# Write the most frequent bigrams to the csv file -> same approach as for
# the single words: enumerate supplies the 1-based rank and cannot index
# out of range (the original range(1,101) raised IndexError with <100 bigrams).
for rank,(bigram,count) in enumerate(top_100_bigrams,start=1):
    output_file_bigram.write(str(rank)+";"+str(bigram)+";"+str(count)+"\n")

# Close the csv file
output_file_bigram.close()


print("Task done!")
|
||||
Loading…
Add table
Add a link
Reference in a new issue