Add programming files
- add the code files provided by the instructor
- the programming/files folder with the data files is NOT included here due to its size
- add a .gitignore file to exclude the data files' folder
This commit is contained in:
parent
65aae9d4f9
commit
a37c87d9c8
38 changed files with 6416 additions and 0 deletions
189 lectures/programming/templates/NLTK_Sentiment_Analysis.py Normal file
@ -0,0 +1,189 @@
# -*- coding: utf-8 -*-
"""
Created on Sat Jul 15 21:56:41 2017

@author: Alexander Hillert, Goethe University Frankfurt
"""

import nltk
import random
import collections
import re

# We will use the NLTK corpus containing 2,000 movie reviews, of which 1,000
# are positive and the other 1,000 are negative.
# if you do not have the movie review corpus yet, download it:
nltk.download("movie_reviews")

from nltk.corpus import movie_reviews

# Create a list that contains the tuples of document and category.
# Category is "positive" or "negative"
documents = []
# For all categories
for category in movie_reviews.categories():
    print("Category: "+str(category))
    # for all reviews (identified by file ID) in the respective category
    for file_ID in movie_reviews.fileids(category):
        # You have to put two parentheses to indicate that you want to add a tuple.
        documents.append((list(movie_reviews.words(file_ID)),category))

# Print the first element (i.e., tuple) of documents.
print(documents[0])
# print the words of the first movie review
print(documents[0][0])
# print the first word of the first movie review
print(documents[0][0][0])

# print the classification of the first movie review
print(documents[0][1])

# print the classification of the 1000th review (the last negative one)
print(documents[999][1])
# print the classification of the 1001st review (the first positive one)
print(documents[1000][1])

# The default order of the reviews is first all negative reviews and then all positive ones.
# Later we will build a training and a testing set. As we need to have positive and negative
# reviews in both sets, we randomly shuffle the documents.
random.shuffle(documents)

# Create a list of all words.
all_words = []
for word in movie_reviews.words():
    # We use lower case words
    #all_words.append(word.lower())
    # check whether the word is actually a word, i.e., whether it contains
    # at least one letter
    if re.search(r"\A[a-z]", word.lower()):
    #if re.search(r"[a-z]", word.lower()):
        # We use lower case words
        all_words.append(word.lower())


# What are the most frequently used words in the movie reviews?
# Alternative 1:
# FreqDist sorts words from the most frequently used word to the least frequently used word.
all_words_approach_1 = nltk.FreqDist(all_words)
print("Alternative 1: the top 15 words are: "+str(all_words_approach_1.most_common(15)))

# Alternative 2:
# We can also determine the most frequent words by using Counters as we did
# in Problem 12 --> transform the list of all words into a Counter
all_words_approach_2=collections.Counter(all_words)
top_15_words=all_words_approach_2.most_common(15)
print("Alternative 2: the top 15 words are: "+str(top_15_words))
# -> identical results -> perfect.

# Search for a word and see how often it appears.
print("The word 'stupid' appears "+str(all_words_approach_1["stupid"])+" times in the movie reviews.")
# alternatively
print("The word 'stupid' appears "+str(all_words_approach_2["stupid"])+" times in the movie reviews.")

# How can we restrict the set of words that we use for training the Naive Bayes algorithm?
# -> create a list that only contains the top 3000 words
# get the top 3000 words
# Approach 1 using the nltk.FreqDist from above
i=0
top_3000_words=all_words_approach_1.most_common(3000)
list_top_3000_words_approach_1=[]
while i<3000:
    list_top_3000_words_approach_1.append(top_3000_words[i][0])
    i=i+1

# Approach 2 using Counters from above
i=0
top_3000_words=all_words_approach_2.most_common(3000)
list_top_3000_words_approach_2=[]
while i<3000:
    list_top_3000_words_approach_2.append(top_3000_words[i][0])
    i=i+1

# select the list of approach 1 or 2
word_features=list_top_3000_words_approach_1

# We need to identify the words we want to use for classification in the documents.
# We define a function for that.
def find_features(document):
    words = set(document)
    features = {}
    # loop over all the words we consider for the classification
    for word in word_features:
        # The expression returns either True or False
        features[word] = (word in words)

    return features

# To get an idea of what the function find_features() does, let's print the features
# for one review.
print((find_features(movie_reviews.words('neg/cv000_29416.txt'))))


feature_set = [(find_features(review), category) for (review, category) in documents]

# What does the feature set look like?
print(feature_set[0])
# -> it is still a tuple
print(feature_set[0][0])
# the first element contains the 3000 words we use for classification with "True" or "False"
# depending on whether the words appear in the review
print(feature_set[0][1])
# the second element is the information on whether the review is positive or negative

# Define the training and testing set
# The training set comprises the first 1900 reviews and the testing set the last 100 reviews.
training_set=feature_set[:1900]
testing_set=feature_set[1900:]

# First, we have to train the Naive Bayes classifier.
# It will determine which of the words from word_features appear mostly in positive
# reviews and which appear mostly in negative reviews.
classifier=nltk.NaiveBayesClassifier.train(training_set)
# The following command prints the 20 words that best discriminate between
# positive and negative reviews.
classifier.show_most_informative_features(20)

# Let's classify the first element of feature_set
# The input for the classification needs to be the list of words with True or False
print(classifier.classify(feature_set[0][0]))
print("The review is actually: "+str(feature_set[0][1]))

# classify the 100 reviews from the testing set
# they have the positions 1900 to 1999 in the feature set.
i=1900
classified_set=[]
while i<2000:
    classified_set.append(classifier.classify(feature_set[i][0]))
    i=i+1

# Compare the classification result with the actual category
i=0
# In this list we save pairs of [predicted category, actual category]
comparison=[]
# In this list we simply save "accurate" and "inaccurate"
comparison_2=[]
while i<100:
    comparison.append([classified_set[i],feature_set[i+1900][1]])
    # If the predicted and actual classification match -> accurate
    if comparison[i][0]==comparison[i][1]:
        comparison_2.append("accurate")
    else:
        comparison_2.append("inaccurate")
    i=i+1

print(comparison)
# We need the number of accurate and inaccurate classifications
comparison_counter=collections.Counter(comparison_2)
print(comparison_counter)

# NLTK can compute the accuracy directly
# What is the accuracy for the testing set?
print("Naive Bayes accuracy (in percent):", (nltk.classify.accuracy(classifier, testing_set))*100)
# Same value as from our own calculations -> perfect!

# What is the accuracy for the training set?
print("Naive Bayes accuracy in training data (in percent):", (nltk.classify.accuracy(classifier, training_set))*100)
# Higher than in the testing dataset -> expected.

print("completed!")
@ -0,0 +1,55 @@
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 29 11:07:10 2015

@author: Alexander Hillert, Goethe University Frankfurt
"""

# To determine file size we need the os package
import os

# Please adjust the directory to your machine.
directory="C:/Lehre/Textual Analysis/Programming/Files/"

# Open the csv file containing the list of the 200 10-Ks
input_file=open(directory+'10-K_Sample_2011Q1_Input.csv','r',encoding="utf-8")
input_text=input_file.read()

# Create output file
output_file=open(directory+'10-K_Sample_2011Q1_File_Size.csv','w',encoding="utf-8")
output_file.write('CIK;Filename;File_size_gross;File_size_net\n')

# Split the input file into separate lines
input_text_line=input_text.split("\n")

# In general, there can be empty lines in the input file. The following command
# deletes these lines.
while input_text_line.count("")>0:
    input_text_line.remove("")

# Loop over all lines
for i in range(1,len(input_text_line)):
    print(str(i))
    # split the line into the two variables
    variables=input_text_line[i].split(";")
    # We need the CIK and the filename
    cik=variables[0]
    filename=variables[1]
    filename=filename.replace('.txt','')

    # File size of the complete submission file (gross file size)
    # You have to divide the result by 1024 to get the size in kilobytes
    # The file size will be affected by html code and exhibits.
    # APPLY THE COMMAND THAT IS SHOWN ON SLIDE 62.
    size_gross=XXX/1024

    # File size of the main text file (net file size)
    # You have to divide the result by 1024 to get the size in kilobytes
    size_net=XXX/1024 # SAME COMMAND AS FOR GROSS FILE SIZE BUT APPLIED TO THE _clean.txt

    output_file.write(cik+';'+filename+';'+str(size_gross)+';'+str(size_net)+'\n')

print("Finished")
output_file.close()
input_file.close()
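# Illustration (not part of the template): a minimal sketch of how the two XXX
# placeholders above could be filled with os.path.getsize(). The subfolder and file
# naming used here ('10-K_Sample/<cik>_<filename>.txt' and
# '10-K_Sample_clean/<cik>_<filename>_clean.txt') is an assumption and may differ
# from the command shown on slide 62.
example_gross_path=directory+'10-K_Sample/'+cik+'_'+filename+'.txt'
example_net_path=directory+'10-K_Sample_clean/'+cik+'_'+filename+'_clean.txt'
example_size_gross=os.path.getsize(example_gross_path)/1024   # size in kilobytes
example_size_net=os.path.getsize(example_net_path)/1024       # size in kilobytes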
@ -0,0 +1,150 @@
# -*- coding: utf-8 -*-
"""
Created on Tue Jul 11 09:19:54 2017

@author: Alexander Hillert, Goethe University Frankfurt
"""

# We need regular expressions and counters (-> collections)
import re
import collections
# for the bigram part, the sentence tokenizer is helpful
from nltk.tokenize import sent_tokenize


# Please adjust the directory to your machine.
directory="C:/Lehre/Textual Analysis/Programming/Files/"

# Open the csv file containing the list of the 10-Ks from MSFT, KO, and 3M.
input_file=open(directory+'list_10-K_filings_textual_similarity.csv','r',encoding="utf-8")
input_text=input_file.read()

# Split the input file into separate lines
input_text_line=input_text.split("\n")

# In general, there can be empty lines in the input file. The following command
# deletes these lines.
while input_text_line.count("")>0:
    input_text_line.remove("")

# Create an empty counter variable
words_counter=collections.Counter()

# counter for the extra task
bigram_counter=collections.Counter()

# Loop over all lines
for i in range(1,len(input_text_line)):
    print(str(i))
    # split the line into the eight variables
    variables=input_text_line[i].split(";")
    # We need the CIK (1st column) and the filename (8th column)
    cik=variables[0]
    filename_parts=re.split('/',variables[7])
    filename=filename_parts[3].replace('.txt','')

    # Open the ith 10-K in the list; remember to specify the encoding
    # The files are available in the zip file "10-K_Textual_Similarity_edited.zip".
    input_file_10_k=open(directory+'10-K_Textual_Similarity_edited/'+cik+'_'+filename+'_edited.txt', 'r', encoding='ascii', errors='ignore')
    # if the command above does not work (error like "file not found" or "directory not found")
    # please use the following command:
    #input_file_10_k=open(directory+'10-K_Textual_Similarity/'+cik+'_'+filename+'_edited.txt','r',encoding='ascii',errors='ignore')

    # read the content from the file
    input_text_10_k=input_file_10_k.read()

    # THINK ABOUT WHETHER WE SHOULD USE LOWER OR UPPER CASE CONSISTENTLY!
    input_text_10_k=

    # Split the text into words
    list_of_words=re.split(r'\W{1,}',input_text_10_k)

    # Remember: there can be empty list elements!
    # Make sure that empty list elements do not bias the word count -> delete them!
    # You can use an approach similar to the one in lines 24 and 25.
    COMMANDS TO BE ADDED

    # Add the words to our counter
    words_counter=words_counter+XXXX # COMPLETE THIS COMMAND


    #############################################
    # optional part for the extra task on bigrams
    #############################################
    # create an empty list for the bigrams
    '''
    bigram_list=[]

    # split the text into sentences
    list_of_sentences=XXX

    # create the bigrams IN EACH SENTENCE
    for sentence in list_of_sentences:
        # split the sentence into words
        list_of_words=XXX

        # remove empty elements
        while list_of_words.count("")>0:
            list_of_words.remove("")

        # go over all potential two-word combinations in the sentence.
        for word_number in range(XXX,YYY):
            # add the bigram (two words connected by whitespace) to the list
            bigram_list.append(WORD_1 + " " + WORD_2)

    # same command as in line 70
    bigram_counter=bigram_counter+XXX
    # end of extra task
    '''

    # Close the 10-K filing
    input_file_10_k.close()

input_file.close()


######################
# Top 100 single words
######################
# Open the csv file for the 100 most frequently used words
output_file=open(directory+'Problem_12_100_most_frequent_words.csv','w',encoding="utf-8",errors="ignore")
output_file.write("rank;word;count\n")

# Get the 100 most frequent words
top_100_words=words_counter.XXXX # COMPLETE THIS COMMAND

# Write the 100 most frequent words to the csv file
# REMEMBER: Python starts counting at 0, while humans start at 1.
# So, the most frequent word (rank 1 in human counting) is element 0 for Python.
for i in range(1,101):
    output_file.write(str(i)+";"+XXXX (-> word)+";"+XXXX (-> the frequency of the word)+"\n") # COMPLETE THIS COMMAND

# Close the csv file
output_file.close()


######################
# Extra task
# Top 100 bigrams
######################
'''
# Open the csv file for the 100 most frequently used BIGRAMS
output_file_bigram=open(directory+'Problem_12_100_most_frequent_bigrams.csv','w',encoding="utf-8")
output_file_bigram.write("rank;word;count\n")

# Get the 100 most frequent bigrams: same command as above
top_100_bigrams=bigram_counter.XXX

# Write the 100 most frequent bigrams to the csv file.
# same logic as above
for i in range(1,101):
    output_file_bigram.write(str(i)+";"+XXXX (-> word)+";"+XXXX (-> the frequency of the word)+"\n") # COMPLETE THIS COMMAND

# Close the csv file
output_file_bigram.close()
'''

print("Task done!")
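# Illustration (not part of the template): a minimal, self-contained sketch of the
# counting steps left open above, using a tiny made-up word list instead of a 10-K.
example_words=["risk","","factors","risk",""]
# remove empty list elements so they do not bias the word count
while example_words.count("")>0:
    example_words.remove("")
# add the words to a running counter
example_counter=collections.Counter()
example_counter=example_counter+collections.Counter(example_words)
# most_common() returns a list of (word, count) tuples, sorted by frequency
example_top=example_counter.most_common(100)
print(example_top[0][0], example_top[0][1])   # most frequent word and its count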
83 lectures/programming/templates/Problem_13_Stemming_form.py Normal file
@ -0,0 +1,83 @@
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 29 11:07:10 2015

@author: Alexander Hillert, Goethe University Frankfurt
"""

# We need regular expressions and stemming.
import re
from nltk.stem import PorterStemmer
# Depending on how you would like to split the text into words, you may need tokenize.
from nltk.tokenize import word_tokenize

# Please adjust the directory to your machine.
directory="C:/Lehre/Textual Analysis/Programming/Files/"

# Open the csv file containing the list of the 10-Ks from MSFT, KO, and 3M
input_file=open(directory+'list_10-K_filings_textual_similarity.csv','r',encoding="utf-8")
input_text=input_file.read()

# Split the input file into separate lines
input_text_line=input_text.split("\n")

# In general, there can be empty lines in the input file. The following command
# deletes these lines.
while input_text_line.count("")>0:
    input_text_line.remove("")

# Loop over all lines
for i in range(1,len(input_text_line)):
    print(str(i))
    # split the line into the eight variables
    variables=input_text_line[i].split(";")
    # We need the CIK (1st column) and the filename (8th column)
    cik=variables[0]
    filename_parts=re.split('/',variables[7])
    filename=filename_parts[3].replace('.txt','')

    # Open the ith 10-K in the list; remember to specify the encoding
    input_file_10_k=open(directory+'10-K_Textual_Similarity_edited/'+cik+'_'+filename+'_edited.txt', 'r', encoding='ascii', errors='ignore')
    # if the command above does not work (error like "file not found" or "directory not found")
    # please use the following command:
    #input_file_10_k=open(directory+'10-K_Textual_Similarity/'+cik+'_'+filename+'_edited.txt','r',encoding='ascii',errors='ignore')


    # Get the text of the 10-K
    input_text_10_k=input_file_10_k.read()

    # We need to tokenize the text because stem() only works on a word-by-word basis.
    # Stemming an entire document without splitting it into words does not work!
    # The problem is that \n gets lost in this process --> we cannot easily
    # recreate the document.
    # Solution: replace \n by \n and some indicator that there was a line break.
    # For example replace("\n","\nHereWasALinebreak")
    input_text_10_k=input_text_10_k.replace("\n",XXXX)

    # Split the text into words
    word_list=XXXX

    # Stem the text from above
    text_stemmed=''
    # LOOP OVER ALL WORDS, STEM THEM, AND RECONNECT THEM.
    # WARNING: WHEN RECONNECTING WORDS YOU NEED TO INCLUDE A WHITESPACE BETWEEN
    # THE WORDS. OTHERWISE, THE TEXT GETS MESSED UP.
    for word in word_list:

        text_stemmed=text_stemmed+XXX # TO BE COMPLETED

    # To recreate the text, we need to replace the line break indicators by \n.
    # WARNING: PAY ATTENTION TO UPPER/LOWER CASE, IT CAN CHANGE.
    text_stemmed=text_stemmed.replace(XXXX,XXXX) # UNDO THE TRANSFORMATION FROM LINE 56.


    # Open the output file for the stemmed text
    output_file_10_k=open(directory+'10-K_Textual_Similarity_edited/'+cik+'_'+filename+'_stemmed.txt', 'w', encoding='ascii', errors='ignore')
    output_file_10_k.write(text_stemmed)
    output_file_10_k.close()
    input_file_10_k.close()

input_file.close()
print("Task done!")
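# Illustration (not part of the template): a minimal, self-contained sketch of the
# stem-and-rejoin idea used above, on a made-up two-line string. Note that NLTK's
# PorterStemmer lower-cases its input, which is why the line-break marker has to be
# replaced in lower case afterwards (the warning about upper/lower case above).
example_stemmer=PorterStemmer()
example_text="The firms reported\nincreasing losses"
example_text=example_text.replace("\n","\nHereWasALinebreak ")
example_stemmed=''
for example_word in word_tokenize(example_text):
    example_stemmed=example_stemmed+example_stemmer.stem(example_word)+' '
example_stemmed=example_stemmed.replace("herewasalinebreak ","\n")
print(example_stemmed)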
@ -0,0 +1,101 @@
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 29 11:07:10 2015

@author: Alexander Hillert, Goethe University Frankfurt
"""

# For the full task, we need a large set of packages:
# regular expressions, stemming, stop words, tokenization, and counters.
import re
#from nltk.tokenize import word_tokenize # NOT needed for the base comparison
#from nltk.corpus import stopwords # NOT needed for the base comparison
#from nltk.stem import PorterStemmer # NOT needed for the base comparison
from collections import Counter


#ps=PorterStemmer() # NOT needed for the base comparison

# Please adjust the directory to your machine.
directory="C:/Lehre/Textual Analysis/Programming/Files/"

# Open the csv file containing the list of the 10-Ks from MSFT, KO, and 3M
input_file=open(directory+'list_10-K_filings_textual_similarity.csv','r',encoding="utf-8")
input_text=input_file.read()

# Split the input file into separate lines
input_text_line=input_text.split("\n")

# In general, there can be empty lines in the input file. The following command
# deletes these lines.
while input_text_line.count("")>0:
    input_text_line.remove("")

# Open the output csv file in which we write the similarities
output_file=open(directory+'list_10-K_filings_textual_similarity_jaccard.csv','w',encoding="utf-8")
# Write variable names to the first line
output_file.write(input_text_line[0]+';Jaccard\n')


# set default values for the variables
word_list_old_edited=""
word_list_edited=""

# Loop over all lines
for i in range(1,len(input_text_line)):
    print(str(i))
    # split the line into the eight variables
    variables=input_text_line[i].split(";")
    # We need the CIK (1st column) and the filename (8th column)
    cik=variables[0]
    filename_parts=re.split('/',variables[7])
    filename=filename_parts[3].replace('.txt','')

    # Open the ith 10-K; remember to specify the encoding
    input_file_10_k=open(directory+'10-K_Textual_Similarity_edited/'+cik+'_'+filename+'_edited.txt', 'r', encoding='ascii', errors='ignore')
    # if the command above does not work (error like "file not found" or "directory not found")
    # please use the following command:
    #input_file_10_k=open(directory+'10-K_Textual_Similarity/'+cik+'_'+filename+'_edited.txt','r',encoding='ascii',errors='ignore')


    input_text_10_k=input_file_10_k.read()

    # Split the text into words
    word_list_edited=re.split(r"\W{1,}",input_text_10_k.lower())
    # Alternative using tokenize
    #word_list_edited=word_tokenize(input_text_10_k.lower())

    # check whether the previous entry of the list is from the same firm
    permco=input_text_line[i].split(";")[1]
    permco_old=input_text_line[i-1].split(";")[1]


    ############################################
    # Sub-task 1: Jaccard for the _edited.txt
    ############################################
    # compute the Jaccard similarity if the previous filing is from the same firm
    if permco==permco_old:

        counter_current_10k=Counter(XXX)
        counter_previous_10k=Counter(XXX)

        intersection=XXX # see "Introduction_Container_Datatypes.py" (at the end of the file)
        union=XXXX # see "Introduction_Container_Datatypes.py" (at the end of the file)

        jaccard_similarity=XXXX # NUMBER OF ELEMENTS IN THE INTERSECTION / NUMBER OF ELEMENTS IN THE UNION
        output_file.write(input_text_line[i]+";"+str(jaccard_similarity)+"\n")
    else:
        # The previous filing is not from the same firm -> cannot compute the Jaccard similarity
        output_file.write(input_text_line[i]+";"+"\n")

    # Save the current word vector to a separate variable for the comparison with the next report.
    word_list_old_edited=word_list_edited

    # Close the 10-K filing
    input_file_10_k.close()

input_file.close()
output_file.close()
print("Task done!")
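# Illustration (not part of the template): a minimal, self-contained sketch of a
# Counter-based Jaccard similarity on two made-up word lists. Counters support
# intersection (&, minimum counts) and union (|, maximum counts). Whether the number
# of elements should count every occurrence (sum of the counts, as below) or only
# unique words (len of the counter) depends on the definition used in the course.
example_counter_current=Counter(["risk","factors","increased","risk"])
example_counter_previous=Counter(["risk","factors","decreased"])
example_intersection=example_counter_current & example_counter_previous
example_union=example_counter_current | example_counter_previous
example_jaccard=sum(example_intersection.values())/sum(example_union.values())
print(example_jaccard)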
@ -0,0 +1,159 @@
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 21 09:38:32 2022

@author: Alexander Hillert, Goethe University Frankfurt
"""

import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV


# adjust the directory to your folder
directory="C:/Lehre/Machine Learning/Data/"


# import the data for this problem
# NOTE: IT MIGHT TAKE 3 TO 5 MINUTES TO OPEN THE DATA
data_frame=pd.read_csv(directory+"form_10-Ks_machine_learning_2007_2008_all_variables_v1.csv",sep=";")
# The rows of the data are the Form 10-K filings. Each line is one filing.
# The columns are the variables. After some identifying information,
# you find the word frequencies, i.e., how often a word (e.g., "the") shows up
# in a 10-K (e.g., 100 times).


# WARNING: THE DATA SET IS TOO LARGE TO BE DISPLAYED -> Variable Explorer
# and Console will crash.
# However, you can pick a small subset of the data and look at it.
# It lists all columns=variables and the first three observations.
data_frame_example=data_frame.head(3)
# you can click on this variable in the variable explorer without Spyder crashing.

# To see the variables included in the data use the following command
data_frame_column_names=data_frame.columns
# you can click on this variable in the variable explorer without Spyder crashing.
# This variable shows all column/variable names in a vector.

# split the data set into the training and testing data
# we use the filings from year 2007 as training data
data_frame_train=data_frame[data_frame.year==2007]
# and the filings from year 2008 as testing data
data_frame_test=data_frame[data_frame.year==2008]

# put the cumulative abnormal return around the filing date into a new variable.
# we follow Loughran and McDonald (2011) and use the CAR from t to t+4.
# training data
filing_car_train=data_frame_train["excess_ret_t0_t4"]
# testing data
filing_car_test=data_frame_test["excess_ret_t0_t4"]

# so far, you have absolute word counts. For example, "loss" is found 5 times.
# As the lengths of the 10-Ks can differ, we scale by the number of words
# in the 10-K.
document_length_train=data_frame_train["number_of_words"]
document_length_test=data_frame_test["number_of_words"]


# the word frequencies are our independent variables -> restrict the data frame
# to those variables and drop all variables that are not needed
data_frame_train=data_frame_train.drop(columns=["cik","year","month","link","filing_type","filing_date","excess_ret_t0_t4","number_of_words"])
data_frame_test=data_frame_test.drop(columns=["cik","year","month","link","filing_type","filing_date","excess_ret_t0_t4","number_of_words"])

# compute relative frequencies, i.e., divide the absolute word counts by the document length
data_frame_train=data_frame_train.div(document_length_train, axis=0)
data_frame_test=data_frame_test.div(document_length_test, axis=0)

# standardize the data frames
# training data
data_frame_train_mean=TO BE COMPLETED
data_frame_train_sd=TO BE COMPLETED
data_frame_train_standardized=TO BE COMPLETED
# testing data
data_frame_test_mean=TO BE COMPLETED
data_frame_test_sd=TO BE COMPLETED
data_frame_test_standardized=TO BE COMPLETED


# There can be missing values in the standardized variables.
# They arise if the word count for a specific word is always zero in the training
# or in the testing data. In this case, the standard deviation is zero ->
# division by zero -> NaN.
# We replace these missing values by zero.
# training data
data_frame_train_standardized=data_frame_train_standardized.fillna(0)
# testing data
data_frame_test_standardized=data_frame_test_standardized.fillna(0)

##########################
# Ridge regression
##########################
print("\nRidge regression - Using cross-validation\n")
# Regress the CARs on the word frequencies using Ridge regressions with cross-validation.
# In this regression, we use the training data.
# We use five-fold cross-validation.
# Recommendation for initial alphas/lambdas: 100000, 150000, and 200000
regression_Ridge_cv=RidgeCV(alphas=TO BE COMPLETED, fit_intercept=True,cv=5).fit(TO BE COMPLETED)

# get the optimal lambda
alpha_optimal_cv=TO BE COMPLETED
print("The optimal alpha is "+str(alpha_optimal_cv))

# what is the R2 in the training and testing data?
print("The R2 in the training data is: "+str(regression_Ridge_cv.TO BE COMPLETED))
print("The R2 in the testing data is: "+str(regression_Ridge_cv.TO BE COMPLETED))

# Mean squared error using the cross-validated model
# predict y in the full training sample
filing_car_train_predicted_Ridge=regression_Ridge_cv.TO BE COMPLETED
# predict y in the testing sample
filing_car_test_predicted_Ridge=regression_Ridge_cv.TO BE COMPLETED
# Determine the MSE
print("The MSE in the full training data is: "+str(mean_squared_error(TO BE COMPLETED)))
print("The MSE in the testing data is: "+str(mean_squared_error(TO BE COMPLETED)))


######################
# LASSO regression
######################
print("\nLASSO regression - Using cross-validation\n")
# Regress the CARs on the word frequencies using LASSO regressions with cross-validation.
# In this regression, we use the training data.
# We use five-fold cross-validation.
# Recommendation for initial alphas/lambdas: 0.5, 1, and 1.5
regression_Lasso_cv=LassoCV(alphas=TO BE COMPLETED, fit_intercept=True,cv=5).fit(TO BE COMPLETED)

# get the optimal lambda
alpha_optimal_cv=TO BE COMPLETED
print("The optimal alpha is "+str(alpha_optimal_cv))

# get the R2 in the training data
print("The R2 in the training data is: "+str(regression_Lasso_cv.TO BE COMPLETED))
# ... and testing data
print("The R2 in the testing data is: "+str(regression_Lasso_cv.TO BE COMPLETED))

# Mean squared error using the cross-validated model
# predict y in the full training sample
filing_car_train_predicted_Lasso=regression_Lasso_cv.TO BE COMPLETED
# predict y in the testing sample
filing_car_test_predicted_Lasso=regression_Lasso_cv.TO BE COMPLETED
# Determine the MSE
print("The MSE in the full training data is: "+str(mean_squared_error(TO BE COMPLETED)))
print("The MSE in the testing data is: "+str(mean_squared_error(TO BE COMPLETED)))


############################################################
# Compare the betas from the Ridge and the LASSO regressions
############################################################
output_file=open(directory+"comparison_coefficients_Ridge_LASSO_10-Ks.csv","w",encoding="utf-8")
output_file.write("index;word;coefficient_Ridge;coefficient_LASSO\n")

# get the list of coefficients
for i in range (0,len(data_frame_train.columns)):
    output_file.write(str(i)+';'+data_frame_train.columns[i]+';'+str(regression_Ridge_cv.coef_[i])+';'+str(regression_Lasso_cv.coef_[i])+'\n')

output_file.close()

print("Completed!")
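# Illustration (not part of the template): a minimal sketch of the standardization and
# of a cross-validated Ridge fit on the training data, using separate sketch_ variables
# so the placeholders above stay untouched. The alpha grid is only the suggested
# starting point from the comments above, not a tuned choice.
sketch_train_mean=data_frame_train.mean()
sketch_train_sd=data_frame_train.std()
sketch_train_standardized=((data_frame_train-sketch_train_mean)/sketch_train_sd).fillna(0)
sketch_ridge=RidgeCV(alphas=[100000,150000,200000], fit_intercept=True, cv=5).fit(sketch_train_standardized, filing_car_train)
print(sketch_ridge.alpha_)                                            # optimal alpha/lambda
print(sketch_ridge.score(sketch_train_standardized, filing_car_train))  # in-sample R2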
88 lectures/programming/templates/Problem_1_form.py Normal file
@ -0,0 +1,88 @@
# -*- coding: utf-8 -*-
"""
Created on Fri Feb 15 21:37:53 2019

@author: Alexander Hillert, Goethe University Frankfurt
"""

# Please adjust the directory to your machine.
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# It is important to use a single forward slash / but not a single backslash \.

# For MAC users: your directory will usually start with "/Users/". For example:
#directory="/Users/FirstnameLastname/Textual Analysis/Programming/Files/"

# open the Fun_with_Python text file
input_file=open(directory+"Fun_with_Python.txt","r")

###################################
# Programming Problem 1
###################################

# Task 1: open the file 'Fun_with_Python.txt' in Spyder and print its content
# The file can be found in our data folder

# get the text from the file
input_text= TO BE COMPLETED
# print the content, i.e., the text of the file (previous line)
print(TO BE COMPLETED)

# See slide 7


# Task 2: Write the content of 'Fun_with_Python.txt' to a new text file
# with the name 'More_fun_with_Python.txt'.

# ENTER YOUR COMMANDS HERE
# See slide 8.
# REMEMBER to close your file. If you do not close the new txt file, its content
# will not be saved to the hard drive. You will find an empty txt file in your file manager.


# Task 3: Write a loop that prints some text (whatever you like) ten times.

# ENTER YOUR COMMANDS HERE
# See slide 9.
# You have several options: a while loop, a for X in range() loop, etc.



# Task 4: Print the text of the "Fun_with_Python" file line by line!

# ENTER YOUR COMMANDS HERE
# See slide 10.
# You need a loop (Task 3) and in each iteration of the loop have Python print
# a line of text.



# Task 5: Count how often the word 'good' appears in the document 'Fun_with_Python.txt'!

# ENTER YOUR COMMANDS HERE
# See slide 11.



# Task 6a: Now, print only the lines that contain the word 'good'!

# ENTER YOUR COMMANDS HERE
# See also slide 12.
# You can use the line-by-line printing from Task 4 and combine it with the command ".count()" from Task 5
# and add the if condition from slide 12.
# If condition: for each line, check whether the specific line contains the word "good".



# Task 7: print only the lines that start with the word 'This'!

# ENTER YOUR COMMANDS HERE
# See slide 15.
# This is very similar to Task 6. You only need to modify the if condition a bit.




# Task 8a: Replace the word "good" by "excellent" and display the new text!
# See slide 16.
# ENTER YOUR COMMANDS HERE
@ -0,0 +1,72 @@
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 29 09:21:46 2015

@author: Alexander Hillert, Goethe Uni Frankfurt
"""

# Please adjust the directory to your machine.
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# It is important to use a single forward slash / but not a single backslash \.

# For MAC users: your directory will usually start with "/Users/". For example:
#directory="/Users/FirstnameLastname/Textual Analysis/Programming/Files/"

# Open the txt file with the SEC filings
sec_filings_file=open(directory+'formidx_1998Q1.txt','r')
sec_filings_text=sec_filings_file.read()

# Create output file
output_file=open(directory+'SEC_Filings_Output.csv','w')

# Create the first line with the variable names
# I use semicolons as separators in csv files. You can also use any other symbol.
# However, you should make sure that the separator is not part of the data/text
# you write to the file.
# For example, it would be problematic if you used a comma as separator and had
# company names like "AMERICAN HEALTHCORP, INC." or "AMERICAN FUNDING, INC."
output_file.write("Form_Type;Company_Name;CIK;Filing_Date;Link\n")


# Split the input file into separate lines
# DO THE LINE SPLIT
sec_filings_line=

# Loop over all lines
# you can get the number of lines by computing the length of the list of lines,
# i.e., by determining the length of sec_filings_line.
for / while : # COMPLETE THE LOOP

    # Does the line refer to a form 10-K file?
    if : # USE AN IF CONDITION TO TEST THIS -> SEE TASKS 7 AND 8 OF PROBLEM 1

        # Split the line such that the information can be saved in separate
        # variables
        # Each information item has a fixed length in the overview files of the
        # SEC.
        # SEE SLIDE 18 FOR INFORMATION ON THE LENGTH OF THE SEPARATE COLUMNS.

        # COMPLETE THE COMMANDS BELOW
        filing_type=
        company_name=
        cik=
        filing_date=
        link=

        # Is the 10-K filed between March 10 and March 20?
        filing_day=
        filing_month=
        # Is the filing month March?
        if : # COMPLETE THE IF-CONDITION
            # Is the filing day between 10 and 20?
            if : # COMPLETE THE IF-CONDITION
                # The filing meets the conditions -->
                # Write output to the csv file
                output_file.write(filing_type+";"+company_name+";"+cik+";"+filing_date+";"+link+"\n")


# Close your input and output file at the end
sec_filings_file.close()
output_file.close()

print("DONE")
@ -0,0 +1,103 @@
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 29 11:07:10 2015

@author: Alexander Hillert, Goethe Uni Frankfurt
"""

# Please adjust the directory to your machine.
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# It is important to use a single forward slash / but not a single backslash \.

# For MAC users: your directory will usually start with "/Users/". For example:
#directory="/Users/FirstnameLastname/Textual Analysis/Programming/Files/"


# We need the urllib package for the download.
import urllib.request
# To automatically create folders, we need the os module (OS: Operating System)
import os

###############################################################################
# Technical issue
# As of March 2021, the SEC no longer accepts requests with the standard urllib settings;
# you have to make some adjustments.
###############################################################################
# Define a user agent
# Information on user agents is from https://docs.python.org/3/howto/urllib2.html:
# "Some websites dislike being browsed by programs, or send different versions
# to different browsers. By default urllib identifies itself as Python-urllib/x.y
# (where x and y are the major and minor version numbers of the Python release,
# e.g. Python-urllib/2.5), which may confuse the site, or just plain not work.
# The way a browser identifies itself is through the User-Agent header."
opener = urllib.request.build_opener()

# The SEC recently rejected requests from the Python-urllib/x.y user agent (see above).
# To still automatically download files, you have different options.
# I have listed three examples below but there are many more:
# For a comprehensive list see, e.g.:
# https://developers.whatismybrowser.com/useragents/explore/software_type_specific/web-browser/
#opener.addheaders = [('User-agent', 'Mozilla')]
#opener.addheaders = [('User-agent', 'Chrome')]
opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)')]
urllib.request.install_opener(opener)
# END of the technical issues



# Open the csv file from part 1 of the problem
input_file=open(directory+'SEC_Filings_Output.csv','r')
input_text=input_file.read()

# Split the input file into separate lines
input_text_line=input_text.split("\n")

# Create a subfolder in which the 10-K filings are saved.
# When you download a large number of filings I recommend using subfolders for
# each year or even for each year-month-day combination.
# In this problem, a single subfolder is fine.
os.makedirs( COMPLETE THE COMMAND )
# See slide 18 for information on the os. commands!
# IN GENERAL, IF YOU SEE AN UNKNOWN COMMAND, GOOGLE IT TO GET INFORMATION.

# Loop over all lines of the csv file
# Like in part 1 of the problem, you can get the number of lines by computing
# the length of the list of lines, i.e., by determining the length of input_text_line.
for / while: # COMPLETE THE LOOP
    # split the line into the five variables
    # THE ; IS THE SEPARATOR IN THE CSV -> USE THE split() COMMAND
    variables=

    # We only need the cik and the link to download the file.
    # The cik is the 3rd variable.
    # The link is the 5th variable.
    cik=
    link=

    # identify the filename
    # The link consists of different parts:
    # For example: edgar/data/1000753/0000950129-98-001035.txt

    link_parts= # USE A SPLIT
    # 1st part: edgar
    # 2nd part: data
    # 3rd part: cik
    # 4th part: file name -> see next line
    filename=link_parts[FILL IN THE NUMBER HERE]
    ###########################################################################
    ############################ WARNING #####################################
    # The filename does NOT uniquely identify the SEC filings, as different firms (CIKs)
    # may use the same filename. Thus, when you only use the filename, files
    # might be overwritten. To avoid this problem you need to have a unique name.
    # Combining CIK and filename results in a unique identifier, as the
    # filename appears only once per firm (CIK).
    # -> use the combination of CIK and filename: cik_filename
    ###########################################################################
    urllib.request.urlretrieve(TO BE COMPLETED)
    # See slide 19 for information on the urllib. commands.


# Close your input file
input_file.close()

print("DONE")
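# Illustration (not part of the template): one possible shape of the two commands left
# open above, shown as comments because cik, link, and filename are only defined once
# the loop is completed. The subfolder name '10-K_Filings' is an assumption; the links
# in the index file are relative to https://www.sec.gov/Archives/.
#os.makedirs(directory+'10-K_Filings/', exist_ok=True)
#urllib.request.urlretrieve('https://www.sec.gov/Archives/'+link, directory+'10-K_Filings/'+cik+'_'+filename)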
@ -0,0 +1,121 @@
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 12 15:50:22 2016

@author: Alexander Hillert, Goethe University Frankfurt
"""

# Import regular expressions and BeautifulSoup
import re
from bs4 import BeautifulSoup

# Please adjust the directory to your machine.
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# It is important to use a single forward slash / but not a single backslash \.
# For MAC users: your directory will usually start with "/Users/". For example:
#directory="/Users/FirstnameLastname/Textual Analysis/Programming/Files/"

# Open the document
input_file=open(directory+'Exercise_4_Application_Regular_Expressions.txt','r',encoding="utf-8")
input_text=input_file.read()

#######################
# Task 1: remove tables
#######################
# Approach:
# We search for tables until we find no more html tags that indicate the
# beginning of a table.
# Search for the start html-tag <TABLE>
table_match=re.search(TO BE COMPLETED, input_text)
while : # YOU NEED A LOOP THAT SEARCHES FOR TABLES
    # When we have identified a match, i.e., the start of a table, we save
    # the position of the beginning of the table in the variable "start_table"
    table_start_match=re.search(XXX, input_text)
    start_table=table_start_match.start()
    # Next, we search for the corresponding html tag that indicates the end of
    # the table and save the end position to the variable "end_table"

    # REPEAT THE COMMANDS ABOVE FOR THE END OF THE TABLE
    table_end_match=
    end_table=

    # We can print the text between the start and end html tags to check whether
    # the table has been identified correctly.
    print("The text below is a table!\n"+input_text[start_table:end_table])

    # the text between the beginning and end of the html tags is the part which
    # we would like to delete.
    # Consequently, we keep the text before the beginning of the table as well
    # as the text after the end of the table.
    input_text=TO BE COMPLETED
    # Next, we need to check whether there is another table in the rest of the
    # text.
    table_match=re.search(SAME COMMAND AS IN LINE 27, input_text)
    # As long as "table_match" exists, i.e., the regex results in a match, the loop
    # will continue.

#########################
# Task 2: remove exhibits
#########################
# Exhibits have the following structure:
# <DOCUMENT>
# <TYPE>EX...
# ...
# </DOCUMENT>

# THE APPROACH IS THE SAME AS THE SEARCH FOR TABLES ABOVE
exhibit_match=re.search(, input_text)
while :
    # get the beginning of the exhibit
    exhibit_start_match=
    start_exhibit=
    # As the exhibits are at the end of the 10-K filing, it would not be
    # necessary to include an end position. We could also drop the entire text
    # after "<TYPE>EX".
    # However, for completeness, we will define an end.
    exhibit_end_match=
    end_exhibit=
    # Print the identified text to check whether the exhibit has been identified
    # correctly
    print("The text below is an exhibit!\n"+input_text[start_exhibit:end_exhibit])

    input_text=TO BE COMPLETED
    # Check whether there are further exhibits
    exhibit_match=re.search(SAME COMMAND AS IN LINE 65, input_text)

##########################
# Task 3: remove html code
##########################
# Alternative 1: remove html code without Beautiful Soup
text=re.sub(TO BE COMPLETED, '', input_text)
# Use a regex that searches for a "<" followed by at least one character that must not
# equal > and is completed by >.

# Alternative 2: remove html code using Beautiful Soup
html_text=BeautifulSoup(TO BE COMPLETED)
text=html_text.TO BE COMPLETED

########################
# Task 4: delete numbers
########################

# YOU MAY NEED MULTIPLE COMMANDS TO DELETE ALL NUMBERS
# Remember that you can have different formats, e.g., 1,234.56 or 0.12 or 1,234,567
text=re.sub(TO BE COMPLETED,'',text)

########################
# Task 5: delete symbols
########################
text=re.sub(TO BE COMPLETED,'',text)


# Open the output file for the pure text
output_file=open(directory+'Exercise_4_Application_Regular_Expressions_clean.txt','w',encoding="utf-8")
output_file.write(text)

# close all files
input_file.close()
output_file.close()

print("DONE")
@ -0,0 +1,164 @@
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 12 15:50:22 2016

@author: Alexander Hillert, Goethe University Frankfurt
"""
import re
from bs4 import BeautifulSoup

# Please adjust the directory to your machine.
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# It is important to use a single forward slash / but not a single backslash \.
# For MAC users: your directory will usually start with "/Users/". For example:
#directory="/Users/FirstnameLastname/Textual Analysis/Programming/Files/"

# Open the 10-K
input_file=open(directory+'0000950130-98-001359.txt','r',encoding='ascii',errors='ignore')
input_text=input_file.read()

################################
# Remove tables
# Same approach as in Problem 4
################################
# Sometimes it is helpful to print the text parts that are deleted. In this
# example, we will print the first two tables that we delete.
i=1
table_match=re.search(ENTER THE REGEX, input_text)
while table_match:
    # Search for the beginning of the table
    table_start_match=re.search(REGEX FOR BEGINNING OF TABLE, input_text)
    start_table=
    # search for the end of the table
    table_end_match=REGEX FOR END OF TABLE
    end_table=
    # The if condition and the printing are just for illustrative purposes.
    # The commands display the first two tables that are removed from the text.
    if i<=2:
        print("This is the "+str(i)+". table in the 10-K.\n"+input_text[start_table:end_table]+"\n")
        i=i+1
    # remove the table from the original text
    input_text=TO BE COMPLETED
    # check whether there are further tables
    # same command as in line 24
    table_match=re.search(XXXXXXX, input_text)

################################
# Remove exhibits
# Same approach as in Problem 4
################################
# Exhibits have the following structure:
# <DOCUMENT>
# <TYPE>EX...
# ...
# </DOCUMENT>
# Sometimes it is helpful to print the text parts that are deleted. In this
# example, we will print the first exhibit that we delete.
i=1
exhibit_match=re.search(ENTER THE REGEX, input_text)
while exhibit_match:
    # Search for the beginning of the exhibit
    exhibit_start_match=re.search(REGEX FOR BEGINNING OF EXHIBIT, input_text)
    start_exhibit=
    # Search for the end of the exhibit
    # CAUTION: search only in the text after the beginning of the exhibit, as
    # the end term also appears earlier (e.g., end of main document)
    exhibit_end_match=re.search(REGEX FOR END OF EXHIBIT, input_text[START OF EXHIBIT UNTIL END OF TEXT])
    end_exhibit=
    if i<=1:
        print("This is the "+str(i)+". exhibit in the 10-K.\n"+input_text[start_exhibit:end_exhibit]+"\n")
        i=i+1
    # remove the exhibit from the original text
    input_text=
    # check whether there are further exhibits
    # same command as in line 55
    exhibit_match=re.search(XXXXXXX, input_text)

##################
# Remove html code
##################
# you can use BeautifulSoup for simplicity
html_text=BeautifulSoup(input_text, 'html.parser')
text=html_text.get_text()

############################
# Remove the document header
############################
# There are different possibilities for how one can define the start of the main part of the text.
# In general, you should delete all text that is uninformative for your analysis.
header_match=re.search(END OF DOCUMENT HEADER, text)
if header_match:
    # Drop the document header and keep only the rest of the text after the header.
    text=text[XXXXXXXXXXXXXXX]


#################################################
# Delete the text in "PART IV"
# This procedure is optional. Look at "Part IV" and decide whether you favor
# the approach. I think that the part should be dropped, as it is just a list
# of exhibits and some mandatory text required by the SEC [indicated by the
# capital letters in the "SIGNATURES" section].
#################################################
'''
# Alternative 1: go over all matches but keep only the last one
for match in re.finditer('\s{2,}PART IV\s{0,}\n', text):
    pass
# match now contains the last match.
# Delete the text after the last match
text=text[:match.start()]

# Alternative 2: save the positions of all matches (more general approach)
list_start_matches=[]
list_end_matches=[]
for match in re.finditer('\s{2,}PART IV\s{0,}\n', text):
    list_start_matches.append(match.start())
    list_end_matches.append(match.end())
# Position of the last match
print(list_start_matches[len(list_start_matches)-1])
print(list_end_matches[len(list_start_matches)-1])

# Delete the text after the last match
text=text[:list_start_matches[len(list_start_matches)-1]]
'''

# Delete item numbers
# This is optional. It removes "Item 1.", "ITEM 1.", "Item 10.", "Item 7A."
text=re.sub(TO BE COMPLETED,'',text)

# Delete numbers
# You can use the code from Problem 4.
text=re.sub(TO BE COMPLETED,'',text)


# Hyphens can be used to indicate that a word is continued in the next
# line. For example, "Micro-\nsoft" (\n is the line feed).
# Delete hyphens that are followed by a line feed.
text=re.sub(TO BE COMPLETED,'',text)

# Delete symbols
# You can use the code from Problem 4.
text=re.sub(TO BE COMPLETED,'',text)

# Delete dots and commas that are not part of sentences, i.e., commas and dots
# that are preceded by whitespace or a line break and that are followed by
# whitespace or a line break.
text=re.sub('\n(\.|,)\n','\n',text)

# Drop single-character words
# One can argue whether one should implement this procedure. Loughran and
# McDonald argue in one of their papers in favor of it.
# To make sure that there is just one letter, we require that there is a word
# boundary (\W) before and after. We use a positive look-behind and a
# positive look-ahead condition for this to ensure that the word boundaries
# do not get deleted as well.
text=re.sub(TO BE COMPLETED,' ',text)


# Open the output file for the pure text
output_file=open(directory+'0000950130-98-001359_clean.txt','w',encoding='ascii',errors='ignore')
output_file.write(text)

input_file.close()
output_file.close()
print("COMPLETED.")
117 lectures/programming/templates/Problem_7_Tone_Analysis_form.py Normal file
@ -0,0 +1,117 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Wed Apr 13 22:43:32 2016
|
||||
|
||||
@author: Alexander Hillert, Goethe University Frankfurt
|
||||
"""
|
||||
|
||||
import re
|
||||
|
||||
# Please adjust the directory to your machine.
|
||||
directory="C:/Lehre/Textual Analysis/Programming/Files/"
|
||||
|
||||
# Open the dictionary
|
||||
# The dictionary has been obtained from Bill McDonald's webpage
|
||||
# http://www3.nd.edu/~mcdonald/Word_Lists.html
|
||||
# --> LoughranMcDonald_MasterDictionary_2014.xlsx
|
||||
# --> select negative words and copy them to a txt file
|
||||
file_word_list=open(directory+'LMD_Neg.txt','r',encoding="utf-8")
|
||||
word_list=file_word_list.read()
|
||||
# LOOK AT THE FILE. ARE THE WORDS IN UPPER OR IN LOWER CASE?
|
||||
# MAKE SURE THAT YOU USE A CONSISTENT FORMAT FOR THE TEXT AND THE DICTIONARY.
|
||||
# THE COMMANDS ARE .lower() AND .upper().
|
||||
|
||||
# CREATE A LIST OF NEGATIVE WORDS -> SPLIT THE TEXT
|
||||
negative_words=word_list.XXXX
|
||||

# Open the csv file containing the list of the 200 10-Ks
input_file=open(directory+'10-K_Sample_2011Q1_Input.csv','r',encoding="utf-8")
input_text=input_file.read()

# Split the input file into separate lines
input_text_line=input_text.split("\n")

# In general, there can be empty lines in the input file. The following command
# deletes these lines.
while input_text_line.count("")>0:
    input_text_line.remove("")

# Create output file
output_file=open(directory+'10-K_Sample_2011Q1_Output_Negative_Tone.csv','w',encoding="utf-8")
# Write variable names to the first line of the output file
output_file.write('CIK;Filename;Number_Words;Number_Negative_Words;\
Percentage_Negative_Words\n')

# Loop over all lines of the csv file
for i in range(1,len(input_text_line)):
    # If the execution of your scripts takes some time, printing the loop iterator
    # gives you an impression of the overall progress made.
    print(str(i))

    # split the line into the two variables
    variables=input_text_line[i].split(";")
    # We need the CIK (1st column) and the filename (2nd column)
    cik=variables[0]
    filename=variables[1]

    # modify the file name to open the edited files
    filename=filename.replace('.txt','')
    # Open the ith 10-K in the list
    input_file_10_k=open(directory+'10-K_Sample_clean/'+cik+'_'+filename+'_clean.txt','r',\
        encoding='ascii',errors='ignore')
    input_text_10_k=input_file_10_k.read()

    # CONVERT THE TEXT TO UPPER OR LOWER CASE (see comment above)
    # It is important that the formatting (lower case vs. upper case) of the word list
    # and the document is identical. Remember that you typically have lower and upper case
    # letters in documents -> modify the text
    text=input_text_10_k.XXXXXX
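    # For instance (a sketch; pick the case that matches your word list):
    # text=input_text_10_k.lower()
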

    # Split the text into words to determine the total number of words
    # LOOK AT THE REGEX INTRODUCTION FOR A SUITABLE SPLIT VARIABLE.
    list_of_words=re.split(XXXXX, text)
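    # One possible split pattern (a sketch; the regex introduction may suggest a
    # different one):
    # list_of_words=re.split('\s+', text)
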

    # ARE THERE EMPTY ELEMENTS IN THE LIST OF WORDS?
    # Make sure that empty list elements do not bias the word count -> delete them!
    # You can use an approach similar to the one in lines 37 and 38.
    COMMANDS TO BE ADDED
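    # For instance (a sketch mirroring the while loop used for input_text_line above):
    # while list_of_words.count("")>0:
    #     list_of_words.remove("")
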

    # Determine the total number of words
    # COUNT THE NUMBER OF ELEMENTS IN list_of_words
    word_count=XXXX
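    # For instance (a sketch):
    # word_count=len(list_of_words)
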

    # Reset the number of negative words to zero
    negative_count=0
    # For each negative word, count the number of occurrences
    for j in range(len(negative_words)):

        HERE YOU NEED TO COUNT HOW OFTEN THE jth NEGATIVE WORD IS FOUND IN THE TEXT.
        COMPARE THE TWO CASES BELOW -> EXECUTE THE COMMANDS (see lines below) IN
        THE COMMAND LINE AND COMPARE THE RESULTS.
        WHICH ALTERNATIVE IS THE RIGHT APPROACH?

        ALTERNATIVE 1:
        list_of_words=["abandon","abandoned","abandonment"]
        list_of_words.count("abandon")
        ALTERNATIVE 2:
        text_of_words="abandon abandoned abandonment"
        text_of_words.count("abandon")

        ADD THE CORRECT COUNT OF NEGATIVE WORD j TO YOUR OVERALL COUNT.
        negative_count=negative_count+XXXXX
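        # Alternative 1 returns 1 (exact word matches only), whereas Alternative 2
        # returns 3 because .count() on a string also matches substrings such as
        # "abandoned". A sketch of the word-based count (not the official solution):
        # negative_count=negative_count+list_of_words.count(negative_words[j])
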

    # Get the percentage of negative words
    percentage_negative=negative_count/word_count

    # Write cik, file name, total number of words, number of negative words,
    # and the percentage of negative words to the output file.
    output_file.write(cik+';'+filename+'_clean.txt;'+str(word_count)+';'\
        +str(negative_count)+';'+str(percentage_negative)+'\n')

    # Close filings
    input_file_10_k.close()

print("Finished")
output_file.close()
input_file.close()

@@ -0,0 +1,131 @@
# -*- coding: utf-8 -*-
"""
Created on Wed Apr 13 22:43:32 2016

@author: Alexander Hillert
"""

import re

# Please adjust the directory to your machine.
directory="C:/Lehre/Textual Analysis/Programming/Files/"

# Open the dictionary
# The dictionary is obtained from Bill McDonald's webpage
# http://www3.nd.edu/~mcdonald/Word_Lists.html
# --> LoughranMcDonald_MasterDictionary_2014.xlsx
# --> select positive words and copy them to a txt file
file_word_list=open(directory+'LMD_Pos.txt','r',encoding="utf-8")
word_list=file_word_list.read()

# LIKE IN PROBLEM 7, YOU HAVE TO APPLY A CONSISTENT FORMAT TO BOTH THE LMD-WORDS
# AND THE TEXT OF THE 10-Ks.
positive_words=word_list.split()

# Open the csv file containing the list of the 200 10-Ks
input_file=open(directory+'10-K_Sample_2011Q1_Input.csv','r',encoding="utf-8")
input_text=input_file.read()

# Split the input file into separate lines
input_text_line=input_text.split("\n")

# In general, there can be empty lines in the input file. The following command
# deletes these lines.
while input_text_line.count("")>0:
    input_text_line.remove("")

# Create output file
output_file=open(directory+'10-K_Sample_2011Q1_Output_Positive_Tone.csv','w',encoding="utf-8")
# Write variable names to the first line of the output file
output_file.write('CIK;Filename;Number_Words;Number_Pos_Words;Number_Pos_Words_adj;'\
+'Percent_Pos_Words;Percent_Pos_Words_adj\n')

# Iterate over the list of the 200 10-K filings
for i in range(1,len(input_text_line)):
    # If the execution of your scripts takes some time, printing the iterator
    # gives you an impression of the overall progress made.
    print(str(i))

    # split the line into the two variables
    variables=input_text_line[i].split(";")
    # We need the CIK (1st column) and the filename (2nd column)
    cik=variables[0]
    filename=variables[1]

    # modify the file name to open the edited files
    filename=filename.replace('.txt','')

    # Open the ith 10-K in the list
    input_file_10_k=open(directory+'10-K_Sample_clean/'+cik+"_"+filename+'_clean.txt','r',\
        encoding='ascii',errors='ignore')
    input_text_10_k=input_file_10_k.read()

    # It is important that the formatting (lower case vs. upper case) of the word list
    # and the document is identical. Remember that you typically have lower and upper case
    # letters in documents -> modify the text
    text=XXXX # CONSISTENT FORMAT

    # Split the text into single words to determine the total number of words
    list_of_words=re.split(XXXX, text) # USE THE SAME COMMAND AS IN PROBLEM 7

    # ARE THERE EMPTY ELEMENTS IN THE LIST OF WORDS?
    # Make sure that empty list elements do not bias the word count -> delete them!
    # You can use an approach similar to the one in lines 34 and 35.
    COMMANDS TO BE ADDED

    # Determine the total number of words
    word_count=XXXX # SAME COMMAND AS IN PROBLEM 7
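    # A combined sketch of the four steps above (assumptions: the word list and the
    # hard-coded negation words further below are in lower case, and the same split
    # pattern as in Problem 7 is used):
    # text=input_text_10_k.lower()
    # list_of_words=re.split('\s+', text)
    # while list_of_words.count("")>0:
    #     list_of_words.remove("")
    # word_count=len(list_of_words)
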

    # Reset the number of positive words and positive words adj. for negations to zero.
    positive_count=0
    positive_count_adj=0
    # For each positive word, count the number of occurrences
    for j in range(len(positive_words)):
        # standard count operation without controlling for negations
        positive_words_found=list_of_words.count(positive_words[j])

        # Loughran and McDonald (2011, JF, p.44): "We account for simple negation
        # only for Fin-Pos words. Simple negation is taken to be observations
        # of one of six words (no, not, none, neither, never, nobody) occurring
        # within three words preceding a positive word."

        # When we have identified positive words, we need to search for negations
        while positive_words_found>0:
            # identify the position of the matched positive word in the list of all words
            position_of_word=list_of_words.XXXXX # THE COMMAND .index() IS HELPFUL HERE

            # identify the three words before the positive word and add them to a list
            list_negation=[3_WORDS_BEFORE_MATCH,2_WORDS_BEFORE_MATCH,1_WORD_BEFORE_MATCH]
            # REPLACE THE THREE PLACEHOLDERS BY THE CORRESPONDING ELEMENTS OF list_of_words
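            # A possible sketch (not the official solution; it assumes the matched
            # word is not among the first three words of the document):
            # position_of_word=list_of_words.index(positive_words[j])
            # list_negation=[list_of_words[position_of_word-3],\
            #     list_of_words[position_of_word-2],list_of_words[position_of_word-1]]
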
            # check whether one of the three words in list_negation is a negation
            negation_found=list_negation.count('no')+list_negation.count('not')+XXXX TO BE COMPLETED
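            # A sketch of the completed check with the remaining four negation words:
            # negation_found=list_negation.count('no')+list_negation.count('not')+\
            #     list_negation.count('none')+list_negation.count('neither')+\
            #     list_negation.count('never')+list_negation.count('nobody')
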
            if negation_found==0:
                # no negation
                positive_count_adj=positive_count_adj+1
                positive_count=positive_count+1
            else:
                # negation
                positive_count=positive_count+1

            # delete the matched positive word in the original document
            list_of_words[position_of_word]=XXX
            # THIS OPERATION IS IMPORTANT BECAUSE OTHERWISE WE WILL GET AN ENDLESS LOOP
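            # For instance (a sketch): overwrite the matched word with an empty
            # string so that it cannot be matched again:
            # list_of_words[position_of_word]=''
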

            # check whether there are further matches of the jth positive word
            positive_words_found=list_of_words.count(positive_words[j])

    # Write cik, file name, total number of words, and number of positive
    # and adjusted positive words to the output file
    output_file.write(cik+';'+filename+'_clean.txt;'+str(word_count)+';'+\
        str(positive_count)+';'+str(positive_count_adj)+';'+str(positive_count/word_count)+\
        ';'+str(positive_count_adj/word_count)+'\n')

    # Close filings
    input_file_10_k.close()

print("Finished")
output_file.close()
input_file.close()

@@ -0,0 +1,70 @@
# -*- coding: utf-8 -*-
"""
Created on Wed Apr 13 22:43:32 2016

@author: Alexander Hillert, Goethe University Frankfurt
"""

# We split the text into words and sentences using regular expressions
import re

directory="C:/Lehre/Textual Analysis/Programming/Files/"

# Open the csv file containing the list of the 200 10-Ks
input_file=open(directory+'10-K_Sample_2011Q1_Input.csv','r',encoding="utf-8")
input_text=input_file.read()

# Create output file
output_file=open(directory+'10-K_Sample_2011Q1_Output_WPS.csv','w',encoding="utf-8")
# Write variable names to the first line of the output file
output_file.write('CIK;Filename;Number_Words;Number_of_Sentences;WPS\n')

# Split the input file into separate lines
input_text_line=input_text.split("\n")

# In general, there can be empty lines in the input file. The following command
# deletes these lines.
while input_text_line.count("")>0:
    input_text_line.remove("")

# Loop over all lines
for i in range(1,len(input_text_line)):
    print(str(i))
    # split the line into the two variables
    variables=input_text_line[i].split(";")
    # We need the CIK and the filename
    cik=variables[0]
    filename=variables[1]
    filename=filename.replace('.txt','')

    # Open the ith 10-K in the list
    input_file_10_k=open(directory+'10-K_Sample_clean/'+cik+"_"+filename+'_clean.txt','r',\
        encoding='ascii',errors='ignore')
    text=input_file_10_k.read()

    # Determine the number of sentences and the number of words
    # DETERMINE THE NUMBER OF WORDS; YOU KNOW THE COMMAND FROM PROBLEMS 7 AND 8.
    list_of_words=re.split(XXX, text)
    # Determine total number of words
    word_count=XXX
    # Split the text by symbols that indicate the end of a sentence
    # to determine the total number of sentences
    list_of_sentences=re.split(XXX, text)
    # Determine total number of sentences
    sentence_count=XXX
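    # A possible sketch (assumptions: the same word split as in Problems 7 and 8,
    # and sentences ending in ".", "!", or "?"):
    # list_of_words=re.split('\s+', text)
    # word_count=len(list_of_words)
    # list_of_sentences=re.split('\.|!|\?', text)
    # sentence_count=len(list_of_sentences)
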

    # Ratio of # of words over # of sentences
    wps=word_count/sentence_count

    # Write cik, file name, total number of words, total number of sentences,
    # and WPS to the output file
    output_file.write(cik+';'+filename+'_clean.txt;'+str(word_count)+';'+\
        str(sentence_count)+';'+str(wps)+'\n')

    # Close filing
    input_file_10_k.close()


print("Finished")
output_file.close()
input_file.close()