
Add programming files

- add the code files provided by the instructor
- the programming/files folder with the data files is NOT included
  here due to its size
- add a .gitignore file to exclude the data files' folder
Alexander Hess 2022-08-05 00:05:05 +02:00
commit a37c87d9c8
Signed by: alexander
GPG key ID: 344EA5AB10D868E0
38 changed files with 6416 additions and 0 deletions


@@ -0,0 +1,189 @@
# -*- coding: utf-8 -*-
"""
Created on Sat Jul 15 21:56:41 2017
@author: Alexander Hillert, Goethe University Frankfurt
"""
import nltk
import random
import collections
import re
# We will use the NLTK Corpus containing 2,000 Movie Reviews of which 1,000
# are positive and the other 1,000 are negative.
# if you do not have the movie review corpus yet, download it:
nltk.download("movie_reviews")
from nltk.corpus import movie_reviews
# Create a list that contains the tuples of document and category.
# Category is "positive" or "negative"
documents = []
# For all categories
for category in movie_reviews.categories():
print("Category: "+str(category))
# for all reviews (identified by file ID) in the respective category
for file_ID in movie_reviews.fileids(category):
# You have to use two pairs of parentheses: the inner pair creates the tuple that is appended.
documents.append((list(movie_reviews.words(file_ID)),category))
# Print the first element (i.e. tuple) of documents.
print(documents[0])
# print the words of the first movie review
print(documents[0][0])
# print the first word of the first movie review
print(documents[0][0][0])
# print the classification of the first movie review
print(documents[0][1])
# print the classification of the 1000th review (the last negative one)
print(documents[999][1])
# print the classification of the 1001st review (the first positive one)
print(documents[1000][1])
# The default order of the reviews is first all negative reviews and then all positive ones.
# Later we will build a training and a testing set. As we need to have positive and negative
# reviews in both sets, we randomly shuffle the documents.
random.shuffle(documents)
# Create a list of all words.
all_words = []
for word in movie_reviews.words():
# We use lower case words
#all_words.append(word.lower())
# Keep only tokens that start with a letter, i.e., drop punctuation and numbers.
if re.search(r"\A[a-z]",word.lower()):
# Alternative: only require at least one letter somewhere in the token.
#if re.search(r"[a-z]",word.lower()):
# We use lower case words
all_words.append(word.lower())
# What are the most frequently used words in the movie reviews?
# Alternative 1:
# FreqDist sorts words from the most frequently used word to the least frequently used word.
all_words_approach_1 = nltk.FreqDist(all_words)
print("Alternative 1: the top 15 words are: "+str(all_words_approach_1.most_common(15)))
# Alternative 2:
# We can also determine the most frequent words by using Counters as we did
# in Problem 12 --> transform list of all words to a Counter
all_words_approach_2=collections.Counter(all_words)
top_15_words=all_words_approach_2.most_common(15)
print("Alternative 2: the top 15 words are: "+str(top_15_words))
# -> identical results -> perfect.
# Search for a word and see how often it appears.
print("The word 'stupid' appears "+str(all_words_approach_1["stupid"])+" in the movie reviews.")
# alternatively
print("The word 'stupid' appears "+str(all_words_approach_2["stupid"])+" in the movie reviews.")
# How can we restrict the set of words that we use for training the Naive Bayes algorithm?
# -> create a list that only contains the top 3000 words
# get the top 3000 words
# Approach 1 using the nltk.FreqDist from above
i=0
top_3000_words=all_words_approach_1.most_common(3000)
list_top_3000_words_approach_1=[]
while i<3000:
list_top_3000_words_approach_1.append(top_3000_words[i][0])
i=i+1
# Approach 2 using Counters from above
i=0
top_3000_words=all_words_approach_2.most_common(3000)
list_top_3000_words_approach_2=[]
while i<3000:
list_top_3000_words_approach_2.append(top_3000_words[i][0])
i=i+1
# select the list of approach 1 or 2
word_features=list_top_3000_words_approach_1
# We need to identify the words we want to use for classification in the documents.
# We define a function for that.
def find_features(document):
words = set(document)
features = {}
# loop over all the words we consider for the classification
for word in word_features:
# The expression returns either True or False
features[word] = (word in words)
return features
# To get an idea of what the function find_features() does, let's print the features
# for one review.
print((find_features(movie_reviews.words('neg/cv000_29416.txt'))))
feature_set = [(find_features(review), category) for (review, category) in documents]
# What does the feature set look like?
print(feature_set[0])
# -> it is still a tuple
print(feature_set[0][0])
# the first element is the dictionary of the 3000 words we use for classification, with "True" or "False"
# depending on whether the word appears in the review
print(feature_set[0][1])
# the second element is the information on whether the review is positive or negative
# Define the training and testing set
# The training set comprises the first 1900 reviews and the testing set the last 100 reviews.
training_set=feature_set[:1900]
testing_set=feature_set[1900:]
# First we have to train the Naive Bayes Classifier.
# It will determine which of the words from word_features appear mostly in positive
# reviews and which appear mostly in negative reviews.
classifier=nltk.NaiveBayesClassifier.train(training_set)
# The following command prints the 20 words that best discriminate between
# positive and negative reviews.
classifier.show_most_informative_features(20)
# Let's classify the first element of feature_set
# The input for the classification needs to be the dictionary of words with True or False
print(classifier.classify(feature_set[0][0]))
print("The review is actually: "+str(feature_set[0][1]))
# classify the 100 reports from the testing set
# they have the positions 1900 to 1999 in the feature set.
i=1900
classified_set=[]
while i<2000:
classified_set.append(classifier.classify(feature_set[i][0]))
i=i+1
# Compare classification result with actual category
i=0
# In this list we save pairs of [predicted category, actual category]
comparison=[]
# In this list we simply save "accurate" and "inaccurate"
comparison_2=[]
while i<100:
comparison.append([classified_set[i],feature_set[i+1900][1]])
# If the predicted and actual classification match -> accurate
if comparison[i][0]==comparison[i][1]:
comparison_2.append("accurate")
else:
comparison_2.append("inaccurate")
i=i+1
print(comparison)
# We need the number of accurate and inaccurate classifications
comparison_counter=collections.Counter(comparison_2)
print(comparison_counter)
# NLTK can compute the accuracy directly
# What is the accuracy for the testing set?
print("Naive Bayes accuracy (in percent):", (nltk.classify.accuracy(classifier, testing_set))*100)
# Same value as from our own calculations -> perfect!
# What is the accuracy for the training set?
print("Naive Bayes accuracy in training data (in percent):", (nltk.classify.accuracy(classifier, training_set))*100)
# Higher than in the testing dataset -> expected.
print("completed!")


@@ -0,0 +1,55 @@
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 29 11:07:10 2015
@author: Alexander Hillert, Goethe University Frankfurt
"""
# To determine file sizes we need the os module
import os
# Please adjust the directory to your machine.
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# Open the csv file containing the list of the 200 10-Ks
input_file=open(directory+'10-K_Sample_2011Q1_Input.csv','r',encoding="utf-8")
input_text=input_file.read()
# Create output file
output_file=open(directory+'10-K_Sample_2011Q1_File_Size.csv','w',encoding="utf-8")
output_file.write('CIK;Filename;File_size_gross;File_size_net\n')
# Split the input file in separate lines
input_text_line=input_text.split("\n")
# In general, there can be empty lines in the input file. The following command
# deletes these lines.
while input_text_line.count("")>0:
input_text_line.remove("")
# Loop over all lines
for i in range(1,len(input_text_line)):
print(str(i))
# split the line into the two variables
variables=input_text_line[i].split(";")
# We need the CIK and the filename
cik=variables[0]
filename=variables[1]
filename=filename.replace('.txt','')
# File size of the complete submission file (gross file size)
# You have to divide the result by 1024 to get the size in kilobyte
# The file size will be affected by html code and exhibits.
# APPLY THE COMMAND THAT IS SHOWN ON SLIDE 62.
size_gross=XXX/1024
# File size of the main text file (net file size)
# You have to divide the result by 1024 to get the size in kilobyte
size_net=XXX/1024 # SAME COMMAND AS FOR GROSS FILE SIZE BUT APPLIED TO THE _clean.txt
output_file.write(cik+';'+filename+';'+str(size_gross)+';'+str(size_net)+'\n')
print("Finished")
output_file.close()
input_file.close()
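###############################################################################
# Illustrative sketch (not necessarily the command from slide 62, which is not
# reproduced here): one common way to obtain a file's size is os.path.getsize(),
# which returns the size in bytes; dividing by 1024 gives kilobytes as required
# above. The demo below measures a small temporary file, so it does not depend
# on the 10-K file paths, whose exact naming convention is only assumed.
###############################################################################
import os
import tempfile
with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False) as demo_file:
    demo_file.write('some sample text '*100)
demo_size_kb=os.path.getsize(demo_file.name)/1024
print('Size of the demo file in kilobyte: '+str(demo_size_kb))
# remove the temporary demo file again
os.remove(demo_file.name)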


@@ -0,0 +1,150 @@
# -*- coding: utf-8 -*-
"""
Created on Tue Jul 11 09:19:54 2017
@author: Alexander Hillert, Goethe University Frankfurt
"""
# We need regular expressions and counters (->collections)
import re
import collections
# for the bigram part, the sentence tokenizer is helpful
from nltk.tokenize import sent_tokenize
# Please adjust the directory to your machine.
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# Open the csv file containing the list of the 10-Ks from MSFT, KO, and 3M.
input_file=open(directory+'list_10-K_filings_textual_similarity.csv','r',encoding="utf-8")
input_text=input_file.read()
# Split the input file in separate lines
input_text_line=input_text.split("\n")
# In general, there can be empty lines in the input file. The following command
# deletes these lines.
while input_text_line.count("")>0:
input_text_line.remove("")
# Create an empty counter variable
words_counter=collections.Counter()
# counter for the extra task
bigram_counter=collections.Counter()
# Loop over all lines
for i in range(1,len(input_text_line)):
print(str(i))
# split the line into the eight variables
variables=input_text_line[i].split(";")
# We need the CIK (1st column) and the filename (8th column)
cik=variables[0]
filename_parts=re.split('/',variables[7])
filename=filename_parts[3].replace('.txt','')
# Open the ith 10-K in the list; remember to specify the encoding
# The files are available in the zip file "10-K_Textual_Similarity_edited.zip".
input_file_10_k=open(directory+'10-K_Textual_Similarity_edited/'+cik+'_'+\
filename+'_edited.txt', 'r', encoding='ascii', errors='ignore')
# if the command above does not work (error like "file not found" or "directory not found")
# please use the following command:
#input_file_10_k=open(directory+'10-K_Textual_Similarity/'+cik+'_'+filename+'_edited.txt','r',encoding='ascii',errors='ignore')
# read the content from the file
input_text_10_k=input_file_10_k.read()
# THINK ABOUT WHETHER WE SHOULD USE LOWER OR UPPER CASE; USE ONE OF THEM CONSISTENTLY!
input_text_10_k=
# Split text into words
list_of_words=re.split(r'\W{1,}',input_text_10_k)
# Remember: there can be empty list elements!
# Make sure that empty list elements do not bias the word count -> delete them!
# You can use an approach similar to the one in lines 24 and 25.
COMMANDS TO BE ADDED
# Add the words to our counter
words_counter=words_counter+XXXX # COMPLETE THIS COMMAND
#############################################
# optional part for the extra task on bigrams
#############################################
# create an empty list for the bigrams
'''
bigram_list=[]
# split the text into sentences
list_of_sentences=XXX
# create the bigrams IN EACH SENTENCE
for sentence in list_of_sentences:
# split the sentence into words
list_of_words=XXX
# remove empty elements
while list_of_words.count("")>0:
list_of_words.remove("")
# go over all potential two word combinations in the sentence.
for word_number in range(XXX,YYY):
# add the bigram (two words connected by whitespace) to the list
bigram_list.append(WORD_1 + " " + WORD_2)
# same command as in line 70
bigram_counter=bigram_counter+XXX
# end of extra task
'''
# Close the 10-K filing
input_file_10_k.close()
input_file.close()
######################
# Top 100 single words
######################
# Open the csv file containing the 100 most frequently used words
output_file=open(directory+'Problem_12_100_most_frequent_words.csv','w',encoding="utf-8",errors="ignore")
output_file.write("rank;word;count\n")
# Get the 100 most frequent words
top_100_words=words_counter.XXXX # COMPLETE THIS COMMAND
# Write the 100 most frequent words to the csv file
# REMEMBER: Python starts counting at 0, while humans start at 1.
# So, the most frequent word (rank 1 in human counting) is element 0 for Python.
for i in range(1,101):
output_file.write(str(i)+";"+XXXX (-> word)+";"+XXXX (-> the frequency of the word)+"\n") # COMPLETE THIS COMMAND
# Close the csv file
output_file.close()
######################
# Extra task
# Top 100 bigrams
######################
'''
# Open the csv file containing the 100 most frequently used BIGRAMS
output_file_bigram=open(directory+'Problem_12_100_most_frequent_bigrams.csv','w',encoding="utf-8")
output_file_bigram.write("rank;word;count\n")
# Get the 100 most frequent bigrams: same command as above
top_100_bigrams=bigram_counter.XXX
# Write the 100 most frequent bigrams to the csv file.
# same logic as above
for i in range(1,101):
output_file_bigram.write(str(i)+";"+XXXX (-> word)+";"+XXXX (-> the frequency of the word)+"\n") # COMPLETE THIS COMMAND
# Close the csv file
output_file_bigram.close()
'''
print("Task done!")


@@ -0,0 +1,83 @@
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 29 11:07:10 2015
@author: Alexander Hillert, Goethe University Frankfurt
"""
# We need regular expressions and stemming.
import re
from nltk.stem import PorterStemmer
# Depending on how you would like to split the text in words, you may need tokenize.
from nltk.tokenize import word_tokenize
# Please adjust the directory to your machine.
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# Open the csv file containing the list of the 10-Ks from MSFT, KO, and 3M
input_file=open(directory+'list_10-K_filings_textual_similarity.csv','r',encoding="utf-8")
input_text=input_file.read()
# Split the Input File in separate lines
input_text_line=input_text.split("\n")
# In general, there can be empty lines in the input file. The following command
# deletes these lines.
while input_text_line.count("")>0:
input_text_line.remove("")
# Loop over all lines
for i in range(1,len(input_text_line)):
print(str(i))
# split the line into the eight variables
variables=input_text_line[i].split(";")
# We need the CIK (1st column) and the filename (8th column)
cik=variables[0]
filename_parts=re.split('/',variables[7])
filename=filename_parts[3].replace('.txt','')
# Open the ith 10-K in the list; remember to specify the encoding
input_file_10_k=open(directory+'10-K_Textual_Similarity_edited/'+cik+'_'+filename\
+'_edited.txt', 'r', encoding='ascii', errors='ignore')
# if the command above does not work (error like "file not found" or "directory not found")
# please use the following command:
#input_file_10_k=open(directory+'10-K_Textual_Similarity/'+cik+'_'+filename+'_edited.txt','r',encoding='ascii',errors='ignore')
# Get the text of the 10-K
input_text_10_k=input_file_10_k.read()
# We need to tokenize the text because the stemmer only works on a word by word basis.
# Stemming an entire document without splitting into words does not work!
# The problem is that \n gets lost in this process --> we cannot easily
# recreate the document.
# Solution: replace \n by \n and some indicator that there was a line break.
# For example replace("\n","\nHereWasALinebreak")
input_text_10_k=input_text_10_k.replace("\n",XXXX)
# Split text into words
word_list=XXXX
# Stem the text from above
text_stemmed=''
# LOOP ALL WORDS, STEM THEM AND RECONNECT THEM.
# WARNING: WHEN RECONNECTING WORDS YOU NEED TO INCLUDE A WHITESPACE BETWEEN
# THE WORDS. OTHERWISE, THE TEXT GETS MESSED UP.
for word in word_list:
text_stemmed=text_stemmed+XXX # TO BE COMPLETED
# To recreate the text, we need to replace the line break indicators by \n.
# WARNING: PAY ATTENTION TO UPPER/LOWER CASE, IT CAN CHANGE.
text_stemmed=text_stemmed.replace(XXXX,XXXX) # UNDO THE TRANSFORMATION FROM LINE 56.
# Open the output file for the stemmed text
output_file_10_k=open(directory+'10-K_Textual_Similarity_edited/'+cik+'_'+filename\
+'_stemmed.txt', 'w', encoding='ascii', errors='ignore')
output_file_10_k.write(text_stemmed)
output_file_10_k.close()
input_file_10_k.close()
input_file.close()
print("Task done!")


@@ -0,0 +1,101 @@
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 29 11:07:10 2015
@author: Alexander Hillert, Goethe University Frankfurt
"""
# For the full task, we need a large set of packages:
# regular expression, stemming, stop words, tokenization, and counters.
import re
#from nltk.tokenize import word_tokenize # NOT needed for the base comparison
#from nltk.corpus import stopwords # NOT needed for the base comparison
#from nltk.stem import PorterStemmer # NOT needed for the base comparison
from collections import Counter
#ps=PorterStemmer() # NOT needed for the base comparison
# Please adjust the directory to your machine.
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# Open the csv file containing the list of the 10-Ks from MSFT, KO, and 3M
input_file=open(directory+'list_10-K_filings_textual_similarity.csv','r',encoding="utf-8")
input_text=input_file.read()
# Split the input file in separate lines
input_text_line=input_text.split("\n")
# In general, there can be empty lines in the input file. The following command
# deletes these lines.
while input_text_line.count("")>0:
input_text_line.remove("")
# Open the output csv file in which we write the similarities
output_file=open(directory+'list_10-K_filings_textual_similarity_jaccard.csv','w',encoding="utf-8")
# Write variable names to first line
output_file.write(input_text_line[0]+';Jaccard\n')
# set default values for variables
word_list_old_edited=""
word_list_edited=""
# Loop over all lines
for i in range(1,len(input_text_line)):
print(str(i))
# split the line into the eight variables
variables=input_text_line[i].split(";")
# We need the CIK (1st column) and the filename (8th column)
cik=variables[0]
filename_parts=re.split('/',variables[7])
filename=filename_parts[3].replace('.txt','')
# Open the ith 10-K; remember to specify the encoding
input_file_10_k=open(directory+'10-K_Textual_Similarity_edited/'+cik+'_'+filename+\
'_edited.txt', 'r', encoding='ascii', errors='ignore')
# if the command above does not work (error like "file not found" or "directory not found")
# please use the following command:
#input_file_10_k=open(directory+'10-K_Textual_Similarity/'+cik+'_'+filename+'_edited.txt','r',encoding='ascii',errors='ignore')
input_text_10_k=input_file_10_k.read()
# Split text into words
word_list_edited=re.split(r"\W{1,}",input_text_10_k.lower())
# Alternative using tokenize
#word_list_edited=word_tokenize(input_text_10_k.lower())
# check whether the previous entry of the list is from the same firm
permco=input_text_line[i].split(";")[1]
permco_old=input_text_line[i-1].split(";")[1]
############################################
# Sub Task 1: Jaccard for the _edited.txt
############################################
# compute Jaccard similarity if the previous filing is from the same firm
if permco==permco_old:
counter_current_10k=Counter(XXX)
counter_previous_10k=Counter(XXX)
intersection=XXX see "Introduction_Container_Datatypes.py" (at the end of the file)
union=XXXX see "Introduction_Container_Datatypes.py" (at the end of the file)
jaccard_similarity=XXXx # ELEMENTS IN INTERSECTION / # ELEMENTS IN UNION
output_file.write(input_text_line[i]+";"+str(jaccard_similarity)+"\n")
else:
# The previous filing is not from the same firm -> cannot compute Jaccard similarity
output_file.write(input_text_line[i]+";"+"\n")
# Save the current word vector to a separate variable for the comparison of the next report.
word_list_old_edited=word_list_edited
# Close 10-K filing
input_file_10_k.close()
input_file.close()
output_file.close()
print("Task done!")


@@ -0,0 +1,159 @@
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 21 09:38:32 2022
@author: Alexander Hillert, Goethe University Frankfurt
"""
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
# adjust the directory to your folder
directory="C:/Lehre/Machine Learning/Data/"
# import the data for this problem
# NOTE: IT MIGHT TAKE 3 TO 5 MINUTES TO OPEN THE DATA
data_frame=pd.read_csv(directory+"form_10-Ks_machine_learning_2007_2008_all_variables_v1.csv",sep=";")
# The rows of the data are the Form 10-K filings. Each line is one filing.
# The columns are the variables. After some identifying information,
# you find the word frequencies, i.e., how often a word (e.g., "the") shows up
# in a 10-K (e.g., 100 times)
# WARNING: THE DATA SET IS TOO LARGE TO BE DISPLAYED -> Variable Explorer
# and Console will crash.
# However, you can pick a small subset of the data and look at it.
# It lists all columns (i.e., variables) and the first three observations.
data_frame_example=data_frame.head(3)
# you can click on this variable in the variable explorer without Spyder crashing.
# To see the variables included in the data use the following command
data_frame_column_names=data_frame.columns
# you can click on this variable in the variable explorer without Spyder crashing.
# This variable shows all column/variable names in a vector.
# split the data set into the training and testing data
# we use the filings from year 2007 as training data
data_frame_train=data_frame[data_frame.year==2007]
# and the filings from year 2008 as testing data
data_frame_test=data_frame[data_frame.year==2008]
# put the cumulative abnormal return around the filing date into a new variable.
# we follow Loughran and McDonald (2011) and use the CAR from t to t+4.
# training data
filing_car_train=data_frame_train["excess_ret_t0_t4"]
# testing data
filing_car_test=data_frame_test["excess_ret_t0_t4"]
# so far, you have absolute word counts. For example, "loss" is found 5 times.
# As the length of the 10-Ks can be different, we scale by the number of words
# in the 10-K.
document_length_train=data_frame_train["number_of_words"]
document_length_test=data_frame_test["number_of_words"]
# the word frequencies are our independent variables -> restrict the data frame
# to those variables and drop all variables that are not needed
data_frame_train=data_frame_train.drop(columns=["cik","year","month","link","filing_type","filing_date","excess_ret_t0_t4","number_of_words"])
data_frame_test=data_frame_test.drop(columns=["cik","year","month","link","filing_type","filing_date","excess_ret_t0_t4","number_of_words"])
# compute relative frequencies, i.e., divide the absolute word count by document length
data_frame_train=data_frame_train.div(document_length_train, axis=0)
data_frame_test=data_frame_test.div(document_length_test, axis=0)
# standardize the data frames
# training data
data_frame_train_mean=TO BE COMPLETED
data_frame_train_sd=TO BE COMPLETED
data_frame_train_standardized=TO BE COMPLETED
# testing data
data_frame_test_mean=TO BE COMPLETED
data_frame_test_sd=TO BE COMPLETED
data_frame_test_standardized=TO BE COMPLETED
# There can be missing values in the standardized variables.
# They arise if the word count for a specific word is always zero in the training
# or in the testing data. In this case, the standard deviation is zero ->
# division by zero -> NaN.
# We replace these missing values by zero.
# training data
data_frame_train_standardized=data_frame_train_standardized.fillna(0)
# testing data
data_frame_test_standardized=data_frame_test_standardized.fillna(0)
##########################
# Ridge regression
##########################
print("\nRidge regression - Using cross-validation\n")
# Regress the CARs on the word frequencies using Ridge regressions with cross-validation.
# In this regression, we use the training data.
# We use five-fold cross-validation.
# Recommendation for initial alphas/lambdas: 100000, 150000, and 200000
regression_Ridge_cv=RidgeCV(alphas=TO BE COMPLETED, fit_intercept=True,cv=5).fit(TO BE COMPLETED)
# get the optimal lambda
alpha_optimal_cv=TO BE COMPLETED
print("The optimal alpha is "+str(alpha_optimal_cv))
# what is the R2 in the training and testing data?
print("The R2 in the training data is: "+str(regression_Ridge_cv.TO BE COMPLETED))
print("The R2 in the testing data is: "+str(regression_Ridge_cv.TO BE COMPLETED))
# Mean squared error using the cross-validated model
# predict y in the full training sample
filing_car_train_predicted_Ridge=regression_Ridge_cv.TO BE COMPLETED
# predict y in the testing sample
filing_car_test_predicted_Ridge=regression_Ridge_cv.TO BE COMPLETED
# Determine the MSE
print("The MSE in the full training data is: "+str(mean_squared_error(TO BE COMPLETED)))
print("The MSE in the testing data is: "+str(mean_squared_error(TO BE COMPLETED)))
######################
# LASSO regression
######################
print("\nLASSO regression - Using cross-validation\n")
# Regress the CARs on the word frequencies using LASSO regressions with cross-validation.
# In this regression, we use the training data.
# We use five-fold cross-validation.
# Recommendation for initial alphas/lambdas: 0.5, 1, and 1.5
regression_Lasso_cv=LassoCV(alphas=TO BE COMPLETED, fit_intercept=True,cv=5).fit(TO BE COMPLETED)
# get the optimal lambda
alpha_optimal_cv=TO BE COMPLETED
print("The optimal alpha is "+str(alpha_optimal_cv))
# get the R2 in the training data
print("The R2 in the training data is: "+str(regression_Lasso_cv.TO BE COMPLETED))
# ... and testing data
print("The R2 in the testing data is: "+str(regression_Lasso_cv.TO BE COMPLETED))
# Mean squared error using the cross-validated model
# predict y in the full training sample
filing_car_train_predicted_Lasso=regression_Lasso_cv.TO BE COMPLETED
# predict y in the testing sample
filing_car_test_predicted_Lasso=regression_Lasso_cv.TO BE COMPLETED
# Determine the MSE
print("The MSE in the full training data is: "+str(mean_squared_error(TO BE COMPLETED)))
print("The MSE in the testing data is: "+str(mean_squared_error(TO BE COMPLETED)))
############################################################
# Compare the betas from the Ridge and the LASSO regressions
############################################################
output_file=open(directory+"comparison_coefficients_Ridge_LASSO_10-Ks.csv","w",encoding="utf-8")
output_file.write("index;word;coefficient_Ridge;coefficient_LASSO\n")
# get the list of coefficients
for i in range (0,len(data_frame_train.columns)):
output_file.write(str(i)+';'+data_frame_train.columns[i]+';'+str(regression_Ridge_cv.coef_[i])+';'+str(regression_Lasso_cv.coef_[i])+'\n')
output_file.close()
print("Completed!")


@@ -0,0 +1,88 @@
# -*- coding: utf-8 -*-
"""
Created on Fri Feb 15 21:37:53 2019
@author: Alexander Hillert, Goethe University Frankfurt
"""
# Please adjust the directory to your machine.
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# It is important to use a single forward slash / but not a single backslash \.
# For MAC users: your directory will usually start with "/Users/". For example:
#directory="/Users/FirstnameLastname/Textual Analysis/Programming/Files/"
# open the Fun_with_Python text file
input_file=open(directory+"Fun_with_Python.txt","r")
###################################
# Programming Problem 1
###################################
# Task 1: open the file 'Fun_with_Python.txt' in Spyder and print its content
# The file can be found in our data folder
# get the text from the file
input_text= TO BE COMPLETED
# print the content, i.e., the text of the file (previous line)
print(TO BE COMPLETED)
# See slide 7
# Task 2: Write the content of 'Fun_with_Python.txt' to a new text file
# with the name 'More_fun_with_Python.txt'.
# ENTER YOUR COMMANDS HERE
# See slide 8.
# REMEMBER to close your file. If you do not close the new txt file, its content
# will not be saved to the hard drive. You will find an empty txt file in your file manager.
# Task 3: Write a loop that prints some text (whatever you like) ten times.
# ENTER YOUR COMMANDS HERE
# See slide 9.
# You have several options. While loop, for X in range() loop, etc.
# Task 4: Print the text of the "Fun_with_Python" file line by line!
# ENTER YOUR COMMANDS HERE
# See slide 10.
# You need a loop (Task 3) and in each iteration of the loop have Python print
# a line of text.
# Task 5: Count how often the word 'good' appears in the document 'Fun_with_Python.txt'!
# ENTER YOUR COMMANDS HERE
# See slide 11.
# Task 6a: Now, print only the lines that contain the word 'good'!
# ENTER YOUR COMMANDS HERE
# See also slide 12.
# You can use the line-by-line printing from Task 4 and combine it with the command ".count()" from Task 5
# and add the if condition from slide 12.
# If condition: for each line check whether the specific line contains the word "good".
# Task 7: print only the lines that start with the word 'This'!
# ENTER YOUR COMMANDS HERE
# See slide 15.
# This is very similar to task 6. You only need to modify the if condition a bit.
# Task 8a: Replace the word "good" by "excellent" and display the new text!
# See slide 16.
# ENTER YOUR COMMANDS HERE
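###############################################################################
# Illustrative sketch (not the official solutions from the slides): the basic
# patterns that the tasks above ask for, applied to a small demo file that is
# created on the fly. The file names "Demo_fun.txt" and "More_demo_fun.txt"
# are made up for this illustration.
###############################################################################
demo_file=open(directory+"Demo_fun.txt","w")
demo_file.write("This line is good.\nAnother line.\nThis line is also good.\n")
demo_file.close()
# Task 1: read and print the content
demo_file=open(directory+"Demo_fun.txt","r")
demo_text=demo_file.read()
demo_file.close()
print(demo_text)
# Task 2: write the content to a new file (and close it so it gets saved)
demo_copy=open(directory+"More_demo_fun.txt","w")
demo_copy.write(demo_text)
demo_copy.close()
# Task 3: print some text ten times
for demo_i in range(10):
    print("Python is fun!")
# Tasks 4, 6a, and 7: go through the text line by line
for demo_line in demo_text.split("\n"):
    print(demo_line)
    if demo_line.count("good")>0:
        print("This line contains 'good': "+demo_line)
    if demo_line.startswith("This"):
        print("This line starts with 'This': "+demo_line)
# Task 5: count how often 'good' appears
print("The word 'good' appears "+str(demo_text.count("good"))+" times.")
# Task 8a: replace 'good' by 'excellent'
print(demo_text.replace("good","excellent"))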


@@ -0,0 +1,72 @@
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 29 09:21:46 2015
@author: Alexander Hillert, Goethe Uni Frankfurt
"""
# Please adjust the directory to your machine.
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# It is important to use a single forward slash / but not a single backslash \.
# For MAC users: your directory will usually start with "/Users/". For example:
#directory="/Users/FirstnameLastname/Textual Analysis/Programming/Files/"
# Open the txt file with the SEC filings
sec_filings_file=open(directory+'formidx_1998Q1.txt','r')
sec_filings_text=sec_filings_file.read()
# Create output file
output_file=open(directory+'SEC_Filings_Output.csv','w')
# Create first line with variable names
# I use semicolons as separator in csv files. You can also use any other symbol.
# However, you should make sure that the separator is not part of the data/text
# you write to the file.
# For example, it would be problematic if you use comma as separator and have
# company names like "AMERICAN HEALTHCORP, INC." or "AMERICAN FUNDING, INC."
output_file.write("Form_Type;Company_Name;CIK;Filing_Date;Link\n")
# Split the input file into separate lines
# DO THE LINE SPLIT
sec_filings_line=
# Loop over all lines
# you can get the number of lines by computing the length of the list of lines,
# i.e. by determining the length of sec_filings_line.
for / while : # COMPLETE LOOP
# Does the line refer to a form 10-K file?
if : # USE AN IF CONDITION TO TEST THIS -> see TASKS 7 and 8 of PROBLEM 1
# Split the line such that the information can be saved in separate
# variables
# Each information item has a fixed length in the overview files of the
# SEC.
# SEE SLIDE 18 FOR INFORMATION ON THE LENGTH OF THE SEPARATE COLUMNS.
# COMPLETE THE COMMANDS BELOW
filing_type=
company_name=
cik=
filing_date=
link=
# Is the 10-K filed between March 10 and March 20?
filing_day=
filing_month=
# Is the Filing Month March?
if : # COMPLETE THE IF-CONDITION
# Is the Filing Day between 10 and 20?
if : # COMPLETE THE IF-CONDITION
# The filing meets the conditions -->
# Write output to the csv file
output_file.write(filing_type+";"+company_name+";"+cik+";"+filing_date+";"+link+"\n")
# Close your input and output file in the end
sec_filings_file.close()
output_file.close()
print("DONE")


@@ -0,0 +1,103 @@
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 29 11:07:10 2015
@author: Alexander Hillert, Goethe Uni Frankfurt
"""
# Please adjust the directory to your machine.
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# It is important to use a single forward slash / but not a single backslash \.
# For MAC users: your directory will usually start with "/Users/". For example:
#directory="/Users/FirstnameLastname/Textual Analysis/Programming/Files/"
# We need the urllib package for the download.
import urllib.request
# To automatically create folders, we need the os-module (OS: Operating System)
import os
###############################################################################
# Technical issue
# As of March 2021, the SEC no longer accepts requests by the standard urllib settings
# you have to make some adjustments
###############################################################################
# Define a user agent
# Information on user agents is from https://docs.python.org/3/howto/urllib2.html:
# "Some websites dislike being browsed by programs, or send different versions
# to different browsers. By default urllib identifies itself as Python-urllib/x.y
# (where x and y are the major and minor version numbers of the Python release,
# e.g. Python-urllib/2.5), which may confuse the site, or just plain not work.
# The way a browser identifies itself is through the User-Agent header."
opener = urllib.request.build_opener()
# The SEC recently rejected requests from Python-urllib/x.y user agent (see above)
# To still automatically download files, you have different options.
# I have listed three examples below but there are many more:
# For a comprehensive list see, e.g.:
# https://developers.whatismybrowser.com/useragents/explore/software_type_specific/web-browser/
#opener.addheaders = [('User-agent', 'Mozilla')]
#opener.addheaders = [('User-agent', 'Chrome')]
opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)')]
urllib.request.install_opener(opener)
# END of the technical issues
# Open the csv file from part 1 of the problem
input_file=open(directory+'SEC_Filings_Output.csv','r')
input_text=input_file.read()
# Split the Input File in separate lines
input_text_line=input_text.split("\n")
# Create a subfolder in which the 10-K filings are saved.
# When you download a large number of filings I recommend using subfolders for
# each year or even for each year-month-day combination.
# In this problem, a single subfolder is fine.
os.makedirs( COMPLETE THE COMMAND )
# See slide 18 for information on the os.-commands!
# IN GENERAL, IF YOU SEE AN UNKNOWN COMMAND, GOOGLE IT TO GET INFORMATION.
# Loop over all lines of the csv file
# Like in part 1 of the problem, you can get the number of lines by computing
# the length of the list of lines, i.e. by determining the length of input_text_line.
for / while: # COMPLETE THE LOOP
# split the line into the five variables
# THE ; IS THE SEPARATOR IN THE CSV -> USE THE split() COMMAND
variables=
# We only need the cik and the link to download the file.
# The cik is the 3rd variable.
# The link is the 5th variable
cik=
link=
# identify the filename
# The link consists of different parts:
# For example: edgar/data/1000753/0000950129-98-001035.txt
link_parts= # USE A SPLIT
# 1st part: edgar
# 2nd part: data
# 3rd part: cik
# 4th part: file name -> see next line
filename=link_parts[FILL IN THE NUMBER HERE]
###########################################################################
############################ WARNING ######################################
# The filename does NOT uniquely identify the SEC filings as different firms (CIKs)
# may use the same filename. Thus, when you only use the filename, files
# might be overwritten. To avoid this problem you need to have a unique name.
# Combining CIK and filename results in a unique identifier, as the
# filename appears only once per firm (CIK).
# -> use the combination of CIK and filename: cik_filename
###########################################################################
urllib.request.urlretrieve(TO BE COMPLETED)
# See slide 19 for information on the urllib.-commands.
# Close your input file
input_file.close()
print("DONE")


@@ -0,0 +1,121 @@
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 12 15:50:22 2016
@author: Alexander Hillert, Goethe University Frankfurt
"""
# Import regular expressions and BeautifulSoup
import re
from bs4 import BeautifulSoup
# Please adjust the directory to your machine.
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# It is important to use a single forward slash / but not a single backslash \.
# For MAC users: your directory will usually start with "/Users/". For example:
#directory="/Users/FirstnameLastname/Textual Analysis/Programming/Files/"
# Open the document
input_file=open(directory+'Exercise_4_Application_Regular_Expressions.txt','r',encoding="utf-8")
input_text=input_file.read()
#######################
# Task 1: remove tables
#######################
# Approach
# We search for tables until we find no more html tags that indicate the
# beginning of a table.
# Search for the start html-tag <TABLE>
table_match=re.search(TO BE COMPLETED, input_text)
while : # YOU NEED A LOOP THAT SEARCHES FOR TABLES
# When we have identified a match, i.e. the start of a table, we save
# the position of the beginning of the table in the variable "start_table"
table_start_match=re.search(XXX, input_text)
start_table=table_start_match.start()
# Next, we search for the corresponding html tag that indicates the end of
# the table and save the end position to the variable "end_table"
# REPEAT THE COMMANDS ABOVE FOR THE END OF TABLE
table_end_match=
end_table=
# We can print the text between the start and end html tag to check whether
# the table has been identified correctly.
print("The text below is a table!\n"+input_text[start_table:end_table])
# the text between the beginning and end of the html tags is the part which
# we would like to delete.
# Consequently, we keep the text before the beginning of the table as well
# as the text after the ending of the table.
input_text=TO BE COMPLETED
# Next, we need to check whether there is another table in the rest of the
# text.
table_match=re.search(SAME COMMAND AS IN LINE 27, input_text)
# As long as "table_match" exists, i.e., the regex results in a match, the loop
# will continue.
#########################
# Task 2: remove Exhibits
#########################
# Exhibits have the following structure
# <DOCUMENT>
# <TYPE>EX...
# ...
# </DOCUMENT>
# THE APPROACH IS THE SAME AS THE SEARCH FOR TABLES ABOVE
exhibit_match=re.search(, input_text)
while :
# get the beginning of the exhibit
exhibit_start_match=
start_exhibit=
# As the exhibits are at the end of the 10-K filing it would not be
# necessary to include an end position. We could also drop the entire text
# after "<TYPE>EX"
# However, for completeness, we will define an end
exhibit_end_match=
end_exhibit=
# Print the identified text to check whether the exhibit has been identified
# correctly
print("The text below is an exhibit!\n"+input_text[start_exhibit:end_exhibit])
input_text=TO BE COMPLETED
# Check whether there are further exhibits
exhibit_match=re.search(SAME COMMAND AS IN LINE 65, input_text)
##########################
# Task 3: remove html code
##########################
# Alternative 1: remove html code without Beautiful Soup
text=re.sub(TO BE COMPLETED, '', input_text)
# Use a regex that searches for a "<" followed by at least one character that must not
# equal > and is completed by >.
# Alternative 2: remove html code using Beautiful Soup
html_text=BeautifulSoup(TO BE COMPLETED)
text=html_text.TO BE COMPLETED
########################
# Task 4: delete numbers
########################
# YOU MAY NEED MULTIPLE COMMANDS TO DELETE ALL NUMBERS
# Remember that you can have different formats, e.g., 1,234.56 or 0.12 or 1,234,567
text=re.sub(TO BE COMPLETED,'',text)
########################
# Task 5: delete symbols
########################
text=re.sub(TO BE COMPLETED,'',text)
# Open the output file for the pure text
output_file=open(directory+'Exercise_4_Application_Regular_Expressions_clean.txt','w',encoding="utf-8")
output_file.write(text)
# close all files
input_file.close()
output_file.close()
print("DONE")


@@ -0,0 +1,164 @@
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 12 15:50:22 2016
@author: Alexander Hillert, Goethe University Frankfurt
"""
import re
from bs4 import BeautifulSoup
# Please adjust the directory to your machine.
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# It is important to use a single forward slash / but not a single backslash \.
# For MAC users: your directory will usually start with "/Users/". For example:
#directory="/Users/FirstnameLastname/Textual Analysis/Programming/Files/"
# Open the 10-K
input_file=open(directory+'0000950130-98-001359.txt','r',encoding='ascii',errors='ignore')
input_text=input_file.read()
################################
# Remove tables
# Same approach as in Problem 4
################################
# Sometimes it is helpful to print the text parts that are deleted. In this
# example, we will print the first two tables that we delete.
i=1
table_match=re.search(ENTER THE REGEX, input_text)
while table_match:
# Search for the beginning of the table
table_start_match=re.search(REGEX FOR BEGINNING OF TABLE, input_text)
start_table=
# search for the end of the table
table_end_match=REGEX FOR END OF TABLE
end_table=
# The if condition and the printing are just for illustrative purposes.
# The commands display the first two tables that are removed from the text.
if i<=2:
print("This is the "+str(i)+". Table in the 10-K.\n"+input_text[start_table:end_table]+"\n")
i=i+1
# remove the table from the original text
input_text=TO BE COMPLETED
# check whether there are further tables
# same command as in line 24
table_match=re.search(XXXXXXX, input_text)
################################
# Remove exhibits
# Same approach as in Problem 4
################################
# Exhibits have the following structure
# <DOCUMENT>
# <TYPE>EX...
# ...
# </DOCUMENT>
# Sometimes it is helpful to print the text parts that are deleted. In this
# example, we will print the first exhibit that we delete.
i=1
exhibit_match=re.search(ENTER THE REGEX, input_text)
while exhibit_match:
# Search for the beginning of the exhibit
exhibit_start_match=re.search(REGEX FOR BEGINNING OF EXHIBIT, input_text)
start_exhibit=
# Search for the end of the exhibit
# CAUTION: search only in the text after the beginning of the exhibit, as
# the end-term also appears earlier (e.g. end of main document)
exhibit_end_match=re.search(REGEX FOR END OF EXHIBIT, input_text[START OF EXHIBIT UNTIL END OF TEXT])
end_exhibit=
if i<=1:
print("This is the "+str(i)+". Exhibit in the 10-K.\n"+input_text[start_exhibit:end_exhibit]+"\n")
i=i+1
# remove exhibit from the original text
input_text=
# check whether there are further exhibits
# same command as in line 55
exhibit_match=re.search(XXXXXXX, input_text)
##################
# Remove html code
##################
# you can use BeautifulSoup for simplicity
html_text=BeautifulSoup(input_text, 'html.parser')
text=html_text.get_text()
############################
# Remove the Document Header
############################
# There are different possibilities how one can define the start of the main part of the text
# In general, you should delete all text that is uninformative for your analysis.
header_match=re.search(END OF DOCUMENT HEADER, text)
if header_match:
# Drop the document header and keep only the rest of the text after the header.
text=text[XXXXXXXXXXXXXXX]
#################################################
# Delete the text in "PART IV"
# This procedure is optional. Look at "Part IV" and decide whether you favor
# the approach. I think that the part should be dropped, as it is just a list
# of exhibits, some mandatory text required by the SEC [indicated by the
# capital letters in the "SIGNATURES" section].
#################################################
'''
# Alternative 1: go over all matches but keep only the last one
for match in re.finditer('\s{2,}PART IV\s{0,}\n', text):
pass
# match now contains the last match.
# Delete the text after the last match
text=text[:match.start()]
# Alternative 2: save the positions of all matches (more general approach)
list_start_matches=[]
list_end_matches=[]
for match in re.finditer('\s{2,}PART IV\s{0,}\n', text):
list_start_matches.append(match.start())
list_end_matches.append(match.end())
# Position of last match
print(list_start_matches[len(list_start_matches)-1])
print(list_end_matches[len(list_start_matches)-1])
# Delete the text after the last match
text=text[:list_start_matches[len(list_start_matches)-1]]
'''
# Delete item numbers
# This is optional. It removes "Item 1.", "ITEM 1.", "Item 10.", "Item 7A."
text=re.sub(TO BE COMPLETED,'',text)
# Delete numbers
# You can use the code from Problem 4.
text=re.sub(TO BE COMPLETED,'',text)
# Hyphens can be used to indicate that the word is continued in the next
# line. For example, "Micro-\nsoft" (\n is the line feed).
# Delete hyphens that are followed by a line feed.
text=re.sub(TO BE COMPLETED,'',text)
# Delete symbols
# You can use the code from Problem 4.
text=re.sub(TO BE COMPLETED,'',text)
# Delete dots and commas that are not part of sentences, i.e. commas and dots
# that are preceded by whitespace or line break and that are followed by
# whitespace or line break.
text=re.sub('\n(\.|,)\n','\n',text)
# Drop single-character words
# One can argue whether one should implement this procedure. Loughran and
# McDonald argue in one of their papers in favor of it.
# To make sure that there is just one letter, we require that there is a non-word
# character (\W) before and after. We use a positive lookbehind and a
# positive lookahead condition for this to ensure that the surrounding characters
# do not get deleted as well.
text=re.sub(TO BE COMPLETED,' ',text)
# Open the output file for the pure text
output_file=open(directory+'0000950130-98-001359_clean.txt','w',encoding='ascii',errors='ignore')
output_file.write(text)
input_file.close()
output_file.close()
print("COMPLETED.")


@@ -0,0 +1,117 @@
# -*- coding: utf-8 -*-
"""
Created on Wed Apr 13 22:43:32 2016
@author: Alexander Hillert, Goethe University Frankfurt
"""
import re
# Please adjust the directory to your machine.
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# Open the dictionary
# The dictionary has been obtained from Bill McDonald's webpage
# http://www3.nd.edu/~mcdonald/Word_Lists.html
# --> LoughranMcDonald_MasterDictionary_2014.xlsx
# --> select negative words and copy them to a txt file
file_word_list=open(directory+'LMD_Neg.txt','r',encoding="utf-8")
word_list=file_word_list.read()
# LOOK AT THE FILE. ARE THE WORDS IN UPPER OR IN LOWER CASE?
# MAKE SURE THAT YOU USE A CONSISTENT FORMAT FOR THE TEXT AND THE DICTIONARY.
# THE COMMANDS ARE .lower() AND .upper().
# CREATE A LIST OF NEGATIVE WORDS -> SPLIT THE TEXT
negative_words=word_list.XXXX
# Open the csv file containing the list of the 200 10-Ks
input_file=open(directory+'10-K_Sample_2011Q1_Input.csv','r',encoding="utf-8")
input_text=input_file.read()
# Split the input file in separate lines
input_text_line=input_text.split("\n")
# In general, there can be empty lines in the input file. The following command
# deletes these lines.
while input_text_line.count("")>0:
input_text_line.remove("")
# Create output file
output_file=open(directory+'10-K_Sample_2011Q1_Output_Negative_Tone.csv','w',encoding="utf-8")
# Write variable names to the first line of the output file
output_file.write('CIK;Filename;Number_Words;Number_Negative_Words;\
Percentage_Negative_Words\n')
# Loop over all lines of the csv file
for i in range(1,len(input_text_line)):
# If the execution of your scripts takes some time, printing the loop iterator
# gives you an impression of the overall progress made.
print(str(i))
# split the line into the two variables
variables=input_text_line[i].split(";")
# We need the CIK (1st column) and the filename (2nd column)
cik=variables[0]
filename=variables[1]
# modify file name to open the edited files
filename=filename.replace('.txt','')
# Open the ith 10-Ks in the list
input_file_10_k=open(directory+'10-K_Sample_clean/'+cik+'_'+filename+'_clean.txt','r',\
encoding='ascii',errors='ignore')
input_text_10_k=input_file_10_k.read()
# CONVERT THE TEXT TO UPPER OR LOWER CASE (see comment above)
# It is important that the formatting (lower case vs. upper case) of the word list
# and the document is identical. Remember that you have typically lower and upper case
# letters in documents -> modify text
text=input_text_10_k.XXXXXX
# Split the text in words to determine the total number of words
# LOOK AT THE REGEX INTRODUCTION FOR A SUITABLE SPLIT VARIABLE.
list_of_words=re.split(XXXXX, text)
# ARE THERE EMPTY ELEMENTS IN THE LIST OF WORDS?
# Make sure that empty list elements do not bias the word count -> delete them!
# You can use an approach similar to the one in lines 37 and 38.
COMMANDS TO BE ADDED
# Determine the total number of words
# COUNT THE NUMBER OF ELEMENTS IN list_of_words
word_count=XXXX
# Reset the number of negative words to zero
negative_count=0
# For each negative word, count the number of occurrences
for j in range(len(negative_words)):
HERE YOU NEED TO COUNT HOW OFTEN THE jth NEGATIVE WORD IS FOUND IN THE TEXT.
COMPARE THE TWO CASES BELOW -> EXECUTE THE COMMANDS (see lines below) IN
THE COMMAND LINE AND COMPARE THE RESULTS.
WHICH ALTERNATIVE IS THE RIGHT APPROACH?
ALTERNATIVE 1:
list_of_words=["abandon","abandoned","abandonment"]
list_of_words.count("abandon")
ALTERNATIVE 2:
text_of_words="abandon abandoned abandonment"
text_of_words.count("abandon")
ADD THE CORRECT COUNT OF NEGATIVE WORD j TO YOUR OVERALL COUNT.
negative_count=negative_count+XXXXX
# Get the percentage of negative words
percentage_negative=negative_count/word_count
# Write cik, file name, total number of words, number of negative words,
# and the percentage of negative words to output file.
output_file.write(cik+';'+filename+'_clean.txt;'+str(word_count)+';'\
+str(negative_count)+';'+str(percentage_negative)+'\n')
# Close filings
input_file_10_k.close()
print("Finished")
output_file.close()
input_file.close()
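###############################################################################
# Illustrative sketch (not the official solution): why the counting should be
# done on the LIST of words rather than on the raw text string: str.count()
# also matches parts of longer words. The word lists below are made up.
###############################################################################
demo_list_of_words=["abandon","abandoned","abandonment"]
demo_text_of_words="abandon abandoned abandonment"
print(demo_list_of_words.count("abandon"))   # 1 -> counts whole words only
print(demo_text_of_words.count("abandon"))   # 3 -> also matches parts of words
# counting a small word list against a document, as in the loop above
demo_document_words=["the","loss","was","a","severe","loss"]
demo_negative_words=["loss","litigation"]
demo_negative_count=0
for demo_word in demo_negative_words:
    demo_negative_count=demo_negative_count+demo_document_words.count(demo_word)
print("Negative words found: "+str(demo_negative_count))   # 2
print("Percentage of negative words: "+str(demo_negative_count/len(demo_document_words)))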


@@ -0,0 +1,131 @@
# -*- coding: utf-8 -*-
"""
Created on Wed Apr 13 22:43:32 2016
@author: Alexander Hillert
"""
import re
# Please adjust the directory to your machine.
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# Open the dictionary
# The dictionary is obtained from Bill McDonald's webpage
# http://www3.nd.edu/~mcdonald/Word_Lists.html
# --> LoughranMcDonald_MasterDictionary_2014.xlsx
# --> select positive words and copy them to a txt file
file_word_list=open(directory+'LMD_Pos.txt','r',encoding="utf-8")
word_list=file_word_list.read()
# LIKE IN PROBLEM 7, YOU HAVE TO APPLY A CONSISTENT FORMAT TO BOTH THE LMD-WORDS
# AND THE TEXT OF THE 10-Ks.
positive_words=word_list.split()
# Open the csv file containing the list of the 200 10-Ks
input_file=open(directory+'10-K_Sample_2011Q1_Input.csv','r',encoding="utf-8")
input_text=input_file.read()
# Split the Input File in separate lines
input_text_line=input_text.split("\n")
# In general, there can be empty lines in the input file. The following command
# deletes these lines.
while input_text_line.count("")>0:
input_text_line.remove("")
# Create output file
output_file=open(directory+'10-K_Sample_2011Q1_Output_Positive_Tone.csv','w',encoding="utf-8")
# Write variable names to the first line of the output file
output_file.write('CIK;Filename;Number_Words;Number_Pos_Words;Number_Pos_Words_adj;'\
+'Percent_Pos_Words;Percent_Pos_Words_adj\n')
# Iterate the list of the 200 10-K filings
for i in range(1,len(input_text_line)):
# If the execution of your scripts takes some time, printing the iterator
# gives you an impression of the overall progress made.
print(str(i))
# split the line into the two variables
variables=input_text_line[i].split(";")
# We need the CIK (1st column) and the filename (2nd column)
cik=variables[0]
filename=variables[1]
# modify file name to open the edited files
filename=filename.replace('.txt','')
# Open the ith 10-K in the list
input_file_10_k=open(directory+'/10-K_Sample_clean/'+cik+"_"+filename+'_clean.txt','r',\
encoding='ascii',errors='ignore')
input_text_10_k=input_file_10_k.read()
# It is important that the formatting (lower case vs. upper case) of the word list
# and the document are identical. Remember that you have typically lower and upper case
# letters in documents -> modify text
text=XXXX # CONSISTENT FORMAT
# Split the text in single words to determine the total number of words
list_of_words=re.split(XXXX, text) # USE THE SAME COMMAND AS IN PROBLEM 7
# ARE THERE EMPTY ELEMENTS IN THE LIST OF WORDS?
# Make sure that empty list elements do not bias the word count -> delete them!
# You can use an approach similar to the one in lines 34 and 35.
COMMANDS TO BE ADDED
# Determine total number of words
word_count=XXXX # SAME COMMAND AS IN PROBLEM 7
# Reset the number of positive words and positive words adj. for negations to zero.
positive_count=0
positive_count_adj=0
# For each positive word, count the number of occurrences
for j in range(len(positive_words)):
# standard count operation without controlling for negations
positive_words_found=list_of_words.count(positive_words[j])
# Loughran and McDonald (2011, JF, p.44): "We account for simple negation
# only for Fin-Pos words. Simple negation is taken to be observations
# of one of six words (no, not, none, neither, never, nobody) occurring
# within three words preceding a positive word."
# When we have identified positive words we need to search for negations
while positive_words_found>0:
# identify the position of the matched positive word in the list of all words
position_of_word=list_of_words.XXXXX # THE COMMAND .index() IS HELPFUL HERE
# identify the three words before the positive word and add them to a list
list_negation=[3_WORDS_BEFORE_MATCH,2_WORDS_BEFORE_MATCH,1_WORD_BEFORE_MATCH]
# REPLACE THE THREE PLACEHOLDERS BY THE CORRESPONDING ELEMENTS OF list_of_words
# check whether one of the three words in list_negation is a negation
negation_found=list_negation.count('no')+list_negation.count('not')+XXXX TO BE COMPLETED
if negation_found==0:
# no negation
positive_count_adj=positive_count_adj+1
positive_count=positive_count+1
else:
# negation
positive_count=positive_count+1
# delete the matched positive words in the original document
list_of_words[position_of_word]=XXX
# THIS OPERATION IS IMPORTANT BECAUSE OTHERWISE WE WILL GET AN ENDLESS LOOP
# check whether there are further matches of the jth positive word
positive_words_found=list_of_words.count(positive_words[j])
# Write cik, file name, total number of words, and number of positive
# and adjusted positive words to the output file
output_file.write(cik+';'+filename+'_clean.txt;'+str(word_count)+';'+\
str(positive_count)+';'+str(positive_count_adj)+';'+str(positive_count/word_count)+\
';'+str(positive_count_adj/word_count)+'\n')
# Close filings
input_file_10_k.close()
print("Finished")
output_file.close()
input_file.close()
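###############################################################################
# Illustrative sketch (not the official solution): the negation check for a
# single positive word, applied to a made-up word list. The slicing with max()
# protects the start of the list and is my own choice.
###############################################################################
demo_list_of_words=["the","outlook","is","not","good","but","results","were","good"]
demo_positive_word="good"
demo_found=demo_list_of_words.count(demo_positive_word)
demo_positive_count=0
demo_positive_count_adj=0
while demo_found>0:
    # position of the next match
    demo_position=demo_list_of_words.index(demo_positive_word)
    # the three words before the match
    demo_list_negation=demo_list_of_words[max(demo_position-3,0):demo_position]
    demo_negation_found=demo_list_negation.count("no")+demo_list_negation.count("not")+\
        demo_list_negation.count("none")+demo_list_negation.count("neither")+\
        demo_list_negation.count("never")+demo_list_negation.count("nobody")
    demo_positive_count=demo_positive_count+1
    if demo_negation_found==0:
        demo_positive_count_adj=demo_positive_count_adj+1
    # overwrite the matched word so that the loop terminates
    demo_list_of_words[demo_position]=""
    demo_found=demo_list_of_words.count(demo_positive_word)
print("Positive words: "+str(demo_positive_count))                 # 2
print("Adjusted positive words: "+str(demo_positive_count_adj))    # 1 (the first 'good' is negated)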


@@ -0,0 +1,70 @@
# -*- coding: utf-8 -*-
"""
Created on Wed Apr 13 22:43:32 2016
@author: Alexander Hillert, Goethe University Frankfurt
"""
# We split the text into words and sentences using regular expressions
import re
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# Open the csv file containing the list of the 200 10-Ks
input_file=open(directory+'10-K_Sample_2011Q1_Input.csv','r',encoding="utf-8")
input_text=input_file.read()
# Create output file
output_file=open(directory+'10-K_Sample_2011Q1_Output_WPS.csv','w',encoding="utf-8")
# Write variable names to the first line of the output file
output_file.write('CIK;Filename;Number_Words;Number_of_Sentences;WPS\n')
# Split the Input File in separate lines
input_text_line=input_text.split("\n")
# In general, there can be empty lines in the input file. The following command
# deletes these lines.
while input_text_line.count("")>0:
input_text_line.remove("")
# Loop over all lines
for i in range(1,len(input_text_line)):
print(str(i))
# split the line into the two variables
variables=input_text_line[i].split(";")
# We need the CIK and the filename
cik=variables[0]
filename=variables[1]
filename=filename.replace('.txt','')
# Open the ith 10-K in the list
input_file_10_k=open(directory+'10-K_Sample_clean/'+cik+"_"+filename+'_clean.txt','r',\
encoding='ascii',errors='ignore')
text=input_file_10_k.read()
# Determine number of sentences and number of words
# DETERMINE THE NUMBER OF WORDS; YOU KNOW THE COMMAND FROM PROBLEMS 7 AND 8.
list_of_words=re.split(XXX, text)
# Determine total number of words
word_count=XXX
# Split the text by symbols that indicate the end of a sentence
# to determine the total number of sentences
list_of_sentences=re.split(XXX, text)
# Determine total number of sentences
sentence_count=XXX
# Ratio of # of words over # of sentences
wps=word_count/sentence_count
# Write cik, file name, total number of words, total number of sentences,
# and WPS to the output file
output_file.write(cik+';'+filename+'_clean.txt;'+str(word_count)+';'+\
str(sentence_count)+';'+str(wps)+'\n')
# Close filing
input_file_10_k.close()
print("Finished")
output_file.close()
input_file.close()
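###############################################################################
# Illustrative sketch (not the official solution): computing words per sentence
# (WPS) for a small made-up text. The split patterns below are my own
# suggestions; sentence ends are approximated by ., !, and ?.
###############################################################################
import re
demo_text="The firm grew. Sales increased by a lot! Did margins improve?"
demo_words=re.split(r'\W{1,}',demo_text)
while demo_words.count("")>0:
    demo_words.remove("")
demo_word_count=len(demo_words)
demo_sentences=re.split(r'[\.!\?]',demo_text)
while demo_sentences.count("")>0:
    demo_sentences.remove("")
demo_sentence_count=len(demo_sentences)
print("Demo WPS: "+str(demo_word_count/demo_sentence_count))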