
Add programming files

- add the code files provided by the instructor
- the programming/files folder with the data files is NOT included
  here due to its size
- add a .gitignore file to exclude the data files' folder
Alexander Hess 2022-08-05 00:05:05 +02:00
commit a37c87d9c8
Signed by: alexander
GPG key ID: 344EA5AB10D868E0
38 changed files with 6416 additions and 0 deletions


@@ -0,0 +1,189 @@
# -*- coding: utf-8 -*-
"""
Created on Sat Jul 15 21:56:41 2017
@author: Alexander Hillert, Goethe University Frankfurt
"""
import nltk
import random
import collections
import re
# We will use the NLTK Corpus containing 2,000 Movie Reviews of which 1,000
# are positive and the other 1,000 are negative.
# if you do not have the movie review corpus yet, download it:
nltk.download("movie_reviews")
from nltk.corpus import movie_reviews
# Create a list that contains the tuples of document and category.
# Category is "positive" or "negative"
documents = []
# For all categories
for category in movie_reviews.categories():
print("Category: "+str(category))
# for all reviews (identified by file ID) in the respective category
for file_ID in movie_reviews.fileids(category):
# You have to use two pairs of parentheses: the inner pair creates the tuple that is appended.
documents.append((list(movie_reviews.words(file_ID)),category))
# Print the first element (i.e. tuple) of documents.
print(documents[0])
# print the words of the first movie review
print(documents[0][0])
# print the first word of the first movie review
print(documents[0][0][0])
# print the classification of the first movie review
print(documents[0][1])
# print the classification of the 1000th review (the last negative one)
print(documents[999][1])
# print the classification of the 1001st review (the first positive one)
print(documents[1000][1])
# The default order of the reviews is first all negative reviews and then all positive ones.
# Later we will build a training and a testing set. As we need to have positive and negative
# reviews in both sets, we randomly shuffle the documents.
random.shuffle(documents)
# Create a list of all words.
all_words = []
for word in movie_reviews.words():
# We use lower case words
#all_words.append(word.lower())
# Keep only tokens that start with a letter, i.e., drop punctuation and numbers.
if re.search(r"\A[a-z]",word.lower()):
# Alternative: only require at least one letter somewhere in the token.
#if re.search(r"[a-z]",word.lower()):
# We use lower case words
all_words.append(word.lower())
# What are the most frequently used words in the movie reviews?
# Alternative 1:
# FreqDist sorts words from the most frequently used word to the least frequently used word.
all_words_approach_1 = nltk.FreqDist(all_words)
print("Alternative 1: the top 15 words are: "+str(all_words_approach_1.most_common(15)))
# Alternative 2:
# We can also determine the most frequent words by using Counters as we did
# in Problem 12 --> transform list of all words to a Counter
all_words_approach_2=collections.Counter(all_words)
top_15_words=all_words_approach_2.most_common(15)
print("Alternative 2: the top 15 words are: "+str(top_15_words))
# -> identical results -> perfect.
# Search for a word and see how often it appears.
print("The word 'stupid' appears "+str(all_words_approach_1["stupid"])+" in the movie reviews.")
# alternatively
print("The word 'stupid' appears "+str(all_words_approach_2["stupid"])+" in the movie reviews.")
# How can we restrict the set of words that we use for training the Naive Bayes algorithm?
# -> create a list that only contains the top 3000 words
# get the top 3000 words
# Approach 1 using the nltk.FreqDist from above
i=0
top_3000_words=all_words_approach_1.most_common(3000)
list_top_3000_words_approach_1=[]
while i<3000:
list_top_3000_words_approach_1.append(top_3000_words[i][0])
i=i+1
# Approach 2 using Counters from above
i=0
top_3000_words=all_words_approach_2.most_common(3000)
list_top_3000_words_approach_2=[]
while i<3000:
list_top_3000_words_approach_2.append(top_3000_words[i][0])
i=i+1
# select the list of approach 1 or 2
word_features=list_top_3000_words_approach_1
# We need to identify the words we want to use for classification in the documents.
# We define a function for that.
def find_features(document):
words = set(document)
features = {}
# loop over all the words we consider for the classification
for word in word_features:
# The expression returns either True or False
features[word] = (word in words)
return features
# To get an idea of what the function find_features() does, let's print the features
# for one review.
print((find_features(movie_reviews.words('neg/cv000_29416.txt'))))
feature_set = [(find_features(review), category) for (review, category) in documents]
# What does the feature set look like?
print(feature_set[0])
# -> it is still a tuple
print(feature_set[0][0])
# the first element is the dictionary of the 3000 words we use for classification, with "True" or "False"
# depending on whether the word appears in the review
print(feature_set[0][1])
# the second element is the information on whether the review is positive or negative
# Define the training and testing set
# The training set comprises the first 1900 reviews and the testing set the last 100 reviews.
training_set=feature_set[:1900]
testing_set=feature_set[1900:]
# First we have to train the Naive Bayes Classifier.
# It will determine which of the words from word_features appear mostly in positive
# reviews and which appear mostly in negative reviews.
classifier=nltk.NaiveBayesClassifier.train(training_set)
# The following command prints the 20 words that best discriminate between
# positive and negative reviews.
classifier.show_most_informative_features(20)
# Let's classify the first element of feature_set
# The input for the classification needs to be the dictionary of words with True or False
print(classifier.classify(feature_set[0][0]))
print("The review is actually: "+str(feature_set[0][1]))
# classify the 100 reports from the testing set
# they have the positions 1900 to 1999 in the feature set.
i=1900
classified_set=[]
while i<2000:
classified_set.append(classifier.classify(feature_set[i][0]))
i=i+1
# Compare classification result with actual category
i=0
# In this list we save pairs of [predicted category, actual category]
comparison=[]
# In this list we simply save "accurate" and "inaccurate"
comparison_2=[]
while i<100:
comparison.append([classified_set[i],feature_set[i+1900][1]])
# If the predicted and actual classification match -> accurate
if comparison[i][0]==comparison[i][1]:
comparison_2.append("accurate")
else:
comparison_2.append("inaccurate")
i=i+1
print(comparison)
# We need the number of accurate and inaccurate classifications
comparison_counter=collections.Counter(comparison_2)
print(comparison_counter)
# NLTK can compute the accuracy directly
# What is the accuracy for the testing set?
print("Naive Bayes accuracy (in percent):", (nltk.classify.accuracy(classifier, testing_set))*100)
# Same value as from our own calculations -> perfect!
# What is the accuracy for the training set?
print("Naive Bayes accuracy in training data (in percent):", (nltk.classify.accuracy(classifier, training_set))*100)
# Higher than in the testing dataset -> expected.
print("completed!")


@@ -0,0 +1,55 @@
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 29 11:07:10 2015
@author: Alexander Hillert, Goethe University Frankfurt
"""
# To determine file sizes we need the os module
import os
# Please adjust the directory to your machine.
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# Open the csv file containing the list of the 200 10-Ks
input_file=open(directory+'10-K_Sample_2011Q1_Input.csv','r',encoding="utf-8")
input_text=input_file.read()
# Create output file
output_file=open(directory+'10-K_Sample_2011Q1_File_Size.csv','w',encoding="utf-8")
output_file.write('CIK;Filename;File_size_gross;File_size_net\n')
# Split the input file in separate lines
input_text_line=input_text.split("\n")
# In general, there can be empty lines in the input file. The following command
# deletes these lines.
while input_text_line.count("")>0:
input_text_line.remove("")
# Loop over all lines
for i in range(1,len(input_text_line)):
print(str(i))
# split the line into the two variables
variables=input_text_line[i].split(";")
# We need the CIK and the filename
cik=variables[0]
filename=variables[1]
filename=filename.replace('.txt','')
# File size of the complete submission file (gross file size)
# You have to divide the result by 1024 to get the size in kilobyte
# The file size will be affected by html code and exhibits.
# APPLY THE COMMAND THAT IS SHOWN ON SLIDE 62.
size_gross=XXX/1024
# File size of the main text file (net file size)
# You have to divide the result by 1024 to get the size in kilobyte
size_net=XXX/1024 # SAME COMMAND AS FOR GROSS FILE SIZE BUT APPLIED TO THE _clean.txt
output_file.write(cik+';'+filename+';'+str(size_gross)+';'+str(size_net)+'\n')
print("Finished")
output_file.close()
input_file.close()
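###############################################################################
# Illustrative sketch (not necessarily the command from slide 62, which is not
# reproduced here): one common way to obtain a file's size is os.path.getsize(),
# which returns the size in bytes; dividing by 1024 gives kilobytes as required
# above. The demo below measures a small temporary file, so it does not depend
# on the 10-K file paths, whose exact naming convention is only assumed.
###############################################################################
import os
import tempfile
with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False) as demo_file:
    demo_file.write('some sample text '*100)
demo_size_kb=os.path.getsize(demo_file.name)/1024
print('Size of the demo file in kilobyte: '+str(demo_size_kb))
# remove the temporary demo file again
os.remove(demo_file.name)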


@@ -0,0 +1,150 @@
# -*- coding: utf-8 -*-
"""
Created on Tue Jul 11 09:19:54 2017
@author: Alexander Hillert, Goethe University Frankfurt
"""
# We need regular expressions and counters (->collections)
import re
import collections
# for the bigram part, the sentence tokenizer is helpful
from nltk.tokenize import sent_tokenize
# Please adjust the directory to your machine.
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# Open the csv file containing the list of the 10-Ks from MSFT, KO, and 3M.
input_file=open(directory+'list_10-K_filings_textual_similarity.csv','r',encoding="utf-8")
input_text=input_file.read()
# Split the input file in separate lines
input_text_line=input_text.split("\n")
# In general, there can be empty lines in the input file. The following command
# deletes these lines.
while input_text_line.count("")>0:
input_text_line.remove("")
# Create an empty counter variable
words_counter=collections.Counter()
# counter for the extra task
bigram_counter=collections.Counter()
# Loop over all lines
for i in range(1,len(input_text_line)):
print(str(i))
# split the line into the eight variables
variables=input_text_line[i].split(";")
# We need the CIK (1st column) and the filename (8th column)
cik=variables[0]
filename_parts=re.split('/',variables[7])
filename=filename_parts[3].replace('.txt','')
# Open the ith 10-K in the list; remember to specify the encoding
# The files are available in the zip file "10-K_Textual_Similarity_edited.zip".
input_file_10_k=open(directory+'10-K_Textual_Similarity_edited/'+cik+'_'+\
filename+'_edited.txt', 'r', encoding='ascii', errors='ignore')
# if the command above does not work (error like "file not found" or "directory not found")
# please use the following command:
#input_file_10_k=open(directory+'10-K_Textual_Similarity/'+cik+'_'+filename+'_edited.txt','r',encoding='ascii',errors='ignore')
# read the content from the file
input_text_10_k=input_file_10_k.read()
# THINK ABOUT WHETHER WE SHOULD USE LOWER OR UPPER CASE; USE ONE OF THEM CONSISTENTLY!
input_text_10_k=
# Split text into words
list_of_words=re.split(r'\W{1,}',input_text_10_k)
# Remember: there can be empty list elements!
# Make sure that empty list elements do not bias the word count -> delete them!
# You can use an approach similar to the one in lines 24 and 25.
COMMANDS TO BE ADDED
# Add the words to our counter
words_counter=words_counter+XXXX # COMPLETE THIS COMMAND
#############################################
# optional part for the extra task on bigrams
#############################################
# create an empty list for the bigrams
'''
bigram_list=[]
# split the text into sentences
list_of_sentences=XXX
# create the bigrams IN EACH SENTENCE
for sentence in list_of_sentences:
# split the sentence into words
list_of_words=XXX
# remove empty elements
while list_of_words.count("")>0:
list_of_words.remove("")
# go over all potential two word combinations in the sentence.
for word_number in range(XXX,YYY):
# add the bigram (two words connected by whitespace) to the list
bigram_list.append(WORD_1 + " " + WORD_2)
# same command as in line 70
bigram_counter=bigram_counter+XXX
# end of extra task
'''
# Close the 10-K filing
input_file_10_k.close()
input_file.close()
######################
# Top 100 single words
######################
# Open the csv file containing the 100 most frequently used words
output_file=open(directory+'Problem_12_100_most_frequent_words.csv','w',encoding="utf-8",errors="ignore")
output_file.write("rank;word;count\n")
# Get the 100 most frequent words
top_100_words=words_counter.XXXX # COMPLETE THIS COMMAND
# Write the 100 most frequent words to the csv file
# REMEMBER: Python starts counting at 0, while humans start at 1.
# So, the most frequent word (rank 1 in human counting) is element 0 for Python.
for i in range(1,101):
output_file.write(str(i)+";"+XXXX (-> word)+";"+XXXX (-> the frequency of the word)+"\n") # COMPLETE THIS COMMAND
# Close the csv file
output_file.close()
######################
# Extra task
# Top 100 bigrams
######################
'''
# Open the csv file containing the 100 most frequently used BIGRAMS
output_file_bigram=open(directory+'Problem_12_100_most_frequent_bigrams.csv','w',encoding="utf-8")
output_file_bigram.write("rank;word;count\n")
# Get the 100 most frequent bigrams: same command as above
top_100_bigrams=bigram_counter.XXX
# Write the 100 most frequent bigrams to the csv file.
# same logic as above
for i in range(1,101):
output_file_bigram.write(str(i)+";"+XXXX (-> word)+";"+XXXX (-> the frequency of the word)+"\n") # COMPLETE THIS COMMAND
# Close the csv file
output_file_bigram.close()
'''
print("Task done!")


@@ -0,0 +1,83 @@
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 29 11:07:10 2015
@author: Alexander Hillert, Goethe University Frankfurt
"""
# We need regular expressions and stemming.
import re
from nltk.stem import PorterStemmer
# Depending on how you would like to split the text in words, you may need tokenize.
from nltk.tokenize import word_tokenize
# Please adjust the directory to your machine.
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# Open the csv file containing the list of the 10-Ks from MSFT, KO, and 3M
input_file=open(directory+'list_10-K_filings_textual_similarity.csv','r',encoding="utf-8")
input_text=input_file.read()
# Split the Input File in separate lines
input_text_line=input_text.split("\n")
# In general, there can be empty lines in the input file. The following command
# deletes these lines.
while input_text_line.count("")>0:
input_text_line.remove("")
# Loop over all lines
for i in range(1,len(input_text_line)):
print(str(i))
# split the line into the eight variables
variables=input_text_line[i].split(";")
# We need the CIK (1st column) and the filename (8th column)
cik=variables[0]
filename_parts=re.split('/',variables[7])
filename=filename_parts[3].replace('.txt','')
# Open the ith 10-K in the list; remember to specify the encoding
input_file_10_k=open(directory+'10-K_Textual_Similarity_edited/'+cik+'_'+filename\
+'_edited.txt', 'r', encoding='ascii', errors='ignore')
# if the command above does not work (error like "file not found" or "directory not found")
# please use the following command:
#input_file_10_k=open(directory+'10-K_Textual_Similarity/'+cik+'_'+filename+'_edited.txt','r',encoding='ascii',errors='ignore')
# Get the text of the 10-K
input_text_10_k=input_file_10_k.read()
# We need to tokenize the text because the stemmer only works on a word by word basis.
# Stemming an entire document without splitting into words does not work!
# The problem is that \n gets lost in this process --> we cannot easily
# recreate the document.
# Solution: replace \n by \n and some indicator that there was a line break.
# For example replace("\n","\nHereWasALinebreak")
input_text_10_k=input_text_10_k.replace("\n",XXXX)
# Split text into words
word_list=XXXX
# Stem the text from above
text_stemmed=''
# LOOP ALL WORDS, STEM THEM AND RECONNECT THEM.
# WARNING: WHEN RECONNECTING WORDS YOU NEED TO INCLUDE A WHITESPACE BETWEEN
# THE WORDS. OTHERWISE, THE TEXT GETS MESSED UP.
for word in word_list:
text_stemmed=text_stemmed+XXX # TO BE COMPLETED
# To recreate the text, we need to replace the line break indicators by \n.
# WARNING: PAY ATTENTION TO UPPER/LOWER CASE, IT CAN CHANGE.
text_stemmed=text_stemmed.replace(XXXX,XXXX) # UNDO THE TRANSFORMATION FROM LINE 56.
# Open the output file for the stemmed text
output_file_10_k=open(directory+'10-K_Textual_Similarity_edited/'+cik+'_'+filename\
+'_stemmed.txt', 'w', encoding='ascii', errors='ignore')
output_file_10_k.write(text_stemmed)
output_file_10_k.close()
input_file_10_k.close()
input_file.close()
print("Task done!")


@@ -0,0 +1,101 @@
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 29 11:07:10 2015
@author: Alexander Hillert, Goethe University Frankfurt
"""
# For the full task, we need a large set of packages:
# regular expression, stemming, stop words, tokenization, and counters.
import re
#from nltk.tokenize import word_tokenize # NOT needed for the base comparison
#from nltk.corpus import stopwords # NOT needed for the base comparison
#from nltk.stem import PorterStemmer # NOT needed for the base comparison
from collections import Counter
#ps=PorterStemmer() # NOT needed for the base comparison
# Please adjust the directory to your machine.
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# Open the csv file containing the list of the 10-Ks from MSFT, KO, and 3M
input_file=open(directory+'list_10-K_filings_textual_similarity.csv','r',encoding="utf-8")
input_text=input_file.read()
# Split the input file in separate lines
input_text_line=input_text.split("\n")
# In general, there can be empty lines in the input file. The following command
# deletes these lines.
while input_text_line.count("")>0:
input_text_line.remove("")
# Open the output csv file in which we write the similarities
output_file=open(directory+'list_10-K_filings_textual_similarity_jaccard.csv','w',encoding="utf-8")
# Write variable names to first line
output_file.write(input_text_line[0]+';Jaccard\n')
# set default values for variables
word_list_old_edited=""
word_list_edited=""
# Loop over all lines
for i in range(1,len(input_text_line)):
print(str(i))
# split the line into the eight variables
variables=input_text_line[i].split(";")
# We need the CIK (1st column) and the filename (8th column)
cik=variables[0]
filename_parts=re.split('/',variables[7])
filename=filename_parts[3].replace('.txt','')
# Open the ith 10-K; remember to specify the encoding
input_file_10_k=open(directory+'10-K_Textual_Similarity_edited/'+cik+'_'+filename+\
'_edited.txt', 'r', encoding='ascii', errors='ignore')
# if the command above does not work (error like "file not found" or "directory not found")
# please use the following command:
#input_file_10_k=open(directory+'10-K_Textual_Similarity/'+cik+'_'+filename+'_edited.txt','r',encoding='ascii',errors='ignore')
input_text_10_k=input_file_10_k.read()
# Split text into words
word_list_edited=re.split(r"\W{1,}",input_text_10_k.lower())
# Alternative using tokenize
#word_list_edited=word_tokenize(input_text_10_k.lower())
# check whether the previous entry of the list is from the same firm
permco=input_text_line[i].split(";")[1]
permco_old=input_text_line[i-1].split(";")[1]
############################################
# Sub Task 1: Jaccard for the _edited.txt
############################################
# compute Jaccard similarity if the previous filing is from the same firm
if permco==permco_old:
counter_current_10k=Counter(XXX)
counter_previous_10k=Counter(XXX)
intersection=XXX see "Introduction_Container_Datatypes.py" (at the end of the file)
union=XXXX see "Introduction_Container_Datatypes.py" (at the end of the file)
jaccard_similarity=XXXx # ELEMENTS IN INTERSECTION / # ELEMENTS IN UNION
output_file.write(input_text_line[i]+";"+str(jaccard_similarity)+"\n")
else:
# The previous filing is not from the same firm -> cannot compute Jaccard similarity
output_file.write(input_text_line[i]+";"+"\n")
# Save the current word vector to a separate variable for the comparison of the next report.
word_list_old_edited=word_list_edited
# Close 10-K filing
input_file_10_k.close()
input_file.close()
output_file.close()
print("Task done!")


@@ -0,0 +1,159 @@
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 21 09:38:32 2022
@author: Alexander Hillert, Goethe University Frankfurt
"""
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
# adjust the directory to your folder
directory="C:/Lehre/Machine Learning/Data/"
# import the data for this problem
# NOTE: IT MIGHT TAKE 3 TO 5 MINUTES TO OPEN THE DATA
data_frame=pd.read_csv(directory+"form_10-Ks_machine_learning_2007_2008_all_variables_v1.csv",sep=";")
# The rows of the data are the Form 10-K filings. Each line is one filing.
# The columns are the variables. After some identifying information,
# you find the word frequencies, i.e., how often a word (e.g., "the") shows up
# in a 10-K (e.g., 100 times)
# WARNING: THE DATA SET IS TOO LARGE TO BE DISPLAYED -> Variable Explorer
# and Console will crash.
# However, you can pick a small subset of the data and look at it.
# It lists all columns (i.e., variables) and the first three observations.
data_frame_example=data_frame.head(3)
# you can click on this variable in the variable explorer without Spyder crashing.
# To see the variables included in the data use the following command
data_frame_column_names=data_frame.columns
# you can click on this variable in the variable explorer without Spyder crashing.
# This variable shows all column/variable names in a vector.
# split the data set into the training and testing data
# we use the filings from year 2007 as training data
data_frame_train=data_frame[data_frame.year==2007]
# and the filings from year 2008 as testing data
data_frame_test=data_frame[data_frame.year==2008]
# put the cumulative abnormal return around the filing date into a new variable.
# we follow Loughran and McDonald (2011) and use the CAR from t to t+4.
# training data
filing_car_train=data_frame_train["excess_ret_t0_t4"]
# testing data
filing_car_test=data_frame_test["excess_ret_t0_t4"]
# so far, you have absolute word counts. For example, "loss" is found 5 times.
# As the length of the 10-Ks can be different, we scale by the number of words
# in the 10-K.
document_length_train=data_frame_train["number_of_words"]
document_length_test=data_frame_test["number_of_words"]
# the word frequencies are our independent variables -> restrict the data frame
# to those variables and drop all variables that are not needed
data_frame_train=data_frame_train.drop(columns=["cik","year","month","link","filing_type","filing_date","excess_ret_t0_t4","number_of_words"])
data_frame_test=data_frame_test.drop(columns=["cik","year","month","link","filing_type","filing_date","excess_ret_t0_t4","number_of_words"])
# compute relative frequencies, i.e., divide the absolute word count by document length
data_frame_train=data_frame_train.div(document_length_train, axis=0)
data_frame_test=data_frame_test.div(document_length_test, axis=0)
# standardize the data frames
# training data
data_frame_train_mean=TO BE COMPLETED
data_frame_train_sd=TO BE COMPLETED
data_frame_train_standardized=TO BE COMPLETED
# testing data
data_frame_test_mean=TO BE COMPLETED
data_frame_test_sd=TO BE COMPLETED
data_frame_test_standardized=TO BE COMPLETED
# There can be missing values in the standardized variables.
# They arise if the word count for a specific word is always zero in the training
# or in the testing data. In this case, the standard deviation is zero ->
# division by zero -> NaN.
# We replace these missing values by zero.
# training data
data_frame_train_standardized=data_frame_train_standardized.fillna(0)
# testing data
data_frame_test_standardized=data_frame_test_standardized.fillna(0)
##########################
# Ridge regression
##########################
print("\nRidge regression - Using cross-validation\n")
# Regress the CARs on the word frequencies using Ridge regressions with cross-validation.
# In this regression, we use the training data.
# We use five-fold cross-validation.
# Recommendation for initial alphas/lambdas: 100000, 150000, and 200000
regression_Ridge_cv=RidgeCV(alphas=TO BE COMPLETED, fit_intercept=True,cv=5).fit(TO BE COMPLETED)
# get the optimal lambda
alpha_optimal_cv=TO BE COMPLETED
print("The optimal alpha is "+str(alpha_optimal_cv))
# what is the R2 in the training and testing data?
print("The R2 in the training data is: "+str(regression_Ridge_cv.TO BE COMPLETED))
print("The R2 in the testing data is: "+str(regression_Ridge_cv.TO BE COMPLETED))
# Mean squared error using the cross-validated model
# predict y in the full training sample
filing_car_train_predicted_Ridge=regression_Ridge_cv.TO BE COMPLETED
# predict y in the testing sample
filing_car_test_predicted_Ridge=regression_Ridge_cv.TO BE COMPLETED
# Determine the MSE
print("The MSE in the full training data is: "+str(mean_squared_error(TO BE COMPLETED)))
print("The MSE in the testing data is: "+str(mean_squared_error(TO BE COMPLETED)))
######################
# LASSO regression
######################
print("\nLASSO regression - Using cross-validation\n")
# Regress the CARs on the word frequencies using LASSO regressions with cross-validation.
# In this regression, we use the training data.
# We use five-fold cross-validation.
# Recommendation for initial alphas/lambdas: 0.5, 1, and 1.5
regression_Lasso_cv=LassoCV(alphas=TO BE COMPLETED, fit_intercept=True,cv=5).fit(TO BE COMPLETED)
# get the optimal lambda
alpha_optimal_cv=TO BE COMPLETED
print("The optimal alpha is "+str(alpha_optimal_cv))
# get the R2 in the training data
print("The R2 in the training data is: "+str(regression_Lasso_cv.TO BE COMPLETED))
# ... and testing data
print("The R2 in the testing data is: "+str(regression_Lasso_cv.TO BE COMPLETED))
# Mean squared error using the cross-validated model
# predict y in the full training sample
filing_car_train_predicted_Lasso=regression_Lasso_cv.TO BE COMPLETED
# predict y in the testing sample
filing_car_test_predicted_Lasso=regression_Lasso_cv.TO BE COMPLETED
# Determine the MSE
print("The MSE in the full training data is: "+str(mean_squared_error(TO BE COMPLETED)))
print("The MSE in the testing data is: "+str(mean_squared_error(TO BE COMPLETED)))
############################################################
# Compare the betas from the Ridge and the LASSO regressions
############################################################
output_file=open(directory+"comparison_coefficients_Ridge_LASSO_10-Ks.csv","w",encoding="utf-8")
output_file.write("index;word;coefficient_Ridge;coefficient_LASSO\n")
# get the list of coefficients
for i in range (0,len(data_frame_train.columns)):
output_file.write(str(i)+';'+data_frame_train.columns[i]+';'+str(regression_Ridge_cv.coef_[i])+';'+str(regression_Lasso_cv.coef_[i])+'\n')
output_file.close()
print("Completed!")


@@ -0,0 +1,88 @@
# -*- coding: utf-8 -*-
"""
Created on Fri Feb 15 21:37:53 2019
@author: Alexander Hillert, Goethe University Frankfurt
"""
# Please adjust the directory to your machine.
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# It is important to use a single forward slash / but not a single backslash \.
# For MAC users: your directory will usually start with "/Users/". For example:
#directory="/Users/FirstnameLastname/Textual Analysis/Programming/Files/"
# open the Fun_with_Python text file
input_file=open(directory+"Fun_with_Python.txt","r")
###################################
# Programming Problem 1
###################################
# Task 1: open the file 'Fun_with_Python.txt' in Spyder and print its content
# The file can be found in our data folder
# get the text from the file
input_text= TO BE COMPLETED
# print the content, i.e., the text of the file (previous line)
print(TO BE COMPLETED)
# See slide 7
# Task 2: Write the content of 'Fun_with_Python.txt' to a new text file
# with the name 'More_fun_with_Python.txt'.
# ENTER YOUR COMMANDS HERE
# See slide 8.
# REMEMBER to close your file. If you do not close the new txt file, its content
# will not be saved to the hard drive. You will find an empty txt file in your file manager.
# Task 3: Write a loop that prints some text (whatever you like) ten times.
# ENTER YOUR COMMANDS HERE
# See slide 9.
# You have several options. While loop, for X in range() loop, etc.
# Task 4: Print the text of the "Fun_with_Python" file line by line!
# ENTER YOUR COMMANDS HERE
# See slide 10.
# You need a loop (Task 3) and in each iteration of the loop have Python print
# a line of text.
# Task 5: Count how often the word 'good' appears in the document 'Fun_with_Python.txt'!
# ENTER YOUR COMMANDS HERE
# See slide 11.
# Task 6a: Now, print only the lines that contain the word 'good'!
# ENTER YOUR COMMANDS HERE
# See also slide 12.
# You can use the line-by-line printing from Task 4 and combine it with the command ".count()" from Task 5
# and add the if condition from slide 12.
# If condition: for each line check whether the specific line contains the word "good".
# Task 7: print only the lines that start with the word 'This'!
# ENTER YOUR COMMANDS HERE
# See slide 15.
# This is very similar to task 6. You only need to modify the if condition a bit.
# Task 8a: Replace the word "good" by "excellent" and display the new text!
# See slide 16.
# ENTER YOUR COMMANDS HERE
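###############################################################################
# Illustrative sketch (not the official solutions from the slides): the basic
# patterns that the tasks above ask for, applied to a small demo file that is
# created on the fly. The file names "Demo_fun.txt" and "More_demo_fun.txt"
# are made up for this illustration.
###############################################################################
demo_file=open(directory+"Demo_fun.txt","w")
demo_file.write("This line is good.\nAnother line.\nThis line is also good.\n")
demo_file.close()
# Task 1: read and print the content
demo_file=open(directory+"Demo_fun.txt","r")
demo_text=demo_file.read()
demo_file.close()
print(demo_text)
# Task 2: write the content to a new file (and close it so it gets saved)
demo_copy=open(directory+"More_demo_fun.txt","w")
demo_copy.write(demo_text)
demo_copy.close()
# Task 3: print some text ten times
for demo_i in range(10):
    print("Python is fun!")
# Tasks 4, 6a, and 7: go through the text line by line
for demo_line in demo_text.split("\n"):
    print(demo_line)
    if demo_line.count("good")>0:
        print("This line contains 'good': "+demo_line)
    if demo_line.startswith("This"):
        print("This line starts with 'This': "+demo_line)
# Task 5: count how often 'good' appears
print("The word 'good' appears "+str(demo_text.count("good"))+" times.")
# Task 8a: replace 'good' by 'excellent'
print(demo_text.replace("good","excellent"))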


@@ -0,0 +1,72 @@
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 29 09:21:46 2015
@author: Alexander Hillert, Goethe Uni Frankfurt
"""
# Please adjust the directory to your machine.
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# It is important to use a single forward slash / but not a single backslash \.
# For MAC users: your directory will usually start with "/Users/". For example:
#directory="/Users/FirstnameLastname/Textual Analysis/Programming/Files/"
# Open the txt file with the SEC filings
sec_filings_file=open(directory+'formidx_1998Q1.txt','r')
sec_filings_text=sec_filings_file.read()
# Create output file
output_file=open(directory+'SEC_Filings_Output.csv','w')
# Create first line with variable names
# I use semicolons as separator in csv files. You can also use any other symbol.
# However, you should make sure that the separator is not part of the data/text
# you write to the file.
# For example, it would be problematic if you use comma as separator and have
# company names like "AMERICAN HEALTHCORP, INC." or "AMERICAN FUNDING, INC."
output_file.write("Form_Type;Company_Name;CIK;Filing_Date;Link\n")
# Split the input file into separate lines
# DO THE LINE SPLIT
sec_filings_line=
# Loop over all lines
# you can get the number of lines by computing the length of the list of lines,
# i.e. by determining the length of sec_filings_line.
for / while : # COMPLETE LOOP
# Does the line refer to a form 10-K file?
if : # USE AN IF CONDITION TO TEST THIS -> see TASKS 7 and 8 of PROBLEM 1
# Split the line such that the information can be saved in separate
# variables
# Each information item has a fixed length in the overview files of the
# SEC.
# SEE SLIDE 18 FOR INFORMATION ON THE LENGTH OF THE SEPARATE COLUMNS.
# COMPLETE THE COMMANDS BELOW
filing_type=
company_name=
cik=
filing_date=
link=
# Is the 10-K filed between March 10 and March 20?
filing_day=
filing_month=
# Is the Filing Month March?
if : # COMPLETE THE IF-CONDITION
# Is the Filing Day between 10 and 20?
if : # COMPLETE THE IF-CONDITION
# The filing meets the conditions -->
# Write output to the csv file
output_file.write(filing_type+";"+company_name+";"+cik+";"+filing_date+";"+link+"\n")
# Close your input and output file in the end
sec_filings_file.close()
output_file.close()
print("DONE")


@@ -0,0 +1,103 @@
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 29 11:07:10 2015
@author: Alexander Hillert, Goethe Uni Frankfurt
"""
# Please adjust the directory to your machine.
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# It is important to use a single forward slash / but not a single backslash \.
# For MAC users: your directory will usually start with "/Users/". For example:
#directory="/Users/FirstnameLastname/Textual Analysis/Programming/Files/"
# We need the urllib package for the download.
import urllib.request
# To automatically create folders, we need the os-module (OS: Operating System)
import os
###############################################################################
# Technical issue
# As of March 2021, the SEC no longer accepts requests by the standard urllib settings
# you have to make some adjustments
###############################################################################
# Define a user agent
# Information on user agents is from https://docs.python.org/3/howto/urllib2.html:
# "Some websites dislike being browsed by programs, or send different versions
# to different browsers. By default urllib identifies itself as Python-urllib/x.y
# (where x and y are the major and minor version numbers of the Python release,
# e.g. Python-urllib/2.5), which may confuse the site, or just plain not work.
# The way a browser identifies itself is through the User-Agent header."
opener = urllib.request.build_opener()
# The SEC recently rejected requests from Python-urllib/x.y user agent (see above)
# To still automatically download files, you have different options.
# I have listed three examples below but there are many more:
# For a comprehensive list see, e.g.:
# https://developers.whatismybrowser.com/useragents/explore/software_type_specific/web-browser/
#opener.addheaders = [('User-agent', 'Mozilla')]
#opener.addheaders = [('User-agent', 'Chrome')]
opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)')]
urllib.request.install_opener(opener)
# END of the technical issues
# Open the csv file from part 1 of the problem
input_file=open(directory+'SEC_Filings_Output.csv','r')
input_text=input_file.read()
# Split the Input File in separate lines
input_text_line=input_text.split("\n")
# Create a subfolder in which the 10-K filings are saved.
# When you download a large number of filings I recommend using subfolders for
# each year or even for each year-month-day combination.
# In this problem, a single subfolder is fine.
os.makedirs( COMPLETE THE COMMAND )
# See slide 18 for information on the os.-commands!
# IN GENERAL, IF YOU SEE AN UNKNOWN COMMAND, GOOGLE IT TO GET INFORMATION.
# Loop over all lines of the csv file
# Like in part 1 of the problem, you can get the number of lines by computing
# the length of the list of lines, i.e. by determining the length of input_text_line.
for / while: # COMPLETE THE LOOP
# split the line into the five variables
# THE ; IS THE SEPARATOR IN THE CSV -> USE THE split() COMMAND
variables=
# We only need the cik and the link to download the file.
# The cik is the 3rd variable.
# The link is the 5th variable
cik=
link=
# identify the filename
# The link consists of different parts:
# For example: edgar/data/1000753/0000950129-98-001035.txt
link_parts= # USE A SPLIT
# 1st part: edgar
# 2nd part: data
# 3rd part: cik
# 4th part: file name -> see next line
filename=link_parts[FILL IN THE NUMBER HERE]
###########################################################################
############################ WARNING ######################################
# The filename does NOT uniquely identify the SEC filings as different firms (CIKs)
# may use the same filename. Thus, when you only use the filename, files
# might be overwritten. To avoid this problem you need to have a unique name.
# Combining CIK and filename results in a unique identifier, as the
# filename appears only once per firm (CIK).
# -> use the combination of CIK and filename: cik_filename
###########################################################################
urllib.request.urlretrieve(TO BE COMPLETED)
# See slide 19 for information on the urllib.-commands.
# Close your input file
input_file.close()
print("DONE")


@@ -0,0 +1,121 @@
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 12 15:50:22 2016
@author: Alexander Hillert, Goethe University Frankfurt
"""
# Import regular expressions and BeautifulSoup
import re
from bs4 import BeautifulSoup
# Please adjust the directory to your machine.
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# It is important to use a single forward slash / but not a single backslash \.
# For MAC users: your directory will usually start with "/Users/". For example:
#directory="/Users/FirstnameLastname/Textual Analysis/Programming/Files/"
# Open the document
input_file=open(directory+'Exercise_4_Application_Regular_Expressions.txt','r',encoding="utf-8")
input_text=input_file.read()
#######################
# Task 1: remove tables
#######################
# Approach
# We search for tables until we find no more html tags that indicate the
# beginning of a table.
# Search for the start html-tag <TABLE>
table_match=re.search(TO BE COMPLETED, input_text)
while : # YOU NEED A LOOP THAT SEARCHES FOR TABLES
# When we have identified a match, i.e. the start of a table, we save
# the position of the beginning of the table in the variable "start_table"
table_start_match=re.search(XXX, input_text)
start_table=table_start_match.start()
# Next, we search for the corresponding html tag that indicates the end of
# the table and save the end position to the variable "end_table"
# REPEAT THE COMMANDS ABOVE FOR THE END OF TABLE
table_end_match=
end_table=
# We can print the text between the start and end html tag to check whether
# the table has been identified correctly.
print("The text below is a table!\n"+input_text[start_table:end_table])
# the text between the beginning and end of the html tags is the part which
# we would like to delete.
# Consequently, we keep the text before the beginning of the table as well
# as the text after the ending of the table.
input_text=TO BE COMPLETED
# Next, we need to check whether there is another table in the rest of the
# text.
table_match=re.search(SAME COMMAND AS IN LINE 27, input_text)
# As long as "table_match" exists, i.e., the regex results in a match, the loop
# will continue.
#########################
# Task 2: remove Exhibits
#########################
# Exhibits have the following structure
# <DOCUMENT>
# <TYPE>EX...
# ...
# </DOCUMENT>
# THE APPROACH IS THE SAME AS THE SEARCH FOR TABLES ABOVE
exhibit_match=re.search(, input_text)
while :
# get the beginning of the exhibit
exhibit_start_match=
start_exhibit=
# As the exhibits are at the end of the 10-K filing it would not be
# necessary to include an end position. We could also drop the entire text
# after "<TYPE>EX"
# However, for completeness, we will define an end
exhibit_end_match=
end_exhibit=
# Print the identified text to check whether the exhibit has been identified
# correctly
print("The text below is an exhibit!\n"+input_text[start_exhibit:end_exhibit])
input_text=TO BE COMPLETED
# Check whether there are further exhibits
exhibit_match=re.search(SAME COMMAND AS IN LINE 65, input_text)
##########################
# Task 3: remove html code
##########################
# Alternative 1: remove html code without Beautiful Soup
text=re.sub(TO BE COMPLETED, '', input_text)
# Use a regex that searches for a "<" followed by at least one character that must not
# equal > and is completed by >.
# Alternative 2: remove html code using Beautiful Soup
html_text=BeautifulSoup(TO BE COMPLETED)
text=html_text.TO BE COMPLETED
########################
# Task 4: delete numbers
########################
# YOU MAY NEED MULTIPLE COMMANDS TO DELETE ALL NUMBERS
# Remember that you can have different formats, e.g., 1,234.56 or 0.12 or 1,234,567
text=re.sub(TO BE COMPLETED,'',text)
########################
# Task 5: delete symbols
########################
text=re.sub(TO BE COMPLETED,'',text)
# Open the output file for the pure text
output_file=open(directory+'Exercise_4_Application_Regular_Expressions_clean.txt','w',encoding="utf-8")
output_file.write(text)
# close all files
input_file.close()
output_file.close()
print("DONE")


@@ -0,0 +1,164 @@
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 12 15:50:22 2016
@author: Alexander Hillert, Goethe University Frankfurt
"""
import re
from bs4 import BeautifulSoup
# Please adjust the directory to your machine.
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# It is important to use a single forward slash / but not a single backslash \.
# For MAC users: your directory will usually start with "/Users/". For example:
#directory="/Users/FirstnameLastname/Textual Analysis/Programming/Files/"
# Open the 10-K
input_file=open(directory+'0000950130-98-001359.txt','r',encoding='ascii',errors='ignore')
input_text=input_file.read()
################################
# Remove tables
# Same approach as in Problem 4
################################
# Sometimes it is helpful to print the text parts that are deleted. In this
# example, we will print the first two tables that we delete.
i=1
table_match=re.search(ENTER THE REGEX, input_text)
while table_match:
# Search for the beginning of the table
table_start_match=re.search(REGEX FOR BEGINNING OF TABLE, input_text)
start_table=
# search for the end of the table
table_end_match=REGEX FOR END OF TABLE
end_table=
# The if condition and the printing are just for illustrative purposes.
# The commands display the first two tables that are removed from the text.
if i<=2:
print("This is the "+str(i)+". Table in the 10-K.\n"+input_text[start_table:end_table]+"\n")
i=i+1
# remove the table from the original text
input_text=TO BE COMPLETED
# check whether there are further tables
# same command as in line 24
table_match=re.search(XXXXXXX, input_text)
################################
# Remove exhibits
# Same approach as in Problem 4
################################
# Exhibits have the following structure
# <DOCUMENT>
# <TYPE>EX...
# ...
# </DOCUMENT>
# Sometimes it is helpful to print the text parts that are deleted. In this
# example, we will print the first exhibit that we delete.
i=1
exhibit_match=re.search(ENTER THE REGEX, input_text)
while exhibit_match:
# Search for the beginning of the exhibit
exhibit_start_match=re.search(REGEX FOR BEGINNING OF EXHIBIT, input_text)
start_exhibit=
# Search for the end of the exhibit
# CAUTION: search only in the text after the beginning of the exhibit, as
# the end-term also appears earlier (e.g. end of main document)
exhibit_end_match=re.search(REGEX FOR END OF EXHIBIT, input_text[START OF EXHIBIT UNTIL END OF TEXT])
end_exhibit=
if i<=1:
print("This is the "+str(i)+". Exhibit in the 10-K.\n"+input_text[start_exhibit:end_exhibit]+"\n")
i=i+1
# remove exhibit from the original text
input_text=
# check whether there are further exhibits
# same command as in line 55
exhibit_match=re.search(XXXXXXX, input_text)
##################
# Remove html code
##################
# you can use BeautifulSoup for simplicity
html_text=BeautifulSoup(input_text, 'html.parser')
text=html_text.get_text()
############################
# Remove the Document Header
############################
# There are different possibilities how one can define the start of the main part of the text
# In general, you should delete all text that is uninformative for your analysis.
header_match=re.search(END OF DOCUMENT HEADER, text)
if header_match:
# Drop the document header and keep only the rest of the text after the header.
text=text[XXXXXXXXXXXXXXX]
#################################################
# Delete the text in "PART IV"
# This procedure is optional. Look at "Part IV" and decide whether you favor
# the approach. I think that the part should be dropped, as it is just a list
# of exhibits, some mandatory text required by the SEC [indicated by the
# capital letters in the "SIGNATURES" section].
#################################################
'''
# Alternative 1: go over all matches but keep only the last one
for match in re.finditer('\s{2,}PART IV\s{0,}\n', text):
pass
# match now contains the last match.
# Delete the text after the last match
text=text[:match.start()]
# Alternative 2: save the positions of all matches (more general approach)
list_start_matches=[]
list_end_matches=[]
for match in re.finditer('\s{2,}PART IV\s{0,}\n', text):
list_start_matches.append(match.start())
list_end_matches.append(match.end())
# Position of last match
print(list_start_matches[len(list_start_matches)-1])
print(list_end_matches[len(list_start_matches)-1])
# Delete the text after the last match
text=text[:list_start_matches[len(list_start_matches)-1]]
'''
# Delete item numbers
# This is optional. It removes "Item 1.", "ITEM 1.", "Item 10.", "Item 7A."
text=re.sub(TO BE COMPLETED,'',text)
# Delete numbers
# You can use the code from Problem 4.
text=re.sub(TO BE COMPLETED,'',text)
# Hyphens can be used to indicate that the word is continued in the next
# line. For example, "Micro-\nsoft" (\n is the line feed).
# Delete hyphens that are followed by a line feed.
text=re.sub(TO BE COMPLETED,'',text)
# Delete symbols
# You can use the code from Problem 4.
text=re.sub(TO BE COMPLETED,'',text)
# Delete dots and commas that are not part of sentences, i.e. commas and dots
# that are preceded by whitespace or line break and that are followed by
# whitespace or line break.
text=re.sub('\n(\.|,)\n','\n',text)
# Drop single-character words
# One can argue whether one should implement this procedure. Loughran and
# McDonald argue in one of their papers in favor of it.
# To make sure that there is just one letter, we require that there is a non-word
# character (\W) before and after. We use a positive lookbehind and a
# positive lookahead condition for this to ensure that the surrounding characters
# do not get deleted as well.
text=re.sub(TO BE COMPLETED,' ',text)
# Open the output file for the pure text
output_file=open(directory+'0000950130-98-001359_clean.txt','w',encoding='ascii',errors='ignore')
output_file.write(text)
input_file.close()
output_file.close()
print("COMPLETED.")


@@ -0,0 +1,117 @@
# -*- coding: utf-8 -*-
"""
Created on Wed Apr 13 22:43:32 2016
@author: Alexander Hillert, Goethe University Frankfurt
"""
import re
# Please adjust the directory to your machine.
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# Open the dictionary
# The dictionary has been obtained from Bill McDonald's webpage
# http://www3.nd.edu/~mcdonald/Word_Lists.html
# --> LoughranMcDonald_MasterDictionary_2014.xlsx
# --> select negative words and copy them to a txt file
file_word_list=open(directory+'LMD_Neg.txt','r',encoding="utf-8")
word_list=file_word_list.read()
# LOOK AT THE FILE. ARE THE WORDS IN UPPER OR IN LOWER CASE?
# MAKE SURE THAT YOU USE A CONSISTENT FORMAT FOR THE TEXT AND THE DICTIONARY.
# THE COMMANDS ARE .lower() AND .upper().
# CREATE A LIST OF NEGATIVE WORDS -> SPLIT THE TEXT
negative_words=word_list.XXXX
# Open the csv file containing the list of the 200 10-Ks
input_file=open(directory+'10-K_Sample_2011Q1_Input.csv','r',encoding="utf-8")
input_text=input_file.read()
# Split the input file in separate lines
input_text_line=input_text.split("\n")
# In general, there can be empty lines in the input file. The following command
# deletes these lines.
while input_text_line.count("")>0:
input_text_line.remove("")
# Create output file
output_file=open(directory+'10-K_Sample_2011Q1_Output_Negative_Tone.csv','w',encoding="utf-8")
# Write variable names to the first line of the output file
output_file.write('CIK;Filename;Number_Words;Number_Negative_Words;\
Percentage_Negative_Words\n')
# Loop over all lines of the csv file
for i in range(1,len(input_text_line)):
# If the execution of your scripts takes some time, printing the loop iterator
# gives you an impression of the overall progress made.
print(str(i))
# split the line into the two variables
variables=input_text_line[i].split(";")
# We need the CIK (1st column) and the filename (2nd column)
cik=variables[0]
filename=variables[1]
# modify file name to open the edited files
filename=filename.replace('.txt','')
# Open the ith 10-Ks in the list
input_file_10_k=open(directory+'10-K_Sample_clean/'+cik+'_'+filename+'_clean.txt','r',\
encoding='ascii',errors='ignore')
input_text_10_k=input_file_10_k.read()
# CONVERT THE TEXT TO UPPER OR LOWER CASE (see comment above)
# It is important that the formatting (lower case vs. upper case) of the word list
# and the document is identical. Remember that you have typically lower and upper case
# letters in documents -> modify text
text=input_text_10_k.XXXXXX
# Split the text in words to determine the total number of words
# LOOK AT THE REGEX INTRODUCTION FOR A SUITABLE SPLIT VARIABLE.
list_of_words=re.split(XXXXX, text)
# ARE THERE EMPTY ELEMENTS IN THE LIST OF WORDS?
# Make sure that empty list elements do not bias the word count -> delete them!
# You can use an approach similar to the one in lines 37 and 38.
COMMANDS TO BE ADDED
# Determine the total number of words
# COUNT THE NUMBER OF ELEMENTS IN list_of_words
word_count=XXXX
# Reset the number of negative words to zero
negative_count=0
# For each negative word, count the number of occurrences
for j in range(len(negative_words)):
HERE YOU NEED TO COUNT HOW OFTEN THE jth NEGATIVE WORD IS FOUND IN THE TEXT.
COMPARE THE TWO CASES BELOW -> EXECUTE THE COMMANDS (see lines below) IN
THE COMMAND LINE AND COMPARE THE RESULTS.
WHICH ALTERNATIVE IS THE RIGHT APPROACH?
ALTERNATIVE 1:
list_of_words=["abandon","abandoned","abandonment"]
list_of_words.count("abandon")
ALTERNATIVE 2:
text_of_words="abandon abandoned abandonment"
text_of_words.count("abandon")
ADD THE CORRECT COUNT OF NEGATIVE WORD j TO YOUR OVERALL COUNT.
negative_count=negative_count+XXXXX
# Get the percentage of negative words
percentage_negative=negative_count/word_count
# Write cik, file name, total number of words, number of negative words,
# and the percentage of negative words to output file.
output_file.write(cik+';'+filename+'_clean.txt;'+str(word_count)+';'\
+str(negative_count)+';'+str(percentage_negative)+'\n')
# Close filings
input_file_10_k.close()
print("Finished")
output_file.close()
input_file.close()
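###############################################################################
# Illustrative sketch (not the official solution): why the counting should be
# done on the LIST of words rather than on the raw text string: str.count()
# also matches parts of longer words. The word lists below are made up.
###############################################################################
demo_list_of_words=["abandon","abandoned","abandonment"]
demo_text_of_words="abandon abandoned abandonment"
print(demo_list_of_words.count("abandon"))   # 1 -> counts whole words only
print(demo_text_of_words.count("abandon"))   # 3 -> also matches parts of words
# counting a small word list against a document, as in the loop above
demo_document_words=["the","loss","was","a","severe","loss"]
demo_negative_words=["loss","litigation"]
demo_negative_count=0
for demo_word in demo_negative_words:
    demo_negative_count=demo_negative_count+demo_document_words.count(demo_word)
print("Negative words found: "+str(demo_negative_count))   # 2
print("Percentage of negative words: "+str(demo_negative_count/len(demo_document_words)))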


@@ -0,0 +1,131 @@
# -*- coding: utf-8 -*-
"""
Created on Wed Apr 13 22:43:32 2016
@author: Alexander Hillert
"""
import re
# Please adjust the directory to your machine.
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# Open the dictionary
# The dictionary is obtained from Bill McDonald's webpage
# http://www3.nd.edu/~mcdonald/Word_Lists.html
# --> LoughranMcDonald_MasterDictionary_2014.xlsx
# --> select positive words and copy them to a txt file
file_word_list=open(directory+'LMD_Pos.txt','r',encoding="utf-8")
word_list=file_word_list.read()
# LIKE IN PROBLEM 7, YOU HAVE TO APPLY A CONSISTENT FORMAT TO BOTH THE LMD-WORDS
# AND THE TEXT OF THE 10-Ks.
positive_words=word_list.split()
# Open the csv file containing the list of the 200 10-Ks
input_file=open(directory+'10-K_Sample_2011Q1_Input.csv','r',encoding="utf-8")
input_text=input_file.read()
# Split the Input File in separate lines
input_text_line=input_text.split("\n")
# In general, there can be empty lines in the input file. The following command
# deletes these lines.
while input_text_line.count("")>0:
input_text_line.remove("")
# Create output file
output_file=open(directory+'10-K_Sample_2011Q1_Output_Positive_Tone.csv','w',encoding="utf-8")
# Write variable names to the first line of the output file
output_file.write('CIK;Filename;Number_Words;Number_Pos_Words;Number_Pos_Words_adj;'\
+'Percent_Pos_Words;Percent_Pos_Words_adj\n')
# Iterate the list of the 200 10-K filings
for i in range(1,len(input_text_line)):
# If the execution of your scripts takes some time, printing the iterator
# gives you an impression of the overall progress made.
print(str(i))
# split the line into the two variables
variables=input_text_line[i].split(";")
# We need the CIK (1st column) and the filename (2nd column)
cik=variables[0]
filename=variables[1]
# modify file name to open the edited files
filename=filename.replace('.txt','')
# Open the ith 10-K in the list
input_file_10_k=open(directory+'/10-K_Sample_clean/'+cik+"_"+filename+'_clean.txt','r',\
encoding='ascii',errors='ignore')
input_text_10_k=input_file_10_k.read()
# It is important that the formatting (lower case vs. upper case) of the word list
# and the document are identical. Remember that you have typically lower and upper case
# letters in documents -> modify text
text=XXXX # CONSISTENT FORMAT
# Split the text in single words to determine the total number of words
list_of_words=re.split(XXXX, text) # USE THE SAME COMMAND AS IN PROBLEM 7
# ARE THERE EMPTY ELEMENTS IN THE LIST OF WORDS?
# Make sure that empty list elements do not bias the word count -> delete them!
# You can use an approach similar to the one in lines 34 and 35.
COMMANDS TO BE ADDED
# Determine total number of words
word_count=XXXX # SAME COMMAND AS IN PROBLEM 7
# Reset the number of positive words and positive words adj. for negations to zero.
positive_count=0
positive_count_adj=0
# For each positive word, count the number of occurrences
for j in range(len(positive_words)):
# standard count operation without controlling for negations
positive_words_found=list_of_words.count(positive_words[j])
# Loughran and McDonald (2011, JF, p.44): "We account for simple negation
# only for Fin-Pos words. Simple negation is taken to be observations
# of one of six words (no, not, none, neither, never, nobody) occurring
# within three words preceding a positive word."
# When we have identified positive words we need to search for negations
while positive_words_found>0:
# identify the position of the matched positive word in the list of all words
position_of_word=list_of_words.XXXXX # THE COMMAND .index() IS HELPFUL HERE
# identify the three words before the positive word and add them to a list
list_negation=[3_WORDS_BEFORE_MATCH,2_WORDS_BEFORE_MATCH,1_WORD_BEFORE_MATCH]
# REPLACE THE THREE PLACEHOLDERS BY THE CORRESPONDING ELEMENTS OF list_of_words
# check whether one of the three words in list_negation is a negation
negation_found=list_negation.count('no')+list_negation.count('not')+XXXX TO BE COMPLETED
if negation_found==0:
# no negation
positive_count_adj=positive_count_adj+1
positive_count=positive_count+1
else:
# negation
positive_count=positive_count+1
# delete the matched positive words in the original document
list_of_words[position_of_word]=XXX
# THIS OPERATION IS IMPORTANT BECAUSE OTHERWISE WE WILL GET AN ENDLESS LOOP
# check whether there are further matches of the jth positive word
positive_words_found=list_of_words.count(positive_words[j])
# Write cik, file name, total number of words, and number of positive
# and adjusted positive words to the output file
output_file.write(cik+';'+filename+'_clean.txt;'+str(word_count)+';'+\
str(positive_count)+';'+str(positive_count_adj)+';'+str(positive_count/word_count)+\
';'+str(positive_count_adj/word_count)+'\n')
# Close filings
input_file_10_k.close()
print("Finished")
output_file.close()
input_file.close()
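###############################################################################
# Illustrative sketch (not the official solution): the negation check for a
# single positive word, applied to a made-up word list. The slicing with max()
# protects the start of the list and is my own choice.
###############################################################################
demo_list_of_words=["the","outlook","is","not","good","but","results","were","good"]
demo_positive_word="good"
demo_found=demo_list_of_words.count(demo_positive_word)
demo_positive_count=0
demo_positive_count_adj=0
while demo_found>0:
    # position of the next match
    demo_position=demo_list_of_words.index(demo_positive_word)
    # the three words before the match
    demo_list_negation=demo_list_of_words[max(demo_position-3,0):demo_position]
    demo_negation_found=demo_list_negation.count("no")+demo_list_negation.count("not")+\
        demo_list_negation.count("none")+demo_list_negation.count("neither")+\
        demo_list_negation.count("never")+demo_list_negation.count("nobody")
    demo_positive_count=demo_positive_count+1
    if demo_negation_found==0:
        demo_positive_count_adj=demo_positive_count_adj+1
    # overwrite the matched word so that the loop terminates
    demo_list_of_words[demo_position]=""
    demo_found=demo_list_of_words.count(demo_positive_word)
print("Positive words: "+str(demo_positive_count))                 # 2
print("Adjusted positive words: "+str(demo_positive_count_adj))    # 1 (the first 'good' is negated)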


@@ -0,0 +1,70 @@
# -*- coding: utf-8 -*-
"""
Created on Wed Apr 13 22:43:32 2016
@author: Alexander Hillert, Goethe University Frankfurt
"""
# We split the text into words and sentences using regular expressions
import re
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# Open the csv file containing the list of the 200 10-Ks
input_file=open(directory+'10-K_Sample_2011Q1_Input.csv','r',encoding="utf-8")
input_text=input_file.read()
# Create output file
output_file=open(directory+'10-K_Sample_2011Q1_Output_WPS.csv','w',encoding="utf-8")
# Write variable names to the first line of the output file
output_file.write('CIK;Filename;Number_Words;Number_of_Sentences;WPS\n')
# Split the Input File in separate lines
input_text_line=input_text.split("\n")
# In general, there can be empty lines in the input file. The following command
# deletes these lines.
while input_text_line.count("")>0:
input_text_line.remove("")
# Loop over all lines
for i in range(1,len(input_text_line)):
print(str(i))
# split the line into the two variables
variables=input_text_line[i].split(";")
# We need the CIK and the filename
cik=variables[0]
filename=variables[1]
filename=filename.replace('.txt','')
# Open the ith 10-K in the list
input_file_10_k=open(directory+'10-K_Sample_clean/'+cik+"_"+filename+'_clean.txt','r',\
encoding='ascii',errors='ignore')
text=input_file_10_k.read()
# Determine number of sentences and number of words
# DETERMINE THE NUMBER OF WORDS; YOU KNOW THE COMMAND FROM PROBLEMS 7 AND 8.
list_of_words=re.split(XXX, text)
# Determine total number of words
word_count=XXX
# Split the text by symbols that indicate the end of a sentence
# to determine the total number of sentences
list_of_sentences=re.split(XXX, text)
# Determine total number of sentences
sentence_count=XXX
# Ratio of # of words over # of sentences
wps=word_count/sentence_count
# Write cik, file name, total number of words, total number of sentences,
# and WPS to the output file
output_file.write(cik+';'+filename+'_clean.txt;'+str(word_count)+';'+\
str(sentence_count)+';'+str(wps)+'\n')
# Close filing
input_file_10_k.close()
print("Finished")
output_file.close()
input_file.close()
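###############################################################################
# Illustrative sketch (not the official solution): computing words per sentence
# (WPS) for a small made-up text. The split patterns below are my own
# suggestions; sentence ends are approximated by ., !, and ?.
###############################################################################
import re
demo_text="The firm grew. Sales increased by a lot! Did margins improve?"
demo_words=re.split(r'\W{1,}',demo_text)
while demo_words.count("")>0:
    demo_words.remove("")
demo_word_count=len(demo_words)
demo_sentences=re.split(r'[\.!\?]',demo_text)
while demo_sentences.count("")>0:
    demo_sentences.remove("")
demo_sentence_count=len(demo_sentences)
print("Demo WPS: "+str(demo_word_count/demo_sentence_count))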