
Add programming files

- add the code files provided by the instructor
- the programming/files folder with the data files is NOT included
  here due to its size
- add a .gitignore file to exclude the data files' folder
Alexander Hess 2022-08-05 00:05:05 +02:00
commit a37c87d9c8
Signed by: alexander
GPG key ID: 344EA5AB10D868E0
38 changed files with 6416 additions and 0 deletions


@ -0,0 +1,76 @@
# -*- coding: utf-8 -*-
"""
Created on Wed Apr 13 22:43:32 2016
@author: Alexander Hillert, Goethe University Frankfurt
"""
import re
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# Open the dictionary
file_word_list=open(directory+'Complex_Words.txt','r',encoding="utf-8")
word_list=file_word_list.read()
word_list=word_list.lower()
complex_words=word_list.split()
# Open the csv file containing the list of the 200 10-Ks
input_file=open(directory+'10-K_Sample_2011Q1_Input.csv','r',encoding="utf-8")
input_text=input_file.read()
# Create output file
output_file=open(directory+'10-K_Sample_2011Q1_Output_Complex_Tone.csv','w',encoding="utf-8")
output_file.write('CIK;Filename;Number_Words;Number_Complex_Words;Percent_Complex_Words\n')
# Split the input file in separate lines
input_text_line=input_text.split("\n")
# In general, there can be empty lines in the input file. The following command
# deletes these lines.
while input_text_line.count("")>0:
input_text_line.remove("")
# Loop over all lines
for i in range(1,len(input_text_line)):
print(str(i))
# split the line into the two variables
variables=input_text_line[i].split(";")
# We need the CIK and the filename
cik=variables[0]
filename=variables[1]
filename=filename.replace('.txt','')
# Open the ith 10-K in the list
input_file_10_k=open(directory+'10-K_Sample/'+cik+"_"+filename+'_clean.txt','r',\
encoding='ascii',errors='ignore')
input_text_10_k=input_file_10_k.read()
# Use lower case letters
text=input_text_10_k.lower()
# Split the text in words to determine the total number of words
list_of_words=re.split('\W{1,}', text)
# to make sure that empty list elements do not bias the word count, we delete them.
while list_of_words.count("")>0:
list_of_words.remove("")
# Determine total number of words
word_count=len(list_of_words)
# Reset the number of complex words to zero
complex_count=0
# For each complex word, count the number of occurrences
for i in range(len(complex_words)):
complex_count=complex_count+list_of_words.count(complex_words[i])
# Write cik, file name, total number of words, and number of complex words to output file
output_file.write(cik+';'+filename+'_clean.txt;'+str(word_count)+';'\
+str(complex_count)+';'+str(complex_count/word_count)+'\n')
# Close filings
input_file_10_k.close()
print("Finished")
output_file.close()
input_file.close()
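###########################################################################
# Optional illustration (not part of the original solution): the counting
# loop above calls list.count() once per dictionary word, which re-scans the
# whole document each time. Below is a sketch of an alternative based on
# collections.Counter; the function name count_complex is made up for this
# example.
from collections import Counter

def count_complex(words, complex_word_list):
    # Count every token once, then add up the counts of the dictionary words.
    token_counts = Counter(words)
    return sum(token_counts[w] for w in complex_word_list)

# Example: count_complex(['the','transaction','is','material'],
# ['transaction','material']) returns 2.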


@ -0,0 +1,53 @@
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 29 11:07:10 2015
@author: Alexander Hillert, Goethe University Frankfurt
"""
# To determine file size we need the OS package
import os
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# Open the csv file containing the list of the 200 10-Ks
input_file=open(directory+'10-K_Sample_2011Q1_Input.csv','r',encoding="utf-8")
input_text=input_file.read()
# Create output file
output_file=open(directory+'10-K_Sample_2011Q1_File_Size.csv','w',encoding="utf-8")
output_file.write('CIK;Filename;File_size_gross;File_size_net\n')
# Split the input file in separate lines
input_text_line=input_text.split("\n")
# In general, there can be empty lines in the input file. The following command
# deletes these lines.
while input_text_line.count("")>0:
input_text_line.remove("")
# Loop over all lines
for i in range(1,len(input_text_line)):
print(str(i))
# split the line into the two variables
variables=input_text_line[i].split(";")
# We need the CIK and the filename
cik=variables[0]
filename=variables[1]
filename=filename.replace('.txt','')
# File size of the complete submission file (gross file size)
# You have to divide the result by 1024 to get the size in kilobyte
# The file size will be affected by html code and exhibits.
size_gross=os.path.getsize(directory+'10-K_Sample/'+cik+"_"+filename+'.txt')/1024
# File size of the main text file (net file size)
# You have to divide the result by 1024 to get the size in kilobyte
size_net=os.path.getsize(directory+'10-K_Sample/'+cik+"_"+filename+'_clean.txt')/1024
output_file.write(cik+';'+filename+';'+str(size_gross)+';'+str(size_net)+'\n')
print("Finished")
output_file.close()
input_file.close()
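###########################################################################
# Minimal illustration (not part of the original solution): os.path.getsize()
# returns the size in bytes, so dividing by 1024 converts it to kilobytes.
# As an example, we reuse the input csv file that was opened above.
example_size_kb=os.path.getsize(directory+'10-K_Sample_2011Q1_Input.csv')/1024
print("Size of the input csv in KB: "+str(example_size_kb))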


@ -0,0 +1,167 @@
# -*- coding: utf-8 -*-
"""
Created on Tue Jul 11 09:19:54 2017
@author: Alexander Hillert, Goethe University Frankfurt
"""
# We need regular expressions and counters (->collections)
import re
import collections
# for the bigram part, the sentence tokenizer is helpful
from nltk.tokenize import sent_tokenize
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# Open the csv file containing the list of the 10-Ks from MSFT, KO, and 3M.
input_file=open(directory+'list_10-K_filings_textual_similarity.csv','r',encoding="utf-8")
input_text=input_file.read()
# Split the input file in separate lines
input_text_line=input_text.split("\n")
# In general, there can be empty lines in the input file. The following command
# deletes these lines.
while input_text_line.count("")>0:
input_text_line.remove("")
# Create an empty counter variable
words_counter=collections.Counter()
# variable is needed only for an alternative solution
words_counter1=collections.Counter()
# counter for the extra task
bigram_counter=collections.Counter()
# Loop over all lines
for i in range(1,len(input_text_line)):
print(str(i))
# split the line into the eight variables
variables=input_text_line[i].split(";")
# We need the CIK (1st column) and the filename (8th column)
cik=variables[0]
filename_parts=re.split('/',variables[7])
filename=filename_parts[3].replace('.txt','')
# Open the ith 10-K in the list; remember to specify the encoding
# The files are available in the zip file "10-K_Textual_Similarity_edited.zip".
input_file_10_k=open(directory+'10-K_Textual_Similarity/'+cik+'_'+\
filename+'_edited.txt', 'r', encoding='ascii', errors='ignore')
# if the command above does not work (error like "file not found" or "directory not found")
# please use the following command:
#input_file_10_k=open(directory+'10-K_Textual_Similarity_edited/'+cik+'_'+filename+'_edited.txt','r',encoding='ascii',errors='ignore')
# read the content from the file
input_text_10_k=input_file_10_k.read()
# use lower case only so that it does not matter whether a word is at
# the beginning of a sentence ("The") or within a sentence ("the").
# Please note that this can be problematic, e.g. "US" -> United States vs.
# us (personal pronoun)
input_text_10_k_lower=input_text_10_k.lower()
# Split text into words
list_of_words=re.split('\W{1,}',input_text_10_k_lower)
# There can be empty ("") list elements -> remove them
while list_of_words.count("")>0:
list_of_words.remove("")
# optional commands to remove words that only contain "_"
'''
for word in list_of_words:
if re.sub("[a-zA-Z]","",word)!="":
#if word.count("_")>0:
list_of_words.remove(word)
'''
# Add the words to our counter
words_counter=words_counter+collections.Counter(list_of_words)
# alternative solution
words_counter1.update(list_of_words)
#############################################
# optional part for the extra task on bigrams
#############################################
# create an empty list for the bigrams
bigram_list=[]
# split the text into sentences
list_of_sentences=sent_tokenize(input_text_10_k)
# create the BIGRAM IN EACH SENTENCE
for sentence in list_of_sentences:
# make the sentence lower case
sentence_lower=sentence.lower()
# split the sentence into words
list_of_words=re.split("\W{1,}",sentence_lower)
# remove empty elements
while list_of_words.count("")>0:
list_of_words.remove("")
#print("these are the words of the sentence:\n"+str(list_of_words))
# go over all potential two word combinations in the sentence.
for word_number in range(0,len(list_of_words)-1):
bigram_list.append(list_of_words[word_number]+' '+list_of_words[word_number+1])
bigram_counter=bigram_counter+collections.Counter(bigram_list)
# end of extra task
# Close the 10-K filing
input_file_10_k.close()
input_file.close()
######################
# Top 100 single words
######################
# Open the csv file containing the 100 most frequently used words
output_file=open(directory+'Problem_12_100_most_frequent_words.csv','w',encoding="utf-8")
output_file.write("rank;word;count\n")
# Get the 100 most frequent words
top_100_words=words_counter.most_common(100)
# for the alternative solution
#top_100_words=words_counter1.most_common(100)
# Write the 100 most frequent words to the csv file.
# Remember Python starts counting at 0, while humans start at 1.
# So, the most frequent word (rank 1 in human counting) is element 0 for Python.
# Consequently, to get a consistent table, we must use the value i for the rank
# but access element i-1.
for i in range(1,101):
output_file.write(str(i)+";"+str(top_100_words[i-1][0])+";"+\
str(top_100_words[i-1][1])+"\n")
# Close the csv file
output_file.close()
######################
# Extra task
# Top 100 bigrams
######################
# Open the csv file containing the 100 most frequently used BIGRAMS
output_file_bigram=open(directory+'Problem_12_100_most_frequent_bigrams.csv','w',encoding="utf-8")
output_file_bigram.write("rank;word;count\n")
# Get the 100 most frequent bigrams
top_100_bigrams=bigram_counter.most_common(100)
# Write the 100 most frequent bigrams to the csv file -> same approach as for the single words.
for i in range(1,101):
output_file_bigram.write(str(i)+";"+str(top_100_bigrams[i-1][0])+";"+\
str(top_100_bigrams[i-1][1])+"\n")
# Close the csv file
output_file_bigram.close()
print("Task done!")


@ -0,0 +1,96 @@
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 29 11:07:10 2015
@author: Alexander Hillert, Goethe University Frankfurt
"""
# We need regular expressions, tokenize (to identify words), and stemming.
import re
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# Open the csv file containing the list of the 10-Ks from MSFT, KO, and 3M.
input_file=open(directory+'list_10-K_filings_textual_similarity.csv','r',encoding="utf-8")
input_text=input_file.read()
# Split the input file into separate lines
input_text_line=input_text.split("\n")
# In general, there can be empty lines in the input file. The following command
# deletes these lines.
while input_text_line.count("")>0:
input_text_line.remove("")
# Loop over all lines
#for i in range(1,len(input_text_line)):
# for illustration filings 1 to 3 only
for i in range(1,4):
print(str(i))
# split the line into the eight variables
variables=input_text_line[i].split(";")
# We need the CIK (1st column) and the filename (8th column)
cik=variables[0]
filename_parts=re.split('/',variables[7])
filename=filename_parts[3].replace('.txt','')
# Open the ith 10-K in the list; remember to specify the encoding
input_file_10_k=open(directory+'10-K_Textual_Similarity/'+cik+'_'+filename\
+'_edited.txt', 'r', encoding='ascii', errors='ignore')
# Get the text of the 10-K
input_text_10_k=input_file_10_k.read()
# We need to tokenize the text because stemming works only on a word-by-word basis.
# Stemming an entire document without splitting it into words does not work!
# The problem is that \n gets lost in this process --> we cannot easily
# recreate the document.
# Idea: replace \n by \n plus some indicator that there was a line break.
# Here, I choose "LINEBREAKMARK"
input_text_10_k=input_text_10_k.replace("\n","\nLINEBREAKMARK ")
# Split text into words
# There are two alternatives.
# Alternative 1 (our standard approach):
#word_list=re.split("\W{1,}",input_text_10_k.lower())
# Alternative 2 (keeps symbols like ,;.):
word_list=word_tokenize(input_text_10_k.lower())
# Stem the text
text_stemmed=''
for word in word_list:
# The following two cases are designed to improve the formatting of the
# output file. It is not needed for the subsequent analyses.
# Case 1: 'word' is not an actual word but a symbol. -> there should
# be no whitespace between the previous words and this symbol.
# \A and \Z indicate the beginning and end of string -> the 'word' is just
# the symbol but not a combination of letters and symbols.
if re.search("\A[\.\?!,:;']{1,}\Z",word):
text_stemmed=text_stemmed+word
# Case 2: the word is an actual word -> have a whitespace included.
else:
text_stemmed=text_stemmed+" "+PorterStemmer().stem(word)
# The simple solution (without restoring the formatting of the text) is:
#text_stemmed=text_stemmed+" "+PorterStemmer().stem(word)
# To recreate the text, we need to replace the line break indicators by \n
# Because of the stemming "LINEBREAKMARK" becomes "linebreakmark".
text_stemmed=text_stemmed.replace("linebreakmark","\n")
# Open the output file for the stemmed text
output_file_10_k=open(directory+'10-K_Textual_Similarity/'+cik+'_'+filename\
+'_stemmed.txt', 'w', encoding='ascii', errors='ignore')
output_file_10_k.write(text_stemmed)
output_file_10_k.close()
input_file_10_k.close()
input_file.close()
print("Task done!")


@ -0,0 +1,287 @@
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 29 11:07:10 2015
@author: Alexander Hillert, Goethe University Frankfurt
"""
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from collections import Counter
ps=PorterStemmer()
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# Open the csv file containing the list of the 10-Ks from MSFT, KO, and 3M
input_file=open(directory+'list_10-K_filings_textual_similarity.csv','r',encoding="utf-8")
input_text=input_file.read()
# Split the input file in separate lines
input_text_line=input_text.split("\n")
# In general, there can be empty lines in the input file. The following command
# deletes these lines
while input_text_line.count("")>0:
input_text_line.remove("")
# Open the output csv file in which we write the similarities
output_file=open(directory+'list_10-K_filings_textual_similarity_jaccard.csv','w',encoding="utf-8")
# Write variable names to first line
output_file.write(input_text_line[0]+';Jaccard;Jaccard_own_stop_words;\
Jaccard_NLTK_stop_words;Jaccard_stemmed;Jaccard_stemmed_own_stop_words;\
Jaccard_stemmed_NLTK_stop_words\n')
# Read own stop word list
# This list has been created by manually selecting words from the csv-file
# 100_most_frequent_words.csv, which is created by the Python program
# "Problem_12_Most_Frequent_Words.py".
# Simply delete words you consider to be meaningless and that are frequently
# used.
stop_word_file=open(directory+'Stop_Word_List_Alexander.csv','r',encoding="utf-8")
stop_word_text=stop_word_file.read()
stop_word_line=stop_word_text.split("\n")
stop_word_line.remove("")
own_stop_words=[""]
for i in range(1,len(stop_word_line)):
stop_word=stop_word_line[i].split(";")[1]
own_stop_words.append(stop_word)
own_stop_words.remove("")
print("This is the list of my stop words:")
print(own_stop_words)
# Read NLTK stop word list
NLTK_stop_words=set(stopwords.words("english"))
print("This is the list of NLTK stop words:")
print(NLTK_stop_words)
# Set default values for the variables.
# This is not required. However, if you don't do it, Spyder will flag the line
# jaccard_similarity=jaccard(word_list_edited,word_list_old_edited)
# as incorrect because word_list_old_edited is not yet defined at that point in
# the program code. In this specific example, this does not cause an error, as
# we do not enter the if condition when i=1 -> the "old" variables are always
# assigned before their first use in the jaccard() calls.
word_list_old_edited=[]
word_list_edited=[]
word_list_old_NLTK_filtered=""
word_list_old_own_filtered=""
word_list_old_edited_stemmed=""
word_list_old_own_filtered_stemmed=""
word_list_old_NLTK_filtered_stemmed=""
#######################################################
# Define a function that computes Jaccard similarity
# As we need these operations several times, it is
# helpful to use a function.
######################################################
# beginning of the function
def jaccard(text1,text2):
counter1=Counter(text1)
counter2=Counter(text2)
intersection=counter1 & counter2
union=counter1 | counter2
return len(intersection)/len(union)
# end of the function
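# Quick sanity check of the function (illustration only, not part of the
# original solution): with multisets, the intersection takes the minimum
# count per word and the union the maximum count. The two made-up token
# lists below share 2 of 4 distinct words -> Jaccard similarity of 0.5.
print("Example Jaccard similarity: "+str(jaccard(["the","cat","sat"],["the","cat","ran"])))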
# Loop over all lines
for i in range(1,len(input_text_line)):
print(str(i))
# split the line into the eight variables
variables=input_text_line[i].split(";")
# We need the CIK (1st column) and the filename (8th column)
cik=variables[0]
filename_parts=re.split('/',variables[7])
filename=filename_parts[3].replace('.txt','')
# Write the information from the input file to the output file
# we do not add a line break at the end, as we must append the similarity
# score first.
output_file.write(input_text_line[i])
# Open the ith 10-K; remember to specify the encoding
input_file_10_k=open(directory+'10-K_Textual_Similarity/'+cik+'_'+filename+\
'_edited.txt', 'r', encoding='ascii', errors='ignore')
input_text_10_k=input_file_10_k.read()
# check whether the previous entry of the list is from the same firm
permco=input_text_line[i].split(";")[1]
permco_old=input_text_line[i-1].split(";")[1]
# Split text into words
word_list_edited=word_tokenize(input_text_10_k.lower())
############################################
# Sub Task 1: Jaccard for the _edited.txt
############################################
# compute Jaccard similarity if the previous filing is from the same firm
if permco==permco_old:
# the command calls the jaccard function that we have defined above.
# in the function, text1=word_list_edited and text2=word_list_old_edited.
jaccard_similarity=jaccard(word_list_edited,word_list_old_edited)
output_file.write(";"+str(jaccard_similarity))
else:
# The previous filing is not from the same firm -> cannot compute Jaccard similarity
output_file.write(";")
# Save the current word vector to a separate variable for the comparison of the next report.
word_list_old_edited=word_list_edited
############################################
# Sub Task 2: Jaccard for the _edited.txt
# AND REMOVE STOP WORDS - OWN LIST
############################################
# remove stop words using personal stop word list
word_list_own_filtered=[]
for word in word_list_edited:
if word not in own_stop_words:
word_list_own_filtered.append(word)
# compute Jaccard similarity if the previous filing is from the same firm
if permco==permco_old:
jaccard_similarity=jaccard(word_list_own_filtered,\
word_list_old_own_filtered)
output_file.write(";"+str(jaccard_similarity))
else:
# The previous filing is not from the same firm -> cannot compute Jaccard similarity
output_file.write(";")
# Save the current word vector to a separate variable for the comparison of the next report.
word_list_old_own_filtered=word_list_own_filtered
############################################
# Sub Task 3: Jaccard for the _edited.txt
# AND REMOVE STOP WORDS - NLTK LIST
############################################
# remove stop words using NLTK stop word list
word_list_NLTK_filtered=[]
for word in word_list_edited:
if word not in NLTK_stop_words:
word_list_NLTK_filtered.append(word)
# compute Jaccard similarity if the previous filing is from the same firm
if permco==permco_old:
jaccard_similarity=jaccard(word_list_NLTK_filtered,\
word_list_old_NLTK_filtered)
output_file.write(";"+str(jaccard_similarity))
else:
# The previous filing is not from the same firm -> cannot compute Jaccard similarity
output_file.write(";")
# Save the current word vector to a separate variable for the comparison of the next report.
word_list_old_NLTK_filtered=word_list_NLTK_filtered
############################################
# Sub Task 4: Jaccard for the _stemmed.txt
############################################
# Create stemmed text
word_list_edited_stemmed=[]
for word in word_list_edited:
word_list_edited_stemmed.append(ps.stem(word))
# compute Jaccard similarity if the previous filing is from the same firm
if permco==permco_old:
jaccard_similarity=jaccard(word_list_edited_stemmed,word_list_old_edited_stemmed)
output_file.write(";"+str(jaccard_similarity))
else:
# The previous filing is not from the same firm -> cannot compute Jaccard similarity
output_file.write(";")
# Save the current word vector to a separate variable for the comparison of the next report.
word_list_old_edited_stemmed=word_list_edited_stemmed
############################################
# Sub Task 5: Jaccard for the _stemmed.txt
# AND REMOVE STOP WORDS - OWN LIST
############################################
# Caution: in general, it is not clear whether you should first stem or
# first remove stop words.
# However, in this specific case, you should remove the stop words first
# and then stem, as your stop word list is based on the inflected text.
# remove stop words using personal stop word list
word_list_own_filtered=[]
for word in word_list_edited:
if word not in own_stop_words:
word_list_own_filtered.append(word)
# Create stemmed text
word_list_own_filtered_stemmed=[]
for word in word_list_own_filtered:
word_list_own_filtered_stemmed.append(ps.stem(word))
# compute Jaccard similarity if the previous filing is from the same firm
if permco==permco_old:
jaccard_similarity=jaccard(word_list_own_filtered_stemmed,\
word_list_old_own_filtered_stemmed)
output_file.write(";"+str(jaccard_similarity))
else:
# The previous filing is not from the same firm -> cannot compute Jaccard similarity
output_file.write(";")
# Save the current word vector to a separate variable for the comparison of the next report.
word_list_old_own_filtered_stemmed=word_list_own_filtered_stemmed
############################################
# Sub Task 6: Jaccard for the _stemmed.txt
# AND REMOVE STOP WORDS - NLTK LIST
############################################
# Caution: it is not clear whether you should first stem or first remove
# stop words. However, the NLTK stop word list seems to be based on inflected
# text, e.g. the word "having" is included. "Having" would be stemmed to "have".
# Hence, the stop word list itself does not appear to be stemmed, and you
# should remove the stop words first and then stem.
# remove stop words using NLTK stop word list
word_list_NLTK_filtered=[]
for word in word_list_edited:
if word not in NLTK_stop_words:
word_list_NLTK_filtered.append(word)
# Create stemmed text
word_list_NLTK_filtered_stemmed=[]
for word in word_list_NLTK_filtered:
word_list_NLTK_filtered_stemmed.append(ps.stem(word))
# compute Jaccard similarity if the previous filing is from the same firm
if permco==permco_old:
jaccard_similarity=jaccard(word_list_NLTK_filtered_stemmed,\
word_list_old_NLTK_filtered_stemmed)
output_file.write(";"+str(jaccard_similarity))
else:
# The previous filing is not from the same firm -> cannot compute Jaccard similarity
output_file.write(";")
# Save the current word vector to a separate variable for the comparison of the next report.
word_list_old_NLTK_filtered_stemmed=word_list_NLTK_filtered_stemmed
# Write line break to output file
output_file.write("\n")
# Close 10-K filing
input_file_10_k.close()
input_file.close()
output_file.close()
stop_word_file.close()
print("Task done!")


@ -0,0 +1,161 @@
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 21 09:38:32 2022
@author: Alexander Hillert, Goethe University Frankfurt
"""
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
# adjust the directory to your folder
directory="C:/Lehre/Machine Learning/Data/"
# import the data for this problem
# NOTE: IT MIGHT TAKE 3 TO 5 MINUTES TO OPEN THE DATA
data_frame=pd.read_csv(directory+"form_10-Ks_machine_learning_2007_2008_all_variables_v1.csv",sep=";")
# The rows of the data are the Form 10-K filings. Each line is one filing.
# The columns are the variables. After some identifying information,
# you find the word frequencies, i.e., how often a word (e.g., "the") shows up
# in a 10-K (e.g., 100 times)
# WARNING: THE DATA SET IS TOO LARGE TO BE DISPLAYED -> Variable Explorer
# and Console will crash.
# However, you can pick a small subset of the data and look at it.
# It lists all columns (variables) and the first three observations.
data_frame_example=data_frame.head(3)
# you can click on this variable in the variable explorer without Spyder crashing.
# To see the variables included in the data use the following command
data_frame_column_names=data_frame.columns
# you can click on this variable in the variable explorer without Spyder crashing.
# This variable shows all column/variable names in a vector.
# split the data set into the training and testing data
# we use the filings from year 2007 as training data
data_frame_train=data_frame[data_frame.year==2007]
# and the filings from year 2008 as testing data
data_frame_test=data_frame[data_frame.year==2008]
# put the cumulative abnormal return around the filing date into a new variable.
# we follow Loughran and McDonald (2011) and use the CAR from t to t+4.
# training data
filing_car_train=data_frame_train["excess_ret_t0_t4"]
# testing data
filing_car_test=data_frame_test["excess_ret_t0_t4"]
# so far, you have absolute word counts. For example, "loss" is found 5 times.
# As the length of the 10-Ks can be different, we scale by the number of words
# in the 10-K.
document_length_train=data_frame_train["number_of_words"]
document_length_test=data_frame_test["number_of_words"]
# the word frequencies are our independent variables -> restrict the data frame
# to those variables and drop all variables that are not needed
data_frame_train=data_frame_train.drop(columns=["cik","year","month","link","filing_type","filing_date","excess_ret_t0_t4","number_of_words"])
data_frame_test=data_frame_test.drop(columns=["cik","year","month","link","filing_type","filing_date","excess_ret_t0_t4","number_of_words"])
# compute relative frequencies, i.e., divide the absolute word count by document length
data_frame_train=data_frame_train.div(document_length_train, axis=0)
data_frame_test=data_frame_test.div(document_length_test, axis=0)
# standardize the data frames
# training data
data_frame_train_mean=np.mean(data_frame_train,axis=0)
data_frame_train_sd=np.std(data_frame_train, axis=0, ddof=1)
data_frame_train_standardized=(data_frame_train-data_frame_train_mean)/data_frame_train_sd
# testing data
data_frame_test_mean=np.mean(data_frame_test,axis=0)
data_frame_test_sd=np.std(data_frame_test, axis=0, ddof=1)
data_frame_test_standardized=(data_frame_test-data_frame_test_mean)/data_frame_test_sd
# There can be missing values in the standardized variables.
# They arise if the word count for a specific word is always zero in the training
# or in the testing data. In this case, the standard deviation is zero ->
# division by zero -> NaN.
# We replace these missing values by zero.
# training data
data_frame_train_standardized=data_frame_train_standardized.fillna(0)
# testing data
data_frame_test_standardized=data_frame_test_standardized.fillna(0)
##########################
# Ridge regression
##########################
print("\nRidge regression - Using cross-validation\n")
# Regress the CARs on the word frequencies using Ridge regressions with cross-validation.
# In this regression, we use the training data.
# We use five-fold cross-validation.
# Recommendation for initial alphas/lambdas: 100000, 150000, and 200000
# The optimal alpha is at around 140000.
regression_Ridge_cv=RidgeCV(alphas=[135000,137000,140000,143000,145000], fit_intercept=True,cv=5).fit(data_frame_train_standardized,filing_car_train)
# get the optimal lambda
alpha_optimal_cv=regression_Ridge_cv.alpha_
print("The optimal alpha is "+str(alpha_optimal_cv))
# what is the R2 in the training and testing data?
print("The R2 in the training data is: "+str(regression_Ridge_cv.score(data_frame_train_standardized,filing_car_train)))
print("The R2 in the testing data is: "+str(regression_Ridge_cv.score(data_frame_test_standardized,filing_car_test)))
# Mean squared error using the cross-validated model
# predict y in the full training sample
filing_car_train_predicted_Ridge=regression_Ridge_cv.predict(data_frame_train_standardized)
# predict y in the testing sample
filing_car_test_predicted_Ridge=regression_Ridge_cv.predict(data_frame_test_standardized)
# Determine the MSE
print("The MSE in the full training data is: "+str(mean_squared_error(filing_car_train, filing_car_train_predicted_Ridge)))
print("The MSE in the testing data is: "+str(mean_squared_error(filing_car_test, filing_car_test_predicted_Ridge)))
######################
# LASSO regression
######################
print("\nLASSO regression - Using cross-validation\n")
# Regress the CARs on the word frequencies using LASSO regressions with cross-validation.
# In this regression, we use the training data.
# We use five-fold cross-validation.
# Recommendation for initial alphas/lambdas: 0.5, 1, and 1.5
# The optimal alpha is at around 0.86.
regression_Lasso_cv=LassoCV(alphas=[0.85,0.86,0.87,0.88,0.89], fit_intercept=True,cv=5).fit(data_frame_train_standardized,filing_car_train)
# get the optimal lambda
alpha_optimal_cv=regression_Lasso_cv.alpha_
print("The optimal alpha is "+str(alpha_optimal_cv))
# get the R2 in the training data
print("The R2 in the training data is: "+str(regression_Lasso_cv.score(data_frame_train_standardized,filing_car_train)))
# ... and testing data
print("The R2 in the testing data is: "+str(regression_Lasso_cv.score(data_frame_test_standardized,filing_car_test)))
# Mean squared error using the cross-validated model
# predict y in the full training sample
filing_car_train_predicted_Lasso=regression_Lasso_cv.predict(data_frame_train_standardized)
# predict y in the testing sample
filing_car_test_predicted_Lasso=regression_Lasso_cv.predict(data_frame_test_standardized)
# Determine the MSE
print("The MSE in the full training data is: "+str(mean_squared_error(filing_car_train, filing_car_train_predicted_Lasso)))
print("The MSE in the testing data is: "+str(mean_squared_error(filing_car_test, filing_car_test_predicted_Lasso)))
############################################################
# Compare the betas from the Ridge and the LASSO regressions
############################################################
output_file=open(directory+"comparison_coefficients_Ridge_LASSO_10-Ks.csv","w",encoding="utf-8")
output_file.write("index;word;coefficient_Ridge;coefficient_LASSO\n")
# get the list of coefficients
for i in range (0,len(data_frame_train.columns)):
output_file.write(str(i)+';'+data_frame_train.columns[i]+';'+str(regression_Ridge_cv.coef_[i])+';'+str(regression_Lasso_cv.coef_[i])+'\n')
output_file.close()
print("Completed!")


@ -0,0 +1,121 @@
# -*- coding: utf-8 -*-
"""
Created on Mon Nov 13 21:40:57 2017
@author: Alexander Hillert, Goethe University Frankfurt
"""
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# Task 1: Open and print
# Open the Txt-file
print("\nTask 1 starts here!\n")
input_file=open(directory+'Fun_with_Python.txt','r')
input_text=input_file.read()
# Alternative with one command
input_text=open(directory+'Fun_with_Python.txt','r').read()
print(input_text)
# Task 2: Write text to output file
# Create file 'More_fun_with_Python.txt'
print("\nTask 2 starts here!\n")
output_file=open(directory+'More_fun_with_Python.txt','w')
output_file.write("Hallo\n")
output_file.write(input_text)
output_file.close()
# Task 3: loop
print("\nTask 3 starts here!\n")
# Alternative 1: While loop
i = 1
while i<=10:
print('Iteration Number: '+str(i))
i=i+1
# Example of a nested loop
j=1
while j<3:
print('Hallo')
j=j+1
# Alternative 2: For loop
for i in range(0,10):
print('Iteration Number: '+str(i))
# there is also a shorter notation: if there is no lower bound it is assumed to be zero
for i in range(10):
print('Iteration Number: '+str(i))
# Task 4: Print text line by line
# Print text line by line
print("\nTask 4 starts here!\n")
line_of_text=input_text.split('\n')
i=0
while i<len(line_of_text):
print("Line "+str(i+1)+": "+line_of_text[i])
i=i+1
# First alternative using a for loop
for i in range(0,len(line_of_text)):
print("Line "+str(i+1)+": "+line_of_text[i])
# Second alternative
# for ... in -> for each element of the list do ...
# line can be any name; it refers to the elements of the list
i=1
for line in line_of_text:
print("Line "+str(i)+": "+line)
i=i+1
# Task 5: count 'good'
# Count how often the word 'good' appears in the text
print("\nTask 5 starts here!\n")
number_good=input_text.count('good')
print(number_good)
# you can write the command in a shorter format
print(input_text.count('good'))
# Task 6a
# Print lines with the word 'good'
print("\nTask 6a starts here!\n")
for i in range(len(line_of_text)):
if line_of_text[i].count('good')>=1:
print(line_of_text[i])
# Task 7
# Print lines that start with the word 'This'
print("\nTask 7 starts here!\n")
print("\n'This' with a capital T.\n")
for i in range(len(line_of_text)):
if line_of_text[i].startswith('This')>=1:
print(line_of_text[i])
print("\n'this' with a lower case t.\n")
for i in range(len(line_of_text)):
if line_of_text[i].startswith('this')>=1:
print(line_of_text[i])
print("Yes, the command is case sensitive (2 vs. 0 matches)!")
# Task 8
# Replace the word 'good' by 'excellent'
print("\nTask 8 starts here!\n")
new_text=input_text.replace("good","excellent")
print(new_text)
# For illustration only
print("\nFor illustration only\n")
for i in range(len(line_of_text)):
new_line_of_text=line_of_text[i].replace('good','excellent')
# print the new line IF there is a change.
if not new_line_of_text==line_of_text[i]:
print(new_line_of_text)
input_file.close()
output_file.close()
print("DONE")


@ -0,0 +1,72 @@
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 29 09:21:46 2015
@author: Alexander Hillert, Goethe Uni Frankfurt
"""
import re
# Please adjust the directory to your machine.
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# Open the txt file with the SEC filings
sec_filings_file=open(directory+'formidx_1998Q1.txt','r')
sec_filings_text=sec_filings_file.read()
# Create output file
output_file=open(directory+'SEC_Filings_Output.csv','w')
# Create first line with variable names
# I use semicolons as separator in csv files. You can also use any other symbol.
# However, you should make sure that the separator is not part of the data/text
# you write to the file.
# For example, it would be problematic if you use comma as separator and have
# company names like "AMERICAN HEALTHCORP, INC." or "AMERICAN FUNDING, INC."
output_file.write("Form_Type;Company_Name;CIK;Filing_Date;Link\n")
# Split the input file into separate lines
sec_filings_line=sec_filings_text.split("\n")
# Loop over all lines
for i in range(len(sec_filings_line)):
# Does the line refer to a form 10-K file?
# As pointed out by Loughran and McDonald (2011), many firms mislabelled
# their 10-K filings as 10-K405 filings. Thus, I included these filings
# as well.
# The condition below excludes amendments to 10-Ks ("10-K/A" and "10-K405/A").
# Depending on the research question at hand one could include amendments as well.
# 10KSB filings (small businesses) could also be included.
match_10k=re.search("\A10-K( |405 )",sec_filings_line[i])
if match_10k:
#if sec_filings_line[i].startswith("10-K ")==1 or sec_filings_line[i].startswith("10-K405 ")==1:
# Split the line such that the information can be saved in separate
# variables
# Each information item has a fixed length in the overview files of the
# SEC.
# Filing type: position 1 to 12
# Remember Python starts counting at 0 and does not include the upper bound
filing_type=sec_filings_line[i][:12]
# Company name: position 13 to 74
company_name=sec_filings_line[i][12:74]
# CIK: position 75 to 86
cik=sec_filings_line[i][74:86]
# Filing date: position 87 to 98
filing_date=sec_filings_line[i][86:98]
# Link: position 99 to end of line
link=sec_filings_line[i][98:]
# Is the 10-K filed between March 10 and March 20?
# The filing date is in the format "YYYY-MM-DD" (e.g. "1998-03-31")
filing_day=filing_date[8:10]
filing_month=filing_date[5:7]
# Is the Filing Month March?
if int(filing_month)==3 and int(filing_day)>=10 and int(filing_day)<=20:
# The filing meets the conditions -->
# Write output to the csv file
output_file.write(filing_type+";"+company_name+";"+cik+";"+filing_date+";"+link+"\n")
sec_filings_file.close()
output_file.close()
print("DONE")


@ -0,0 +1,95 @@
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 29 11:07:10 2015
@author: Alexander Hillert, Goethe Uni Frankfurt
"""
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# We need the urllib package
import urllib.request
# To automatically create folders we need the os-module (OS: Operating System)
import os
# Define a user agent
# Information on user agents are from https://docs.python.org/3/howto/urllib2.html:
# "Some websites dislike being browsed by programs, or send different versions
# to different browsers. By default urllib identifies itself as Python-urllib/x.y
# (where x and y are the major and minor version numbers of the Python release,
# e.g. Python-urllib/2.5), which may confuse the site, or just plain not work.
# The way a browser identifies itself is through the User-Agent header."
opener = urllib.request.build_opener()
# The SEC recently rejected requests from Python-urllib/x.y user agent (see above)
# To still automatically download files, you have different options.
# I have listed three examples below but there are many more:
# For a comprehensive list see, e.g.:
# https://developers.whatismybrowser.com/useragents/explore/software_type_specific/web-browser/
#opener.addheaders = [('User-agent', 'Mozilla')]
#opener.addheaders = [('User-agent', 'Chrome')]
opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)')]
urllib.request.install_opener(opener)
# Open the csv file from part 1 of the problem
input_file=open(directory+'SEC_Filings_Output.csv','r')
input_text=input_file.read()
# Split the Input File in separate lines
input_text_line=input_text.split("\n")
# sometimes you have empty lines after a split command.
# You can remove them using the following command
while input_text_line.count("")>0:
input_text_line.remove("")
# Create a subfolder in which the 10-K filings are saved.
# When you download a large number of filings I recommend using subfolders for
# each year or even for each year-month combination.
# The option "exist_ok=True" makes sure that you do not get an error if the
# folder already exists.
os.makedirs(directory+"10-Ks/", exist_ok=True)
# Loop over all lines of the csv file
#for i in range(1,len(input_text_line)):
# To avoid having to download hundreds of files when we discuss the solution
# the loop stops at 20. (Remember the upper bound is not included.)
for i in range(1,21):
# split the line into the five variables
variables=input_text_line[i].split(";")
# We only need the cik and the link.
# The cik is the 3rd variable. However, the numbering of lists starts
# at zero -> 2nd item of the list "variables"
# The link is the 5th variable -> 4th item of the list "variables"
cik=variables[2]
#cik=cik.replace(" ","")
cik=cik.strip()
link=variables[4]
#link=link.replace(" ","")
link=link.strip()
# Find the filename
# The link consists of different parts:
# For example: edgar/data/1000753/0000950129-98-001035.txt
link_parts=link.split("/")
# 1st part: edgar
# 2nd part: data
# 3rd part: cik
# 4th part: file name -> index 3 of the list
filename=link_parts[3]
###########################################################################
############################ WARNING ######################################
# The filename does NOT uniquely identify the SEC filings as different firms (CIKs)
# may use the same filename. Thus, when you only use the filename files
# might be overwritten. To avoid this problem you need to have a unique name.
# Combining CIK and filename results in a unique identifier, as the
# filename appears only once per firm (CIK).
# -> use the combination of CIK and filename: cik_filename
###########################################################################
urllib.request.urlretrieve("http://www.sec.gov/Archives/"+link,\
directory+"10-Ks/"+cik+"_"+filename)
input_file.close()
print("DONE")


@ -0,0 +1,144 @@
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 12 15:50:22 2016
@author: Alexander Hillert, Goethe University Frankfurt
"""
# Import regular expressions and BeautifulSoup
import re
from bs4 import BeautifulSoup
# Please adjust the directory to your machine.
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# Open the document
input_file=open(directory+'Exercise_4_Application_Regular_Expressions.txt','r',encoding="utf-8")
input_text=input_file.read()
#######################
# Task 1: remove tables
#######################
# Approach
# We search for tables until we find no more html tags that indicate the
# beginning of a table.
# Search for the start html-tag <TABLE>
table_match=re.search('<TABLE>', input_text)
print("This is the result of the re.search command:")
print(table_match)
while table_match:
# When we have identified a match, i.e. the start of a table, we save
# the position of the beginning of the table in the variable "start_table"
table_start_match=re.search('<TABLE>', input_text)
start_table=table_start_match.start()
# Next, we search for the corresponding html tag that indicates the end of
# the table and save the end position to the variable "end_table"
table_end_match=re.search('</TABLE>', input_text)
end_table=table_end_match.end()
# We can print the text between the start and end html tag to check whether
# the table has been identified correctly.
print("The text below is a table!\n"+input_text[start_table:end_table]+"\n")
# the text between the beginning and end of the html tags is the part which
# we would like to delete.
# Consequently, we keep the text before the beginning of the table as well
# as the text after the ending of the table.
input_text=input_text[:start_table]+input_text[end_table:]
# Next, we need to check whether there is another table in the rest of the
# text.
table_match=re.search('<TABLE>', input_text)
# As long as "table_match" exists, i.e. we regex result in a match, the loop
# will continue.
#########################
# Task 2: remove Exhibits
#########################
# Exhibits have the following structure
# <DOCUMENT>
# <TYPE>EX...
# ...
# </DOCUMENT>
exhibit_match=re.search('<TYPE>EX', input_text)
while exhibit_match:
exhibit_start_match=re.search('<TYPE>EX', input_text)
start_exhibit=exhibit_start_match.start()
# As the exhibits are at the end of the 10-K filing it would not be
# necessary to include an end position. We could also drop the entire text
# after "<TYPE>EX"
# It is important that we search for the </DOCUMENT> only after the exhibit
# started. Otherwise, we could get the end of the main document.
exhibit_end_match=re.search('</DOCUMENT>', input_text[start_exhibit:])
end_exhibit=start_exhibit+exhibit_end_match.end()
# Print the identified text to check whether the exhibit has been identified
# correctly
print("The text below is an exhibit!\n"+input_text[start_exhibit:end_exhibit]+"\n")
input_text=input_text[:start_exhibit]+input_text[end_exhibit:]
# Check whether there are further exhibits
exhibit_match=re.search('<TYPE>EX', input_text)
##########################
# Task 3: remove html code
##########################
# Alternative 1: remove html code without Beautiful Soup
text=re.sub('<[^>]{1,}>', '', input_text)
# This regex searches for a "<" followed by at least one character that must not
# equal > and is completed by >.
# You might have thought about using the following command
#text=re.sub('<.{1,}>', '', input_text)
# However, this command has a problem, as it would delete the following line
# entirely: <page> This is some text that should remain <page>
# The .{1,} would match 'page> This is some text that should remain <page', as
# regexes are greedy. The [^>]{1,} avoids this problem because it cannot match >.
# Consequently, in the example only the two "<page>" would be deleted.
# You can verify this by using regex101.com (remember to check "Python" in the
# left menu of the webpage)
# Alternative 2: remove html code using Beautiful Soup
html_text=BeautifulSoup(input_text, 'html.parser')
text=html_text.get_text()
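# Small demonstration of the greediness point above (illustration only):
# the two substitutions act on a made-up example line and leave the
# variable "text" untouched.
greedy_example=re.sub('<.{1,}>','','<page> This is some text that should remain <page>')
bracket_example=re.sub('<[^>]{1,}>','','<page> This is some text that should remain <page>')
print("Greedy pattern leaves: '"+greedy_example+"'")
print("Bracket pattern leaves: '"+bracket_example+"'")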
########################
# Task 4: delete numbers
########################
# Alternative 1 - removing numbers step by step
# remove commas in numbers, e.g., 1,000 or 12,345,678 or 123,456,789,123,123
text=re.sub('[0-9]{1,3},([0-9]{3},){0,}[0-9]{3}','',text)
# remove dots in numbers, e.g., 34.56 or 12,345.678 (-> previous command leaves .678)
text=re.sub('[0-9]{0,}\.[0-9]{1,}','',text)
# remove the remaining numbers without commas and dots
text=re.sub('[0-9]','',text)
# Alternative 2 - removing numbers using a single regex
text=re.sub('[0-9]{1,}(,[0-9]{3}){0,}(\.[0-9]{1,}){0,1}','',text)
# Alternative 3 - removing numbers step by step but start with commas and dots
# 1. remove comma incl. the surrounding numbers
text=re.sub("[0-9],[0-9]","",text)
# 2. remove dots incl. the surrounding numbers
text=re.sub("[0-9]\.[0-9]","",text)
# 3. remove any remaining number
text=re.sub("[0-9]","",text)
########################
# Task 5: delete symbols
########################
# When analyzing tone, symbols do not matter, as they are not considered to be
# words and thus do not bias the total word count.
# However, for training purposes this task is included in the problem.
# There is no well defined list of which symbols should be deleted. So, you
# can add further symbols.
text=re.sub('\(|\)|\[|\]|\$|§|%|\*|/|·|-',' ',text)
text=re.sub('[^a-zA-Z \.,\!\?\n]','',text)
# Open the output file for the pure text
output_file=open(directory+'Exercise_4_Application_Regular_Expressions_clean.txt','w',encoding="utf-8")
output_file.write(text)
input_file.close()
output_file.close()
print("DONE")


@ -0,0 +1,209 @@
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 12 15:50:22 2016
@author: Alexander Hillert, Goethe University Frankfurt
"""
import re
from bs4 import BeautifulSoup
# Please adjust the directory to your machine.
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# Open the 10-K
input_file=open(directory+'0000950130-98-001359.txt','r',encoding='ascii',errors='ignore')
input_text=input_file.read()
################################
# Remove tables
# Same approach as in Problem 4
################################
# Sometimes it is helpful to print the text parts that are deleted. In this
# example, we will print the first two tables that we delete.
i=1
table_match=re.search('<TABLE>', input_text)
while table_match:
# Search for the beginning of the table
table_start_match=re.search('<TABLE>', input_text)
start_table=table_start_match.start()
# search for the end of the table
table_end_match=re.search('</TABLE>', input_text)
end_table=table_end_match.end()
# The if condition and the printing are just for illustrative purposes.
# The commands display the first two tables that are removed from the text.
if i<=2:
print("This is the "+str(i)+". Table in the 10-K.\n"+input_text[start_table:end_table]+"\n")
i=i+1
# remove the table
input_text=input_text[:start_table]+input_text[end_table:]
# check whether there are further tables
table_match=re.search('<TABLE>', input_text)
################################
# Remove exhibits
# Same approach as in Problem 4
################################
# Exhibits have the following structure
# <DOCUMENT>
# <TYPE>EX...
# ...
# </DOCUMENT>
# Sometimes it is helpful to print the text parts that are deleted. In this
# example, we will print the first exhibit that we delete.
i=1
exhibit_match=re.search('<TYPE>EX', input_text)
while exhibit_match:
# Search for the beginning of the exhibit
exhibit_start_match=re.search('<TYPE>EX', input_text)
start_exhibit=exhibit_start_match.start()
# Search for the end of the exhibit
# CAUTION: search only in the text after the beginning of the exhibit, as
# </DOCUMENT> also appears earlier (e.g. end of main document)
exhibit_end_match=re.search('</DOCUMENT>', input_text[start_exhibit:])
end_exhibit=start_exhibit+exhibit_end_match.end()
if i<=1:
print("This is the "+str(i)+". Exhibit in the 10-K.\n"+input_text[start_exhibit:end_exhibit]+"\n")
i=i+1
# remove exhibit
input_text=input_text[:start_exhibit]+input_text[end_exhibit:]
exhibit_match=re.search('<TYPE>EX', input_text)
##################
# Remove html code
##################
html_text=BeautifulSoup(input_text, 'html.parser')
text=html_text.get_text()
############################
# Remove the Document Header
############################
# There are different ways to define the start of the main part of the text.
# In general, you should delete all text that is uninformative for your analysis.
# Alternative 1:
# Search for Table of Contents. To not mistakenly match a reference to the
# table of contents somewhere in the text, we require a linebreak before and after.
# When the "Table of Contents" is centered, there will be whitespaces or tabs
# before and potentially also after
header_match=re.search('(?i)\n[\t ]{0,}table[\t ]of[\t ]contents[\t ]{0,}\n', text)
# Alternative 2:
# Search for Documents incorporated by reference.
header_match=re.search('\n[\t ]{0,}DOCUMENTS[\t ]INCORPORATED[\t ]BY[\t ]REFERENCE[\t ]{0,}\n', text)
if header_match:
# Drop the document header and keep only the rest of the text after the header.
text=text[header_match.end():]
#################################################
# Delete the text in "PART IV"
# This procedure is optional. Look at "Part IV" and decide whether you favor
# the approach. I think that the part should be dropped, as it is just a list
# of exhibits and some mandatory text required by the SEC [indicated by the
# capital letters in the "SIGNATURES" section].
#################################################
'''
# Alternative 1: go over all matches but keep only the last one
for match in re.finditer('\s{2,}PART IV\s{0,}\n', text):
print("Hallo")
# match now contains the last match
# Delete the text after the last match
text=text[:match.start()]
# Alternative 2: save the positions of all matches (more general approach)
# to use alternative 2, you have to comment out Alternative 1!
# Otherwise line 104 will create a problem when you execute Alternative 2.
list_start_matches=[]
list_end_matches=[]
for match in re.finditer('\s{2,}PART IV\s{0,}\n', text):
print(match)
list_start_matches.append(match.start())
list_end_matches.append(match.end())
# Position of last match
print(list_start_matches[len(list_start_matches)-1])
print(list_end_matches[len(list_start_matches)-1])
# Alternative 3: manual coding using a loop of re.searches
# create a copy of the text that we can edit
text_check_part_IV=text
part_IV_match=re.search('\s{2,}PART IV\s{0,}\n', text_check_part_IV)
# create two lists that we can use to save the start and end positions
# of the Part IV matches
list_start_matches_v2=[]
list_end_matches_v2=[]
# variable to save the position of the last match in the overall text
end_position_previous_match=0
while part_IV_match:
start_position_match=end_position_previous_match+part_IV_match.start()
end_position_match=end_position_previous_match+part_IV_match.end()
list_start_matches_v2.append(start_position_match)
list_end_matches_v2.append(end_position_match)
# update the information on the end of the last match
end_position_previous_match=end_position_previous_match+part_IV_match.end()
text_check_part_IV=text_check_part_IV[part_IV_match.end():]
part_IV_match=re.search('\s{2,}PART IV\s{0,}\n', text_check_part_IV)
# when you compare list_end_matches to list_end_matches_v2, you see that the two
# approaches yield the same result.
# To double check that the approaches have the same results, you could
# replace the Regex in lines 112, 124, and 142 by "\s{2,}PART [A-Z]{1,3}\s{0,}\n".
# In that case you have more matches and so you can better check that the
# two approaches have identical outcomes.
'''
'''
# Delete the text after the last match
text=text[:list_start_matches[len(list_start_matches)-1]]
'''
# Delete item numbers
# This is optional. It removes "Item 1.", "ITEM 1.", "Item 10.", "Item 7A."
text=re.sub('(?i)Item [0-9]{1,}A{0,1}(\s|\.|:|\n)','',text)
# Delete numbers
text=re.sub('[0-9]{1,}(,[0-9]{3}){0,}(\.[0-9]{1,}){0,1}','',text)
# Alternative stepwise procedure to delete numbers
# remove commas in numbers, e.g., 1,000 or 12,345,678
text=re.sub('[0-9]{1,3},([0-9]{3},){0,}[0-9]{3}','',text)
# remove dots in numbers, e.g., 34.56 or 12,345.678 (-> previous command leaves .678)
text=re.sub('[0-9]{0,}\.[0-9]{1,}','',text)
# remove the remaining numbers without commas and dots
text=re.sub('[0-9]','',text)
# Hyphens can be used to indicate that the word is continued in the next
# line. For example, "Micro-\nsoft" (\n is the line feed).
# Delete hyphens that are followed by a line feed.
text=re.sub('-\n','',text)
# Replace symbols by a whitespace.
# Extra whitespaces are not a problem.
text=re.sub('\(|\)|\[|\]|\$|§|%|\*|/|·|-',' ',text)
# Delete dots and commas that are not part of sentences, i.e. commas and dots
# that are preceded by a line break (potentially also whitespaces and tabs)
# and that are followed by a line break (again, there may
# also be whitespaces and tabs).
text=re.sub('\n[\t ]{0,}(\.|,){1,}[\t ]{0,}\n','\n',text)
# Drop single-character words
# One can argue whether one should implement this procedure. Loughran and
# McDonald argue in one of their papers in favor of it.
# To make sure that there is just one letter, we require a non-word character
# (\W) before and after. We use a positive lookbehind and a positive lookahead
# for this so that the surrounding characters do not get deleted as well.
text=re.sub('(?<=\W)[A-Za-z](?=\W)',' ',text)
# Open the output file for the pure text
output_file=open(directory+'0000950130-98-001359_clean.txt','w',encoding='ascii',errors='ignore')
output_file.write(text)
input_file.close()
output_file.close()
print("COMPLETED.")


@ -0,0 +1,356 @@
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 29 11:07:10 2015
@author: Alexander Hillert, Goethe University Frankfurt
"""
import re
from bs4 import BeautifulSoup
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# Open the csv file containing the list of the 200 10-Ks
input_file=open(directory+'10-K_Sample_2011Q1_Input.csv','r')
input_text=input_file.read()
# Split the input file in separate lines
input_text_line=input_text.split("\n")
# In general, there can be empty lines in the input file. The following command
# deletes these lines
while input_text_line.count("")>0:
input_text_line.remove("")
print("The input file contains "+str(len(input_text_line)-1)+" non-empty lines with data.")
# We subtract 1 from the length, as the first line contains the variable names but no data.
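###########################################################################
# Optional sketch (not part of the original solution): the table, exhibit,
# XML, ZIP, GRAPHIC, COVER, and PDF removal blocks further below all follow
# the same pattern (find a start tag, find the matching end tag, cut out the
# span). They could be collapsed into one helper such as the hypothetical
# function below; it is shown for illustration only and is not called here.
def remove_sections(document_text, start_tag, end_tag):
    # Repeatedly cut out everything between start_tag and the next end_tag.
    match=re.search(start_tag, document_text)
    while match:
        start_position=match.start()
        end_match=re.search(end_tag, document_text[start_position:])
        if end_match is None:
            # no closing tag -> stop to avoid an endless loop
            break
        end_position=start_position+end_match.end()
        document_text=document_text[:start_position]+document_text[end_position:]
        match=re.search(start_tag, document_text)
    return document_text
# Example usage (hypothetical): input_text_10_k=remove_sections(input_text_10_k,'<TABLE>','</TABLE>')
###########################################################################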
# Loop over all lines
for i in range(1,len(input_text_line)):
# To see the progress of your program you can print the number of iteration.
print(str(i))
# split the lines of the CSV-file into the two variables
variables=input_text_line[i].split(";")
# We need the CIK and the filename to open the file
cik=variables[0]
filename=variables[1]
# Open the ith 10-K in the list
input_file_10_k=open(directory+'10-K_Sample/'+cik+'_'+filename,'r',encoding='ascii',errors='ignore')
input_text_10_k=input_file_10_k.read()
# the new file name should be "old_name_clean" -> we have to replace ".txt"
# by "_clean.txt"
filename=filename.replace('.txt','_clean.txt')
# Remove tables
variable=re.search('<TABLE>', input_text_10_k)
while variable:
variable=re.search('<TABLE>', input_text_10_k)
start_table=variable.start()
variable=re.search('</TABLE>', input_text_10_k)
end_table=variable.end()
input_text_10_k=input_text_10_k[:(start_table)]+input_text_10_k[(end_table):]
variable=re.search('<TABLE>', input_text_10_k)
####################### Begin of exhibits removal #########################
# Exhibits have the following structure
# <DOCUMENT>
# <TYPE>EX...
# ...
# </DOCUMENT>
# In recent years, there are also exhibits with <TYPE>EXCEL
# -> as we search for "<TYPE>EX", the loop will delete <TYPE>EXCEL exhibits, too.
variable=re.search('<TYPE>EX', input_text_10_k)
while variable:
variable=re.search('<TYPE>EX', input_text_10_k)
start_exhibit=variable.start()
variable=re.search('</DOCUMENT>', input_text_10_k[start_exhibit:])
end_exhibit=start_exhibit+variable.end()
input_text_10_k=input_text_10_k[:(start_exhibit)]+input_text_10_k[(end_exhibit):]
variable=re.search('<TYPE>EX', input_text_10_k)
# In recent years, there are also XML exhibits.
# CAUTION: These are <TYPE>XML and not <TYPE>EX -> need separate cleaning
# Remove XML-Exhibits, which have the following structure
# <DOCUMENT>
# <TYPE>XML
# ...
# </DOCUMENT>
variable=re.search('<TYPE>XML', input_text_10_k)
while variable:
variable=re.search('<TYPE>XML', input_text_10_k)
start_exhibit=variable.start()
variable=re.search('</DOCUMENT>', input_text_10_k[start_exhibit:])
end_exhibit=start_exhibit+variable.end()
input_text_10_k=input_text_10_k[:start_exhibit]+input_text_10_k[end_exhibit:]
variable=re.search('<TYPE>XML', input_text_10_k)
# Furthermore, in recent years there are also ZIP exhibits.
# CAUTION: These are <TYPE>ZIP and not <TYPE>EX -> need separate cleaning
# Remove ZIP-Exhibits, which have the following structure
# <DOCUMENT>
# <TYPE>ZIP
# ...
# </DOCUMENT>
variable=re.search('<TYPE>ZIP', input_text_10_k)
while variable:
variable=re.search('<TYPE>ZIP', input_text_10_k)
start_exhibit=variable.start()
variable=re.search('</DOCUMENT>', input_text_10_k[start_exhibit:])
end_exhibit=start_exhibit+variable.end()
input_text_10_k=input_text_10_k[:start_exhibit]+input_text_10_k[end_exhibit:]
variable=re.search('<TYPE>ZIP', input_text_10_k)
# In addition, there are many GRAPHIC exhibits.
# CAUTION: These are <TYPE>GRAPHIC and not <TYPE>EX -> need separate cleaning
# Remove GRAPHIC-Exhibits, which have the following structure
# <DOCUMENT>
# <TYPE>GRAPHIC
# ...
# </DOCUMENT>
variable=re.search('<TYPE>GRAPHIC', input_text_10_k)
while variable:
variable=re.search('<TYPE>GRAPHIC', input_text_10_k)
start_exhibit=variable.start()
variable=re.search('</DOCUMENT>', input_text_10_k[start_exhibit:])
end_exhibit=start_exhibit+variable.end()
input_text_10_k=input_text_10_k[:start_exhibit]+input_text_10_k[end_exhibit:]
variable=re.search('<TYPE>GRAPHIC', input_text_10_k)
# Furthermore, there can also be COVER exhibits.
# CAUTION: These are <TYPE>COVER and not <TYPE>EX -> need separate cleaning
# Remove COVER-Exhibits, which have the following structure
# <DOCUMENT>
# <TYPE>COVER
# ...
# </DOCUMENT>
variable=re.search('<TYPE>COVER', input_text_10_k)
while variable:
variable=re.search('<TYPE>COVER', input_text_10_k)
start_exhibit=variable.start()
variable=re.search('</DOCUMENT>', input_text_10_k[start_exhibit:])
end_exhibit=start_exhibit+variable.end()
input_text_10_k=input_text_10_k[:start_exhibit]+input_text_10_k[end_exhibit:]
variable=re.search('<TYPE>COVER', input_text_10_k)
# Furthermore, there can also be PDF files attached.
# These attachments caused BeautifulSoup to crash on some computers.
# Remove PDFs
variable=re.search('<PDF>', input_text_10_k)
while variable:
variable=re.search('<PDF>', input_text_10_k)
start_pdf=variable.start()
variable=re.search('</PDF>', input_text_10_k[start_pdf:])
end_pdf=start_pdf+variable.end()
input_text_10_k=input_text_10_k[:(start_pdf)]+input_text_10_k[(end_pdf):]
variable=re.search('<PDF>', input_text_10_k)
######################## End of exhibits removal ##########################
# Remove Document Header - PART 1
# This condition should work for all 10-K filings as the tags "<SEC-HEADER>"
# and "</SEC-HEADER>" are mandatory for all filings.
variable=re.search('</SEC-HEADER>', input_text_10_k)
if variable:
input_text_10_k=input_text_10_k[variable.end():]
# In some filings, firms do not use line feeds \n but <div> and </div>
# instead to indicate the start and the end of sentences.
# "Dieses allgemeine Element bewirkt nichts weiter als dass es in einer
# neuen Zeile des Fließtextes beginnt."
# see https://wiki.selfhtml.org/wiki/HTML/Textstrukturierung/div
# and
# "The <div> tag defines a division or a section in an HTML document.
# By default, browsers always place a line break before and after the <div> element."
# See: https://www.w3schools.com/tags/tag_div.asp
# It is important to replace <div> and </div> by linefeeds because otherwise
# the entire text will be in a single line and the subsequent commands do
# not work properly.
input_text_10_k=input_text_10_k.replace("<div>", "\n")
input_text_10_k=input_text_10_k.replace("</div>", "\n")
# Remove html code
html_text=BeautifulSoup(input_text_10_k, 'html.parser')
text=html_text.get_text()
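# Rough illustration of the last two steps (comment only, made-up snippet):
# snippet = '<body><div>Item 1.</div><div>Business</div></body>'
# Without the replace() calls, BeautifulSoup(snippet,'html.parser').get_text()
# returns 'Item 1.Business', i.e. both headings end up in one line.
# After replacing <div> and </div> by '\n', get_text() yields
# '\nItem 1.\n\nBusiness\n', so the headings are on separate lines.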
# To get an idea of what the commands below are doing, it is helpful to
# write the current version of the text to a file and then compare it to the
# final file.
filename2=filename.replace('_clean.txt','_without_HtmlTablesExhibits.txt')
# Open the output file for the text without html code and without tables+exhibits
output_file_10_k=open(directory+'10-K_Sample/'+cik+'_'+filename2,'w',encoding='ascii',errors='ignore')
output_file_10_k.write(text)
output_file_10_k.close()
# Remove the Document Header - PART II
# The above command to remove the header ("</SEC-HEADER>") does not capture
# the entire header -> we need to delete further parts at the top of the filing.
# WARNING: The filters below may be specific to this sample of 10-Ks.
# Some firms have line breaks instead of whitespaces -> use "[ \n]" and not just " ".
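# Illustration of why "[ \n]" is needed (comment only, made-up string):
# re.search('(?i)\n {0,}DOCUMENTS[ \n]INCORPORATED[ \n]BY[ \n]REFERENCE {0,}\n',
#           '\nDocuments\nIncorporated by Reference\n')
# still matches because every separator may be either a blank or a line break,
# whereas a pattern with plain blanks would miss the line break after "Documents".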
variable=re.search('(?i)\n {0,}DOCUMENTS[ \n]INCORPORATED[ \n]BY[ \n]REFERENCE {0,}\n', text)
if variable:
text=text[variable.end():]
else:
variable=re.search('(?i)\n {0,}table of contents {0,}\n', text)
if variable:
text=text[variable.end():]
else:
variable=re.search('(?i)\n {0,}Indicate the number of shares outstanding\.{1,}', text)
if variable:
text=text[variable.end():]
else:
variable=re.search('(?i)may be deemed “forwardlooking statements”\.{1,}', text)
if variable:
text=text[variable.end():]
else:
variable=re.search('\nPART\.{1,}', text)
if variable:
text=text[variable.end():]
# Delete Item numbers
text=re.sub('(?i)Item {1,}[0-9]{1,}(A|B){0,1}(\s|\.|:|\n)','',text)
# Delete Part numbers
text=re.sub('(?i)Part (1|2|3|4|III|II|I|IV)','',text)
# Delete numbers:
text=re.sub('[0-9]{1,}(,[0-9]{3}){0,}(\.[0-9]{1,}){0,1}','',text)
# File names, e.g., exhibit.pdf or picture.jpg, should be removed
text=re.sub("[ |\n]\S{1,}\.(pdf|htm|html|doc|jpg|txt|xml)(?=[ \n\.\?!])", "", text)
# URLs --> remove internet addresses
text=re.sub("http:/{0,2}", "", text)
text=re.sub("www\..{1,}\.[a-z]{2,4}(?=[ \n\.\?!])", "", text)
# In Part 4 of the programming chapter, we will determine the number of
# words per sentence. To be able to use the same underlying sample,
# we need to implement further corrections. These changes do not affect
# the percentage of negative/positive/etc. words.
# --> Only relevant for determining the number of sentences
# The text contains dots that do not indicate the end of a sentence.
# E.g., "Inc." and "St."
# The hyphen in the patterns below is allowed because abbreviations can be
# preceded by one, e.g., in "non-U.S.".
# Replace or remove specific abbreviations
# This list is incomplete. In a research project you should spend more time
# on editing the data.
text=re.sub("(?i)(-|\s|\A|,)Inc\.", " Inc", text)
text=re.sub("(?i)(-|\s|\A|,)Corp\.", " Corp", text)
text=re.sub("(?i)(-|\s|\A|,)Ltd\.", " Ltd", text)
text=re.sub("(?i)(-|\s|\A|,)Co\.", " Co", text)
text=re.sub("(?i)(-|\s|\A|,)S\.A\.", " SA", text)
text=re.sub("(?i)(-|\s|\A|,)U\.S\.", " US", text)
text=re.sub("(?i)(-|\s|\A|,)Ms\.", " Ms", text)
text=re.sub("(?i)(-|\s|\A|,)Mr\.", " Mr", text)
text=re.sub("(?i)(-|\s|\A|,)No\.", " Number", text)
text=re.sub("(?i)(-|\s|\A|,)v\.s\.", " vs", text)
text=re.sub("(?i)(-|\s|\A|,)St\.", " ", text)
text=re.sub("(?i)(-|\s|\A|,)Jr\.", " ", text)
text=re.sub("(?i)(\s|\A|,)Jan\.", " January", text)
text=re.sub("(?i)(\s|\A|,)Feb\.", " February", text)
text=re.sub("(?i)(\s|\A|,)Mar\.", " March", text)
text=re.sub("(?i)(\s|\A|,)Apr\.", " April", text)
text=re.sub("(?i)(\s|\A|,)May\.", " May", text)
text=re.sub("(?i)(\s|\A|,)Jun\.", " June", text)
text=re.sub("(?i)(\s|\A|,)Jul\.", " July", text)
text=re.sub("(?i)(\s|\A|,)Aug\.", " August", text)
text=re.sub("(?i)(\s|\A|,)Sep\.", " September", text)
text=re.sub("(?i)(\s|\A|,)Oct\.", " October", text)
text=re.sub("(?i)(\s|\A|,)Nov\.", " November", text)
text=re.sub("(?i)(\s|\A|,)Dec\.", " December", text)
# The sequence capital letter -> dot -> capital letter -> dot indicates an abbreviation
# Three repetitions of capital letter and dot are also common in filings.
# We need to check for three instances first.
text=re.sub("( |\n|,)[A-Z]\.[A-Z]\.[A-Z]\.", " ", text)
# now check for two instances
text=re.sub("( |\n|,)[A-Z]\.[A-Z]\.", " ", text)
# Dots after a single letter can indicate a middle name (e.g., Paul J. Smith)
# or an abbreviation --> also delete these.
text=re.sub("( |\n|,)[A-Z]\.", "", text)
# Hyphens can be used to indicate that the word is continued in the next
# line. For example, "Micro-\nsoft" (\n is the line feed).
# Replace hyphens followed by a line feed by a hyphen without line feed
text=re.sub('-\n','-',text)
# Delete the minus/hyphens
# "Short-term" -> "shortterm"
text=re.sub('-','',text)
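# Illustration using the example from the comment above (comment only):
# re.sub('-\n', '-', 'Micro-\nsoft') -> 'Micro-soft'
# re.sub('-', '', 'Micro-soft')      -> 'Microsoft'
# and, analogously, 'short-term' becomes 'shortterm'.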
# --> Only relevant for determining the number of sentences
# Delete dots and commas that are not part of sentences, i.e. commas and dots
# that are preceded by whitespace or line break and that are followed by
# whitespace or line break.
text=re.sub('\n(\.|,)\n','\n',text)
text=re.sub(' (\.|,) ',' ',text)
# Delete single character words
# One can argue whether one should implement this procedure. Loughran and
# McDonald argue in one of their papers in favor of it.
# To make sure that there is just one letter, we require a non-word character
# (\W) before and after it. We use a positive lookbehind and a positive
# lookahead for this so that the surrounding non-word characters do not get
# deleted as well.
text=re.sub('(?i)(?<=\W)[a-z](?=\W)',' ',text)
# There are sentences that are in upper case letters. However, these are not
# "real" sentences. Examples: "RESTRICTIONS ON TRANSFER OF NOTE."
# or "THIS NOTE AND THE RIGHTS AND OBLIGATIONS EVIDENCED HEREBY ARE
# SUBORDINATED TO THE PRIOR PAYMENT OF CERTAIN OBLIGATIONS [...]"
# We save the edited text in a new variable
text_edited=text
# Split text in sentences
list_sentences=re.split('\.|!|\?', text)
# iterate the list of all sentences
for j in range(0,len(list_sentences)):
# Determine the number of upper case letters
upper_letters=len(re.findall('[A-Z]',list_sentences[j]))
# Determine the number of all letters
total_letters=len(re.findall('[A-Za-z]',list_sentences[j]))
# If there is at least one letter calculate the fraction of upper case letters
if total_letters>0:
ratio=upper_letters/total_letters
# If the fraction of upper case letters is larger than 0.9 delete
# the sentence from the text.
if ratio>0.9:
text_edited=text_edited.replace(list_sentences[j]+'.','')
text_edited=text_edited.replace(list_sentences[j]+'!','')
text_edited=text_edited.replace(list_sentences[j]+'?','')
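# Brief illustration of the filter above (comment only, made-up sentences):
# In 'RESTRICTIONS ON TRANSFER OF NOTE' every letter is upper case, so the
# ratio equals 1.0 > 0.9 and the sentence is dropped from text_edited.
# In 'The notes are subordinated to senior debt' only the first letter is
# upper case, the ratio is far below 0.9, and the sentence is kept.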
# --> Only relevant for determining the number of sentences
# There are a few cases where a dot follows a dot or where a linefeed
# separates two dots. --> delete the second dot.
text_edited=text_edited.replace('..','.')
text_edited=text_edited.replace('.\n.','.')
# The following commands do not influence the subsequent textual analysis.
# The only purpose is to display the output in a nicer format.
# Replace lines that contain only whitespaces by a line feed.
text_edited=re.sub('\n {1,}\n','\n',text_edited)
# Replace multiple line feeds by one line feed.
text_edited=re.sub('\n{2,}','\n',text_edited)
# Open the output file for the pure text
output_file_10_k=open(directory+'10-K_Sample/'+cik+'_'+filename,'w',encoding='ascii',errors='ignore')
output_file_10_k.write(text_edited)
output_file_10_k.close()
input_file_10_k.close()
input_file.close()

View file

@ -0,0 +1,114 @@
# -*- coding: utf-8 -*-
"""
Created on Wed Apr 13 22:43:32 2016
@author: Alexander Hillert, Goethe University Frankfurt
"""
import re
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# Open the dictionary
# The dictionary has been obtained from Bill McDonald's webpage
# http://www3.nd.edu/~mcdonald/Word_Lists.html
# --> LoughranMcDonald_MasterDictionary_2014.xlsx
# --> select negative words and copy them to a txt file
file_word_list=open(directory+'LMD_Neg.txt','r',encoding="utf-8")
word_list=file_word_list.read()
# The LMD words are all in upper case
word_list=word_list.lower()
negative_words=word_list.split('\n')
# Open the csv file containing the list of the 200 10-Ks
input_file=open(directory+'10-K_Sample_2011Q1_Input.csv','r',encoding="utf-8")
input_text=input_file.read()
# Split the input file in separate lines
input_text_line=input_text.split("\n")
# In general, there can be empty lines in the input file. The following command
# deletes these lines.
while input_text_line.count("")>0:
input_text_line.remove("")
# Create output file
output_file=open(directory+'10-K_Sample_2011Q1_Output_Negative_Tone.csv','w',encoding="utf-8")
# Write variable names to the first line of the output file
output_file.write('CIK;Filename;Number_Words;Number_Negative_Words;\
Percentage_Negative_Words\n')
# Loop over all lines of the csv file
for i in range(1,len(input_text_line)):
#for i in range(1,10):
# If the execution of your scripts takes some time, printing the loop iterator
# gives you an impression of the overall progress made.
print(str(i))
# split the line into the two variables
variables=input_text_line[i].split(";")
# We need the CIK (1st column) and the filename (2nd column)
cik=variables[0]
filename=variables[1]
# modify file name to open the edited files
filename=filename.replace('.txt','')
# Open the ith 10-Ks in the list
input_file_10_k=open(directory+'10-K_Sample/'+cik+'_'+filename+'_clean.txt','r',\
encoding='ascii',errors='ignore')
# if the command above does not work (error like "file not found" or "directory not found")
# please use the following command:
#input_file_10_k=open(directory+'10-K_Sample_clean/'+cik+'_'+filename+'_clean.txt','r',encoding='ascii',errors='ignore')
input_text_10_k=input_file_10_k.read()
# Use lower case letters, too
# It is important that the formatting (lower case vs. upper case) of the word list
# and the document is identical. Remember that documents typically contain both
# lower and upper case letters -> modify the text.
text=input_text_10_k.lower()
# Split the text in single words to determine the total number of words
# \W is a non-word character: "Matches any character which is not a Unicode
# word character." (Python documentation)
# this is equivalent to [^a-zA-Z0-9_], i.e. no lower case letters, no upper
# case letters, no numbers, and no underscore.
list_of_words=re.split('\W{1,}', text)
# to make sure that empty list elements do not bias the word count, we delete them.
while list_of_words.count("")>0:
list_of_words.remove("")
# It is important that you treat multiple "\W" as one. Otherwise you are left
# with elements in the list that are not actual words.
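# Example of why consecutive \W characters must be treated as one (comment only):
# re.split('\W{1,}', 'loss, net of taxes.') -> ['loss', 'net', 'of', 'taxes', '']
# re.split('\W',     'loss, net of taxes.') -> ['loss', '', 'net', 'of', 'taxes', '']
# With '\W' alone, the comma followed by a blank produces an extra empty string,
# which would bias the word count if it were not removed.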
# Determine the total number of words
word_count=len(list_of_words)
# Reset the number of negative words to zero
negative_count=0
# For each negative word, count the number of occurrences
for j in range(len(negative_words)):
# the command "list_of_words.count(negative_words[i])" only matches if there
# is exact overlap between the ith negative word and the words in the list.
# For example the following two commands:
# list_of_words=["abandon","abandoned","abandonment"]
# list_of_words.count("abandon")
# yields 1 match
# In contrast,
# text_of_words="abandon abandoned abandonment"
# text_of_words.count("abandon")
# yields 3. Thus, you have to split the text into individual words!!!
negative_count=negative_count+list_of_words.count(negative_words[j])
# Get the percentage of negative words
percentage_negative=negative_count/word_count
# Write cik, file name, total number of words, number of negative words,
# and the percentage of negative words to output file.
output_file.write(cik+';'+filename+'_clean.txt;'+str(word_count)+';'\
+str(negative_count)+';'+str(percentage_negative)+'\n')
# Close filings
input_file_10_k.close()
print("Finished")
output_file.close()
input_file.close()

View file

@ -0,0 +1,130 @@
# -*- coding: utf-8 -*-
"""
Created on Wed Apr 13 22:43:32 2016
@author: Alexander Hillert
"""
import re
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# Open the dictionary
# The dictionary is obtained from Bill McDonald's webpage
# http://www3.nd.edu/~mcdonald/Word_Lists.html
# --> LoughranMcDonald_MasterDictionary_2014.xlsx
# --> select positive words and copy them to a txt file
file_word_list=open(directory+'LMD_Pos.txt','r',encoding="utf-8")
word_list=file_word_list.read()
word_list=word_list.lower()
positive_words=word_list.split()
# Open the csv file containing the list of the 200 10-Ks
input_file=open(directory+'10-K_Sample_2011Q1_Input.csv','r',encoding="utf-8")
input_text=input_file.read()
# Split the input file into separate lines
input_text_line=input_text.split("\n")
# In general, there can be empty lines in the input file. The following command
# deletes these lines.
while input_text_line.count("")>0:
input_text_line.remove("")
# Create output file
output_file=open(directory+'10-K_Sample_2011Q1_Output_Positive_Tone.csv','w',encoding="utf-8")
# Write variable names to the first line of the output file
output_file.write('CIK;Filename;Number_Words;Number_Pos_Words;Number_Pos_Words_adj;'\
+'Percent_Pos_Words;Percent_Pos_Words_adj\n')
# Iterate over the list of the 200 10-K filings.
# Empty lines have already been removed above; start at 1 to skip the line
# with the variable names.
#for i in range(1,len(input_text_line)):
for i in range(1,20): # For illustration only
# If the execution of your scripts takes some time, printing the iterator
# gives you an impression of the overall progress
print(str(i))
# split the line into the two variables
variables=input_text_line[i].split(";")
# We need the CIK (1st column) and the filename (2nd column)
cik=variables[0]
filename=variables[1]
# modify file name to open the edited files
filename=filename.replace('.txt','')
# Open the ith 10-K in the list
input_file_10_k=open(directory+'/10-K_Sample/'+cik+"_"+filename+'_clean.txt','r',\
encoding='ascii',errors='ignore')
# if the command above does not work (error like "file not found" or "directory not found")
# please use the following command:
#input_file_10_k=open(directory+'10-K_Sample_clean/'+cik+'_'+filename+'_clean.txt','r',encoding='ascii',errors='ignore')
input_text_10_k=input_file_10_k.read()
# Use lower case letters, too
# It is important that the formatting (lower case vs. upper case) of the word list
# and the document is identical. Remember that documents typically contain both
# lower and upper case letters -> modify the text.
text=input_text_10_k.lower()
# Split the text in single words to determine the total number of words
list_of_words=re.split('\W{1,}', text)
# to make sure that empty list elements do not bias the word count, we delete them.
while list_of_words.count("")>0:
list_of_words.remove("")
# Determine total number of words
word_count=len(list_of_words)
# Reset the number of positive words and positive words adj. for negations to zero
positive_count=0
positive_count_adj=0
# For each positive word, count the number of occurrences
for j in range(len(positive_words)):
# standard count operation without controlling for negations
positive_words_found=list_of_words.count(positive_words[j])
# Loughran and McDonald (2011, JF, p.44): "We account for simple negation
# only for Fin-Pos words. Simple negation is taken to be observations
# of one of six words (no, not, none, neither, never, nobody) occurring
# within three words preceding a positive word."
# When we have identified positive words we need to search for negations
while positive_words_found>0:
# identify the position of the matched positive word in the list of all words
position_of_word=list_of_words.index(positive_words[j])
# identify the three words before the positive word and add them to a list
# the \ continues the statement on the next line
list_negation=[list_of_words[max(0,position_of_word-3)],\
list_of_words[max(0,position_of_word-2)],list_of_words[max(0,position_of_word-1)]]
# check whether one of the three words in list_negation is a negation
negation_found=list_negation.count('no')+list_negation.count('not')+\
list_negation.count('none')+list_negation.count('neither')+\
list_negation.count('never')+list_negation.count('nobody')
if negation_found==0:
# no negation
positive_count_adj=positive_count_adj+1
positive_count=positive_count+1
else:
# negation
positive_count=positive_count+1
# delete the matched positive word from the list of words
list_of_words[position_of_word]=''
# check whether there are further matches of the jth positive word
positive_words_found=list_of_words.count(positive_words[j])
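# Illustration of the negation adjustment (comment only; made-up sentence and
# assuming that "improve" is on the positive word list):
# list_of_words = ['the', 'outlook', 'is', 'not', 'expected', 'to', 'improve']
# The three words preceding 'improve' are ['not', 'expected', 'to'], which
# contain 'not', so positive_count increases by one while positive_count_adj
# does not.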
# Write cik, file name, total number of words, and number of positive
# and adjusted positive words to the output file
output_file.write(cik+';'+filename+'_clean.txt;'+str(word_count)+';'+\
str(positive_count)+';'+str(positive_count_adj)+';'+str(positive_count/word_count)+\
';'+str(positive_count_adj/word_count)+'\n')
# Close filings
input_file_10_k.close()
print("Finished")
output_file.close()
input_file.close()

View file

@ -0,0 +1,111 @@
# -*- coding: utf-8 -*-
"""
Created on Wed Apr 13 22:43:32 2016
@author: Alexander Hillert, Goethe University Frankfurt
"""
# We split the text into words and sentences using regular expressions
import re
# For comparison, we also include the NLTK tokenizer
from nltk.tokenize import sent_tokenize
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# Open the csv file containing the list of the 200 10-Ks
input_file=open(directory+'10-K_Sample_2011Q1_Input.csv','r',encoding="utf-8")
input_text=input_file.read()
# Create output file
output_file=open(directory+'10-K_Sample_2011Q1_Output_WPS.csv','w',encoding="utf-8")
# Write variable names to the first line of the output file
output_file.write('CIK;Filename;Number_Words;Number_of_Sentences;'\
'Number_of_Sentences_1;Number_of_Sentences_2;Number_of_Sentences_false;'\
'Number_of_Sentences_NLTK;WPS;WPS_1;WPS_2;WPS_false;WPS_NLTK\n')
# Split the input file into separate lines
input_text_line=input_text.split("\n")
# In general, there can be empty lines in the input file. The following command
# deletes these lines.
while input_text_line.count("")>0:
input_text_line.remove("")
# Loop over all lines
for i in range(1,len(input_text_line)):
print(str(i))
# split the line into the two variables
variables=input_text_line[i].split(";")
# We need the CIK and the filename
cik=variables[0]
filename=variables[1]
filename=filename.replace('.txt','')
# Open the ith 10-K in the list
input_file_10_k=open(directory+'10-K_Sample/'+cik+"_"+filename+'_clean.txt','r',\
encoding='ascii',errors='ignore')
text=input_file_10_k.read()
# Determine number of sentences and number of words
# Split the text in words to determine the total number of words
list_of_words=re.split('\W{1,}', text)
# to make sure that empty list elements do not bias the word count, we delete them.
while list_of_words.count("")>0:
list_of_words.remove("")
# Determine total number of words
word_count=len(list_of_words)
# Split the text by symbols that indicate the end of a sentence
# to determine the total number of sentences
list_of_sentences=re.split('[\.!\?]{1,}', text)
while list_of_sentences.count("")>0:
list_of_sentences.remove("")
# Alternative 1:
list_of_sentences_1=re.split('(?:\.|!|\?){1,}', text)
while list_of_sentences_1.count("")>0:
list_of_sentences_1.remove("")
# Alternative 2:
list_of_sentences_2=re.split('\.{1,}|!{1,}|\?{1,}', text)
while list_of_sentences_2.count("")>0:
list_of_sentences_2.remove("")
# Incorrect approach:
# re.split splits the string by the occurrences of the pattern.
# If capturing parentheses, i.e. (), are used in pattern, then the text
# of all groups in the pattern are also returned as part of the resulting list.
# See https://docs.python.org/3/library/re.html#re.split for details
list_of_sentences_false=re.split('(\.|!|\?){1,}', text)
while list_of_sentences_false.count("")>0:
list_of_sentences_false.remove("")
# For comparison, we also include the NLTK tokenizer
list_of_sentences_nltk=sent_tokenize(text)
# Determine total number of sentences
sentence_count=len(list_of_sentences)
sentence_count_1=len(list_of_sentences_1)
sentence_count_2=len(list_of_sentences_2)
sentence_count_false=len(list_of_sentences_false)
sentence_count_nltk=len(list_of_sentences_nltk)
# Ratio of # of words over # of sentences
wps=word_count/sentence_count
wps_1=word_count/sentence_count_1
wps_2=word_count/sentence_count_2
wps_false=word_count/sentence_count_false
wps_nltk=word_count/sentence_count_nltk
# Write cik, file name, total number of words, total number of sentences,
# and WPS to the output file
output_file.write(cik+';'+filename+'_clean.txt;'+str(word_count)+';'+\
str(sentence_count)+';'+str(sentence_count_1)+';'+str(sentence_count_2)+';'+\
str(sentence_count_false)+';'+str(sentence_count_nltk)+';'+str(wps)+';'+\
str(wps_1)+';'+str(wps_2)+';'+str(wps_false)+';'+str(wps_nltk)+'\n')
# Close filing
input_file_10_k.close()
print("Finished")
output_file.close()
input_file.close()