# whu-textual-analysis/lectures/programming/solutions/Problem_6_Clean_10-K_Sample.py

# -*- coding: utf-8 -*-
"""
Created on Wed Jul 29 11:07:10 2015

@author: Alexander Hillert, Goethe University Frankfurt
"""

import re
from bs4 import BeautifulSoup

directory="C:/Lehre/Textual Analysis/Programming/Files/"

# Read the csv file containing the list of the 200 10-Ks.
# The "with" statement makes sure that the file handle is released again,
# even if reading the file fails.
with open(directory+'10-K_Sample_2011Q1_Input.csv','r') as input_file:
    input_text=input_file.read()

# Split the input file into separate lines.
# In general, there can be empty lines in the input file. They are dropped
# in the same pass.
input_text_line=[line for line in input_text.split("\n") if line!=""]

# We subtract 1 from the length, as the first line contains the variable names
# but not data.
print("The input file contains "+str(len(input_text_line)-1)+" non-empty lines with data.")

# Loop over all data lines. Index 0 is skipped because the first line contains
# the variable names.
for i in range(1,len(input_text_line)):
    # To see the progress of your program you can print the number of iteration.
    print(str(i))

    # Split the line of the CSV-file into its variables (separator: semicolon).
    variables=input_text_line[i].split(";")
    # We need the CIK and the filename to open the file. A line with fewer
    # than two fields cannot be processed -> report it and go on with the
    # next line instead of stopping the whole run with an IndexError.
    if len(variables)<2:
        print("Line "+str(i)+" does not contain a CIK and a filename -> skipped.")
        continue
    cik=variables[0]
    filename=variables[1]

    # Read the ith 10-K in the list.
    # errors='ignore' drops all characters that are not plain ASCII.
    # The file is closed as soon as its content has been read, so the handle
    # does not stay open while the text is processed.
    with open(directory+'10-K_Sample/'+cik+'_'+filename,'r',encoding='ascii',errors='ignore') as input_file_10_k:
        input_text_10_k=input_file_10_k.read()

    # the new file name should be "old_name_clean" -> we have to replace ".txt"
    # by "_clean.txt"
    filename=filename.replace('.txt','_clean.txt')
    
    # Remove tables, i.e. everything from "<TABLE>" up to and including the
    # next "</TABLE>". Only the exact upper-case tags are recognized.
    start_table=input_text_10_k.find('<TABLE>')
    while start_table!=-1:
        # The closing tag has to be searched AFTER the opening tag. Searching
        # from the beginning of the text could find a "</TABLE>" that is
        # located before the "<TABLE>", which would duplicate text and result
        # in an endless loop.
        end_table=input_text_10_k.find('</TABLE>', start_table)
        if end_table==-1:
            # The table is never closed -> keep the remaining text as it is
            # instead of crashing.
            print("Unclosed <TABLE> in "+cik+"_"+filename+" -> remaining tables are kept.")
            break
        end_table=end_table+len('</TABLE>')
        input_text_10_k=input_text_10_k[:start_table]+input_text_10_k[end_table:]
        start_table=input_text_10_k.find('<TABLE>')
    
    
    ####################### Begin of exhibits removal #########################
    # Exhibits have the following structure
    # <DOCUMENT>
    # <TYPE>EX...
    # ...
    # </DOCUMENT>
    # In the recent years, there are also exhibits with <TYPE>EXCEL
    # -> as we search for "<TYPE>EX", the loop will delete <TYPE>EXCEL exhibits, too.
    #
    # In recent years, there are also XML-, ZIP-, GRAPHIC- and COVER-exhibits.
    # CAUTION: These are <TYPE>XML, <TYPE>ZIP, <TYPE>GRAPHIC and <TYPE>COVER
    # and not <TYPE>EX -> each of them needs its own start marker.
    #
    # Furthermore, there can be also PDF files attached (<PDF> ... </PDF>).
    # These attachments caused BeautifulSoup to crash on some computers.
    #
    # Each entry is (start marker, end marker). The text from the start marker
    # up to and including the next end marker is deleted. The markers are
    # processed in the order listed here.
    attachment_markers=(
        ('<TYPE>EX','</DOCUMENT>'),
        ('<TYPE>XML','</DOCUMENT>'),
        ('<TYPE>ZIP','</DOCUMENT>'),
        ('<TYPE>GRAPHIC','</DOCUMENT>'),
        ('<TYPE>COVER','</DOCUMENT>'),
        ('<PDF>','</PDF>'),
    )
    for start_marker, end_marker in attachment_markers:
        start_exhibit=input_text_10_k.find(start_marker)
        while start_exhibit!=-1:
            # Search the end marker only behind the start marker.
            end_exhibit=input_text_10_k.find(end_marker, start_exhibit)
            if end_exhibit==-1:
                # The attachment is never closed -> keep the remaining text
                # as it is instead of crashing.
                print("No "+end_marker+" after "+start_marker+" in "+cik+"_"+filename+" -> remaining text is kept.")
                break
            end_exhibit=end_exhibit+len(end_marker)
            input_text_10_k=input_text_10_k[:start_exhibit]+input_text_10_k[end_exhibit:]
            start_exhibit=input_text_10_k.find(start_marker)

    ######################## End of exhibits removal ##########################
    
    # Remove Document Header - PART 1
    # Everything up to and including the closing tag "</SEC-HEADER>" is cut off.
    # This condition should work for all 10-K filings as the html tags
    # "<SEC-HEADER>" and "</SEC-HEADER>" are mandatory for all filings.
    # If the tag is not found, the text stays unchanged.
    header_end=input_text_10_k.find('</SEC-HEADER>')
    if header_end!=-1:
        input_text_10_k=input_text_10_k[header_end+len('</SEC-HEADER>'):]


    # In some filings, firms do not use line feeds \n but <div> and </div>
    # instead to indicate the start and the end of sentences.
    # "This generic element does nothing more than start on a new line of the
    # running text."
    # see https://wiki.selfhtml.org/wiki/HTML/Textstrukturierung/div
    # and
    # "The <div> tag defines a division or a section in an HTML document.
    # By default, browsers always place a line break before and after the <div> element."
    # See: https://www.w3schools.com/tags/tag_div.asp
    # It is important to replace <div> and </div> by line feeds because otherwise
    # the entire text will be in a single line and the subsequent commands do
    # not work properly.
    # Only the exact strings "<div>" and "</div>" are replaced.
    for div_tag in ("<div>", "</div>"):
        input_text_10_k=input_text_10_k.replace(div_tag, "\n")


    # Remove html code: parse the text and keep only the text content
    html_text=BeautifulSoup(input_text_10_k, 'html.parser')
    text=html_text.get_text()
    
    
    # To get an idea of what the commands below are doing, it is helpful to
    # write the current version of the text to a file and then compare it to the
    # final file.
    filename2=filename.replace('_clean.txt','_without_HtmlTablesExhibits.txt')
    # Write the text without html code and without tables+exhibits.
    # errors='ignore' drops all characters that cannot be written as ASCII.
    # The "with" statement closes the file even if writing fails.
    with open(directory+'10-K_Sample/'+cik+'_'+filename2,'w',encoding='ascii',errors='ignore') as output_file_10_k:
        output_file_10_k.write(text)
    
    
    # Remove the Document Header - PART II
    # The above command to remove the header ("</SEC-HEADER>") does not capture
    # the entire header -> we need to delete further parts at the top of the filing.
    # WARNING: The filters below may be specific to this sample of 10-Ks.
    # Some firms have line breaks instead of whitespaces -> use "[ \n]" and not just " ".
    #
    # The patterns are tried in the order listed. For the FIRST pattern that is
    # found, everything up to the end of the match is cut off and the remaining
    # patterns are not tried. If no pattern is found, the text stays unchanged.
    # The patterns are raw strings (r'...') so that regex escapes like "\."
    # are not treated as (invalid) Python string escapes.
    header_end_patterns=(
        r'(?i)\n {0,}DOCUMENTS[ \n]INCORPORATED[ \n]BY[ \n]REFERENCE {0,}\n',
        r'(?i)\n {0,}table of contents {0,}\n',
        r'(?i)\n {0,}Indicate the number of shares outstanding\.{1,}',
        r'(?i)may be deemed “forwardlooking statements”\.{1,}',
        r'\nPART\.{1,}',
    )
    for header_end_pattern in header_end_patterns:
        variable=re.search(header_end_pattern, text)
        if variable:
            text=text[variable.end():]
            break
    
    
    # All patterns are raw strings (r'...') so that regex escapes like "\."
    # or "\S" are not treated as (invalid) Python string escapes.

    # Delete Item numbers, e.g. "Item 1A." or "Item 7:"
    text=re.sub(r'(?i)Item {1,}[0-9]{1,}(A|B){0,1}(\s|\.|:|\n)','',text)
    # Delete Part numbers, e.g. "Part II" or "Part 3"
    # The trailing \b makes sure that the number is complete. Without it,
    # the case-insensitive "Part I" also matches inside "part in"/"part is"
    # and "Part IV" is only removed up to the "I".
    text=re.sub(r'(?i)Part (1|2|3|4|III|II|IV|I)\b','',text)

    # Delete numbers (with optional thousands separators and decimals):
    text=re.sub(r'[0-9]{1,}(,[0-9]{3}){0,}(\.[0-9]{1,}){0,1}','',text)

    # File names, e.g. exhibit.pdf or picture.jpeg should be removed
    # NOTE: inside the character class "[ |\n]" the "|" is a literal pipe,
    # i.e. the file name may be preceded by a blank, a pipe or a line feed.
    text=re.sub(r"[ |\n]\S{1,}\.(pdf|htm|html|doc|jpg|txt|xml)(?=[ \n\.\?!])", "", text)

    # URLs --> Remove internet addresses
    # Remove the protocol prefix ("http://" as well as "https://").
    text=re.sub(r"https?:/{0,2}", "", text)
    # "\S" (no whitespace) instead of "." keeps the match inside ONE address.
    # With "." the greedy search would delete all text between two addresses
    # that are located in the same line.
    text=re.sub(r"www\.\S{1,}\.[a-z]{2,4}(?=[ \n\.\?!])", "", text)
    
    
    # In Part 4 of the programming chapter, we will determine the number of
    # words per sentence. To be able to use the same underlying sample,
    # we need to implement further corrections. These changes do not affect
    # the percentage of negative/positive/etc. words.
    # --> Only relevant for determining the number of sentences
    # The text contains dots that do not indicate the end of a sentence.
    # E.g., "Inc." and "St."
    # Replace or remove specific abbreviations.
    # This list is incomplete. In a research project you should spend more time
    # on editing the data.
    #
    # All patterns are raw strings (r'...') so that regex escapes like "\s",
    # "\A" or "\." are not treated as (invalid) Python string escapes.
    #
    # An abbreviation is only replaced if it is preceded by a hyphen, a
    # whitespace character, the beginning of the text or a comma. The preceding
    # character is part of the match and is replaced, too.
    # The preceding - is found in non-U.S. for example.
    abbreviation_prefix=r"(?i)(-|\s|\A|,)"
    # (abbreviation, replacement) -> applied in the order listed
    # NOTE(review): the search is case-insensitive, so "No\." also matches the
    # word "no" at the end of a sentence ("... is no.") -> confirm this is intended.
    abbreviations=(
        (r"Inc\.", " Inc"),
        (r"Corp\.", " Corp"),
        (r"Ltd\.", " Ltd"),
        (r"Co\.", " Co"),
        (r"S\.A\.", " SA"),
        (r"U\.S\.", " US"),
        (r"Ms\.", " Ms"),
        (r"Mr\.", " Mr"),
        (r"No\.", " Number"),
        (r"v\.s\.", " vs"),
        (r"St\.", " "),
        (r"Jr\.", " "),
    )
    for abbreviation, replacement in abbreviations:
        text=re.sub(abbreviation_prefix+abbreviation, replacement, text)

    # Abbreviated months: same logic, but a preceding hyphen is not accepted.
    # NOTE(review): "May" is not an abbreviation. The rule removes the dot of
    # every "May."/"may." at the end of a sentence -> confirm this is intended.
    month_prefix=r"(?i)(\s|\A|,)"
    months=(
        (r"Jan\.", " January"),
        (r"Feb\.", " February"),
        (r"Mar\.", " March"),
        (r"Apr\.", " April"),
        (r"May\.", " May"),
        (r"Jun\.", " June"),
        (r"Jul\.", " July"),
        (r"Aug\.", " August"),
        (r"Sep\.", " September"),
        (r"Oct\.", " October"),
        (r"Nov\.", " November"),
        (r"Dec\.", " December"),
    )
    for month, month_name in months:
        text=re.sub(month_prefix+month, month_name, text)

    # The sequence capital letter -> dot -> capital letter -> dot indicates an abbreviation
    # three repetitions of capital letter and dot are also common in filings
    # we need to check for three instances first.
    text=re.sub(r"( |\n|,)[A-Z]\.[A-Z]\.[A-Z]\.", " ", text)
    # now check for two instances
    text=re.sub(r"( |\n|,)[A-Z]\.[A-Z]\.", " ", text)

    # Dots after a single letter can indicate a middle name (Paul J. Smith)
    # or an abbreviation --> also delete these (including the preceding
    # blank, line feed or comma).
    text=re.sub(r"( |\n|,)[A-Z]\.", "", text)
    
    
    # Hyphens can be used to indicate that the word is continued in the next
    # line. For example, "Micro-\nsoft" (\n is the line feed).
    # Replace hyphens followed by a line feed by a hyphen without line feed
    text=text.replace('-\n','-')

    # Delete the minus/hyphens
    # "Short-term" -> "Shortterm"
    text=text.replace('-','')


    # --> Only relevant for determining the number of sentences
    # Delete dots and commas that are not part of sentences, i.e. commas and dots
    # that are preceded by whitespace or line break and that are followed by
    # whitespace or line break.
    # The patterns are raw strings (r'...') so that "\." and "\W" are not
    # treated as (invalid) Python string escapes.
    text=re.sub(r'\n(\.|,)\n','\n',text)
    text=re.sub(r' (\.|,) ',' ',text)

    # Delete single character words
    # One can argue whether one should implement this procedure. Loughran and
    # McDonald argue in one of their papers in favor of it.
    # To make sure that there is just one letter, we require that there is a
    # non-word character (\W) before and after the letter. We use a positive
    # lookbehind and a positive lookahead condition for this to assure that
    # the surrounding characters do not get deleted as well.
    # The letter itself is replaced by a blank.
    text=re.sub(r'(?i)(?<=\W)[a-z](?=\W)',' ',text)
    
    
    # There are sentences that are in upper case letters. However, these are not
    # "real" sentences. Examples: "RESTRICTIONS ON TRANSFER OF NOTE."
    # or "THIS NOTE AND THE RIGHTS AND OBLIGATIONS EVIDENCED HEREBY ARE
    # SUBORDINATED TO THE PRIOR PAYMENT OF CERTAIN OBLIGATIONS [...]"
    #
    # Split text in sentences. Because of the parentheses in the pattern, the
    # sentence terminators (. ! ?) are kept as separate list elements:
    # [sentence, terminator, sentence, terminator, ..., remaining text]
    list_sentences=re.split(r'(\.|!|\?)', text)
    # The edited text is rebuilt from the sentences that are kept. In contrast
    # to deleting the sentence via text.replace(), this removes exactly the
    # upper case sentence itself and not identical character sequences that
    # are part of other sentences. It also needs only one pass over the text.
    kept_sentences=[]
    for sentence, terminator in zip(list_sentences[0::2], list_sentences[1::2]):
        # Determine the number of upper case letters
        upper_letters=len(re.findall('[A-Z]',sentence))
        # Determine the number of all letters
        total_letters=len(re.findall('[A-Za-z]',sentence))
        # If there is at least one letter and the fraction of upper case
        # letters is larger than 0.9, drop the sentence and its terminator.
        if total_letters>0 and upper_letters/total_letters>0.9:
            continue
        kept_sentences.append(sentence+terminator)
    # The text behind the last terminator is not a complete sentence
    # -> it is kept unchanged.
    kept_sentences.append(list_sentences[-1])
    # We save the edited text in a new variable
    text_edited=''.join(kept_sentences)
    
    
    # --> Only relevant for determining the number of sentences
    # There are a few cases where a dot follows a dot or where a linefeed 
    # separates two dots. --> delete the second dot.
    text_edited=text_edited.replace('..','.')
    text_edited=text_edited.replace('.\n.','.')
    
    # The following commands do not influence the subsequent textual analysis.
    # The only purpose is to display the output in a nicer format.
    # Replace lines that contain only whitespaces by a line feed.
    text_edited=re.sub('\n {1,}\n','\n',text_edited)
    
    # Replace multiple line feeds by one line feed.
    text_edited=re.sub('\n{2,}','\n',text_edited)
    
    
    # Open the output file for the pure text
    output_file_10_k=open(directory+'10-K_Sample/'+cik+'_'+filename,'w',encoding='ascii',errors='ignore')
    output_file_10_k.write(text_edited)
    output_file_10_k.close()
    input_file_10_k.close()

input_file.close()
Add programming files - add the code files provided by the instructor - the programming/files folder with the data files is NOT included here due to its size - add a .gitignore file to exclude the data files' folder 2022-08-05 00:05:05 +02:00			`# -- coding: utf-8 --`
			`"""`
			`Created on Wed Jul 29 11:07:10 2015`

			`@author: Alexander Hillert, Goethe University Frankfurt`
			`"""`

			`import re`
			`from bs4 import BeautifulSoup`

			`directory="C:/Lehre/Textual Analysis/Programming/Files/"`

			`# Open the csv file containing the list of the 200 10-Ks`
			`input_file=open(directory+'10-K_Sample_2011Q1_Input.csv','r')`
			`input_text=input_file.read()`

			`# Split the input file in separate lines`
			`input_text_line=input_text.split("\n")`

			`# In general, there can be empty lines in the iput file. The following command`
			`# deletes these lines`
			`while input_text_line.count("")>0:`
			`input_text_line.remove("")`

			`print("The input file contains "+str(len(input_text_line)-1)+" non-empty lines with data.")`
			`# We subtract 1 from the lenght, as the first line contains the variable names but not data.`

			`# Loop over all lines`
			`for i in range(1,len(input_text_line)):`
			`# To see the progress of your program you can print the number of iteration.`
			`print(str(i))`

			`# split the lines of the CSV-file into the two variables`
			`variables=input_text_line[i].split(";")`
			`# We need the CIK and the filename to open the file`
			`cik=variables[0]`
			`filename=variables[1]`

			`# Open the ith 10-K in the list`
			`input_file_10_k=open(directory+'10-K_Sample/'+cik+'_'+filename,'r',encoding='ascii',errors='ignore')`
			`input_text_10_k=input_file_10_k.read()`

			`# the new file name should be "old_name_clean" -> we have to replace ".txt"`
			`# by "_clean.txt"`
			`filename=filename.replace('.txt','_clean.txt')`

			`# Remove tables`
			`variable=re.search('<TABLE>', input_text_10_k)`
			`while variable:`
			`variable=re.search('<TABLE>', input_text_10_k)`
			`start_table=variable.start()`
			`variable=re.search('</TABLE>', input_text_10_k)`
			`end_table=variable.end()`
			`input_text_10_k=input_text_10_k[:(start_table)]+input_text_10_k[(end_table):]`
			`variable=re.search('<TABLE>', input_text_10_k)`


			`####################### Begin of exhibits removal #########################`
			`# Exhibits have the following structure`
			`# <DOCUMENT>`
			`# <TYPE>EX...`
			`# ...`
			`# </DOCUMENT>`
			`# In the recent years, there are also exhibits with <TYPE>EXCEL`
			`# -> as we search for "<TYPE>EX", the loop will delete <TYPE>EXCEL exhibits, too.`
			`variable=re.search('<TYPE>EX', input_text_10_k)`
			`while variable:`
			`variable=re.search('<TYPE>EX', input_text_10_k)`
			`start_exhibit=variable.start()`
			`variable=re.search('</DOCUMENT>', input_text_10_k[start_exhibit:])`
			`end_exhibit=start_exhibit+variable.end()`
			`input_text_10_k=input_text_10_k[:(start_exhibit)]+input_text_10_k[(end_exhibit):]`
			`variable=re.search('<TYPE>EX', input_text_10_k)`

			`# In recent years, there are also XML-Exibits.`
			`# CAUTION: These are <TYPE>XML and not <TYPE>EX -> need separate cleaning`
			`# Remove XML-Exhibits, which have the following structure`
			`# <DOCUMENT>`
			`# <TYPE>XML`
			`# ...`
			`# </DOCUMENT>`
			`variable=re.search('<TYPE>XML', input_text_10_k)`
			`while variable:`
			`variable=re.search('<TYPE>XML', input_text_10_k)`
			`start_exhibit=variable.start()`
			`variable=re.search('</DOCUMENT>', input_text_10_k[start_exhibit:])`
			`end_exhibit=start_exhibit+variable.end()`
			`input_text_10_k=input_text_10_k[:start_exhibit]+input_text_10_k[end_exhibit:]`
			`variable=re.search('<TYPE>XML', input_text_10_k)`

			`# Furthermore, also in recent years, there are also ZIP-Exibits.`
			`# CAUTION: These are <TYPE>ZIP and not <TYPE>EX -> need separate cleaning`
			`# Remove ZIP-Exhibits, which have the following structure`
			`# <DOCUMENT>`
			`# <TYPE>ZIP`
			`# ...`
			`# </DOCUMENT>`
			`variable=re.search('<TYPE>ZIP', input_text_10_k)`
			`while variable:`
			`variable=re.search('<TYPE>ZIP', input_text_10_k)`
			`start_exhibit=variable.start()`
			`variable=re.search('</DOCUMENT>', input_text_10_k[start_exhibit:])`
			`end_exhibit=start_exhibit+variable.end()`
			`input_text_10_k=input_text_10_k[:start_exhibit]+input_text_10_k[end_exhibit:]`
			`variable=re.search('<TYPE>ZIP', input_text_10_k)`

			`# In addition, there are many Graphic-Exibits.`
			`# CAUTION: These are <TYPE>GRAPHIC and not <TYPE>EX -> need separate cleaning`
			`# Remove GRAPHIC-Exhibits, which have the following structure`
			`# <DOCUMENT>`
			`# <TYPE>GRAPHIC`
			`# ...`
			`# </DOCUMENT>`
			`variable=re.search('<TYPE>GRAPHIC', input_text_10_k)`
			`while variable:`
			`variable=re.search('<TYPE>GRAPHIC', input_text_10_k)`
			`start_exhibit=variable.start()`
			`variable=re.search('</DOCUMENT>', input_text_10_k[start_exhibit:])`
			`end_exhibit=start_exhibit+variable.end()`
			`input_text_10_k=input_text_10_k[:start_exhibit]+input_text_10_k[end_exhibit:]`
			`variable=re.search('<TYPE>GRAPHIC', input_text_10_k)`

			`# Furthermore, there can be also Cover-Exibits.`
			`# CAUTION: These are <TYPE>COVER and not <TYPE>EX -> need separate cleaning`
			`# Remove COVER-Exhibits, which have the following structure`
			`# <DOCUMENT>`
			`# <TYPE>COVER`
			`# ...`
			`# </DOCUMENT>`
			`variable=re.search('<TYPE>COVER', input_text_10_k)`
			`while variable:`
			`variable=re.search('<TYPE>COVER', input_text_10_k)`
			`start_exhibit=variable.start()`
			`variable=re.search('</DOCUMENT>', input_text_10_k[start_exhibit:])`
			`end_exhibit=start_exhibit+variable.end()`
			`input_text_10_k=input_text_10_k[:start_exhibit]+input_text_10_k[end_exhibit:]`
			`variable=re.search('<TYPE>COVER', input_text_10_k)`

			`# Furthermore, there can be also PDF files attached.`
			`# These attachments caused BeautifulSoup to crash on some computers.`
			`# Remove PDFs`
			`variable=re.search('<PDF>', input_text_10_k)`
			`while variable:`
			`variable=re.search('<PDF>', input_text_10_k)`
			`start_pdf=variable.start()`
			`variable=re.search('</PDF>', input_text_10_k[start_pdf:])`
			`end_pdf=start_pdf+variable.end()`
			`input_text_10_k=input_text_10_k[:(start_pdf)]+input_text_10_k[(end_pdf):]`
			`variable=re.search('<PDF>', input_text_10_k)`

			`######################## End of exhibits removal ##########################`

			`# Remove Document Header - PART 1`
			`# This condition should work for all 10-K filings as the hmtl tags "<SEC-HEADER>"`
			`# and "</SEC-HEADER>" are mandatory for all filings.`
			`variable=re.search('</SEC-HEADER>', input_text_10_k)`
			`if variable:`
			`input_text_10_k=input_text_10_k[variable.end():]`


			`# In some filings, firms do not use line feeds \n but <div> and </div>`
			`# instead to indicate the start and the end of sentences.`
			`# "Dieses allgemeine Element bewirkt nichts weiter als dass es in einer`
			`# neuen Zeile des Fließtextes beginnt."`
			`# see https://wiki.selfhtml.org/wiki/HTML/Textstrukturierung/div`
			`# and`
			`# "The <div> tag defines a division or a section in an HTML document.`
			`# By default, browsers always place a line break before and after the <div> element."`
			`# See: https://www.w3schools.com/tags/tag_div.asp`
			`# It is important to replace <div> and </div> by linefeeds because otherwise`
			`# the entire text will be in a single line and the subsequent commands do`
			`# not work properly.`
			`input_text_10_k=input_text_10_k.replace("<div>", "\n")`
			`input_text_10_k=input_text_10_k.replace("</div>", "\n")`


			`# Remove html code`
			`html_text=BeautifulSoup(input_text_10_k, 'html.parser')`
			`text=html_text.get_text()`


			`# To get an idea of what the commands below are doing, it is helpful to`
			`# write the current version of the text to a file and then compare it to the`
			`# final file.`
			`filename2=filename.replace('_clean.txt','_without_HtmlTablesExhibits.txt')`
			`# Open the output file for the text without html code and without tables+exhibits`
			`output_file_10_k=open(directory+'10-K_Sample/'+cik+'_'+filename2,'w',encoding='ascii',errors='ignore')`
			`output_file_10_k.write(text)`
			`output_file_10_k.close()`


			`# Remove the Document Header - PART II`
			`# The above command to remove the header ("</SEC-HEADER>") does not capture`
			`# the entire header -> we need to delete further parts at the top the filing.`
			`# WARNING: The filters below may be specific to this sample of 10-Ks.`
			`# Some firms have line breaks instead of whitespaces -> use "[ \n]" and not just " ".`
			`variable=re.search('(?i)\n {0,}DOCUMENTS[ \n]INCORPORATED[ \n]BY[ \n]REFERENCE {0,}\n', text)`
			`if variable:`
			`text=text[variable.end():]`
			`else:`
			`variable=re.search('(?i)\n {0,}table of contents {0,}\n', text)`
			`if variable:`
			`text=text[variable.end():]`
			`else:`
			`variable=re.search('(?i)\n {0,}Indicate the number of shares outstanding\.{1,}', text)`
			`if variable:`
			`text=text[variable.end():]`
			`else:`
			`variable=re.search('(?i)may be deemed “forwardlooking statements”\.{1,}', text)`
			`if variable:`
			`text=text[variable.end():]`
			`else:`
			`variable=re.search('\nPART\.{1,}', text)`
			`if variable:`
			`text=text[variable.end():]`


			`# Delete Item numbers`
			`text=re.sub('(?i)Item {1,}[0-9]{1,}(A\|B){0,1}(\s\|\.\|:\|\n)','',text)`
			`# Delete Part numbers`
			`text=re.sub('(?i)Part (1\|2\|3\|4\|III\|II\|I\|IV)','',text)`

			`# Delete numbers:`
			`text=re.sub('[0-9]{1,}(,[0-9]{3}){0,}(\.[0-9]{1,}){0,1}','',text)`

			`# File names, e.g. exhibit.pdf or picture.jpeg should be removed`
			`text=re.sub("[ \|\n]\S{1,}\.(pdf\|htm\|html\|doc\|jpg\|txt\|xml)(?=[ \n\.\?!])", "", text)`

			`# URLs --> Remove internet addresse`
			`text=re.sub("http:/{0,2}", "", text)`
			`text=re.sub("www\..{1,}\.[a-z]{2,4}(?=[ \n\.\?!])", "", text)`


			`# In Part 4 of the programming chapter, we will determine the number of`
			`# words per sentence. To be able to use the same underlying sample,`
			`# we need to implement further corrections. These changes do not affect`
			`# the percentage of negative/positive/etc. words.`
			`# --> Only relevant for determining the number of sentences`
			`# The text contains dots that do not indicate the end of a sentence.`
			`# E.g., "Inc." and "St."`
			`# The preceding - is found in non-U.S. for example.`
			`# Replace or remove specific abreviations`
			`# This list is incomplete. In a research project you should spend more time`
			`# on editing the data.`
			`text=re.sub("(?i)(-\|\s\|\A\|,)Inc\.", " Inc", text)`
			`text=re.sub("(?i)(-\|\s\|\A\|,)Corp\.", " Corp", text)`
			`text=re.sub("(?i)(-\|\s\|\A\|,)Ltd\.", " Ltd", text)`
			`text=re.sub("(?i)(-\|\s\|\A\|,)Co\.", " Co", text)`
			`text=re.sub("(?i)(-\|\s\|\A\|,)S\.A\.", " SA", text)`
			`text=re.sub("(?i)(-\|\s\|\A\|,)U\.S\.", " US", text)`
			`text=re.sub("(?i)(-\|\s\|\A\|,)Ms\.", " Ms", text)`
			`text=re.sub("(?i)(-\|\s\|\A\|,)Mr\.", " Mr", text)`
			`text=re.sub("(?i)(-\|\s\|\A\|,)No\.", " Number", text)`
			`text=re.sub("(?i)(-\|\s\|\A\|,)v\.s\.", " vs", text)`
			`text=re.sub("(?i)(-\|\s\|\A\|,)St\.", " ", text)`
			`text=re.sub("(?i)(-\|\s\|\A\|,)Jr\.", " ", text)`

			`text=re.sub("(?i)(\s\|\A\|,)Jan\.", " January", text)`
			`text=re.sub("(?i)(\s\|\A\|,)Feb\.", " February", text)`
			`text=re.sub("(?i)(\s\|\A\|,)Mar\.", " March", text)`
			`text=re.sub("(?i)(\s\|\A\|,)Apr\.", " April", text)`
			`text=re.sub("(?i)(\s\|\A\|,)May\.", " May", text)`
			`text=re.sub("(?i)(\s\|\A\|,)Jun\.", " June", text)`
			`text=re.sub("(?i)(\s\|\A\|,)Jul\.", " July", text)`
			`text=re.sub("(?i)(\s\|\A\|,)Aug\.", " August", text)`
			`text=re.sub("(?i)(\s\|\A\|,)Sep\.", " September", text)`
			`text=re.sub("(?i)(\s\|\A\|,)Oct\.", " October", text)`
			`text=re.sub("(?i)(\s\|\A\|,)Nov\.", " November", text)`
			`text=re.sub("(?i)(\s\|\A\|,)Dec\.", " December", text)`

			`# The sequence capital letter -> dot -> capital letter -> dot indicates an abbreviation`
			`# three repitions of capital letter and dot are also common in filings`
			`# we need to check for three instances first.`
			`text=re.sub("( \|\n\|,)[A-Z]\.[A-Z]\.[A-Z]\.", " ", text)`
			`# now check for two instances`
			`text=re.sub("( \|\n\|,)[A-Z]\.[A-Z]\.", " ", text)`

			`# Dots after a single letter can indicate a middle Name Paul J. Smith`
			`# or an abbreviation --> also delete these.`
			`text=re.sub("( \|\n\|,)[A-Z]\.", "", text)`


			`# Hyphens can be used to indicate that the word is continued in the next`
			`# line. For example, "Micro-\nsoft" (\n is the line feed).`
			`# Replace hyphens followed by a line feed by a hyphen without line feed`
			`text=re.sub('-\n','-',text)`

			`# Delete the minus/hyphens`
			`# "Short-term" -> "shortterm"`
			`text=re.sub('-','',text)`


			`# --> Only relevant for determining the number of sentences`
			`# Delete dots and commas that are not part of sentences, i.e. commas and dots`
			`# that are preceded by whitespace or line break and that are followed by`
			`# whitespace or line break.`
			`text=re.sub('\n(\.\|,)\n','\n',text)`
			`text=re.sub(' (\.\|,) ',' ',text)`

			`# Delete single character words`
			`# One can argue whether one should implement this procedure. Loughran and`
			`# McDonald argue in one of their papers in favor of it.`
			`# To make sure that there is just one letter, we require that there is a word`
			`# boundary (\W) before and after. We use a positive backward looking and a`
			`# positive forward looking condition for this to assure that the word boundary`
			`# get not deleted as well.`
			`text=re.sub('(?i)(?<=\W)[a-z](?=\W)',' ',text)`


			`# There are sentences that are in upper case letters. However, these are not`
			`# "real" sentences. Examples: "RESTRICTIONS ON TRANSFER OF NOTE."`
			`# or "THIS NOTE AND THE RIGHTS AND OBLIGATIONS EVIDENCED HEREBY ARE`
			`# SUBORDINATED TO THE PRIOR PAYMENT OF CERTAIN OBLIGATIONS [...]"`
			`# We save the edited text in a new variable`
			`text_edited=text`
			`# Split text in sentences`
			`list_sentences=re.split('\.\|!\|\?', text)`
			`# iterate the list of all sentences`
			`for j in range(0,len(list_sentences)):`
			`# Determine the number of upper case letters`
			`upper_letters=len(re.findall('[A-Z]',list_sentences[j]))`
			`# Determine the number of all letters`
			`total_letters=len(re.findall('[A-Za-z]',list_sentences[j]))`
			`# If there is at least one letter calculate the fraction of upper case letters`
			`if total_letters>0:`
			`ratio=upper_letters/total_letters`
			`# If the fraction of upper case letters is larger than 0.9 delete`
			`# the sentence from the text.`
			`if ratio>0.9:`
			`text_edited=text_edited.replace(list_sentences[j]+'.','')`
			`text_edited=text_edited.replace(list_sentences[j]+'!','')`
			`text_edited=text_edited.replace(list_sentences[j]+'?','')`


			`# --> Only relevant for determining the number of sentences`
			`# There are a few cases where a dot follows a dot or where a linefeed`
			`# separates two dots. --> delete the second dot.`
			`text_edited=text_edited.replace('..','.')`
			`text_edited=text_edited.replace('.\n.','.')`

			`# The following commands do not influence the subsequent textual analysis.`
			`# The only purpose is to display the output in a nicer format.`
			`# Replace lines that contain only whitespaces by a line feed.`
			`text_edited=re.sub('\n {1,}\n','\n',text_edited)`

			`# Replace multiple line feeds by one line feed.`
			`text_edited=re.sub('\n{2,}','\n',text_edited)`


			`# Open the output file for the pure text`
			`output_file_10_k=open(directory+'10-K_Sample/'+cik+'_'+filename,'w',encoding='ascii',errors='ignore')`
			`output_file_10_k.write(text_edited)`
			`output_file_10_k.close()`
			`input_file_10_k.close()`

			`input_file.close()`