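# NOTE: The following information is residue from the repository web viewer
# this file was copied from; it is not part of the script itself.
# Path: whu-textual-analysis/lectures/programming/solutions/Problem_6_Clean_10-K_Sample.py
# Size: 356 lines, 16 KiB, Python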

# -*- coding: utf-8 -*-
"""
Created on Wed Jul 29 11:07:10 2015
@author: Alexander Hillert, Goethe University Frankfurt
"""
import re
from bs4 import BeautifulSoup
directory = "C:/Lehre/Textual Analysis/Programming/Files/"

# Opening markers of the sections that are removed together with everything up
# to (and including) the next "</DOCUMENT>". Exhibits have the structure
#   <DOCUMENT>
#   <TYPE>EX...
#   ...
#   </DOCUMENT>
# "<TYPE>EX" is a prefix match, so it also covers "<TYPE>EXCEL" exhibits.
# XML, ZIP, GRAPHIC and COVER attachments use their own <TYPE> value and
# therefore need separate entries. The order is the order of removal.
EXHIBIT_TYPE_TAGS = (
    '<TYPE>EX',
    '<TYPE>XML',
    '<TYPE>ZIP',
    '<TYPE>GRAPHIC',
    '<TYPE>COVER',
)

# Patterns that mark the end of the part of the document header that is still
# present after "</SEC-HEADER>" has been cut off. They are tried in this order
# and only the first pattern that matches is used.
# WARNING: These filters may be specific to this sample of 10-Ks.
# Some firms have line breaks instead of whitespaces -> "[ \n]" and not " ".
HEADER_END_PATTERNS = (
    r'(?i)\n {0,}DOCUMENTS[ \n]INCORPORATED[ \n]BY[ \n]REFERENCE {0,}\n',
    r'(?i)\n {0,}table of contents {0,}\n',
    r'(?i)\n {0,}Indicate the number of shares outstanding\.{1,}',
    r'(?i)may be deemed “forwardlooking statements”\.{1,}',
    r'\nPART\.{1,}',
)

# "IV" has to be listed before "I"; otherwise "Part IV" is matched as "Part I"
# and a stray "V" is left behind. The trailing \b prevents the case-insensitive
# pattern from eating "part i" out of ordinary prose such as "take part in".
PART_NUMBER_PATTERN = r'(?i)Part (1|2|3|4|III|II|IV|I)\b'

# Abbreviations whose dot does not indicate the end of a sentence, as
# (pattern, replacement) pairs that are applied in this order.
# The preceding "-" is found in "non-U.S." for example.
# This list is incomplete. In a research project you should spend more time
# on editing the data.
ABBREVIATION_REPLACEMENTS = (
    (r"(?i)(-|\s|\A|,)Inc\.", " Inc"),
    (r"(?i)(-|\s|\A|,)Corp\.", " Corp"),
    (r"(?i)(-|\s|\A|,)Ltd\.", " Ltd"),
    (r"(?i)(-|\s|\A|,)Co\.", " Co"),
    (r"(?i)(-|\s|\A|,)S\.A\.", " SA"),
    (r"(?i)(-|\s|\A|,)U\.S\.", " US"),
    (r"(?i)(-|\s|\A|,)Ms\.", " Ms"),
    (r"(?i)(-|\s|\A|,)Mr\.", " Mr"),
    (r"(?i)(-|\s|\A|,)No\.", " Number"),
    (r"(?i)(-|\s|\A|,)v\.s\.", " vs"),
    (r"(?i)(-|\s|\A|,)St\.", " "),
    (r"(?i)(-|\s|\A|,)Jr\.", " "),
    (r"(?i)(\s|\A|,)Jan\.", " January"),
    (r"(?i)(\s|\A|,)Feb\.", " February"),
    (r"(?i)(\s|\A|,)Mar\.", " March"),
    (r"(?i)(\s|\A|,)Apr\.", " April"),
    (r"(?i)(\s|\A|,)May\.", " May"),
    (r"(?i)(\s|\A|,)Jun\.", " June"),
    (r"(?i)(\s|\A|,)Jul\.", " July"),
    (r"(?i)(\s|\A|,)Aug\.", " August"),
    (r"(?i)(\s|\A|,)Sep\.", " September"),
    (r"(?i)(\s|\A|,)Oct\.", " October"),
    (r"(?i)(\s|\A|,)Nov\.", " November"),
    (r"(?i)(\s|\A|,)Dec\.", " December"),
)


def remove_sections(text: str, open_tag: str, close_tag: str) -> str:
    """Delete every span that runs from open_tag through the next close_tag.

    Both tags are matched literally and case-sensitively. The closing tag is
    always searched from the position of the opening tag, so a stray closing
    tag earlier in the text is ignored. If an opening tag has no closing tag
    after it, removal stops and the remaining text is returned unchanged
    instead of raising an error.
    """
    while True:
        start = text.find(open_tag)
        if start == -1:
            return text
        close = text.find(close_tag, start)
        if close == -1:
            # Unterminated section: no later opening tag can have a closing
            # tag either, so there is nothing more to remove.
            return text
        text = text[:start] + text[close + len(close_tag):]


def strip_markup_sections(raw_text: str) -> str:
    """Remove tables, exhibits, PDFs and the SEC header from a raw filing.

    Returns the remaining markup, with "<div>" and "</div>" replaced by line
    feeds, ready to be handed to the html parser.
    """
    # Remove tables
    raw_text = remove_sections(raw_text, '<TABLE>', '</TABLE>')
    # Remove exhibits and the XML/ZIP/GRAPHIC/COVER attachments
    for type_tag in EXHIBIT_TYPE_TAGS:
        raw_text = remove_sections(raw_text, type_tag, '</DOCUMENT>')
    # Furthermore, there can be also PDF files attached.
    # These attachments caused BeautifulSoup to crash on some computers.
    raw_text = remove_sections(raw_text, '<PDF>', '</PDF>')
    # Remove Document Header - PART 1
    # This condition should work for all 10-K filings as the html tags
    # "<SEC-HEADER>" and "</SEC-HEADER>" are mandatory for all filings.
    header_end = re.search('</SEC-HEADER>', raw_text)
    if header_end:
        raw_text = raw_text[header_end.end():]
    # In some filings, firms do not use line feeds \n but <div> and </div>
    # instead to indicate the start and the end of sentences.
    # "This generic element does nothing more than start on a new line of the
    # running text."
    # see https://wiki.selfhtml.org/wiki/HTML/Textstrukturierung/div
    # and
    # "The <div> tag defines a division or a section in an HTML document.
    # By default, browsers always place a line break before and after the
    # <div> element."
    # See: https://www.w3schools.com/tags/tag_div.asp
    # It is important to replace <div> and </div> by linefeeds because
    # otherwise the entire text will be in a single line and the subsequent
    # commands do not work properly.
    raw_text = raw_text.replace("<div>", "\n")
    raw_text = raw_text.replace("</div>", "\n")
    return raw_text


def strip_leading_header(text: str) -> str:
    """Remove the Document Header - PART II.

    Cutting at "</SEC-HEADER>" does not capture the entire header, so further
    parts at the top of the filing are deleted: the text is cut after the
    first match of the first pattern in HEADER_END_PATTERNS that matches.
    If no pattern matches, the text is returned unchanged.
    """
    for pattern in HEADER_END_PATTERNS:
        match = re.search(pattern, text)
        if match:
            return text[match.end():]
    return text


def delete_part_numbers(text: str) -> str:
    """Delete part labels such as "Part I", "Part IV" or "Part 2"."""
    return re.sub(PART_NUMBER_PATTERN, '', text)


def normalize_abbreviations(text: str) -> str:
    """Apply ABBREVIATION_REPLACEMENTS in order and return the result."""
    for pattern, replacement in ABBREVIATION_REPLACEMENTS:
        text = re.sub(pattern, replacement, text)
    return text


def remove_uppercase_sentences(text: str, max_upper_ratio: float = 0.9) -> str:
    """Delete sentences that consist (almost) only of upper case letters.

    There are sentences that are in upper case letters. However, these are not
    "real" sentences. Examples: "RESTRICTIONS ON TRANSFER OF NOTE."
    or "THIS NOTE AND THE RIGHTS AND OBLIGATIONS EVIDENCED HEREBY ARE
    SUBORDINATED TO THE PRIOR PAYMENT OF CERTAIN OBLIGATIONS [...]"

    The text is split at ".", "!" and "?". A sentence is dropped together with
    its terminator when the fraction of upper case letters among all its
    letters is larger than max_upper_ratio. Sentences without any letter and
    the trailing piece after the last terminator are always kept. Each
    sentence is removed at its own position only, so equal character runs at
    the end of other sentences are left untouched.
    """
    # The capturing group keeps the terminators: the result alternates between
    # sentence and terminator and always ends with a (possibly empty) sentence.
    pieces = re.split(r'(\.|!|\?)', text)
    kept = []
    for sentence, terminator in zip(pieces[0::2], pieces[1::2]):
        # Determine the number of all letters
        total_letters = len(re.findall('[A-Za-z]', sentence))
        if total_letters > 0:
            # Determine the number of upper case letters
            upper_letters = len(re.findall('[A-Z]', sentence))
            if upper_letters / total_letters > max_upper_ratio:
                continue
        kept.append(sentence + terminator)
    kept.append(pieces[-1])
    return ''.join(kept)


def clean_text(text: str) -> str:
    """Clean the plain text of a filing (html code already removed).

    Removes the rest of the header, item/part numbers, numbers, file names and
    URLs, neutralizes dots that do not end a sentence, deletes hyphens, single
    character words and upper case sentences, and tidies up line feeds.
    """
    text = strip_leading_header(text)
    # Delete Item numbers
    text = re.sub(r'(?i)Item {1,}[0-9]{1,}(A|B){0,1}(\s|\.|:|\n)', '', text)
    # Delete Part numbers
    text = delete_part_numbers(text)
    # Delete numbers:
    text = re.sub(r'[0-9]{1,}(,[0-9]{3}){0,}(\.[0-9]{1,}){0,1}', '', text)
    # File names, e.g. exhibit.pdf or picture.jpeg should be removed
    text = re.sub(r"[ |\n]\S{1,}\.(pdf|htm|html|doc|jpg|txt|xml)(?=[ \n\.\?!])", "", text)
    # URLs --> Remove internet addresses
    text = re.sub(r"http:/{0,2}", "", text)
    text = re.sub(r"www\..{1,}\.[a-z]{2,4}(?=[ \n\.\?!])", "", text)

    # In Part 4 of the programming chapter, we will determine the number of
    # words per sentence. To be able to use the same underlying sample,
    # we need to implement further corrections. These changes do not affect
    # the percentage of negative/positive/etc. words.
    # --> Only relevant for determining the number of sentences
    # The text contains dots that do not indicate the end of a sentence.
    # E.g., "Inc." and "St."
    # Replace or remove specific abbreviations
    text = normalize_abbreviations(text)
    # The sequence capital letter -> dot -> capital letter -> dot indicates an
    # abbreviation. Three repetitions of capital letter and dot are also
    # common in filings; we need to check for three instances first.
    text = re.sub(r"( |\n|,)[A-Z]\.[A-Z]\.[A-Z]\.", " ", text)
    # now check for two instances
    text = re.sub(r"( |\n|,)[A-Z]\.[A-Z]\.", " ", text)
    # Dots after a single letter can indicate a middle name (Paul J. Smith)
    # or an abbreviation --> also delete these.
    text = re.sub(r"( |\n|,)[A-Z]\.", "", text)

    # Hyphens can be used to indicate that the word is continued in the next
    # line. For example, "Micro-\nsoft" (\n is the line feed).
    # Replace hyphens followed by a line feed by a hyphen without line feed
    text = text.replace('-\n', '-')
    # Delete the minus/hyphens
    # "Short-term" -> "shortterm"
    text = text.replace('-', '')

    # --> Only relevant for determining the number of sentences
    # Delete dots and commas that are not part of sentences, i.e. commas and
    # dots that are preceded by whitespace or line break and that are followed
    # by whitespace or line break.
    text = re.sub(r'\n(\.|,)\n', '\n', text)
    text = re.sub(r' (\.|,) ', ' ', text)

    # Delete single character words
    # One can argue whether one should implement this procedure. Loughran and
    # McDonald argue in one of their papers in favor of it.
    # To make sure that there is just one letter, we require a non-word
    # character (\W) before and after. We use a positive lookbehind and a
    # positive lookahead for this so that these neighbouring characters do not
    # get deleted as well.
    text = re.sub(r'(?i)(?<=\W)[a-z](?=\W)', ' ', text)

    text_edited = remove_uppercase_sentences(text)

    # --> Only relevant for determining the number of sentences
    # There are a few cases where a dot follows a dot or where a linefeed
    # separates two dots. --> delete the second dot.
    text_edited = text_edited.replace('..', '.')
    text_edited = text_edited.replace('.\n.', '.')

    # The following commands do not influence the subsequent textual analysis.
    # The only purpose is to display the output in a nicer format.
    # Replace lines that contain only whitespaces by a line feed.
    text_edited = re.sub(r'\n {1,}\n', '\n', text_edited)
    # Replace multiple line feeds by one line feed.
    text_edited = re.sub(r'\n{2,}', '\n', text_edited)
    return text_edited


def clean_filing(cik: str, filename: str) -> None:
    """Clean one 10-K of the sample and write the results next to it.

    Reads "<cik>_<filename>" from the "10-K_Sample/" folder below `directory`
    and writes two files into the same folder: the text without html code,
    tables and exhibits ("..._without_HtmlTablesExhibits.txt") and the fully
    cleaned text ("..._clean.txt").
    """
    sample_path = directory + '10-K_Sample/' + cik + '_'
    # Open the 10-K; non-ASCII characters are dropped while reading
    with open(sample_path + filename, 'r', encoding='ascii', errors='ignore') as input_file_10_k:
        input_text_10_k = input_file_10_k.read()

    # the new file name should be "old_name_clean" -> we have to replace ".txt"
    # by "_clean.txt"
    clean_name = filename.replace('.txt', '_clean.txt')
    # To get an idea of what the cleaning commands are doing, it is helpful to
    # write the intermediate version of the text to a file and then compare it
    # to the final file.
    intermediate_name = clean_name.replace('_clean.txt', '_without_HtmlTablesExhibits.txt')

    # Remove html code
    html_text = BeautifulSoup(strip_markup_sections(input_text_10_k), 'html.parser')
    text = html_text.get_text()

    # Output file for the text without html code and without tables+exhibits
    with open(sample_path + intermediate_name, 'w', encoding='ascii', errors='ignore') as output_file_10_k:
        output_file_10_k.write(text)

    # Output file for the pure text
    with open(sample_path + clean_name, 'w', encoding='ascii', errors='ignore') as output_file_10_k:
        output_file_10_k.write(clean_text(text))


def main() -> None:
    """Clean every 10-K listed in the input csv file."""
    # Open the csv file containing the list of the 200 10-Ks
    with open(directory + '10-K_Sample_2011Q1_Input.csv', 'r') as input_file:
        input_text = input_file.read()
    # Split the input file in separate lines. In general, there can be empty
    # lines in the input file; these are dropped.
    input_text_line = [line for line in input_text.split("\n") if line != ""]
    # We subtract 1 from the length, as the first line contains the variable
    # names but not data.
    print("The input file contains "+str(len(input_text_line)-1)+" non-empty lines with data.")

    # Loop over all lines (skipping the line with the variable names)
    for i, line in enumerate(input_text_line[1:], start=1):
        # To see the progress of your program you can print the number of iteration.
        print(str(i))
        # split the lines of the CSV-file into the two variables
        variables = line.split(";")
        # We need the CIK and the filename to open the file
        cik = variables[0]
        filename = variables[1]
        clean_filing(cik, filename)


if __name__ == "__main__":
    main()