# -*- coding: utf-8 -*-
"""
Created on Wed Jul 29 11:07:10 2015

@author: Alexander Hillert, Goethe University Frankfurt
"""
import re
from bs4 import BeautifulSoup

directory="C:/Lehre/Textual Analysis/Programming/Files/"

# Open the csv file containing the list of the 200 10-Ks
input_file=open(directory+'10-K_Sample_2011Q1_Input.csv','r')
input_text=input_file.read()

# Split the input file into separate lines
input_text_line=input_text.split("\n")
# In general, there can be empty lines in the input file. The following command
# deletes these lines.
while input_text_line.count("")>0:
    input_text_line.remove("")
print("The input file contains "+str(len(input_text_line)-1)+" non-empty lines with data.")
# We subtract 1 from the length, as the first line contains the variable names but no data.

# Loop over all lines
for i in range(1,len(input_text_line)):
    # To see the progress of your program you can print the number of the iteration.
    print(str(i))
    # Split the line of the CSV file into the two variables
    variables=input_text_line[i].split(";")
    # We need the CIK and the filename to open the file
    cik=variables[0]
    filename=variables[1]

    # Open the ith 10-K in the list
    input_file_10_k=open(directory+'10-K_Sample/'+cik+'_'+filename,'r',encoding='ascii',errors='ignore')
    input_text_10_k=input_file_10_k.read()

    # The new file name should be "old_name_clean" -> we have to replace ".txt"
    # by "_clean.txt"
    filename=filename.replace('.txt','_clean.txt')

    # Remove tables, i.e. everything between <TABLE> and </TABLE>
    variable=re.search('<TABLE>', input_text_10_k)
    while variable:
        variable=re.search('<TABLE>', input_text_10_k)
        start_table=variable.start()
        variable=re.search('</TABLE>', input_text_10_k)
        end_table=variable.end()
        input_text_10_k=input_text_10_k[:start_table]+input_text_10_k[end_table:]
        variable=re.search('<TABLE>', input_text_10_k)

    ####################### Begin of exhibits removal #########################
    # Exhibits have the following structure:
    # <DOCUMENT>
    # <TYPE>EX...
    # ...
    # </DOCUMENT>
    # In recent years, there are also exhibits with <TYPE>EXCEL
    # -> as we search for "<TYPE>EX", the loop will delete EXCEL exhibits, too.
    variable=re.search('<TYPE>EX', input_text_10_k)
    while variable:
        variable=re.search('<TYPE>EX', input_text_10_k)
        start_exhibit=variable.start()
        variable=re.search('</DOCUMENT>', input_text_10_k[start_exhibit:])
        end_exhibit=start_exhibit+variable.end()
        input_text_10_k=input_text_10_k[:start_exhibit]+input_text_10_k[end_exhibit:]
        variable=re.search('<TYPE>EX', input_text_10_k)

    # In recent years, there are also XML exhibits.
    # CAUTION: These are <TYPE>XML and not <TYPE>EX -> they need separate cleaning.
    # Remove XML exhibits, which have the following structure:
    # <DOCUMENT>
    # <TYPE>XML
    # ...
    # </DOCUMENT>
    variable=re.search('<TYPE>XML', input_text_10_k)
    while variable:
        variable=re.search('<TYPE>XML', input_text_10_k)
        start_exhibit=variable.start()
        variable=re.search('</DOCUMENT>', input_text_10_k[start_exhibit:])
        end_exhibit=start_exhibit+variable.end()
        input_text_10_k=input_text_10_k[:start_exhibit]+input_text_10_k[end_exhibit:]
        variable=re.search('<TYPE>XML', input_text_10_k)

    # Furthermore, also in recent years, there are ZIP exhibits.
    # CAUTION: These are <TYPE>ZIP and not <TYPE>EX -> they need separate cleaning.
    # Remove ZIP exhibits, which have the following structure:
    # <DOCUMENT>
    # <TYPE>ZIP
    # ...
    # </DOCUMENT>
    variable=re.search('<TYPE>ZIP', input_text_10_k)
    while variable:
        variable=re.search('<TYPE>ZIP', input_text_10_k)
        start_exhibit=variable.start()
        variable=re.search('</DOCUMENT>', input_text_10_k[start_exhibit:])
        end_exhibit=start_exhibit+variable.end()
        input_text_10_k=input_text_10_k[:start_exhibit]+input_text_10_k[end_exhibit:]
        variable=re.search('<TYPE>ZIP', input_text_10_k)

    # In addition, there are many GRAPHIC exhibits.
    # CAUTION: These are <TYPE>GRAPHIC and not <TYPE>EX -> they need separate cleaning.
    # Remove GRAPHIC exhibits, which have the following structure:
    # <DOCUMENT>
    # <TYPE>GRAPHIC
    # ...
    # </DOCUMENT>
    variable=re.search('<TYPE>GRAPHIC', input_text_10_k)
    while variable:
        variable=re.search('<TYPE>GRAPHIC', input_text_10_k)
        start_exhibit=variable.start()
        variable=re.search('</DOCUMENT>', input_text_10_k[start_exhibit:])
        end_exhibit=start_exhibit+variable.end()
        input_text_10_k=input_text_10_k[:start_exhibit]+input_text_10_k[end_exhibit:]
        variable=re.search('<TYPE>GRAPHIC', input_text_10_k)

    # Furthermore, there can also be COVER exhibits.
    # CAUTION: These are <TYPE>COVER and not <TYPE>EX -> they need separate cleaning.
    # Remove COVER exhibits, which have the following structure:
    # <DOCUMENT>
    # <TYPE>COVER
    # ...
    # </DOCUMENT>
    variable=re.search('<TYPE>COVER', input_text_10_k)
    while variable:
        variable=re.search('<TYPE>COVER', input_text_10_k)
        start_exhibit=variable.start()
        variable=re.search('</DOCUMENT>', input_text_10_k[start_exhibit:])
        end_exhibit=start_exhibit+variable.end()
        input_text_10_k=input_text_10_k[:start_exhibit]+input_text_10_k[end_exhibit:]
        variable=re.search('<TYPE>COVER', input_text_10_k)
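    # Note (added illustration): the table removal above, the five exhibit-removal
    # loops, and the PDF removal below all follow the same recipe -- find a start
    # marker such as "<TYPE>EX", find the next "</DOCUMENT>" after it, and cut out
    # the slice between the two positions. A generic helper implementing this
    # recipe once is sketched at the end of this file.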
    # Furthermore, there can also be PDF files attached.
    # These attachments caused BeautifulSoup to crash on some computers.
    # Remove PDFs, i.e. everything between <PDF> and </PDF>
    variable=re.search('<PDF>', input_text_10_k)
    while variable:
        variable=re.search('<PDF>', input_text_10_k)
        start_pdf=variable.start()
        variable=re.search('</PDF>', input_text_10_k[start_pdf:])
        end_pdf=start_pdf+variable.end()
        input_text_10_k=input_text_10_k[:start_pdf]+input_text_10_k[end_pdf:]
        variable=re.search('<PDF>', input_text_10_k)
    ######################## End of exhibits removal ##########################

    # Remove Document Header - PART 1
    # This condition should work for all 10-K filings as the tags "<SEC-HEADER>"
    # and "</SEC-HEADER>" are mandatory for all filings.
    variable=re.search('</SEC-HEADER>', input_text_10_k)
    if variable:
        input_text_10_k=input_text_10_k[variable.end():]

    # In some filings, firms do not use line feeds \n but <br> and <div> tags
    # instead to indicate the start and the end of sentences.
    # "This generic element does nothing more than start on a new line of the
    # running text." (translated from the German original)
    # see https://wiki.selfhtml.org/wiki/HTML/Textstrukturierung/div
    # and
    # "The <div> tag defines a division or a section in an HTML document.
    # By default, browsers always place a line break before and after the <div> element."
    # See: https://www.w3schools.com/tags/tag_div.asp
    # It is important to replace </div> and <br> by line feeds because otherwise
    # the entire text will be in a single line and the subsequent commands do
    # not work properly.
    input_text_10_k=input_text_10_k.replace("</div>", "\n")
    input_text_10_k=input_text_10_k.replace("<br>", "\n")

    # Remove html code
    html_text=BeautifulSoup(input_text_10_k, 'html.parser')
    text=html_text.get_text()

    # To get an idea of what the commands below are doing, it is helpful to
    # write the current version of the text to a file and then compare it to the
    # final file.
    filename2=filename.replace('_clean.txt','_without_HtmlTablesExhibits.txt')
    # Open the output file for the text without html code and without tables and exhibits
    output_file_10_k=open(directory+'10-K_Sample/'+cik+'_'+filename2,'w',encoding='ascii',errors='ignore')
    output_file_10_k.write(text)
    output_file_10_k.close()

    # Remove the Document Header - PART II
    # The above command to remove the header ("</SEC-HEADER>") does not capture
    # the entire header -> we need to delete further parts at the top of the filing.
    # WARNING: The filters below may be specific to this sample of 10-Ks.
    # Some firms have line breaks instead of whitespaces -> use "[ \n]" and not just " ".
    variable=re.search('(?i)\n {0,}DOCUMENTS[ \n]INCORPORATED[ \n]BY[ \n]REFERENCE {0,}\n', text)
    if variable:
        text=text[variable.end():]
    else:
        variable=re.search('(?i)\n {0,}table of contents {0,}\n', text)
        if variable:
            text=text[variable.end():]
        else:
            variable=re.search('(?i)\n {0,}Indicate the number of shares outstanding\.{1,}', text)
            if variable:
                text=text[variable.end():]
            else:
                variable=re.search('(?i)may be deemed “forwardlooking statements”\.{1,}', text)
                if variable:
                    text=text[variable.end():]
                else:
                    variable=re.search('\nPART\.{1,}', text)
                    if variable:
                        text=text[variable.end():]

    # Delete Item numbers
    text=re.sub('(?i)Item {1,}[0-9]{1,}(A|B){0,1}(\s|\.|:|\n)','',text)
    # Delete Part numbers
    text=re.sub('(?i)Part (1|2|3|4|III|II|I|IV)','',text)
    # Delete numbers
    text=re.sub('[0-9]{1,}(,[0-9]{3}){0,}(\.[0-9]{1,}){0,1}','',text)
    # File names, e.g. exhibit.pdf or picture.jpg, should be removed
    text=re.sub("[ |\n]\S{1,}\.(pdf|htm|html|doc|jpg|txt|xml)(?=[ \n\.\?!])", "", text)
    # URLs -> remove internet addresses
    text=re.sub("http:/{0,2}", "", text)
    text=re.sub("www\..{1,}\.[a-z]{2,4}(?=[ \n\.\?!])", "", text)
text=re.sub("(?i)(-|\s|\A|,)Inc\.", " Inc", text) text=re.sub("(?i)(-|\s|\A|,)Corp\.", " Corp", text) text=re.sub("(?i)(-|\s|\A|,)Ltd\.", " Ltd", text) text=re.sub("(?i)(-|\s|\A|,)Co\.", " Co", text) text=re.sub("(?i)(-|\s|\A|,)S\.A\.", " SA", text) text=re.sub("(?i)(-|\s|\A|,)U\.S\.", " US", text) text=re.sub("(?i)(-|\s|\A|,)Ms\.", " Ms", text) text=re.sub("(?i)(-|\s|\A|,)Mr\.", " Mr", text) text=re.sub("(?i)(-|\s|\A|,)No\.", " Number", text) text=re.sub("(?i)(-|\s|\A|,)v\.s\.", " vs", text) text=re.sub("(?i)(-|\s|\A|,)St\.", " ", text) text=re.sub("(?i)(-|\s|\A|,)Jr\.", " ", text) text=re.sub("(?i)(\s|\A|,)Jan\.", " January", text) text=re.sub("(?i)(\s|\A|,)Feb\.", " February", text) text=re.sub("(?i)(\s|\A|,)Mar\.", " March", text) text=re.sub("(?i)(\s|\A|,)Apr\.", " April", text) text=re.sub("(?i)(\s|\A|,)May\.", " May", text) text=re.sub("(?i)(\s|\A|,)Jun\.", " June", text) text=re.sub("(?i)(\s|\A|,)Jul\.", " July", text) text=re.sub("(?i)(\s|\A|,)Aug\.", " August", text) text=re.sub("(?i)(\s|\A|,)Sep\.", " September", text) text=re.sub("(?i)(\s|\A|,)Oct\.", " October", text) text=re.sub("(?i)(\s|\A|,)Nov\.", " November", text) text=re.sub("(?i)(\s|\A|,)Dec\.", " December", text) # The sequence capital letter -> dot -> capital letter -> dot indicates an abbreviation # three repitions of capital letter and dot are also common in filings # we need to check for three instances first. text=re.sub("( |\n|,)[A-Z]\.[A-Z]\.[A-Z]\.", " ", text) # now check for two instances text=re.sub("( |\n|,)[A-Z]\.[A-Z]\.", " ", text) # Dots after a single letter can indicate a middle Name Paul J. Smith # or an abbreviation --> also delete these. text=re.sub("( |\n|,)[A-Z]\.", "", text) # Hyphens can be used to indicate that the word is continued in the next # line. For example, "Micro-\nsoft" (\n is the line feed). # Replace hyphens followed by a line feed by a hyphen without line feed text=re.sub('-\n','-',text) # Delete the minus/hyphens # "Short-term" -> "shortterm" text=re.sub('-','',text) # --> Only relevant for determining the number of sentences # Delete dots and commas that are not part of sentences, i.e. commas and dots # that are preceded by whitespace or line break and that are followed by # whitespace or line break. text=re.sub('\n(\.|,)\n','\n',text) text=re.sub(' (\.|,) ',' ',text) # Delete single character words # One can argue whether one should implement this procedure. Loughran and # McDonald argue in one of their papers in favor of it. # To make sure that there is just one letter, we require that there is a word # boundary (\W) before and after. We use a positive backward looking and a # positive forward looking condition for this to assure that the word boundary # get not deleted as well. text=re.sub('(?i)(?<=\W)[a-z](?=\W)',' ',text) # There are sentences that are in upper case letters. However, these are not # "real" sentences. Examples: "RESTRICTIONS ON TRANSFER OF NOTE." 
# or "THIS NOTE AND THE RIGHTS AND OBLIGATIONS EVIDENCED HEREBY ARE # SUBORDINATED TO THE PRIOR PAYMENT OF CERTAIN OBLIGATIONS [...]" # We save the edited text in a new variable text_edited=text # Split text in sentences list_sentences=re.split('\.|!|\?', text) # iterate the list of all sentences for j in range(0,len(list_sentences)): # Determine the number of upper case letters upper_letters=len(re.findall('[A-Z]',list_sentences[j])) # Determine the number of all letters total_letters=len(re.findall('[A-Za-z]',list_sentences[j])) # If there is at least one letter calculate the fraction of upper case letters if total_letters>0: ratio=upper_letters/total_letters # If the fraction of upper case letters is larger than 0.9 delete # the sentence from the text. if ratio>0.9: text_edited=text_edited.replace(list_sentences[j]+'.','') text_edited=text_edited.replace(list_sentences[j]+'!','') text_edited=text_edited.replace(list_sentences[j]+'?','') # --> Only relevant for determining the number of sentences # There are a few cases where a dot follows a dot or where a linefeed # separates two dots. --> delete the second dot. text_edited=text_edited.replace('..','.') text_edited=text_edited.replace('.\n.','.') # The following commands do not influence the subsequent textual analysis. # The only purpose is to display the output in a nicer format. # Replace lines that contain only whitespaces by a line feed. text_edited=re.sub('\n {1,}\n','\n',text_edited) # Replace multiple line feeds by one line feed. text_edited=re.sub('\n{2,}','\n',text_edited) # Open the output file for the pure text output_file_10_k=open(directory+'10-K_Sample/'+cik+'_'+filename,'w',encoding='ascii',errors='ignore') output_file_10_k.write(text_edited) output_file_10_k.close() input_file_10_k.close() input_file.close()