- add the code files provided by the instructor - the programming/files folder with the data files is NOT included here due to its size - add a .gitignore file to exclude the data files' folder
144 lines
5.8 KiB
144 lines
5.8 KiB
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 12 15:50:22 2016
@author: Alexander Hillert, Goethe University Frankfurt
"""
# Import regular expressions and BeautifulSoup
import re
from bs4 import BeautifulSoup
# Folder that contains the data files; please adjust it to your machine.
directory = "C:/Lehre/Textual Analysis/Programming/Files/"
# Open the document
# NOTE(review): no statement that opens the document or assigns "input_text" is
# visible here, although the tasks below read "input_text" - confirm that the
# document is read into "input_text" at this point.
# Task 1: remove tables
# Approach
# We search for tables until we find no more html tags that indicate the
# beginning of a table.
# Search for the start html-tag <TABLE>
# Remove every <TABLE>...</TABLE> section from "input_text".
# The loop deletes one table per iteration and stops as soon as no further
# opening tag is found (or an opening tag has no closing tag).
table_match=re.search('<TABLE>', input_text)
print("This is the result of the re.search command:")
print(table_match)
while table_match:
    # When we have identified a match, i.e. the start of a table, we save
    # the position of the beginning of the table in the variable "start_table"
    table_start_match=re.search('<TABLE>', input_text)
    start_table=table_start_match.start()
    # Next, we search for the corresponding html tag that indicates the end of
    # the table. We search only in the text after the start of the table, so
    # that a stray </TABLE> earlier in the document cannot be picked up.
    table_end_match=re.search('</TABLE>', input_text[start_table:])
    if table_end_match is None:
        # Without a closing tag nothing can be deleted; stop here, as the
        # same opening tag would otherwise be found again forever.
        print("Found <TABLE> without a matching </TABLE>; stopping the table removal.")
        break
    # The match position is relative to the slice that starts at
    # "start_table", so we add "start_table" to obtain the position in
    # "input_text" and save it to the variable "end_table".
    end_table=start_table+table_end_match.end()
    # We can print the text between the start and end html tag to check whether
    # the table has been identified correctly.
    print("The text below is a table!\n"+input_text[start_table:end_table]+"\n")
    # the text between the beginning and end of the html tags is the part which
    # we would like to delete.
    # Consequently, we keep the text before the beginning of the table as well
    # as the text after the ending of the table.
    input_text=input_text[:start_table]+input_text[end_table:]
    # Next, we need to check whether there is another table in the rest of the
    # text.
    table_match=re.search('<TABLE>', input_text)
# As long as "table_match" exists, i.e. the regex results in a match, the loop
# will continue.
# Task 2: remove Exhibits
# Exhibits have the following structure
# <TYPE>EX...
# ...
# Remove every exhibit, i.e. the text from "<TYPE>EX" up to and including the
# next "</DOCUMENT>", from "input_text". One exhibit is deleted per iteration.
exhibit_match=re.search('<TYPE>EX', input_text)
while exhibit_match:
    # Save the position where the exhibit begins in "start_exhibit".
    exhibit_start_match=re.search('<TYPE>EX', input_text)
    start_exhibit=exhibit_start_match.start()
    # As the exhibits are at the end of the 10-K filing it would not be
    # necessary to include an end position. We could also drop the entire text
    # after "<TYPE>EX"
    # It is important that we search for the </DOCUMENT> only after the exhibit
    # started. Otherwise, we could get the end of the main document.
    exhibit_end_match=re.search('</DOCUMENT>', input_text[start_exhibit:])
    if exhibit_end_match:
        # The match position is relative to the slice that starts at
        # "start_exhibit"; add "start_exhibit" to get the position in
        # "input_text".
        end_exhibit=start_exhibit+exhibit_end_match.end()
    else:
        # No closing tag: treat the exhibit as running to the end of the
        # text, i.e. drop everything after "<TYPE>EX".
        end_exhibit=len(input_text)
    # Print the identified text to check whether the exhibit has been identified
    # correctly
    print("The text below is an exhibit!\n"+input_text[start_exhibit:end_exhibit]+"\n")
    # Keep the text before and after the exhibit; without this step the same
    # exhibit would be found again and the loop would never end.
    input_text=input_text[:start_exhibit]+input_text[end_exhibit:]
    # Check whether there are further exhibits
    exhibit_match=re.search('<TYPE>EX', input_text)
# Task 3: remove html code
# Alternative 1: remove html code without Beautiful Soup
# Delete every html tag; the cleaned result is stored in "text", while
# "input_text" itself is left unchanged.
text = re.sub(pattern='<[^>]{1,}>', repl='', string=input_text)
# This regex searches for a "<" followed by at least one character that must not
# equal > and is completed by >.
# You might have thought about using the following command
#text=re.sub('<.{1,}>', '', input_text)
# However, this command has a problem, as it would delete the following line
# entirely: <page> This is some text that should remain <page>
# The .{1,} would match 'page> This is some text that should remain <page', as
# regex quantifiers are greedy. The [^>]{1,} avoids this problem by never matching a >.
# Consequently, in the example only the two "<page>" would be deleted.
# You can verify this by using regex101.com (remember to check "Python" in the
# left menu of the webpage)
# Alternative 2: remove html code using Beautiful Soup
# Parse the document with the 'html.parser' parser and keep the parsed object
# in "html_text".
# NOTE(review): the visible code never extracts the plain text from
# "html_text" (presumably via html_text.get_text()) - confirm.
html_text = BeautifulSoup(markup=input_text, features='html.parser')
# Task 4: delete numbers
# Alternative 1 - removing numbers step by step
# remove commas in numbers, e.g., 1,000 or 12,345,678 or 123,456,789,123,123
# remove dots in numbers, e.g., 34.56 or 12,345.678 (-> previous command leaves .678)
# remove the remaining numbers without commas and dots
# Alternative 2 - removing numbers using a single regex
# Alternative 3 - removing numbers step by step but start with commas and dots
# 1. remove comma incl. the surrounding numbers
# 2. remove dots incl. the surrounding numbers
# 3. remove any remaining number
# Task 5: delete symbols
# When analyzing tone, symbols do not matter, as they are not considered to be
# words and thus do not bias the total word count.
# However, for training purposes this task is included in the problem.
# There is no well defined list of which symbols should be deleted. So, you
# can add further symbols.
# Delete symbols from "text" in two steps.
# The patterns are written as raw strings: sequences such as "\(" or "\." are
# not valid escape sequences in a normal Python string literal and trigger a
# warning on current Python versions. The regex itself is unchanged.
# Step 1: replace each of the listed symbols by a blank.
text=re.sub(r'\(|\)|\[|\]|\$|§|%|\*|/|·|-',' ',text)
# Step 2: delete every character that is not a letter (a-z, A-Z), a blank,
# a dot, a comma, an exclamation mark, a question mark or a line break.
text=re.sub(r'[^a-zA-Z \.,\!\?\n]','',text)
# Open the output file for the pure text