Alexander Hess
a37c87d9c8
- add the code files provided by the instructor - the programming/files folder with the data files is NOT included here due to its size - add a .gitignore file to exclude the data files' folder
144 lines
5.8 KiB
Python
144 lines
5.8 KiB
Python
# -*- coding: utf-8 -*-
|
|
"""
|
|
Created on Tue Apr 12 15:50:22 2016
|
|
|
|
@author: Alexander Hillert, Goethe University Frankfurt
|
|
"""
|
|
|
|
# Import regular expressions and BeautifulSoup
|
|
import re
|
|
from bs4 import BeautifulSoup
|
|
|
|
# Please adjust the directory to your machine.
directory="C:/Lehre/Textual Analysis/Programming/Files/"

# Read the whole document into memory.
# The "with" statement closes the file as soon as the text has been read,
# even if reading raises an exception. The name "input_file" stays bound
# afterwards; calling close() on an already closed file is harmless.
with open(directory+'Exercise_4_Application_Regular_Expressions.txt','r',encoding="utf-8") as input_file:
    input_text=input_file.read()
|
|
|
|
#######################
|
|
# Task 1: remove tables
|
|
#######################
|
|
# Approach
|
|
# We search for tables until we find no more html tags that indicate the
|
|
# beginning of a table.
|
|
# Compile the two tag patterns once; they are reused in every loop iteration.
table_open_pattern=re.compile('<TABLE>')
table_close_pattern=re.compile('</TABLE>')

# Search for the start html-tag <TABLE>
table_match=table_open_pattern.search(input_text)
print("This is the result of the re.search command:")
print(table_match)
while table_match:
    # When we have identified a match, i.e. the start of a table, we save
    # the position of the beginning of the table in the variable "start_table"
    start_table=table_match.start()
    # Next, we search for the corresponding html tag that indicates the end of
    # the table. It is important that the search starts at the opening tag:
    # a closing tag located before "start_table" (e.g. left over from a
    # nested table) would give an end position smaller than the start
    # position, so the slicing below would duplicate text instead of removing
    # it and the loop would never terminate.
    table_end_match=table_close_pattern.search(input_text, start_table)
    if table_end_match is None:
        # An opening tag without a closing tag: we cannot tell where the
        # table ends, so we leave the remaining text untouched and stop.
        print("Found a <TABLE> tag without a closing </TABLE> tag; stopping the table removal.\n")
        break
    # Match positions are relative to the full text, so "end_table" can be
    # used for slicing directly.
    end_table=table_end_match.end()

    # We can print the text between the start and end html tag to check whether
    # the table has been identified correctly.
    print("The text below is a table!\n"+input_text[start_table:end_table]+"\n")

    # The text between the beginning and end of the html tags is the part which
    # we would like to delete.
    # Consequently, we keep the text before the beginning of the table as well
    # as the text after the ending of the table.
    input_text=input_text[:start_table]+input_text[end_table:]
    # Next, we need to check whether there is another table in the rest of the
    # text.
    table_match=table_open_pattern.search(input_text)
    # As long as "table_match" exists, i.e. the regex results in a match, the
    # loop will continue.
|
|
|
|
#########################
|
|
# Task 2: remove Exhibits
|
|
#########################
|
|
# Exhibits have the following structure
|
|
# <DOCUMENT>
|
|
# <TYPE>EX...
|
|
# ...
|
|
# </DOCUMENT>
|
|
# Compile the two patterns once; they are reused in every loop iteration.
exhibit_open_pattern=re.compile('<TYPE>EX')
document_close_pattern=re.compile('</DOCUMENT>')

exhibit_match=exhibit_open_pattern.search(input_text)
while exhibit_match:
    # The match found by the previous search already marks the start of the
    # exhibit, so there is no need to search for it a second time.
    start_exhibit=exhibit_match.start()
    # It is important that we search for the </DOCUMENT> only after the exhibit
    # started. Otherwise, we could get the end of the main document.
    # Match positions are relative to the full text.
    exhibit_end_match=document_close_pattern.search(input_text, start_exhibit)
    if exhibit_end_match:
        end_exhibit=exhibit_end_match.end()
    else:
        # As the exhibits are at the end of the 10-K filing it would not be
        # necessary to include an end position. If the closing tag is
        # missing, we therefore drop the entire text after "<TYPE>EX".
        end_exhibit=len(input_text)
    # Print the identified text to check whether the exhibit has been
    # identified correctly
    print("The text below is an exhibit!\n"+input_text[start_exhibit:end_exhibit]+"\n")

    # Keep the text before and after the exhibit.
    input_text=input_text[:start_exhibit]+input_text[end_exhibit:]
    # Check whether there are further exhibits
    exhibit_match=exhibit_open_pattern.search(input_text)
|
|
|
|
##########################
|
|
# Task 3: remove html code
|
|
##########################
|
|
# Alternative 1: remove html code without Beautiful Soup
# The pattern matches a "<", followed by one or more characters that are not
# ">", followed by the closing ">". Every such tag is replaced by nothing.
html_tag_pattern=re.compile('<[^>]{1,}>')
text=html_tag_pattern.sub('', input_text)
# A tempting but flawed variant is the pattern '<.{1,}>'.
# Regex quantifiers are greedy, so for the line
#     <page> This is some text that should remain <page>
# the .{1,} would match 'page> This is some text that should remain <page'
# and the whole line would be deleted. Because [^>]{1,} is not allowed to
# match ">", only the two "<page>" tags are deleted in this example.
# You can verify this by using regex101.com (remember to check "Python" in the
# left menu of the webpage)

# Alternative 2: remove html code using Beautiful Soup
# Note that this assignment overwrites the result of Alternative 1; both
# alternatives start from "input_text".
html_text=BeautifulSoup(input_text, 'html.parser')
text=html_text.get_text()
|
|
|
|
########################
|
|
# Task 4: delete numbers
|
|
########################
|
|
# All patterns below are written as raw strings (r'...'). In a normal string
# literal "\." is an invalid escape sequence, for which Python emits a warning;
# the raw-string form produces exactly the same pattern without the warning.

# Alternative 1 - removing numbers step by step
# remove numbers containing commas, e.g., 1,000 or 12,345,678 or 123,456,789,123,123
text=re.sub(r'[0-9]{1,3},([0-9]{3},){0,}[0-9]{3}','',text)
# remove numbers containing dots, e.g., 34.56 or 12,345.678 (-> previous command leaves .678)
text=re.sub(r'[0-9]{0,}\.[0-9]{1,}','',text)
# remove the remaining numbers without commas and dots
text=re.sub(r'[0-9]','',text)

# NOTE: after Alternative 1 no digit 0-9 is left in "text". Every pattern in
# Alternatives 2 and 3 requires at least one digit, so when the alternatives
# are run one after another, as here, the commands below no longer change the
# text. They are kept to show the other ways of solving the task.

# Alternative 2 - removing numbers using a single regex
text=re.sub(r'[0-9]{1,}(,[0-9]{3}){0,}(\.[0-9]{1,}){0,1}','',text)

# Alternative 3 - removing numbers step by step but start with commas and dots
# 1. remove comma incl. the surrounding numbers
text=re.sub(r"[0-9],[0-9]","",text)
# 2. remove dots incl. the surrounding numbers
text=re.sub(r"[0-9]\.[0-9]","",text)
# 3. remove any remaining number
text=re.sub(r"[0-9]","",text)
|
|
|
|
|
|
########################
|
|
# Task 5: delete symbols
|
|
########################
|
|
# When analyzing tone, symbols do not matter, as they are not considered to be
# words and thus do not bias the total word count.
# However, for training purposes this task is included in the problem.
# There is no well defined list of which symbols should be deleted. So, you
# can add further symbols.
# Both patterns are written as raw strings (r'...'): sequences such as "\("
# or "\$" are invalid escape sequences in a normal string literal and trigger
# a warning. The raw-string form matches exactly the same characters.
# Step 1: replace the listed symbols by a blank so that the words on both
# sides of a symbol stay separated.
text=re.sub(r'\(|\)|\[|\]|\$|§|%|\*|/|·|-',' ',text)
# Step 2: delete every character that is not a letter a-z/A-Z, a blank,
# a dot, a comma, an exclamation mark, a question mark or a line break.
text=re.sub(r'[^a-zA-Z \.,\!\?\n]','',text)
|
|
|
|
# Write the pure text to the output file.
# The "with" statement flushes and closes the file even if writing raises an
# exception. The name "output_file" stays bound afterwards.
with open(directory+'Exercise_4_Application_Regular_Expressions_clean.txt','w',encoding="utf-8") as output_file:
    output_file.write(text)

# Close the input file; calling close() on an already closed file is harmless.
input_file.close()

print("DONE")
|
|
|