1
0
Fork 0
whu-textual-analysis/lectures/programming/solutions/Problem_4_Application_Regular_Expressions.py
Alexander Hess a37c87d9c8
Add programming files
- add the code files provided by the instructor
- the programming/files folder with the data files is NOT included
  here due to its size
- add a .gitignore file to exclude the data files' folder
2022-08-05 00:06:58 +02:00

144 lines
5.8 KiB
Python

# -*- coding: utf-8 -*-
"""
Created on Tue Apr 12 15:50:22 2016
@author: Alexander Hillert, Goethe University Frankfurt
"""
# Import regular expressions and BeautifulSoup
import re
from bs4 import BeautifulSoup
# Please adjust the directory to your machine.
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# Open the document
input_file=open(directory+'Exercise_4_Application_Regular_Expressions.txt','r',encoding="utf-8")
input_text=input_file.read()
#######################
# Task 1: remove tables
#######################
# Approach
# We search for tables until we find no more html tags that indicate the
# beginning of a table.
# Search for the start html-tag <TABLE>
table_match=re.search('<TABLE>', input_text)
print("This is the result of the re.search command:")
print(table_match)
while table_match:
# When we have identified a match, i.e. the start of a table, we save
# the position of the beginning of the table in the variable "start_table"
table_start_match=re.search('<TABLE>', input_text)
start_table=table_start_match.start()
# Next, we search for the corresponding html tag that indicates the end of
# the table and save the end position to the variable "end_table"
table_end_match=re.search('</TABLE>', input_text)
end_table=table_end_match.end()
# We can print the text between the start and end html tag to check whether
# the table has been identified correctly.
print("The text below is a table!\n"+input_text[start_table:end_table]+"\n")
# the text between the beginning and end of the html tags is the part which
# we would like to delete.
# Consequently, we keep the text before the beginning of the table as well
# as the text after the ending of the table.
input_text=input_text[:start_table]+input_text[end_table:]
# Next, we need to check whether there is another table in the rest of the
# text.
table_match=re.search('<TABLE>', input_text)
# As long as "table_match" exists, i.e. we regex result in a match, the loop
# will continue.
#########################
# Task 2: remove Exhibits
#########################
# Exhibits have the following structure
# <DOCUMENT>
# <TYPE>EX...
# ...
# </DOCUMENT>
exhibit_match=re.search('<TYPE>EX', input_text)
while exhibit_match:
exhibit_start_match=re.search('<TYPE>EX', input_text)
start_exhibit=exhibit_start_match.start()
# As the exhibits are at the end of the 10-K filing it would not be
# necessary to include an end position. We could also drop the entire text
# after "<TYPE>EX"
# It is important that we search for the </DOCUMENT> only after the exhibit
# started. Otherwise, we could get the end of the main document.
exhibit_end_match=re.search('</DOCUMENT>', input_text[start_exhibit:])
end_exhibit=start_exhibit+exhibit_end_match.end()
# Print the identified text to check whether the exhibit has be identified
# correctly
print("The text below is an exhibit!\n"+input_text[start_exhibit:end_exhibit]+"\n")
input_text=input_text[:start_exhibit]+input_text[end_exhibit:]
# Check whether there are further exhibits
exhibit_match=re.search('<TYPE>EX', input_text)
##########################
# Task 3: remove html code
##########################
# Alternative 1: remove html code without Beautiful Soup
text=re.sub('<[^>]{1,}>', '', input_text)
# This regex searches for a "<" followed by at least one character that must not
# equal > and is completed by >.
# You might have thought about using the following command
#text=re.sub('<.{1,}>', '', input_text)
# However, this command has a problem, as it would delete the following line
# entirely: <page> This is some text that should remain <page>
# The .{1,} would match 'page> This is some text that should remain <page', as
# regex are greedy. The [^>]{1,} avoids this problem by not allowing to match >
# Consequently, in the example only the two "<page>" would be deleted.
# You can verify this by using regex101.com (remember to check "Python" in the
# left menu of the webpage)
# Alternative 2: remove html code using Beautiful Soup
html_text=BeautifulSoup(input_text, 'html.parser')
text=html_text.get_text()
########################
# Task 4: delete numbers
########################
# Alternative 1 - removing numbers step by step
# remove commas in numbers, e.g., 1,000 or 12,345,678 or 123,456,789,123,123
text=re.sub('[0-9]{1,3},([0-9]{3},){0,}[0-9]{3}','',text)
# remove dots in numbers, e.g., 34.56 or 12,345.678 (-> previous command leaves .678)
text=re.sub('[0-9]{0,}\.[0-9]{1,}','',text)
# remove the remaining numbers without commas and dots
text=re.sub('[0-9]','',text)
# Alternative 2 - removing numbers using a single regex
text=re.sub('[0-9]{1,}(,[0-9]{3}){0,}(\.[0-9]{1,}){0,1}','',text)
# Alternative 3 - removing numbers step by step but start with commas and dots
# 1. remove comma incl. the surrounding numbers
text=re.sub("[0-9],[0-9]","",text)
# 2. remove dots incl. the surrounding numbers
text=re.sub("[0-9]\.[0-9]","",text)
# 3. remove any remaining number
text=re.sub("[0-9]","",text)
########################
# Task 5: delete symbols
########################
# When analyzing tone, symbols do not matter, as they are not considered to be
# words and thus do not biased the total word count.
# However, for training purposes this task is included in the problem.
# There is no well defined list of which symbols should be deleted. So, you
# can add further symbols.
text=re.sub('\(|\)|\[|\]|\$|§|%|\*|/|·|-',' ',text)
text=re.sub('[^a-zA-Z \.,\!\?\n]','',text)
# Open the output file for the pure text
output_file=open(directory+'Exercise_4_Application_Regular_Expressions_clean.txt','w',encoding="utf-8")
output_file.write(text)
input_file.close()
output_file.close()
print("DONE")