
Add programming files

- add the code files provided by the instructor
- the programming/files folder with the data files is NOT included
  here due to its size
- add a .gitignore file to exclude the data files' folder
Alexander Hess 2022-08-05 00:05:05 +02:00
commit a37c87d9c8
Signed by: alexander
GPG key ID: 344EA5AB10D868E0
38 changed files with 6416 additions and 0 deletions


@@ -0,0 +1,270 @@
# -*- coding: utf-8 -*-
"""
Created on Tue Jul 11 09:19:54 2017
@author: Alexander Hillert, Goethe University Frankfurt
This version: February 22, 2019
This is an introduction to two data containers: lists and counters.
Python has several built-in data containers, e.g., sets, dictionaries, and lists
In addition to these containers, there are further types.
For textual analysis application counters are helpful.
This introduction covers lists in the first part.
The second part introduces the basics of counters.
"""
# for counters, you need to import collections
import collections
import re
###############################################################################
# Introduction on data containers
###############################################################################
#################################
# Part 1: lists
#################################
# Create an empty list
empty_list=[]
# Create non-empty lists
string_list=["a", "b", "c"]
mixed_list=[1, "ab", -4,"hello"]
print(mixed_list)
# Call items of a list
print(string_list[0])
print(string_list[2])
print(string_list[-1])
# Length of a list
length=len(string_list)
print("The length of the list is: "+str(length))
# ADD ITEMS TO A LIST
# ALTERNATIVE 1: insert -> you can specify the position
string_list.insert(1,"d")
# you cannot add multiple elements with the insert command
# You can try, but it will not work
# 1st try
string_list.insert(3,"e" "f") # -> the new element is "ef"
print(string_list)
# 2nd try
try:
string_list.insert(3,"e", "f")
except:
print("Wrong syntax. If the command were executed without the try-except "\
"you would get the error 'TypeError: insert() takes exactly 2 arguments (3 given)'")
# 3rd try
string_list.insert(3, ["e", "f"])
# check length
print("The length of the list is: "+str(len(string_list))) # -> only 6 and not 7
print(string_list[3])
# So element 3 of the list is another list
# You can call the elements of the sub-list
print("First element of sub list: "+string_list[3][0]+" and second element of \
sub list: "+string_list[3][1])
# Reset string_list to keep things easily tractable
string_list=["a", "b", "c"]
# ALTERNATIVE 2: append -> items are added at the end
string_list.append("d")
# Try to add multiple items
# 1st try
string_list.append("e" "f") # -> the new element is "ef"
print(string_list)
# 2nd try
try:
string_list.append("e", "f")
except:
print("Wrong syntax. If the command were executed without the try-except "\
"you would get the error 'TypeError: append() takes exactly one argument (2 given)'")
# 3rd try
string_list.append(["e", "f"])
# check length
print("length of list is "+str(len(string_list))) # -> only 6 and not 7
print(string_list[len(string_list)-1])
# -> element 3 of the list is another list
# You can call the elements of the sub-list
print("First element of sub list: "+string_list[len(string_list)-1][0]+" and \
second element of sub list: "+string_list[len(string_list)-1][1])
# Reset string_list to keep things easily tractable
string_list=["a", "b", "c"]
# ALTERNATIVE 3: extend -> items are added at the end
string_list.extend("d")
# Try to add multiple items
# 1st try
string_list.extend("e" "f") # -> Two elements are created -> works!!!
print(string_list)
# 2nd try
try:
string_list.extend("e", "f")
except:
print("Wrong syntax. If the command were executed without the try-except "\
"you would get the error 'TypeError: extend() takes exactly one argument (2 given)'")
# 3rd try
string_list.extend(["e", "f"])
print(string_list) # -> also works!!!
# check length
print("length of list is "+str(len(string_list))) # -> it is 8 and should be 8
# DELETE ITEMS FROM A LIST
string_list.remove("a")
print("List after deletion of 'a' "+str(string_list))
# What happens if an element occurs multiple times
string_list.remove("e")
print("List after further deletion of 'e' "+str(string_list))
# --> only the first occurrence of "e" is deleted
# FURTHER OPERATIONS WITH LISTS
# Accessing parts of a list
# Remember the first element is [0]! And the upper bound of the range is not
# included, i.e. [0:3] means [0], [1] and [2].
print("Sublist from beginning to third element: "+str(string_list[0:3]))
print("Sublist from beginning to third element: "+str(string_list[:3]))
print("Sublist from second(!) to third element: "+str(string_list[1:3]))
print("Sublist from fourth(!) to fifth element: "+str(string_list[3:5]))
print("Sublist from fifth(!) to the end: "+str(string_list[4:]))
# Search in lists
position=string_list.index("b")
print("Position of 'b' is: "+str(position))
# Searching for an element that is not part of the list
try:
string_list.index("a")
except:
print("Error message. If the command were executed without the try-except "\
"you would get the error 'ValueError: 'a' is not in list'")
if "c" in string_list:
print("'c' is at position: "+str(string_list.index("c")))
# Sort list
string_list.sort()
print('Sorted list: '+str(string_list))
string_list.sort(reverse=True)
print('Reversely sorted list: '+str(string_list))
# What happens when sorting mixed (i.e. integers and strings) lists?
try:
mixed_list.sort()
except:
print("Error message. If the command were executed without the try-except "\
"you would get the error 'TypeError: unorderable types: str() < int()'")
#################################
# Part 2: counters
#################################
'''
A Counter is a dictionary subclass for counting hashable objects.
It is a collection where elements are stored as dictionary keys and
their counts are stored as dictionary values.
'''
# Creating a counter
counter_obj=collections.Counter(["a", "b", "c", "d", "a", "b", "a"])
print('The counter object is: '+str(counter_obj))
# The previous command is equivalent to
counter_obj=collections.Counter(a=3, b=2, c=1, d=1)
print('The counter object (2nd command) is: '+str(counter_obj))
# Add objects to a counter
counter_obj.update(["e", "f", "e"])
print('The updated counter object is: '+str(counter_obj))
# Alternative: assign a count directly
# (this sets the count of "g" to 4; it does not add to an existing count)
counter_obj["g"]=4
print('The counter object after the second update is: '+str(counter_obj))
# Length of the counter
length=len(counter_obj)
print('The length of the counter is: '+str(length))
# Loop over the elements of the counter and their frequency
i=1
for element in counter_obj:
print("Element "+str(i)+" of the counter: "+str(element))
print("Frequency of Element "+str(i)+" of the counter: "+str(counter_obj[element]))
i=i+1
# .elements() provides an iterator of all individual elements of the counter
counter_elements=list(counter_obj.elements())
print('Elements of the counter: '+str(counter_elements))
# APPLY COUNTERS TO TEXTS
sentence1="This is the first sentence."
sentence2="This is the second sentence, which is longer."
# Split sentences in words
sentence1_words=re.split("\W{1,}", sentence1)
print("The last element is: "+str(sentence1_words[len(sentence1_words)-1]))
# The last element is empty -> delete it.
sentence1_words.remove("")
print("The last element is: "+str(sentence1_words[len(sentence1_words)-1]))
# -> now okay
sentence2_words=re.split("\W{1,}", sentence2)
print("The last element is: "+str(sentence2_words[len(sentence2_words)-1]))
# The last element is empty -> delete it.
sentence2_words.remove("")
print("The last element is: "+str(sentence2_words[len(sentence2_words)-1]))
# -> now okay
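# A more compact alternative (a sketch, not part of the original example):
# a list comprehension drops the empty strings produced by the split in one step.
# The raw-string prefix r"..." avoids escape-sequence warnings in newer Python versions.
sentence1_words_alt=[word for word in re.split(r"\W{1,}", sentence1) if word!=""]
sentence2_words_alt=[word for word in re.split(r"\W{1,}", sentence2) if word!=""]
print("Alternative word lists: "+str(sentence1_words_alt)+" | "+str(sentence2_words_alt))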
# Count the words
sentence1_counter=collections.Counter(sentence1_words)
sentence2_counter=collections.Counter(sentence2_words)
print(sentence1_counter)
print(sentence2_counter)
# OPERATIONS WITH COUNTERS
# add counters
add_counters=sentence1_counter+sentence2_counter
print("You can add counters: "+str(add_counters))
# subtract counters
subtract_counters=sentence1_counter-sentence2_counter
print("You can subtract counters: "+str(subtract_counters))
# Each time a new Counter is produced through an operation, any items with zero
# or negative counts are discarded. --> only the word 'first' remains in subtract_counters
# Intersection of counters
intersection_counters=sentence1_counter & sentence2_counter
print("You can determine the intersection of counters: "+str(intersection_counters))
# -> takes the minimum of occurrences; again elements with zero frequency
# are not included.
# Union of counters
union_counters=sentence1_counter | sentence2_counter
print("You can determine the union of counters: "+str(union_counters))
# -> takes the maximum of occurrences
# MOST FREQUENT WORDS
# Determine the three most frequent words in the add_counters set.
top_3_words=add_counters.most_common(3)
print("The top 3 words are: "+str(top_3_words))
# Identify the two most frequent words using the top 4 words from add_counters.
top_4_words=add_counters.most_common(4)
# The first [] refers to the rank, i.e. whether it is the most frequent, second
# most frequent, etc. word.
# The second[] refers either to the word itself [0] or to the frequency of the word [1].
# the most frequent word
top_word=top_4_words[0][0]
top_word_count=top_4_words[0][1]
print("The top word is '"+str(top_word)+"', which appears "+str(top_word_count)+" times")
# the second most frequent word
top_2_word=top_4_words[1][0]
top_2_word_count=top_4_words[1][1]
print("The second most frequent word is '"+str(top_2_word)+"', which appears "+str(top_2_word_count)+" times")
print("Completed")


@@ -0,0 +1,447 @@
# -*- coding: utf-8 -*-
"""
INTRODUCTION TO REGULAR EXPRESSIONS
@author: Alexander Hillert, Goethe University Frankfurt
This version: June 3, 2019
What are regular expressions?
Regular expressions allow you to search for general patterns in texts. The
standard string commands like .count("search_term") and .replace("old_word","new_word")
can only count and replace one specific word, respectively. They cannot search
for general patterns like all words that consist of three or more letters.
Assume that you want to identify all numbers in a text or that you search for
the year of birth in bios of corporate executives. In the examples, you need a
search tool that can process broad patterns --> you need regular expressions.
Consider the second example, i.e. you would like to automatically identify
people's year of birth from their bios. You know that the number must have four
digits and that the first two digits must equal 19. Of course, you could
hardcode all possible years (1900, 1901, ..., 1999), but this is unnecessarily
complicated and slows down the program. Therefore, it is better to learn
how to use regex.
Useful online resources:
1. https://regex101.com/
On this webpage, you can enter a text and a regular expression.
The webpage highlights the matches and provides explanations for
every part of the regex pattern.
Caution: click on "Python" in the left menu (the default language is php)!
2. https://docs.python.org/3/library/re.html
The official documentation of regular expressions in Python 3.
"""
# To be able to use regular expressions you need to import the re package first.
import re
# Select the directory where you saved the accompanying txt-file.
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# In this introduction, we use the accompanying txt-file "Text_Introduction_Regular_Expressions.txt"
# open the file
text_file=open(directory+'Text_Introduction_Regular_Expressions.txt','r',encoding='cp1252')
# read its content
text=text_file.read()
# Let's start with the example from the beginning and search for people's years of birth.
# The standard search command for regular expressions is re.search. It searches
# for the FIRST match of the expression in the text.
# First try
match=re.search("19[0-9]{2}",text)
# This command searches for four digits of which the first is a 1, the second a 9,
# and then there are two further digits which can be any digits.
# [0-9] refers to any digit. Equivalently, you can write \d which also refers
# to any digits.
# The {2} specifies that there must be exactly two digits.
print(match)
# match contains information on the match:
# span is the position in text where the match starts and ends; here 226 and 230
# furthermore, the matched text is shown. Here, the first match is 1956.
# You can use the positions to print the text before the match, after the match,
# and, of course, of the matched text.
start=match.start()
end=match.end()
print("From beginning of the document to the match: \n"+text[:start]+"\n\n")
print("The match itself: \n"+text[start:end]+"\n\n")
print("From end of match to end of document: \n"+text[end:]+"\n\n")
# To access the match, you can also use the command .group(0):
print("Alternative way to access the matched text: \n"+match.group(0)+"\n\n")
# CAUTION
# If no match is found, re.search returns None.
# Example: search for a ten-digit number that starts with 19
match=re.search("19[0-9]{8}",text)
# The command start=match.start() would then return the following error:
# "AttributeError: 'NoneType' object has no attribute 'start'"
# SOLUTION
match=re.search("19[0-9]{8}",text)
if match:
# match found, the start .start() is now conditional on the existence of match
start=match.start()
print("Match found. Starting at position "+str(start))
else:
# no match found
print("No match found")
'''
Information on Syntax, Special Characters in Regular Expression
Character Meaning
[] Indicates a set of characters
\[ Matches the actual [
\] Matches the actual ]
^ Negation when used inside []: the symbols listed afterwards are not allowed
in the match. E.g., [^0-9] matches any symbol that is not a digit.
(Outside of [], ^ anchors the match at the start of a string/line.)
\d Any digit, i.e. 0, 1, 2, ..., 9. Equivalent to [0-9]
\n Linefeed/newline, the start of a new line.
\s Any whitespace, i.e. a tab, a space.
CAUTION: \s matches also the newline (\n). This property of \s
can lead to unintended matches.
RECOMMENDATION: to match whitespaces only use [ \t], i.e. a space
and a tab (\t).
\S Any non-whitespace symbol.
. Any character (digit, letter, symbol [!,?,%,etc.], spaces) but
NOT the newline, \n.
\. Matches the actual dot.
\w Matches word characters, i.e. [0-9a-zA-Z_]
The underscore (_) is defined to be a word character.
\W Matches any non-word characters, i.e. [^0-9a-zA-Z_]
| Or condition (for an example see line 272)
() Like in math: parentheses indicate which characters of an expression
belong together. (For an example see line 272.)
\( Matches the actual (
\) Matches the actual )
(?i) Performs the regex case-insensitively. Must be put at the beginning
of the regex. E.g. re.search("(?i)TeSt",text) will match
TEST, test, Test, etc.
re.IGNORECASE Performs the regex case-insensitively. Must be passed as an
additional argument to the re function, e.g. re.search("test",text,re.IGNORECASE)
'''
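# A few quick illustrations of the special characters above (a short sketch added
# for clarity; the example strings are made up and not taken from the txt-file).
print(re.search(r"\d{2}", "Room 42b"))           # \d matches digits -> '42'
print(re.search(r"\w{1,}", "_var1 rest"))        # \w matches [0-9a-zA-Z_] -> '_var1'
print(re.search(r"[^0-9 ]{1,}", "12 34 abc"))    # ^ inside [] negates -> 'abc'
print(re.search("(?i)python", "PYTHON, php, R")) # (?i) ignores case -> 'PYTHON'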
# Examples of character sets
# 1. [0-9]: numbers
match=re.search("[0-9]","ABC abc 123")
print(match)
#2. [a-z]: any lower case letter
match=re.search("[a-z]","ABC abc 123")
print(match)
#3. [A-Z]: any upper case letter
match=re.search("[A-Z]","ABC abc 123")
print(match)
#4. [cde]: lower case letters c, d, and e.
match=re.search("[cde]","ABC abc 123")
print(match)
#5. [^A-Zab]: all symbols except capital letters and a and b.
match=re.search("[^A-Zab]","ABC abc 123")
print(match)
# you don't see any character because the match is the first white space before abc
'''
Quantifiers for regular expression:
n and m refer to non-negative integers (0, 1, 2, ...), where m>n
Quantifier Meaning
{n} The preceding pattern must be found EXACTLY n times.
{n,} The preceding pattern must be found AT LEAST n times.
{,n} The preceding pattern must be found AT MOST n times.
{n,m} The preceding pattern must be found AT LEAST n but AT MOST m times.
{n,}? The ? tells the regex not to be "greedy" (see lines 211 for details)
There are alternative notations for commonly used quantifiers:
* is equivalent to {0,}, i.e. 0 or more repetitions of the preceding pattern.
+ is equivalent to {1,}, i.e. 1 or more repetitions of the preceding pattern.
? is equivalent to {0,1}, i.e. 0 or 1 repetition of the preceding pattern.
'''
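# A short illustration of the quantifier shorthands (a sketch added for clarity;
# the example string is made up).
quantifier_example="costs: 1, 19, 1995"
print(re.search(r"1[0-9]*", quantifier_example))    # * -> zero or more digits: '1'
print(re.search(r"1[0-9]+", quantifier_example))    # + -> one or more digits: '19'
print(re.search(r"19[0-9]?", quantifier_example))   # ? -> zero or one digit: '19'
print(re.search(r"19[0-9]{2}", quantifier_example)) # {2} -> exactly two digits: '1995'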
# re.search() returns only the first match: How to get all matches?
# Alternative 1: use a loop.
text1=text
i=1
match=re.search("19[0-9]{2}",text1)
# Repeat the following commands until no more matches are found.
while match:
print("This is match number "+str(i)+": "+match.group(0))
# Check whether there are further matches after the end of the previous match
end=match.end()
text1=text1[end:]
match=re.search("19[0-9]{2}",text1)
i=i+1
# Alternative 2: use re.findall
# The syntax is identical to re.search
list_of_matches=re.findall("19[0-9]{2}",text)
print(list_of_matches)
# the individual matches can be called by list_of_matches[i], where i ranges
# from zero to the number of matches minus one.
# Remember: the first element of a list has the position 0
for i in range(0,len(list_of_matches)):
print("This is match number "+str(i+1)+" using the re.findall command: "+list_of_matches[i])
# When you read the text you will observe that there are only six years of birth
# in the text and not eight -> there are two mismatches -> adjust filter to
# get only the years of birth and not all years.
text1=text
i=1
# Check whether the word born appears before the year. The distance between
# born and the year must be smaller than or equal to 15 characters (plus the two white spaces).
match=re.search("born .{,15} 19[0-9]{2}",text1)
while match:
print("This is match number "+str(i)+": "+match.group(0))
# Extract the year
match1=re.search("19[0-9]{2}",match.group(0))
print("The year of match number "+str(i)+" is: "+match1.group(0))
# Check whether there are further matches after the end of the previous match
end=match.end()
text1=text1[end:]
match=re.search("born .{,15} 19[0-9]{2}",text1)
i=i+1
# The quantifiers introduced above are "greedy". For example, if a pattern matches overlapping
# text parts of different length, the regex will return the longest match.
# Example: search for the first sentence in a text. You know that sentences
# end with period in this example.
text2="This is the first sentence. This is the second sentence. And so on"
# Search for a positive number of occurrences of characters followed by a period.
# Remember that the dot is \. in regex. The . will match any character.
match=re.search(".{1,}\.",text2)
print(match.group(0))
# -> the regex returns the first and second sentence.
# To get the shortest match that fulfils the regex, put a ? after the quantifier.
# This makes the quantifier "non-greedy": here, only the first sentence is matched.
match=re.search(".{1,}?\.",text2)
print(match.group(0))
# You will often have situations where there are multiple versions of the same
# pattern. How can you include all of them in one regular expression?
# Example 1: search for the word "losses" in the following sentence:
text3="X Corp's soda division returned significant losses in the last quarter. Losses will be reduced this quarter."
# the first letter of "loss" can be upper or lower case
print("Example 1: Loss and loss")
text4=text3
i=1
# A set of characters [] is matched if at least one of the components of the
# set is found in the text. This works only for a single letter/number/symbol
# but not for sequences of multiple letters/numbers/symbols.
match=re.search("[Ll]oss",text3)
while match:
end=match.end()
print("This is match number "+str(i)+": "+match.group(0))
# Check whether there are further matches after the end of the previous match
text4=text4[end:]
match=re.search("[Ll]oss",text4)
i=i+1
# Alternatively
list_of_matches=re.findall("[Ll]oss",text3)
print("Alternative using re.findall: "+str(list_of_matches))
# In this example, you could also simply perform a case-insensitive match.
print("Case-INsensitive matching using re.IGNORECASE")
text4=text3
i=1
match=re.search("loss",text3,re.IGNORECASE)
while match:
end=match.end()
print("This is match number "+str(i)+": "+match.group(0))
# Check whether there are further matches after the end of the previous match
text4=text4[end:]
match=re.search("loss",text4,re.IGNORECASE)
i=i+1
# Or equivalently
print("Case-INsensitive matching using (?i)")
text4=text3
i=1
match=re.search("(?i)loss",text3)
while match:
end=match.end()
print("This is match number "+str(i)+": "+match.group(0))
# Check whether there are further matches after the end of the previous match
text4=text4[end:]
match=re.search("(?i)loss",text4)
i=i+1
# Example 2: search for the expressions "profits declined" and "profits decreased"
# in the following sentence:
text3="X Corp's profits declined in 2010, while Y Inc.'s profits decreased the year before."
# Here, [] no longer works because we need to match terms consisting of several
# characters and [] matches only one character. -> use the OR-operator |
print("Example 2: profits declined and profits decreased - First try")
text4=text3
i=1
match=re.search("profits declined|decreased",text3)
while match:
print("This is match number "+str(i)+": "+match.group(0))
# Check whether there are further matches after the end of the previous match
end=match.end()
text4=text4[end:]
match=re.search("profits declined|decreased",text4)
i=i+1
# Problem: regex interprets the entire set of characters before the | as one
# alternative.
# Solution: use parentheses to define the boundaries.
print("Example 2: profits declined and profits decreased - Second try")
text4=text3
i=1
match=re.search("profits (declined|decreased)",text3)
while match:
print("This is match number "+str(i)+": "+match.group(0))
# Check whether there are further matches after the end of the previous match
end=match.end()
text4=text4[end:]
match=re.search("profits (declined|decreased)",text4)
i=i+1
# Alternative: does re.findall work?
list_of_matches=re.findall("profits (declined|decreased)",text3)
print(list_of_matches)
# -> No! Because there is a major difference between re.search and re.findall
# in the way they treat parentheses ().
# re.search follows the general regular expression syntax that is also used in
# other programming languages.
# To use re.findall you have to write down the full text before and after the |.
list_of_matches=re.findall("profits declined|profits decreased",text3)
print(list_of_matches)
# More information on the difference between re.search and re.findall
# Example 3: let's search for the numbers in the second part of the txt file
# and compare what the two commands do.
# Get the second part
match=re.search("Here are some numbers:",text)
text4=text[match.end():]
print(text4)
match=re.search("[0-9]{1,}([0-9]{3}|,){0,}\.{0,1}[0-9]{0,}",text4)
# What are the individual parts of this pattern?
# [0-9]{1,} There has to be at least one digit.
# ([0-9]{3}|,){0,} The first digit can be followed by combinations of three
# digits and commas (as thousand separator).
# \.{0,1} There can be zero or one period as decimal separator.
# [0-9]{0,} There can be multiple decimal places.
i=1
while match:
print("This is match number "+str(i)+": "+match.group(0))
# Check whether there are further matches after the end of the previous match
end=match.end()
text4=text4[end:]
match=re.search("[0-9]{1,}([0-9]{3}|,){0,}\.{0,1}[0-9]{0,}",text4)
i=i+1
# Can we obtain the same result by using re.findall?
match=re.search("Here are some numbers:",text)
text4=text[match.end():]
list_of_matches=re.findall("[0-9]{1,}([0-9]{3}|,){0,}\.{0,1}[0-9]{0,}",text4)
print(list_of_matches)
# Does not work!
# One has to put "?:" in the part that captures the repetition of the thousands.
# This tells re.findall to return the full match and not subpatterns.
list_of_matches=re.findall("[0-9]{1,}(?:[0-9]{3}|,){0,}\.{0,1}[0-9]{0,}",text4)
print(list_of_matches)
# TAKE AWAY: The output of re.findall does not always correspond to that of re.search.
# Be careful when using re.findall!!!
# How to delete or substitute parts of texts?
# Alternative 1: identify the beginning and end of the matched text part and
# remove it from the overall text.
# Example delete all numbers in the text
text4=text
print("Original Text:\n"+text4)
match=re.search("[0-9]{1,}(,[0-9]{3}){0,}(\.[0-9]{1,}){0,1}",text4)
while match:
# Remove the match
text4=text4[:match.start()]+text4[match.end():]
# Check whether there are further matches in the remaining text
match=re.search("[0-9]{1,}(,[0-9]{3}){0,}(\.[0-9]{1,}){0,1}",text4)
print("Text without numbers using re.search:\n"+text4)
# Alternative 2: use re.sub (sub -> substitute)
# syntax: new_text=re.sub(pattern, replacement, old_text)
# replacement is some string. Regular expressions are only allowed in the pattern
# but not in the replacement.
text4=text
text4=re.sub("[0-9]{1,}(,[0-9]{3}){0,}(\.[0-9]{1,}){0,1}","",text4)
print("Text without numbers using re.sub:\n"+text4)
# re.sub is the more efficient way.
# Furthermore, re.sub can not only delete text but also replace text.
# Example
text4=text
text4=re.sub("[0-9]{1,}(,[0-9]{3}){0,}(\.[0-9]{1,}){0,1}","NUMBER",text4)
print("Text where numbers are replaced by the word 'NUMBER':\n"+text4)
# Make sure you get the right match --> importance of word boundaries.
# When you search for a word it can happen that the word is part of a different
# longer word. For example, searching for "high" would also match "highlight".
# To avoid such mismatches you can either include word boundaries in the search
# (Alternative 1) or split the text first by word boundaries into single words
# and perform standard string search operations afterwards (Alternative 2).
# Alternative 2 does not return the individual matches but tells you for example
# the number of matches
# Example: search for the word "is"
# Alternative 1:
match=re.search("is",text)
print("Searching without word boundaries yields: '"+match.group(0)+\
"' But the surrounding text is: '"+text[match.start()-1:match.end()+1]+"'")
match=re.search("\Wis\W",text)
print("Searching with word boundaries yields: '"+match.group(0)+\
"' and the surrounding text is: '"+text[match.start()-1:match.end()+1]+"'")
# You see that the preceding and subsequent word boundaries are also matched
# and saved as the matched term. However, often you want the match to include only
# the actual word without its boundaries.
# Solution: use so called "look ahead" and "look back" conditions.
'''
Look ahead and look behind/back conditions
Regex requires that the parts of the pattern that are classified as look ahead
or look back/behind are present in the text but does not include them in the match.
Syntax:
positive look ahead: (?=) Example: X(?=\W) requires that there is a word
boundary after X
negative look ahead: (?!) Example: X(?!\W) requires that there must NOT
be a word boundary after X.
positive look back: (?<=) Example: (?<=\W)X requires that there is a word
boundary before X
negative look back: (?<!) Example: (?<!\W)X requires that there must NOT
be a word boundary before X.
'''
match=re.search("(?<=\W)is(?=\W)",text)
print("Searching with word boundaries as look ahead and look back condition yields: '" #
+match.group(0)+"' and the surrounding text is: '"+text[match.start()-1:match.end()+1]+"'")
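# The negative look ahead/behind conditions are not used in this script; a minimal
# sketch (the example string is made up): match "high" only when it is NOT
# followed by "light", i.e. skip the word "highlight".
match_negative=re.search("high(?!light)", "The highlight was the high return.")
print("The negative look ahead match starts at position: "+str(match_negative.start()))
# -> the match is the "high" in "high return", not the one in "highlight".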
# Does it also work with re.findall?
list_of_matches=re.findall("\Wis\W",text)
print("Word boundaries using re.findall: "+str(list_of_matches))
list_of_matches=re.findall("(?<=\W)is(?=\W)",text)
print("Word boundaries as look ahead and look back condition using re.findall: "+str(list_of_matches))
print("In total there are "+str(len(list_of_matches))+" matches.")
# --> Yes, the approach also works with re.findall.
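# Side note (not part of the original approach): Python's re also provides the
# dedicated word-boundary anchor \b, which matches the empty position between a
# word character and a non-word character without consuming any text.
list_of_matches_b=re.findall(r"\bis\b", text)
print("Using the \\b anchor, re.findall returns "+str(len(list_of_matches_b))+" matches.")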
# Alternative 2:
# Use re.split(), which is similar to split() but more powerful.
text_split=re.split("\W",text)
print(text_split)
# Problem: there are elements in the list that are not words, e.g. ''. These
# elements are created because there can be a series of non-word characters (\W),
# e.g. ' (' in 'Balmer (born'.
# Solution: treat a series of non-word characters (\W) as a single split pattern
text_split=re.split("\W{1,}",text)
print(text_split)
# Now, you do not need to include word boundaries and can use standard string
# operations.
number_matches=text_split.count("is")
print("Using standard string operations, we get "+str(number_matches)+" matches.")
# -> same result.


@@ -0,0 +1,485 @@
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 21 09:38:32 2022
@author: Alexander Hillert, Goethe University Frankfurt
"""
'''
This script introduces you to linear models using the sklearn package.
Besides sklearn, we will use pandas to work with data sets as well as
numpy to perform computations.
The introduction consists of 10 parts:
1. linear regressions using a toy data set
2. linear regressions using a "real" data set
3. linear regressions using standardized variables
4. Ridge regression basics
5. Ridge regression with training, tuning, and testing sample
6. Ridge regression with cross-validation
7. LASSO regression basics
8. LASSO regression with training, tuning, and testing sample
9. LASSO regression with cross-validation
10. Compare the results from Ridge and LASSO
'''
import pandas as pd
import numpy as np
# For OLS regressions
from sklearn.linear_model import LinearRegression
# for Ridge regressions
from sklearn.linear_model import Ridge
# for computing mean squared errors
from sklearn.metrics import mean_squared_error
# for plotting the MSEs for different levels of Lambda
import matplotlib.pyplot as plot
# for Ridge regressions with cross-validation
from sklearn.linear_model import RidgeCV
# for LASSO regressions
from sklearn.linear_model import Lasso
# for LASSO regressions with cross-validation
from sklearn.linear_model import LassoCV
# adjust the directory to your folder!!!
directory="C:/Lehre/Machine Learning/Data/"
############################################################
# Part 1. Basics: linear regressions in Python using sklearn
############################################################
print("\nPart 1: Run an OLS regression on a sandbox data set\n")
# create a random number from a normal distribution with mean 0 and standard deviation 1.
random_number=np.random.normal(0, 1)
print("A random number is: "+str(random_number))
# you can also create a vector or matrix of random variables
# the parameter size=(# of rows, # of columns) specifies the number of rows and columns
# For example, a (10,1) vector
random_number_vector=np.random.normal(0, 1, size=(10,1))
print("The vector of random numbers is:")
print(random_number_vector)
# create the independent variable x as a vector of random numbers
x_vector=np.random.normal(0, 1, size=(10,1))
print("The vector of the independent variable x is:")
print(x_vector)
# create the dependent variable y as
# y = 2x + epsilon, where epsilon is the random error term from above
y_vector=np.dot(x_vector,2) + random_number_vector
print("The vector of the dependent variable y is:")
print(y_vector)
# perform a standard OLS regression with intercept.
# The command takes x (independent variable(s)) first and then y (dependent variable)
# Note that the default is that the intercept is included. So, strictly speaking,
# the (fit_intercept=True) option is not needed.
regression_1=LinearRegression(fit_intercept=True).fit(x_vector, y_vector)
# display the intercept and the beta coefficient on x
print("The intercept is: "+str(regression_1.intercept_))
# to get it as a scalar/number not an array, use
regression_1.intercept_[0]
print("The coefficient on x is: "+str(regression_1.coef_))
# to get it as a scalar/number not an array, use
regression_1.coef_[0][0]
# R2 of the regression
print("The R2 is: "+str(regression_1.score(x_vector, y_vector)))
###############################################################
# Part 2: linear regression using a "real" data set
###############################################################
print("\nPart 2: Run an OLS regression with a real data set\n")
# import the data for this problem
# The data set consists of 200 independent variables (x1 to x200) and
# a dependent variable (y).
# There are 1,200 observations in total. In the later parts, we will
# use the first 1,000 observations for training and the last 200 for testing.
# The data are simulated using the following process:
# y = 0.5*x1 + 0.5*x2 + ... + 0.5*x100 + random error (mean 0, std. dev. 4)
# The x101 to x200 are not directly related to y but are correlated with
# the x1 to x100. More specifically,
# x101 = 0.7*x1 + random error (mean 0, std. dev. 1)
# x102 = 0.7*x2 + random error (mean 0, std. dev. 1)
# x200 = 0.7*x100 + random error (mean 0, std. dev. 1)
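# Purely for illustration, a short sketch of how data with the structure described
# above could be simulated (this is NOT the code that generated the actual csv file,
# which was provided by the instructor):
# x_simulated=np.random.normal(0, 1, size=(1200,100))
# x_correlated=0.7*x_simulated+np.random.normal(0, 1, size=(1200,100))
# y_simulated=0.5*np.sum(x_simulated, axis=1)+np.random.normal(0, 4, size=1200)
# simulated_frame=pd.DataFrame(np.column_stack((x_simulated, x_correlated, y_simulated)))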
data_frame=pd.read_csv(directory+"regression_data_scikit.csv",sep=";")
# to get any idea about the data, display the first five data points
data_frame.head(5)
# split the data frame into the independent and dependent variables
# the independent variables(x1 to x200) are columns 1 to 200
x_variables=data_frame.values[:,:-1]
# the dependent variable (y) is column 201
y_variable=data_frame.values[:,-1:]
# run a standard OLS regression
regression_OLS=LinearRegression(fit_intercept=True).fit(x_variables, y_variable)
# You can double check the results by rerunning the regression in Stata or R.
# display the intercept and the beta coefficients on x1 and x51
print("The intercept is: "+str(regression_OLS.intercept_[0]))
print("The coefficient on x_1 is: "+str(regression_OLS.coef_[0][0]))
print("The coefficient on x_51 is: "+str(regression_OLS.coef_[0][50]))
# R2 of the regression
print("The R2 is: "+str(regression_OLS.score(x_variables, y_variable)))
##################################################################
# Part 3: standardize the data to have mean zero and unit variance
# and rerun the regression
##################################################################
print("\nPart 3a.: Standardize variables\n")
# standardize x and y to have mean zero and unit variance
# axis=0 (axis=1) means that the computation is executed column (row) wise
x_variables_mean=np.mean(x_variables,axis=0)
# ddof=1 means that we use n-1 to compute the standard deviation
x_variables_standard_deviation=np.std(x_variables, axis=0, ddof=1)
x_variables_standardized=(x_variables-x_variables_mean)/x_variables_standard_deviation
# do the same exercise for y
y_variable_mean=np.mean(y_variable,axis=0)
y_variable_standard_deviation=np.std(y_variable, axis=0, ddof=1)
y_variable_standardized=(y_variable-y_variable_mean)/y_variable_standard_deviation
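# An equivalent route (a sketch, not required for the rest of the script):
# sklearn's StandardScaler also centers and scales each column. Note that it
# uses the population standard deviation (ddof=0), so its output differs
# marginally from the ddof=1 standardization above.
from sklearn.preprocessing import StandardScaler
x_variables_scaled=StandardScaler().fit_transform(x_variables)
print("First value, manual vs. StandardScaler: "+str(x_variables_standardized[0][0])
      +" vs. "+str(x_variables_scaled[0][0]))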
# rerun the regression using standardized data
regression_OLS_standardized=LinearRegression(fit_intercept=True).fit(x_variables_standardized, y_variable_standardized)
# results are identical to a regression in Stata with beta coefficients.
# display the intercept and the beta coefficients on x_1 and x_51
print("The intercept is: "+str(regression_OLS_standardized.intercept_[0]))
print("The coefficient on x_1 is: "+str(regression_OLS_standardized.coef_[0][0]))
print("The coefficient on x_51 is: "+str(regression_OLS_standardized.coef_[0][50]))
# R2 of the regression
print("The R2 is: "+str(regression_OLS_standardized.score(x_variables_standardized, y_variable_standardized)))
# The R2 is identical to the one from Part 2 -> good!
#######################################################################################
# CAUTION: be careful using the "normalize=True" option in the LinearRegression module!
#######################################################################################
print("\nPart 3b.: Regression with 'normalization'\n")
# The normalize=True option is NOT the same as standardizing: it rescales each
# feature column by its L2 norm (after subtracting the column mean) instead of
# by its standard deviation.
# Note: this option was deprecated in scikit-learn 1.0 and removed in 1.2, so
# the following line only runs with older versions of sklearn.
regression_OLS_normalized=LinearRegression(fit_intercept=True,normalize=True).fit(x_variables, y_variable)
# display the intercept and the beta coefficient on x_1 and x_51
print("The intercept is: "+str(regression_OLS_normalized.intercept_[0]))
print("The coefficient on x_1 is: "+str(regression_OLS_normalized.coef_[0][0]))
print("The coefficient on x_51 is: "+str(regression_OLS_normalized.coef_[0][50]))
# The coefficients are different from the ones above highlighting that the
# "normalize=True" option does not do the same as "normal" standardizing
# R2 of the regression
print("The R2 is: "+str(regression_OLS_normalized.score(x_variables, y_variable)))
#######################################################################
# Part 4: Ridge regression on the full sample (no training and testing)
# This part is to learn the syntax.
# We are using the standardized variables to have the same penalty
# for a given effect of x on y.
# Remember: if the independent variables are measured on very different
# scales, the beta coefficients have different sizes (e.g., market cap in
# thousand USD vs. past stock returns as a decimal number) and, thus,
# the penalty would be applied inconsistently.
#######################################################################
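# For reference, sklearn's Ridge minimizes the penalized sum of squared errors
# ||y - Xb||^2 + alpha*||b||^2 (added note; alpha below plays the role of Lambda).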
print("\nPart 4: Ridge regression - learning the syntax\n")
# the parameter alpha corresponds to the penalty parameter Lambda from
# the notation that is typically used.
# the default is that the intercept is included, so you do not need the
# "intercept=True" parameter. But it is good to keep in mind what
# specification you are using.
regression_Ridge=Ridge(alpha=10,fit_intercept=True).fit(x_variables_standardized, y_variable_standardized)
# display the intercept and the beta coefficient on x1 and x51
print("The intercept is: "+str(regression_Ridge.intercept_[0]))
print("The coefficient on x_1 is: "+str(regression_Ridge.coef_[0][0]))
print("The coefficient on x_51 is: "+str(regression_Ridge.coef_[0][50]))
# R2 of the regression
print("The R2 is: "+str(regression_Ridge.score(x_variables_standardized, y_variable_standardized)))
# How to compute the mean squared error (MSE)?
# 1. get the predicted values
y_variable_standardized_predicted=regression_Ridge.predict(x_variables_standardized)
# 2. determine the MSE
print("The MSE of the prediction is: "+str(mean_squared_error(y_variable_standardized, y_variable_standardized_predicted)))
#######################################################################
# Part 5: Ridge regression using a training, tuning, and testing sample
#######################################################################
print("\nPart 5: Ridge regression - Application with training, tuning, and testing data\n")
# Create a training, tuning, and testing sample
# we split the data into a training, a tuning, and a testing set
# training data are the first 800 rows
# In the brackets, the first range (before the comma) indicates the rows, the second the columns.
x_variables_std_train=x_variables_standardized[:800,:]
y_variable_std_train=y_variable_standardized[:800,:]
# the tuning data are row 801 to 1000 -> 200 observations
x_variables_std_tune=x_variables_standardized[800:1000,:]
y_variable_std_tune=y_variable_standardized[800:1000,:]
# testing data are the last 200 rows
x_variables_std_test=x_variables_standardized[1000:,:]
y_variable_std_test=y_variable_standardized[1000:,:]
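# Side note (a sketch, not the approach used here): sklearn's train_test_split
# could create such splits with a random shuffle instead of the fixed row ranges
# above; it is left commented out so that the fixed split is kept.
# from sklearn.model_selection import train_test_split
# x_train, x_test, y_train, y_test = train_test_split(
#     x_variables_standardized, y_variable_standardized, test_size=0.2, random_state=0)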
##########################
# find the optimal Lambda
##########################
# we store the MSE of the training/tuning data for each Lambda
mse_train_list=[]
mse_tune_list=[]
# Again, Lambda and Alpha refer to the same thing.
alpha_list=[]
# we iterate from 0.1 to 100 increasing Lambda=Alpha by 0.1 in each step.
alpha=0.1
while alpha<100:
# train the model
regression_Ridge_train=Ridge(alpha=alpha,fit_intercept=True).fit(x_variables_std_train, y_variable_std_train)
# add the alpha to the list of alphas
alpha_list.append(alpha)
# predict y in the training sample
y_variable_std_train_predicted=regression_Ridge_train.predict(x_variables_std_train)
# predict y in the tuning sample
y_variable_std_tune_predicted=regression_Ridge_train.predict(x_variables_std_tune)
# compute the MSE in both samples
mse_train=mean_squared_error(y_variable_std_train, y_variable_std_train_predicted)
mse_tune=mean_squared_error(y_variable_std_tune, y_variable_std_tune_predicted)
# append the MSEs to the two lists
mse_train_list.append(mse_train)
mse_tune_list.append(mse_tune)
# continue with the next alpha
alpha=alpha+0.1
########################################
# plot the MSEs for the different alphas
########################################
# MSE in the training sample
plot.scatter(alpha_list, mse_train_list)
plot.show()
# higher Lambda associated with higher MSE
# MSE in the tuning sample
plot.scatter(alpha_list, mse_tune_list)
plot.show()
# there is an optimal alpha with the lowest MSE
######################################
# determine the optimal Lambda
######################################
# what is the smallest MSE?
minimum=min(mse_tune_list)
print("The smallest MSE is "+ str(minimum))
# get the position of the minimum MSE in our list
index_min_MSE=mse_tune_list.index(minimum)
# choose the corresponding alpha
alpha_optimal=alpha_list[index_min_MSE]
print("The optimal alpha is "+str(alpha_optimal))
#############################################################
# What is the out-of-sample performance of the optimal model?
#############################################################
# take the full training data set (1000 observations, i.e., training + tuning set)
x_variables_std_train_total=np.concatenate((x_variables_std_train, x_variables_std_tune), axis=0)
y_variable_std_train_total=np.concatenate((y_variable_std_train, y_variable_std_tune), axis=0)
# train the model with the optimal Lambda on the training and tuning data
regression_Ridge_optimal=Ridge(alpha=alpha_optimal,fit_intercept=True).fit(x_variables_std_train_total, y_variable_std_train_total)
# Mean squared error
# predict y in the full training sample
y_variable_std_train_total_predicted=regression_Ridge_optimal.predict(x_variables_std_train_total)
# predict y in the testing sample
# Remember: we have not used the testing data yet. Firewall principle!!!
y_variable_std_test_predicted=regression_Ridge_optimal.predict(x_variables_std_test)
print("The MSE in the full training data is: "+str(mean_squared_error(y_variable_std_train_total, y_variable_std_train_total_predicted)))
print("The MSE in the testing data is: "+str(mean_squared_error(y_variable_std_test, y_variable_std_test_predicted)))
#############################################################
# Part 6: Ridge regression with k-fold cross-validation
# Implement the cross validation using a package
#############################################################
print("\nPart 6. Ridge regression - Using cross-validation\n")
# the default for cv is the leave-one-out cross-validation
# here we apply five-fold cross-validation
regression_Ridge_cv=RidgeCV(alphas=alpha_list, fit_intercept=True,cv=5).fit(x_variables_std_train_total,y_variable_std_train_total)
# get the optimal lambda
alpha_optimal_cv=regression_Ridge_cv.alpha_
print("The optimal alpha is "+str(alpha_optimal_cv))
# Mean squared error using the cross-validated model
# predict y in the full training sample
y_variable_std_train_total_predicted_cv=regression_Ridge_cv.predict(x_variables_std_train_total)
# predict y in the testing sample
y_variable_std_test_predicted_cv=regression_Ridge_cv.predict(x_variables_std_test)
print("The MSE in the full training data is: "+str(mean_squared_error(y_variable_std_train_total, y_variable_std_train_total_predicted_cv)))
print("The MSE in the testing data is: "+str(mean_squared_error(y_variable_std_test, y_variable_std_test_predicted_cv)))
###########################################
# Part 7: LASSO regression
# on the full sample -> to learn the syntax
###########################################
print("\nPart 7: LASSO regression - learning the syntax\n")
# the parameter alpha corresponds to the penalty parameter Lambda from
# the notation that is typically used.
# the default is that the intercept is included, so you do not need the
# "intercept=True" parameter. But it is good to keep in mind what
# specification you are using.
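# For reference, sklearn's Lasso minimizes
# (1/(2*n_samples))*||y - Xb||^2 + alpha*||b||_1 (added note; the L1 penalty is
# what drives some coefficients exactly to zero).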
regression_Lasso=Lasso(alpha=0.1,fit_intercept=True).fit(x_variables_standardized, y_variable_standardized)
# display the intercept and the beta coefficient on x1 and x51
print("The intercept is: "+str(regression_Lasso.intercept_[0]))
print("The coefficient on x_1 is: "+str(regression_Lasso.coef_[0]))
print("The coefficient on x_51 is: "+str(regression_Lasso.coef_[50]))
# R2 of the regression
print("The R2 is: "+str(regression_Lasso.score(x_variables_standardized, y_variable_standardized)))
# How to compute the mean squared error (MSE)?
# 1. get the predicted values
y_variable_standardized_predicted=regression_Lasso.predict(x_variables_standardized)
# 2. determine the MSE
print("The MSE of the prediction is: "+str(mean_squared_error(y_variable_standardized, y_variable_standardized_predicted)))
####################################################
# Part 8: Create a training, tuning, and testing sample
####################################################
print("\nPart 8: LASSO regression - Application with training, tuning, and testing data\n")
# we use the same training, tuning, and testing data as in part 5.
# -> no need to redefine the data sets.
#################################
# find the optimal Lambda
#################################
# we store the MSE of the training/tuning data for each Lambda
mse_train_list=[]
mse_tune_list=[]
# Again, Lambda and Alpha refer to the same thing.
alpha_list=[]
# we iterate from 0.0001 to 0.25 increasing alpha by 0.0001 in each step.
alpha=0.0001
while alpha<0.25:
# train the model
regression_Lasso_train=Lasso(alpha=alpha,fit_intercept=True).fit(x_variables_std_train, y_variable_std_train)
# add the alpha to the list of alphas
alpha_list.append(alpha)
# predict y in the training sample
y_variable_std_train_predicted=regression_Lasso_train.predict(x_variables_std_train)
# predict y in the tuning sample
y_variable_std_tune_predicted=regression_Lasso_train.predict(x_variables_std_tune)
# compute the MSE in both samples
mse_train=mean_squared_error(y_variable_std_train, y_variable_std_train_predicted)
mse_tune=mean_squared_error(y_variable_std_tune, y_variable_std_tune_predicted)
# append the MSEs to the two lists
mse_train_list.append(mse_train)
mse_tune_list.append(mse_tune)
# continue with the next alpha
alpha=alpha+0.0001
########################################
# plot the MSEs for the different alphas
########################################
# MSE in the training sample
plot.scatter(alpha_list, mse_train_list)
plot.show()
# higher Lambda associated with higher MSE
# MSE in the tuning sample
plot.scatter(alpha_list, mse_tune_list)
plot.show()
# there is an optimal alpha with the lowest MSE
######################################
# determine the optimal Lambda
######################################
# what is the smallest MSE?
minimum=min(mse_tune_list)
print("The smallest MSE is "+ str(minimum))
# get the position of the minimum MSE
index_min_MSE=mse_tune_list.index(minimum)
alpha_optimal=alpha_list[index_min_MSE]
print("The optimal alpha is "+str(alpha_optimal))
#############################################################
# What is the out-of-sample performance of the optimal model?
#############################################################
# take the full training data set (1000 observations; training + tuning)
# use the same variables as in Part 5.
# train the model with the optimal Lambda on the training and tuning data
regression_Lasso_optimal=Lasso(alpha=alpha_optimal,fit_intercept=True).fit(x_variables_std_train_total, y_variable_std_train_total)
# Mean squared error
# predict y in the full training sample
y_variable_std_train_total_predicted=regression_Lasso_optimal.predict(x_variables_std_train_total)
# predict y in the testing sample
y_variable_std_test_predicted=regression_Lasso_optimal.predict(x_variables_std_test)
print("The MSE in the full training data is: "+str(mean_squared_error(y_variable_std_train_total, y_variable_std_train_total_predicted)))
print("The MSE in the testing data is: "+str(mean_squared_error(y_variable_std_test, y_variable_std_test_predicted)))
#############################################################
# Part 9: Implement the cross validation using a package
#############################################################
print("\nPart 9: LASSO regression - Using cross-validation\n")
# the default for cv in LassoCV is the 5-fold cross-validation
regression_Lasso_cv=LassoCV(alphas=alpha_list, fit_intercept=True,cv=5).fit(x_variables_std_train_total,y_variable_std_train_total)
# get the optimal lambda
alpha_optimal_cv=regression_Lasso_cv.alpha_
print("The optimal alpha is "+str(alpha_optimal_cv))
# Mean squared error using the cross-validated model
# predict y in the full training sample
y_variable_std_train_total_predicted_cv=regression_Lasso_cv.predict(x_variables_std_train_total)
# predict y in the testing sample
y_variable_std_test_predicted_cv=regression_Lasso_cv.predict(x_variables_std_test)
print("The MSE in the full training data is: "+str(mean_squared_error(y_variable_std_train_total, y_variable_std_train_total_predicted_cv)))
print("The MSE in the testing data is: "+str(mean_squared_error(y_variable_std_test, y_variable_std_test_predicted_cv)))
#####################################################################
# Part 10: Compare the betas from the Ridge and the LASSO regressions
#####################################################################
print("\nPart 10: Comparison of Ridge and LASSO coefficients\n")
# To see to what extent the results of Ridge and LASSO are similar, we
# write the coefficients from the cross-validation tasks (Parts 6 and 9)
# to a csv file.
output_file=open(directory+"comparison_coefficients_Ridge_LASSO.csv","w",encoding="utf-8")
output_file.write("index;coefficient_Ridge;coefficient_LASSO\n")
# get the list of coefficients
for i in range(0,200):
output_file.write(str(i)+';'+str(regression_Ridge_cv.coef_[0][i])+';'+str(regression_Lasso_cv.coef_[i])+'\n')
output_file.close()
print("Completed!")


@@ -0,0 +1,52 @@
# -*- coding: utf-8 -*-
"""
Created on Sat Jul 17 17:09:50 2021
@author: ahillert
"""
from nltk.tokenize import sent_tokenize
print("\nExample 1\n")
text_1="The S&P 500 rose 43.44 points to 4,159.12. The Dow Jones industrial average " \
+"added 188.11 points, or 0.6 percent, to 34,084.15. The tech-heavy Nasdaq fared " \
+"better than the rest of the market, climbing 236 points, or 1.8 percent, to 13,535.74"
sentence_list_1=sent_tokenize(text_1)
for i in range(0,len(sentence_list_1)):
print("This is sentence "+str(i+1)+":\n"+sentence_list_1[i])
# -> good performance
print("\nExample 2\n")
text_2=text_1.lower()
sentence_list_2=sent_tokenize(text_2)
for i in range(0,len(sentence_list_2)):
print("This is sentence "+str(i+1)+":\n"+sentence_list_2[i])
# -> poor performance
# For the NLTK tokenizer it makes a difference whether text is lower or upper case.
print("\nExample 3\n")
text_3="On Sept. 16, 2020, the U.S. president appointed John D. Smith as head of the F. B. I. " \
+"While Jane C. Taylor became the president of the S. E. C. " \
+"On Jan. 5, 2020, J. C. Penny filed for bankruptcy. Michael T. Brown - reporting from Washington D.C."
sentence_list_3=sent_tokenize(text_3)
for i in range(0,len(sentence_list_3)):
print("This is sentence "+str(i+1)+":\n"+sentence_list_3[i])
# -> good performance
print("\nExample 4\n")
text_4=text_3.lower()
sentence_list_4=sent_tokenize(text_4)
for i in range(0,len(sentence_list_4)):
print("This is sentence "+str(i+1)+":\n"+sentence_list_4[i])


@@ -0,0 +1,137 @@
# -*- coding: utf-8 -*-
"""
Created on Tue Jul 11 17:43:45 2017
@author: Alexander Hillert, Goethe University Frankfurt
"""
# import modules
# if you need to download the nltk packages 'punkt' and 'stopwords' you can use
# the following three commands:
#import nltk
#nltk.download('punkt')
#nltk.download('stopwords')
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
################
# 1. Tokenize
################
# Create a test text to see how well nltk.tokenize performs
test_text="Microsoft Corp. announced they would acquire Yahoo! for $3.4 to prevent Google Inc. \
from taking over Software Ltd. headquartered in St. Louis. XYZ S.A. is located in the \
U.S. and run by Dr. John P. Smith, who likes short-term risk-based calculations."
# Tokenize sentences
sentence_list=sent_tokenize(test_text)
print("This is the list of sentences:")
print(sentence_list)
# looks good. Only the split after "Yahoo!" is incorrect. The tool correctly
# recognizes "Dr.", "Inc.", "St.", etc. -> good performance
# Tokenize words
word_list=word_tokenize(test_text)
print("This is the list of words:")
print(word_list)
print(len(word_list))
# --> word_tokenize also includes symbols and numbers as words.
# How to delete the elements that are not real words?
word_list_1=[]
for word in word_list:
if re.search('[A-Za-z]',word):
word_list_1.append(word)
print("This is the edited list of words. There should be only 'real' words:")
print(word_list_1)
print(len(word_list_1))
# Alternative
test_text1=re.sub('[^A-Za-z\s\n]','',test_text)
word_list_2=word_tokenize(test_text1)
print("This is the edited list of words. There should be only 'real' words:")
print(word_list_2)
print(len(word_list_2))
################
# 2. Stop Words
################
example_sentence = "This is an example showing off stop word filtering."
stop_words=set(stopwords.words("english"))
print("This is the list of stop words from NLTK:")
print(stop_words)
# --> the stop words are all lower case
print(len(stop_words))
# Split example sentence into words
word_list_example=word_tokenize(example_sentence.lower())
# Create list for filtered words
word_list_filtered=[]
# filter out stop words
for word in word_list_example:
if word not in stop_words:
word_list_filtered.append(word)
print("Example sentence after stop words have been deleted:")
print(word_list_filtered)
# What does the example from above look like?
test_text_filtered=[]
# filter out stop words
for word in word_tokenize(test_text.lower()):
if word not in stop_words:
test_text_filtered.append(word)
print("Test text after stop words have been deleted:")
print(test_text_filtered)
################
# 3. Stemming
################
# create a stemmer object with the short name ps
ps=PorterStemmer()
example_words_1=["play", "player", "players", "played", "playing"]
for word in example_words_1:
print(ps.stem(word))
# the full syntax without the abbreviation would be:
print(PorterStemmer().stem(word))
# adjectives and adverbs
example_words_2=["high", "higher", "highest", "highly", "height"]
for word in example_words_2:
print(ps.stem(word))
# --> comparative and superlative are not reduced to the stem/regular adjective
# neither are adverbs
# Let's see how the stemmer deals with irregular words.
example_words_3=["good", "better", "best", "well", "God", "Goodness"]
for word in example_words_3:
print(ps.stem(word))
# --> upper case words are also transformed to lower case.
# Stem the test text from above
# Approach 1: stem word by word
test_text_stemmed=[]
for word in word_tokenize(test_text):
test_text_stemmed.append(ps.stem(word))
print("Stemming word by word: test text after it has been stemmed:")
print(test_text_stemmed)
# Alternative approach: stem entire text
test_text_stemmed=ps.stem(test_text)
print("Stemming entire document: test text after it has been stemmed:")
print(test_text_stemmed)
# -> does not work
print("End of nltk introduction!")


@@ -0,0 +1,19 @@
This is the text for the introduction to regular expressions.
In the first example, we search for the year of birth of current and former CEOs.
These are sentences that I made up:
Microsoft's former CEO Steve Balmer (born in 1956) graduated from Harvard in 1977.
Michael Dell was born in 1965 in Houston and founded Dell Inc in 1984.
Walmart is currently run by Doug McMillon, who was born in 1966.
The following three examples are taken from the Wikipedia pages of the three people.
Steven Anthony "Steve" Ballmer (born March 24, 1956) is an American chief executive who is the former chief executive officer of Microsoft from January 2000 to February 2014, and is the current owner of the Los Angeles Clippers. Source: https://en.wikipedia.org/wiki/Steve_Ballmer, June 22, 2017.
Michael Saul Dell (born February 23, 1965) is an American business magnate, investor, philanthropist, and author. He is the founder and CEO of Dell Technologies, one of the worlds leading providers of information technology infrastructure solutions. Source: https://en.wikipedia.org/wiki/Michael_Dell, June 22, 2017.
Carl Douglas "Doug" McMillon (born October 17, 1966) is an American businessman and is the president and chief executive officer (CEO) of Wal-Mart Stores, Inc. Source: https://en.wikipedia.org/wiki/Doug_McMillon, June 22, 2017.
Here are some numbers:
1,234,567
8,901
34
56.82
539,234,353.41

File diff suppressed because it is too large.