Add programming files
- add the code files provided by the instructor - the programming/files folder with the data files is NOT included here due to its size - add a .gitignore file to exclude the data files' folder
This commit is contained in:
parent
65aae9d4f9
commit
a37c87d9c8
38 changed files with 6416 additions and 0 deletions
|
|
@ -0,0 +1,270 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Tue Jul 11 09:19:54 2017
|
||||
|
||||
@author: Alexander Hillert, Goethe University Frankfurt
|
||||
This version: February 22, 2019
|
||||
|
||||
This is an introduction to two data containers: lists and counters.
|
||||
|
||||
Python has several built-in data containers, e.g., sets, dictionaries, and lists
|
||||
In addition to these containers, there are further types.
|
||||
For textual analysis applications, counters are particularly helpful.
|
||||
|
||||
This introduction covers lists in the first part.
|
||||
The second part introduces the basics of counters.
|
||||
"""
|
||||
|
||||
# for counters, you need to import collections
|
||||
import collections
|
||||
import re
|
||||
|
||||
###############################################################################
|
||||
# Introduction on data containers
|
||||
###############################################################################
|
||||
|
||||
#################################
|
||||
# Part 1: lists
|
||||
#################################
|
||||
# Create an empty list
empty_list=[]

# Create non-empty lists
string_list=["a", "b", "c"]
mixed_list=[1, "ab", -4,"hello"]

print(mixed_list)

# Call items of a list
# Remember: the first element has index 0; index -1 refers to the last element.
print(string_list[0])
print(string_list[2])
print(string_list[-1])

# Length of a list
length=len(string_list)
print("The length of the list is: "+str(length))


# ADD ITEMS TO A LIST
# ALTERNATIVE 1: insert -> you can specify the position
string_list.insert(1,"d")
# You cannot add multiple elements with a single insert command.
# You can try, but it will not work.
# 1st try
# Adjacent string literals are merged by Python before insert() is called.
string_list.insert(3,"e" "f") # -> the new element is "ef"
print(string_list)
# 2nd try
try:
    string_list.insert(3,"e", "f")
except TypeError as error:
    # Only the expected TypeError is caught. The text of the message is taken
    # from the exception itself so that it matches your Python version.
    print("Wrong syntax. If the command were executed without the try-except "
          "you would get the error 'TypeError: "+str(error)+"'")
# 3rd try
string_list.insert(3, ["e", "f"])
# check length
print("The length of the list is: "+str(len(string_list))) # -> only 6 and not 7
print(string_list[3])
# So element 3 of the list is another list
# You can call the elements of the sub-list
print("First element of sub list: "+string_list[3][0]+" and second element of "
      "sub list: "+string_list[3][1])
|
||||
|
||||
# Reset string_list to keep things easily tractable
string_list=["a", "b", "c"]

# ALTERNATIVE 2: append -> items are added at the end
string_list.append("d")

# Try to add multiple items
# 1st try
# Adjacent string literals are merged by Python before append() is called.
string_list.append("e" "f") # -> the new element is "ef"
print(string_list)
# 2nd try
try:
    string_list.append("e", "f")
except TypeError as error:
    # Only the expected TypeError is caught. The text of the message is taken
    # from the exception itself so that it matches your Python version.
    print("Wrong syntax. If the command were executed without the try-except "
          "you would get the error 'TypeError: "+str(error)+"'")
# 3rd try
string_list.append(["e", "f"])
# check length
print("length of list is "+str(len(string_list))) # -> only 6 and not 7
# The index -1 refers to the last element of the list.
print(string_list[-1])
# -> the last element of the list is another list
# You can call the elements of the sub-list
print("First element of sub list: "+string_list[-1][0]+" and "
      "second element of sub list: "+string_list[-1][1])
|
||||
|
||||
# Reset string_list to keep things easily tractable
string_list=["a", "b", "c"]

# ALTERNATIVE 3: extend -> items are added at the end
# extend() iterates over its argument and adds every item individually.
string_list.extend("d")

# Try to add multiple items
# 1st try
# The merged string "ef" is iterated character by character.
string_list.extend("e" "f") # -> Two elements are created -> works!!!
print(string_list)
# 2nd try
try:
    string_list.extend("e", "f")
except TypeError as error:
    # Only the expected TypeError is caught. The text of the message is taken
    # from the exception itself so that it matches your Python version.
    print("Wrong syntax. If the command were executed without the try-except "
          "you would get the error 'TypeError: "+str(error)+"'")
# 3rd try
string_list.extend(["e", "f"])
print(string_list) # -> also works!!!
# check length
print("length of list is "+str(len(string_list))) # -> it is 8 and should be 8


# DELETE ITEMS FROM A LIST
string_list.remove("a")
print("List after deletion of 'a' "+str(string_list))
# What happens if an element occurs multiple times
string_list.remove("e")
print("List after further deletion of 'e' "+str(string_list))
# --> only the first occurrence of "e" is deleted


# FURTHER OPERATIONS WITH LISTS
# Accessing parts of a list
# Remember the first element is [0]! And the upper bound of the range is not
# included, i.e. [0:3] means [0], [1] and [2].
print("Sublist from beginning to third element: "+str(string_list[0:3]))
print("Sublist from beginning to third element: "+str(string_list[:3]))
print("Sublist from second(!) to third element: "+str(string_list[1:3]))
print("Sublist from fourth(!) to fifth element: "+str(string_list[3:5]))
print("Sublist from fifth(!) to the end: "+str(string_list[4:]))

# Search in lists
position=string_list.index("b")
print("Position of 'b' is: "+str(position))
# Searching for an element that is not part of the list
try:
    string_list.index("a")
except ValueError as error:
    # .index() raises a ValueError if the element is not in the list.
    print("Error message. If the command were executed without the try-except "
          "you would get the error 'ValueError: "+str(error)+"'")
# To avoid the error, check first whether the element is part of the list.
if "c" in string_list:
    print("'c' is at position: "+str(string_list.index("c")))

# Sort list
# .sort() changes the list itself (in place) and returns nothing.
string_list.sort()
print('Sorted list: '+str(string_list))
string_list.sort(reverse=True)
print('Reversely sorted list: '+str(string_list))
|
||||
|
||||
# What happens when sorting mixed (i.e. integers and strings) lists?
try:
    mixed_list.sort()
except TypeError as error:
    # Integers and strings cannot be compared with each other, so sorting
    # fails. The text of the message is taken from the exception itself
    # because its wording differs across Python versions.
    print("Error message. If the command were executed without the try-except "
          "you would get the error 'TypeError: "+str(error)+"'")
|
||||
|
||||
|
||||
#################################
|
||||
# Part 2: counters
|
||||
#################################
|
||||
'''
|
||||
A Counter is a dictionary subclass for counting hashable objects.
|
||||
It is an unordered collection where elements are stored as dictionary keys and
|
||||
their counts are stored as dictionary values.
|
||||
'''
|
||||
# Creating a counter
counter_obj=collections.Counter(["a", "b", "c", "d", "a", "b", "a"])
print('The counter object is: '+str(counter_obj))
# The previous command is equivalent to
counter_obj=collections.Counter(a=3, b=2, c=1, d=1)
print('The counter object (2nd command) is: '+str(counter_obj))

# Add objects to a counter
# .update() ADDS the new occurrences to the existing frequencies.
counter_obj.update(["e", "f", "e"])
print('The updated counter object is: '+str(counter_obj))
# Alternative command
# Caution: this SETS the frequency of "g" to 4 (it does not add 4).
counter_obj["g"]=4
print('The updated updated counter object is: '+str(counter_obj))

# Length of the counter, i.e. the number of distinct elements
length=len(counter_obj)
print('The length of the counter is: '+str(length))

# Loop over the elements of the counter and their frequency
# enumerate(..., start=1) numbers the elements beginning with 1.
for i, element in enumerate(counter_obj, start=1):
    print("Element "+str(i)+" of the counter: "+str(element))
    print("Frequency of Element "+str(i)+" of the counter: "+str(counter_obj[element]))

# .elements() provides an iterator of all individual elements of the counter;
# every element is repeated as many times as its frequency.
counter_elements=list(counter_obj.elements())
print('Elements of the counter: '+str(counter_elements))
|
||||
|
||||
# APPLY COUNTERS TO TEXTS
sentence1="This is the first sentence."
sentence2="This is the second sentence, which is longer."

# Split sentences in words
# The pattern is written as a raw string (r"...") so that Python passes the
# backslash of \W unchanged to the regex engine.
sentence1_words=re.split(r"\W{1,}", sentence1)
print("The last element is: "+str(sentence1_words[-1]))
# The last element is empty (the sentence ends with a non-word character, the
# period) -> delete it.
sentence1_words.remove("")
print("The last element is: "+str(sentence1_words[-1]))
# -> now okay
sentence2_words=re.split(r"\W{1,}", sentence2)
print("The last element is: "+str(sentence2_words[-1]))
# The last element is empty -> delete it.
sentence2_words.remove("")
print("The last element is: "+str(sentence2_words[-1]))
# -> now okay

# Count words
sentence1_counter=collections.Counter(sentence1_words)
sentence2_counter=collections.Counter(sentence2_words)

print(sentence1_counter)
print(sentence2_counter)
|
||||
|
||||
# OPERATIONS WITH COUNTERS
# add counters
add_counters=sentence1_counter+sentence2_counter
print("You can add counters: "+str(add_counters))

# subtract counters
subtract_counters=sentence1_counter-sentence2_counter
print("You can subtract counters: "+str(subtract_counters))
# Each time a new Counter is produced through an operation, any items with zero
# or negative counts are discarded. --> only "first" appears in subtract_counters

# Intersection of counters
intersection_counters=sentence1_counter & sentence2_counter
print("You can determine the intersection of counters: "+str(intersection_counters))
# -> takes the minimum of occurrences; again elements with zero frequency
# are not included.

# Union of counters
union_counters=sentence1_counter | sentence2_counter
print("You can determine the union of counters: "+str(union_counters))
# -> takes the maximum of occurrences

# MOST FREQUENT WORDS
# Determine the three most frequent words in the add_counters set.
# .most_common(n) returns a list of (word, frequency) pairs.
top_3_words=add_counters.most_common(3)
print("The top 3 words are: "+str(top_3_words))

# Identify the two most frequent words among the top 4 words in the add_counters sample.
top_4_words=add_counters.most_common(4)
# The first [] selects the rank, i.e. [0] is the most frequent word and [1]
# is the second most frequent word.
# The second [] refers either to the word itself [0] or to the frequency of the word [1].
# the most frequent word
top_word=top_4_words[0][0]
top_word_count=top_4_words[0][1]
print("The top word is '"+str(top_word)+"', which appears "+str(top_word_count)+" times")
# the second most frequent word
top_2_word=top_4_words[1][0]
top_2_word_count=top_4_words[1][1]
print("The second most frequent word is '"+str(top_2_word)+"', which appears "+str(top_2_word_count)+" times")


print("Completed")
|
||||
|
|
@ -0,0 +1,447 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
INTRODUCTION TO REGULAR EXPRESSION
|
||||
|
||||
@author: Alexander Hillert, Goethe University Frankfurt
|
||||
This version: June 3, 2019
|
||||
|
||||
What are regular expressions?
|
||||
|
||||
Regular expressions allow you to search for general patterns in texts. The
|
||||
standard string commands like .count("search_term") and .replace("old_word","new_word")
|
||||
can only count and replace one specific word, respectively. They cannot search
|
||||
for general patterns like all words that consist of three or more letters.
|
||||
Assume that you want to identify all numbers in a text or that you search for
|
||||
the year of birth in bios of corporate executives. In the examples, you need a
|
||||
search tool that can process broad patterns --> you need regular expressions.
|
||||
Consider the second example, i.e. you would like to automatically identify
|
||||
people's year of birth from their bios. You know that the number must have four
|
||||
digits and that the first two digits must equal 19. Of course, you could
|
||||
hardcode all possible years (1900, 1901, ..., 1999), but this is unnecessarily
|
||||
complicated and slows down the program. Therefore, it is better to learn
|
||||
how to use regex.
|
||||
|
||||
Useful online resources:
|
||||
1. https://regex101.com/
|
||||
On this webpage, you can enter a text and a regular expression.
|
||||
The webpage highlights the matches and provides explanations for
|
||||
every part of the regex pattern.
|
||||
Caution: click on "Python" in the left menu (the default language is php)!
|
||||
|
||||
2. https://docs.python.org/3/library/re.html
|
||||
The official documentation of regular expressions in Python 3.
|
||||
|
||||
"""
|
||||
|
||||
# To be able to use regular expressions you need to import the re package first.
|
||||
import re
|
||||
|
||||
# Select the directory where you saved the accompanying txt-file.
directory="C:/Lehre/Textual Analysis/Programming/Files/"


# In this introduction, we use the accompanying txt-file "Text_Introduction_Regular_Expressions.txt"
# Open the file and read its content. The with-statement closes the file
# automatically as soon as the indented commands have been executed.
with open(directory+'Text_Introduction_Regular_Expressions.txt','r',encoding='cp1252') as text_file:
    text=text_file.read()
|
||||
|
||||
# Let's start with the example from the beginning and search for people's years of birth.
# The standard search command for regular expressions is re.search. It searches
# for the FIRST match of the expression in the text.
# First try
match=re.search("19[0-9]{2}",text)
# This command searches for four digits of which the first is a 1, the second a 9,
# and then there are two further digits which can be any digits.
# [0-9] refers to any digit. Equivalently, you can write \d which also refers
# to any digit.
# The {2} specifies that there must be exactly two digits.

print(match)
# match contains information on the match:
# span is the position in text where the match starts and ends; here 226 and 230
# furthermore, the matched text is shown. Here, the first match is 1956.
# You can use the positions to print the text before the match, after the match,
# and, of course, the matched text itself.
start=match.start()
end=match.end()
print("From beginning of the document to the match: \n"+text[:start]+"\n\n")
print("The match itself: \n"+text[start:end]+"\n\n")
print("From end of match to end of document: \n"+text[end:]+"\n\n")

# To access the match, you can also use the command .group(0):
print("Alternative way to access the matched text: \n"+match.group(0)+"\n\n")
|
||||
|
||||
# CAUTION
# If no match is found, re.search returns None. In this case, match is not a
# match object and commands like .start(), .end(), or .group() do not work.
# Example: search for a ten digit number that starts with 19
match=re.search("19[0-9]{8}",text)
# If there is no match, the command start=match.start() returns the following error:
# "AttributeError: 'NoneType' object has no attribute 'start'"
# SOLUTION
# Check whether a match has been found before you use it.
if match:
    # match found, the .start() is now conditional on the existence of match
    start=match.start()
    print("Match found. Starting at position "+str(start))
else:
    # no match found
    print("No match found")
|
||||
|
||||
# The following overview is a raw string (r'''...''') so that the backslashes
# are shown literally and Python does not try to interpret them as escape
# sequences.
r'''
Information on Syntax, Special Characters in Regular Expression

Character       Meaning
[]              Indicates a set of characters
\[              Matches the actual [
\]              Matches the actual ]
^               negation; the symbols listed afterwards are not allowed in the match
                E.g., [^0-9] will not match any numbers but all other symbols.
\d              Any digit, i.e. 0, 1, 2, ..., 9. Equivalent to [0-9]
\n              Linefeed/newline, the start of a new line.
\s              Any whitespace, i.e. a tab, a space.
                CAUTION: \s matches also the newline (\n). This property of \s
                can lead to unintended matches.
                RECOMMENDATION: to match whitespaces only use [ \t], i.e. a space
                and a tab (\t).
\S              Any non-whitespace symbol.
.               Any character (digit, letter, symbol [!,?,%,etc.], spaces) but
                NOT the newline, \n.
\.              Matches the actual dot.
\w              Matches word characters, i.e. [0-9a-zA-Z_]
                The underscore (_) is defined to be a word character.
\W              Matches any non-word characters, i.e. [^0-9a-zA-Z_]
|               Or condition (for an example see Example 2 below)
()              Like in math: parentheses indicate which characters of an expression
                belong together. (For an example see Example 2 below.)
\(              Matches the actual (
\)              Matches the actual )

(?i)            Performs the regex case-insensitive. Must be put at the beginning
                of the regex. E.g. re.search("(?i)TeSt",text) will match
                TEST, test, Test, etc.
re.IGNORECASE   Performs the regex case-insensitive. Must be put at the end of
                the regex as an option. E.g. re.search("test",text,re.IGNORECASE)
'''
|
||||
# Examples of character sets
# 1. [0-9]: numbers
match=re.search("[0-9]","ABC abc 123")
print(match)
# 2. [a-z]: any lower case letter
match=re.search("[a-z]","ABC abc 123")
print(match)
# 3. [A-Z]: any upper case letter
match=re.search("[A-Z]","ABC abc 123")
print(match)
# 4. [cde]: lower case letters c, d, and e.
match=re.search("[cde]","ABC abc 123")
print(match)
# 5. [^A-Zab]: all symbols except capital letters and a and b.
match=re.search("[^A-Zab]","ABC abc 123")
print(match)
# you don't see any character because the match is the first white space before abc
|
||||
|
||||
|
||||
'''
|
||||
Quantifiers for regular expression:
|
||||
n and m refer to non-negative integers (0, 1, 2, ...), where m>n
|
||||
Quantifier Meaning
|
||||
{n} The preceding pattern must be found EXACTLY n times.
|
||||
{n,} The preceding pattern must be found AT LEAST n times.
|
||||
{,n} The preceding pattern must be found AT MOST n times.
|
||||
{n,m} The preceding pattern must be found AT LEAST n but AT MOST m times.
|
||||
{n,}? The ? tells the regex not to be "greedy" (see the greedy vs. non-greedy example with text2 below for details)
|
||||
|
||||
There are alternative notations for commonly used quantifiers:
|
||||
* is equivalent to {0,}, i.e. 0 or more repetitions of the preceding pattern.
|
||||
+ is equivalent to {1,}, i.e. 1 or more repetitions of the preceding pattern.
|
||||
? is equivalent to {0,1}, i.e. 0 or 1 repetition of the preceding pattern.
|
||||
'''
|
||||
|
||||
# re.search() returns only the first match: How to get all matches?
# Alternative 1: use a loop.
# text1 is a working copy that is shortened after every match.
text1=text
i=1
match=re.search("19[0-9]{2}",text1)
# Repeat the following commands until no more matches are found.
while match:
    print("This is match number "+str(i)+": "+match.group(0))
    # Check whether there are further matches after the end of the previous match
    end=match.end()
    # Keep only the part of the text after the match and search again.
    text1=text1[end:]
    match=re.search("19[0-9]{2}",text1)
    i=i+1
|
||||
|
||||
# Alternative 2: use re.findall
# The syntax is identical to re.search
list_of_matches=re.findall("19[0-9]{2}",text)
print(list_of_matches)
# the individual matches can be called by list_of_matches[i], where i ranges
# from zero to the number of matches minus one.
# Remember: the first element of a list has the position 0
# enumerate(..., start=1) returns the matches together with a running number
# that begins with 1.
for i, matched_year in enumerate(list_of_matches, start=1):
    print("This is match number "+str(i)+" using the re.findall command: "+matched_year)
|
||||
|
||||
|
||||
# When you read the text you will observe that there are only six years of birth
# in the text and not eight -> there are two mismatches -> adjust filter to
# get only the years of birth and not all years.
text1=text
i=1
# Check whether the word born appears before the year. The distance between
# born and the year must be smaller or equal 15 (plus the two white spaces)
match=re.search("born .{,15} 19[0-9]{2}",text1)
while match:
    print("This is match number "+str(i)+": "+match.group(0))
    # Extract the year from the matched text
    match1=re.search("19[0-9]{2}",match.group(0))
    print("The year of match number "+str(i)+" is: "+match1.group(0))
    # Check whether there are further matches after the end of the previous match
    end=match.end()
    text1=text1[end:]
    match=re.search("born .{,15} 19[0-9]{2}",text1)
    i=i+1
|
||||
|
||||
|
||||
# The quantifiers introduced above are "greedy". For example, if a pattern matches overlapping
# text parts of different length, the regex will return the longest match.
# Example: search for the first sentence in a text. You know that sentences
# end with period in this example.
text2="This is the first senctence. This is the second sentence. And so on"
# Search for a positive number of occurrences of characters followed by a period.
# Remember that the dot is \. in regex. The . will match any character.
# Patterns that contain a backslash are written as raw strings (r"...") so
# that Python passes the backslash unchanged to the regex engine.
match=re.search(r".{1,}\.",text2)
print(match.group(0))
# -> the regex returns the first and second sentence.
# To get the shortest match that fulfils the regex, put a ? after the quantifier.
# This makes the quantifier "non-greedy", and only the first sentence will be matched.
match=re.search(r".{1,}?\.",text2)
print(match.group(0))
|
||||
|
||||
# You will often have situations where there are multiple versions of the same
# pattern. How can you include all of them in one regular expression?
# Example 1: search for the word "losses" in the following sentence:
text3="X Corp's soda division returned significant losses in the last quarter. Losses will be reduced this quarter."
# the first letter of "loss" can be upper or lower case
print("Example 1: Loss and loss")
# text4 is a working copy that is shortened after every match.
text4=text3
i=1
# A set of characters [] is matched if at least one of the components of the
# set is found in the text. This works only for a single letter/number/symbol
# but not for sequences of multiple letters/numbers/symbols.
match=re.search("[Ll]oss",text4)
while match:
    end=match.end()
    print("This is match number "+str(i)+": "+match.group(0))
    # Check whether there are further matches after the end of the previous match
    text4=text4[end:]
    match=re.search("[Ll]oss",text4)
    i=i+1

# Alternatively
list_of_matches=re.findall("[Ll]oss",text3)
print("Alternative using re.findall: "+str(list_of_matches))

# In this example, you could also simply perform a case-insensitive match.
print("Case-INsensitive matching using re.IGNORECASE")
text4=text3
i=1
match=re.search("loss",text4,re.IGNORECASE)
while match:
    end=match.end()
    print("This is match number "+str(i)+": "+match.group(0))
    # Check whether there are further matches after the end of the previous match
    text4=text4[end:]
    match=re.search("loss",text4,re.IGNORECASE)
    i=i+1
# Or equivalently
print("Case-INsensitive matching using (?i)")
text4=text3
i=1
match=re.search("(?i)loss",text4)
while match:
    end=match.end()
    print("This is match number "+str(i)+": "+match.group(0))
    # Check whether there are further matches after the end of the previous match
    text4=text4[end:]
    match=re.search("(?i)loss",text4)
    i=i+1
|
||||
|
||||
|
||||
# Example 2: search for the expressions "profits declined" and "profits decreased"
# in the following sentence:
text3="X Corp's profits declined in 2010, while Y Inc.'s profits decreased the year before."
# Here, [] no longer works because we need to match terms consisting of several
# characters and [] matches only one character. -> use the OR-operator |
print("Example 2: profits declied and profits decreased - First try")
text4=text3
i=1
match=re.search("profits declined|decreased",text4)
while match:
    print("This is match number "+str(i)+": "+match.group(0))
    # Check whether there are further matches after the end of the previous match
    end=match.end()
    text4=text4[end:]
    match=re.search("profits declined|decreased",text4)
    i=i+1
# Problem: regex interprets the entire set of characters before the | as one
# alternative, i.e. it searches for "profits declined" OR "decreased".
# Solution: use parentheses to define the boundaries.

print("Example 2: profits declied and profits decreased - Second try")
text4=text3
i=1
match=re.search("profits (declined|decreased)",text4)
while match:
    print("This is match number "+str(i)+": "+match.group(0))
    # Check whether there are further matches after the end of the previous match
    end=match.end()
    text4=text4[end:]
    match=re.search("profits (declined|decreased)",text4)
    i=i+1

# Alternative: does re.findall work?
list_of_matches=re.findall("profits (declined|decreased)",text3)
print(list_of_matches)
# -> No! Because there is a major difference between re.search and re.findall
# in the way they treat parentheses ().
# match.group(0) of re.search always contains the full match. re.findall, in
# contrast, returns only the text matched by the parentheses (the "group")
# as soon as the pattern contains parentheses.
# To get the full matches from re.findall you can write down the full text
# before and after the |. (Alternatively, a non-capturing group (?:...) can be
# used; see Example 3.)
list_of_matches=re.findall("profits declined|profits decreased",text3)
print(list_of_matches)
|
||||
|
||||
|
||||
# More information on the difference between re.search and re.findall
# Example 3: let's search for the numbers in the second part of the txt file
# and compare what the two commands do.
# Get the second part
match=re.search("Here are some numbers:",text)
text4=text[match.end():]
print(text4)
# Patterns that contain a backslash are written as raw strings (r"...") so
# that Python passes the backslash unchanged to the regex engine.
match=re.search(r"[0-9]{1,}([0-9]{3}|,){0,}\.{0,1}[0-9]{0,}",text4)
# What are the individual parts of this pattern?
# [0-9]{1,}         There has to be at least one digit.
# ([0-9]{3}|,){0,}  The first digit can be followed by combinations of three
#                   digits and commas (as thousand separator).
# \.{0,1}           There can be zero or one period as decimal separator.
# [0-9]{0,}         There can be multiple decimal places.

i=1
while match:
    print("This is match number "+str(i)+": "+match.group(0))
    # Check whether there are further matches after the end of the previous match
    end=match.end()
    text4=text4[end:]
    match=re.search(r"[0-9]{1,}([0-9]{3}|,){0,}\.{0,1}[0-9]{0,}",text4)
    i=i+1

# Can we obtain the same result by using re.findall?
match=re.search("Here are some numbers:",text)
text4=text[match.end():]
list_of_matches=re.findall(r"[0-9]{1,}([0-9]{3}|,){0,}\.{0,1}[0-9]{0,}",text4)
print(list_of_matches)
# Does not work! re.findall returns only the text matched by the parentheses.
# One has to put "?:" in the part that captures the repetition of the thousands.
# This tells re.findall to return the full match and not subpatterns.
list_of_matches=re.findall(r"[0-9]{1,}(?:[0-9]{3}|,){0,}\.{0,1}[0-9]{0,}",text4)
print(list_of_matches)

# TAKE AWAY: The matching of re.findall does not always match that of re.search
# Be careful when using re.findall!!!
|
||||
|
||||
|
||||
# How to delete or substitute parts of texts?
# Alternative 1: identify the beginning and end of the matched text part and
# remove it from the overall text.
# Example: delete all numbers in the text
text4=text
print("Original Text:\n"+text4)
# Patterns that contain a backslash are written as raw strings (r"...") so
# that Python passes the backslash unchanged to the regex engine.
match=re.search(r"[0-9]{1,}(,[0-9]{3}){0,}(\.[0-9]{1,}){0,1}",text4)
while match:
    # Remove the match: keep the text before and the text after the match
    text4=text4[:match.start()]+text4[match.end():]
    # Check whether there are further matches in the remaining text
    match=re.search(r"[0-9]{1,}(,[0-9]{3}){0,}(\.[0-9]{1,}){0,1}",text4)
print("Text without numbers using re.search:\n"+text4)

# Alternative 2: use re.sub (sub -> substitute)
# syntax: new_text=re.sub(pattern, replacement, old_text)
# replacement is some string. Regular expressions are only allowed in the pattern
# but not in the replacement.
text4=text
text4=re.sub(r"[0-9]{1,}(,[0-9]{3}){0,}(\.[0-9]{1,}){0,1}","",text4)

print("Text without numbers using re.sub:\n"+text4)
# re.sub is the more efficient way.
# Furthermore, re.sub can not only delete text but also replace text.
# Example
text4=text
text4=re.sub(r"[0-9]{1,}(,[0-9]{3}){0,}(\.[0-9]{1,}){0,1}","NUMBER",text4)
print("Text where numbers are replaced by the word 'NUMBER':\n"+text4)
|
||||
|
||||
|
||||
# Make sure you get the right match --> importance of word boundaries.
# When you search for a word it can happen that the word is part of a different
# longer word. For example, searching for "high" would also match "highlight".
# To avoid such mismatches you can either include word boundaries in the search
# (Alternative 1) or split the text first by word boundaries into single words
# and perform standard string search operations afterwards (Alternative 2).
# Alternative 2 does not return the individual matches but tells you for example
# the number of matches
# Example: search for the word "is"
# Alternative 1:
match=re.search("is",text)
# max(0, ...) makes sure that the start of the printed text part cannot become
# negative if the match is located at the very beginning of the text.
print("Searching without word boundaries yields: '"+match.group(0)+
      "' But the surrounding text is: '"+text[max(0,match.start()-1):match.end()+1]+"'")
# Patterns that contain a backslash are written as raw strings (r"...") so
# that Python passes the backslash unchanged to the regex engine.
match=re.search(r"\Wis\W",text)
print("Searching with word boundaries yields: '"+match.group(0)+
      "' and the surrounding text is: '"+text[max(0,match.start()-1):match.end()+1]+"'")
# You see that the preceding and subsequent word boundaries are also matched
# and saved as the matched term. However, often you want the match to include only
# the actual word without its boundaries.
# Solution: use so called "look ahead" and "look back" conditions.
|
||||
|
||||
# The following overview is a raw string (r'''...''') so that the backslashes
# are shown literally and Python does not try to interpret them as escape
# sequences.
r'''
Look ahead and look behind/back conditions

Regex requires that the parts of the pattern that are classified as look ahead
or look back/behind are present in the text but does not include them in the match.

Syntax:
positive look ahead:  (?=)    Example: X(?=\W) requires that there is a word
                              boundary after X
negative look ahead:  (?!)    Example: X(?!\W) requires that there must NOT
                              be a word boundary after X.
positive look back:   (?<=)   Example: (?<=\W)X requires that there is a word
                              boundary before X
negative look back:   (?<!)   Example: (?<!\W)X requires that there must NOT
                              be a word boundary before X.
'''
|
||||
# Patterns that contain a backslash are written as raw strings (r"...") so
# that Python passes the backslash unchanged to the regex engine.
match=re.search(r"(?<=\W)is(?=\W)",text)
print("Searching with word boundaries as look ahead and look back condition yields: '"
      +match.group(0)+"' and the surrounding text is: '"+text[match.start()-1:match.end()+1]+"'")

# Does it also work with re.findall?
list_of_matches=re.findall(r"\Wis\W",text)
print("Word boundaries using re.findall: "+str(list_of_matches))
list_of_matches=re.findall(r"(?<=\W)is(?=\W)",text)
print("Word boundaries as look ahead and look back condition using re.findall: "+str(list_of_matches))
print("In total there are "+str(len(list_of_matches))+" matches.")
# --> Yes, the approach also works with re.findall.

# Alternative 2:
# Use re.split(), which is similar to split() but more powerful.
text_split=re.split(r"\W",text)
print(text_split)
# Problem: there are elements in the list that are not words, e.g. ''. These
# elements are created because there can be a series of non-word characters (\W),
# e.g. ' (' in 'Balmer (born'.
# Solution: treat a series of word boundaries \W as a single split character
text_split=re.split(r"\W{1,}",text)
print(text_split)
# Now, you do not need to include word boundaries and can use standard list
# operations: .count() counts the list elements that are exactly equal to "is".
number_matches=text_split.count("is")
print("Using standard string operations, we get "+str(number_matches)+" matches.")
# -> same result.
|
||||
|
|
@ -0,0 +1,485 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Mon Mar 21 09:38:32 2022
|
||||
|
||||
@author: Alexander Hillert, Goethe University Frankfurt
|
||||
"""
|
||||
|
||||
'''
|
||||
This script introduces you to linear models using the sklearn package.
|
||||
Besides sklearn, we will use pandas to work with data sets as well as
|
||||
numpy to perform computations.
|
||||
|
||||
The introduction consists of 10 parts:
|
||||
1. linear regressions using a toy data set
|
||||
2. linear regressions using a "real" data set
|
||||
3. linear regressions using standardized variables
|
||||
4. Ridge regression basics
|
||||
5. Ridge regression with training, tuning, and testing sample
|
||||
6. Ridge regression with cross-validation
|
||||
7. LASSO regression basics
|
||||
8. LASSO regression with training, tuning, and testing sample
|
||||
9. LASSO regression with cross-validation
|
||||
10. Compare the results from Ridge and LASSO
|
||||
|
||||
'''
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
# For OLS regressions
|
||||
from sklearn.linear_model import LinearRegression
|
||||
# for Ridge regressions
|
||||
from sklearn.linear_model import Ridge
|
||||
# for computing mean squared errors
|
||||
from sklearn.metrics import mean_squared_error
|
||||
# for plotting the MSEs for different levels of Lambda
|
||||
import matplotlib.pyplot as plot
|
||||
# for Ridge regressions with cross-validation
|
||||
from sklearn.linear_model import RidgeCV
|
||||
# for LASSO regressions
|
||||
from sklearn.linear_model import Lasso
|
||||
# for LASSO regressions with cross-validation
|
||||
from sklearn.linear_model import LassoCV
|
||||
|
||||
# Folder that contains the input data and receives the output file.
# Adjust the directory to your folder!!!
# Keep the trailing slash: the file names are appended to this string by plain
# concatenation (directory+"file name").
directory="C:/Lehre/Machine Learning/Data/"
|
||||
|
||||
############################################################
# Part 1. Basics: linear regressions in Python using sklearn
############################################################
print("\nPart 1: Run an OLS regression on a sandbox data set\n")

# Draw a single random number from a normal distribution with mean 0 and
# standard deviation 1.
random_number=np.random.normal(loc=0, scale=1)
print(f"A random number is: {random_number}")

# The parameter size=(# of rows, # of columns) turns the draw into a vector or
# matrix of random numbers. Here we create a (10,1) vector; it serves as the
# error term epsilon further below.
random_number_vector=np.random.normal(loc=0, scale=1, size=(10,1))
print("The vector of random numbers is:")
print(random_number_vector)

# The independent variable x is a second (10,1) vector of random numbers.
x_vector=np.random.normal(loc=0, scale=1, size=(10,1))
print("The vector of the independent variable x is:")
print(x_vector)

# The dependent variable y is constructed as
# y = 2x + epsilon, where epsilon is the random error term from above.
y_vector=2*x_vector + random_number_vector
print("The vector of the dependent variable y is:")
print(y_vector)

# Standard OLS regression with intercept.
# fit() takes x (independent variable(s)) first and then y (dependent variable).
# The intercept is included by default, so, strictly speaking, the
# (fit_intercept=True) option is not needed.
regression_1=LinearRegression(fit_intercept=True)
regression_1.fit(x_vector, y_vector)

# Display the intercept and the beta coefficient on x.
print(f"The intercept is: {regression_1.intercept_}")
# To get it as a scalar/number and not as an array, index the array.
# (This line only shows a value when it is executed interactively.)
regression_1.intercept_[0]

print(f"The coefficient on x is: {regression_1.coef_}")
# Same idea for the coefficient, which is stored in a two-dimensional array.
regression_1.coef_[0][0]

# R2 of the regression
print(f"The R2 is: {regression_1.score(x_vector, y_vector)}")
|
||||
|
||||
|
||||
###############################################################
# Part 2: linear regression using a "real" data set
###############################################################
print("\nPart 2: Run an OLS regression with a real data set\n")

# Import the data for this problem.
# The data set consists of 200 independent variables (x1 to x200) and
# a dependent variable (y).
# There are 1,200 observations in total. In the later parts, we will
# use the first 1,000 observations for training and the last 200 for testing.
# The data are simulated using the following process:
# y = 0.5*x1 + 0.5*x2 + ... + 0.5*x100 + random error (mean 0, std. dev. 4)
# The x101 to x200 are not directly related to y but are correlated with
# the x1 to x100. More specifically,
# x101 = 0.7*x1 + random error (mean 0, std. dev. 1)
# x102 = 0.7*x2 + random error (mean 0, std. dev. 1)
# ...
# x200 = 0.7*x100 + random error (mean 0, std. dev. 1)
data_frame=pd.read_csv(f"{directory}regression_data_scikit.csv", sep=";")

# To get an idea about the data, display the first five data points.
# (The table is only shown when this line is executed interactively.)
data_frame.head(5)

# Split the data into the independent and the dependent variables.
data_matrix=data_frame.values
# All columns except the last one are the independent variables (x1 to x200).
x_variables=data_matrix[:, :-1]
# The last column (column 201) is the dependent variable (y). The slice -1:
# keeps it as a two-dimensional column vector.
y_variable=data_matrix[:, -1:]

# Run a standard OLS regression.
regression_OLS=LinearRegression(fit_intercept=True)
regression_OLS.fit(x_variables, y_variable)
# You can double check the results by rerunning the regression in Stata or R.

# Display the intercept and the beta coefficients on x1 and x51.
print(f"The intercept is: {regression_OLS.intercept_[0]}")
print(f"The coefficient on x_1 is: {regression_OLS.coef_[0][0]}")
print(f"The coefficient on x_51 is: {regression_OLS.coef_[0][50]}")

# R2 of the regression
print(f"The R2 is: {regression_OLS.score(x_variables, y_variable)}")
|
||||
|
||||
|
||||
##################################################################
# Part 3: standardize the data to have mean zero and unit variance
# and rerun the regression
##################################################################
print("\nPart 3a.: Standardize variables\n")

# Standardize x to have mean zero and unit variance.
# axis=0 (axis=1) means that the computation is executed column (row) wise.
x_variables_mean=x_variables.mean(axis=0)
# ddof=1 means that we divide by n-1 when computing the standard deviation.
x_variables_standard_deviation=x_variables.std(axis=0, ddof=1)
x_variables_standardized=(x_variables-x_variables_mean)/x_variables_standard_deviation

# Repeat the same exercise for y.
y_variable_mean=y_variable.mean(axis=0)
y_variable_standard_deviation=y_variable.std(axis=0, ddof=1)
y_variable_standardized=(y_variable-y_variable_mean)/y_variable_standard_deviation

# Rerun the regression using the standardized data.
regression_OLS_standardized=LinearRegression(fit_intercept=True)
regression_OLS_standardized.fit(x_variables_standardized, y_variable_standardized)
# Results are identical to a regression in Stata with beta coefficients.

# Display the intercept and the beta coefficients on x_1 and x_51.
print(f"The intercept is: {regression_OLS_standardized.intercept_[0]}")
print(f"The coefficient on x_1 is: {regression_OLS_standardized.coef_[0][0]}")
print(f"The coefficient on x_51 is: {regression_OLS_standardized.coef_[0][50]}")

# R2 of the regression
r2_standardized=regression_OLS_standardized.score(x_variables_standardized, y_variable_standardized)
print(f"The R2 is: {r2_standardized}")
# The R2 is identical to the one from Part 2 -> good!
|
||||
|
||||
#######################################################################################
# CAUTION: be careful using the "normalize=True" option in the LinearRegression module!
#######################################################################################
# This part refits the OLS regression from Part 2 on the raw (unstandardized)
# data, with the only difference that normalize=True is passed.
# NOTE(review): the "normalize" parameter of LinearRegression was deprecated in
# scikit-learn 1.0 and removed in 1.2; on such versions the call below is
# expected to fail with a TypeError -- confirm against your installed version.
print("\nPart 3b.: Regression with 'normalization'\n")
# Normalizer works on the rows, not the columns!
# By default, L2 normalization is applied to each observation so that the
# values in a row (!) have a unit norm. Unit norm with L2 means that if each
# element were squared and summed, the total would equal 1.
# NOTE(review): the description above matches sklearn.preprocessing.Normalizer;
# verify in the LinearRegression documentation whether normalize=True really
# operates on rows.
regression_OLS_normalized=LinearRegression(fit_intercept=True,normalize=True).fit(x_variables, y_variable)

# display the intercept and the beta coefficients on x_1 and x_51
print("The intercept is: "+str(regression_OLS_normalized.intercept_[0]))
print("The coefficient on x_1 is: "+str(regression_OLS_normalized.coef_[0][0]))
print("The coefficient on x_51 is: "+str(regression_OLS_normalized.coef_[0][50]))
# The coefficients are different from the ones above, highlighting that the
# "normalize=True" option does not do the same as "normal" standardizing.
# R2 of the regression
print("The R2 is: "+str(regression_OLS_normalized.score(x_variables, y_variable)))
|
||||
|
||||
|
||||
#######################################################################
# Part 4: Ridge regression on the full sample (no training and testing)
# This part is to learn the syntax.
# We are using the standardized variables to have the same penalty
# for a given effect of x on y.
# Remember: if the independent variables are measured on very different
# scales, the beta coefficients have different sizes (e.g., market cap in
# thousand USD vs. past stock returns as a decimal number) and, thus,
# the penalty would be applied inconsistently.
#######################################################################
print("\nPart 4: Ridge regression - learning the syntax\n")

# The parameter alpha corresponds to the penalty parameter Lambda from
# the notation that is typically used.
# The intercept is included by default, so you do not need the
# "fit_intercept=True" parameter. But it is good to keep in mind what
# specification you are using.
regression_Ridge=Ridge(alpha=10, fit_intercept=True)
regression_Ridge.fit(x_variables_standardized, y_variable_standardized)

# Display the intercept and the beta coefficients on x1 and x51.
print(f"The intercept is: {regression_Ridge.intercept_[0]}")
print(f"The coefficient on x_1 is: {regression_Ridge.coef_[0][0]}")
print(f"The coefficient on x_51 is: {regression_Ridge.coef_[0][50]}")

# R2 of the regression
r2_Ridge=regression_Ridge.score(x_variables_standardized, y_variable_standardized)
print(f"The R2 is: {r2_Ridge}")

# How to compute the mean squared error (MSE)?
# Step 1: get the predicted values from the fitted model.
y_variable_standardized_predicted=regression_Ridge.predict(x_variables_standardized)
# Step 2: compare the predictions with the actual values.
mse_Ridge=mean_squared_error(y_variable_standardized, y_variable_standardized_predicted)
print(f"The MSE of the prediction is: {mse_Ridge}")
|
||||
|
||||
|
||||
#######################################################################
# Part 5: Ridge regression using a training, tuning, and testing sample
#######################################################################
print("\nPart 5: Ridge regression - Application with training, tuning, and testing data\n")

# Create a training, a tuning, and a testing sample by slicing the
# standardized data row-wise. A single slice in the brackets selects rows and
# keeps all columns.

# The training data are the first 800 rows.
x_variables_std_train=x_variables_standardized[:800]
y_variable_std_train=y_variable_standardized[:800]

# The tuning data are rows 801 to 1000 -> 200 observations.
x_variables_std_tune=x_variables_standardized[800:1000]
y_variable_std_tune=y_variable_standardized[800:1000]

# The testing data are all rows after the first 1000 (the last 200 rows).
x_variables_std_test=x_variables_standardized[1000:]
y_variable_std_test=y_variable_standardized[1000:]
|
||||
|
||||
|
||||
##########################
# find the optimal Lambda
##########################
# We store the MSE of the training/tuning data for each Lambda.
mse_train_list=[]
mse_tune_list=[]

# Again, Lambda and Alpha refer to the same thing.
# The grid runs from 0.1 up to (but excluding) 100 in steps of 0.1.
# Each value is derived from an integer counter instead of repeatedly adding
# 0.1 to the previous value: repeated floating-point addition accumulates
# rounding errors (e.g., 0.1+0.1+0.1 gives 0.30000000000000004), so the grid
# values would drift and the number of iterations would depend on that drift.
alpha_list=[step/10 for step in range(1,1000)]

for alpha in alpha_list:
    # train the model on the training sample only
    regression_Ridge_train=Ridge(alpha=alpha,fit_intercept=True).fit(x_variables_std_train, y_variable_std_train)
    # predict y in the training sample
    y_variable_std_train_predicted=regression_Ridge_train.predict(x_variables_std_train)
    # predict y in the tuning sample
    y_variable_std_tune_predicted=regression_Ridge_train.predict(x_variables_std_tune)
    # compute the MSE in both samples
    mse_train=mean_squared_error(y_variable_std_train, y_variable_std_train_predicted)
    mse_tune=mean_squared_error(y_variable_std_tune, y_variable_std_tune_predicted)
    # append the MSEs to the two lists; position k belongs to alpha_list[k]
    mse_train_list.append(mse_train)
    mse_tune_list.append(mse_tune)
|
||||
|
||||
########################################
# plot the MSEs for the different alphas
########################################
# MSE in the training sample
plot.scatter(x=alpha_list, y=mse_train_list)
plot.show()
# higher Lambda associated with higher MSE

# MSE in the tuning sample
plot.scatter(x=alpha_list, y=mse_tune_list)
plot.show()
# there is an optimal alpha with the lowest MSE

######################################
# determine the optimal Lambda
######################################
# Find the smallest tuning MSE together with its position in the list.
# If several entries share the smallest value, the first one is used.
index_min_MSE, minimum=min(enumerate(mse_tune_list), key=lambda pair: pair[1])
print(f"The smallest MSE is {minimum}")
# The alpha stored at the same position is the optimal one.
alpha_optimal=alpha_list[index_min_MSE]
print(f"The optimal alpha is {alpha_optimal}")
|
||||
|
||||
#############################################################
# What is the out-of-sample performance of the optimal model?
#############################################################
# Stack the training and the tuning set to obtain the full training data set
# (1000 observations).
x_variables_std_train_total=np.vstack((x_variables_std_train, x_variables_std_tune))
y_variable_std_train_total=np.vstack((y_variable_std_train, y_variable_std_tune))

# Train the model with the optimal Lambda on the training and tuning data.
regression_Ridge_optimal=Ridge(alpha=alpha_optimal, fit_intercept=True)
regression_Ridge_optimal.fit(x_variables_std_train_total, y_variable_std_train_total)

# Mean squared error
# Predict y in the full training sample ...
y_variable_std_train_total_predicted=regression_Ridge_optimal.predict(x_variables_std_train_total)
# ... and in the testing sample.
# Remember: we have not used the testing data yet. Firewall principle!!!
y_variable_std_test_predicted=regression_Ridge_optimal.predict(x_variables_std_test)

mse_train_total=mean_squared_error(y_variable_std_train_total, y_variable_std_train_total_predicted)
print(f"The MSE in the full training data is: {mse_train_total}")
mse_test=mean_squared_error(y_variable_std_test, y_variable_std_test_predicted)
print(f"The MSE in the testing data is: {mse_test}")
|
||||
|
||||
|
||||
#############################################################
# Part 6: Ridge regression with k-fold cross-validation
# Implement the cross validation using a package
#############################################################
print("\nPart 6. Ridge regression - Using cross-validation\n")

# The default for cv is the leave-one-out cross-validation.
# Here we apply five-fold cross-validation and let RidgeCV choose among the
# alphas from the grid search above.
regression_Ridge_cv=RidgeCV(alphas=alpha_list, fit_intercept=True, cv=5)
regression_Ridge_cv.fit(x_variables_std_train_total, y_variable_std_train_total)

# Get the optimal lambda selected by the cross-validation.
alpha_optimal_cv=regression_Ridge_cv.alpha_
print(f"The optimal alpha is {alpha_optimal_cv}")

# Mean squared error using the cross-validated model
# Predict y in the full training sample ...
y_variable_std_train_total_predicted_cv=regression_Ridge_cv.predict(x_variables_std_train_total)
# ... and in the testing sample.
y_variable_std_test_predicted_cv=regression_Ridge_cv.predict(x_variables_std_test)

mse_train_total_cv=mean_squared_error(y_variable_std_train_total, y_variable_std_train_total_predicted_cv)
print(f"The MSE in the full training data is: {mse_train_total_cv}")
mse_test_cv=mean_squared_error(y_variable_std_test, y_variable_std_test_predicted_cv)
print(f"The MSE in the testing data is: {mse_test_cv}")
|
||||
|
||||
|
||||
###########################################
# Part 7: LASSO regression
# on the full sample -> to learn the syntax
###########################################

print("\nPart 7: LASSO regression - learning the syntax\n")
# The parameter alpha corresponds to the penalty parameter Lambda from
# the notation that is typically used.
# The intercept is included by default, so you do not need the
# "fit_intercept=True" parameter. But it is good to keep in mind what
# specification you are using.
regression_Lasso=Lasso(alpha=0.1, fit_intercept=True)
regression_Lasso.fit(x_variables_standardized, y_variable_standardized)

# Display the intercept and the beta coefficients on x1 and x51.
# Note that the coefficients are indexed with a single index here.
print(f"The intercept is: {regression_Lasso.intercept_[0]}")
print(f"The coefficient on x_1 is: {regression_Lasso.coef_[0]}")
print(f"The coefficient on x_51 is: {regression_Lasso.coef_[50]}")

# R2 of the regression
r2_Lasso=regression_Lasso.score(x_variables_standardized, y_variable_standardized)
print(f"The R2 is: {r2_Lasso}")

# How to compute the mean squared error (MSE)?
# Step 1: get the predicted values from the fitted model.
y_variable_standardized_predicted=regression_Lasso.predict(x_variables_standardized)
# Step 2: compare the predictions with the actual values.
mse_Lasso=mean_squared_error(y_variable_standardized, y_variable_standardized_predicted)
print(f"The MSE of the prediction is: {mse_Lasso}")
|
||||
|
||||
|
||||
####################################################
# Part 8: Create a training, tune and testing sample
####################################################
print("\nPart 8: LASSO regression - Application with training, tuning, and testing data\n")
# We use the same training, tuning, and testing data as in part 5.
# -> no need to redefine the data sets.

#################################
# find the optimal Lambda
#################################
# We store the MSE of the training/tuning data for each Lambda.
mse_train_list=[]
mse_tune_list=[]

# Again, Lambda and Alpha refer to the same thing.
# The grid runs from 0.0001 up to (but excluding) 0.25 in steps of 0.0001.
# Each value is derived from an integer counter instead of repeatedly adding
# 0.0001 to the previous value: repeated floating-point addition accumulates
# rounding errors, so the grid values would drift and the number of
# iterations would depend on that drift.
alpha_list=[step/10000 for step in range(1,2500)]

for alpha in alpha_list:
    # train the model on the training sample only
    regression_Lasso_train=Lasso(alpha=alpha,fit_intercept=True).fit(x_variables_std_train, y_variable_std_train)
    # predict y in the training sample
    y_variable_std_train_predicted=regression_Lasso_train.predict(x_variables_std_train)
    # predict y in the tuning sample
    y_variable_std_tune_predicted=regression_Lasso_train.predict(x_variables_std_tune)
    # compute the MSE in both samples
    mse_train=mean_squared_error(y_variable_std_train, y_variable_std_train_predicted)
    mse_tune=mean_squared_error(y_variable_std_tune, y_variable_std_tune_predicted)
    # append the MSEs to the two lists; position k belongs to alpha_list[k]
    mse_train_list.append(mse_train)
    mse_tune_list.append(mse_tune)
|
||||
|
||||
########################################
# plot the MSEs for the different alphas
########################################

# MSE in the training sample
plot.scatter(x=alpha_list, y=mse_train_list)
plot.show()
# higher Lambda associated with higher MSE

# MSE in the tuning sample
plot.scatter(x=alpha_list, y=mse_tune_list)
plot.show()
# there is an optimal alpha with the lowest MSE


######################################
# determine the optimal Lambda
######################################
# Find the smallest tuning MSE together with its position in the list.
# If several entries share the smallest value, the first one is used.
index_min_MSE, minimum=min(enumerate(mse_tune_list), key=lambda pair: pair[1])
print(f"The smallest MSE is {minimum}")
# The alpha stored at the same position is the optimal one.
alpha_optimal=alpha_list[index_min_MSE]

print(f"The optimal alpha is {alpha_optimal}")
|
||||
|
||||
#############################################################
# What is the out-of-sample performance of the optimal model?
#############################################################
# Take the full training data set (1000 observations; training + tuning).
# The variables are the same as in Part 5.

# Train the model with the optimal Lambda on the training and tuning data.
regression_Lasso_optimal=Lasso(alpha=alpha_optimal, fit_intercept=True)
regression_Lasso_optimal.fit(x_variables_std_train_total, y_variable_std_train_total)

# Mean squared error
# Predict y in the full training sample ...
y_variable_std_train_total_predicted=regression_Lasso_optimal.predict(x_variables_std_train_total)
# ... and in the testing sample.
y_variable_std_test_predicted=regression_Lasso_optimal.predict(x_variables_std_test)

mse_train_total=mean_squared_error(y_variable_std_train_total, y_variable_std_train_total_predicted)
print(f"The MSE in the full training data is: {mse_train_total}")
mse_test=mean_squared_error(y_variable_std_test, y_variable_std_test_predicted)
print(f"The MSE in the testing data is: {mse_test}")
|
||||
|
||||
|
||||
#############################################################
# Part 9: Implement the cross validation using a package
#############################################################
print("\nPart 9: LASSO regression - Using cross-validation\n")

# The default for cv in LassoCV is the 5-fold cross-validation; we state it
# explicitly. LassoCV chooses among the alphas from the grid search above.
regression_Lasso_cv=LassoCV(alphas=alpha_list, fit_intercept=True, cv=5)
regression_Lasso_cv.fit(x_variables_std_train_total, y_variable_std_train_total)

# Get the optimal lambda selected by the cross-validation.
alpha_optimal_cv=regression_Lasso_cv.alpha_
print(f"The optimal alpha is {alpha_optimal_cv}")

# Mean squared error using the cross-validated model
# Predict y in the full training sample ...
y_variable_std_train_total_predicted_cv=regression_Lasso_cv.predict(x_variables_std_train_total)
# ... and in the testing sample.
y_variable_std_test_predicted_cv=regression_Lasso_cv.predict(x_variables_std_test)

mse_train_total_cv=mean_squared_error(y_variable_std_train_total, y_variable_std_train_total_predicted_cv)
print(f"The MSE in the full training data is: {mse_train_total_cv}")
mse_test_cv=mean_squared_error(y_variable_std_test, y_variable_std_test_predicted_cv)
print(f"The MSE in the testing data is: {mse_test_cv}")
|
||||
|
||||
|
||||
#####################################################################
# Part 10: Compare the betas from the Ridge and the LASSO regressions
#####################################################################
print("\nPart 10: Comparison of Ridge and LASSO coefficients\n")
# To see to what extent the results of Ridge and LASSO are similar, we
# write the coefficients from the cross-validation tasks (Parts 6 and 9)
# to a csv file.

# The Ridge coefficients are indexed as coef_[0][i], the LASSO coefficients as
# coef_[i]; pick the matching one-dimensional sequences once.
coefficients_Ridge=regression_Ridge_cv.coef_[0]
coefficients_LASSO=regression_Lasso_cv.coef_

# The with-statement closes the file even if writing fails halfway.
with open(directory+"comparison_coefficients_Ridge_LASSO.csv","w",encoding="utf-8") as output_file:
    output_file.write("index;coefficient_Ridge;coefficient_LASSO\n")
    # One line per coefficient; the number of lines follows the number of
    # coefficients instead of a hard-coded 200.
    for i,(coefficient_Ridge,coefficient_LASSO) in enumerate(zip(coefficients_Ridge,coefficients_LASSO)):
        output_file.write(str(i)+';'+str(coefficient_Ridge)+';'+str(coefficient_LASSO)+'\n')

print("Completed!")
|
||||
|
|
@ -0,0 +1,52 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Sat Jul 17 17:09:50 2021
|
||||
|
||||
@author: ahillert
|
||||
"""
|
||||
|
||||
from nltk.tokenize import sent_tokenize
|
||||
|
||||
print("\nExample 1\n")
# Three sentences with many numbers that contain periods and commas.
text_1=("The S&P 500 rose 43.44 points to 4,159.12. The Dow Jones industrial average "
    "added 188.11 points, or 0.6 percent, to 34,084.15. The tech-heavy Nasdaq fared "
    "better than the rest of the market, climbing 236 points, or 1.8 percent, to 13,535.74")

sentence_list_1=sent_tokenize(text_1)

# Print every detected sentence with a running number that starts at 1.
for sentence_number, sentence in enumerate(sentence_list_1, start=1):
    print(f"This is sentence {sentence_number}:\n{sentence}")

# -> good performance

print("\nExample 2\n")
# Same text as in Example 1, but converted to lower case.
text_2=text_1.lower()

sentence_list_2=sent_tokenize(text_2)

for sentence_number, sentence in enumerate(sentence_list_2, start=1):
    print(f"This is sentence {sentence_number}:\n{sentence}")

# -> poor performance
# For the NLTK tokenizer it makes a difference whether text is lower or upper case.
|
||||
|
||||
|
||||
print("\nExample 3\n")
# Text with many abbreviations and initials that end with a period.
text_3=("On Sept. 16, 2020, the U.S. president appointed John D. Smith as head of the F. B. I. "
    "While Jane C. Taylor became the president of the S. E. C. "
    "On Jan. 5, 2020, J. C. Penny filed for bankruptcy. Michael T. Brown - reporting from Washington D.C.")

sentence_list_3=sent_tokenize(text_3)

# Print every detected sentence with a running number that starts at 1.
for sentence_number, sentence in enumerate(sentence_list_3, start=1):
    print(f"This is sentence {sentence_number}:\n{sentence}")

# -> good performance

print("\nExample 4\n")
# Same text as in Example 3, but converted to lower case.
text_4=text_3.lower()

sentence_list_4=sent_tokenize(text_4)

for sentence_number, sentence in enumerate(sentence_list_4, start=1):
    print(f"This is sentence {sentence_number}:\n{sentence}")
|
||||
137
lectures/programming/introductions/NLTK_introduction.py
Normal file
137
lectures/programming/introductions/NLTK_introduction.py
Normal file
|
|
@ -0,0 +1,137 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Tue Jul 11 17:43:45 2017
|
||||
|
||||
@author: Alexander Hillert, Goethe University Frankfurt
|
||||
"""
|
||||
|
||||
|
||||
# import modules
|
||||
# if you need to download the nltk packages 'punkt' and 'stopwords' you can use
|
||||
# the following three commands:
|
||||
#import nltk
|
||||
#nltk.download('punkt')
|
||||
#nltk.download('stopwords')
|
||||
|
||||
|
||||
from nltk.tokenize import word_tokenize, sent_tokenize
|
||||
from nltk.corpus import stopwords
|
||||
from nltk.stem import PorterStemmer
|
||||
import re
|
||||
|
||||
################
# 1. Tokenize
################
# Create a test text to see how well nltk.tokenize performs
test_text="Microsoft Corp. announced they would acquire Yahoo! for $3.4 to prevent Google Inc. \
from taking over Software Ltd. headerquartered in St. Louis. XYZ S.A. is located in the \
U.S. and run by Dr. John P. Smith, who likes short-term risk-based calculations."

# Tokenize sentences
sentence_list=sent_tokenize(test_text)
print("This is the list of sentences:")
print(sentence_list)
# looks good. Only the split after "Yahoo" is incorrect. The tool correctly
# recognizes "Mr.", "Dr.", "Inc.", etc. -> good performance

# Tokenize words
word_list=word_tokenize(test_text)
print("This is the list of words:")
print(word_list)
print(len(word_list))
# --> word_tokenize also includes symbols and numbers as words.

# How to delete the elements that are not real words?
# Keep only the tokens that contain at least one letter.
word_list_1=[word for word in word_list if re.search('[A-Za-z]',word)]
print("This is the edited list of words. There should be only 'real' words:")
print(word_list_1)
print(len(word_list_1))

# Alternative
# Delete every character that is neither a letter nor whitespace before
# tokenizing. The pattern is a raw string (r'...') so that the backslash in \s
# reaches the regex engine unchanged; in a normal string "\s" is an invalid
# escape sequence that newer Python versions warn about.
test_text1=re.sub(r'[^A-Za-z\s\n]','',test_text)
word_list_2=word_tokenize(test_text1)
print("This is the edited list of words. There should be only 'real' words:")
print(word_list_2)
print(len(word_list_2))
|
||||
|
||||
|
||||
################
# 2. Stop Words
################
example_sentence = "This is an example showing off stop word filtering."
# Storing the stop words in a set removes duplicates.
stop_words=set(stopwords.words("english"))
print("This is the list of stop words from NLTK:")
print(stop_words)
# --> the stop words are all lower case
print(len(stop_words))

# Split the example sentence into words. The sentence is converted to lower
# case first because the stop words are all lower case.
word_list_example=word_tokenize(example_sentence.lower())

# Keep only those words that are not stop words.
word_list_filtered=[word for word in word_list_example if word not in stop_words]

print("Example sentence after stop words have been deleted:")
print(word_list_filtered)

# What does the example from above look like?
# Apply the same filter to the lower-case test text.
test_text_filtered=[word for word in word_tokenize(test_text.lower()) if word not in stop_words]

print("Test text after stop words have been deleted:")
print(test_text_filtered)
|
||||
|
||||
|
||||
################
# 3. Stemming
################
# define an abbreviation
ps=PorterStemmer()

example_words_1=["play", "player", "players", "played", "playing"]

for word in example_words_1:
    stemmed_word=ps.stem(word)
    print(stemmed_word)
    # the full syntax without the abbreviation would be:
    stemmed_word_full_syntax=PorterStemmer().stem(word)
    print(stemmed_word_full_syntax)

# adjectives and adverbs
example_words_2=["high", "higher", "highest", "highly", "height"]
for stemmed_word in map(ps.stem, example_words_2):
    print(stemmed_word)
# --> comparative and superlative are not reduced to the stem/regular adjective
# neither are adverbs

# Let's see how the stemmer deals with irregular words.
example_words_3=["good", "better", "best", "well", "God", "Goodness"]
for stemmed_word in map(ps.stem, example_words_3):
    print(stemmed_word)
# --> upper case words are also transformed to lower case.

# Stem the test text from above
# Approach 1: tokenize the text and stem it word by word
test_text_stemmed=[ps.stem(word) for word in word_tokenize(test_text)]

print("Stemming word by word: test text after it has been stemmed:")
print(test_text_stemmed)

# Alternative approach: pass the entire text to the stemmer at once
test_text_stemmed=ps.stem(test_text)
print("Stemming entire document: test text after it has been stemmed:")
print(test_text_stemmed)
# -> does not work

print("End of nltk introduction!")
|
||||
|
|
@ -0,0 +1,19 @@
|
|||
This is the text for the introduction to regular expressions.
|
||||
|
||||
In the first example, we search for the year of birth of current and former CEOs.
|
||||
These are sentences that I made up:
|
||||
Microsoft's former CEO Steve Balmer (born in 1956) graduated from Harvard in 1977.
|
||||
Michael Dell was born in 1965 in Houston and founded Dell Inc in 1984.
|
||||
Walmart is currently run by Doug McMillon, who was born in 1966.
|
||||
|
||||
The following three examples are taken from the Wikipedia pages of the three people.
|
||||
Steven Anthony "Steve" Ballmer (born March 24, 1956) is an American chief executive who is the former chief executive officer of Microsoft from January 2000 to February 2014, and is the current owner of the Los Angeles Clippers. Source: https://en.wikipedia.org/wiki/Steve_Ballmer, June 22, 2017.
|
||||
Michael Saul Dell (born February 23, 1965) is an American business magnate, investor, philanthropist, and author. He is the founder and CEO of Dell Technologies, one of the world’s leading providers of information technology infrastructure solutions. Source: https://en.wikipedia.org/wiki/Michael_Dell, June 22, 2017.
|
||||
Carl Douglas "Doug" McMillon (born October 17, 1966) is an American businessman and is the president and chief executive officer (CEO) of Wal-Mart Stores, Inc. Source: https://en.wikipedia.org/wiki/Doug_McMillon, June 22, 2017.
|
||||
|
||||
Here are some numbers:
|
||||
1,234,567
|
||||
8,901
|
||||
34
|
||||
56.82
|
||||
539,234,353.41
|
||||
1201
lectures/programming/introductions/regression_data_scikit.csv
Normal file
1201
lectures/programming/introductions/regression_data_scikit.csv
Normal file
File diff suppressed because it is too large
Load diff
Loading…
Add table
Add a link
Reference in a new issue