Add programming files
- add the code files provided by the instructor
- the programming/files folder with the data files is NOT included here due to its size
- add a .gitignore file to exclude the data files' folder
This commit is contained in:
parent
65aae9d4f9
commit
a37c87d9c8
38 changed files with 6416 additions and 0 deletions
1 .gitignore vendored Normal file
@ -0,0 +1 @@
lectures/programming/files
9 lectures/programming/README.md Normal file
@ -0,0 +1,9 @@
# Programming Files

This folder holds various programming files provided by the instructor:

- introductions to programming techniques
- problem sets
- solutions for the problem sets

In addition, the instructor provided various data files
that are too big to be stored in this repository.
@ -0,0 +1,270 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
Created on Tue Jul 11 09:19:54 2017
|
||||||
|
|
||||||
|
@author: Alexander Hillert, Goethe University Frankfurt
|
||||||
|
This version: February 22, 2019
|
||||||
|
|
||||||
|
This is an introduction to two data containers: lists and counters.
|
||||||
|
|
||||||
|
Python has several built-in data containers, e.g., sets, dictionaries, and lists.
In addition to these containers, there are further types.
For textual analysis applications, counters are helpful.
|
||||||
|
|
||||||
|
This introduction covers lists in the first part.
|
||||||
|
The second part introduces the basics of counters.
|
||||||
|
"""
|
||||||
|
|
||||||
|
# for counters, you need to import collections
|
||||||
|
import collections
|
||||||
|
import re
|
||||||
|
|
||||||
|
###############################################################################
|
||||||
|
# Introduction on data containers
|
||||||
|
###############################################################################
|
||||||
|
|
||||||
|
#################################
|
||||||
|
# Part 1: lists
|
||||||
|
#################################
|
||||||
|
# Create an empty list
|
||||||
|
empty_list=[]
|
||||||
|
|
||||||
|
# Create non-empty lists
|
||||||
|
string_list=["a", "b", "c"]
|
||||||
|
mixed_list=[1, "ab", -4,"hello"]
|
||||||
|
|
||||||
|
print(mixed_list)
|
||||||
|
|
||||||
|
# Call items of a list
|
||||||
|
print(string_list[0])
|
||||||
|
print(string_list[2])
|
||||||
|
print(string_list[-1])
|
||||||
|
|
||||||
|
# Length of a list
|
||||||
|
length=len(string_list)
|
||||||
|
print("The length of the list is: "+str(length))
|
||||||
|
|
||||||
|
|
||||||
|
# ADD ITEMS TO A LIST
|
||||||
|
# ALTERNATIVE 1: insert -> you can specify the position
|
||||||
|
string_list.insert(1,"d")
|
||||||
|
# you cannot add multiple elements with the insert command
|
||||||
|
# You can try, but it will not work
|
||||||
|
# 1st try
|
||||||
|
string_list.insert(3,"e" "f") # -> the new element is "ef"
|
||||||
|
print(string_list)
|
||||||
|
# 2nd try
|
||||||
|
try:
|
||||||
|
string_list.insert(3,"e", "f")
|
||||||
|
except:
|
||||||
|
print("Wrong syntax. If the command were executed without the try-except "\
|
||||||
|
"you would get the error TypeError: insert() takes exactly 2 arguments (3 given)'")
|
||||||
|
# 3rd try
|
||||||
|
string_list.insert(3, ["e", "f"])
|
||||||
|
# check length
|
||||||
|
print("The length of the list is: "+str(len(string_list))) # -> only 6 and not 7
|
||||||
|
print(string_list[3])
|
||||||
|
# So element 3 of the list is another list
|
||||||
|
# You can call the elements of the sub-list
|
||||||
|
print("First element of sub list: "+string_list[3][0]+" and second element of \
|
||||||
|
sub list: "+string_list[3][1])
|
||||||
|
|
||||||
|
# Reset string_list to keep things easily tractable
|
||||||
|
string_list=["a", "b", "c"]
|
||||||
|
|
||||||
|
# ALTERNATIVE 2: append -> items are added at the end
|
||||||
|
string_list.append("d")
|
||||||
|
|
||||||
|
# Try to add multiple items
|
||||||
|
# 1st try
|
||||||
|
string_list.append("e" "f") # -> the new element is "ef"
|
||||||
|
print(string_list)
|
||||||
|
# 2nd try
|
||||||
|
try:
|
||||||
|
string_list.append("e", "f")
|
||||||
|
except:
|
||||||
|
print("Wrong syntax. If the command were executed without the try-except "\
|
||||||
|
"you would get the error 'TypeError: append() takes exactly one argument (2 given)'")
|
||||||
|
# 3rd try
|
||||||
|
string_list.append(["e", "f"])
|
||||||
|
# check length
|
||||||
|
print("length of list is "+str(len(string_list))) # -> only 6 and not 7
|
||||||
|
print(string_list[len(string_list)-1])
|
||||||
|
# -> element 3 of the list is another list
|
||||||
|
# You can call the elements of the sub-list
|
||||||
|
print("First element of sub list: "+string_list[len(string_list)-1][0]+" and \
|
||||||
|
second element of sub list: "+string_list[len(string_list)-1][1])
|
||||||
|
|
||||||
|
# Reset string_list to keep things easily tractable
|
||||||
|
string_list=["a", "b", "c"]
|
||||||
|
|
||||||
|
# ALTERNATIVE 3: extend -> items are added at the end
|
||||||
|
string_list.extend("d")
|
||||||
|
|
||||||
|
# Try to add multiple items
|
||||||
|
# 1st try
|
||||||
|
string_list.extend("e" "f") # -> Two elements are created -> works!!!
|
||||||
|
print(string_list)
|
||||||
|
# 2nd try
|
||||||
|
try:
|
||||||
|
string_list.extend("e", "f")
|
||||||
|
except:
|
||||||
|
print("Wrong syntax. If the command were executed without the try-except "\
|
||||||
|
"you would get the error 'TypeError: extend() takes exactly one argument (2 given)'")
|
||||||
|
# 3rd try
|
||||||
|
string_list.extend(["e", "f"])
|
||||||
|
print(string_list) # -> also works!!!
|
||||||
|
# check length
|
||||||
|
print("length of list is "+str(len(string_list))) # -> it is 8 and should be 8
|
||||||
|
|
||||||
|
|
||||||
|
# DELETE ITEMS FROM A LIST
|
||||||
|
string_list.remove("a")
|
||||||
|
print("List after deletion of 'a' "+str(string_list))
|
||||||
|
# What happens if an element occurs multiple times
|
||||||
|
string_list.remove("e")
|
||||||
|
print("List after further deletion of 'e' "+str(string_list))
|
||||||
|
# --> only the first occurrence of "e" is deleted
|
||||||
|
|
||||||
|
|
||||||
|
# FURTHER OPERATIONS WITH LISTS
|
||||||
|
# Accessing parts of a list
|
||||||
|
# Remember the first element is [0]! And the upper bound of the range is not
|
||||||
|
# included, i.e. [0:3] means [0], [1] and [2].
|
||||||
|
print("Sublist from beginning to third element: "+str(string_list[0:3]))
|
||||||
|
print("Sublist from beginning to third element: "+str(string_list[:3]))
|
||||||
|
print("Sublist from second(!) to third element: "+str(string_list[1:3]))
|
||||||
|
print("Sublist from fourth(!) to fifth element: "+str(string_list[3:5]))
|
||||||
|
print("Sublist from fifth(!) to the end: "+str(string_list[4:]))
|
||||||
|
|
||||||
|
# Search in lists
|
||||||
|
position=string_list.index("b")
|
||||||
|
print("Position of 'b' is: "+str(position))
|
||||||
|
# Searching for an element that is not part of the list
|
||||||
|
try:
|
||||||
|
string_list.index("a")
|
||||||
|
except:
|
||||||
|
print("Error message. If the command were executed without the try-except "\
|
||||||
|
"you would get the error 'ValueError: 'a' is not in list'")
|
||||||
|
if "c" in string_list:
|
||||||
|
print("'c' is at position: "+str(string_list.index("c")))
|
||||||
|
|
||||||
|
# Sort list
|
||||||
|
string_list.sort()
|
||||||
|
print('Sorted list: '+str(string_list))
|
||||||
|
string_list.sort(reverse=True)
|
||||||
|
print('Reversely sorted list: '+str(string_list))
|
||||||
|
|
||||||
|
# What happens when sorting mixed (i.e. integers and strings) lists?
|
||||||
|
try:
|
||||||
|
mixed_list.sort()
|
||||||
|
except:
|
||||||
|
print("Error message. If the command were executed without the try-except "\
|
||||||
|
"you would get the error 'TypeError: unorderable types: str() < int()'")
|
||||||
|
|
||||||
|
|
||||||
|
#################################
|
||||||
|
# Part 2: counters
|
||||||
|
#################################
|
||||||
|
'''
|
||||||
|
A Counter is a dictionary subclass for counting hashable objects.
|
||||||
|
It is an unordered collection where elements are stored as dictionary keys and
|
||||||
|
their counts are stored as dictionary values.
|
||||||
|
'''
|
||||||
|
# Creating a counter
|
||||||
|
counter_obj=collections.Counter(["a", "b", "c", "d", "a", "b", "a"])
|
||||||
|
print('The counter object is: '+str(counter_obj))
|
||||||
|
# The previous command is equivalent to
|
||||||
|
counter_obj=collections.Counter(a=3, b=2, c=1, d=1)
|
||||||
|
print('The counter object (2nd command) is: '+str(counter_obj))
|
||||||
|
|
||||||
|
# Add objects to a counter
|
||||||
|
counter_obj.update(["e", "f", "e"])
|
||||||
|
print('The updated counter object is: '+str(counter_obj))
|
||||||
|
# Alternative command
|
||||||
|
counter_obj["g"]=4
|
||||||
|
print('The updated updated counter object is: '+str(counter_obj))
|
||||||
|
|
||||||
|
# Length of the counter
|
||||||
|
length=len(counter_obj)
|
||||||
|
print('The length of the counter is: '+str(length))
|
||||||
|
|
||||||
|
# Loop over the elements of the counter and their frequency
|
||||||
|
i=1
|
||||||
|
for element in counter_obj:
|
||||||
|
print("Element "+str(i)+" of the counter: "+str(element))
|
||||||
|
print("Frequency of Element "+str(i)+" of the counter: "+str(counter_obj[element]))
|
||||||
|
i=i+1
|
||||||
|
|
||||||
|
# .elements() provides an iterator of all individual elements of the counter
|
||||||
|
counter_elements=list(counter_obj.elements())
|
||||||
|
print('Elements of the counter: '+str(counter_elements))
|
||||||
|
|
||||||
|
# APPLY COUNTERS TO TEXTS
|
||||||
|
sentence1="This is the first sentence."
|
||||||
|
sentence2="This is the second sentence, which is longer."
|
||||||
|
|
||||||
|
# Split sentences in words
|
||||||
|
sentence1_words=re.split("\W{1,}", sentence1)
|
||||||
|
print("The last element is: "+str(sentence1_words[len(sentence1_words)-1]))
|
||||||
|
# The last element is empty -> delete it.
|
||||||
|
sentence1_words.remove("")
|
||||||
|
print("The last element is: "+str(sentence1_words[len(sentence1_words)-1]))
|
||||||
|
# -> now okay
|
||||||
|
sentence2_words=re.split("\W{1,}", sentence2)
|
||||||
|
print("The last element is: "+str(sentence2_words[len(sentence2_words)-1]))
|
||||||
|
# The last element is empty -> delete it.
|
||||||
|
sentence2_words.remove("")
|
||||||
|
print("The last element is: "+str(sentence2_words[len(sentence2_words)-1]))
|
||||||
|
# -> now okay
|
||||||
|
|
||||||
|
# Counter words
|
||||||
|
sentence1_counter=collections.Counter(sentence1_words)
|
||||||
|
sentence2_counter=collections.Counter(sentence2_words)
|
||||||
|
|
||||||
|
print(sentence1_counter)
|
||||||
|
print(sentence2_counter)
|
||||||
|
|
||||||
|
# OPERATIONS WITH COUNTERS
|
||||||
|
# add counters
|
||||||
|
add_counters=sentence1_counter+sentence2_counter
|
||||||
|
print("You can add counters: "+str(add_counters))
|
||||||
|
|
||||||
|
# subtract counters
|
||||||
|
subtract_counters=sentence1_counter-sentence2_counter
|
||||||
|
print("You can subtract counters: "+str(subtract_counters))
|
||||||
|
# Each time a new Counter is produced through an operation, any items with zero
# or negative counts are discarded. --> only the word "first" appears in subtract_counters
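# A small additional sketch (not part of the original script) that makes the
# dropping of zero and negative counts explicit; the name demo_counter is
# chosen for illustration only.
demo_counter=collections.Counter(a=2, b=1)-collections.Counter(a=2, b=3)
print(demo_counter) # -> empty Counter(): 'a' has count 0 and 'b' would be negative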
|
||||||
|
|
||||||
|
# Intersection of counters
|
||||||
|
intersection_counters=sentence1_counter & sentence2_counter
|
||||||
|
print("You can determine the intersection of counters: "+str(intersection_counters))
|
||||||
|
# -> takes the minimum of occurrences; again elements with zero frequency
|
||||||
|
# are not included.
|
||||||
|
|
||||||
|
# Union of counters
|
||||||
|
union_counters=sentence1_counter | sentence2_counter
|
||||||
|
print("You can determine the union of counters: "+str(union_counters))
|
||||||
|
# -> takes the maximum of occurrences
|
||||||
|
|
||||||
|
# MOST FREQUENT WORDS
|
||||||
|
# Determine the three most frequent words in the add_counters set.
|
||||||
|
top_3_words=add_counters.most_common(3)
|
||||||
|
print("The top 3 words are: "+str(top_3_words))
|
||||||
|
|
||||||
|
# Identify the two most frequent words using the top 4 words in the add_counters sample.
|
||||||
|
top_4_words=add_counters.most_common(4)
|
||||||
|
# The first [] refers to the rank, i.e., whether it is the most frequent,
# second most frequent, etc. word.
|
||||||
|
# The second[] refers either to the word itself [0] or to the frequency of the word [1].
|
||||||
|
# the most frequent word
|
||||||
|
top_word=top_4_words[0][0]
|
||||||
|
top_word_count=top_4_words[0][1]
|
||||||
|
print("The top word is '"+str(top_word)+"', which appears "+str(top_word_count)+" times")
|
||||||
|
# the second most frequent word
|
||||||
|
top_2_word=top_4_words[1][0]
|
||||||
|
top_2_word_count=top_4_words[1][1]
|
||||||
|
print("The second most frequent word is '"+str(top_2_word)+"', which appears "+str(top_2_word_count)+" times")
|
||||||
|
|
||||||
|
|
||||||
|
print("Completed")
|
|
@ -0,0 +1,447 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
INTRODUCTION TO REGULAR EXPRESSIONS
|
||||||
|
|
||||||
|
@author: Alexander Hillert, Goethe University Frankfurt
|
||||||
|
This version: June 3, 2019
|
||||||
|
|
||||||
|
What are regular expressions?
|
||||||
|
|
||||||
|
Regular expressions allow you to search for general patterns in texts. The
|
||||||
|
standard string commands like .count("search_term") and .replace("old_word","new_word")
|
||||||
|
can only count and replace one specific word, respectively. They cannot search
|
||||||
|
for general patterns like all words that consist of three or more letters.
|
||||||
|
Assume that you want to identify all numbers in a text or that you search for
|
||||||
|
the year of birth in bios of corporate executives. In the examples, you need a
|
||||||
|
search tool that can process broad patterns --> you need regular expressions.
|
||||||
|
Consider the second example, i.e. you would like to automatically identify
|
||||||
|
people's year of birth from their bios. You know that the number must have four
|
||||||
|
digits and that the first two digits must equal 19. Of course, you could
|
||||||
|
hardcode all possible years (1900, 1901, ..., 1999), but this is unnecessarily
|
||||||
|
complicated and slows down the program. Therefore, it is better to learn
|
||||||
|
how to use regex.
|
||||||
|
|
||||||
|
Useful online resources:
|
||||||
|
1. https://regex101.com/
|
||||||
|
On this webpage, you can enter a text and a regular expression.
|
||||||
|
The webpage highlights the matches and provides explanations for
|
||||||
|
every part of the regex pattern.
|
||||||
|
Caution: click on "Python" in the left menu (the default language is php)!
|
||||||
|
|
||||||
|
2. https://docs.python.org/3/library/re.html
|
||||||
|
The official documentation of regular expressions in Python 3.
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
# To be able to use regular expressions you need to import the re package first.
|
||||||
|
import re
|
||||||
|
|
||||||
|
# Select the directory where you saved the accompanying txt-file.
|
||||||
|
directory="C:/Lehre/Textual Analysis/Programming/Files/"
|
||||||
|
|
||||||
|
|
||||||
|
# In this introduction, we use the accompanying txt-file "Text_Introduction_Regular_Expressions.txt"
|
||||||
|
# open the file
|
||||||
|
text_file=open(directory+'Text_Introduction_Regular_Expressions.txt','r',encoding='cp1252')
|
||||||
|
# read its content
|
||||||
|
text=text_file.read()
|
||||||
|
|
||||||
|
# Let's start with the example from the beginning and search for people's years of birth.
|
||||||
|
# The standard search command for regular expressions is re.search. It searches
|
||||||
|
# for the FIRST match of the expression in the text.
|
||||||
|
# First try
|
||||||
|
match=re.search("19[0-9]{2}",text)
|
||||||
|
# This command searches for four digits of which the first is a 1, the second a 9,
|
||||||
|
# and then there are two further digits which can be any digits.
|
||||||
|
# [0-9] refers to any digit. Equivalently, you can write \d which also refers
|
||||||
|
# to any digit.
# The {2} specifies that there must be exactly two digits.
|
||||||
|
|
||||||
|
print(match)
|
||||||
|
# match contains information on the match:
|
||||||
|
# span is the position in text where the match starts and ends; here 226 and 230
|
||||||
|
# furthermore, the matched text is shown. Here, the first match is 1956.
|
||||||
|
# You can use the positions to print the text before the match, after the match,
|
||||||
|
# and, of course, of the matched text.
|
||||||
|
start=match.start()
|
||||||
|
end=match.end()
|
||||||
|
print("From beginning of the document to the match: \n"+text[:start]+"\n\n")
|
||||||
|
print("The match itself: \n"+text[start:end]+"\n\n")
|
||||||
|
print("From end of match to end of document: \n"+text[end:]+"\n\n")
|
||||||
|
|
||||||
|
# To access the match, you can also use the command .group(0):
|
||||||
|
print("Alternative way to access the matched text: \n"+match.group(0)+"\n\n")
|
||||||
|
|
||||||
|
# CAUTION
|
||||||
|
# If no match is found, re.search returns None, i.e., the variable match is None.
# Example: search for a ten-digit number that starts with 19
|
||||||
|
match=re.search("19[0-9]{8}",text)
|
||||||
|
# The command start=match.start() then returns the following error:
|
||||||
|
# "AttributeError: 'NoneType' object has no attribute 'start'"
|
||||||
|
# SOLUTION
|
||||||
|
match=re.search("19[0-9]{8}",text)
|
||||||
|
if match:
|
||||||
|
# match found; calling .start() is now conditional on the existence of the match
|
||||||
|
start=match.start()
|
||||||
|
print("Match found. Starting at position "+str(start))
|
||||||
|
else:
|
||||||
|
# no match found
|
||||||
|
print("No match found")
|
||||||
|
|
||||||
|
'''
|
||||||
|
Information on Syntax, Special Characters in Regular Expression
|
||||||
|
|
||||||
|
Character Meaning
|
||||||
|
[] Indicates a set of characters
|
||||||
|
\[ Matches the actual [
|
||||||
|
\] Matches the actual ]
|
||||||
|
^ negation; the symbols listed afterwards are not allowed in the match
|
||||||
|
E.g., [^0-9] will not match any digit but will match any other symbol.
|
||||||
|
\d Any digit, i.e. 0, 1, 2, ..., 9. Equivalent to [0-9]
|
||||||
|
\n Linefeed/newline, the start of a new line.
|
||||||
|
\s Any whitespace, i.e. a tab, a space.
|
||||||
|
CAUTION: \s matches also the newline (\n). This property of \s
|
||||||
|
can lead to unintended matches.
|
||||||
|
RECOMMENDATION: to match whitespaces only use [ \t], i.e. a space
|
||||||
|
and a tab (\t).
|
||||||
|
\S Any non-whitespace symbol.
|
||||||
|
. Any character (digit, letter, symbol [!,?,%,etc.], spaces) but
|
||||||
|
NOT the newline, \n.
|
||||||
|
\. Matches the actual dot.
|
||||||
|
\w Matches word characters, i.e. [0-9a-zA-Z_]
|
||||||
|
The underscore (_) is defined to be a word character.
|
||||||
|
\W Matches any non-word characters, i.e. [^0-9a-zA-Z_]
|
||||||
|
| Or condition (for an example see line 272)
|
||||||
|
() Like in math: parentheses indicate which characters of an expression
|
||||||
|
belong together. (For an example see line 272.)
|
||||||
|
\( Matches the actual (
|
||||||
|
\) Matches the actual )
|
||||||
|
|
||||||
|
(?i) Performs the regex case-insensitive. Must be put at the beginning
|
||||||
|
of the regex. E.g. re.search("(?i)TeSt",text) will match
|
||||||
|
TEST, test, Test, etc.
|
||||||
|
re.IGNORECASE Performs the regex case-insensitive. Must be put at the end of
|
||||||
|
the regex as an option. E.g. re.search("test",text,re.IGNORECASE)
|
||||||
|
'''
|
||||||
|
# Examples of character sets
|
||||||
|
# 1. [0-9]: numbers
|
||||||
|
match=re.search("[0-9]","ABC abc 123")
|
||||||
|
print(match)
|
||||||
|
#2. [a-z]: any lower case letter
|
||||||
|
match=re.search("[a-z]","ABC abc 123")
|
||||||
|
print(match)
|
||||||
|
#3. [A-Z]: any upper case letter
|
||||||
|
match=re.search("[A-Z]","ABC abc 123")
|
||||||
|
print(match)
|
||||||
|
#4. [cde]: lower case letters c, d, and e.
|
||||||
|
match=re.search("[cde]","ABC abc 123")
|
||||||
|
print(match)
|
||||||
|
#5. [^A-Zab]: all symbols except capital letters and a and b.
|
||||||
|
match=re.search("[^A-Zab]","ABC abc 123")
|
||||||
|
print(match)
|
||||||
|
# you don't see any character because the match is the first white space before abc
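# A small additional sketch (not part of the original script) for the shorthand
# classes \d, \w, and \s from the table above; the string class_demo is
# chosen for illustration only.
class_demo="ABC abc 123"
print(re.findall("\d", class_demo))    # digits -> ['1', '2', '3']
print(re.findall("\w", class_demo))    # word characters, i.e. letters, digits, and _
print(re.findall("\s", class_demo))    # whitespaces -> [' ', ' ']
print(re.findall("[ \t]", class_demo)) # recommended alternative to \s -> [' ', ' ']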
|
||||||
|
|
||||||
|
|
||||||
|
'''
|
||||||
|
Quantifiers for regular expression:
|
||||||
|
n and m refer to non-negative integers (0, 1, 2, ...), where m>n
|
||||||
|
Quantifier Meaning
|
||||||
|
{n} The preceding pattern must be found EXACTLY n times.
|
||||||
|
{n,} The preceding pattern must be found AT LEAST n times.
|
||||||
|
{,n} The preceding pattern must be found AT MOST n times.
|
||||||
|
{n,m} The preceding pattern must be found AT LEAST n but AT MOST m times.
|
||||||
|
{n,}? The ? tells the regex not to be "greedy" (see lines 211 for details)
|
||||||
|
|
||||||
|
There are alternative notations for commonly used quantifiers:
|
||||||
|
* is equivalent to {0,}, i.e. 0 or more repetitions of the preceding pattern.
|
||||||
|
+ is equivalent to {1,}, i.e. 1 or more repetitions of the preceding pattern.
|
||||||
|
? is equivalent to {0,1}, i.e. 0 or 1 repetition of the preceding pattern.
|
||||||
|
'''
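# A small additional sketch (not part of the original script) illustrating the
# quantifiers on a toy string; the name quantifier_demo is chosen for
# illustration only.
quantifier_demo="aa aaa 12 1234"
print(re.findall("a{2}", quantifier_demo))       # exactly two a's -> ['aa', 'aa']
print(re.findall("a{2,}", quantifier_demo))      # at least two a's -> ['aa', 'aaa']
print(re.findall("[0-9]+", quantifier_demo))     # one or more digits -> ['12', '1234']
print(re.findall("[0-9]{2,3}", quantifier_demo)) # two to three digits -> ['12', '123']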
|
||||||
|
|
||||||
|
# re.search() returns only the first match: How to get all matches?
|
||||||
|
# Alternative 1: use a loop.
|
||||||
|
text1=text
|
||||||
|
i=1
|
||||||
|
match=re.search("19[0-9]{2}",text1)
|
||||||
|
# Repeat the following commands until no more matches are found.
|
||||||
|
while match:
|
||||||
|
print("This is match number "+str(i)+": "+match.group(0))
|
||||||
|
# Check whether there are further matches after the end of the previous match
|
||||||
|
end=match.end()
|
||||||
|
text1=text1[end:]
|
||||||
|
match=re.search("19[0-9]{2}",text1)
|
||||||
|
i=i+1
|
||||||
|
|
||||||
|
# Alternative 2: use re.findall
|
||||||
|
# The syntax is identical to re.search
|
||||||
|
list_of_matches=re.findall("19[0-9]{2}",text)
|
||||||
|
print(list_of_matches)
|
||||||
|
# the individual matches can be called by list_of_matches[i], where i ranges
|
||||||
|
# from zero to the number of matches minus one.
|
||||||
|
# Remember: the first element of a list has the position 0
|
||||||
|
for i in range(0,len(list_of_matches)):
|
||||||
|
print("This is match number "+str(i+1)+" using the re.findall command: "+list_of_matches[i])
|
||||||
|
|
||||||
|
|
||||||
|
# When you read the text you will observe that there are only six years of birth
|
||||||
|
# in the text and not eight -> there are two mismatches -> adjust filter to
|
||||||
|
# get only the years of birth and not all years.
|
||||||
|
text1=text
|
||||||
|
i=1
|
||||||
|
# Check whether the word born appears before the year. The distance between
|
||||||
|
# born and the year must be smaller or equal 15 (plus the two white spaces)
|
||||||
|
match=re.search("born .{,15} 19[0-9]{2}",text1)
|
||||||
|
while match:
|
||||||
|
print("This is match number "+str(i)+": "+match.group(0))
|
||||||
|
# Extract the year
|
||||||
|
match1=re.search("19[0-9]{2}",match.group(0))
|
||||||
|
print("The year of match number "+str(i)+" is: "+match1.group(0))
|
||||||
|
# Check whether there are further matches after the end of the previous match
|
||||||
|
end=match.end()
|
||||||
|
text1=text1[end:]
|
||||||
|
match=re.search("born .{,15} 19[0-9]{2}",text1)
|
||||||
|
i=i+1
|
||||||
|
|
||||||
|
|
||||||
|
# The quantifiers introduced above are "greedy". For example, if a pattern matches overlapping
|
||||||
|
# text parts of different length, the regex will return the longest match.
|
||||||
|
# Example: search for the first sentence in a text. You know that sentences
|
||||||
|
# end with period in this example.
|
||||||
|
text2="This is the first senctence. This is the second sentence. And so on"
|
||||||
|
# Search for a positive number of occurances of characters followed by a period.
|
||||||
|
# Remeber that the dot is \. in regex. The . will match any character.
|
||||||
|
match=re.search(".{1,}\.",text2)
|
||||||
|
print(match.group(0))
|
||||||
|
# -> the regex returns the first and second sentence.
|
||||||
|
# To get the shortest string that fulfils the regex, put a ? after the quantifier.
# This makes the quantifier "non-greedy", and only the shortest match will be returned.
|
||||||
|
match=re.search(".{1,}?\.",text2)
|
||||||
|
print(match.group(0))
|
||||||
|
|
||||||
|
# You will often have situations where there are multiple versions of the same
|
||||||
|
# pattern. How can you include all of them in one regular expression?
|
||||||
|
# Example 1: search for the word "losses" in the following sentence:
|
||||||
|
text3="X Corp's soda division returned significant losses in the last quarter. Losses will be reduced this quarter."
|
||||||
|
# the first letter of "loss" can be upper or lower case
|
||||||
|
print("Example 1: Loss and loss")
|
||||||
|
text4=text3
|
||||||
|
i=1
|
||||||
|
# A set of characters [] is matched if at least one of the components of the
|
||||||
|
# set is found in the text. This works only for a single letter/number/symbol
|
||||||
|
# but not for sequences of multiple letters/numbers/symbols.
|
||||||
|
match=re.search("[Ll]oss",text3)
|
||||||
|
while match:
|
||||||
|
end=match.end()
|
||||||
|
print("This is match number "+str(i)+": "+match.group(0))
|
||||||
|
# Check whether there are further matches after the end of the previous match
|
||||||
|
text4=text4[end:]
|
||||||
|
match=re.search("[Ll]oss",text4)
|
||||||
|
i=i+1
|
||||||
|
|
||||||
|
# Alternatively
|
||||||
|
list_of_matches=re.findall("[Ll]oss",text3)
|
||||||
|
print("Alternative using re.findall: "+str(list_of_matches))
|
||||||
|
|
||||||
|
# In this example, you could also simply perform a case-insensitive match.
|
||||||
|
print("Case-INsensitive matching using re.IGNORECASE")
|
||||||
|
text4=text3
|
||||||
|
i=1
|
||||||
|
match=re.search("loss",text3,re.IGNORECASE)
|
||||||
|
while match:
|
||||||
|
end=match.end()
|
||||||
|
print("This is match number "+str(i)+": "+match.group(0))
|
||||||
|
# Check whether there are further matches after the end of the previous match
|
||||||
|
text4=text4[end:]
|
||||||
|
match=re.search("loss",text4,re.IGNORECASE)
|
||||||
|
i=i+1
|
||||||
|
# Or equivalently
|
||||||
|
print("Case-INsensitive matching using (?i)")
|
||||||
|
text4=text3
|
||||||
|
i=1
|
||||||
|
match=re.search("(?i)loss",text3)
|
||||||
|
while match:
|
||||||
|
end=match.end()
|
||||||
|
print("This is match number "+str(i)+": "+match.group(0))
|
||||||
|
# Check whether there are further matches after the end of the previous match
|
||||||
|
text4=text4[end:]
|
||||||
|
match=re.search("(?i)loss",text4)
|
||||||
|
i=i+1
|
||||||
|
|
||||||
|
|
||||||
|
# Example 2: search for the expressions "profits declined" and "profits decreased"
|
||||||
|
# in the following sentence:
|
||||||
|
text3="X Corp's profits declined in 2010, while Y Inc.'s profits decreased the year before."
|
||||||
|
# Here, [] no longer works because we need to match terms consisting of several
|
||||||
|
# characters and [] matches only one character. -> use the OR-operator |
|
||||||
|
print("Example 2: profits declied and profits decreased - First try")
|
||||||
|
text4=text3
|
||||||
|
i=1
|
||||||
|
match=re.search("profits declined|decreased",text3)
|
||||||
|
while match:
|
||||||
|
print("This is match number "+str(i)+": "+match.group(0))
|
||||||
|
# Check whether there are further matches after the end of the previous match
|
||||||
|
end=match.end()
|
||||||
|
text4=text4[end:]
|
||||||
|
match=re.search("profits declined|decreased",text4)
|
||||||
|
i=i+1
|
||||||
|
# Problem: regex interprets the entire set of characters before the | as one
|
||||||
|
# alternative.
|
||||||
|
# Solution: use parentheses to define the boundaries.
|
||||||
|
|
||||||
|
print("Example 2: profits declied and profits decreased - Second try")
|
||||||
|
text4=text3
|
||||||
|
i=1
|
||||||
|
match=re.search("profits (declined|decreased)",text3)
|
||||||
|
while match:
|
||||||
|
print("This is match number "+str(i)+": "+match.group(0))
|
||||||
|
# Check whether there are further matches after the end of the previous match
|
||||||
|
end=match.end()
|
||||||
|
text4=text4[end:]
|
||||||
|
match=re.search("profits (declined|decreased)",text4)
|
||||||
|
i=i+1
|
||||||
|
|
||||||
|
# Alternative: does re.findall work?
|
||||||
|
list_of_matches=re.findall("profits (declined|decreased)",text3)
|
||||||
|
print(list_of_matches)
|
||||||
|
# -> No! Because there is a major difference between re.search and re.findall
|
||||||
|
# in the way they treat parentheses ().
|
||||||
|
# re.search follows the general regular expression syntax that is also used in
|
||||||
|
# other programming languages.
|
||||||
|
# To use re.findall you have to write down the full text before and after the |.
|
||||||
|
list_of_matches=re.findall("profits declined|profits decreased",text3)
|
||||||
|
print(list_of_matches)
|
||||||
|
|
||||||
|
|
||||||
|
# More information on the difference between re.search and re.findall
|
||||||
|
# Example 3: let's search for the numbers in the second part of the txt file
|
||||||
|
# and compare what the two commands do.
|
||||||
|
# Get the second part
|
||||||
|
match=re.search("Here are some numbers:",text)
|
||||||
|
text4=text[match.end():]
|
||||||
|
print(text4)
|
||||||
|
match=re.search("[0-9]{1,}([0-9]{3}|,){0,}\.{0,1}[0-9]{0,}",text4)
|
||||||
|
# What are the individual parts of this pattern?
|
||||||
|
# [0-9]{1,} There has to be at least one digit.
|
||||||
|
# ([0-9]{3}|,){0,} The first digit can be followed by combinations of three
|
||||||
|
# digits and commas (as thousand separator).
|
||||||
|
# \.{0,1} There can be zero or one period as decimal separator.
|
||||||
|
# [0-9]{0,} There can be multiple decimal places.
|
||||||
|
|
||||||
|
i=1
|
||||||
|
while match:
|
||||||
|
print("This is match number "+str(i)+": "+match.group(0))
|
||||||
|
# Check whether there are further matches after the end of the previous match
|
||||||
|
end=match.end()
|
||||||
|
text4=text4[end:]
|
||||||
|
match=re.search("[0-9]{1,}([0-9]{3}|,){0,}\.{0,1}[0-9]{0,}",text4)
|
||||||
|
i=i+1
|
||||||
|
|
||||||
|
# Can we obtain the same result by using re.findall?
|
||||||
|
match=re.search("Here are some numbers:",text)
|
||||||
|
text4=text[match.end():]
|
||||||
|
list_of_matches=re.findall("[0-9]{1,}([0-9]{3}|,){0,}\.{0,1}[0-9]{0,}",text4)
|
||||||
|
print(list_of_matches)
|
||||||
|
# Does not work!
|
||||||
|
# One has to put "?:" in the part that captures the repetition of the thousands.
|
||||||
|
# This tells re.findall to return the full match and not subpatterns.
|
||||||
|
list_of_matches=re.findall("[0-9]{1,}(?:[0-9]{3}|,){0,}\.{0,1}[0-9]{0,}",text4)
|
||||||
|
print(list_of_matches)
|
||||||
|
|
||||||
|
# TAKE AWAY: The output of re.findall does not always match that of re.search
|
||||||
|
# Be careful when using re.findall!!!
|
||||||
|
|
||||||
|
|
||||||
|
# How to delete or substitute parts of texts?
|
||||||
|
# Alternative 1: identify the beginning and end of the matched text part and
|
||||||
|
# remove it from the overall text.
|
||||||
|
# Example delete all numbers in the text
|
||||||
|
text4=text
|
||||||
|
print("Original Text:\n"+text4)
|
||||||
|
match=re.search("[0-9]{1,}(,[0-9]{3}){0,}(\.[0-9]{1,}){0,1}",text4)
|
||||||
|
while match:
|
||||||
|
# Remove the match
|
||||||
|
text4=text4[:match.start()]+text4[match.end():]
|
||||||
|
# Check whether there are further matches in the remaining text
|
||||||
|
match=re.search("[0-9]{1,}(,[0-9]{3}){0,}(\.[0-9]{1,}){0,1}",text4)
|
||||||
|
print("Text without numbers using re.search:\n"+text4)
|
||||||
|
|
||||||
|
# Alternative 2: use re.sub (sub -> substitute)
|
||||||
|
# syntax: new_text=re.sub(pattern, replacement, old_text)
|
||||||
|
# replacement is some string. Regular expressions are only allowed in the pattern
|
||||||
|
# but not in the replacement.
|
||||||
|
text4=text
|
||||||
|
text4=re.sub("[0-9]{1,}(,[0-9]{3}){0,}(\.[0-9]{1,}){0,1}","",text4)
|
||||||
|
|
||||||
|
print("Text without numbers using re.sub:\n"+text4)
|
||||||
|
# re.sub is the more efficient way.
|
||||||
|
# Furthermore, re.sub can not only delete text but also replace text.
|
||||||
|
# Example
|
||||||
|
text4=text
|
||||||
|
text4=re.sub("[0-9]{1,}(,[0-9]{3}){0,}(\.[0-9]{1,}){0,1}","NUMBER",text4)
|
||||||
|
print("Text where numbers are replaced by the word 'NUMBER':\n"+text4)
|
||||||
|
|
||||||
|
|
||||||
|
# Make sure you get the right match --> importance of word boundaries.
|
||||||
|
# When you search for a word it can happen that the word is part of a different
|
||||||
|
# longer word. For example, searching for "high" would also match "highlight".
|
||||||
|
# To avoid such mismatches you can either include word boundaries in the search
|
||||||
|
# (Alternative 1) or split the text first by word boundaries into single words
|
||||||
|
# and perform standard string search operations afterwards (Alternative 2).
|
||||||
|
# Alternative 2 does not return the individual matches but tells you for example
|
||||||
|
# the number of matches
|
||||||
|
# Example: search for the word "is"
|
||||||
|
# Alternative 1:
|
||||||
|
match=re.search("is",text)
|
||||||
|
print("Searching without word boundaries yields: '"+match.group(0)+\
|
||||||
|
"' But the surrounding text is: '"+text[match.start()-1:match.end()+1]+"'")
|
||||||
|
match=re.search("\Wis\W",text)
|
||||||
|
print("Searching with word boundaries yields: '"+match.group(0)+\
|
||||||
|
"' and the surrounding text is: '"+text[match.start()-1:match.end()+1]+"'")
|
||||||
|
# You see that the preceding and subsequent word boundaries are also matched
|
||||||
|
# and saved as the matched term. However, often you want the match to include only
|
||||||
|
# the actual word without its boundaries.
|
||||||
|
# Solution: use so called "look ahead" and "look back" conditions.
|
||||||
|
|
||||||
|
'''
|
||||||
|
Look ahead and look behind/back conditions
|
||||||
|
|
||||||
|
Regex requires that the parts of the pattern that are classified as look ahead
|
||||||
|
or look back/behind are present in the text but does not include them in the match.
|
||||||
|
|
||||||
|
Syntax:
|
||||||
|
positive look ahead: (?=) Example: X(?=\W) requires that there is a word
|
||||||
|
boundary after X
|
||||||
|
negative look ahead: (?!) Example: X(?!\W) requires that there must NOT
|
||||||
|
be a word boundary after X.
|
||||||
|
positive look back: (?<=) Example: (?<=\W)X requires that there is a word
|
||||||
|
boundary before X
|
||||||
|
negative look back: (?<!) Example: (?<!\W)X requires that there must NOT
|
||||||
|
be a word boundary before X.
|
||||||
|
'''
|
||||||
|
match=re.search("(?<=\W)is(?=\W)",text)
|
||||||
|
print("Searching with word boundaries as look ahead and look back condition yields: '" #
|
||||||
|
+match.group(0)+"' and the surrounding text is: '"+text[match.start()-1:match.end()+1]+"'")
|
||||||
|
|
||||||
|
# Does it also work with re.findall?
|
||||||
|
list_of_matches=re.findall("\Wis\W",text)
|
||||||
|
print("Word boundaries using re.findall: "+str(list_of_matches))
|
||||||
|
list_of_matches=re.findall("(?<=\W)is(?=\W)",text)
|
||||||
|
print("Word boundaries as look ahead and look back condition using re.findall: "+str(list_of_matches))
|
||||||
|
print("In total there are "+str(len(list_of_matches))+" matches.")
|
||||||
|
# --> Yes, the approach also works with re.findall.
|
||||||
|
|
||||||
|
# Alternative 2:
|
||||||
|
# Use re.split(), which is similar to split() but more powerful.
|
||||||
|
text_split=re.split("\W",text)
|
||||||
|
print(text_split)
|
||||||
|
# Problem: there are elements in the list that are not words, e.g. ''. These
|
||||||
|
# elements are created because there can be a series of non-word characters (\W),
|
||||||
|
# e.g. ' (' in 'Balmer (born'.
|
||||||
|
# Solution: treat a series of non-word characters \W as a single split pattern
|
||||||
|
text_split=re.split("\W{1,}",text)
|
||||||
|
print(text_split)
|
||||||
|
# Now, you do not need to include word boundaries and can use standard string
|
||||||
|
# operations.
|
||||||
|
number_matches=text_split.count("is")
|
||||||
|
print("Using standard string operations, we get "+str(number_matches)+" matches.")
|
||||||
|
# -> same result.
|
|
@ -0,0 +1,485 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
Created on Mon Mar 21 09:38:32 2022
|
||||||
|
|
||||||
|
@author: Alexander Hillert, Goethe University Frankfurt
|
||||||
|
"""
|
||||||
|
|
||||||
|
'''
|
||||||
|
This script introduces you to linear models using the sklearn package.
|
||||||
|
Besides sklearn, we will use pandas to work with data sets as well as
|
||||||
|
numpy to perform computations.
|
||||||
|
|
||||||
|
The introduction consists of 10 parts:
|
||||||
|
1. linear regressions using a toy data set
|
||||||
|
2. linear regressions using a "real" data set
|
||||||
|
3. linear regressions using standardized variables
|
||||||
|
4. Ridge regression basics
|
||||||
|
5. Ridge regression with training, tuning, and testing sample
|
||||||
|
6. Ridge regression with cross-validation
|
||||||
|
7. LASSO regression basics
|
||||||
|
8. LASSO regression with training, tuning, and testing sample
|
||||||
|
9. LASSO regression with cross-validation
|
||||||
|
10. Compare the results from Ridge and LASSO
|
||||||
|
|
||||||
|
'''
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
# For OLS regressions
|
||||||
|
from sklearn.linear_model import LinearRegression
|
||||||
|
# for Ridge regressions
|
||||||
|
from sklearn.linear_model import Ridge
|
||||||
|
# for computing mean squared errors
|
||||||
|
from sklearn.metrics import mean_squared_error
|
||||||
|
# for plotting the MSEs for different levels of Lambda
|
||||||
|
import matplotlib.pyplot as plot
|
||||||
|
# for Ridge regressions with cross-validation
|
||||||
|
from sklearn.linear_model import RidgeCV
|
||||||
|
# for LASSO regressions
|
||||||
|
from sklearn.linear_model import Lasso
|
||||||
|
# for LASSO regressions with cross-validation
|
||||||
|
from sklearn.linear_model import LassoCV
|
||||||
|
|
||||||
|
# adjust the directory to your folder!!!
|
||||||
|
directory="C:/Lehre/Machine Learning/Data/"
|
||||||
|
|
||||||
|
############################################################
|
||||||
|
# Part 1. Basics: linear regressions in Python using sklearn
|
||||||
|
############################################################
|
||||||
|
print("\nPart 1: Run an OLS regression on a sandbox data set\n")
|
||||||
|
|
||||||
|
# create a random number from a normal distribution with mean 0 and standard deviation 1.
|
||||||
|
random_number=np.random.normal(0, 1)
|
||||||
|
print("A random number is: "+str(random_number))
|
||||||
|
|
||||||
|
# you can also create a vector or matrix of random variables
|
||||||
|
# the parameter size=(# of rows, # of columns) specifies the number of rows and columns
|
||||||
|
# For example, a (10,1) vector
|
||||||
|
random_number_vector=np.random.normal(0, 1, size=(10,1))
|
||||||
|
print("The vector of random numbers is:")
|
||||||
|
print(random_number_vector)
|
||||||
|
|
||||||
|
# create the independent variable x as a vector of random numbers
|
||||||
|
x_vector=np.random.normal(0, 1, size=(10,1))
|
||||||
|
print("The vector of the independent variable x is:")
|
||||||
|
print(x_vector)
|
||||||
|
|
||||||
|
# create the dependent variable y as
|
||||||
|
# y = 2x + epsilon, where epsilon is the random error term from above
|
||||||
|
y_vector=np.dot(x_vector,2) + random_number_vector
|
||||||
|
print("The vector of the dependent variable y is:")
|
||||||
|
print(y_vector)
|
||||||
|
|
||||||
|
# perform a standard OLS regression with intercept.
|
||||||
|
# The command takes x (independent variable(s)) first and then y (dependent variable)
|
||||||
|
# Note that the default is that the intercept is included. So, strictly speaking,
|
||||||
|
# the (fit_intercept=True) option is not needed.
|
||||||
|
regression_1=LinearRegression(fit_intercept=True).fit(x_vector, y_vector)
|
||||||
|
|
||||||
|
# display the intercept and the beta coefficient on x
|
||||||
|
print("The intercept is: "+str(regression_1.intercept_))
|
||||||
|
# to get it as a scalar/number not an array, use
|
||||||
|
regression_1.intercept_[0]
|
||||||
|
|
||||||
|
print("The coefficient on x is: "+str(regression_1.coef_))
|
||||||
|
# to get it as a scalar/number not an array, use
|
||||||
|
regression_1.coef_[0][0]
|
||||||
|
|
||||||
|
# R2 of the regression
|
||||||
|
print("The R2 is: "+str(regression_1.score(x_vector, y_vector)))
|
||||||
|
|
||||||
|
|
||||||
|
###############################################################
|
||||||
|
# Part 2: linear regression using a "real" data set
|
||||||
|
###############################################################
|
||||||
|
print("\nPart 2: Run an OLS regression with a real data set\n")
|
||||||
|
|
||||||
|
# import the data for this problem
|
||||||
|
# The data set consists of 200 independent variables (x1 to x200) and
|
||||||
|
# a dependent variable (y).
|
||||||
|
# There are 1,200 observations in total. In the later parts, we will
|
||||||
|
# use the first 1,000 observations for training and the last 200 for testing.
|
||||||
|
# The data are simulated using the following process:
|
||||||
|
# y = 0.5*x1 + 0.5*x2 + ... + 0.5*x100 + random error (mean 0, std. dev. 4)
|
||||||
|
# The x101 to x200 are not directly related to y but are correlated with
|
||||||
|
# the x1 to x100. More specifically,
|
||||||
|
# x101 = 0.7*x1 + random error (mean 0, std. dev. 1)
|
||||||
|
# x102 = 0.7*x2 + random error (mean 0, std. dev. 1)
|
||||||
|
# x200 = 0.7*x100 + random error (mean 0, std. dev. 1)
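# A minimal sketch (not part of the original script) of how data with the
# structure described above could be simulated, assuming x1 to x100 are
# standard normal; the names x_sim_base, x_sim_corr, and y_sim are chosen for
# illustration only and are not used below.
x_sim_base=np.random.normal(0, 1, size=(1200, 100))                    # x1 to x100
x_sim_corr=0.7*x_sim_base+np.random.normal(0, 1, size=(1200, 100))     # x101 to x200
y_sim=0.5*np.sum(x_sim_base, axis=1)+np.random.normal(0, 4, size=1200) # dependent variable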
|
||||||
|
data_frame=pd.read_csv(directory+"regression_data_scikit.csv",sep=";")
|
||||||
|
|
||||||
|
# to get an idea about the data, display the first five data points
print(data_frame.head(5))
|
||||||
|
|
||||||
|
# split the data frame into the independent and dependent variables
|
||||||
|
# the independent variables(x1 to x200) are columns 1 to 200
|
||||||
|
x_variables=data_frame.values[:,:-1]
|
||||||
|
# the dependent variable (y) is column 201
|
||||||
|
y_variable=data_frame.values[:,-1:]
|
||||||
|
|
||||||
|
# run a standard OLS regression
|
||||||
|
regression_OLS=LinearRegression(fit_intercept=True).fit(x_variables, y_variable)
|
||||||
|
# You can double check the results by rerunning the regression in Stata or R.
|
||||||
|
|
||||||
|
# display the intercept and the beta coefficients on x1 and x51
|
||||||
|
print("The intercept is: "+str(regression_OLS.intercept_[0]))
|
||||||
|
print("The coefficient on x_1 is: "+str(regression_OLS.coef_[0][0]))
|
||||||
|
print("The coefficient on x_51 is: "+str(regression_OLS.coef_[0][50]))
|
||||||
|
|
||||||
|
# R2 of the regression
|
||||||
|
print("The R2 is: "+str(regression_OLS.score(x_variables, y_variable)))
|
||||||
|
|
||||||
|
|
||||||
|
##################################################################
|
||||||
|
# Part 3: standardize the data to have mean zero and unit variance
|
||||||
|
# and rerun the regression
|
||||||
|
##################################################################
|
||||||
|
print("\nPart 3a.: Standardize variables\n")
|
||||||
|
|
||||||
|
# standardize x and y to have mean zero and unit variance
|
||||||
|
# axis=0 (axis=1) means that the computation is executed column (row) wise
|
||||||
|
x_variables_mean=np.mean(x_variables,axis=0)
|
||||||
|
# ddof=1 means that we use n-1 to compute the standard deviation
|
||||||
|
x_variables_standard_deviation=np.std(x_variables, axis=0, ddof=1)
|
||||||
|
x_variables_standardized=(x_variables-x_variables_mean)/x_variables_standard_deviation
|
||||||
|
|
||||||
|
# do the same exercise for y
|
||||||
|
y_variable_mean=np.mean(y_variable,axis=0)
|
||||||
|
y_variable_standard_deviation=np.std(y_variable, axis=0, ddof=1)
|
||||||
|
y_variable_standardized=(y_variable-y_variable_mean)/y_variable_standard_deviation
|
||||||
|
|
||||||
|
# rerun the regression using standardized data
|
||||||
|
regression_OLS_standardized=LinearRegression(fit_intercept=True).fit(x_variables_standardized, y_variable_standardized)
|
||||||
|
# results are identical to a regression in Stata with beta coefficients.
|
||||||
|
|
||||||
|
# display the intercept and the beta coefficients on x_1 and x_51
|
||||||
|
print("The intercept is: "+str(regression_OLS_standardized.intercept_[0]))
|
||||||
|
print("The coefficient on x_1 is: "+str(regression_OLS_standardized.coef_[0][0]))
|
||||||
|
print("The coefficient on x_51 is: "+str(regression_OLS_standardized.coef_[0][50]))
|
||||||
|
|
||||||
|
# R2 of the regression
|
||||||
|
print("The R2 is: "+str(regression_OLS_standardized.score(x_variables_standardized, y_variable_standardized)))
|
||||||
|
# The R2 is identical to the one from Part 2 -> good!
|
||||||
|
|
||||||
|
#######################################################################################
|
||||||
|
# CAUTION: be careful using the "normalize=True" option in the LinearRegression module!
|
||||||
|
#######################################################################################
|
||||||
|
print("\nPart 3b.: Regression with 'normalization'\n")
|
||||||
|
# The normalization works on the rows, not the columns!
|
||||||
|
# By default, L2 normalization is applied to each observation so that the
|
||||||
|
# values in a row (!) have a unit norm. Unit norm with L2 means that if each
|
||||||
|
# element were squared and summed, the total would equal 1.
|
||||||
|
regression_OLS_normalized=LinearRegression(fit_intercept=True,normalize=True).fit(x_variables, y_variable)
|
||||||
|
|
||||||
|
# display the intercept and the beta coefficient on x_1 and x_51
|
||||||
|
print("The intercept is: "+str(regression_OLS_normalized.intercept_[0]))
|
||||||
|
print("The coefficient on x_1 is: "+str(regression_OLS_normalized.coef_[0][0]))
|
||||||
|
print("The coefficient on x_51 is: "+str(regression_OLS_normalized.coef_[0][50]))
|
||||||
|
# The coefficients are different from the ones above highlighting that the
|
||||||
|
# "normalize=True" option does not do the same as "normal" standardizing
|
||||||
|
# R2 of the regression
|
||||||
|
print("The R2 is: "+str(regression_OLS_normalized.score(x_variables, y_variable)))
|
||||||
|
|
||||||
|
|
||||||
|
#######################################################################
|
||||||
|
# Part 4: Ridge regression on the full sample (no training and testing)
|
||||||
|
# This part is to learn the syntax.
|
||||||
|
# We are using the standardized variables to have the same penalty
|
||||||
|
# for a given effect of x on y.
|
||||||
|
# Remember: if the independent variables are measured on very different
|
||||||
|
# scales, the beta coefficients have different sizes (e.g., market cap in
|
||||||
|
# thousand USD vs. past stock returns as a decimal number) and, thus,
|
||||||
|
# the penalty would be applied inconsistently.
|
||||||
|
#######################################################################
|
||||||
|
print("\nPart 4: Ridge regression - learning the syntax\n")
|
||||||
|
|
||||||
|
# the parameter alpha corresponds to the penalty parameter Lambda from
|
||||||
|
# the notation that is typically used.
|
||||||
|
# the default is that the intercept is included, so you do not need the
|
||||||
|
# "intercept=True" parameter. But it is good to keep in mind what
|
||||||
|
# specification you are using.
|
||||||
|
regression_Ridge=Ridge(alpha=10,fit_intercept=True).fit(x_variables_standardized, y_variable_standardized)
|
||||||
|
|
||||||
|
# display the intercept and the beta coefficient on x1 and x51
|
||||||
|
print("The intercept is: "+str(regression_Ridge.intercept_[0]))
|
||||||
|
print("The coefficient on x_1 is: "+str(regression_Ridge.coef_[0][0]))
|
||||||
|
print("The coefficient on x_51 is: "+str(regression_Ridge.coef_[0][50]))
|
||||||
|
|
||||||
|
# R2 of the regression
|
||||||
|
print("The R2 is: "+str(regression_Ridge.score(x_variables_standardized, y_variable_standardized)))
|
||||||
|
|
||||||
|
# How to compute the mean squared error (MSE)?
|
||||||
|
# 1. get the predicted values
|
||||||
|
y_variable_standardized_predicted=regression_Ridge.predict(x_variables_standardized)
|
||||||
|
# 2. determine the MSE
|
||||||
|
print("The MSE of the prediction is: "+str(mean_squared_error(y_variable_standardized, y_variable_standardized_predicted)))
|
||||||
|
|
||||||
|
|
||||||
|
#######################################################################
|
||||||
|
# Part 5: Ridge regression using a training, tuning, and testing sample
|
||||||
|
#######################################################################
|
||||||
|
print("\nPart 5: Ridge regression - Application with training, tuning, and testing data\n")
|
||||||
|
|
||||||
|
# Create a training, tuning, and testing sample
|
||||||
|
# we split the data into a training, a tuning, and a testing set
|
||||||
|
# training data are the first 800 rows
|
||||||
|
# In the brackets, the first range (before the comma) indicates the rows, the second the columns.
|
||||||
|
x_variables_std_train=x_variables_standardized[:800,:]
|
||||||
|
y_variable_std_train=y_variable_standardized[:800,:]
|
||||||
|
# the tuning data are row 801 to 1000 -> 200 observations
|
||||||
|
x_variables_std_tune=x_variables_standardized[800:1000,:]
|
||||||
|
y_variable_std_tune=y_variable_standardized[800:1000,:]
|
||||||
|
# testing data are the last 200 rows
|
||||||
|
x_variables_std_test=x_variables_standardized[1000:,:]
|
||||||
|
y_variable_std_test=y_variable_standardized[1000:,:]
|
||||||
|
|
||||||
|
|
||||||
|
##########################
|
||||||
|
# find the optimal Lambda
|
||||||
|
##########################
|
||||||
|
# we store the MSE of the training/tuning data for each Lambda
|
||||||
|
mse_train_list=[]
|
||||||
|
mse_tune_list=[]
|
||||||
|
# Again, Lambda and Alpha refer to the same thing.
|
||||||
|
alpha_list=[]
|
||||||
|
|
||||||
|
# we iterate from 0.1 to 100 increasing Lambda=Alpha by 0.1 in each step.
|
||||||
|
alpha=0.1
|
||||||
|
while alpha<100:
|
||||||
|
# train the model
|
||||||
|
regression_Ridge_train=Ridge(alpha=alpha,fit_intercept=True).fit(x_variables_std_train, y_variable_std_train)
|
||||||
|
# add the alpha to the list of alphas
|
||||||
|
alpha_list.append(alpha)
|
||||||
|
# predict y in the training sample
|
||||||
|
y_variable_std_train_predicted=regression_Ridge_train.predict(x_variables_std_train)
|
||||||
|
# predict y in the tuning sample
|
||||||
|
y_variable_std_tune_predicted=regression_Ridge_train.predict(x_variables_std_tune)
|
||||||
|
# compute the MSE in both samples
|
||||||
|
mse_train=mean_squared_error(y_variable_std_train, y_variable_std_train_predicted)
|
||||||
|
mse_tune=mean_squared_error(y_variable_std_tune, y_variable_std_tune_predicted)
|
||||||
|
# append the MSEs to the two lists
|
||||||
|
mse_train_list.append(mse_train)
|
||||||
|
mse_tune_list.append(mse_tune)
|
||||||
|
# continue with the next alpha
|
||||||
|
alpha=alpha+0.1
|
||||||
|
|
||||||
|
########################################
|
||||||
|
# plot the MSEs for the different alphas
|
||||||
|
########################################
|
||||||
|
# MSE in the training sample
|
||||||
|
plot.scatter(alpha_list, mse_train_list)
|
||||||
|
plot.show()
|
||||||
|
# higher Lambda associated with higher MSE
|
||||||
|
|
||||||
|
# MSE in the tuning sample
|
||||||
|
plot.scatter(alpha_list, mse_tune_list)
|
||||||
|
plot.show()
|
||||||
|
# there is an optimal alpha with the lowest MSE
|
||||||
|
|
||||||
|
######################################
|
||||||
|
# determine the optimal Lambda
|
||||||
|
######################################
|
||||||
|
# what is the smallest MSE?
|
||||||
|
minimum=min(mse_tune_list)
|
||||||
|
print("The smallest MSE is "+ str(minimum))
|
||||||
|
# get the position of the minimum MSE in our list
|
||||||
|
index_min_MSE=mse_tune_list.index(minimum)
|
||||||
|
# choose the corresponding alpha
|
||||||
|
alpha_optimal=alpha_list[index_min_MSE]
|
||||||
|
print("The optimal alpha is "+str(alpha_optimal))
|
||||||
|
|
||||||
|
#############################################################
|
||||||
|
# What is the out-of-sample performance of the optimal model?
|
||||||
|
#############################################################
|
||||||
|
# take the full training data set (1000 observations, i.e., training + tuning set)
|
||||||
|
x_variables_std_train_total=np.concatenate((x_variables_std_train, x_variables_std_tune), axis=0)
|
||||||
|
y_variable_std_train_total=np.concatenate((y_variable_std_train, y_variable_std_tune), axis=0)
|
||||||
|
# train the model with the optimal Lambda on the training and tuning data
|
||||||
|
regression_Ridge_optimal=Ridge(alpha=alpha_optimal,fit_intercept=True).fit(x_variables_std_train_total, y_variable_std_train_total)
|
||||||
|
|
||||||
|
# Mean squared error
|
||||||
|
# predict y in the full training sample
|
||||||
|
y_variable_std_train_total_predicted=regression_Ridge_optimal.predict(x_variables_std_train_total)
|
||||||
|
# predict y in the testing sample
|
||||||
|
# Remember: we have not used the testing data yet. Firewall principle!!!
|
||||||
|
y_variable_std_test_predicted=regression_Ridge_optimal.predict(x_variables_std_test)
|
||||||
|
|
||||||
|
print("The MSE in the full training data is: "+str(mean_squared_error(y_variable_std_train_total, y_variable_std_train_total_predicted)))
|
||||||
|
print("The MSE in the testing data is: "+str(mean_squared_error(y_variable_std_test, y_variable_std_test_predicted)))
|
||||||
|
|
||||||
|
|
||||||
|
#############################################################
|
||||||
|
# Part 6: Ridge regression with k-fold cross-validation
|
||||||
|
# Implement the cross validation using a package
|
||||||
|
#############################################################
|
||||||
|
print("\nPart 6. Ridge regression - Using cross-validation\n")
|
||||||
|
|
||||||
|
# the default for cv is the leave-one-out cross-validation
|
||||||
|
# here we apply five-fold cross-validation
|
||||||
|
regression_Ridge_cv=RidgeCV(alphas=alpha_list, fit_intercept=True,cv=5).fit(x_variables_std_train_total,y_variable_std_train_total)
|
||||||
|
|
||||||
|
# get the optimal lambda
|
||||||
|
alpha_optimal_cv=regression_Ridge_cv.alpha_
|
||||||
|
print("The optimal alpha is "+str(alpha_optimal_cv))
|
||||||
|
|
||||||
|
# Mean squared error using the cross-validated model
|
||||||
|
# predict y in the full training sample
|
||||||
|
y_variable_std_train_total_predicted_cv=regression_Ridge_cv.predict(x_variables_std_train_total)
|
||||||
|
# predict y in the testing sample
|
||||||
|
y_variable_std_test_predicted_cv=regression_Ridge_cv.predict(x_variables_std_test)
|
||||||
|
|
||||||
|
print("The MSE in the full training data is: "+str(mean_squared_error(y_variable_std_train_total, y_variable_std_train_total_predicted_cv)))
|
||||||
|
print("The MSE in the testing data is: "+str(mean_squared_error(y_variable_std_test, y_variable_std_test_predicted_cv)))
|
||||||
|
|
||||||
|
|
||||||
|
###########################################
|
||||||
|
# Part 7: LASSO regression
|
||||||
|
# on the full sample -> to learn the syntax
|
||||||
|
###########################################
|
||||||
|
|
||||||
|
print("\nPart 7: LASSO regression - learning the syntax\n")
|
||||||
|
# the parameter alpha corresponds to the penalty parameter Lambda from
|
||||||
|
# the notation that is typically used.
|
||||||
|
# the default is that the intercept is included, so you do not need the
|
||||||
|
# "intercept=True" parameter. But it is good to keep in mind what
|
||||||
|
# specification you are using.
|
||||||
|
regression_Lasso=Lasso(alpha=0.1,fit_intercept=True).fit(x_variables_standardized, y_variable_standardized)
|
||||||
|
|
||||||
|
# display the intercept and the beta coefficient on x1 and x51
|
||||||
|
print("The intercept is: "+str(regression_Lasso.intercept_[0]))
|
||||||
|
print("The coefficient on x_1 is: "+str(regression_Lasso.coef_[0]))
|
||||||
|
print("The coefficient on x_51 is: "+str(regression_Lasso.coef_[50]))
|
||||||
|
|
||||||
|
# R2 of the regression
|
||||||
|
print("The R2 is: "+str(regression_Lasso.score(x_variables_standardized, y_variable_standardized)))
|
||||||
|
|
||||||
|
# How to compute the mean squared error (MSE)?
|
||||||
|
# 1. get the predicted values
|
||||||
|
y_variable_standardized_predicted=regression_Lasso.predict(x_variables_standardized)
|
||||||
|
# 2. determine the MSE
|
||||||
|
print("The MSE of the prediction is: "+str(mean_squared_error(y_variable_standardized, y_variable_standardized_predicted)))
|
||||||
|
|
||||||
|
|
||||||
|
####################################################
|
||||||
|
# Part 8: Create a training, tune and testing sample
|
||||||
|
####################################################
|
||||||
|
print("\nPart 8: LASSO regression - Application with training, tuning, and testing data\n")
|
||||||
|
# we use the same training, tuning, and testing data as in part 5.
|
||||||
|
# -> no need to redefine the data sets.
|
||||||
|
|
||||||
|
#################################
|
||||||
|
# find the optimal Lambda
|
||||||
|
#################################
|
||||||
|
# we store the MSE of the training/tuning data for each Lambda
|
||||||
|
mse_train_list=[]
|
||||||
|
mse_tune_list=[]
|
||||||
|
# Again, Lambda and Alpha refer to the same thing.
|
||||||
|
alpha_list=[]
|
||||||
|
|
||||||
|
# we iterate from 0.0001 to 0.25 increasing alpha by 0.0001 in each step.
|
||||||
|
alpha=0.0001
|
||||||
|
while alpha<0.25:
|
||||||
|
# train the model
|
||||||
|
regression_Lasso_train=Lasso(alpha=alpha,fit_intercept=True).fit(x_variables_std_train, y_variable_std_train)
|
||||||
|
# add the alpha to the list of alphas
|
||||||
|
alpha_list.append(alpha)
|
||||||
|
# predict y in the training sample
|
||||||
|
y_variable_std_train_predicted=regression_Lasso_train.predict(x_variables_std_train)
|
||||||
|
# predict y in the tuning sample
|
||||||
|
y_variable_std_tune_predicted=regression_Lasso_train.predict(x_variables_std_tune)
|
||||||
|
# compute the MSE in both samples
|
||||||
|
mse_train=mean_squared_error(y_variable_std_train, y_variable_std_train_predicted)
|
||||||
|
mse_tune=mean_squared_error(y_variable_std_tune, y_variable_std_tune_predicted)
|
||||||
|
# append the MSEs to the two lists
|
||||||
|
mse_train_list.append(mse_train)
|
||||||
|
mse_tune_list.append(mse_tune)
|
||||||
|
# continue with the next alpha
|
||||||
|
alpha=alpha+0.0001
|
||||||
|
|
||||||
|
########################################
|
||||||
|
# plot the MSEs for the different alphas
|
||||||
|
########################################
|
||||||
|
|
||||||
|
# MSE in the training sample
|
||||||
|
plot.scatter(alpha_list, mse_train_list)
|
||||||
|
plot.show()
|
||||||
|
# higher Lambda associated with higher MSE
|
||||||
|
|
||||||
|
# MSE in the tuning sample
|
||||||
|
plot.scatter(alpha_list, mse_tune_list)
|
||||||
|
plot.show()
|
||||||
|
# there is an optimal alpha with the lowest MSE
|
||||||
|
|
||||||
|
|
||||||
|
######################################
|
||||||
|
# determine the optimal Lambda
|
||||||
|
######################################
|
||||||
|
# what is the smallest MSE?
|
||||||
|
minimum=min(mse_tune_list)
|
||||||
|
print("The smallest MSE is "+ str(minimum))
|
||||||
|
# get the position of the minimum MSE
|
||||||
|
index_min_MSE=mse_tune_list.index(minimum)
|
||||||
|
alpha_optimal=alpha_list[index_min_MSE]
|
||||||
|
|
||||||
|
print("The optimal alpha is "+str(alpha_optimal))
|
||||||
|
|
||||||
|
#############################################################
|
||||||
|
# What is the out-of-sample performance of the optimal model?
|
||||||
|
#############################################################
|
||||||
|
# take the full training data set (1000 observations; training + tuning)
|
||||||
|
# use the same variables as in Part 5.
|
||||||
|
|
||||||
|
# train the model with the optimal Lambda on the training and tuning data
|
||||||
|
regression_Lasso_optimal=Lasso(alpha=alpha_optimal,fit_intercept=True).fit(x_variables_std_train_total, y_variable_std_train_total)
|
||||||
|
|
||||||
|
# Mean squared error
|
||||||
|
# predict y in the full training sample
|
||||||
|
y_variable_std_train_total_predicted=regression_Lasso_optimal.predict(x_variables_std_train_total)
|
||||||
|
# predict y in the testing sample
|
||||||
|
y_variable_std_test_predicted=regression_Lasso_optimal.predict(x_variables_std_test)
|
||||||
|
|
||||||
|
print("The MSE in the full training data is: "+str(mean_squared_error(y_variable_std_train_total, y_variable_std_train_total_predicted)))
|
||||||
|
print("The MSE in the testing data is: "+str(mean_squared_error(y_variable_std_test, y_variable_std_test_predicted)))
|
||||||
|
|
||||||
|
|
||||||
|
#############################################################
|
||||||
|
# Part 9: Implement the cross validation using a package
|
||||||
|
#############################################################
|
||||||
|
print("\nPart 9: LASSO regression - Using cross-validation\n")
|
||||||
|
|
||||||
|
# the default for cv in LassoCV is the 5-fold cross-validation
|
||||||
|
regression_Lasso_cv=LassoCV(alphas=alpha_list, fit_intercept=True,cv=5).fit(x_variables_std_train_total,y_variable_std_train_total)
|
||||||
|
|
||||||
|
# get the optimal lambda
|
||||||
|
alpha_optimal_cv=regression_Lasso_cv.alpha_
|
||||||
|
print("The optimal alpha is "+str(alpha_optimal_cv))
|
||||||
|
|
||||||
|
# Mean squared error using the cross-validated model
|
||||||
|
# predict y in the full training sample
|
||||||
|
y_variable_std_train_total_predicted_cv=regression_Lasso_cv.predict(x_variables_std_train_total)
|
||||||
|
# predict y in the testing sample
|
||||||
|
y_variable_std_test_predicted_cv=regression_Lasso_cv.predict(x_variables_std_test)
|
||||||
|
|
||||||
|
print("The MSE in the full training data is: "+str(mean_squared_error(y_variable_std_train_total, y_variable_std_train_total_predicted_cv)))
|
||||||
|
print("The MSE in the testing data is: "+str(mean_squared_error(y_variable_std_test, y_variable_std_test_predicted_cv)))
|
||||||
|
|
||||||
|
|
||||||
|
#####################################################################
|
||||||
|
# Part 10: Compare the betas from the Ridge and the LASSO regressions
|
||||||
|
#####################################################################
|
||||||
|
print("\nPart 10: Comparison of Ridge and LASSO coefficients\n")
|
||||||
|
# To see to what extent the results of Ridge and LASSO are similar, we
|
||||||
|
# write the coefficients from the cross-validation tasks (Parts 6 and 9)
|
||||||
|
# to a csv file.
|
||||||
|
|
||||||
|
output_file=open(directory+"comparison_coefficients_Ridge_LASSO.csv","w",encoding="utf-8")
|
||||||
|
output_file.write("index;coefficient_Ridge;coefficient_LASSO\n")
|
||||||
|
|
||||||
|
# get the list of coefficients
|
||||||
|
for i in range (0,200):
|
||||||
|
output_file.write(str(i)+';'+str(regression_Ridge_cv.coef_[0][i])+';'+str(regression_Lasso_cv.coef_[i])+'\n')
|
||||||
|
|
||||||
|
output_file.close()
|
||||||
|
|
||||||
|
print("Completed!")
|
|
@ -0,0 +1,52 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
Created on Sat Jul 17 17:09:50 2021
|
||||||
|
|
||||||
|
@author: ahillert
|
||||||
|
"""
|
||||||
|
|
||||||
|
from nltk.tokenize import sent_tokenize
|
||||||
|
|
||||||
|
print("\nExample 1\n")
|
||||||
|
text_1="The S&P 500 rose 43.44 points to 4,159.12. The Dow Jones industrial average " \
|
||||||
|
+"added 188.11 points, or 0.6 percent, to 34,084.15. The tech-heavy Nasdaq fared " \
|
||||||
|
+"better than the rest of the market, climbing 236 points, or 1.8 percent, to 13,535.74"
|
||||||
|
|
||||||
|
sentence_list_1=sent_tokenize(text_1)
|
||||||
|
|
||||||
|
for i in range(0,len(sentence_list_1)):
|
||||||
|
print("This is sentence "+str(i+1)+":\n"+sentence_list_1[i])
|
||||||
|
|
||||||
|
# -> good performance
|
||||||
|
|
||||||
|
print("\nExample 2\n")
|
||||||
|
text_2=text_1.lower()
|
||||||
|
|
||||||
|
sentence_list_2=sent_tokenize(text_2)
|
||||||
|
|
||||||
|
for i in range(0,len(sentence_list_2)):
|
||||||
|
print("This is sentence "+str(i+1)+":\n"+sentence_list_2[i])
|
||||||
|
|
||||||
|
# -> poor performance
|
||||||
|
# For the NLTK tokenizer it makes a difference whether text is lower or upper case.
|
||||||
|
|
||||||
|
|
||||||
|
print("\nExample 3\n")
|
||||||
|
text_3="On Sept. 16, 2020, the U.S. president appointed John D. Smith as head of the F. B. I. " \
|
||||||
|
+"While Jane C. Taylor became the president of the S. E. C. " \
|
||||||
|
+"On Jan. 5, 2020, J. C. Penny filed for bankruptcy. Michael T. Brown - reporting from Washington D.C."
|
||||||
|
|
||||||
|
sentence_list_3=sent_tokenize(text_3)
|
||||||
|
|
||||||
|
for i in range(0,len(sentence_list_3)):
|
||||||
|
print("This is sentence "+str(i+1)+":\n"+sentence_list_3[i])
|
||||||
|
|
||||||
|
# -> good performance
|
||||||
|
|
||||||
|
print("\nExample 4\n")
|
||||||
|
text_4=text_3.lower()
|
||||||
|
|
||||||
|
sentence_list_4=sent_tokenize(text_4)
|
||||||
|
|
||||||
|
for i in range(0,len(sentence_list_4)):
|
||||||
|
print("This is sentence "+str(i+1)+":\n"+sentence_list_4[i])
|
137
lectures/programming/introductions/NLTK_introduction.py
Normal file
|
@ -0,0 +1,137 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
Created on Tue Jul 11 17:43:45 2017
|
||||||
|
|
||||||
|
@author: Alexander Hillert, Goethe University Frankfurt
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
# import modules
|
||||||
|
# if you need to download the nltk packages 'punkt' and 'stopwords' you can use
|
||||||
|
# the following three commands:
|
||||||
|
#import nltk
|
||||||
|
#nltk.download('punkt')
|
||||||
|
#nltk.download('stopwords')
|
||||||
|
|
||||||
|
|
||||||
|
from nltk.tokenize import word_tokenize, sent_tokenize
|
||||||
|
from nltk.corpus import stopwords
|
||||||
|
from nltk.stem import PorterStemmer
|
||||||
|
import re
|
||||||
|
|
||||||
|
################
|
||||||
|
# 1. Tokenize
|
||||||
|
################
|
||||||
|
# Create a test text to see how well nltk.tokenize performs
|
||||||
|
test_text="Microsoft Corp. announced they would acquire Yahoo! for $3.4 to prevent Google Inc. \
|
||||||
|
from taking over Software Ltd. headquartered in St. Louis. XYZ S.A. is located in the \
|
||||||
|
U.S. and run by Dr. John P. Smith, who likes short-term risk-based calculations."
|
||||||
|
|
||||||
|
# Tokenize sentences
|
||||||
|
sentence_list=sent_tokenize(test_text)
|
||||||
|
print("This is the list of sentences:")
|
||||||
|
print(sentence_list)
|
||||||
|
# looks good. Only the split after "Yahoo" is incorrect. The tool correctly
|
||||||
|
# recognizes "Mr.", "Dr.", "Inc.", etc. -> good performance
|
||||||
|
|
||||||
|
# Tokenize words
|
||||||
|
word_list=word_tokenize(test_text)
|
||||||
|
print("This is the list of words:")
|
||||||
|
print(word_list)
|
||||||
|
print(len(word_list))
|
||||||
|
# --> word_tokenize also includes symbols and numbers as words.
|
||||||
|
|
||||||
|
# How to delete the elements that are not real words?
|
||||||
|
word_list_1=[]
|
||||||
|
for word in word_list:
|
||||||
|
if re.search('[A-Za-z]',word):
|
||||||
|
word_list_1.append(word)
|
||||||
|
print("This is the edited list of words. There should be only 'real' words:")
|
||||||
|
print(word_list_1)
|
||||||
|
print(len(word_list_1))
|
||||||
|
|
||||||
|
# Alternative
|
||||||
|
test_text1=re.sub('[^A-Za-z\s\n]','',test_text)
|
||||||
|
word_list_2=word_tokenize(test_text1)
|
||||||
|
print("This is the edited list of words. There should be only 'real' words:")
|
||||||
|
print(word_list_2)
|
||||||
|
print(len(word_list_2))
|
||||||
|
|
||||||
|
|
||||||
|
################
|
||||||
|
# 2. Stop Words
|
||||||
|
################
|
||||||
|
example_sentence = "This is an example showing off stop word filtering."
|
||||||
|
stop_words=set(stopwords.words("english"))
|
||||||
|
print("This is the list of stop words from NLTK:")
|
||||||
|
print(stop_words)
|
||||||
|
# --> the stop words are all lower case
|
||||||
|
print(len(stop_words))
|
||||||
|
|
||||||
|
# Split example sentence into words
|
||||||
|
word_list_example=word_tokenize(example_sentence.lower())
|
||||||
|
# Create list for filtered words
|
||||||
|
word_list_filtered=[]
|
||||||
|
|
||||||
|
# filter out stop words
|
||||||
|
for word in word_list_example:
|
||||||
|
if word not in stop_words:
|
||||||
|
word_list_filtered.append(word)
|
||||||
|
|
||||||
|
print("Example sentence after stop words have been deleted:")
|
||||||
|
print(word_list_filtered)
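# Equivalent list comprehension (added sketch, not part of the original file):
#word_list_filtered=[word for word in word_list_example if word not in stop_words]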
|
||||||
|
|
||||||
|
# How does the example from above look like?
|
||||||
|
test_text_filtered=[]
|
||||||
|
|
||||||
|
# filter out stop words
|
||||||
|
for word in word_tokenize(test_text.lower()):
|
||||||
|
if word not in stop_words:
|
||||||
|
test_text_filtered.append(word)
|
||||||
|
|
||||||
|
print("Test text after stop words have been deleted:")
|
||||||
|
print(test_text_filtered)
|
||||||
|
|
||||||
|
|
||||||
|
################
|
||||||
|
# 3. Stemming
|
||||||
|
################
|
||||||
|
# define an abbreviation
|
||||||
|
ps=PorterStemmer()
|
||||||
|
|
||||||
|
example_words_1=["play", "player", "players", "played", "playing"]
|
||||||
|
|
||||||
|
for word in example_words_1:
|
||||||
|
print(ps.stem(word))
|
||||||
|
# the full syntax without the abbreviation would be:
|
||||||
|
print(PorterStemmer().stem(word))
|
||||||
|
|
||||||
|
# adjectives and adverbs
|
||||||
|
example_words_2=["high", "higher", "highest", "highly", "height"]
|
||||||
|
for word in example_words_2:
|
||||||
|
print(ps.stem(word))
|
||||||
|
# --> comparative and superlative are not reduced to the stem/regular adjective
|
||||||
|
# neither are adverbs
|
||||||
|
|
||||||
|
# Let's see how the stemmer deals with irregular words.
|
||||||
|
example_words_3=["good", "better", "best", "well", "God", "Goodness"]
|
||||||
|
for word in example_words_3:
|
||||||
|
print(ps.stem(word))
|
||||||
|
# --> upper case words are also transformed to lower case.
|
||||||
|
|
||||||
|
# Stem the test text from above
|
||||||
|
# Approach 1: stem word by word
|
||||||
|
test_text_stemmed=[]
|
||||||
|
for word in word_tokenize(test_text):
|
||||||
|
test_text_stemmed.append(ps.stem(word))
|
||||||
|
|
||||||
|
print("Stemming word by word: test text after it has been stemmed:")
|
||||||
|
print(test_text_stemmed)
|
||||||
|
|
||||||
|
# Alternative approach: stem entire text
|
||||||
|
test_text_stemmed=ps.stem(test_text)
|
||||||
|
print("Stemming entire document: test text after it has been stemmed:")
|
||||||
|
print(test_text_stemmed)
|
||||||
|
# -> does not work
|
||||||
|
|
||||||
|
print("End of nltk introduction!")
|
|
@ -0,0 +1,19 @@
|
||||||
|
This is the text for the introduction to regular expressions.
|
||||||
|
|
||||||
|
In the first example, we search for the year of birth of current and former CEOs.
|
||||||
|
These are sentences that I made up:
|
||||||
|
Microsoft's former CEO Steve Balmer (born in 1956) graduated from Harvard in 1977.
|
||||||
|
Michael Dell was born in 1965 in Houston and founded Dell Inc in 1984.
|
||||||
|
Walmart is currently run by Doug McMillon, who was born in 1966.
|
||||||
|
|
||||||
|
The following three examples are taken from the Wikipedia pages of the three people.
|
||||||
|
Steven Anthony "Steve" Ballmer (born March 24, 1956) is an American chief executive who is the former chief executive officer of Microsoft from January 2000 to February 2014, and is the current owner of the Los Angeles Clippers. Source: https://en.wikipedia.org/wiki/Steve_Ballmer, June 22, 2017.
|
||||||
|
Michael Saul Dell (born February 23, 1965) is an American business magnate, investor, philanthropist, and author. He is the founder and CEO of Dell Technologies, one of the world’s leading providers of information technology infrastructure solutions. Source: https://en.wikipedia.org/wiki/Michael_Dell, June 22, 2017.
|
||||||
|
Carl Douglas "Doug" McMillon (born October 17, 1966) is an American businessman and is the president and chief executive officer (CEO) of Wal-Mart Stores, Inc. Source: https://en.wikipedia.org/wiki/Doug_McMillon, June 22, 2017.
|
||||||
|
|
||||||
|
Here are some numbers:
|
||||||
|
1,234,567
|
||||||
|
8,901
|
||||||
|
34
|
||||||
|
56.82
|
||||||
|
539,234,353.41
|
1201
lectures/programming/introductions/regression_data_scikit.csv
Normal file
File diff suppressed because it is too large
76
lectures/programming/solutions/Problem_10_Complex_Words.py
Normal file
|
@ -0,0 +1,76 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
Created on Wed Apr 13 22:43:32 2016
|
||||||
|
|
||||||
|
@author: Alexander Hillert, Goethe University Frankfurt
|
||||||
|
"""
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
directory="C:/Lehre/Textual Analysis/Programming/Files/"
|
||||||
|
|
||||||
|
# Open the dictionary
|
||||||
|
file_word_list=open(directory+'Complex_Words.txt','r',encoding="utf-8")
|
||||||
|
word_list=file_word_list.read()
|
||||||
|
word_list=word_list.lower()
|
||||||
|
complex_words=word_list.split()
|
||||||
|
|
||||||
|
# Open the csv file containing the list of the 200 10-Ks
|
||||||
|
input_file=open(directory+'10-K_Sample_2011Q1_Input.csv','r',encoding="utf-8")
|
||||||
|
input_text=input_file.read()
|
||||||
|
|
||||||
|
# Create output file
|
||||||
|
output_file=open(directory+'10-K_Sample_2011Q1_Output_Complex_Tone.csv','w',encoding="utf-8")
|
||||||
|
output_file.write('CIK;Filename;Number_Words;Number_Complex_Words;Percent_Complex_Words\n')
|
||||||
|
|
||||||
|
# Split the input file in separate lines
|
||||||
|
input_text_line=input_text.split("\n")
|
||||||
|
|
||||||
|
# In general, there can be empty lines in the input file. The following command
|
||||||
|
# deletes these lines.
|
||||||
|
while input_text_line.count("")>0:
|
||||||
|
input_text_line.remove("")
|
||||||
|
|
||||||
|
# Loop over all lines
|
||||||
|
for i in range(1,len(input_text_line)):
|
||||||
|
print(str(i))
|
||||||
|
# split the line into the two variables
|
||||||
|
variables=input_text_line[i].split(";")
|
||||||
|
# We need the CIK and the filename
|
||||||
|
cik=variables[0]
|
||||||
|
filename=variables[1]
|
||||||
|
filename=filename.replace('.txt','')
|
||||||
|
|
||||||
|
# Open the ith 10-K in the list
|
||||||
|
input_file_10_k=open(directory+'10-K_Sample/'+cik+"_"+filename+'_clean.txt','r',\
|
||||||
|
encoding='ascii',errors='ignore')
|
||||||
|
input_text_10_k=input_file_10_k.read()
|
||||||
|
|
||||||
|
# Use lower case letters
|
||||||
|
text=input_text_10_k.lower()
|
||||||
|
|
||||||
|
# Split the text in words to determine the total number of words
|
||||||
|
list_of_words=re.split('\W{1,}', text)
|
||||||
|
# to make sure that empty list elements do not bias the word count, we delete them.
|
||||||
|
while list_of_words.count("")>0:
|
||||||
|
list_of_words.remove("")
|
||||||
|
|
||||||
|
# Determine total number of words
|
||||||
|
word_count=len(list_of_words)
|
||||||
|
|
||||||
|
# Reset the number of complex words to zero
|
||||||
|
complex_count=0
|
||||||
|
# For each complex word, count the number of occurrences
|
||||||
|
for i in range(len(complex_words)):
|
||||||
|
complex_count=complex_count+list_of_words.count(complex_words[i])
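# A possible alternative (added sketch, not part of the original solution; it would
# additionally require "import collections" at the top of the script): count every
# word once with a Counter and then sum the counts of the dictionary words.
#word_counter=collections.Counter(list_of_words)
#complex_count=sum(word_counter[complex_word] for complex_word in complex_words)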
|
||||||
|
|
||||||
|
# Write cik, file name, total number of words, and number of complex words to output file
|
||||||
|
output_file.write(cik+';'+filename+'_clean.txt;'+str(word_count)+';'\
|
||||||
|
+str(complex_count)+';'+str(complex_count/word_count)+'\n')
|
||||||
|
|
||||||
|
# Close filings
|
||||||
|
input_file_10_k.close()
|
||||||
|
|
||||||
|
print("Finished")
|
||||||
|
output_file.close()
|
||||||
|
input_file.close()
|
|
@ -0,0 +1,53 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
Created on Wed Jul 29 11:07:10 2015
|
||||||
|
|
||||||
|
|
||||||
|
@author: Alexander Hillert, Goethe University Frankfurt
|
||||||
|
"""
|
||||||
|
|
||||||
|
# To determine file size we need the OS package
|
||||||
|
import os
|
||||||
|
|
||||||
|
directory="C:/Lehre/Textual Analysis/Programming/Files/"
|
||||||
|
|
||||||
|
# Open the csv file containing the list of the 200 10-Ks
|
||||||
|
input_file=open(directory+'10-K_Sample_2011Q1_Input.csv','r',encoding="utf-8")
|
||||||
|
input_text=input_file.read()
|
||||||
|
|
||||||
|
# Create output file
|
||||||
|
output_file=open(directory+'10-K_Sample_2011Q1_File_Size.csv','w',encoding="utf-8")
|
||||||
|
output_file.write('CIK;Filename;File_size_gross;File_size_net\n')
|
||||||
|
|
||||||
|
# Split the input file in separate lines
|
||||||
|
input_text_line=input_text.split("\n")
|
||||||
|
|
||||||
|
# In general, there can be empty lines in the input file. The following command
|
||||||
|
# deletes these lines.
|
||||||
|
while input_text_line.count("")>0:
|
||||||
|
input_text_line.remove("")
|
||||||
|
|
||||||
|
# Loop over all lines
|
||||||
|
for i in range(1,len(input_text_line)):
|
||||||
|
print(str(i))
|
||||||
|
# split the line into the two variables
|
||||||
|
variables=input_text_line[i].split(";")
|
||||||
|
# We need the CIK and the filename
|
||||||
|
cik=variables[0]
|
||||||
|
filename=variables[1]
|
||||||
|
filename=filename.replace('.txt','')
|
||||||
|
|
||||||
|
# File size of the complete submission file (gross file size)
|
||||||
|
# You have to divide the result by 1024 to get the size in kilobyte
|
||||||
|
# The file size will be affected by html code and exhibits.
|
||||||
|
size_gross=os.path.getsize(directory+'10-K_Sample/'+cik+"_"+filename+'.txt')/1024
|
||||||
|
|
||||||
|
# File size of the main text file (net file size)
|
||||||
|
# You have to divide the result by 1024 to get the size in kilobyte
|
||||||
|
size_net=os.path.getsize(directory+'10-K_Sample/'+cik+"_"+filename+'_clean.txt')/1024
|
||||||
|
|
||||||
|
output_file.write(cik+';'+filename+';'+str(size_gross)+';'+str(size_net)+'\n')
|
||||||
|
|
||||||
|
print("Finished")
|
||||||
|
output_file.close()
|
||||||
|
input_file.close()
|
167
lectures/programming/solutions/Problem_12_Most_Frequent_Words.py
Normal file
|
@ -0,0 +1,167 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
Created on Tue Jul 11 09:19:54 2017
|
||||||
|
|
||||||
|
@author: Alexander Hillert, Goethe University Frankfurt
|
||||||
|
"""
|
||||||
|
|
||||||
|
# We need regular expressions and counters (->collections)
|
||||||
|
import re
|
||||||
|
import collections
|
||||||
|
# for the bigram part, the sentence tokenizer is helpful
|
||||||
|
from nltk.tokenize import sent_tokenize
|
||||||
|
|
||||||
|
directory="C:/Lehre/Textual Analysis/Programming/Files/"
|
||||||
|
|
||||||
|
# Open the csv file containing the list of the 10-Ks from MSFT, KO, and 3M.
|
||||||
|
input_file=open(directory+'list_10-K_filings_textual_similarity.csv','r',encoding="utf-8")
|
||||||
|
input_text=input_file.read()
|
||||||
|
|
||||||
|
# Split the input file in separate lines
|
||||||
|
input_text_line=input_text.split("\n")
|
||||||
|
|
||||||
|
# In general, there can be empty lines in the input file. The following command
|
||||||
|
# deletes these lines.
|
||||||
|
while input_text_line.count("")>0:
|
||||||
|
input_text_line.remove("")
|
||||||
|
|
||||||
|
# Create an empty counter variable
|
||||||
|
words_counter=collections.Counter()
|
||||||
|
# variable is needed only for an alternative solution
|
||||||
|
words_counter1=collections.Counter()
|
||||||
|
|
||||||
|
# counter for the extra task
|
||||||
|
bigram_counter=collections.Counter()
|
||||||
|
|
||||||
|
|
||||||
|
# Loop over all lines
|
||||||
|
for i in range(1,len(input_text_line)):
|
||||||
|
print(str(i))
|
||||||
|
# split the line into the eight variables
|
||||||
|
variables=input_text_line[i].split(";")
|
||||||
|
# We need the CIK (1st column) and the filename (8th column)
|
||||||
|
cik=variables[0]
|
||||||
|
filename_parts=re.split('/',variables[7])
|
||||||
|
filename=filename_parts[3].replace('.txt','')
|
||||||
|
|
||||||
|
# Open the ith 10-K in the list; remember to specify the encoding
|
||||||
|
# The files are available in the zip file "10-K_Textual_Similarity_edited.zip".
|
||||||
|
input_file_10_k=open(directory+'10-K_Textual_Similarity/'+cik+'_'+\
|
||||||
|
filename+'_edited.txt', 'r', encoding='ascii', errors='ignore')
|
||||||
|
# if the command above does not work (error like "file not found" or "directory not found")
|
||||||
|
# please use the following command:
|
||||||
|
#input_file_10_k=open(directory+'10-K_Textual_Similarity_edited/'+cik+'_'+filename+'_edited.txt','r',encoding='ascii',errors='ignore')
|
||||||
|
|
||||||
|
# read the content from the file
|
||||||
|
input_text_10_k=input_file_10_k.read()
|
||||||
|
|
||||||
|
# use lower case only so that it does not matter whether a word is at
|
||||||
|
# the beginning of a sentence ("The") or within a sentence ("the").
|
||||||
|
# Please note that this can be problematic, e.g. "US" -> United States vs.
|
||||||
|
# us (personal pronoun)
|
||||||
|
input_text_10_k_lower=input_text_10_k.lower()
|
||||||
|
|
||||||
|
# Split text into words
|
||||||
|
list_of_words=re.split('\W{1,}',input_text_10_k_lower)
|
||||||
|
# There can be empty ("") list elements -> remove them
|
||||||
|
while list_of_words.count("")>0:
|
||||||
|
list_of_words.remove("")
|
||||||
|
|
||||||
|
# optional commands to remove words that only contain "_"
|
||||||
|
'''
|
||||||
|
for word in list_of_words:
|
||||||
|
if re.sub("[a-zA-Z]","",word)!="":
|
||||||
|
#if word.count("_")>0:
|
||||||
|
list_of_words.remove(word)
|
||||||
|
'''
|
||||||
|
|
||||||
|
# Add the words to our counter
|
||||||
|
words_counter=words_counter+collections.Counter(list_of_words)
|
||||||
|
# alternative solution
|
||||||
|
words_counter1.update(list_of_words)
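# Added remark (not part of the original file): both variants give the same counts here;
# update() modifies the existing counter in place, whereas "+" builds a new Counter in
# every iteration and is therefore slower on large samples.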
|
||||||
|
|
||||||
|
|
||||||
|
#############################################
|
||||||
|
# optional part for the extra task on bigrams
|
||||||
|
#############################################
|
||||||
|
|
||||||
|
# create an empty list for the bigrams
|
||||||
|
bigram_list=[]
|
||||||
|
|
||||||
|
# split the text into sentences
|
||||||
|
list_of_sentences=sent_tokenize(input_text_10_k)
|
||||||
|
|
||||||
|
# create the BIGRAM IN EACH SENTENCE
|
||||||
|
for sentence in list_of_sentences:
|
||||||
|
|
||||||
|
# make the sentence lower case
|
||||||
|
sentence_lower=sentence.lower()
|
||||||
|
|
||||||
|
# split the sentence into words
|
||||||
|
list_of_words=re.split("\W{1,}",sentence_lower)
|
||||||
|
|
||||||
|
# remove empty elements
|
||||||
|
while list_of_words.count("")>0:
|
||||||
|
list_of_words.remove("")
|
||||||
|
|
||||||
|
#print("these are the words of the sentence:\n"+str(list_of_words))
|
||||||
|
|
||||||
|
# go over all potential two word combinations in the sentence.
|
||||||
|
for word_number in range(0,len(list_of_words)-1):
|
||||||
|
bigram_list.append(list_of_words[word_number]+' '+list_of_words[word_number+1])
|
||||||
|
|
||||||
|
bigram_counter=bigram_counter+collections.Counter(bigram_list)
|
||||||
|
# end of extra task
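# Illustrative mini-example (added remark, not part of the original solution): for the
# sentence "the cash flow increased", the loop above produces the bigrams
# "the cash", "cash flow", and "flow increased".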
|
||||||
|
|
||||||
|
|
||||||
|
# Close the 10-K filing
|
||||||
|
input_file_10_k.close()
|
||||||
|
|
||||||
|
input_file.close()
|
||||||
|
|
||||||
|
######################
|
||||||
|
# Top 100 single words
|
||||||
|
######################
|
||||||
|
# Open the csv file containing the 100 most frequently used words
|
||||||
|
output_file=open(directory+'Problem_12_100_most_frequent_words.csv','w',encoding="utf-8")
|
||||||
|
output_file.write("rank;word;count\n")
|
||||||
|
|
||||||
|
# Get the 100 most frequent words
|
||||||
|
top_100_words=words_counter.most_common(100)
|
||||||
|
# for the alternative solution
|
||||||
|
#top_100_words=words_counter1.most_common(100)
|
||||||
|
|
||||||
|
# Write the 100 most frequent words to the csv file.
|
||||||
|
# Remember Python starts counting at 0, while humans start at 1.
|
||||||
|
# So, the most frequent words (rank 1 in human counting) is element 0 for Python.
|
||||||
|
# Consequently, to get a consistent table, we must use the value i for the rank
|
||||||
|
# but call the element i-1.
|
||||||
|
for i in range(1,101):
|
||||||
|
output_file.write(str(i)+";"+str(top_100_words[i-1][0])+";"+\
|
||||||
|
str(top_100_words[i-1][1])+"\n")
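# Mini-example of most_common (added remark, not part of the original file):
# collections.Counter("abracadabra").most_common(2) returns [('a', 5), ('b', 2)],
# i.e., a list of (element, count) tuples sorted by frequency, which is why
# top_100_words[i-1][0] is the word and top_100_words[i-1][1] is its count.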
|
||||||
|
|
||||||
|
# Close the csv file
|
||||||
|
output_file.close()
|
||||||
|
|
||||||
|
|
||||||
|
######################
|
||||||
|
# Extra task
|
||||||
|
# Top 100 bigrams
|
||||||
|
######################
|
||||||
|
# Open the csv file containing the 100 most frequently used BIGRAMS
|
||||||
|
output_file_bigram=open(directory+'Problem_12_100_most_frequent_bigrams.csv','w',encoding="utf-8")
|
||||||
|
output_file_bigram.write("rank;word;count\n")
|
||||||
|
|
||||||
|
# Get the 100 most frequent words
|
||||||
|
top_100_bigrams=bigram_counter.most_common(100)
|
||||||
|
|
||||||
|
# Write the 100 most frequent bigrams to the csv file -> same approach as for the single words.
|
||||||
|
for i in range(1,101):
|
||||||
|
output_file_bigram.write(str(i)+";"+str(top_100_bigrams[i-1][0])+";"+\
|
||||||
|
str(top_100_bigrams[i-1][1])+"\n")
|
||||||
|
|
||||||
|
# Close the csv file
|
||||||
|
output_file_bigram.close()
|
||||||
|
|
||||||
|
|
||||||
|
print("Task done!")
|
96
lectures/programming/solutions/Problem_13_Stemming.py
Normal file
|
@ -0,0 +1,96 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
Created on Wed Jul 29 11:07:10 2015
|
||||||
|
|
||||||
|
@author: Alexander Hillert, Goethe University Frankfurt
|
||||||
|
"""
|
||||||
|
|
||||||
|
# We need regular expressions, tokenize (to identify words), and stemming.
|
||||||
|
import re
|
||||||
|
from nltk.tokenize import word_tokenize
|
||||||
|
from nltk.stem import PorterStemmer
|
||||||
|
|
||||||
|
|
||||||
|
directory="C:/Lehre/Textual Analysis/Programming/Files/"
|
||||||
|
|
||||||
|
# Open the csv file containing the list of the 10-Ks from MSFT, KO, and 3M.
|
||||||
|
input_file=open(directory+'list_10-K_filings_textual_similarity.csv','r',encoding="utf-8")
|
||||||
|
input_text=input_file.read()
|
||||||
|
|
||||||
|
# Split the Input File in separate lines
|
||||||
|
input_text_line=input_text.split("\n")
|
||||||
|
|
||||||
|
# In general, there can be empty lines in the input file. The following command
|
||||||
|
# deletes these lines.
|
||||||
|
while input_text_line.count("")>0:
|
||||||
|
input_text_line.remove("")
|
||||||
|
|
||||||
|
# Loop over all lines
|
||||||
|
#for i in range(1,len(input_text_line)):
|
||||||
|
# for illustration filings 1 to 3 only
|
||||||
|
for i in range(1,4):
|
||||||
|
print(str(i))
|
||||||
|
# split the line into the eight variables
|
||||||
|
variables=input_text_line[i].split(";")
|
||||||
|
# We need the CIK (1st column) and the filename (8th column)
|
||||||
|
cik=variables[0]
|
||||||
|
filename_parts=re.split('/',variables[7])
|
||||||
|
filename=filename_parts[3].replace('.txt','')
|
||||||
|
|
||||||
|
# Open the ith 10-K in the list; remember to specify the encoding
|
||||||
|
input_file_10_k=open(directory+'10-K_Textual_Similarity/'+cik+'_'+filename\
|
||||||
|
+'_edited.txt', 'r', encoding='ascii', errors='ignore')
|
||||||
|
# Get the text of the 10-K
|
||||||
|
input_text_10_k=input_file_10_k.read()
|
||||||
|
|
||||||
|
# We need to tokenize the text because stem only works on a word by word basis.
|
||||||
|
# Stemming an entire document without splitting into words does not work!
|
||||||
|
# The problem is that \n gets lost in this process --> we cannot easily
|
||||||
|
# recreate the document.
|
||||||
|
# idea: replace \n by \n and some indicator that there was a line break.
|
||||||
|
# Here, I choose "LINEBREAKMARK"
|
||||||
|
input_text_10_k=input_text_10_k.replace("\n","\nLINEBREAKMARK ")
|
||||||
|
|
||||||
|
# Split text into words
|
||||||
|
# There are two alternatives.
|
||||||
|
# Alternative 1 (our standard approach):
|
||||||
|
#word_list=re.split("\W{1,}",input_text_10_k.lower())
|
||||||
|
# Alternative 2 (keeps symbols like ,;.):
|
||||||
|
word_list=word_tokenize(input_text_10_k.lower())
|
||||||
|
|
||||||
|
|
||||||
|
# Stem the text
|
||||||
|
text_stemmed=''
|
||||||
|
for word in word_list:
|
||||||
|
# The following two cases are designed to improve the formatting of the
|
||||||
|
# output file. It is not needed for the subsequent analyses.
|
||||||
|
|
||||||
|
# Case 1: 'word' is not an actual word but a symbol. -> there should
|
||||||
|
# be no whitespace between the previous words and this symbol.
|
||||||
|
# \A and \Z indicate the beginning and end of string -> the 'word' is just
|
||||||
|
# the symbol but not a combination of letters and symbols.
|
||||||
|
|
||||||
|
if re.search("\A[\.\?!,:;']{1,}\Z",word):
|
||||||
|
text_stemmed=text_stemmed+word
|
||||||
|
# Case 2: the word is an actual word -> have a whitespace included.
|
||||||
|
else:
|
||||||
|
text_stemmed=text_stemmed+" "+PorterStemmer().stem(word)
|
||||||
|
|
||||||
|
# The simple solution (without restoring the formatting of the text) is:
|
||||||
|
#text_stemmed=text_stemmed+" "+PorterStemmer().stem(word)
|
||||||
|
|
||||||
|
|
||||||
|
# To recreate the text, we need to replace the line break indicators by \n
|
||||||
|
# Because of the stemming "LINEBREAKMARK" becomes "linebreakmark".
|
||||||
|
text_stemmed=text_stemmed.replace("linebreakmark","\n")
|
||||||
|
|
||||||
|
|
||||||
|
# Open the output file for the stemmed text
|
||||||
|
output_file_10_k=open(directory+'10-K_Textual_Similarity/'+cik+'_'+filename\
|
||||||
|
+'_stemmed.txt', 'w', encoding='ascii', errors='ignore')
|
||||||
|
output_file_10_k.write(text_stemmed)
|
||||||
|
output_file_10_k.close()
|
||||||
|
input_file_10_k.close()
|
||||||
|
|
||||||
|
input_file.close()
|
||||||
|
print("Task done!")
|
287
lectures/programming/solutions/Problem_14_Jaccard_Similarity.py
Normal file
|
@ -0,0 +1,287 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
Created on Wed Jul 29 11:07:10 2015
|
||||||
|
|
||||||
|
@author: Alexander Hillert, Goethe University Frankfurt
|
||||||
|
"""
|
||||||
|
|
||||||
|
import re
|
||||||
|
from nltk.tokenize import word_tokenize
|
||||||
|
from nltk.corpus import stopwords
|
||||||
|
from nltk.stem import PorterStemmer
|
||||||
|
from collections import Counter
|
||||||
|
|
||||||
|
|
||||||
|
ps=PorterStemmer()
|
||||||
|
|
||||||
|
directory="C:/Lehre/Textual Analysis/Programming/Files/"
|
||||||
|
|
||||||
|
# Open the csv file containing the list of the 10-Ks from MSFT, KO, and 3M
|
||||||
|
input_file=open(directory+'list_10-K_filings_textual_similarity.csv','r',encoding="utf-8")
|
||||||
|
input_text=input_file.read()
|
||||||
|
|
||||||
|
# Split the input file in separate lines
|
||||||
|
input_text_line=input_text.split("\n")
|
||||||
|
|
||||||
|
# In general, there can be empty lines in the input file. The following command
|
||||||
|
# deletes these lines
|
||||||
|
while input_text_line.count("")>0:
|
||||||
|
input_text_line.remove("")
|
||||||
|
|
||||||
|
# Open the output csv file in which we write the similarities
|
||||||
|
output_file=open(directory+'list_10-K_filings_textual_similarity_jaccard.csv','w',encoding="utf-8")
|
||||||
|
# Write variable names to first line
|
||||||
|
output_file.write(input_text_line[0]+';Jaccard;Jaccard_own_stop_words;\
|
||||||
|
Jaccard_NLTK_stop_words;Jaccard_stemmed;Jaccard_stemmed_own_stop_words;\
|
||||||
|
Jaccard_stemmed_NLTK_stop_words\n')
|
||||||
|
|
||||||
|
# Read own stop word list
|
||||||
|
# This list has been created by manually selecting words from the csv-file
|
||||||
|
# 100_most_frequent_words.csv, which is created by the Python program
|
||||||
|
# "Problem_12_Most_Frequent_Words.py".
|
||||||
|
# Simply delete words you consider to be meaningless and that are frequently
|
||||||
|
# used.
|
||||||
|
stop_word_file=open(directory+'Stop_Word_List_Alexander.csv','r',encoding="utf-8")
|
||||||
|
stop_word_text=stop_word_file.read()
|
||||||
|
stop_word_line=stop_word_text.split("\n")
|
||||||
|
stop_word_line.remove("")
|
||||||
|
own_stop_words=[""]
|
||||||
|
for i in range(1,len(stop_word_line)):
|
||||||
|
stop_word=stop_word_line[i].split(";")[1]
|
||||||
|
own_stop_words.append(stop_word)
|
||||||
|
|
||||||
|
own_stop_words.remove("")
|
||||||
|
print("This is the list of my stop words:")
|
||||||
|
print(own_stop_words)
|
||||||
|
|
||||||
|
# Read NLTK stop word list
|
||||||
|
NLTK_stop_words=set(stopwords.words("english"))
|
||||||
|
print("This is the list of NLTK stop words:")
|
||||||
|
print(NLTK_stop_words)
|
||||||
|
|
||||||
|
# set default values for variables
|
||||||
|
# It is not required. However, if you don't do it Spyder will suggest that line
|
||||||
|
# jaccard_similarity=jaccard(word_list_edited,word_list_old_edited)
|
||||||
|
# is incorrect as word_list_old_edited is not yet defined at this point in the program
|
||||||
|
# code. In this specific example, this will not cause an error, as we do not enter
|
||||||
|
# the if condition when i=1.
|
||||||
|
word_list_old_edited=[]
|
||||||
|
word_list_edited=[]
|
||||||
|
word_list_old_NLTK_filtered=""
|
||||||
|
word_list_old_own_filtered=""
|
||||||
|
word_list_old_edited_stemmed=""
|
||||||
|
word_list_old_own_filtered_stemmed=""
|
||||||
|
word_list_old_NLTK_filtered_stemmed=""
|
||||||
|
|
||||||
|
#######################################################
|
||||||
|
# Define a function that computes Jaccard similarity
|
||||||
|
# As we need these operations several times, it is
|
||||||
|
# helpful to use a function.
|
||||||
|
######################################################
|
||||||
|
# beginning of the function
|
||||||
|
def jaccard(text1,text2):
|
||||||
|
counter1=Counter(text1)
|
||||||
|
counter2=Counter(text2)
|
||||||
|
|
||||||
|
intersection=counter1 & counter2
|
||||||
|
union=counter1 | counter2
|
||||||
|
|
||||||
|
return len(intersection)/len(union)
|
||||||
|
# end of the function
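# Worked mini-example (added remark, not part of the original solution):
# jaccard(["a","b","b"], ["b","c"]) compares Counter({'a':1,'b':2}) and Counter({'b':1,'c':1});
# the intersection has the single key 'b' and the union has the keys 'a', 'b', and 'c',
# so the function returns 1/3. Note that len() counts distinct words, not total counts.
#print(jaccard(["a","b","b"], ["b","c"]))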
|
||||||
|
|
||||||
|
|
||||||
|
# Loop over all lines
|
||||||
|
for i in range(1,len(input_text_line)):
|
||||||
|
print(str(i))
|
||||||
|
# split the line into the eight variables
|
||||||
|
variables=input_text_line[i].split(";")
|
||||||
|
# We need the CIK (1st column) and the filename (8th column)
|
||||||
|
cik=variables[0]
|
||||||
|
filename_parts=re.split('/',variables[7])
|
||||||
|
filename=filename_parts[3].replace('.txt','')
|
||||||
|
|
||||||
|
# Write the information from the input file to the output file
|
||||||
|
# we do not add a line break at the end, as we must append the similarity
|
||||||
|
# score first.
|
||||||
|
output_file.write(input_text_line[i])
|
||||||
|
|
||||||
|
# Open the ith 10-K; remember to specify the encoding
|
||||||
|
input_file_10_k=open(directory+'10-K_Textual_Similarity/'+cik+'_'+filename+\
|
||||||
|
'_edited.txt', 'r', encoding='ascii', errors='ignore')
|
||||||
|
input_text_10_k=input_file_10_k.read()
|
||||||
|
|
||||||
|
# check whether the previous entry of the list is from the same firm
|
||||||
|
permco=input_text_line[i].split(";")[1]
|
||||||
|
permco_old=input_text_line[i-1].split(";")[1]
|
||||||
|
|
||||||
|
# Split text into words
|
||||||
|
word_list_edited=word_tokenize(input_text_10_k.lower())
|
||||||
|
|
||||||
|
|
||||||
|
############################################
|
||||||
|
# Sub Task 1: Jaccard for the _edited.txt
|
||||||
|
############################################
|
||||||
|
# compute Jaccard similarity if the previous filing is from the same firm
|
||||||
|
if permco==permco_old:
|
||||||
|
# the command calls the jaccard function that we have defined above.
|
||||||
|
# in the function, text1=word_list_edited and text2=word_list_old_edited.
|
||||||
|
jaccard_similarity=jaccard(word_list_edited,word_list_old_edited)
|
||||||
|
|
||||||
|
output_file.write(";"+str(jaccard_similarity))
|
||||||
|
else:
|
||||||
|
# The previous filing is not from the same firm -> cannot compute Jaccard similarity
|
||||||
|
output_file.write(";")
|
||||||
|
|
||||||
|
# Save the current word vector to a separate variable for the comparison of the next report.
|
||||||
|
word_list_old_edited=word_list_edited
|
||||||
|
|
||||||
|
|
||||||
|
############################################
|
||||||
|
# Sub Task 2: Jaccard for the _edited.txt
|
||||||
|
# AND REMOVE STOP WORDS - OWN LIST
|
||||||
|
############################################
|
||||||
|
# remove stop words using personal stop word list
|
||||||
|
word_list_own_filtered=[]
|
||||||
|
for word in word_list_edited:
|
||||||
|
if word not in own_stop_words:
|
||||||
|
word_list_own_filtered.append(word)
|
||||||
|
|
||||||
|
# compute Jaccard similarity if the previous filing is from the same firm
|
||||||
|
if permco==permco_old:
|
||||||
|
jaccard_similarity=jaccard(word_list_own_filtered,\
|
||||||
|
word_list_old_own_filtered)
|
||||||
|
|
||||||
|
output_file.write(";"+str(jaccard_similarity))
|
||||||
|
else:
|
||||||
|
# The previous filing is not from the same firm -> cannot compute Jaccard similarity
|
||||||
|
output_file.write(";")
|
||||||
|
|
||||||
|
# Save the current word vector to a separate variable for the comparison of the next report.
|
||||||
|
word_list_old_own_filtered=word_list_own_filtered
|
||||||
|
|
||||||
|
|
||||||
|
############################################
|
||||||
|
# Sub Task 3: Jaccard for the _edited_v1.txt
|
||||||
|
# AND REMOVE STOP WORDS - NLTK LIST
|
||||||
|
############################################
|
||||||
|
# remove stop words using NLTK stop word list
|
||||||
|
word_list_NLTK_filtered=[]
|
||||||
|
for word in word_list_edited:
|
||||||
|
if word not in NLTK_stop_words:
|
||||||
|
word_list_NLTK_filtered.append(word)
|
||||||
|
|
||||||
|
# compute Jaccard similarity if the previous filing is from the same firm
|
||||||
|
if permco==permco_old:
|
||||||
|
jaccard_similarity=jaccard(word_list_NLTK_filtered,\
|
||||||
|
word_list_old_NLTK_filtered)
|
||||||
|
|
||||||
|
output_file.write(";"+str(jaccard_similarity))
|
||||||
|
else:
|
||||||
|
# The previous filing is not from the same firm -> cannot compute Jaccard similarity
|
||||||
|
output_file.write(";")
|
||||||
|
|
||||||
|
# Save the current word vector to a separate variable for the comparison of the next report.
|
||||||
|
word_list_old_NLTK_filtered=word_list_NLTK_filtered
|
||||||
|
|
||||||
|
|
||||||
|
############################################
|
||||||
|
# Sub Task 4: Jaccard for the _stemmed.txt
|
||||||
|
############################################
|
||||||
|
# Create stemmed text
|
||||||
|
word_list_edited_stemmed=[]
|
||||||
|
for word in word_list_edited:
|
||||||
|
word_list_edited_stemmed.append(ps.stem(word))
|
||||||
|
|
||||||
|
# compute Jaccard similarity if the previous filing is from the same firm
|
||||||
|
if permco==permco_old:
|
||||||
|
jaccard_similarity=jaccard(word_list_edited_stemmed,word_list_old_edited_stemmed)
|
||||||
|
|
||||||
|
output_file.write(";"+str(jaccard_similarity))
|
||||||
|
else:
|
||||||
|
# The previous filing is not from the same firm -> cannot compute Jaccard similarity
|
||||||
|
output_file.write(";")
|
||||||
|
|
||||||
|
# Save the current word vector to a separate variable for the comparison of the next report.
|
||||||
|
word_list_old_edited_stemmed=word_list_edited_stemmed
|
||||||
|
|
||||||
|
|
||||||
|
############################################
|
||||||
|
# Sub Task 5: Jaccard for the _stemmed.txt
|
||||||
|
# AND REMOVE STOP WORDS - OWN LIST
|
||||||
|
############################################
|
||||||
|
# Caution; in general, it is not clear whether you should first stem or
|
||||||
|
# first remove stop words.
|
||||||
|
# However, in this specific case, you should remove the stop words first
|
||||||
|
# and then stem, as your stop word list is based on the inflected text.
|
||||||
|
|
||||||
|
# remove stop words using personal stop word list
|
||||||
|
word_list_own_filtered=[]
|
||||||
|
for word in word_list_edited:
|
||||||
|
if word not in own_stop_words:
|
||||||
|
word_list_own_filtered.append(word)
|
||||||
|
|
||||||
|
# Create stemmed text
|
||||||
|
word_list_own_filtered_stemmed=[]
|
||||||
|
for word in word_list_own_filtered:
|
||||||
|
word_list_own_filtered_stemmed.append(ps.stem(word))
|
||||||
|
|
||||||
|
# compute Jaccard similarity if the previous filing is from the same firm
|
||||||
|
if permco==permco_old:
|
||||||
|
jaccard_similarity=jaccard(word_list_own_filtered_stemmed,\
|
||||||
|
word_list_old_own_filtered_stemmed)
|
||||||
|
|
||||||
|
output_file.write(";"+str(jaccard_similarity))
|
||||||
|
else:
|
||||||
|
# The previous filing is not from the same firm -> cannot compute Jaccard similarity
|
||||||
|
output_file.write(";")
|
||||||
|
|
||||||
|
# Save the current word vector to a separate variable for the comparison of the next report.
|
||||||
|
word_list_old_own_filtered_stemmed=word_list_own_filtered_stemmed
|
||||||
|
|
||||||
|
|
||||||
|
############################################
|
||||||
|
# Sub Task 6: Jaccard for the _stemmed.txt
|
||||||
|
# AND REMOVE STOP WORDS - NLTK LIST
|
||||||
|
############################################
|
||||||
|
# Caution; it is not clear whether you should first stem or first remove
|
||||||
|
# stop words. However, the NLTK stop word list seems to be based on inflected
|
||||||
|
# text, e.g. the word "having" is included. "Having" would be stemmed to "have".
|
||||||
|
# Thus, the stop list seems to be not stemmed.
|
||||||
|
# Thus, you should remove the stop words first and then stem.
|
||||||
|
|
||||||
|
# remove stop words using NLTK stop word list
|
||||||
|
word_list_NLTK_filtered=[]
|
||||||
|
for word in word_list_edited:
|
||||||
|
if word not in NLTK_stop_words:
|
||||||
|
word_list_NLTK_filtered.append(word)
|
||||||
|
|
||||||
|
# Create stemmed text
|
||||||
|
word_list_NLTK_filtered_stemmed=[]
|
||||||
|
for word in word_list_NLTK_filtered:
|
||||||
|
word_list_NLTK_filtered_stemmed.append(ps.stem(word))
|
||||||
|
|
||||||
|
# compute Jaccard similarity if the previous filing is from the same firm
|
||||||
|
if permco==permco_old:
|
||||||
|
jaccard_similarity=jaccard(word_list_NLTK_filtered_stemmed,\
|
||||||
|
word_list_old_NLTK_filtered_stemmed)
|
||||||
|
|
||||||
|
output_file.write(";"+str(jaccard_similarity))
|
||||||
|
else:
|
||||||
|
# The previous filing is not from the same firm -> cannot compute Jaccard similarity
|
||||||
|
output_file.write(";")
|
||||||
|
|
||||||
|
# Save the current word vector to a separate variable for the comparison of the next report.
|
||||||
|
word_list_old_NLTK_filtered_stemmed=word_list_NLTK_filtered_stemmed
|
||||||
|
|
||||||
|
|
||||||
|
# Write line break to output file
|
||||||
|
output_file.write("\n")
|
||||||
|
|
||||||
|
# Close 10-K filing
|
||||||
|
input_file_10_k.close()
|
||||||
|
|
||||||
|
input_file.close()
|
||||||
|
output_file.close()
|
||||||
|
stop_word_file.close()
|
||||||
|
print("Task done!")
|
||||||
|
|
|
@ -0,0 +1,161 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
Created on Mon Mar 21 09:38:32 2022
|
||||||
|
|
||||||
|
@author: Alexander Hillert, Goethe University Frankfurt
|
||||||
|
"""
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
from sklearn.metrics import mean_squared_error
|
||||||
|
from sklearn.linear_model import RidgeCV
|
||||||
|
from sklearn.linear_model import LassoCV
|
||||||
|
|
||||||
|
|
||||||
|
# adjust the directory to your folder
|
||||||
|
directory="C:/Lehre/Machine Learning/Data/"
|
||||||
|
|
||||||
|
|
||||||
|
# import the data for this problem
|
||||||
|
# NOTE: IT MIGHT TAKE 3 TO 5 MINUTES TO OPEN THE DATA
|
||||||
|
data_frame=pd.read_csv(directory+"form_10-Ks_machine_learning_2007_2008_all_variables_v1.csv",sep=";")
|
||||||
|
# The rows of the data are the Form 10-K filings. Each line is one filing.
|
||||||
|
# The columns are the variables. After some identifying information,
|
||||||
|
# you find the word frequencies, i.e., how often a word (e.g., "the") shows up
|
||||||
|
# in a 10-K (e.g., 100 times)
|
||||||
|
|
||||||
|
|
||||||
|
# WARNING: THE DATA SET IS TOO LARGE TO BE DISPLAYED -> Variable Explorer
|
||||||
|
# and Console will crash.
|
||||||
|
# However, you can pick a small subset of the data and look at it.
|
||||||
|
# It lists all columns=variables and the first three observations.
|
||||||
|
data_frame_example=data_frame.head(3)
|
||||||
|
# you can click on this variable in the variable explorer without Spyder crashing.
|
||||||
|
|
||||||
|
# To see the variables included in the data use the following command
|
||||||
|
data_frame_column_names=data_frame.columns
|
||||||
|
# you can click on this variable in the variable explorer without Spyder crashing.
|
||||||
|
# This variables shows all column/variable names in a vector.
|
||||||
|
|
||||||
|
# split the data set into the training and testing data
|
||||||
|
# we use the filings from year 2007 as training data
|
||||||
|
data_frame_train=data_frame[data_frame.year==2007]
|
||||||
|
# and the filing from year 2008 as testing data
|
||||||
|
data_frame_test=data_frame[data_frame.year==2008]
|
||||||
|
|
||||||
|
# put the cumulative abnormal return around the filing date into a new variable.
|
||||||
|
# we follow Loughran and McDonald (2011) and use the CAR from t to t+4.
|
||||||
|
# training data
|
||||||
|
filing_car_train=data_frame_train["excess_ret_t0_t4"]
|
||||||
|
# testing data
|
||||||
|
filing_car_test=data_frame_test["excess_ret_t0_t4"]
|
||||||
|
|
||||||
|
# so far, you have absolute word counts. For example, "loss" is found 5 times.
|
||||||
|
# As the length of the 10-Ks can be different, we scale by the number of words
|
||||||
|
# in the 10-K.
|
||||||
|
document_length_train=data_frame_train["number_of_words"]
|
||||||
|
document_length_test=data_frame_test["number_of_words"]
|
||||||
|
|
||||||
|
|
||||||
|
# the word frequencies are our independent variables -> restrict the data frame
|
||||||
|
# to those variables and drop all variables that are not needed
|
||||||
|
data_frame_train=data_frame_train.drop(columns=["cik","year","month","link","filing_type","filing_date","excess_ret_t0_t4","number_of_words"])
|
||||||
|
data_frame_test=data_frame_test.drop(columns=["cik","year","month","link","filing_type","filing_date","excess_ret_t0_t4","number_of_words"])
|
||||||
|
|
||||||
|
# compute relative frequencies, i.e., divide the absolute word count by document length
|
||||||
|
data_frame_train=data_frame_train.div(document_length_train, axis=0)
|
||||||
|
data_frame_test=data_frame_test.div(document_length_test, axis=0)
|
||||||
|
|
||||||
|
# standardize the data frames
|
||||||
|
# training data
|
||||||
|
data_frame_train_mean=np.mean(data_frame_train,axis=0)
|
||||||
|
data_frame_train_sd=np.std(data_frame_train, axis=0, ddof=1)
|
||||||
|
data_frame_train_standardized=(data_frame_train-data_frame_train_mean)/data_frame_train_sd
|
||||||
|
# testing data
|
||||||
|
data_frame_test_mean=np.mean(data_frame_test,axis=0)
|
||||||
|
data_frame_test_sd=np.std(data_frame_test, axis=0, ddof=1)
|
||||||
|
data_frame_test_standardized=(data_frame_test-data_frame_test_mean)/data_frame_test_sd
|
||||||
|
|
||||||
|
|
||||||
|
# There can be missing values in the standardized variables.
|
||||||
|
# They arise if the word count for a specific word is always zero in the training
|
||||||
|
# or in the testing data. In this case, the standard deviation is zero ->
|
||||||
|
# division by zero -> NaN.
|
||||||
|
# We replace these missing values by zero.
|
||||||
|
# training data
|
||||||
|
data_frame_train_standardized=data_frame_train_standardized.fillna(0)
|
||||||
|
# testing data
|
||||||
|
data_frame_test_standardized=data_frame_test_standardized.fillna(0)
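# Hypothetical toy example (added sketch, not part of the original file; the word counts
# and document lengths are made up for illustration):
#toy=pd.DataFrame({"loss":[2,0,4],"the":[2,1,4]})
#toy_length=pd.Series([200,100,400])
#toy_rel=toy.div(toy_length, axis=0)                  # relative frequencies per filing
#toy_std=(toy_rel-toy_rel.mean())/toy_rel.std(ddof=1) # standardize column by column
#print(toy_std.fillna(0))                             # the constant column "the" becomes NaN -> 0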
|
||||||
|
|
||||||
|
##########################
|
||||||
|
# Ridge regression
|
||||||
|
##########################
|
||||||
|
print("\nRidge regression - Using cross-validation\n")
|
||||||
|
# Regress the CARs on the word frequencies using Ridge regressions with cross-validation.
|
||||||
|
# In this regression, we use the training data.
|
||||||
|
# We use five-fold cross-validation.
|
||||||
|
# Recommendation for initial alphas/lambdas: 100000, 150000, and 200000
|
||||||
|
# The optimal alpha is at around 140000.
|
||||||
|
regression_Ridge_cv=RidgeCV(alphas=[135000,137000,140000,143000,145000], fit_intercept=True,cv=5).fit(data_frame_train_standardized,filing_car_train)
|
||||||
|
|
||||||
|
# get the optimal lambda
|
||||||
|
alpha_optimal_cv=regression_Ridge_cv.alpha_
|
||||||
|
print("The optimal alpha is "+str(alpha_optimal_cv))
|
||||||
|
|
||||||
|
# what is the R2 in the training and testing data?
|
||||||
|
print("The R2 in the training data is: "+str(regression_Ridge_cv.score(data_frame_train_standardized,filing_car_train)))
|
||||||
|
print("The R2 in the testing data is: "+str(regression_Ridge_cv.score(data_frame_test_standardized,filing_car_test)))
|
||||||
|
|
||||||
|
# Mean squared error using the cross-validated model
|
||||||
|
# predict y in the full training sample
|
||||||
|
filing_car_train_predicted_Ridge=regression_Ridge_cv.predict(data_frame_train_standardized)
|
||||||
|
# predict y in the testing sample
|
||||||
|
filing_car_test_predicted_Ridge=regression_Ridge_cv.predict(data_frame_test_standardized)
|
||||||
|
# Determine the MSE
|
||||||
|
print("The MSE in the full training data is: "+str(mean_squared_error(filing_car_train, filing_car_train_predicted_Ridge)))
|
||||||
|
print("The MSE in the testing data is: "+str(mean_squared_error(filing_car_test, filing_car_test_predicted_Ridge)))
|
||||||
|
|
||||||
|
|
||||||
|
######################
|
||||||
|
# LASSO regression
|
||||||
|
######################
|
||||||
|
print("\nLASSO regression - Using cross-validation\n")
|
||||||
|
# Regress the CARs on the word frequencies using LASSO regressions with cross-validation.
|
||||||
|
# In this regression, we use the training data.
|
||||||
|
# We use five-fold cross-validation.
|
||||||
|
# Recommendation for initial alphas/lambdas: 0.5, 1, and 1.5
|
||||||
|
# The optimal alpha is at around 0.86.
|
||||||
|
regression_Lasso_cv=LassoCV(alphas=[0.85,0.86,0.87,0.88,0.89], fit_intercept=True,cv=5).fit(data_frame_train_standardized,filing_car_train)
|
||||||
|
|
||||||
|
# get the optimal lambda
|
||||||
|
alpha_optimal_cv=regression_Lasso_cv.alpha_
|
||||||
|
print("The optimal alpha is "+str(alpha_optimal_cv))
|
||||||
|
|
||||||
|
# get the R2 in the training data
|
||||||
|
print("The R2 in the training data is: "+str(regression_Lasso_cv.score(data_frame_train_standardized,filing_car_train)))
|
||||||
|
# ... and testing data
|
||||||
|
print("The R2 in the testing data is: "+str(regression_Lasso_cv.score(data_frame_test_standardized,filing_car_test)))
|
||||||
|
|
||||||
|
# Mean squared error using the cross-validated model
|
||||||
|
# predict y in the full training sample
|
||||||
|
filing_car_train_predicted_Lasso=regression_Lasso_cv.predict(data_frame_train_standardized)
|
||||||
|
# predict y in the testing sample
|
||||||
|
filing_car_test_predicted_Lasso=regression_Lasso_cv.predict(data_frame_test_standardized)
|
||||||
|
# Determine the MSE
|
||||||
|
print("The MSE in the full training data is: "+str(mean_squared_error(filing_car_train, filing_car_train_predicted_Lasso)))
|
||||||
|
print("The MSE in the testing data is: "+str(mean_squared_error(filing_car_test, filing_car_test_predicted_Lasso)))
|
||||||
|
|
||||||
|
|
||||||
|
############################################################
|
||||||
|
# Compare the betas from the Ridge and the LASSO regressions
|
||||||
|
############################################################
|
||||||
|
output_file=open(directory+"comparison_coefficients_Ridge_LASSO_10-Ks.csv","w",encoding="utf-8")
|
||||||
|
output_file.write("index;word;coefficient_Ridge;coefficient_LASSO\n")
|
||||||
|
|
||||||
|
# get the list of coefficients
|
||||||
|
for i in range (0,len(data_frame_train.columns)):
|
||||||
|
output_file.write(str(i)+';'+data_frame_train.columns[i]+';'+str(regression_Ridge_cv.coef_[i])+';'+str(regression_Lasso_cv.coef_[i])+'\n')
|
||||||
|
|
||||||
|
output_file.close()
|
||||||
|
|
||||||
|
print("Completed!")
|
121
lectures/programming/solutions/Problem_1_Fun_with_Python.py
Normal file
|
@ -0,0 +1,121 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
Created on Mon Nov 13 21:40:57 2017
|
||||||
|
|
||||||
|
@author: Alexander Hillert, Goethe University Frankfurt
|
||||||
|
"""
|
||||||
|
|
||||||
|
directory="C:/Lehre/Textual Analysis/Programming/Files/"
|
||||||
|
|
||||||
|
# Task 1: Open and print
|
||||||
|
# Open the Txt-file
|
||||||
|
print("\nTask 1 starts here!\n")
|
||||||
|
input_file=open(directory+'Fun_with_Python.txt','r')
|
||||||
|
input_text=input_file.read()
|
||||||
|
# Alternative with one command
|
||||||
|
input_text=open(directory+'Fun_with_Python.txt','r').read()
|
||||||
|
|
||||||
|
print(input_text)
|
||||||
|
|
||||||
|
# Task 2: Write text to output file
|
||||||
|
# Create file 'More_fun_with_Python.txt'
|
||||||
|
print("\nTask 2 starts here!\n")
|
||||||
|
output_file=open(directory+'More_fun_with_Python.txt','w')
|
||||||
|
output_file.write("Hallo\n")
|
||||||
|
output_file.write(input_text)
|
||||||
|
output_file.close()
|
||||||
|
|
||||||
|
# Task 3: loop
|
||||||
|
print("\nTask 3 starts here!\n")
|
||||||
|
# Alternative 1: While loop
|
||||||
|
i = 1
|
||||||
|
while i<=10:
|
||||||
|
print('Iteration Number: '+str(i))
|
||||||
|
i=i+1
|
||||||
|
# Example of a nested loop
|
||||||
|
j=1
|
||||||
|
while j<3:
|
||||||
|
print('Hallo')
|
||||||
|
j=j+1
|
||||||
|
|
||||||
|
# Alternative 2: For loop
|
||||||
|
for i in range(0,10):
|
||||||
|
print('Iteration Number: '+str(i))
|
||||||
|
# there is also a shorter notation: if there is no lower bound it is assumed to be zero
|
||||||
|
for i in range(10):
|
||||||
|
print('Iteration Number: '+str(i))
|
||||||
|
|
||||||
|
|
||||||
|
# Task 4: Print text line by line
|
||||||
|
# Print text line by line
|
||||||
|
print("\nTask 4 starts here!\n")
|
||||||
|
line_of_text=input_text.split('\n')
|
||||||
|
i=0
|
||||||
|
while i<len(line_of_text):
|
||||||
|
print("Line "+str(i+1)+": "+line_of_text[i])
|
||||||
|
i=i+1
|
||||||
|
|
||||||
|
# First alternative using a for loop
|
||||||
|
for i in range(0,len(line_of_text)):
|
||||||
|
print("Line "+str(i+1)+": "+line_of_text[i])
|
||||||
|
|
||||||
|
|
||||||
|
# Second alternative
|
||||||
|
# for ... in -> for each element of the list do ...
|
||||||
|
# line can be any name; it refers to the elements of the list
|
||||||
|
i=1
|
||||||
|
for line in line_of_text:
|
||||||
|
print("Line "+str(i)+": "+line)
|
||||||
|
i=i+1
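# Optional aside (an addition, not part of the original solution): Python's
# built-in enumerate() does the manual counting from the alternative above.
for line_number, line in enumerate(line_of_text, start=1):
    print("Line "+str(line_number)+": "+line)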
|
||||||
|
|
||||||
|
|
||||||
|
# Task 5: count 'good'
|
||||||
|
# Count how often the word 'good' appears in the text
|
||||||
|
print("\nTask 5 starts here!\n")
|
||||||
|
number_good=input_text.count('good')
|
||||||
|
print(number_good)
|
||||||
|
# you can write the command in a shorter format
|
||||||
|
print(input_text.count('good'))
|
||||||
|
|
||||||
|
# Task 6a
|
||||||
|
# Print lines with the word 'good'
|
||||||
|
print("\nTask 6a starts here!\n")
|
||||||
|
for i in range(len(line_of_text)):
|
||||||
|
if line_of_text[i].count('good')>=1:
|
||||||
|
print(line_of_text[i])
|
||||||
|
|
||||||
|
|
||||||
|
# Task 7
|
||||||
|
# Print lines that start with the word 'This'
|
||||||
|
print("\nTask 7 starts here!\n")
|
||||||
|
print("\n'This' with a capital T.\n")
|
||||||
|
for i in range(len(line_of_text)):
|
||||||
|
if line_of_text[i].startswith('This')>=1:
|
||||||
|
print(line_of_text[i])
|
||||||
|
|
||||||
|
print("\n'this' with a lower case t.\n")
|
||||||
|
for i in range(len(line_of_text)):
|
||||||
|
if line_of_text[i].startswith('this')>=1:
|
||||||
|
print(line_of_text[i])
|
||||||
|
|
||||||
|
print("Yes, the command is case sensitive (2 vs. 0 matches)!")
|
||||||
|
|
||||||
|
|
||||||
|
# Task 8
|
||||||
|
# Replace the word 'good' by 'excellent'
|
||||||
|
print("\nTask 8 starts here!\n")
|
||||||
|
new_text=input_text.replace("good","excellent")
|
||||||
|
print(new_text)
|
||||||
|
|
||||||
|
# For illustration only
|
||||||
|
print("\nFor illustation only\n")
|
||||||
|
for i in range(len(line_of_text)):
|
||||||
|
new_line_of_text=line_of_text[i].replace('good','excellent')
|
||||||
|
# print the new line IF there is a change.
|
||||||
|
if not new_line_of_text==line_of_text[i]:
|
||||||
|
print(new_line_of_text)
|
||||||
|
|
||||||
|
input_file.close()
|
||||||
|
output_file.close()
|
||||||
|
|
||||||
|
print("DONE")
|
|
@ -0,0 +1,72 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
Created on Wed Jul 29 09:21:46 2015
|
||||||
|
|
||||||
|
@author: Alexander Hillert, Goethe Uni Frankfurt
|
||||||
|
"""
|
||||||
|
import re
|
||||||
|
# Please adjust the directory to your machine.
|
||||||
|
directory="C:/Lehre/Textual Analysis/Programming/Files/"
|
||||||
|
|
||||||
|
# Open the txt file with the SEC filings
|
||||||
|
sec_filings_file=open(directory+'formidx_1998Q1.txt','r')
|
||||||
|
sec_filings_text=sec_filings_file.read()
|
||||||
|
|
||||||
|
# Create output file
|
||||||
|
output_file=open(directory+'SEC_Filings_Output.csv','w')
|
||||||
|
|
||||||
|
# Create first line with variable names
|
||||||
|
# I use semicolons as separator in csv files. You can also use any other symbol.
|
||||||
|
# However, you should make sure that the separator is not part of the data/text
|
||||||
|
# you write to the file.
|
||||||
|
# For example, it would be problematic if you use comma as separator and have
|
||||||
|
# company names like "AMERICAN HEALTHCORP, INC." or "AMERICAN FUNDING, INC."
|
||||||
|
output_file.write("Form_Type;Company_Name;CIK;Filing_Date;Link\n")
|
||||||
|
|
||||||
|
# Split the Input File in separate line
|
||||||
|
sec_filings_line=sec_filings_text.split("\n")
|
||||||
|
|
||||||
|
# Loop over all lines
|
||||||
|
for i in range(len(sec_filings_line)):
|
||||||
|
# Does the line refer to a form 10-K file?
|
||||||
|
# As pointed out by Loughran and McDonald (2011), many firms mislabelled
|
||||||
|
# their 10-K filings as 10-K405 filings. Thus, I included these filings
|
||||||
|
# as well.
|
||||||
|
# The condition below excludes amendments to 10-Ks ("10-K/A" and "10-K405/A").
|
||||||
|
# Depending on the research question at hand one could include amendments as well.
|
||||||
|
# Also, 10-KSB filings (small businesses) could be included.
|
||||||
|
|
||||||
|
match_10k=re.search("\A10-K( |405 )",sec_filings_line[i])
|
||||||
|
if match_10k:
|
||||||
|
|
||||||
|
#if sec_filings_line[i].startswith("10-K ")==1 or sec_filings_line[i].startswith("10-K405 ")==1:
|
||||||
|
# Split the line such that the information can be saved in separate
|
||||||
|
# variables
|
||||||
|
# Each information item has a fixed length in the overview files of the
|
||||||
|
# SEC.
|
||||||
|
# Filing type: position 1 to 12
|
||||||
|
# Remember Python starts counting at 0 and does not include the upper bound
|
||||||
|
filing_type=sec_filings_line[i][:12]
|
||||||
|
# Company name: position 13 to 74
|
||||||
|
company_name=sec_filings_line[i][12:74]
|
||||||
|
# CIK: position 75 to 86
|
||||||
|
cik=sec_filings_line[i][74:86]
|
||||||
|
# Filing date: position 87 to 98
|
||||||
|
filing_date=sec_filings_line[i][86:98]
|
||||||
|
# Link: position 99 to end of line
|
||||||
|
link=sec_filings_line[i][98:]
|
||||||
|
|
||||||
|
# Is the 10-K filed between March 10 and March 20?
|
||||||
|
# The filing date is in the format "YYYY-MM-DD" (e.g. "1998-03-31")
|
||||||
|
filing_day=filing_date[8:10]
|
||||||
|
filing_month=filing_date[5:7]
|
||||||
|
# Is the Filing Month March?
|
||||||
|
if int(filing_month)==3 and int(filing_day)>=10 and int(filing_day)<=20:
|
||||||
|
# The filing meets the conditions -->
|
||||||
|
# Write output to the csv file
|
||||||
|
output_file.write(filing_type+";"+company_name+";"+cik+";"+filing_date+";"+link+"\n")
|
||||||
|
|
||||||
|
sec_filings_file.close()
|
||||||
|
output_file.close()
|
||||||
|
|
||||||
|
print("DONE")
|
|
@ -0,0 +1,95 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
Created on Wed Jul 29 11:07:10 2015
|
||||||
|
|
||||||
|
@author: Alexander Hillert, Goethe Uni Frankfurt
|
||||||
|
"""
|
||||||
|
|
||||||
|
directory="C:/Lehre/Textual Analysis/Programming/Files/"
|
||||||
|
|
||||||
|
# We need the urllib package
|
||||||
|
import urllib.request
|
||||||
|
# To automatically create folders we need the os-module (OS: Operating System)
|
||||||
|
import os
|
||||||
|
|
||||||
|
|
||||||
|
# Define a user agent
|
||||||
|
# Information on user agents are from https://docs.python.org/3/howto/urllib2.html:
|
||||||
|
# "Some websites dislike being browsed by programs, or send different versions
|
||||||
|
# to different browsers. By default urllib identifies itself as Python-urllib/x.y
|
||||||
|
# (where x and y are the major and minor version numbers of the Python release,
|
||||||
|
# e.g. Python-urllib/2.5), which may confuse the site, or just plain not work.
|
||||||
|
# The way a browser identifies itself is through the User-Agent header.
|
||||||
|
opener = urllib.request.build_opener()
|
||||||
|
|
||||||
|
# The SEC recently rejected requests from Python-urllib/x.y user agent (see above)
|
||||||
|
# To still automatically download files, you have different options.
|
||||||
|
# I have listed three examples below but there are many more:
|
||||||
|
# For a comprehensive list see, e.g.:
|
||||||
|
# https://developers.whatismybrowser.com/useragents/explore/software_type_specific/web-browser/
|
||||||
|
#opener.addheaders = [('User-agent', 'Mozilla')]
|
||||||
|
#opener.addheaders = [('User-agent', 'Chrome')]
|
||||||
|
opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)')]
|
||||||
|
urllib.request.install_opener(opener)
|
||||||
|
|
||||||
|
|
||||||
|
# Open the csv file from part 1 of the problem
|
||||||
|
input_file=open(directory+'SEC_Filings_Output.csv','r')
|
||||||
|
input_text=input_file.read()
|
||||||
|
|
||||||
|
# Split the Input File in separate lines
|
||||||
|
input_text_line=input_text.split("\n")
|
||||||
|
# sometimes you have empty lines after a split command.
|
||||||
|
# You can remove them using the following command
|
||||||
|
while input_text_line.count("")>0:
|
||||||
|
input_text_line.remove("")
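# Optional aside (an addition, not part of the original solution): a list
# comprehension removes the empty elements in a single statement.
input_text_line=[line for line in input_text_line if line!=""]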
|
||||||
|
|
||||||
|
# Create a subfolder in which the 10-K filings are saved.
|
||||||
|
# When you download a large number of filings I recommend using subfolders for
|
||||||
|
# each year or even for each year-month combination.
|
||||||
|
# The option "exist_ok=True" makes sure that you do not get an error if the
|
||||||
|
# folder already exists.
|
||||||
|
os.makedirs(directory+"10-Ks/", exist_ok=True)
|
||||||
|
|
||||||
|
# Loop over all lines of the csv file
|
||||||
|
#for i in range(1,len(input_text_line)):
|
||||||
|
# To avoid having to download hundreds of files when we discuss the solution
|
||||||
|
# the loop stops at 20. (Remember the upper bound is not included.)
|
||||||
|
for i in range(1,21):
|
||||||
|
|
||||||
|
# split the line into the five variables
|
||||||
|
variables=input_text_line[i].split(";")
|
||||||
|
# We only need the cik and the link.
|
||||||
|
# The cik is the 3rd variable. However, the numbering of lists starts
|
||||||
|
# at zero -> 2nd item of the list "variables"
|
||||||
|
# The link is the 5th variable -> 4th item of the list "variables"
|
||||||
|
cik=variables[2]
|
||||||
|
#cik=cik.replace(" ","")
|
||||||
|
cik=cik.strip()
|
||||||
|
link=variables[4]
|
||||||
|
#link=link.replace(" ","")
|
||||||
|
link=link.strip()
|
||||||
|
|
||||||
|
# Find the filename
|
||||||
|
# The link consists of different parts:
|
||||||
|
# For example: edgar/data/1000753/0000950129-98-001035.txt
|
||||||
|
link_parts=link.split("/")
|
||||||
|
# 1st part: edgar
|
||||||
|
# 2nd part: data
|
||||||
|
# 3rd part: cik
|
||||||
|
# 4th part: file name -> 3rd item of the list
|
||||||
|
filename=link_parts[3]
|
||||||
|
###########################################################################
|
||||||
|
############################ WARNING ######################################
|
||||||
|
# The filename does NOT uniquely identify the SEC filings as different firms (CIKs)
|
||||||
|
# may use the same filename. Thus, when you only use the filename files
|
||||||
|
# might be overwritten. To avoid this problem you need to have a unique name.
|
||||||
|
# Combining CIK and filename results in a unique identifier, as the
|
||||||
|
# filename appears only once per firm (CIK).
|
||||||
|
# -> use the combination of CIK and filename: cik_filename
|
||||||
|
###########################################################################
|
||||||
|
urllib.request.urlretrieve("http://www.sec.gov/Archives/"+link,\
|
||||||
|
directory+"10-Ks/"+cik+"_"+filename)
|
||||||
|
|
||||||
|
input_file.close()
|
||||||
|
print("DONE")
|
|
@ -0,0 +1,144 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
Created on Tue Apr 12 15:50:22 2016
|
||||||
|
|
||||||
|
@author: Alexander Hillert, Goethe University Frankfurt
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Import regular expressions and BeautifulSoup
|
||||||
|
import re
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
# Please adjust the directory to your machine.
|
||||||
|
directory="C:/Lehre/Textual Analysis/Programming/Files/"
|
||||||
|
|
||||||
|
# Open the document
|
||||||
|
input_file=open(directory+'Exercise_4_Application_Regular_Expressions.txt','r',encoding="utf-8")
|
||||||
|
input_text=input_file.read()
|
||||||
|
|
||||||
|
#######################
|
||||||
|
# Task 1: remove tables
|
||||||
|
#######################
|
||||||
|
# Approach
|
||||||
|
# We search for tables until we find no more html tags that indicate the
|
||||||
|
# beginning of a table.
|
||||||
|
# Search for the start html-tag <TABLE>
|
||||||
|
table_match=re.search('<TABLE>', input_text)
|
||||||
|
print("This is the result of the re.search command:")
|
||||||
|
print(table_match)
|
||||||
|
while table_match:
|
||||||
|
# When we have identified a match, i.e. the start of a table, we save
|
||||||
|
# the position of the beginning of the table in the variable "start_table"
|
||||||
|
table_start_match=re.search('<TABLE>', input_text)
|
||||||
|
start_table=table_start_match.start()
|
||||||
|
# Next, we search for the corresponding html tag that indicates the end of
|
||||||
|
# the table and save the end position to the variable "end_table"
|
||||||
|
table_end_match=re.search('</TABLE>', input_text)
|
||||||
|
end_table=table_end_match.end()
|
||||||
|
|
||||||
|
# We can print the text between the start and end html tag to check whether
|
||||||
|
# the table has been identified correctly.
|
||||||
|
print("The text below is a table!\n"+input_text[start_table:end_table]+"\n")
|
||||||
|
|
||||||
|
# the text between the beginning and end of the html tags is the part which
|
||||||
|
# we would like to delete.
|
||||||
|
# Consequently, we keep the text before the beginning of the table as well
|
||||||
|
# as the text after the ending of the table.
|
||||||
|
input_text=input_text[:start_table]+input_text[end_table:]
|
||||||
|
# Next, we need to check whether there is another table in the rest of the
|
||||||
|
# text.
|
||||||
|
table_match=re.search('<TABLE>', input_text)
|
||||||
|
# As long as "table_match" exists, i.e. the regex results in a match, the loop
|
||||||
|
# will continue.
|
||||||
|
|
||||||
|
#########################
|
||||||
|
# Task 2: remove Exhibits
|
||||||
|
#########################
|
||||||
|
# Exhibits have the following structure
|
||||||
|
# <DOCUMENT>
|
||||||
|
# <TYPE>EX...
|
||||||
|
# ...
|
||||||
|
# </DOCUMENT>
|
||||||
|
exhibit_match=re.search('<TYPE>EX', input_text)
|
||||||
|
while exhibit_match:
|
||||||
|
exhibit_start_match=re.search('<TYPE>EX', input_text)
|
||||||
|
start_exhibit=exhibit_start_match.start()
|
||||||
|
# As the exhibits are at the end of the 10-K filing it would not be
|
||||||
|
# necessary to include an end position. We could also drop the entire text
|
||||||
|
# after "<TYPE>EX"
|
||||||
|
# It is important that we search for the </DOCUMENT> only after the exhibit
|
||||||
|
# started. Otherwise, we could get the end of the main document.
|
||||||
|
exhibit_end_match=re.search('</DOCUMENT>', input_text[start_exhibit:])
|
||||||
|
end_exhibit=start_exhibit+exhibit_end_match.end()
|
||||||
|
# Print the identified text to check whether the exhibit has been identified
|
||||||
|
# correctly
|
||||||
|
print("The text below is an exhibit!\n"+input_text[start_exhibit:end_exhibit]+"\n")
|
||||||
|
|
||||||
|
input_text=input_text[:start_exhibit]+input_text[end_exhibit:]
|
||||||
|
# Check whether there are further exhibits
|
||||||
|
exhibit_match=re.search('<TYPE>EX', input_text)
|
||||||
|
|
||||||
|
##########################
|
||||||
|
# Task 3: remove html code
|
||||||
|
##########################
|
||||||
|
# Alternative 1: remove html code without Beautiful Soup
|
||||||
|
text=re.sub('<[^>]{1,}>', '', input_text)
|
||||||
|
# This regex searches for a "<" followed by at least one character that must not
|
||||||
|
# equal > and is completed by >.
|
||||||
|
# You might have thought about using the following command
|
||||||
|
#text=re.sub('<.{1,}>', '', input_text)
|
||||||
|
# However, this command has a problem, as it would delete the following line
|
||||||
|
# entirely: <page> This is some text that should remain <page>
|
||||||
|
# The .{1,} would match 'page> This is some text that should remain <page', as
|
||||||
|
# regexes are greedy. The [^>]{1,} avoids this problem by not allowing > to be matched.
|
||||||
|
# Consequently, in the example only the two "<page>" would be deleted.
|
||||||
|
# You can verify this by using regex101.com (remember to check "Python" in the
|
||||||
|
# left menu of the webpage)
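# Illustration (an addition, not part of the original solution) of the greedy
# matching problem described above, using a made-up example line.
demo_line="<page> This is some text that should remain <page>"
print(re.sub('<.{1,}>', '', demo_line))     # greedy: the whole line is removed
print(re.sub('<[^>]{1,}>', '', demo_line))  # only the two <page> tags are removed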
|
||||||
|
|
||||||
|
# Alternative 2: remove html code using Beautiful Soup
|
||||||
|
html_text=BeautifulSoup(input_text, 'html.parser')
|
||||||
|
text=html_text.get_text()
|
||||||
|
|
||||||
|
########################
|
||||||
|
# Task 4: delete numbers
|
||||||
|
########################
|
||||||
|
# Alternative 1 - removing numbers step by step
|
||||||
|
# remove commas in numbers, e.g., 1,000 or 12,345,678 or 123,456,789,123,123
|
||||||
|
text=re.sub('[0-9]{1,3},([0-9]{3},){0,}[0-9]{3}','',text)
|
||||||
|
# remove dots in numbers, e.g., 34.56 or 12,345.678 (-> previous command leaves .678)
|
||||||
|
text=re.sub('[0-9]{0,}\.[0-9]{1,}','',text)
|
||||||
|
# remove the remaining numbers without commas and dots
|
||||||
|
text=re.sub('[0-9]','',text)
|
||||||
|
|
||||||
|
# Alternative 2 - removing numbers using a single regex
|
||||||
|
text=re.sub('[0-9]{1,}(,[0-9]{3}){0,}(\.[0-9]{1,}){0,1}','',text)
|
||||||
|
|
||||||
|
# Alternative 3 - removing numbers step by step but start with commas and dots
|
||||||
|
# 1. remove comma incl. the surrounding numbers
|
||||||
|
text=re.sub("[0-9],[0-9]","",text)
|
||||||
|
# 2. remove dots incl. the surrounding numbers
|
||||||
|
text=re.sub("[0-9]\.[0-9]","",text)
|
||||||
|
# 3. remove any remaining number
|
||||||
|
text=re.sub("[0-9]","",text)
|
||||||
|
|
||||||
|
|
||||||
|
########################
|
||||||
|
# Task 5: delete symbols
|
||||||
|
########################
|
||||||
|
# When analyzing tone, symbols do not matter, as they are not considered to be
|
||||||
|
# words and thus do not bias the total word count.
|
||||||
|
# However, for training purposes this task is included in the problem.
|
||||||
|
# There is no well-defined list of which symbols should be deleted. So, you
|
||||||
|
# can add further symbols.
|
||||||
|
text=re.sub('\(|\)|\[|\]|\$|§|%|\*|/|·|-',' ',text)
|
||||||
|
text=re.sub('[^a-zA-Z \.,\!\?\n]','',text)
|
||||||
|
|
||||||
|
# Open the output file for the pure text
|
||||||
|
output_file=open(directory+'Exercise_4_Application_Regular_Expressions_clean.txt','w',encoding="utf-8")
|
||||||
|
output_file.write(text)
|
||||||
|
|
||||||
|
input_file.close()
|
||||||
|
output_file.close()
|
||||||
|
|
||||||
|
print("DONE")
|
||||||
|
|
209
lectures/programming/solutions/Problem_5_Clean_SEC_Filing.py
Normal file
|
@ -0,0 +1,209 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
Created on Tue Apr 12 15:50:22 2016
|
||||||
|
|
||||||
|
@author: Alexander Hillert, Goethe University Frankfurt
|
||||||
|
"""
|
||||||
|
import re
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
# Please adjust the directory to your machine.
|
||||||
|
directory="C:/Lehre/Textual Analysis/Programming/Files/"
|
||||||
|
|
||||||
|
# Open the 10-K
|
||||||
|
input_file=open(directory+'0000950130-98-001359.txt','r',encoding='ascii',errors='ignore')
|
||||||
|
input_text=input_file.read()
|
||||||
|
|
||||||
|
################################
|
||||||
|
# Remove tables
|
||||||
|
# Same approach as in Problem 4
|
||||||
|
################################
|
||||||
|
# Sometimes it is helpful to print the text parts that are deleted. In this
|
||||||
|
# example, we will print the first two tables that we delete.
|
||||||
|
i=1
|
||||||
|
table_match=re.search('<TABLE>', input_text)
|
||||||
|
while table_match:
|
||||||
|
# Search for the beginning of the table
|
||||||
|
table_start_match=re.search('<TABLE>', input_text)
|
||||||
|
start_table=table_start_match.start()
|
||||||
|
# search for the end of the table
|
||||||
|
table_end_match=re.search('</TABLE>', input_text)
|
||||||
|
end_table=table_end_match.end()
|
||||||
|
# The if condition and the printing are just for illustrative purposes.
|
||||||
|
# The commands display the first two tables that are removed from the text.
|
||||||
|
if i<=2:
|
||||||
|
print("This is the "+str(i)+". Table in the 10-K.\n"+input_text[start_table:end_table]+"\n")
|
||||||
|
i=i+1
|
||||||
|
# remove the table
|
||||||
|
input_text=input_text[:start_table]+input_text[end_table:]
|
||||||
|
# check whether there are further tables
|
||||||
|
table_match=re.search('<TABLE>', input_text)
|
||||||
|
|
||||||
|
################################
|
||||||
|
# Remove exhibits
|
||||||
|
# Same approach as in Problem 4
|
||||||
|
################################
|
||||||
|
# Exhibits have the following structure
|
||||||
|
# <DOCUMENT>
|
||||||
|
# <TYPE>EX...
|
||||||
|
# ...
|
||||||
|
# </DOCUMENT>
|
||||||
|
# Sometimes it is helpful to print the text parts that are deleted. In this
|
||||||
|
# example, we will print the first exhibit that we delete.
|
||||||
|
i=1
|
||||||
|
exhibit_match=re.search('<TYPE>EX', input_text)
|
||||||
|
while exhibit_match:
|
||||||
|
# Search for the beginning of the exhibit
|
||||||
|
exhibit_start_match=re.search('<TYPE>EX', input_text)
|
||||||
|
start_exhibit=exhibit_start_match.start()
|
||||||
|
# Search for the end of the exhibit
|
||||||
|
# CAUTION: search only in the text after the beginning of the exhibt, as
|
||||||
|
# </DOCUMENT> also appears earlier (e.g. end of main document)
|
||||||
|
exhibit_end_match=re.search('</DOCUMENT>', input_text[start_exhibit:])
|
||||||
|
end_exhibit=start_exhibit+exhibit_end_match.end()
|
||||||
|
if i<=1:
|
||||||
|
print("This is the "+str(i)+". Exhibit in the 10-K.\n"+input_text[start_exhibit:end_exhibit]+"\n")
|
||||||
|
i=i+1
|
||||||
|
# remove exhibit
|
||||||
|
input_text=input_text[:start_exhibit]+input_text[end_exhibit:]
|
||||||
|
exhibit_match=re.search('<TYPE>EX', input_text)
|
||||||
|
|
||||||
|
##################
|
||||||
|
# Remove html code
|
||||||
|
##################
|
||||||
|
html_text=BeautifulSoup(input_text, 'html.parser')
|
||||||
|
text=html_text.get_text()
|
||||||
|
|
||||||
|
############################
|
||||||
|
# Remove the Document Header
|
||||||
|
############################
|
||||||
|
# There are different possibilities how one can define the start of the main part of the text
|
||||||
|
# In general, you should delete all text that is uninformative for your analysis.
|
||||||
|
# Alternative 1:
|
||||||
|
# Search for Table of Contents. To not mistakenly match a reference to the
|
||||||
|
# table of contents somewhere in the text, we require a linebreak before and after.
|
||||||
|
# When the "Table of Contents" is centered, there will be whitespaces or tabs
|
||||||
|
# before and potentially also after
|
||||||
|
header_match=re.search('(?i)\n[\t ]{0,}table[\t ]of[\t ]contents[\t ]{0,}\n', text)
|
||||||
|
# Alternative 2:
|
||||||
|
# Search for Documents incorporated by reference.
|
||||||
|
header_match=re.search('\n[\t ]{0,}DOCUMENTS[\t ]INCORPORATED[\t ]BY[\t ]REFERENCE[\t ]{0,}\n', text)
|
||||||
|
if header_match:
|
||||||
|
# Drop the document header and keep only the rest of the text after the header.
|
||||||
|
text=text[header_match.end():]
|
||||||
|
|
||||||
|
#################################################
|
||||||
|
# Delete the text in "PART IV"
|
||||||
|
# This procedure is optional. Look at "Part IV" and decide whether you favor
|
||||||
|
# the approach. I think that the part should be dropped, as it is just a list
|
||||||
|
# of exhibits, some mandatory text required by the SEC [indicated by the
|
||||||
|
# capital letters in the "SIGNATURES" section].
|
||||||
|
#################################################
|
||||||
|
|
||||||
|
'''
|
||||||
|
# Alternative 1: go over all matches but keep only the last one
|
||||||
|
for match in re.finditer('\s{2,}PART IV\s{0,}\n', text):
|
||||||
|
print("Hallo")
|
||||||
|
# match now contains the last match
|
||||||
|
# Delete the text after the last match
|
||||||
|
text=text[:match.start()]
|
||||||
|
|
||||||
|
|
||||||
|
# Alternative 2: save the positions of all matches (more general approach)
|
||||||
|
# to use alternative 2, you have to comment out Alternative 1!
|
||||||
|
# Otherwise line 104 will create a problem when you execute Alternative 2.
|
||||||
|
list_start_matches=[]
|
||||||
|
list_end_matches=[]
|
||||||
|
for match in re.finditer('\s{2,}PART IV\s{0,}\n', text):
|
||||||
|
print(match)
|
||||||
|
list_start_matches.append(match.start())
|
||||||
|
list_end_matches.append(match.end())
|
||||||
|
# Position of last match
|
||||||
|
print(list_start_matches[len(list_start_matches)-1])
|
||||||
|
print(list_end_matches[len(list_start_matches)-1])
|
||||||
|
|
||||||
|
|
||||||
|
# Alternative 3: manual coding using a loop of re.searches
|
||||||
|
# create a copy of the text that we can edit
|
||||||
|
text_check_part_IV=text
|
||||||
|
part_IV_match=re.search('\s{2,}PART IV\s{0,}\n', text_check_part_IV)
|
||||||
|
# create two lists that we can use to save the start and end positions
|
||||||
|
# of the Part IV matches
|
||||||
|
list_start_matches_v2=[]
|
||||||
|
list_end_matches_v2=[]
|
||||||
|
# variable to save the position of the last match in the overall text
|
||||||
|
end_position_previous_match=0
|
||||||
|
while part_IV_match:
|
||||||
|
start_position_match=end_position_previous_match+part_IV_match.start()
|
||||||
|
end_position_match=end_position_previous_match+part_IV_match.end()
|
||||||
|
|
||||||
|
list_start_matches_v2.append(start_position_match)
|
||||||
|
list_end_matches_v2.append(end_position_match)
|
||||||
|
|
||||||
|
# update the information on the end of the last match
|
||||||
|
end_position_previous_match=end_position_previous_match+part_IV_match.end()
|
||||||
|
|
||||||
|
text_check_part_IV=text_check_part_IV[part_IV_match.end():]
|
||||||
|
part_IV_match=re.search('\s{2,}PART IV\s{0,}\n', text_check_part_IV)
|
||||||
|
|
||||||
|
# when you compare list_end_matches to list_end_matches_v2, you see that the two
|
||||||
|
# approaches yield the same result.
|
||||||
|
# To double check that the approaches have the same results, you could
|
||||||
|
# replace the Regex in lines 112, 124, and 142 by "\s{2,}PART [A-Z]{1,3}\s{0,}\n".
|
||||||
|
# In these case you have more matches and so you can better check that the
|
||||||
|
# two approaches have identical outcomes.
|
||||||
|
'''
|
||||||
|
|
||||||
|
'''
|
||||||
|
# Delete the text after the last match
|
||||||
|
text=text[:list_start_matches[len(list_start_matches)-1]]
|
||||||
|
'''
|
||||||
|
|
||||||
|
# Delete item numbers
|
||||||
|
# This is optional. It removes "Item 1.", "ITEM 1.", "Item 10.", "Item 7A."
|
||||||
|
text=re.sub('(?i)Item [0-9]{1,}A{0,1}(\s|\.|:|\n)','',text)
|
||||||
|
|
||||||
|
# Delete numbers
|
||||||
|
text=re.sub('[0-9]{1,}(,[0-9]{3}){0,}(\.[0-9]{1,}){0,1}','',text)
|
||||||
|
|
||||||
|
# Alternative stepwise procedure to delete numbers
|
||||||
|
# remove commas in numbers, e.g., 1,000 or 12,345,678
|
||||||
|
text=re.sub('[0-9]{1,3},([0-9]{3},){0,}[0-9]{3}','',text)
|
||||||
|
# remove dots in numbers, e.g., 34.56 or 12,345.678 (-> previous command leaves .678)
|
||||||
|
text=re.sub('[0-9]{0,}\.[0-9]{1,}','',text)
|
||||||
|
# remove the remaining numbers without commas and dots
|
||||||
|
text=re.sub('[0-9]','',text)
|
||||||
|
|
||||||
|
|
||||||
|
# Hyphens can be used to indicate that the word is continued in the next
|
||||||
|
# line. For example, "Micro-\nsoft" (\n is the line feed).
|
||||||
|
# Delete hyphens that are followed by a line feed.
|
||||||
|
text=re.sub('-\n','',text)
|
||||||
|
|
||||||
|
# Replace symbols by a whitespace.
|
||||||
|
# Extra whitespaces are not a problem.
|
||||||
|
text=re.sub('\(|\)|\[|\]|\$|§|%|\*|/|·|-',' ',text)
|
||||||
|
|
||||||
|
# Delete dots and commas that are not part of sentences, i.e. commas and dots
|
||||||
|
# that are preceded by a line break (potentially also whitespaces and tabs)
|
||||||
|
# and that are followed by a line break (again, there may
|
||||||
|
# also be whitespaces and tabs).
|
||||||
|
text=re.sub('\n[\t ]{0,}(\.|,){1,}[\t ]{0,}\n','\n',text)
|
||||||
|
|
||||||
|
# Drop single-character words
|
||||||
|
# One can argue whether one should implement this procedure. Loughran and
|
||||||
|
# McDonald argue in one of their papers in favor of it.
|
||||||
|
# To make sure that there is just one letter, we require that there is a word
|
||||||
|
# boundary (\W) before and after. We use a positive lookbehind and a
|
||||||
|
# positive lookahead condition so that the word boundaries themselves
|
||||||
|
# do not get deleted as well.
|
||||||
|
text=re.sub('(?<=\W)[A-Za-z](?=\W)',' ',text)
|
||||||
|
|
||||||
|
|
||||||
|
# Open the output file for the pure text
|
||||||
|
output_file=open(directory+'0000950130-98-001359_clean.txt','w',encoding='ascii',errors='ignore')
|
||||||
|
output_file.write(text)
|
||||||
|
|
||||||
|
input_file.close()
|
||||||
|
output_file.close()
|
||||||
|
print("COMPLETED.")
|
356
lectures/programming/solutions/Problem_6_Clean_10-K_Sample.py
Normal file
|
@ -0,0 +1,356 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
Created on Wed Jul 29 11:07:10 2015
|
||||||
|
|
||||||
|
@author: Alexander Hillert, Goethe University Frankfurt
|
||||||
|
"""
|
||||||
|
|
||||||
|
import re
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
directory="C:/Lehre/Textual Analysis/Programming/Files/"
|
||||||
|
|
||||||
|
# Open the csv file containing the list of the 200 10-Ks
|
||||||
|
input_file=open(directory+'10-K_Sample_2011Q1_Input.csv','r')
|
||||||
|
input_text=input_file.read()
|
||||||
|
|
||||||
|
# Split the input file in separate lines
|
||||||
|
input_text_line=input_text.split("\n")
|
||||||
|
|
||||||
|
# In general, there can be empty lines in the input file. The following command
|
||||||
|
# deletes these lines
|
||||||
|
while input_text_line.count("")>0:
|
||||||
|
input_text_line.remove("")
|
||||||
|
|
||||||
|
print("The input file contains "+str(len(input_text_line)-1)+" non-empty lines with data.")
|
||||||
|
# We subtract 1 from the length, as the first line contains the variable names but not data.
|
||||||
|
|
||||||
|
# Loop over all lines
|
||||||
|
for i in range(1,len(input_text_line)):
|
||||||
|
# To see the progress of your program you can print the number of iteration.
|
||||||
|
print(str(i))
|
||||||
|
|
||||||
|
# split the lines of the CSV-file into the two variables
|
||||||
|
variables=input_text_line[i].split(";")
|
||||||
|
# We need the CIK and the filename to open the file
|
||||||
|
cik=variables[0]
|
||||||
|
filename=variables[1]
|
||||||
|
|
||||||
|
# Open the ith 10-K in the list
|
||||||
|
input_file_10_k=open(directory+'10-K_Sample/'+cik+'_'+filename,'r',encoding='ascii',errors='ignore')
|
||||||
|
input_text_10_k=input_file_10_k.read()
|
||||||
|
|
||||||
|
# the new file name should be "old_name_clean" -> we have to replace ".txt"
|
||||||
|
# by "_clean.txt"
|
||||||
|
filename=filename.replace('.txt','_clean.txt')
|
||||||
|
|
||||||
|
# Remove tables
|
||||||
|
variable=re.search('<TABLE>', input_text_10_k)
|
||||||
|
while variable:
|
||||||
|
variable=re.search('<TABLE>', input_text_10_k)
|
||||||
|
start_table=variable.start()
|
||||||
|
variable=re.search('</TABLE>', input_text_10_k)
|
||||||
|
end_table=variable.end()
|
||||||
|
input_text_10_k=input_text_10_k[:(start_table)]+input_text_10_k[(end_table):]
|
||||||
|
variable=re.search('<TABLE>', input_text_10_k)
|
||||||
|
|
||||||
|
|
||||||
|
####################### Begin of exhibits removal #########################
|
||||||
|
# Exhibits have the following structure
|
||||||
|
# <DOCUMENT>
|
||||||
|
# <TYPE>EX...
|
||||||
|
# ...
|
||||||
|
# </DOCUMENT>
|
||||||
|
# In the recent years, there are also exhibits with <TYPE>EXCEL
|
||||||
|
# -> as we search for "<TYPE>EX", the loop will delete <TYPE>EXCEL exhibits, too.
|
||||||
|
variable=re.search('<TYPE>EX', input_text_10_k)
|
||||||
|
while variable:
|
||||||
|
variable=re.search('<TYPE>EX', input_text_10_k)
|
||||||
|
start_exhibit=variable.start()
|
||||||
|
variable=re.search('</DOCUMENT>', input_text_10_k[start_exhibit:])
|
||||||
|
end_exhibit=start_exhibit+variable.end()
|
||||||
|
input_text_10_k=input_text_10_k[:(start_exhibit)]+input_text_10_k[(end_exhibit):]
|
||||||
|
variable=re.search('<TYPE>EX', input_text_10_k)
|
||||||
|
|
||||||
|
# In recent years, there are also XML-Exhibits.
|
||||||
|
# CAUTION: These are <TYPE>XML and not <TYPE>EX -> need separate cleaning
|
||||||
|
# Remove XML-Exhibits, which have the following structure
|
||||||
|
# <DOCUMENT>
|
||||||
|
# <TYPE>XML
|
||||||
|
# ...
|
||||||
|
# </DOCUMENT>
|
||||||
|
variable=re.search('<TYPE>XML', input_text_10_k)
|
||||||
|
while variable:
|
||||||
|
variable=re.search('<TYPE>XML', input_text_10_k)
|
||||||
|
start_exhibit=variable.start()
|
||||||
|
variable=re.search('</DOCUMENT>', input_text_10_k[start_exhibit:])
|
||||||
|
end_exhibit=start_exhibit+variable.end()
|
||||||
|
input_text_10_k=input_text_10_k[:start_exhibit]+input_text_10_k[end_exhibit:]
|
||||||
|
variable=re.search('<TYPE>XML', input_text_10_k)
|
||||||
|
|
||||||
|
# Furthermore, in recent years there are also ZIP-Exhibits.
|
||||||
|
# CAUTION: These are <TYPE>ZIP and not <TYPE>EX -> need separate cleaning
|
||||||
|
# Remove ZIP-Exhibits, which have the following structure
|
||||||
|
# <DOCUMENT>
|
||||||
|
# <TYPE>ZIP
|
||||||
|
# ...
|
||||||
|
# </DOCUMENT>
|
||||||
|
variable=re.search('<TYPE>ZIP', input_text_10_k)
|
||||||
|
while variable:
|
||||||
|
variable=re.search('<TYPE>ZIP', input_text_10_k)
|
||||||
|
start_exhibit=variable.start()
|
||||||
|
variable=re.search('</DOCUMENT>', input_text_10_k[start_exhibit:])
|
||||||
|
end_exhibit=start_exhibit+variable.end()
|
||||||
|
input_text_10_k=input_text_10_k[:start_exhibit]+input_text_10_k[end_exhibit:]
|
||||||
|
variable=re.search('<TYPE>ZIP', input_text_10_k)
|
||||||
|
|
||||||
|
# In addition, there are many Graphic-Exhibits.
|
||||||
|
# CAUTION: These are <TYPE>GRAPHIC and not <TYPE>EX -> need separate cleaning
|
||||||
|
# Remove GRAPHIC-Exhibits, which have the following structure
|
||||||
|
# <DOCUMENT>
|
||||||
|
# <TYPE>GRAPHIC
|
||||||
|
# ...
|
||||||
|
# </DOCUMENT>
|
||||||
|
variable=re.search('<TYPE>GRAPHIC', input_text_10_k)
|
||||||
|
while variable:
|
||||||
|
variable=re.search('<TYPE>GRAPHIC', input_text_10_k)
|
||||||
|
start_exhibit=variable.start()
|
||||||
|
variable=re.search('</DOCUMENT>', input_text_10_k[start_exhibit:])
|
||||||
|
end_exhibit=start_exhibit+variable.end()
|
||||||
|
input_text_10_k=input_text_10_k[:start_exhibit]+input_text_10_k[end_exhibit:]
|
||||||
|
variable=re.search('<TYPE>GRAPHIC', input_text_10_k)
|
||||||
|
|
||||||
|
# Furthermore, there can also be Cover-Exhibits.
|
||||||
|
# CAUTION: These are <TYPE>COVER and not <TYPE>EX -> need separate cleaning
|
||||||
|
# Remove COVER-Exhibits, which have the following structure
|
||||||
|
# <DOCUMENT>
|
||||||
|
# <TYPE>COVER
|
||||||
|
# ...
|
||||||
|
# </DOCUMENT>
|
||||||
|
variable=re.search('<TYPE>COVER', input_text_10_k)
|
||||||
|
while variable:
|
||||||
|
variable=re.search('<TYPE>COVER', input_text_10_k)
|
||||||
|
start_exhibit=variable.start()
|
||||||
|
variable=re.search('</DOCUMENT>', input_text_10_k[start_exhibit:])
|
||||||
|
end_exhibit=start_exhibit+variable.end()
|
||||||
|
input_text_10_k=input_text_10_k[:start_exhibit]+input_text_10_k[end_exhibit:]
|
||||||
|
variable=re.search('<TYPE>COVER', input_text_10_k)
|
||||||
|
|
||||||
|
# Furthermore, there can also be PDF files attached.
|
||||||
|
# These attachments caused BeautifulSoup to crash on some computers.
|
||||||
|
# Remove PDFs
|
||||||
|
variable=re.search('<PDF>', input_text_10_k)
|
||||||
|
while variable:
|
||||||
|
variable=re.search('<PDF>', input_text_10_k)
|
||||||
|
start_pdf=variable.start()
|
||||||
|
variable=re.search('</PDF>', input_text_10_k[start_pdf:])
|
||||||
|
end_pdf=start_pdf+variable.end()
|
||||||
|
input_text_10_k=input_text_10_k[:(start_pdf)]+input_text_10_k[(end_pdf):]
|
||||||
|
variable=re.search('<PDF>', input_text_10_k)
|
||||||
|
|
||||||
|
######################## End of exhibits removal ##########################
|
||||||
|
|
||||||
|
# Remove the Document Header - PART I
|
||||||
|
# This condition should work for all 10-K filings as the hmtl tags "<SEC-HEADER>"
|
||||||
|
# and "</SEC-HEADER>" are mandatory for all filings.
|
||||||
|
variable=re.search('</SEC-HEADER>', input_text_10_k)
|
||||||
|
if variable:
|
||||||
|
input_text_10_k=input_text_10_k[variable.end():]
|
||||||
|
|
||||||
|
|
||||||
|
# In some filings, firms do not use line feeds \n but <div> and </div>
|
||||||
|
# instead to indicate the start and the end of sentences.
|
||||||
|
# "Dieses allgemeine Element bewirkt nichts weiter als dass es in einer
|
||||||
|
# line of the running text."
|
||||||
|
# see https://wiki.selfhtml.org/wiki/HTML/Textstrukturierung/div
|
||||||
|
# and
|
||||||
|
# "The <div> tag defines a division or a section in an HTML document.
|
||||||
|
# By default, browsers always place a line break before and after the <div> element."
|
||||||
|
# See: https://www.w3schools.com/tags/tag_div.asp
|
||||||
|
# It is important to replace <div> and </div> by linefeeds because otherwise
|
||||||
|
# the entire text will be in a single line and the subsequent commands do
|
||||||
|
# not work properly.
|
||||||
|
input_text_10_k=input_text_10_k.replace("<div>", "\n")
|
||||||
|
input_text_10_k=input_text_10_k.replace("</div>", "\n")
|
||||||
|
|
||||||
|
|
||||||
|
# Remove html code
|
||||||
|
html_text=BeautifulSoup(input_text_10_k, 'html.parser')
|
||||||
|
text=html_text.get_text()
|
||||||
|
|
||||||
|
|
||||||
|
# To get an idea of what the commands below are doing, it is helpful to
|
||||||
|
# write the current version of the text to a file and then compare it to the
|
||||||
|
# final file.
|
||||||
|
filename2=filename.replace('_clean.txt','_without_HtmlTablesExhibits.txt')
|
||||||
|
# Open the output file for the text without html code and without tables+exhibits
|
||||||
|
output_file_10_k=open(directory+'10-K_Sample/'+cik+'_'+filename2,'w',encoding='ascii',errors='ignore')
|
||||||
|
output_file_10_k.write(text)
|
||||||
|
output_file_10_k.close()
|
||||||
|
|
||||||
|
|
||||||
|
# Remove the Document Header - PART II
|
||||||
|
# The above command to remove the header ("</SEC-HEADER>") does not capture
|
||||||
|
# the entire header -> we need to delete further parts at the top of the filing.
|
||||||
|
# WARNING: The filters below may be specific to this sample of 10-Ks.
|
||||||
|
# Some firms have line breaks instead of whitespaces -> use "[ \n]" and not just " ".
|
||||||
|
variable=re.search('(?i)\n {0,}DOCUMENTS[ \n]INCORPORATED[ \n]BY[ \n]REFERENCE {0,}\n', text)
|
||||||
|
if variable:
|
||||||
|
text=text[variable.end():]
|
||||||
|
else:
|
||||||
|
variable=re.search('(?i)\n {0,}table of contents {0,}\n', text)
|
||||||
|
if variable:
|
||||||
|
text=text[variable.end():]
|
||||||
|
else:
|
||||||
|
variable=re.search('(?i)\n {0,}Indicate the number of shares outstanding\.{1,}', text)
|
||||||
|
if variable:
|
||||||
|
text=text[variable.end():]
|
||||||
|
else:
|
||||||
|
variable=re.search('(?i)may be deemed “forwardlooking statements”\.{1,}', text)
|
||||||
|
if variable:
|
||||||
|
text=text[variable.end():]
|
||||||
|
else:
|
||||||
|
variable=re.search('\nPART\.{1,}', text)
|
||||||
|
if variable:
|
||||||
|
text=text[variable.end():]
|
||||||
|
|
||||||
|
|
||||||
|
# Delete Item numbers
|
||||||
|
text=re.sub('(?i)Item {1,}[0-9]{1,}(A|B){0,1}(\s|\.|:|\n)','',text)
|
||||||
|
# Delete Part numbers
|
||||||
|
text=re.sub('(?i)Part (1|2|3|4|III|II|I|IV)','',text)
|
||||||
|
|
||||||
|
# Delete numbers:
|
||||||
|
text=re.sub('[0-9]{1,}(,[0-9]{3}){0,}(\.[0-9]{1,}){0,1}','',text)
|
||||||
|
|
||||||
|
# File names, e.g., exhibit.pdf or picture.jpg, should be removed
|
||||||
|
text=re.sub("[ |\n]\S{1,}\.(pdf|htm|html|doc|jpg|txt|xml)(?=[ \n\.\?!])", "", text)
|
||||||
|
|
||||||
|
# URLs --> Remove internet addresses
|
||||||
|
text=re.sub("http:/{0,2}", "", text)
|
||||||
|
text=re.sub("www\..{1,}\.[a-z]{2,4}(?=[ \n\.\?!])", "", text)
|
||||||
|
|
||||||
|
|
||||||
|
# In Part 4 of the programming chapter, we will determine the number of
|
||||||
|
# words per sentence. To be able to use the same underlying sample,
|
||||||
|
# we need to implement further corrections. These changes do not affect
|
||||||
|
# the percentage of negative/positive/etc. words.
|
||||||
|
# --> Only relevant for determining the number of sentences
|
||||||
|
# The text contains dots that do not indicate the end of a sentence.
|
||||||
|
# E.g., "Inc." and "St."
|
||||||
|
# The hyphen in the leading group of the patterns below occurs in cases like "non-U.S.".
|
||||||
|
# Replace or remove specific abbreviations
|
||||||
|
# This list is incomplete. In a research project you should spend more time
|
||||||
|
# on editing the data.
|
||||||
|
text=re.sub("(?i)(-|\s|\A|,)Inc\.", " Inc", text)
|
||||||
|
text=re.sub("(?i)(-|\s|\A|,)Corp\.", " Corp", text)
|
||||||
|
text=re.sub("(?i)(-|\s|\A|,)Ltd\.", " Ltd", text)
|
||||||
|
text=re.sub("(?i)(-|\s|\A|,)Co\.", " Co", text)
|
||||||
|
text=re.sub("(?i)(-|\s|\A|,)S\.A\.", " SA", text)
|
||||||
|
text=re.sub("(?i)(-|\s|\A|,)U\.S\.", " US", text)
|
||||||
|
text=re.sub("(?i)(-|\s|\A|,)Ms\.", " Ms", text)
|
||||||
|
text=re.sub("(?i)(-|\s|\A|,)Mr\.", " Mr", text)
|
||||||
|
text=re.sub("(?i)(-|\s|\A|,)No\.", " Number", text)
|
||||||
|
text=re.sub("(?i)(-|\s|\A|,)v\.s\.", " vs", text)
|
||||||
|
text=re.sub("(?i)(-|\s|\A|,)St\.", " ", text)
|
||||||
|
text=re.sub("(?i)(-|\s|\A|,)Jr\.", " ", text)
|
||||||
|
|
||||||
|
text=re.sub("(?i)(\s|\A|,)Jan\.", " January", text)
|
||||||
|
text=re.sub("(?i)(\s|\A|,)Feb\.", " February", text)
|
||||||
|
text=re.sub("(?i)(\s|\A|,)Mar\.", " March", text)
|
||||||
|
text=re.sub("(?i)(\s|\A|,)Apr\.", " April", text)
|
||||||
|
text=re.sub("(?i)(\s|\A|,)May\.", " May", text)
|
||||||
|
text=re.sub("(?i)(\s|\A|,)Jun\.", " June", text)
|
||||||
|
text=re.sub("(?i)(\s|\A|,)Jul\.", " July", text)
|
||||||
|
text=re.sub("(?i)(\s|\A|,)Aug\.", " August", text)
|
||||||
|
text=re.sub("(?i)(\s|\A|,)Sep\.", " September", text)
|
||||||
|
text=re.sub("(?i)(\s|\A|,)Oct\.", " October", text)
|
||||||
|
text=re.sub("(?i)(\s|\A|,)Nov\.", " November", text)
|
||||||
|
text=re.sub("(?i)(\s|\A|,)Dec\.", " December", text)
|
||||||
|
|
||||||
|
# The sequence capital letter -> dot -> capital letter -> dot indicates an abbreviation
|
||||||
|
# Three repetitions of capital letter and dot are also common in filings;
|
||||||
|
# we need to check for three instances first.
|
||||||
|
text=re.sub("( |\n|,)[A-Z]\.[A-Z]\.[A-Z]\.", " ", text)
|
||||||
|
# now check for two instances
|
||||||
|
text=re.sub("( |\n|,)[A-Z]\.[A-Z]\.", " ", text)
|
||||||
|
|
||||||
|
# Dots after a single letter can indicate a middle name, e.g., Paul J. Smith,
|
||||||
|
# or an abbreviation --> also delete these.
|
||||||
|
text=re.sub("( |\n|,)[A-Z]\.", "", text)
|
||||||
|
|
||||||
|
|
||||||
|
# Hyphens can be used to indicate that the word is continued in the next
|
||||||
|
# line. For example, "Micro-\nsoft" (\n is the line feed).
|
||||||
|
# Replace hyphens followed by a line feed by a hyphen without line feed
|
||||||
|
text=re.sub('-\n','-',text)
|
||||||
|
|
||||||
|
# Delete the minus/hyphens
|
||||||
|
# "Short-term" -> "shortterm"
|
||||||
|
text=re.sub('-','',text)
|
||||||
|
|
||||||
|
|
||||||
|
# --> Only relevant for determining the number of sentences
|
||||||
|
# Delete dots and commas that are not part of sentences, i.e. commas and dots
|
||||||
|
# that are preceded by whitespace or line break and that are followed by
|
||||||
|
# whitespace or line break.
|
||||||
|
text=re.sub('\n(\.|,)\n','\n',text)
|
||||||
|
text=re.sub(' (\.|,) ',' ',text)
|
||||||
|
|
||||||
|
# Delete single character words
|
||||||
|
# One can argue whether one should implement this procedure. Loughran and
|
||||||
|
# McDonald argue in one of their papers in favor of it.
|
||||||
|
# To make sure that there is just one letter, we require that there is a word
|
||||||
|
# boundary (\W) before and after. We use a positive lookbehind and a
|
||||||
|
# positive lookahead condition so that the word boundaries themselves
|
||||||
|
# do not get deleted as well.
|
||||||
|
text=re.sub('(?i)(?<=\W)[a-z](?=\W)',' ',text)
|
||||||
|
|
||||||
|
|
||||||
|
# There are sentences that are in upper case letters. However, these are not
|
||||||
|
# "real" sentences. Examples: "RESTRICTIONS ON TRANSFER OF NOTE."
|
||||||
|
# or "THIS NOTE AND THE RIGHTS AND OBLIGATIONS EVIDENCED HEREBY ARE
|
||||||
|
# SUBORDINATED TO THE PRIOR PAYMENT OF CERTAIN OBLIGATIONS [...]"
|
||||||
|
# We save the edited text in a new variable
|
||||||
|
text_edited=text
|
||||||
|
# Split text in sentences
|
||||||
|
list_sentences=re.split('\.|!|\?', text)
|
||||||
|
# iterate the list of all sentences
|
||||||
|
for j in range(0,len(list_sentences)):
|
||||||
|
# Determine the number of upper case letters
|
||||||
|
upper_letters=len(re.findall('[A-Z]',list_sentences[j]))
|
||||||
|
# Determine the number of all letters
|
||||||
|
total_letters=len(re.findall('[A-Za-z]',list_sentences[j]))
|
||||||
|
# If there is at least one letter calculate the fraction of upper case letters
|
||||||
|
if total_letters>0:
|
||||||
|
ratio=upper_letters/total_letters
|
||||||
|
# If the fraction of upper case letters is larger than 0.9 delete
|
||||||
|
# the sentence from the text.
|
||||||
|
if ratio>0.9:
|
||||||
|
text_edited=text_edited.replace(list_sentences[j]+'.','')
|
||||||
|
text_edited=text_edited.replace(list_sentences[j]+'!','')
|
||||||
|
text_edited=text_edited.replace(list_sentences[j]+'?','')
|
||||||
|
|
||||||
|
|
||||||
|
# --> Only relevant for determining the number of sentences
|
||||||
|
# There are a few cases where a dot follows a dot or where a linefeed
|
||||||
|
# separates two dots. --> delete the second dot.
|
||||||
|
text_edited=text_edited.replace('..','.')
|
||||||
|
text_edited=text_edited.replace('.\n.','.')
|
||||||
|
|
||||||
|
# The following commands do not influence the subsequent textual analysis.
|
||||||
|
# The only purpose is to display the output in a nicer format.
|
||||||
|
# Replace lines that contain only whitespaces by a line feed.
|
||||||
|
text_edited=re.sub('\n {1,}\n','\n',text_edited)
|
||||||
|
|
||||||
|
# Replace multiple line feeds by one line feed.
|
||||||
|
text_edited=re.sub('\n{2,}','\n',text_edited)
|
||||||
|
|
||||||
|
|
||||||
|
# Open the output file for the pure text
|
||||||
|
output_file_10_k=open(directory+'10-K_Sample/'+cik+'_'+filename,'w',encoding='ascii',errors='ignore')
|
||||||
|
output_file_10_k.write(text_edited)
|
||||||
|
output_file_10_k.close()
|
||||||
|
input_file_10_k.close()
|
||||||
|
|
||||||
|
input_file.close()
|
114
lectures/programming/solutions/Problem_7_Tone_Analysis.py
Normal file
|
@ -0,0 +1,114 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
Created on Wed Apr 13 22:43:32 2016
|
||||||
|
|
||||||
|
@author: Alexander Hillert, Goethe University Frankfurt
|
||||||
|
"""
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
directory="C:/Lehre/Textual Analysis/Programming/Files/"
|
||||||
|
|
||||||
|
# Open the dictionary
|
||||||
|
# The dictionary has been obtained from Bill McDonald's webpage
|
||||||
|
# http://www3.nd.edu/~mcdonald/Word_Lists.html
|
||||||
|
# --> LoughranMcDonald_MasterDictionary_2014.xlsx
|
||||||
|
# --> select negative words and copy them to a txt file
|
||||||
|
file_word_list=open(directory+'LMD_Neg.txt','r',encoding="utf-8")
|
||||||
|
word_list=file_word_list.read()
|
||||||
|
# The LMD words are all in upper case
|
||||||
|
word_list=word_list.lower()
|
||||||
|
negative_words=word_list.split('\n')
|
||||||
|
|
||||||
|
# Open the csv file containing the list of the 200 10-Ks
|
||||||
|
input_file=open(directory+'10-K_Sample_2011Q1_Input.csv','r',encoding="utf-8")
|
||||||
|
input_text=input_file.read()
|
||||||
|
|
||||||
|
# Split the input file in separate lines
|
||||||
|
input_text_line=input_text.split("\n")
|
||||||
|
|
||||||
|
# In general, there can be empty lines in the input file. The following command
|
||||||
|
# deletes these lines.
|
||||||
|
while input_text_line.count("")>0:
|
||||||
|
input_text_line.remove("")
|
||||||
|
|
||||||
|
# Create output file
|
||||||
|
output_file=open(directory+'10-K_Sample_2011Q1_Output_Negative_Tone.csv','w',encoding="utf-8")
|
||||||
|
# Write variable names to the first line of the output file
|
||||||
|
output_file.write('CIK;Filename;Number_Words;Number_Negative_Words;\
|
||||||
|
Percentage_Negative_Words\n')
|
||||||
|
|
||||||
|
# Loop over all lines of the csv file
|
||||||
|
for i in range(1,len(input_text_line)):
|
||||||
|
#for i in range(1,10):
|
||||||
|
# If the execution of your scripts takes some time, printing the loop iterator
|
||||||
|
# gives you an impression of the overall progress made.
|
||||||
|
print(str(i))
|
||||||
|
|
||||||
|
# split the line into the two variables
|
||||||
|
variables=input_text_line[i].split(";")
|
||||||
|
# We need the CIK (1st column) and the filename (2nd column)
|
||||||
|
cik=variables[0]
|
||||||
|
filename=variables[1]
|
||||||
|
|
||||||
|
# modify file name to open the edited files
|
||||||
|
filename=filename.replace('.txt','')
|
||||||
|
# Open the ith 10-Ks in the list
|
||||||
|
input_file_10_k=open(directory+'10-K_Sample/'+cik+'_'+filename+'_clean.txt','r',\
|
||||||
|
encoding='ascii',errors='ignore')
|
||||||
|
# if the command above does not work (error like "file not found" or "directory not found")
|
||||||
|
# please use the following command:
|
||||||
|
#input_file_10_k=open(directory+'10-K_Sample_clean/'+cik+'_'+filename+'_clean.txt','r',encoding='ascii',errors='ignore')
|
||||||
|
input_text_10_k=input_file_10_k.read()
|
||||||
|
|
||||||
|
# Use lower case letters, too
|
||||||
|
# It is important that the formatting (lower case vs. upper case) of the word list
|
||||||
|
# and the document is identical. Remember that you have typically lower and upper case
|
||||||
|
# letters in documents -> modify text.
|
||||||
|
text=input_text_10_k.lower()
|
||||||
|
|
||||||
|
# Split the text in single words to determine the total number of words
|
||||||
|
# \W is a non-word character: "Matches any character which is not a Unicode
|
||||||
|
# word character." (Python documentation)
|
||||||
|
# this is equivalent to [^a-zA-Z0-9_], i.e. no lower case letters, no upper
|
||||||
|
# case letters, no numbers, and no underscore.
|
||||||
|
list_of_words=re.split('\W{1,}', text)
|
||||||
|
# to make sure that empty list elements do not bias the word count, we delete them.
|
||||||
|
while list_of_words.count("")>0:
|
||||||
|
list_of_words.remove("")
|
||||||
|
# It is important that you treat multiple "\W" as one. Otherwise you are left
|
||||||
|
# with elements in the list that are not actual words.
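# Illustration (an addition, not part of the original solution):
# re.split('\W', "word1,  word2")     -> ['word1', '', '', 'word2']
# re.split('\W{1,}', "word1,  word2") -> ['word1', 'word2']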
|
||||||
|
|
||||||
|
# Determine the total number of words
|
||||||
|
word_count=len(list_of_words)
|
||||||
|
|
||||||
|
# Reset the number of negative words to zero
|
||||||
|
negative_count=0
|
||||||
|
# For each negative word, count the number of occurrences
|
||||||
|
for j in range(len(negative_words)):
|
||||||
|
# the command "list_of_words.count(negative_words[i])" only matches if there
|
||||||
|
# is exact overlap between the ith negative word and the words in the list.
|
||||||
|
# For example the following two commands:
|
||||||
|
# list_of_words=["abandon","abandoned","abandonment"]
|
||||||
|
# list_of_words.count("abandon")
|
||||||
|
# yields 1 match
|
||||||
|
# In contrast,
|
||||||
|
# text_of_words="abandon abandoned abandonment"
|
||||||
|
# text_of_words.count("abandon")
|
||||||
|
# yields 3. Thus, you have to split the text to individual words!!!
|
||||||
|
negative_count=negative_count+list_of_words.count(negative_words[j])
|
||||||
|
|
||||||
|
# Get the percentage of negative words
|
||||||
|
percentage_negative=negative_count/word_count
|
||||||
|
|
||||||
|
# Write cik, file name, total number of words, number of negative words,
|
||||||
|
# and the percentage of negative words to output file.
|
||||||
|
output_file.write(cik+';'+filename+'_clean.txt;'+str(word_count)+';'\
|
||||||
|
+str(negative_count)+';'+str(percentage_negative)+'\n')
|
||||||
|
|
||||||
|
# Close filings
|
||||||
|
input_file_10_k.close()
|
||||||
|
|
||||||
|
print("Finished")
|
||||||
|
output_file.close()
|
||||||
|
input_file.close()
|
|
@ -0,0 +1,130 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
Created on Wed Apr 13 22:43:32 2016
|
||||||
|
|
||||||
|
@author: Alexander Hillert
|
||||||
|
"""
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
directory="C:/Lehre/Textual Analysis/Programming/Files/"
|
||||||
|
|
||||||
|
# Open the dictionary
|
||||||
|
# The dictionary is obtained from Bill McDonald's webpage
|
||||||
|
# http://www3.nd.edu/~mcdonald/Word_Lists.html
|
||||||
|
# --> LoughranMcDonald_MasterDictionary_2014.xlsx
|
||||||
|
# --> select positive words and copy them to a txt file
|
||||||
|
file_word_list=open(directory+'LMD_Pos.txt','r',encoding="utf-8")
|
||||||
|
word_list=file_word_list.read()
|
||||||
|
word_list=word_list.lower()
|
||||||
|
positive_words=word_list.split()
|
||||||
|
|
||||||
|
# Open the csv file containing the list of the 200 10-Ks
|
||||||
|
input_file=open(directory+'10-K_Sample_2011Q1_Input.csv','r',encoding="utf-8")
|
||||||
|
input_text=input_file.read()
|
||||||
|
|
||||||
|
# Split the Input File in separate lines
|
||||||
|
input_text_line=input_text.split("\n")
|
||||||
|
|
||||||
|
# In general, there can be empty lines in the input file. The following command
|
||||||
|
# deletes these lines.
|
||||||
|
while input_text_line.count("")>0:
|
||||||
|
input_text_line.remove("")
|
||||||
|
|
||||||
|
# Create output file
|
||||||
|
output_file=open(directory+'10-K_Sample_2011Q1_Output_Positive_Tone.csv','w',encoding="utf-8")
|
||||||
|
# Write variable names to the first line of the output file
|
||||||
|
output_file.write('CIK;Filename;Number_Words;Number_Pos_Words;Number_Pos_Words_adj;'\
|
||||||
|
+'Percent_Pos_Words;Percent_Pos_Words_adj\n')
|
||||||
|
|
||||||
|
# Iterate the list of the 200 10-K filings
|
||||||
|
# the last line is empty --> loop only up to len()-1
|
||||||
|
#for i in range(1,len(input_text_line)):
|
||||||
|
for i in range(1,20): # For illustration only
|
||||||
|
# If the execution of your scripts takes some time, printing the iterator
|
||||||
|
# gives you an impression of the overall progress
|
||||||
|
print(str(i))
|
||||||
|
|
||||||
|
# split the line into the two variables
|
||||||
|
variables=input_text_line[i].split(";")
|
||||||
|
# We need the CIK (1st column) and the filename (2nd column)
|
||||||
|
cik=variables[0]
|
||||||
|
filename=variables[1]
|
||||||
|
|
||||||
|
# modify file name to open the edited files
|
||||||
|
filename=filename.replace('.txt','')
|
||||||
|
|
||||||
|
# Open the ith 10-K in the list
|
||||||
|
input_file_10_k=open(directory+'/10-K_Sample/'+cik+"_"+filename+'_clean.txt','r',\
|
||||||
|
encoding='ascii',errors='ignore')
|
||||||
|
# if the command above does not work (error like "file not found" or "directory not found")
|
||||||
|
# please use the following command:
|
||||||
|
#input_file_10_k=open(directory+'10-K_Sample_clean/'+cik+'_'+filename+'_clean.txt','r',encoding='ascii',errors='ignore')
|
||||||
|
input_text_10_k=input_file_10_k.read()
|
||||||
|
|
||||||
|
# Use lower case letters, too
|
||||||
|
# It is important that the formatting (lower case vs. upper case) of the word list
|
||||||
|
# and the document are identical. Remember that you have typically lower and upper case
|
||||||
|
# letters in documents -> modify text
|
||||||
|
text=input_text_10_k.lower()
|
||||||
|
|
||||||
|
# Split the text in single words to determine the total number of words
|
||||||
|
list_of_words=re.split('\W{1,}', text)
|
||||||
|
# to make sure that empty list elements do not bias the word count, we delete them.
|
||||||
|
while list_of_words.count("")>0:
|
||||||
|
list_of_words.remove("")
|
||||||
|
|
||||||
|
# Determine total number of words
|
||||||
|
word_count=len(list_of_words)
|
||||||
|
|
||||||
|
# Reset the number of positive words and positive words adj. for negations to zero
|
||||||
|
positive_count=0
|
||||||
|
positive_count_adj=0
|
||||||
|
# For each positive word, count the number of occurrences
|
||||||
|
for j in range(len(positive_words)):
|
||||||
|
# standard count operation without controlling for negations
|
||||||
|
positive_words_found=list_of_words.count(positive_words[j])
|
||||||
|
|
||||||
|
# Loughran and McDonald (2011, JF, p.44): "We account for simple negation
|
||||||
|
# only for Fin-Pos words. Simple negation is taken to be observations
|
||||||
|
# of one of six words (no, not, none, neither, never, nobody) occurring
|
||||||
|
# within three words preceding a positive word.
|
||||||
|
|
||||||
|
# When we have identified positive words we need to search for negations
|
||||||
|
while positive_words_found>0:
|
||||||
|
# identify the position of the matched positive word in the list of all words
|
||||||
|
position_of_word=list_of_words.index(positive_words[j])
|
||||||
|
# identify the three words before the positive word and add them to a list
|
||||||
|
# the \ is a line break
|
||||||
|
list_negation=[list_of_words[max(0,position_of_word-3)],\
|
||||||
|
list_of_words[max(0,position_of_word-2)],list_of_words[max(0,position_of_word-1)]]
|
||||||
|
# check whether one of the three words in list_negation is a negation
|
||||||
|
negation_found=list_negation.count('no')+list_negation.count('not')+\
|
||||||
|
list_negation.count('none')+list_negation.count('neither')+\
|
||||||
|
list_negation.count('never')+list_negation.count('nobody')
|
||||||
|
|
||||||
|
if negation_found==0:
|
||||||
|
# no negation
|
||||||
|
positive_count_adj=positive_count_adj+1
|
||||||
|
positive_count=positive_count+1
|
||||||
|
else:
|
||||||
|
# negation
|
||||||
|
positive_count=positive_count+1
|
||||||
|
|
||||||
|
# blank out the matched positive word so that it is not found again
|
||||||
|
list_of_words[position_of_word]=''
|
||||||
|
# check whether there are further matches of the jth positive word
|
||||||
|
positive_words_found=list_of_words.count(positive_words[j])
|
||||||
|
|
||||||
|
# Write cik, file name, total number of words, and number of positive
|
||||||
|
# and adjusted positive words to the output file
|
||||||
|
output_file.write(cik+';'+filename+'_clean.txt;'+str(word_count)+';'+\
|
||||||
|
str(positive_count)+';'+str(positive_count_adj)+';'+str(positive_count/word_count)+\
|
||||||
|
';'+str(positive_count_adj/word_count)+'\n')
|
||||||
|
|
||||||
|
# Close filings
|
||||||
|
input_file_10_k.close()
|
||||||
|
|
||||||
|
print("Finished")
|
||||||
|
output_file.close()
|
||||||
|
input_file.close()
|
111
lectures/programming/solutions/Problem_9_Words_per_Sentence.py
Normal file
|
@ -0,0 +1,111 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
Created on Wed Apr 13 22:43:32 2016
|
||||||
|
|
||||||
|
@author: Alexander Hillert, Goethe University Frankfurt
|
||||||
|
"""
|
||||||
|
|
||||||
|
# We split the text into words and sentences using regular expressions
|
||||||
|
import re
|
||||||
|
# For comparison, we also include the NLTK tokenizer
|
||||||
|
from nltk.tokenize import sent_tokenize
|
||||||
|
|
||||||
|
directory="C:/Lehre/Textual Analysis/Programming/Files/"
|
||||||
|
|
||||||
|
# Open the csv file containing the list of the 200 10-Ks
|
||||||
|
input_file=open(directory+'10-K_Sample_2011Q1_Input.csv','r',encoding="utf-8")
|
||||||
|
input_text=input_file.read()
|
||||||
|
|
||||||
|
# Create output file
|
||||||
|
output_file=open(directory+'10-K_Sample_2011Q1_Output_WPS.csv','w',encoding="utf-8")
|
||||||
|
# Write variable names to the first line of the output file
|
||||||
|
output_file.write('CIK;Filename;Number_Words;Number_of_Sentences;'\
|
||||||
|
'Number_of_Sentences_1;Number_of_Sentences_2;Number_of_Sentences_false;'\
|
||||||
|
'Number_of_Sentences_NLTK;WPS;WPS_1;WPS_2;WPS_false;WPS_NLTK\n')
|
||||||
|
|
||||||
|
# Split the Input File in separate lines
|
||||||
|
input_text_line=input_text.split("\n")
|
||||||
|
|
||||||
|
# In general, there can be empty lines in the input file. The following command
|
||||||
|
# deletes these lines.
|
||||||
|
while input_text_line.count("")>0:
|
||||||
|
input_text_line.remove("")
|
||||||
|
|
||||||
|
# Loop over all lines
|
||||||
|
for i in range(1,len(input_text_line)):
|
||||||
|
print(str(i))
|
||||||
|
# split the line into the two variables
|
||||||
|
variables=input_text_line[i].split(";")
|
||||||
|
# We need the CIK and the filename
|
||||||
|
cik=variables[0]
|
||||||
|
filename=variables[1]
|
||||||
|
filename=filename.replace('.txt','')
|
||||||
|
|
||||||
|
# Open the ith 10-K in the list
|
||||||
|
input_file_10_k=open(directory+'10-K_Sample/'+cik+"_"+filename+'_clean.txt','r',\
|
||||||
|
encoding='ascii',errors='ignore')
|
||||||
|
text=input_file_10_k.read()
|
||||||
|
|
||||||
|
# Determine number of sentences and number of words
|
||||||
|
# Split the text in words to determine the total number of words
|
||||||
|
list_of_words=re.split('\W{1,}', text)
|
||||||
|
# to make sure that empty list elements do not bias the word count, we delete them.
|
||||||
|
while list_of_words.count("")>0:
|
||||||
|
list_of_words.remove("")
|
||||||
|
# Determine total number of words
|
||||||
|
word_count=len(list_of_words)
|
||||||
|
|
||||||
|
|
||||||
|
# Split the text by symbols that indicate the end of a sentence
|
||||||
|
# to determine the total number of sentences
|
||||||
|
list_of_sentences=re.split('[\.!\?]{1,}', text)
|
||||||
|
while list_of_sentences.count("")>0:
|
||||||
|
list_of_sentences.remove("")
|
||||||
|
# Alternative 1:
|
||||||
|
list_of_sentences_1=re.split('(?:\.|!|\?){1,}', text)
|
||||||
|
while list_of_sentences_1.count("")>0:
|
||||||
|
list_of_sentences_1.remove("")
|
||||||
|
# Alternative 2:
|
||||||
|
list_of_sentences_2=re.split('\.{1,}|!{1,}|\?{1,}', text)
|
||||||
|
while list_of_sentences_2.count("")>0:
|
||||||
|
list_of_sentences_2.remove("")
|
||||||
|
# Incorrect approach:
|
||||||
|
# re.split splits the string by the occurrences of the pattern.
|
||||||
|
# If capturing parentheses, i.e. (), are used in pattern, then the text
|
||||||
|
# of all groups in the pattern are also returned as part of the resulting list.
|
||||||
|
# See https://docs.python.org/3/library/re.html#re.split for details
|
||||||
|
list_of_sentences_false=re.split('(\.|!|\?){1,}', text)
|
||||||
|
while list_of_sentences_false.count("")>0:
|
||||||
|
list_of_sentences_false.remove("")
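# A quick illustration (made-up two-sentence string, for intuition only) of why the
# capturing group distorts the split:
# re.split('(?:\.|!|\?){1,}', "Good. Bad!")  ->  ['Good', ' Bad', '']
# re.split('(\.|!|\?){1,}', "Good. Bad!")    ->  ['Good', '.', ' Bad', '!', '']
# The captured punctuation is returned as extra list elements and would inflate the sentence count.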
|
||||||
|
|
||||||
|
# For comparison, we also include the NLTK tokenizer
|
||||||
|
list_of_sentences_nltk=sent_tokenize(text)
|
||||||
|
|
||||||
|
# Determine total number of sentences
|
||||||
|
sentence_count=len(list_of_sentences)
|
||||||
|
sentence_count_1=len(list_of_sentences_1)
|
||||||
|
sentence_count_2=len(list_of_sentences_2)
|
||||||
|
sentence_count_false=len(list_of_sentences_false)
|
||||||
|
sentence_count_nltk=len(list_of_sentences_nltk)
|
||||||
|
|
||||||
|
# Ratio of # of words over # of sentences
|
||||||
|
wps=word_count/sentence_count
|
||||||
|
wps_1=word_count/sentence_count_1
|
||||||
|
wps_2=word_count/sentence_count_2
|
||||||
|
wps_false=word_count/sentence_count_false
|
||||||
|
wps_nltk=word_count/sentence_count_nltk
|
||||||
|
|
||||||
|
# Write cik, file name, total number of words, total number of sentences,
|
||||||
|
# and WPS to the output file
|
||||||
|
output_file.write(cik+';'+filename+'_clean.txt;'+str(word_count)+';'+\
|
||||||
|
str(sentence_count)+';'+str(sentence_count_1)+';'+str(sentence_count_2)+';'+\
|
||||||
|
str(sentence_count_false)+';'+str(sentence_count_nltk)+';'+str(wps)+';'+\
|
||||||
|
str(wps_1)+';'+str(wps_2)+';'+str(wps_false)+';'+str(wps_nltk)+'\n')
|
||||||
|
|
||||||
|
# Close filing
|
||||||
|
input_file_10_k.close()
|
||||||
|
|
||||||
|
|
||||||
|
print("Finished")
|
||||||
|
output_file.close()
|
||||||
|
input_file.close()
|
189
lectures/programming/templates/NLTK_Sentiment_Analysis.py
Normal file
|
@ -0,0 +1,189 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
Created on Sat Jul 15 21:56:41 2017
|
||||||
|
|
||||||
|
@author: Alexander Hillert, Goethe University Frankfurt
|
||||||
|
"""
|
||||||
|
|
||||||
|
import nltk
|
||||||
|
import random
|
||||||
|
import collections
|
||||||
|
import re
|
||||||
|
|
||||||
|
# We will use the NLTK Corpus containing 2,000 Movie Reviews of which 1,000
|
||||||
|
# are positive and the other 1,000 are negative.
|
||||||
|
# if you do not have the movie review corpus yet, download it:
|
||||||
|
nltk.download("movie_reviews")
|
||||||
|
|
||||||
|
from nltk.corpus import movie_reviews
|
||||||
|
|
||||||
|
|
||||||
|
# Create a list that contains the tuples of document and category.
|
||||||
|
# Category is "positive" or "negative"
|
||||||
|
documents = []
|
||||||
|
# For all categories
|
||||||
|
for category in movie_reviews.categories():
|
||||||
|
print("Category: "+str(category))
|
||||||
|
# for all reviews (identified by file ID) in the respective category
|
||||||
|
for file_ID in movie_reviews.fileids(category):
|
||||||
|
# You have to use double parentheses to indicate that you want to append a tuple.
|
||||||
|
documents.append((list(movie_reviews.words(file_ID)),category))
|
||||||
|
|
||||||
|
# Print the first element (i.e. tuple) of documents.
|
||||||
|
print(documents[0])
|
||||||
|
# print the words of the first movie review
|
||||||
|
print(documents[0][0])
|
||||||
|
# print the first word of the first movie review
|
||||||
|
print(documents[0][0][0])
|
||||||
|
|
||||||
|
# print the classification of the first movie review
|
||||||
|
print(documents[0][1])
|
||||||
|
|
||||||
|
# print the classification of the 1000th review (the last negative one)
|
||||||
|
print(documents[999][1])
|
||||||
|
# print the classification of the 1001st review (the first positive one)
|
||||||
|
print(documents[1000][1])
|
||||||
|
|
||||||
|
# The default order of the reviews is first all negative reviews and then all positive ones.
|
||||||
|
# Later we will build a training and a testing set. As we need to have positive and negative
|
||||||
|
# reviews in both sets, we randomly shuffle the documents.
|
||||||
|
random.shuffle(documents)
|
||||||
|
|
||||||
|
# Create a list of all words.
|
||||||
|
all_words = []
|
||||||
|
for word in movie_reviews.words():
|
||||||
|
# We use lower case words
|
||||||
|
#all_words.append(word.lower())
|
||||||
|
if re.search("\A[a-z]",word.lower()):
|
||||||
|
# check whether the token is actually a word; the regex above requires that it
|
||||||
|
# starts with a letter (the commented-out alternative below only requires at least one letter)
|
||||||
|
#if re.search("[a-z]",word.lower()):
|
||||||
|
# We use lower case words
|
||||||
|
all_words.append(word.lower())
|
||||||
|
|
||||||
|
|
||||||
|
# What are the most frequently used words in the movie reviews?
|
||||||
|
# Alternative 1:
|
||||||
|
# FreqDist sorts words from the most frequently used word to the least frequently used word.
|
||||||
|
all_words_approach_1 = nltk.FreqDist(all_words)
|
||||||
|
print("Alternative 1: the top 15 words are: "+str(all_words_approach_1.most_common(15)))
|
||||||
|
|
||||||
|
# Alternative 2:
|
||||||
|
# We can also determine the most frequent words by using Counters as we did
|
||||||
|
# in Problem 12 --> transform list of all words to a Counter
|
||||||
|
all_words_approach_2=collections.Counter(all_words)
|
||||||
|
top_15_words=all_words_approach_2.most_common(15)
|
||||||
|
print("Alternative 2: the top 15 words are: "+str(top_15_words))
|
||||||
|
# -> identical results -> perfect.
|
||||||
|
|
||||||
|
# Search for a word and see how often it appears.
|
||||||
|
print("The word 'stupid' appears "+str(all_words_approach_1["stupid"])+" in the movie reviews.")
|
||||||
|
# alternatively
|
||||||
|
print("The word 'stupid' appears "+str(all_words_approach_2["stupid"])+" in the movie reviews.")
|
||||||
|
|
||||||
|
# How can we restrict the set of words that we use for training the Naive Bayes algorithm?
|
||||||
|
# -> create a list that only contains the top 3000 words
|
||||||
|
# get the top 3000 words
|
||||||
|
# Approach 1 using the nltk.FreqDist from above
|
||||||
|
i=0
|
||||||
|
top_3000_words=all_words_approach_1.most_common(3000)
|
||||||
|
list_top_3000_words_approach_1=[]
|
||||||
|
while i<3000:
|
||||||
|
list_top_3000_words_approach_1.append(top_3000_words[i][0])
|
||||||
|
i=i+1
|
||||||
|
|
||||||
|
# Approach 2 using Counters from above
|
||||||
|
i=0
|
||||||
|
top_3000_words=all_words_approach_2.most_common(3000)
|
||||||
|
list_top_3000_words_approach_2=[]
|
||||||
|
while i<3000:
|
||||||
|
list_top_3000_words_approach_2.append(top_3000_words[i][0])
|
||||||
|
i=i+1
|
||||||
|
|
||||||
|
# select the list of approach 1 or 2
|
||||||
|
word_features=list_top_3000_words_approach_1
|
||||||
|
|
||||||
|
# We need to identify the words we want to use for classification in the documents.
|
||||||
|
# We define a function for that.
|
||||||
|
def find_features(document):
|
||||||
|
words = set(document)
|
||||||
|
features = {}
|
||||||
|
# loop over all the words we consider for the classification
|
||||||
|
for word in word_features:
|
||||||
|
# The expression returns either true or false
|
||||||
|
features[word] = (word in words)
|
||||||
|
|
||||||
|
return features
|
||||||
|
|
||||||
|
# To get an idea what the function find_features() does let's print the features
|
||||||
|
# for one review.
|
||||||
|
print((find_features(movie_reviews.words('neg/cv000_29416.txt'))))
|
||||||
|
|
||||||
|
|
||||||
|
feature_set = [(find_features(review), category) for (review, category) in documents]
|
||||||
|
|
||||||
|
# What does the feature set look like?
|
||||||
|
print(feature_set[0])
|
||||||
|
# -> it is still a tuple
|
||||||
|
print(feature_set[0][0])
|
||||||
|
# the first element is the dictionary of the 3000 words we use for classification, each marked "True" or "False"
|
||||||
|
# depending on whether the words appear in the review
|
||||||
|
print(feature_set[0][1])
|
||||||
|
# It is the information on whether the review is positive or negative
|
||||||
|
|
||||||
|
# Define the training and testing set
|
||||||
|
# The training set comprises the first 1900 reviews and the testing set the last 100 reviews.
|
||||||
|
training_set=feature_set[:1900]
|
||||||
|
testing_set=feature_set[1900:]
|
||||||
|
|
||||||
|
# First we have to train the Naive Bayes Classifier.
|
||||||
|
# It will determine which of the words from word_features appear mostly in positive
|
||||||
|
# reviews and which appear mostly in negative reviews.
|
||||||
|
classifier=nltk.NaiveBayesClassifier.train(training_set)
|
||||||
|
# The following command prints the 20 words that best discriminate between
|
||||||
|
# positive and negative reviews.
|
||||||
|
classifier.show_most_informative_features(20)
|
||||||
|
|
||||||
|
# Let's classify the first element of feature_set
|
||||||
|
# The input for the classification needs to be the list of words with True or False
|
||||||
|
print(classifier.classify(feature_set[0][0]))
|
||||||
|
print("The review is actually: "+str(feature_set[0][1]))
|
||||||
|
|
||||||
|
# classify the 100 reports from the testing set
|
||||||
|
# they have positions 1900 to 1999 in the feature set.
|
||||||
|
i=1900
|
||||||
|
classified_set=[]
|
||||||
|
while i<2000:
|
||||||
|
classified_set.append(classifier.classify(feature_set[i][0]))
|
||||||
|
i=i+1
|
||||||
|
|
||||||
|
# Compare classification result with actual category
|
||||||
|
i=0
|
||||||
|
# In this list we save tuples of [predicted category, actual category]
|
||||||
|
comparison=[]
|
||||||
|
# In this list we simply save "accurate" and "inaccurate"
|
||||||
|
comparison_2=[]
|
||||||
|
while i<100:
|
||||||
|
comparison.append([classified_set[i],feature_set[i+1900][1]])
|
||||||
|
# If the predicted and actual classification match -> accurate
|
||||||
|
if comparison[i][0]==comparison[i][1]:
|
||||||
|
comparison_2.append("accurate")
|
||||||
|
else:
|
||||||
|
comparison_2.append("inaccurate")
|
||||||
|
i=i+1
|
||||||
|
|
||||||
|
print(comparison)
|
||||||
|
# We need the number of accurate and inaccurate classifications
|
||||||
|
comparison_counter=collections.Counter(comparison_2)
|
||||||
|
print(comparison_counter)
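# As an additional cross-check (not part of the original script), we can turn the counter
# from above into a percentage ourselves; Counter returns 0 for missing keys, so this also
# works if all 100 classifications are accurate.
manual_accuracy=100*comparison_counter["accurate"]/len(comparison_2)
print("Manually computed accuracy (in percent):", manual_accuracy)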
|
||||||
|
|
||||||
|
# NLTK can compute the accuracy directly
|
||||||
|
# What is the accuracy for the testing set?
|
||||||
|
print("Naive Bayes accuracy (in percent):", (nltk.classify.accuracy(classifier, testing_set))*100)
|
||||||
|
# Same value as from our own calculations -> perfect!
|
||||||
|
|
||||||
|
# What is the accuracy for the training set?
|
||||||
|
print("Naive Bayes accuracy in training data (in percent):", (nltk.classify.accuracy(classifier, training_set))*100)
|
||||||
|
# Higher than in the testing dataset -> expected.
|
||||||
|
|
||||||
|
print("completed!")
|
|
@ -0,0 +1,55 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
Created on Wed Jul 29 11:07:10 2015
|
||||||
|
|
||||||
|
|
||||||
|
@author: Alexander Hillert, Goethe University Frankfurt
|
||||||
|
"""
|
||||||
|
|
||||||
|
# To determine file size we need the OS package
|
||||||
|
import os
|
||||||
|
|
||||||
|
# Please adjust the directory to your machine.
|
||||||
|
directory="C:/Lehre/Textual Analysis/Programming/Files/"
|
||||||
|
|
||||||
|
# Open the csv file containing the list of the 200 10-Ks
|
||||||
|
input_file=open(directory+'10-K_Sample_2011Q1_Input.csv','r',encoding="utf-8")
|
||||||
|
input_text=input_file.read()
|
||||||
|
|
||||||
|
# Create output file
|
||||||
|
output_file=open(directory+'10-K_Sample_2011Q1_File_Size.csv','w',encoding="utf-8")
|
||||||
|
output_file.write('CIK;Filename;File_size_gross;File_size_net\n')
|
||||||
|
|
||||||
|
# Split the input file in separate lines
|
||||||
|
input_text_line=input_text.split("\n")
|
||||||
|
|
||||||
|
# In general, there can be empty lines in the input file. The following command
|
||||||
|
# deletes these lines.
|
||||||
|
while input_text_line.count("")>0:
|
||||||
|
input_text_line.remove("")
|
||||||
|
|
||||||
|
# Loop over all lines
|
||||||
|
for i in range(1,len(input_text_line)):
|
||||||
|
print(str(i))
|
||||||
|
# split the line into the two variables
|
||||||
|
variables=input_text_line[i].split(";")
|
||||||
|
# We need the CIK and the filename
|
||||||
|
cik=variables[0]
|
||||||
|
filename=variables[1]
|
||||||
|
filename=filename.replace('.txt','')
|
||||||
|
|
||||||
|
# File size of the complete submission file (gross file size)
|
||||||
|
# You have to divide the result by 1024 to get the size in kilobyte
|
||||||
|
# The file size will be affected by html code and exhibits.
|
||||||
|
# APPLY THE COMMAND THAT IS SHOWN ON SLIDE 62.
|
||||||
|
size_gross=XXX/1024
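# Hint (not necessarily the exact command from slide 62): os.path.getsize(some_path)
# returns the size of a file in bytes; dividing by 1024 converts bytes to kilobytes.
# The path of the complete submission file depends on your folder structure and is
# therefore not filled in here.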
|
||||||
|
|
||||||
|
# File size of the main text file (net file size)
|
||||||
|
# You have to divide the result by 1024 to get the size in kilobyte
|
||||||
|
size_net=XXX/1024 # SAME COMMAND AS FOR GROSS FILE SIZE BUT APPLIED TO THE _clean.txt
|
||||||
|
|
||||||
|
output_file.write(cik+';'+filename+';'+str(size_gross)+';'+str(size_net)+'\n')
|
||||||
|
|
||||||
|
print("Finished")
|
||||||
|
output_file.close()
|
||||||
|
input_file.close()
|
|
@ -0,0 +1,150 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
Created on Tue Jul 11 09:19:54 2017
|
||||||
|
|
||||||
|
@author: Alexander Hillert, Goethe University Frankfurt
|
||||||
|
"""
|
||||||
|
|
||||||
|
# We need regular expressions and counters (->collections)
|
||||||
|
import re
|
||||||
|
import collections
|
||||||
|
# for the bigram part, the sentence tokenizer is helpful
|
||||||
|
from nltk.tokenize import sent_tokenize
|
||||||
|
|
||||||
|
|
||||||
|
# Please adjust the directory to your machine.
|
||||||
|
directory="C:/Lehre/Textual Analysis/Programming/Files/"
|
||||||
|
|
||||||
|
# Open the csv file containing the list of the 10-Ks from MSFT, KO, and 3M.
|
||||||
|
input_file=open(directory+'list_10-K_filings_textual_similarity.csv','r',encoding="utf-8")
|
||||||
|
input_text=input_file.read()
|
||||||
|
|
||||||
|
# Split the input file in separate lines
|
||||||
|
input_text_line=input_text.split("\n")
|
||||||
|
|
||||||
|
# In general, there can be empty lines in the input file. The following command
|
||||||
|
# deletes these lines.
|
||||||
|
while input_text_line.count("")>0:
|
||||||
|
input_text_line.remove("")
|
||||||
|
|
||||||
|
# Create an empty counter variable
|
||||||
|
words_counter=collections.Counter()
|
||||||
|
|
||||||
|
# counter for the extra task
|
||||||
|
bigram_counter=collections.Counter()
|
||||||
|
|
||||||
|
# Loop over all lines
|
||||||
|
for i in range(1,len(input_text_line)):
|
||||||
|
print(str(i))
|
||||||
|
# split the line into the eight variables
|
||||||
|
variables=input_text_line[i].split(";")
|
||||||
|
# We need the CIK (1st column) and the filename (8th column)
|
||||||
|
cik=variables[0]
|
||||||
|
filename_parts=re.split('/',variables[7])
|
||||||
|
filename=filename_parts[3].replace('.txt','')
|
||||||
|
|
||||||
|
# Open the ith 10-K in the list; remember to specify the encoding
|
||||||
|
# The files are available in the zip file "10-K_Textual_Similarity_edited.zip".
|
||||||
|
input_file_10_k=open(directory+'10-K_Textual_Similarity_edited/'+cik+'_'+\
|
||||||
|
filename+'_edited.txt', 'r', encoding='ascii', errors='ignore')
|
||||||
|
# if the command above does not work (error like "file not found" or "directory not found")
|
||||||
|
# please use the following command:
|
||||||
|
#input_file_10_k=open(directory+'10-K_Textual_Similarity/'+cik+'_'+filename+'_edited.txt','r',encoding='ascii',errors='ignore')
|
||||||
|
|
||||||
|
# read the content from the file
|
||||||
|
input_text_10_k=input_file_10_k.read()
|
||||||
|
|
||||||
|
# THINK ABOUT WHETHER WE SHOULD USE LOWER OR UPPER CASE CONSISTENTLY!
|
||||||
|
input_text_10_k=
|
||||||
|
|
||||||
|
# Split text into words
|
||||||
|
list_of_words=re.split('\W{1,}',input_text_10_k)
|
||||||
|
|
||||||
|
# Remember: there can be empty list elements!
|
||||||
|
# Make sure that empty list elements do not bias the word count -> delete them!
|
||||||
|
# You can use an approach similar to the one in lines 24 and 25.
|
||||||
|
COMMANDS TO BE ADDED
|
||||||
|
|
||||||
|
|
||||||
|
# Add the words to our counter
|
||||||
|
words_counter=words_counter+XXXX # COMPLETE THIS COMMAND
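# Illustration (made-up words, not the 10-K data) of how adding Counters accumulates counts:
# collections.Counter(["risk","loss","risk"]) + collections.Counter(["loss"])
# -> Counter({'risk': 2, 'loss': 2})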
|
||||||
|
|
||||||
|
|
||||||
|
#############################################
|
||||||
|
# optional part for the extra task on bigrams
|
||||||
|
#############################################
|
||||||
|
# create an empty list for the bigrams
|
||||||
|
'''
|
||||||
|
bigram_list=[]
|
||||||
|
|
||||||
|
# split the text into sentences
|
||||||
|
list_of_sentences=XXX
|
||||||
|
|
||||||
|
# create the bigrams IN EACH SENTENCE
|
||||||
|
for sentence in list_of_sentences:
|
||||||
|
# split the sentence into words
|
||||||
|
list_of_words=XXX
|
||||||
|
|
||||||
|
# remove empty elements
|
||||||
|
while list_of_words.count("")>0:
|
||||||
|
list_of_words.remove("")
|
||||||
|
|
||||||
|
# go over all potential two word combinations in the sentence.
|
||||||
|
for word_number in range(XXX,YYY):
|
||||||
|
# add the bigram (two words connected by whitespace) to the list
|
||||||
|
bigram_list.append(WORD_1 + " " + WORD_2)
|
||||||
|
|
||||||
|
# same command as in line 70
|
||||||
|
bigram_counter=bigram_counter+XXX
|
||||||
|
# end of extra task
|
||||||
|
'''
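# Illustration (made-up sentence, for intuition only) of what the bigram construction in the
# commented block above should produce: the word list ["net","income","increased"] leads to
# the bigrams ["net income", "income increased"], i.e., one bigram less than there are words.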
|
||||||
|
|
||||||
|
|
||||||
|
# Close the 10-K filing
|
||||||
|
input_file_10_k.close()
|
||||||
|
|
||||||
|
input_file.close()
|
||||||
|
|
||||||
|
|
||||||
|
######################
|
||||||
|
# Top 100 single words
|
||||||
|
######################
|
||||||
|
# Open the csv file containing the 100 most frequently used words
|
||||||
|
output_file=open(directory+'Problem_12_100_most_frequent_words.csv','w',encoding="utf-8",errors="ignore")
|
||||||
|
output_file.write("rank;word;count\n")
|
||||||
|
|
||||||
|
# Get the 100 most frequent words
|
||||||
|
top_100_words=words_counter.XXXX # COMPLETE THIS COMMAND
|
||||||
|
|
||||||
|
# Write the 100 most frequent words to the csv file
|
||||||
|
# REMEMBER: Python starts counting at 0, while humans start at 1.
|
||||||
|
# So, the most frequent word (rank 1 in human counting) is element 0 for Python.
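# Illustration (made-up counter): most_common() returns a list of (word, count) tuples sorted
# by count, e.g. collections.Counter("aab").most_common(2) -> [('a', 2), ('b', 1)].
# Element [i][0] is therefore the word and element [i][1] its frequency.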
|
||||||
|
for i in range(1,101):
|
||||||
|
output_file.write(str(i)+";"+XXXX (-> word)+";"+XXXX (-> the frequency of the word)+"\n") # COMPLETE THIS COMMAND
|
||||||
|
|
||||||
|
# Close the csv file
|
||||||
|
output_file.close()
|
||||||
|
|
||||||
|
|
||||||
|
######################
|
||||||
|
# Extra task
|
||||||
|
# Top 100 bigrams
|
||||||
|
######################
|
||||||
|
'''
|
||||||
|
# Open the csv file containing the 100 most frequently used BIGRAMS
|
||||||
|
output_file_bigram=open(directory+'Problem_12_100_most_frequent_bigrams.csv','w',encoding="utf-8")
|
||||||
|
output_file_bigram.write("rank;word;count\n")
|
||||||
|
|
||||||
|
# Get the 100 most frequent bigrams: same command as above
|
||||||
|
top_100_bigrams=bigram_counter.XXX
|
||||||
|
|
||||||
|
# Write the 100 most frequent bigrams to the csv file.
|
||||||
|
# same logic as above
|
||||||
|
for i in range(1,101):
|
||||||
|
output_file_bigram.write(str(i)+";"+XXXX (-> word)+";"+XXXX (-> the frequency of the word)+"\n") # COMPLETE THIS COMMAND
|
||||||
|
|
||||||
|
# Close the csv file
|
||||||
|
output_file_bigram.close()
|
||||||
|
'''
|
||||||
|
|
||||||
|
print("Task done!")
|
83
lectures/programming/templates/Problem_13_Stemming_form.py
Normal file
|
@ -0,0 +1,83 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
Created on Wed Jul 29 11:07:10 2015
|
||||||
|
|
||||||
|
@author: Alexander Hillert, Goethe University Frankfurt
|
||||||
|
"""
|
||||||
|
|
||||||
|
# We need regular expressions and stemming.
|
||||||
|
import re
|
||||||
|
from nltk.stem import PorterStemmer
|
||||||
|
# Depending on how you would like to split the text into words, you may need tokenize.
|
||||||
|
from nltk.tokenize import word_tokenize
|
||||||
|
|
||||||
|
# Please adjust the directory to your machine.
|
||||||
|
directory="C:/Lehre/Textual Analysis/Programming/Files/"
|
||||||
|
|
||||||
|
# Open the csv file containing the list of the 10-Ks from MSFT, KO, and 3M
|
||||||
|
input_file=open(directory+'list_10-K_filings_textual_similarity.csv','r',encoding="utf-8")
|
||||||
|
input_text=input_file.read()
|
||||||
|
|
||||||
|
# Split the Input File in separate lines
|
||||||
|
input_text_line=input_text.split("\n")
|
||||||
|
|
||||||
|
# In general, there can be empty lines in the input file. The following command
|
||||||
|
# deletes these lines.
|
||||||
|
while input_text_line.count("")>0:
|
||||||
|
input_text_line.remove("")
|
||||||
|
|
||||||
|
# Loop over all lines
|
||||||
|
for i in range(1,len(input_text_line)):
|
||||||
|
print(str(i))
|
||||||
|
# split the line into the eight variables
|
||||||
|
variables=input_text_line[i].split(";")
|
||||||
|
# We need the CIK (1st column) and the filename (8th column)
|
||||||
|
cik=variables[0]
|
||||||
|
filename_parts=re.split('/',variables[7])
|
||||||
|
filename=filename_parts[3].replace('.txt','')
|
||||||
|
|
||||||
|
# Open the ith 10-K in the list; remember to specify the encoding
|
||||||
|
input_file_10_k=open(directory+'10-K_Textual_Similarity_edited/'+cik+'_'+filename\
|
||||||
|
+'_edited.txt', 'r', encoding='ascii', errors='ignore')
|
||||||
|
# if the command above does not work (error like "file not found" or "directory not found")
|
||||||
|
# please use the following command:
|
||||||
|
#input_file_10_k=open(directory+'10-K_Textual_Similarity/'+cik+'_'+filename+'_edited.txt','r',encoding='ascii',errors='ignore')
|
||||||
|
|
||||||
|
|
||||||
|
# Get the text of the 10-K
|
||||||
|
input_text_10_k=input_file_10_k.read()
|
||||||
|
|
||||||
|
# We need to tokenize the text because stem only works on a word by word basis.
|
||||||
|
# Stemming an entire document without splitting into words does not work!
|
||||||
|
# The problem is that \n gets lost in this process --> we cannot easily
|
||||||
|
# recreate the document.
|
||||||
|
# Solution: replace \n by \n and some indicator that there was a line break.
|
||||||
|
# For example replace("\n","\nHereWasALinebreak")
|
||||||
|
input_text_10_k=input_text_10_k.replace("\n",XXXX)
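# Illustration of the full round trip (made-up marker and text; not necessarily the intended solution):
# marked="Profits rose.\nLosses fell.".replace("\n","\nHereWasALinebreak ")
# stemmed=" ".join(PorterStemmer().stem(w) for w in word_tokenize(marked))
# restored=stemmed.replace("herewasalinebreak","\n")  # note: the stemmer lower-cases the marker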
|
||||||
|
|
||||||
|
# Split text into words
|
||||||
|
word_list=XXXX
|
||||||
|
|
||||||
|
# Stem the text from above
|
||||||
|
text_stemmed=''
|
||||||
|
# LOOP ALL WORDS, STEM THEM AND RECONNECT THEM.
|
||||||
|
# WARNING: WHEN RECONNECTING WORDS YOU NEED TO INCLUDE A WHITESPACE BETWEEN
|
||||||
|
# THE WORDS. OTHERWISE, THE TEXT GETS MESSED UP.
|
||||||
|
for word in word_list:
|
||||||
|
|
||||||
|
text_stemmed=text_stemmed+XXX # TO BE COMPLETED
|
||||||
|
|
||||||
|
# To recreate the text, we need to replace the line break indicators by \n.
|
||||||
|
# WARNING: PAY ATTENTION TO UPPER/LOWER CASE, IT CAN CHANGE.
|
||||||
|
text_stemmed=text_stemmed.replace(XXXX,XXXX) # UNDO THE TRANSFORMATION FROM LINE 56.
|
||||||
|
|
||||||
|
|
||||||
|
# Open the output file for the stemmed text
|
||||||
|
output_file_10_k=open(directory+'10-K_Textual_Similarity_edited/'+cik+'_'+filename\
|
||||||
|
+'_stemmed.txt', 'w', encoding='ascii', errors='ignore')
|
||||||
|
output_file_10_k.write(text_stemmed)
|
||||||
|
output_file_10_k.close()
|
||||||
|
input_file_10_k.close()
|
||||||
|
|
||||||
|
input_file.close()
|
||||||
|
print("Task done!")
|
|
@ -0,0 +1,101 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
Created on Wed Jul 29 11:07:10 2015
|
||||||
|
|
||||||
|
@author: Alexander Hillert, Goethe University Frankfurt
|
||||||
|
"""
|
||||||
|
|
||||||
|
# For the full task, we need a large set of packages:
|
||||||
|
# regular expression, stemming, stop words, tokenization, and counters.
|
||||||
|
import re
|
||||||
|
#from nltk.tokenize import word_tokenize # NOT needed for the base comparison
|
||||||
|
#from nltk.corpus import stopwords # NOT needed for the base comparison
|
||||||
|
#from nltk.stem import PorterStemmer # NOT needed for the base comparison
|
||||||
|
from collections import Counter
|
||||||
|
|
||||||
|
|
||||||
|
#ps=PorterStemmer() # NOT needed for the base comparison
|
||||||
|
|
||||||
|
# Please adjust the directory to your machine.
|
||||||
|
directory="C:/Lehre/Textual Analysis/Programming/Files/"
|
||||||
|
|
||||||
|
# Open the csv file containing the list of the 10-Ks from MSFT, KO, and 3M
|
||||||
|
input_file=open(directory+'list_10-K_filings_textual_similarity.csv','r',encoding="utf-8")
|
||||||
|
input_text=input_file.read()
|
||||||
|
|
||||||
|
# Split the input file in separate lines
|
||||||
|
input_text_line=input_text.split("\n")
|
||||||
|
|
||||||
|
# In general, there can be empty lines in the input file. The following command
|
||||||
|
# deletes these lines.
|
||||||
|
while input_text_line.count("")>0:
|
||||||
|
input_text_line.remove("")
|
||||||
|
|
||||||
|
# Open the output csv file in which we write the similarities
|
||||||
|
output_file=open(directory+'list_10-K_filings_textual_similarity_jaccard.csv','w',encoding="utf-8")
|
||||||
|
# Write variable names to first line
|
||||||
|
output_file.write(input_text_line[0]+';Jaccard\n')
|
||||||
|
|
||||||
|
|
||||||
|
# set default values for variables
|
||||||
|
word_list_old_edited=""
|
||||||
|
word_list_edited=""
|
||||||
|
|
||||||
|
# Loop over all lines
|
||||||
|
for i in range(1,len(input_text_line)):
|
||||||
|
print(str(i))
|
||||||
|
# split the line into the eight variables
|
||||||
|
variables=input_text_line[i].split(";")
|
||||||
|
# We need the CIK (1st column) and the filename (8th column)
|
||||||
|
cik=variables[0]
|
||||||
|
filename_parts=re.split('/',variables[7])
|
||||||
|
filename=filename_parts[3].replace('.txt','')
|
||||||
|
|
||||||
|
# Open the ith 10-K; remember to specify the encoding
|
||||||
|
input_file_10_k=open(directory+'10-K_Textual_Similarity_edited/'+cik+'_'+filename+\
|
||||||
|
'_edited.txt', 'r', encoding='ascii', errors='ignore')
|
||||||
|
# if the command above does not work (error like "file not found" or "directory not found")
|
||||||
|
# please use the following command:
|
||||||
|
#input_file_10_k=open(directory+'10-K_Textual_Similarity/'+cik+'_'+filename+'_edited.txt','r',encoding='ascii',errors='ignore')
|
||||||
|
|
||||||
|
|
||||||
|
input_text_10_k=input_file_10_k.read()
|
||||||
|
|
||||||
|
# Split text into words
|
||||||
|
word_list_edited=re.split("\W{1,}",input_text_10_k.lower())
|
||||||
|
# Alternative using tokenize
|
||||||
|
#word_list_edited=word_tokenize(input_text_10_k.lower())
|
||||||
|
|
||||||
|
# check whether the previous entry of the list is from the same firm
|
||||||
|
permco=input_text_line[i].split(";")[1]
|
||||||
|
permco_old=input_text_line[i-1].split(";")[1]
|
||||||
|
|
||||||
|
|
||||||
|
############################################
|
||||||
|
# Sub Task 1: Jaccard for the _edited.txt
|
||||||
|
############################################
|
||||||
|
# compute Jaccard similarity if the previous filing is from the same firm
|
||||||
|
if permco==permco_old:
|
||||||
|
|
||||||
|
counter_current_10k=Counter(XXX)
|
||||||
|
counter_previous_10k=Counter(XXX)
|
||||||
|
|
||||||
|
intersection=XXX see "Introduction_Container_Datatypes.py" (at the end of the file)
|
||||||
|
union=XXXX see "Introduction_Container_Datatypes.py" (at the end of the file)
|
||||||
|
|
||||||
|
jaccard_similarity=XXXx # ELEMENTS IN INTERSECTION / # ELEMENTS IN UNION
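# Illustration (made-up word lists, not the 10-K data) of the Counter-based logic:
# c1=Counter(["net","income","rose"]); c2=Counter(["net","income","fell"])
# c1 & c2  ->  Counter({'net': 1, 'income': 1})                          (intersection)
# c1 | c2  ->  Counter({'net': 1, 'income': 1, 'rose': 1, 'fell': 1})    (union)
# Whether intersection and union are measured by the number of distinct words or by the
# summed counts should follow "Introduction_Container_Datatypes.py".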
|
||||||
|
output_file.write(input_text_line[i]+";"+str(jaccard_similarity)+"\n")
|
||||||
|
else:
|
||||||
|
# The previous filing is not from the same firm -> cannot compute Jaccard similarity
|
||||||
|
output_file.write(input_text_line[i]+";"+"\n")
|
||||||
|
|
||||||
|
# Save the current word vector to a separate variable for the comparison of the next report.
|
||||||
|
word_list_old_edited=word_list_edited
|
||||||
|
|
||||||
|
# Close 10-K filing
|
||||||
|
input_file_10_k.close()
|
||||||
|
|
||||||
|
input_file.close()
|
||||||
|
output_file.close()
|
||||||
|
print("Task done!")
|
||||||
|
|
|
@ -0,0 +1,159 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
Created on Mon Mar 21 09:38:32 2022
|
||||||
|
|
||||||
|
@author: Alexander Hillert, Goethe University Frankfurt
|
||||||
|
"""
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
from sklearn.metrics import mean_squared_error
|
||||||
|
from sklearn.linear_model import RidgeCV
|
||||||
|
from sklearn.linear_model import LassoCV
|
||||||
|
|
||||||
|
|
||||||
|
# adjust the directory to your folder
|
||||||
|
directory="C:/Lehre/Machine Learning/Data/"
|
||||||
|
|
||||||
|
|
||||||
|
# import the data for this problem
|
||||||
|
# NOTE: IT MIGHT TAKE 3 TO 5 MINUTES TO OPEN THE DATA
|
||||||
|
data_frame=pd.read_csv(directory+"form_10-Ks_machine_learning_2007_2008_all_variables_v1.csv",sep=";")
|
||||||
|
# The rows of the data are the Form 10-K filings. Each line is one filing.
|
||||||
|
# The columns are the variables. After some identifying information,
|
||||||
|
# you find the word frequencies, i.e., how often a word (e.g., "the") shows up
|
||||||
|
# in a 10-K (e.g., 100 times)
|
||||||
|
|
||||||
|
|
||||||
|
# WARNING: THE DATA SET IS TOO LARGE TO BE DISPLAYED -> Variable Explorer
|
||||||
|
# and Console will crash.
|
||||||
|
# However, you can pick a small subset of the data and look at it.
|
||||||
|
# It lists all columns (variables) and the first three observations.
|
||||||
|
data_frame_example=data_frame.head(3)
|
||||||
|
# you can click on this variable in the variable explorer without Spyder crashing.
|
||||||
|
|
||||||
|
# To see the variables included in the data use the following command
|
||||||
|
data_frame_column_names=data_frame.columns
|
||||||
|
# you can click on this variable in the variable explorer without Spyder crashing.
|
||||||
|
# This variable shows all column/variable names in a vector.
|
||||||
|
|
||||||
|
# split the data set into the training and testing data
|
||||||
|
# we use the filings from year 2007 as training data
|
||||||
|
data_frame_train=data_frame[data_frame.year==2007]
|
||||||
|
# and the filings from year 2008 as testing data
|
||||||
|
data_frame_test=data_frame[data_frame.year==2008]
|
||||||
|
|
||||||
|
# put the cumulative abnormal return around the filing date into a new variable.
|
||||||
|
# we follow Loughran and McDonald (2011) and use the CAR from t to t+4.
|
||||||
|
# training data
|
||||||
|
filing_car_train=data_frame_train["excess_ret_t0_t4"]
|
||||||
|
# testing data
|
||||||
|
filing_car_test=data_frame_test["excess_ret_t0_t4"]
|
||||||
|
|
||||||
|
# so far, you have absolute word counts. For example, "loss" is found 5 times.
|
||||||
|
# As the length of the 10-Ks can be different, we scale by the number of words
|
||||||
|
# in the 10-K.
|
||||||
|
document_length_train=data_frame_train["number_of_words"]
|
||||||
|
document_length_test=data_frame_test["number_of_words"]
|
||||||
|
|
||||||
|
|
||||||
|
# the word frequencies are our independent variables -> restrict the data frame
|
||||||
|
# to those variables and drop all variables that are not needed
|
||||||
|
data_frame_train=data_frame_train.drop(columns=["cik","year","month","link","filing_type","filing_date","excess_ret_t0_t4","number_of_words"])
|
||||||
|
data_frame_test=data_frame_test.drop(columns=["cik","year","month","link","filing_type","filing_date","excess_ret_t0_t4","number_of_words"])
|
||||||
|
|
||||||
|
# compute relative frequencies, i.e., divide the absolute word count by document length
|
||||||
|
data_frame_train=data_frame_train.div(document_length_train, axis=0)
|
||||||
|
data_frame_test=data_frame_test.div(document_length_test, axis=0)
|
||||||
|
|
||||||
|
# standardize the data frames
|
||||||
|
# training data
|
||||||
|
data_frame_train_mean=TO BE COMPLETED
|
||||||
|
data_frame_train_sd=TO BE COMPLETED
|
||||||
|
data_frame_train_standardized=TO BE COMPLETED
|
||||||
|
# testing data
|
||||||
|
data_frame_test_mean=TO BE COMPLETED
|
||||||
|
data_frame_test_sd=TO BE COMPLETED
|
||||||
|
data_frame_test_standardized=TO BE COMPLETED
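# Minimal illustration (synthetic 3x2 DataFrame, unrelated to the 10-K data) of column-wise
# standardization in pandas:
# demo=pd.DataFrame({"a":[1.0,2.0,3.0],"b":[2.0,4.0,6.0]})
# (demo-demo.mean())/demo.std()  # subtract the column mean, divide by the column standard deviation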
|
||||||
|
|
||||||
|
|
||||||
|
# There can be missing values in the standardized variables.
|
||||||
|
# They arise if the word count for a specific word is always zero in the training
|
||||||
|
# or in the testing data. In this case, the standard deviation is zero ->
|
||||||
|
# division by zero -> NaN.
|
||||||
|
# We replace these missing values by zero.
|
||||||
|
# training data
|
||||||
|
data_frame_train_standardized=data_frame_train_standardized.fillna(0)
|
||||||
|
# testing data
|
||||||
|
data_frame_test_standardized=data_frame_test_standardized.fillna(0)
|
||||||
|
|
||||||
|
##########################
|
||||||
|
# Ridge regression
|
||||||
|
##########################
|
||||||
|
print("\nRidge regression - Using cross-validation\n")
|
||||||
|
# Regress the CARs on the word frequencies using Ridge regressions with cross-validation.
|
||||||
|
# In this regression, we use the training data.
|
||||||
|
# We use five-fold cross-validation.
|
||||||
|
# Recommendation for initial alphas/lambdas: 100000, 150000, and 200000
|
||||||
|
regression_Ridge_cv=RidgeCV(alphas=TO BE COMPLETED, fit_intercept=True,cv=5).fit(TO BE COMPLETED)
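# A minimal sketch (synthetic data and illustrative alphas, not the assignment's values) of the
# RidgeCV interface used here:
# X=np.random.rand(100,5); y=np.random.rand(100)
# demo_ridge=RidgeCV(alphas=[0.1,1.0,10.0], fit_intercept=True, cv=5).fit(X,y)
# demo_ridge.alpha_       # the penalty selected by cross-validation
# demo_ridge.score(X,y)   # R2
# demo_ridge.predict(X)   # fitted/predicted values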
|
||||||
|
|
||||||
|
# get the optimal lambda
|
||||||
|
alpha_optimal_cv=TO BE COMPLETED
|
||||||
|
print("The optimal alpha is "+str(alpha_optimal_cv))
|
||||||
|
|
||||||
|
# what is the R2 in the training and testing data?
|
||||||
|
print("The R2 in the training data is: "+str(regression_Ridge_cv.TO BE COMPLETED))
|
||||||
|
print("The R2 in the testing data is: "+str(regression_Ridge_cv.TO BE COMPLETED))
|
||||||
|
|
||||||
|
# Mean squared error using the cross-validated model
|
||||||
|
# predict y in the full training sample
|
||||||
|
filing_car_train_predicted_Ridge=regression_Ridge_cv.TO BE COMPLETED
|
||||||
|
# predict y in the testing sample
|
||||||
|
filing_car_test_predicted_Ridge=regression_Ridge_cv.TO BE COMPLETED
|
||||||
|
# Determine the MSE
|
||||||
|
print("The MSE in the full training data is: "+str(mean_squared_error(TO BE COMPLETED)))
|
||||||
|
print("The MSE in the testing data is: "+str(mean_squared_error(TO BE COMPLETED)))
|
||||||
|
|
||||||
|
|
||||||
|
######################
|
||||||
|
# LASSO regression
|
||||||
|
######################
|
||||||
|
print("\nLASSO regression - Using cross-validation\n")
|
||||||
|
# Regress the CARs on the word frequencies using LASSO regressions with cross-validation.
|
||||||
|
# In this regression, we use the training data.
|
||||||
|
# We use five-fold cross-validation.
|
||||||
|
# Recommendation for initial alphas/lambdas: 0.5, 1, and 1.5
|
||||||
|
regression_Lasso_cv=LassoCV(alphas=TO BE COMPLETED, fit_intercept=True,cv=5).fit(TO BE COMPLETED)
|
||||||
|
|
||||||
|
# get the optimal lambda
|
||||||
|
alpha_optimal_cv=TO BE COMPLETED
|
||||||
|
print("The optimal alpha is "+str(alpha_optimal_cv))
|
||||||
|
|
||||||
|
# get the R2 in the training data
|
||||||
|
print("The R2 in the training data is: "+str(regression_Lasso_cv.TO BE COMPLETED))
|
||||||
|
# ... and testing data
|
||||||
|
print("The R2 in the testing data is: "+str(regression_Lasso_cv.TO BE COMPLETED))
|
||||||
|
|
||||||
|
# Mean squared error using the cross-validated model
|
||||||
|
# predict y in the full training sample
|
||||||
|
filing_car_train_predicted_Lasso=regression_Lasso_cv.TO BE COMPLETED
|
||||||
|
# predict y in the testing sample
|
||||||
|
filing_car_test_predicted_Lasso=regression_Lasso_cv.TO BE COMPLETED
|
||||||
|
# Determine the MSE
|
||||||
|
print("The MSE in the full training data is: "+str(mean_squared_error(TO BE COMPLETED)))
|
||||||
|
print("The MSE in the testing data is: "+str(mean_squared_error(TO BE COMPLETED)))
|
||||||
|
|
||||||
|
|
||||||
|
############################################################
|
||||||
|
# Compare the betas from the Ridge and the LASSO regressions
|
||||||
|
############################################################
|
||||||
|
output_file=open(directory+"comparison_coefficients_Ridge_LASSO_10-Ks.csv","w",encoding="utf-8")
|
||||||
|
output_file.write("index;word;coefficient_Ridge;coefficient_LASSO\n")
|
||||||
|
|
||||||
|
# get the list of coefficients
|
||||||
|
for i in range (0,len(data_frame_train.columns)):
|
||||||
|
output_file.write(str(i)+';'+data_frame_train.columns[i]+';'+str(regression_Ridge_cv.coef_[i])+';'+str(regression_Lasso_cv.coef_[i])+'\n')
|
||||||
|
|
||||||
|
output_file.close()
|
||||||
|
|
||||||
|
print("Completed!")
|
88
lectures/programming/templates/Problem_1_form.py
Normal file
|
@ -0,0 +1,88 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
Created on Fri Feb 15 21:37:53 2019
|
||||||
|
|
||||||
|
@author: Alexander Hillert, Goethe University Frankfurt
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Please adjust the directory to your machine.
|
||||||
|
directory="C:/Lehre/Textual Analysis/Programming/Files/"
|
||||||
|
# It is important to use a single forward slash / but not a single backslash \.
|
||||||
|
|
||||||
|
# For MAC users: your directory will usually start with "/Users/". For example:
|
||||||
|
#directory="/Users/FirstnameLastname/Textual Analysis/Programming/Files/"
|
||||||
|
|
||||||
|
# open the Fun_with_Python text file
|
||||||
|
input_file=open(directory+"Fun_with_Python.txt","r")
|
||||||
|
|
||||||
|
###################################
|
||||||
|
# Programming Problem 1
|
||||||
|
###################################
|
||||||
|
|
||||||
|
# Task 1: open the file 'Fun_with_Python.txt' in Spyder and print its content
|
||||||
|
# The file can be found in our data folder
|
||||||
|
|
||||||
|
# get the text from the file
|
||||||
|
input_text= TO BE COMPLETED
|
||||||
|
# print the content, i.e., the text of the file (previous line)
|
||||||
|
print(TO BE COMPLETED)
|
||||||
|
|
||||||
|
# See slide 7
|
||||||
|
|
||||||
|
|
||||||
|
# Task 2: Write the content of 'Fun_with_Python.txt' to a new text file
|
||||||
|
# with the name 'More_fun_with_Python.txt'.
|
||||||
|
|
||||||
|
# ENTER YOUR COMMANDS HERE
|
||||||
|
# See slide 8.
|
||||||
|
# REMEMBER to close your file. If you do not close the new txt file, its content
|
||||||
|
# will not be saved to the hard drive. You will find an empty txt in your file manager.
|
||||||
|
|
||||||
|
|
||||||
|
# Task 3: Write a loop that prints some text (whatever you like) ten times.
|
||||||
|
|
||||||
|
# ENTER YOUR COMMANDS HERE
|
||||||
|
# See slide 9.
|
||||||
|
# You have several options. While loop, for X in range() loop, etc.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# Task 4: Print the text of the "Fun_with_Python" file line by line!
|
||||||
|
|
||||||
|
# ENTER YOUR COMMANDS HERE
|
||||||
|
# See slide 10.
|
||||||
|
# You need a loop (Task 3) and in each iteration of the loop have Python print
|
||||||
|
# a line of text.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# Task 5: Count how often the word 'good' appears in the document 'Fun_with_Python.txt'!
|
||||||
|
|
||||||
|
# ENTER YOUR COMMANDS HERE
|
||||||
|
# See slide 11.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# Task 6a: Now, print only the lines that contain the word 'good'!
|
||||||
|
|
||||||
|
# ENTER YOUR COMMANDS HERE
|
||||||
|
# See also slide 12.
|
||||||
|
# You can use the line-by-line printing from Task 4 and combine it with the command ".count()" from Task 5
|
||||||
|
# and add the if condition from slide 12.
|
||||||
|
# If condition: for each line check whether the specific line contains the word "good".
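# A generic sketch of the pattern described above (hypothetical list of lines, not the solution):
# for line in line_list:
#     if line.count("good")>0:
#         print(line)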
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# Task 7: print only the lines that start with the word 'This'!
|
||||||
|
|
||||||
|
# ENTER YOUR COMMANDS HERE
|
||||||
|
# See slide 15.
|
||||||
|
# This is very similar to task 6. You only need to modify the if condition a bit.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# Task 8a: Replace the word "good" by "excellent" and display the new text!
|
||||||
|
# See slide 16.
|
||||||
|
# ENTER YOUR COMMANDS HERE
|
||||||
|
|
|
@ -0,0 +1,72 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
Created on Wed Jul 29 09:21:46 2015
|
||||||
|
|
||||||
|
@author: Alexander Hillert, Goethe Uni Frankfurt
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Please adjust the directory to your machine.
|
||||||
|
directory="C:/Lehre/Textual Analysis/Programming/Files/"
|
||||||
|
# It is important to use a single forward slash / but not a single backslash \.
|
||||||
|
|
||||||
|
# For MAC users: your directory will usually start with "/Users/". For example:
|
||||||
|
#directory="/Users/FirstnameLastname/Textual Analysis/Programming/Files/"
|
||||||
|
|
||||||
|
# Open the txt file with the SEC filings
|
||||||
|
sec_filings_file=open(directory+'formidx_1998Q1.txt','r')
|
||||||
|
sec_filings_text=sec_filings_file.read()
|
||||||
|
|
||||||
|
# Create output file
|
||||||
|
output_file=open(directory+'SEC_Filings_Output.csv','w')
|
||||||
|
|
||||||
|
# Create first line with variable names
|
||||||
|
# I use semicolons as separator in csv files. You can also use any other symbol.
|
||||||
|
# However, you should make sure that the separator is not part of the data/text
|
||||||
|
# you write to the file.
|
||||||
|
# For example, it would be problematic if you use comma as separator and have
|
||||||
|
# company names like "AMERICAN HEALTHCORP, INC." or "AMERICAN FUNDING, INC."
|
||||||
|
output_file.write("Form_Type;Company_Name;CIK;Filing_Date;Link\n")
|
||||||
|
|
||||||
|
|
||||||
|
# Split the Input File in separate lines
|
||||||
|
# DO THE LINE SPLIT
|
||||||
|
sec_filings_line=
|
||||||
|
|
||||||
|
# Loop over all lines
|
||||||
|
# you can get the number of lines by computing the length of the list of lines,
|
||||||
|
# i.e. by determining the length of sec_filings_line.
|
||||||
|
for / while : # COMPLETE LOOP
|
||||||
|
|
||||||
|
# Does the line refer to a form 10-K file?
|
||||||
|
if : # USE AN IF CONDITION TO TEST THIS -> see TASKS 7 and 8 of PROBLEM 1
|
||||||
|
|
||||||
|
# Split the line such that the information can be saved in separate
|
||||||
|
# variables
|
||||||
|
# Each information item has a fixed length in the overview files of the
|
||||||
|
# SEC.
|
||||||
|
# SEE SLIDE 18 FOR INFORMATION ON THE LENGTH OF THE SEPARATE COLUMNS.
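# Illustration only (made-up width; the real column widths are on slide 18): if, say, the form
# type occupied the first 12 characters of a line, you could extract it with
# filing_type_example=line[0:12].strip()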
|
||||||
|
|
||||||
|
# COMPLETE THE COMMANDS BELOW
|
||||||
|
filing_type=
|
||||||
|
company_name=
|
||||||
|
cik=
|
||||||
|
filing_date=
|
||||||
|
link=
|
||||||
|
|
||||||
|
# Is the 10-K filed between March 10 and March 20?
|
||||||
|
filing_day=
|
||||||
|
filing_month=
|
||||||
|
# Is the Filing Month March?
|
||||||
|
if : # COMPLETE THE IF-CONDITION
|
||||||
|
# Is the Filing Day between 10 and 20?
|
||||||
|
if : # COMPLETE THE IF-CONDITION
|
||||||
|
# The filing meets the conditions -->
|
||||||
|
# Write output to the csv file
|
||||||
|
output_file.write(filing_type+";"+company_name+";"+cik+";"+filing_date+";"+link+"\n")
|
||||||
|
|
||||||
|
|
||||||
|
# Close your input and output file in the end
|
||||||
|
sec_filings_file.close()
|
||||||
|
output_file.close()
|
||||||
|
|
||||||
|
print("DONE")
|
|
@ -0,0 +1,103 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
Created on Wed Jul 29 11:07:10 2015
|
||||||
|
|
||||||
|
@author: Alexander Hillert, Goethe Uni Frankfurt
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Please adjust the directory to your machine.
|
||||||
|
directory="C:/Lehre/Textual Analysis/Programming/Files/"
|
||||||
|
# It is important to use a single forward slash / but not a single backslash \.
|
||||||
|
|
||||||
|
# For MAC users: your directory will usually start with "/Users/". For example:
|
||||||
|
#directory="/Users/FirstnameLastname/Textual Analysis/Programming/Files/"
|
||||||
|
|
||||||
|
|
||||||
|
# We need the urllib package for the download.
|
||||||
|
import urllib.request
|
||||||
|
# To automatically create folders, we need the os-module (OS: Operating System)
|
||||||
|
import os
|
||||||
|
|
||||||
|
###############################################################################
|
||||||
|
# Technical issue
|
||||||
|
# As of March 2021, the SEC no longer accepts requests by the standard urllib settings
|
||||||
|
# you have to make some adjustments
|
||||||
|
###############################################################################
|
||||||
|
# Define a user agent
|
||||||
|
# Information on user agents are from https://docs.python.org/3/howto/urllib2.html:
|
||||||
|
# "Some websites dislike being browsed by programs, or send different versions
|
||||||
|
# to different browsers. By default urllib identifies itself as Python-urllib/x.y
|
||||||
|
# (where x and y are the major and minor version numbers of the Python release,
|
||||||
|
# e.g. Python-urllib/2.5), which may confuse the site, or just plain not work.
|
||||||
|
# The way a browser identifies itself is through the User-Agent header.
|
||||||
|
opener = urllib.request.build_opener()
|
||||||
|
|
||||||
|
# The SEC recently rejected requests from Python-urllib/x.y user agent (see above)
|
||||||
|
# To still automatically download files, you have different options.
|
||||||
|
# I have listed three examples below but there are many more:
|
||||||
|
# For a comprehensive list see, e.g.:
|
||||||
|
# https://developers.whatismybrowser.com/useragents/explore/software_type_specific/web-browser/
|
||||||
|
#opener.addheaders = [('User-agent', 'Mozilla')]
|
||||||
|
#opener.addheaders = [('User-agent', 'Chrome')]
|
||||||
|
opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)')]
|
||||||
|
urllib.request.install_opener(opener)
|
||||||
|
# END of the technical issues
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# Open the csv file from part 1 of the problem
|
||||||
|
input_file=open(directory+'SEC_Filings_Output.csv','r')
|
||||||
|
input_text=input_file.read()
|
||||||
|
|
||||||
|
# Split the Input File in separate lines
|
||||||
|
input_text_line=input_text.split("\n")
|
||||||
|
|
||||||
|
# Create a subfolder in which the 10-K filings are saved.
|
||||||
|
# When you download a large number of filings I recommend using subfolders for
|
||||||
|
# each year or even for each year-month-day combination.
|
||||||
|
# In this problem, a single subfolder is fine.
|
||||||
|
os.makedirs( COMPLETE THE COMMAND )
|
||||||
|
# See slide 18 for information on the os.-commands!
|
||||||
|
# IN GENERAL, IF YOU SEE AN UNKNOWN COMMAND, GOOGLE IT TO GET INFORMATION.
|
||||||
|
|
||||||
|
# Loop over all lines of the csv file
|
||||||
|
# Like in part 1 of the problem, you can get the number of lines by computing
|
||||||
|
# the length of the list of lines, i.e. by determining the length of input_text_line.
|
||||||
|
for / while: # COMPLETE THE LOOP
|
||||||
|
# split the line into the five variables
|
||||||
|
# THE ; IS THE SEPARATOR IN THE CSV -> USE THE split() COMMAND
|
||||||
|
variables=
|
||||||
|
|
||||||
|
# We only need the cik and the link to download the file.
|
||||||
|
# The cik is the 3rd variable.
|
||||||
|
# The link is the 5th variable
|
||||||
|
cik=
|
||||||
|
link=
|
||||||
|
|
||||||
|
# identify the filename
|
||||||
|
# The link consists of different parts:
|
||||||
|
# For example: edgar/data/1000753/0000950129-98-001035.txt
|
||||||
|
|
||||||
|
link_parts= # USE A SPLIT
|
||||||
|
# 1st part: edgar
|
||||||
|
# 2nd part: data
|
||||||
|
# 3rd part: cik
|
||||||
|
# 4th part: file name -> see next line
|
||||||
|
filename=link_parts[FILL IN THE NUMBER HERE]
|
||||||
|
###########################################################################
|
||||||
|
############################ WARNING ######################################
|
||||||
|
# The filename does NOT uniquely identify the SEC filings as different firms (CIKs)
|
||||||
|
# may use the same filename. Thus, when you only use the filename files
|
||||||
|
# might be overwritten. To avoid this problem you need to have a unique name.
|
||||||
|
# Combining CIK and filename results in a unique identifier, as the
|
||||||
|
# filename appears only once per firm (CIK).
|
||||||
|
# -> use the combination of CIK and filename: cik_filename
|
||||||
|
###########################################################################
|
||||||
|
urllib.request.urlretrieve(TO BE COMPLETED)
|
||||||
|
# See slide 19 for information on the urllib.-commands.
|
||||||
|
|
||||||
|
|
||||||
|
# Close your input file
|
||||||
|
input_file.close()
|
||||||
|
|
||||||
|
print("DONE")
|
|
@@ -0,0 +1,121 @@

# -*- coding: utf-8 -*-
"""
Created on Tue Apr 12 15:50:22 2016

@author: Alexander Hillert, Goethe University Frankfurt
"""

# Import regular expressions and BeautifulSoup
import re
from bs4 import BeautifulSoup

# Please adjust the directory to your machine.
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# It is important to use a single forward slash / but not a single backslash \.
# For MAC users: your directory will usually start with "/Users/". For example:
#directory="/Users/FirstnameLastname/Textual Analysis/Programming/Files/"

# Open the document
input_file=open(directory+'Exercise_4_Application_Regular_Expressions.txt','r',encoding="utf-8")
input_text=input_file.read()

#######################
# Task 1: remove tables
#######################
# Approach
# We search for tables until we find no more html tags that indicate the
# beginning of a table.
# Search for the start html-tag <TABLE>
table_match=re.search(TO BE COMPLETED, input_text)
while : # YOU NEED A LOOP THAT SEARCHES FOR TABLES
    # When we have identified a match, i.e. the start of a table, we save
    # the position of the beginning of the table in the variable "start_table"
    table_start_match=re.search(XXX, input_text)
    start_table=table_start_match.start()
    # Next, we search for the corresponding html tag that indicates the end of
    # the table and save the end position to the variable "end_table"

    # REPEAT THE COMMANDS ABOVE FOR THE END OF TABLE
    table_end_match=
    end_table=

    # We can print the text between the start and end html tag to check whether
    # the table has been identified correctly.
    print("The text below is a table!\n"+input_text[start_table:end_table])

    # The text between the beginning and end of the html tags is the part which
    # we would like to delete.
    # Consequently, we keep the text before the beginning of the table as well
    # as the text after the ending of the table.
    input_text=TO BE COMPLETED
    # Next, we need to check whether there is another table in the rest of the
    # text.
    table_match=re.search(SAME COMMAND AS IN LINE 27, input_text)
    # As long as "table_match" exists, i.e. the regex results in a match, the loop
    # will continue.
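
# A hedged sketch of one possible completion of the loop above (an illustration,
# not the official solution). Assumption: tables are enclosed in <TABLE> ... </TABLE>
# tags; re.IGNORECASE also covers lower-case variants of the tags.
table_match=re.search('<TABLE>', input_text, re.IGNORECASE)
while table_match:
    start_table=re.search('<TABLE>', input_text, re.IGNORECASE).start()
    end_table=re.search('</TABLE>', input_text, re.IGNORECASE).end()
    print("The text below is a table!\n"+input_text[start_table:end_table])
    # keep the text before and after the table
    input_text=input_text[:start_table]+input_text[end_table:]
    table_match=re.search('<TABLE>', input_text, re.IGNORECASE)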

#########################
# Task 2: remove Exhibits
#########################
# Exhibits have the following structure
# <DOCUMENT>
# <TYPE>EX...
# ...
# </DOCUMENT>

# THE APPROACH IS THE SAME AS THE SEARCH FOR TABLES ABOVE
exhibit_match=re.search(, input_text)
while :
    # get the beginning of the exhibit
    exhibit_start_match=
    start_exhibit=
    # As the exhibits are at the end of the 10-K filing it would not be
    # necessary to include an end position. We could also drop the entire text
    # after "<TYPE>EX"
    # However, for completeness, we will define an end
    exhibit_end_match=
    end_exhibit=
    # Print the identified text to check whether the exhibit has been identified
    # correctly
    print("The text below is an exhibit!\n"+input_text[start_exhibit:end_exhibit])

    input_text=TO BE COMPLETED
    # Check whether there are further exhibits
    exhibit_match=re.search(SAME COMMAND AS IN LINE 65, input_text)
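
# A hedged sketch of a corresponding exhibit loop (an illustration, not the
# official solution). Assumption: every exhibit starts with "<TYPE>EX" and runs
# until the next "</DOCUMENT>" tag, so the end tag is searched only in the text
# after the exhibit start.
exhibit_match=re.search('<TYPE>EX', input_text)
while exhibit_match:
    start_exhibit=exhibit_match.start()
    end_exhibit=start_exhibit+re.search('</DOCUMENT>', input_text[start_exhibit:]).end()
    print("The text below is an exhibit!\n"+input_text[start_exhibit:end_exhibit])
    input_text=input_text[:start_exhibit]+input_text[end_exhibit:]
    exhibit_match=re.search('<TYPE>EX', input_text)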

##########################
# Task 3: remove html code
##########################
# Alternative 1: remove html code without Beautiful Soup
text=re.sub(TO BE COMPLETED, '', input_text)
# Use a regex that searches for a "<" followed by at least one character that must not
# equal > and is completed by >.

# Alternative 2: remove html code using Beautiful Soup
html_text=BeautifulSoup(TO BE COMPLETED)
text=html_text.TO BE COMPLETED
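
# Hedged examples of what the two alternatives could look like (sketches under
# the assumption that html tags never contain a ">" inside them):
text=re.sub('<[^>]{1,}>', '', input_text)           # Alternative 1: regex
html_text=BeautifulSoup(input_text, 'html.parser')  # Alternative 2: Beautiful Soup
text=html_text.get_text()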

########################
# Task 4: delete numbers
########################

# YOU MAY NEED MULTIPLE COMMANDS TO DELETE ALL NUMBERS
# Remember that you can have different formats, e.g., 1,234.56 or 0.12 or 1,234,567
text=re.sub(TO BE COMPLETED,'',text)
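
# A possible set of commands (a sketch, not the official solution): first drop
# digit groups with thousands separators and decimals, then any remaining digits.
text=re.sub('[0-9]+(,[0-9]{3})*(\.[0-9]+)?','',text)
text=re.sub('[0-9]','',text)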

########################
# Task 5: delete symbols
########################
text=re.sub(TO BE COMPLETED,'',text)
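
# A hedged sketch of a symbol clean-up (the character class is an assumption
# about which symbols appear in the document and can be extended as needed):
text=re.sub('[\$%&\*#@_\+=<>\|~\^]','',text)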


# Open the output file for the pure text
output_file=open(directory+'Exercise_4_Application_Regular_Expressions_clean.txt','w',encoding="utf-8")
output_file.write(text)

# close all files
input_file.close()
output_file.close()

print("DONE")

@@ -0,0 +1,164 @@

# -*- coding: utf-8 -*-
"""
Created on Tue Apr 12 15:50:22 2016

@author: Alexander Hillert, Goethe University Frankfurt
"""
import re
from bs4 import BeautifulSoup

# Please adjust the directory to your machine.
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# It is important to use a single forward slash / but not a single backslash \.
# For MAC users: your directory will usually start with "/Users/". For example:
#directory="/Users/FirstnameLastname/Textual Analysis/Programming/Files/"

# Open the 10-K
input_file=open(directory+'0000950130-98-001359.txt','r',encoding='ascii',errors='ignore')
input_text=input_file.read()

################################
# Remove tables
# Same approach as in Problem 4
################################
# Sometimes it is helpful to print the text parts that are deleted. In this
# example, we will print the first two tables that we delete.
i=1
table_match=re.search(ENTER THE REGEX, input_text)
while table_match:
    # Search for the beginning of the table
    table_start_match=re.search(REGEX FOR BEGINNING OF TABLE, input_text)
    start_table=
    # search for the end of the table
    table_end_match=REGEX FOR END OF TABLE
    end_table=
    # The if condition and the printing are just for illustrative purposes.
    # The commands display the first two tables that are removed from the text.
    if i<=2:
        print("This is the "+str(i)+". Table in the 10-K.\n"+input_text[start_table:end_table]+"\n")
        i=i+1
    # remove the table from the original text
    input_text=TO BE COMPLETED
    # check whether there are further tables
    # same command as in line 24
    table_match=re.search(XXXXXXX, input_text)

################################
# Remove exhibits
# Same approach as in Problem 4
################################
# Exhibits have the following structure
# <DOCUMENT>
# <TYPE>EX...
# ...
# </DOCUMENT>
# Sometimes it is helpful to print the text parts that are deleted. In this
# example, we will print the first exhibit that we delete.
i=1
exhibit_match=re.search(ENTER THE REGEX, input_text)
while exhibit_match:
    # Search for the beginning of the exhibit
    exhibit_start_match=re.search(REGEX FOR BEGINNING OF EXHIBIT, input_text)
    start_exhibit=
    # Search for the end of the exhibit
    # CAUTION: search only in the text after the beginning of the exhibit, as
    # the end-term also appears earlier (e.g. end of main document)
    exhibit_end_match=re.search(REGEX FOR END OF EXHIBIT, input_text[START OF EXHIBIT UNTIL END OF TEXT])
    end_exhibit=
    if i<=1:
        print("This is the "+str(i)+". Exhibit in the 10-K.\n"+input_text[start_exhibit:end_exhibit]+"\n")
        i=i+1
    # remove exhibit from the original text
    input_text=
    # check whether there are further exhibits
    # same command as in line 55
    exhibit_match=re.search(XXXXXXX, input_text)

##################
# Remove html code
##################
# you can use BeautifulSoup for simplicity
html_text=BeautifulSoup(input_text, 'html.parser')
text=html_text.get_text()

############################
# Remove the Document Header
############################
# There are different possibilities for how one can define the start of the main part of the text.
# In general, you should delete all text that is uninformative for your analysis.
header_match=re.search(END OF DOCUMENT HEADER, text)
if header_match:
    # Drop the document header and keep only the rest of the text after the header.
    text=text[XXXXXXXXXXXXXXX]
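
# A hedged sketch of what the completion could look like (an illustration only;
# the marker that ends the header is an assumption - inspect the filing and pick
# a phrase that reliably separates the header from the main text):
header_match=re.search('FORM 10-K', text)
if header_match:
    text=text[header_match.end():]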


#################################################
# Delete the text in "PART IV"
# This procedure is optional. Look at "Part IV" and decide whether you favor
# the approach. I think that the part should be dropped, as it is just a list
# of exhibits, some mandatory text required by the SEC [indicated by the
# capital letters in the "SIGNATURES" section].
#################################################
'''
# Alternative 1: go over all matches but keep only the last one
for match in re.finditer('\s{2,}PART IV\s{0,}\n', text):
    pass
# match now contains the last match.
# Delete the text after the last match
text=text[:match.start()]

# Alternative 2: save the positions of all matches (more general approach)
list_start_matches=[]
list_end_matches=[]
for match in re.finditer('\s{2,}PART IV\s{0,}\n', text):
    list_start_matches.append(match.start())
    list_end_matches.append(match.end())
# Position of last match
print(list_start_matches[len(list_start_matches)-1])
print(list_end_matches[len(list_start_matches)-1])

# Delete the text after the last match
text=text[:list_start_matches[len(list_start_matches)-1]]
'''

# Delete item numbers
# This is optional. It removes "Item 1.", "ITEM 1.", "Item 10.", "Item 7A."
text=re.sub(TO BE COMPLETED,'',text)
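
# A possible regex for the optional item-number removal (a sketch; it covers the
# patterns quoted above, i.e. "Item 1.", "ITEM 1.", "Item 10.", and "Item 7A."):
text=re.sub('(Item|ITEM) [0-9]{1,2}[A-Z]?\.','',text)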

# Delete numbers
# You can use the code from Problem 4.
text=re.sub(TO BE COMPLETED,'',text)


# Hyphens can be used to indicate that the word is continued in the next
# line. For example, "Micro-\nsoft" (\n is the line feed).
# Delete hyphens that are followed by a line feed.
text=re.sub(TO BE COMPLETED,'',text)
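
# A hedged sketch for re-joining hyphenated line breaks, so that "Micro-\nsoft"
# becomes "Microsoft" again:
text=re.sub('-\n','',text)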

# Delete symbols
# You can use the code from Problem 4.
text=re.sub(TO BE COMPLETED,'',text)

# Delete dots and commas that are not part of sentences, i.e. commas and dots
# that are preceded by whitespace or line break and that are followed by
# whitespace or line break.
text=re.sub('\n(\.|,)\n','\n',text)

# Drop single-character words
# One can argue whether one should implement this procedure. Loughran and
# McDonald argue in one of their papers in favor of it.
# To make sure that there is just one letter, we require that there is a word
# boundary (\W) before and after. We use a positive lookbehind and a positive
# lookahead condition for this to ensure that the word boundaries are not
# deleted as well.
text=re.sub(TO BE COMPLETED,' ',text)
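
# A hedged sketch of such a substitution (the lookbehind and lookahead keep the
# surrounding non-word characters; only the single letter is replaced):
text=re.sub('(?<=\W)[a-zA-Z](?=\W)',' ',text)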


# Open the output file for the pure text
output_file=open(directory+'0000950130-98-001359_clean.txt','w',encoding='ascii',errors='ignore')
output_file.write(text)

input_file.close()
output_file.close()
print("COMPLETED.")

117
lectures/programming/templates/Problem_7_Tone_Analysis_form.py
Normal file
@@ -0,0 +1,117 @@

# -*- coding: utf-8 -*-
"""
Created on Wed Apr 13 22:43:32 2016

@author: Alexander Hillert, Goethe University Frankfurt
"""

import re

# Please adjust the directory to your machine.
directory="C:/Lehre/Textual Analysis/Programming/Files/"

# Open the dictionary
# The dictionary has been obtained from Bill McDonald's webpage
# http://www3.nd.edu/~mcdonald/Word_Lists.html
# --> LoughranMcDonald_MasterDictionary_2014.xlsx
# --> select negative words and copy them to a txt file
file_word_list=open(directory+'LMD_Neg.txt','r',encoding="utf-8")
word_list=file_word_list.read()
# LOOK AT THE FILE. ARE THE WORDS IN UPPER OR IN LOWER CASE?
# MAKE SURE THAT YOU USE A CONSISTENT FORMAT FOR THE TEXT AND THE DICTIONARY.
# THE COMMANDS ARE .lower() AND .upper().

# CREATE A LIST OF NEGATIVE WORDS -> SPLIT THE TEXT
negative_words=word_list.XXXX
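
# A possible completion (a sketch, not the official solution). Assumption: the
# txt file contains the LMD words in upper case, so the word list and, later,
# the 10-K text are both converted to lower case before splitting.
negative_words=word_list.lower().split()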


# Open the csv file containing the list of the 200 10-Ks
input_file=open(directory+'10-K_Sample_2011Q1_Input.csv','r',encoding="utf-8")
input_text=input_file.read()

# Split the input file in separate lines
input_text_line=input_text.split("\n")

# In general, there can be empty lines in the input file. The following command
# deletes these lines.
while input_text_line.count("")>0:
    input_text_line.remove("")

# Create output file
output_file=open(directory+'10-K_Sample_2011Q1_Output_Negative_Tone.csv','w',encoding="utf-8")
# Write variable names to the first line of the output file
output_file.write('CIK;Filename;Number_Words;Number_Negative_Words;\
Percentage_Negative_Words\n')

# Loop over all lines of the csv file
for i in range(1,len(input_text_line)):
    # If the execution of your scripts takes some time, printing the loop iterator
    # gives you an impression of the overall progress made.
    print(str(i))

    # split the line into the two variables
    variables=input_text_line[i].split(";")
    # We need the CIK (1st column) and the filename (2nd column)
    cik=variables[0]
    filename=variables[1]

    # modify file name to open the edited files
    filename=filename.replace('.txt','')
    # Open the ith 10-K in the list
    input_file_10_k=open(directory+'10-K_Sample_clean/'+cik+'_'+filename+'_clean.txt','r',\
    encoding='ascii',errors='ignore')
    input_text_10_k=input_file_10_k.read()

    # CONVERT THE TEXT TO UPPER OR LOWER CASE (see comment above)
    # It is important that the formatting (lower case vs. upper case) of the word list
    # and the document is identical. Remember that you have typically lower and upper case
    # letters in documents -> modify text
    text=input_text_10_k.XXXXXX

    # Split the text in words to determine the total number of words
    # LOOK AT THE REGEX INTRODUCTION FOR A SUITABLE SPLIT VARIABLE.
    list_of_words=re.split(XXXXX, text)

    # ARE THERE EMPTY ELEMENTS IN THE LIST OF WORDS?
    # Make sure that empty list elements do not bias the word count -> delete them!
    # You can use an approach similar to the one in lines 37 and 38.
    COMMANDS TO BE ADDED

    # Determine the total number of words
    # COUNT THE NUMBER OF ELEMENTS IN list_of_words
    word_count=XXXX
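
    # A hedged sketch of the steps above (one possible completion, not the
    # official solution): lower-case the text, split on non-word characters,
    # drop empty elements, and count what is left.
    text=input_text_10_k.lower()
    list_of_words=re.split('\W{1,}', text)
    while list_of_words.count("")>0:
        list_of_words.remove("")
    word_count=len(list_of_words)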

    # Reset the number of negative words to zero
    negative_count=0
    # For each negative word, count the number of occurrences
    for j in range(len(negative_words)):

        HERE YOU NEED TO COUNT HOW OFTEN THE jth NEGATIVE WORD IS FOUND IN THE TEXT.
        COMPARE THE TWO CASES BELOW -> EXECUTE THE COMMANDS (see lines below) IN
        THE COMMAND LINE AND COMPARE THE RESULTS.
        WHICH ALTERNATIVE IS THE RIGHT APPROACH?

        ALTERNATIVE 1:
        list_of_words=["abandon","abandoned","abandonment"]
        list_of_words.count("abandon")
        ALTERNATIVE 2:
        text_of_words="abandon abandoned abandonment"
        text_of_words.count("abandon")

        ADD THE CORRECT COUNT OF NEGATIVE WORD j TO YOUR OVERALL COUNT.
        negative_count=negative_count+XXXXX
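
        # For comparison: ALTERNATIVE 1 returns 1 (only the exact word "abandon"),
        # while ALTERNATIVE 2 returns 3, because str.count() also matches "abandon"
        # inside "abandoned" and "abandonment". The exact, word-based count is the
        # one needed here. A hedged sketch of the completion:
        negative_count=negative_count+list_of_words.count(negative_words[j])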

    # Get the percentage of negative words
    percentage_negative=negative_count/word_count

    # Write cik, file name, total number of words, number of negative words,
    # and the percentage of negative words to output file.
    output_file.write(cik+';'+filename+'_clean.txt;'+str(word_count)+';'\
    +str(negative_count)+';'+str(percentage_negative)+'\n')

    # Close filings
    input_file_10_k.close()

print("Finished")
output_file.close()
input_file.close()

@@ -0,0 +1,131 @@

# -*- coding: utf-8 -*-
"""
Created on Wed Apr 13 22:43:32 2016

@author: Alexander Hillert
"""

import re

# Please adjust the directory to your machine.
directory="C:/Lehre/Textual Analysis/Programming/Files/"

# Open the dictionary
# The dictionary is obtained from Bill McDonald's webpage
# http://www3.nd.edu/~mcdonald/Word_Lists.html
# --> LoughranMcDonald_MasterDictionary_2014.xlsx
# --> select positive words and copy them to a txt file
file_word_list=open(directory+'LMD_Pos.txt','r',encoding="utf-8")
word_list=file_word_list.read()

# LIKE IN PROBLEM 7, YOU HAVE TO APPLY A CONSISTENT FORMAT TO BOTH THE LMD-WORDS
# AND THE TEXT OF THE 10-Ks.
positive_words=word_list.split()

# Open the csv file containing the list of the 200 10-Ks
input_file=open(directory+'10-K_Sample_2011Q1_Input.csv','r',encoding="utf-8")
input_text=input_file.read()

# Split the Input File in separate lines
input_text_line=input_text.split("\n")

# In general, there can be empty lines in the input file. The following command
# deletes these lines.
while input_text_line.count("")>0:
    input_text_line.remove("")

# Create output file
output_file=open(directory+'10-K_Sample_2011Q1_Output_Positive_Tone.csv','w',encoding="utf-8")
# Write variable names to the first line of the output file
output_file.write('CIK;Filename;Number_Words;Number_Pos_Words;Number_Pos_Words_adj;'\
+'Percent_Pos_Words;Percent_Pos_Words_adj\n')


# Iterate the list of the 200 10-K filings
for i in range(1,len(input_text_line)):
    # If the execution of your scripts takes some time, printing the iterator
    # gives you an impression of the overall progress made.
    print(str(i))

    # split the line into the two variables
    variables=input_text_line[i].split(";")
    # We need the CIK (1st column) and the filename (2nd column)
    cik=variables[0]
    filename=variables[1]

    # modify file name to open the edited files
    filename=filename.replace('.txt','')

    # Open the ith 10-K in the list
    input_file_10_k=open(directory+'/10-K_Sample_clean/'+cik+"_"+filename+'_clean.txt','r',\
    encoding='ascii',errors='ignore')
    input_text_10_k=input_file_10_k.read()

    # It is important that the formatting (lower case vs. upper case) of the word list
    # and the document are identical. Remember that you have typically lower and upper case
    # letters in documents -> modify text
    text=XXXX # CONSISTENT FORMAT

    # Split the text in single words to determine the total number of words
    list_of_words=re.split(XXXX, text) # USE THE SAME COMMAND AS IN PROBLEM 7

    # ARE THERE EMPTY ELEMENTS IN THE LIST OF WORDS?
    # Make sure that empty list elements do not bias the word count -> delete them!
    # You can use an approach similar to the one in lines 34 and 35.
    COMMANDS TO BE ADDED

    # Determine total number of words
    word_count=XXXX # SAME COMMAND AS IN PROBLEM 7

    # Reset the number of positive words and positive words adj. for negations to zero.
    positive_count=0
    positive_count_adj=0
    # For each positive word, count the number of occurrences
    for j in range(len(positive_words)):
        # standard count operation without controlling for negations
        positive_words_found=list_of_words.count(positive_words[j])

        # Loughran and McDonald (2011, JF, p.44): "We account for simple negation
        # only for Fin-Pos words. Simple negation is taken to be observations
        # of one of six words (no, not, none, neither, never, nobody) occurring
        # within three words preceding a positive word."

        # When we have identified positive words we need to search for negations
        while positive_words_found>0:
            # identify the position of the matched positive word in the list of all words
            position_of_word=list_of_words.XXXXX # THE COMMAND .index() IS HELPFUL HERE

            # identify the three words before the positive word and add them to a list
            list_negation=[3_WORDS_BEFORE_MATCH,2_WORDS_BEFORE_MATCH,1_WORD_BEFORE_MATCH]
            # REPLACE THE THREE PLACEHOLDERS BY THE CORRESPONDING ELEMENTS OF list_of_words

            # check whether one of the three words in list_negation is a negation
            negation_found=list_negation.count('no')+list_negation.count('not')+XXXX TO BE COMPLETED

            if negation_found==0:
                # no negation
                positive_count_adj=positive_count_adj+1
                positive_count=positive_count+1
            else:
                # negation
                positive_count=positive_count+1

            # delete the matched positive words in the original document
            list_of_words[position_of_word]=XXX
            # THIS OPERATION IS IMPORTANT BECAUSE OTHERWISE WE WILL GET AN ENDLESS LOOP

            # check whether there are further matches of the jth positive word
            positive_words_found=list_of_words.count(positive_words[j])
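
        # A hedged sketch of the complete negation check above (one possible
        # completion, not the official solution; the slice with max(...,0) guards
        # against positive words among the first three words of the document):
        while positive_words_found>0:
            position_of_word=list_of_words.index(positive_words[j])
            list_negation=list_of_words[max(position_of_word-3,0):position_of_word]
            negation_found=list_negation.count('no')+list_negation.count('not')+\
                list_negation.count('none')+list_negation.count('neither')+\
                list_negation.count('never')+list_negation.count('nobody')
            if negation_found==0:
                positive_count_adj=positive_count_adj+1
            positive_count=positive_count+1
            # overwrite the matched word so that .index() and .count() move on
            list_of_words[position_of_word]=""
            positive_words_found=list_of_words.count(positive_words[j])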

    # Write cik, file name, total number of words, and number of positive
    # and adjusted positive words to the output file
    output_file.write(cik+';'+filename+'_clean.txt;'+str(word_count)+';'+\
    str(positive_count)+';'+str(positive_count_adj)+';'+str(positive_count/word_count)+\
    ';'+str(positive_count_adj/word_count)+'\n')

    # Close filings
    input_file_10_k.close()

print("Finished")
output_file.close()
input_file.close()

@@ -0,0 +1,70 @@

# -*- coding: utf-8 -*-
"""
Created on Wed Apr 13 22:43:32 2016

@author: Alexander Hillert, Goethe University Frankfurt
"""

# We split the text into words and sentences using regular expressions
import re

directory="C:/Lehre/Textual Analysis/Programming/Files/"

# Open the csv file containing the list of the 200 10-Ks
input_file=open(directory+'10-K_Sample_2011Q1_Input.csv','r',encoding="utf-8")
input_text=input_file.read()

# Create output file
output_file=open(directory+'10-K_Sample_2011Q1_Output_WPS.csv','w',encoding="utf-8")
# Write variable names to the first line of the output file
output_file.write('CIK;Filename;Number_Words;Number_of_Sentences;WPS\n')

# Split the Input File in separate lines
input_text_line=input_text.split("\n")

# In general, there can be empty lines in the input file. The following command
# deletes these lines.
while input_text_line.count("")>0:
    input_text_line.remove("")

# Loop over all lines
for i in range(1,len(input_text_line)):
    print(str(i))
    # split the line into the two variables
    variables=input_text_line[i].split(";")
    # We need the CIK and the filename
    cik=variables[0]
    filename=variables[1]
    filename=filename.replace('.txt','')

    # Open the ith 10-K in the list
    input_file_10_k=open(directory+'10-K_Sample_clean/'+cik+"_"+filename+'_clean.txt','r',\
    encoding='ascii',errors='ignore')
    text=input_file_10_k.read()

    # Determine number of sentences and number of words
    # DETERMINE THE NUMBER OF WORDS; YOU KNOW THE COMMAND FROM PROBLEMS 7 AND 8.
    list_of_words=re.split(XXX, text)
    # Determine total number of words
    word_count=XXX
    # Split the text by symbols that indicate the end of a sentence
    # to determine the total number of sentences
    list_of_sentences=re.split(XXX, text)
    # Determine total number of sentences
    sentence_count=XXX
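
    # A hedged sketch of the four placeholders above (one possible completion;
    # treating ".", "!" and "?" as sentence ends is an assumption - colons or
    # semicolons could be added):
    list_of_words=re.split('\W{1,}', text)
    while list_of_words.count("")>0:
        list_of_words.remove("")
    word_count=len(list_of_words)
    list_of_sentences=re.split('[.!?]+', text)
    while list_of_sentences.count("")>0:
        list_of_sentences.remove("")
    sentence_count=len(list_of_sentences)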

    # Ratio of # of words over # of sentences
    wps=word_count/sentence_count

    # Write cik, file name, total number of words, total number of sentences,
    # and WPS to the output file
    output_file.write(cik+';'+filename+'_clean.txt;'+str(word_count)+';'+\
    str(sentence_count)+';'+str(wps)+'\n')

    # Close filing
    input_file_10_k.close()

print("Finished")
output_file.close()
input_file.close()