
Add programming files

- add the code files provided by the instructor
- the programming/files folder with the data files is NOT included
  here due to its size
- add a .gitignore file to exclude the data files' folder
This commit is contained in:
Alexander Hess 2022-08-05 00:05:05 +02:00
parent 65aae9d4f9
commit a37c87d9c8
Signed by: alexander
GPG key ID: 344EA5AB10D868E0
38 changed files with 6416 additions and 0 deletions

.gitignore vendored Normal file

@@ -0,0 +1 @@
lectures/programming/files


@@ -0,0 +1,9 @@
# Programming Files
This folder holds various programming files provided by the instructor:
- introductions to programming techniques
- problem sets
- solutions for the problem sets
In addition, the instructor provided various data files
that are too big to be stored in this repository.


@@ -0,0 +1,270 @@
# -*- coding: utf-8 -*-
"""
Created on Tue Jul 11 09:19:54 2017
@author: Alexander Hillert, Goethe University Frankfurt
This version: February 22, 2019
This is an introduction to two data containers: lists and counters.
Python has several built-in data containers, e.g., sets, dictionaries, and lists.
In addition to these containers, there are further types.
For textual analysis applications, counters are helpful.
This introduction covers lists in the first part.
The second part introduces the basics of counters.
"""
# for counters, you need to import collections
import collections
import re
###############################################################################
# Introduction on data containers
###############################################################################
#################################
# Part 1: lists
#################################
# Create an empty list
empty_list=[]
# Create non-empty lists
string_list=["a", "b", "c"]
mixed_list=[1, "ab", -4,"hello"]
print(mixed_list)
# Call items of a list
print(string_list[0])
print(string_list[2])
print(string_list[-1])
# Length of a list
length=len(string_list)
print("The length of the list is: "+str(length))
# ADD ITEMS TO A LIST
# ALTERNATIVE 1: insert -> you can specify the position
string_list.insert(1,"d")
# you cannot add multiple elements with the insert command
# You can try, but it will not work
# 1st try
string_list.insert(3,"e" "f") # -> the new element is "ef"
print(string_list)
# 2nd try
try:
string_list.insert(3,"e", "f")
except:
print("Wrong syntax. If the command were executed without the try-except "\
"you would get the error TypeError: insert() takes exactly 2 arguments (3 given)'")
# 3rd try
string_list.insert(3, ["e", "f"])
# check length
print("The length of the list is: "+str(len(string_list))) # -> only 6 and not 7
print(string_list[3])
# So element 3 of the list is another list
# You can call the elements of the sub-list
print("First element of sub list: "+string_list[3][0]+" and second element of \
sub list: "+string_list[3][1])
# Reset string_list to keep things easily tractable
string_list=["a", "b", "c"]
# ALTERNATIVE 2: append -> items are added at the end
string_list.append("d")
# Try to add multiple items
# 1st try
string_list.append("e" "f") # -> the new element is "ef"
print(string_list)
# 2nd try
try:
string_list.append("e", "f")
except:
print("Wrong syntax. If the command were executed without the try-except "\
"you would get the error 'TypeError: append() takes exactly one argument (2 given)'")
# 3rd try
string_list.append(["e", "f"])
# check length
print("length of list is "+str(len(string_list))) # -> only 6 and not 7
print(string_list[len(string_list)-1])
# -> the last element of the list is another list
# You can call the elements of the sub-list
print("First element of sub list: "+string_list[len(string_list)-1][0]+" and \
second element of sub list: "+string_list[len(string_list)-1][1])
# Reset string_list to keep things easily tractable
string_list=["a", "b", "c"]
# ALTERNATIVE 3: extend -> items are added at the end
string_list.extend("d")
# Try to add multiple items
# 1st try
string_list.extend("e" "f") # -> Two elements are created -> works!!!
print(string_list)
# 2nd try
try:
string_list.extend("e", "f")
except:
print("Wrong syntax. If the command were executed without the try-except "\
"you would get the error 'TypeError: extend() takes exactly one argument (2 given)'")
# 3rd try
string_list.extend(["e", "f"])
print(string_list) # -> also works!!!
# check length
print("length of list is "+str(len(string_list))) # -> it is 8 and should be 8
# DELETE ITEMS FROM A LIST
string_list.remove("a")
print("List after deletion of 'a' "+str(string_list))
# What happens if an element occurs multiple times
string_list.remove("e")
print("List after further deletion of 'e' "+str(string_list))
# --> only the first occurrence of "e" is deleted
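# Aside (not part of the original script): .remove() only deletes the first
# occurrence. To delete ALL occurrences of an element, a list comprehension is a
# common idiom:
list_without_f=[element for element in string_list if element!="f"]
print("List after deleting all occurrences of 'f' "+str(list_without_f))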
# FURTHER OPERATIONS WITH LISTS
# Accessing parts of a list
# Remember the first element is [0]! And the upper bound of the range is not
# included, i.e. [0:3] means [0], [1] and [2].
print("Sublist from beginning to third element: "+str(string_list[0:3]))
print("Sublist from beginning to third element: "+str(string_list[:3]))
print("Sublist from second(!) to third element: "+str(string_list[1:3]))
print("Sublist from fourth(!) to fifth element: "+str(string_list[3:5]))
print("Sublist from fifth(!) to the end: "+str(string_list[4:]))
# Search in lists
position=string_list.index("b")
print("Position of 'b' is: "+str(position))
# Searching for an element that is not part of the list
try:
string_list.index("a")
except:
print("Error message. If the command were executed without the try-except "\
"you would get the error 'ValueError: 'a' is not in list'")
if "c" in string_list:
print("'c' is at position: "+str(string_list.index("c")))
# Sort list
string_list.sort()
print('Sorted list: '+str(string_list))
string_list.sort(reverse=True)
print('Reversely sorted list: '+str(string_list))
# What happens when sorting mixed (i.e. integers and strings) lists?
try:
mixed_list.sort()
except:
print("Error message. If the command were executed without the try-except "\
"you would get the error 'TypeError: unorderable types: str() < int()'")
#################################
# Part 2: counters
#################################
'''
A Counter is a dictionary subclass for counting hashable objects.
It is an unordered collection where elements are stored as dictionary keys and
their counts are stored as dictionary values.
'''
# Creating a counter
counter_obj=collections.Counter(["a", "b", "c", "d", "a", "b", "a"])
print('The counter object is: '+str(counter_obj))
# The previous command is equivalent to
counter_obj=collections.Counter(a=3, b=2, c=1, d=1)
print('The counter object (2nd command) is: '+str(counter_obj))
# Add objects to a counter
counter_obj.update(["e", "f", "e"])
print('The updated counter object is: '+str(counter_obj))
# Alternative: assign a count directly (this sets the count of "g" to 4)
counter_obj["g"]=4
print('The counter object after the second update is: '+str(counter_obj))
# Length of the counter
length=len(counter_obj)
print('The length of the counter is: '+str(length))
# Loop over the elements of the counter and their frequency
i=1
for element in counter_obj:
print("Element "+str(i)+" of the counter: "+str(element))
print("Frequency of Element "+str(i)+" of the counter: "+str(counter_obj[element]))
i=i+1
# .elements() provides an iterator of all individual elements of the counter
counter_elements=list(counter_obj.elements())
print('Elements of the counter: '+str(counter_elements))
# APPLY COUNTERS TO TEXTS
sentence1="This is the first sentence."
sentence2="This is the second sentence, which is longer."
# Split sentences in words
sentence1_words=re.split("\W{1,}", sentence1)
print("The last element is: "+str(sentence1_words[len(sentence1_words)-1]))
# The last element is empty -> delete it.
sentence1_words.remove("")
print("The last element is: "+str(sentence1_words[len(sentence1_words)-1]))
# -> now okay
sentence2_words=re.split("\W{1,}", sentence2)
print("The last element is: "+str(sentence2_words[len(sentence2_words)-1]))
# The last element is empty -> delete it.
sentence2_words.remove("")
print("The last element is: "+str(sentence2_words[len(sentence2_words)-1]))
# -> now okay
# Count words
sentence1_counter=collections.Counter(sentence1_words)
sentence2_counter=collections.Counter(sentence2_words)
print(sentence1_counter)
print(sentence2_counter)
# OPERATIONS WITH COUNTERS
# add counters
add_counters=sentence1_counter+sentence2_counter
print("You can add counters: "+str(add_counters))
# subtract counters
subtract_counters=sentence1_counter-sentence2_counter
print("You can subtract counters: "+str(subtract_counters))
# Each time a new Counter is produced through an operation, any items with zero
# or negative counts are discarded. --> only "first" appears in subtract_counters
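# Aside (not part of the original script): if you want to keep items with zero or
# negative counts, you can use the .subtract() method, which updates a counter in place.
subtract_counters_keep=collections.Counter(sentence1_words)
subtract_counters_keep.subtract(sentence2_counter)
print("Subtraction keeping zero/negative counts: "+str(subtract_counters_keep))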
# Intersection of counters
intersection_counters=sentence1_counter & sentence2_counter
print("You can determine the intersection of counters: "+str(intersection_counters))
# -> takes the minimum of occurrences; again elements with zero frequency
# are not included.
# Union of counters
union_counters=sentence1_counter | sentence2_counter
print("You can determine the union of counters: "+str(union_counters))
# -> takes the maximum of occurrences
# MOST FREQUENT WORDS
# Determine the three most frequent words in the add_counters set.
top_3_words=add_counters.most_common(3)
print("The top 3 words are: "+str(top_3_words))
# Identify the two most frequent words using the top 4 words in the add_counters sample.
top_4_words=add_counters.most_common(4)
# The first [] refers to the rank, i.e., whether it is the most frequent, second most
# frequent, etc. word.
# The second[] refers either to the word itself [0] or to the frequency of the word [1].
# the most frequent word
top_word=top_4_words[0][0]
top_word_count=top_4_words[0][1]
print("The top word is '"+str(top_word)+"', which appears "+str(top_word_count)+" times")
# the second most frequent word
top_2_word=top_4_words[1][0]
top_2_word_count=top_4_words[1][1]
print("The second most frequent word is '"+str(top_2_word)+"', which appears "+str(top_2_word_count)+" times")
print("Completed")


@@ -0,0 +1,447 @@
# -*- coding: utf-8 -*-
"""
INTRODUCTION TO REGULAR EXPRESSIONS
@author: Alexander Hillert, Goethe University Frankfurt
This version: June 3, 2019
What are regular expressions?
Regular expressions allow you to search for general patterns in texts. The
standard string commands like .count("search_term") and .replace("old_word","new_word")
can only count and replace one specific word, respectively. They cannot search
for general patterns like all words that consist of three or more letters.
Assume that you want to identify all numbers in a text or that you search for
the year of birth in bios of corporate executives. In the examples, you need a
search tool that can process broad patterns --> you need regular expressions.
Consider the second example, i.e. you would like to automatically identify
people's year of birth from their bios. You know that the number must have four
digits and that the first two digits must equal 19. Of course, you could
hardcode all possible years (1900, 1901, ..., 1999), but this is unnecessarily
complicated and slows down the program. Therefore, it is better to learn
how to use regex.
Useful online resources:
1. https://regex101.com/
On this webpage, you can enter a text and a regular expression.
The webpage highlights the matches and provides explanations for
every part of the regex pattern.
Caution: click on "Python" in the left menu (the default language is php)!
2. https://docs.python.org/3/library/re.html
The official documentation of regular expressions in Python 3.
"""
# To be able to use regular expressions you need to import the re package first.
import re
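# Note (added remark, not part of the original script): it is good practice to write
# regex patterns as raw strings, e.g. r"\W{1,}" instead of "\W{1,}", so that Python
# does not interpret the backslashes as the start of escape sequences. The plain
# strings used below still work for these patterns, but newer Python versions issue
# warnings for unrecognized escapes such as "\W".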
# Select the directory where you saved the accompanying txt-file.
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# In this introduction, we use the accompanying txt-file "Text_Introduction_Regular_Expressions.txt"
# open the file
text_file=open(directory+'Text_Introduction_Regular_Expressions.txt','r',encoding='cp1252')
# read its content
text=text_file.read()
# Let's start with the example from the beginning and search for people's years of birth.
# The standard search command for regular expressions is re.search. It searches
# for the FIRST match of the expression in the text.
# First try
match=re.search("19[0-9]{2}",text)
# This command searches for four digits of which the first is a 1, the second a 9,
# and then there are two further digits which can be any digits.
# [0-9] refers to any digit. Equivalently, you can write \d which also refers
# to any digits.
# The {2} specifies that there must be exactly two digits.
print(match)
# match contains information on the match:
# span is the position in text where the match starts and ends; here 226 and 230
# furthermore, the matched text is shown. Here, the first match is 1956.
# You can use the positions to print the text before the match, after the match,
# and, of course, of the matched text.
start=match.start()
end=match.end()
print("From beginning of the document to the match: \n"+text[:start]+"\n\n")
print("The match itself: \n"+text[start:end]+"\n\n")
print("From end of match to end of document: \n"+text[end:]+"\n\n")
# To access the match, you can also use the command .group(0):
print("Alternative way to access the matched text: \n"+match.group(0)+"\n\n")
# CAUTION
# If no match is found the variable match does not exist.
# Example: search for a ten-digit number that starts with 19
match=re.search("19[0-9]{8}",text)
# The command start=match.start() returns the following error:
# "AttributeError: 'NoneType' object has no attribute 'start'"
# SOLUTION
match=re.search("19[0-9]{8}",text)
if match:
# match found; the call to .start() is now conditional on the existence of a match
start=match.start()
print("Match found. Starting at position "+str(start))
else:
# no match found
print("No match found")
'''
Information on Syntax, Special Characters in Regular Expression
Character Meaning
[] Indicates a set of characters
\[ Matches the actual [
\] Matches the actual ]
^ negation (as the first character inside []); the symbols listed afterwards are not allowed in the match
E.g., [^0-9] will not match any numbers but all other symbols.
\d Any digit, i.e. 0, 1, 2, ..., 9. Equivalent to [0-9]
\n Linefeed/newline, the start of a new line.
\s Any whitespace, i.e. a tab, a space.
CAUTION: \s also matches the newline (\n). This property of \s
can lead to unintended matches.
RECOMMENDATION: to match only spaces and tabs (but not newlines), use [ \t],
i.e. a space and a tab (\t).
\S Any non-whitespace symbol.
. Any character (digit, letter, symbol [!,?,%,etc.], spaces) but
NOT the newline, \n.
\. Matches the actual dot.
\w Matches word characters, i.e. [0-9a-zA-Z_]
The underscore (_) is defined to be a word character.
\W Matches any non-word characters, i.e. [^0-9a-zA-Z_]
| Or condition (for an example see line 272)
() Like in math: parentheses indicate which characters of an expression
belong together. (For an example see line 272.)
\( Matches the actual (
\) Matches the actual )
(?i) Performs the regex case-insensitive. Must be put at the beginning
of the regex. E.g. re.search("(?i)TeSt",text) will match
TEST, test, Test, etc.
re.IGNORECASE Performs the regex case-insensitively. Must be passed as an additional
argument (flag) to the re function. E.g. re.search("test",text,re.IGNORECASE)
'''
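# Aside (not part of the original script): a quick check of the recommendation from
# the table above - \s also matches the newline, while [ \t] does not.
match=re.search("\s","line1\nline2")
print(match)  # matches the newline between the two words
match=re.search("[ \t]","line1\nline2")
print(match)  # no match -> prints None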
# Examples of character sets
# 1. [0-9]: numbers
match=re.search("[0-9]","ABC abc 123")
print(match)
#2. [a-z]: any lower case letter
match=re.search("[a-z]","ABC abc 123")
print(match)
#3. [A-Z]: any upper case letter
match=re.search("[A-Z]","ABC abc 123")
print(match)
#4. [cde]: lower case letters c, d, and e.
match=re.search("[cde]","ABC abc 123")
print(match)
#5. [^A-Zab]: all symbols except capital letters and a and b.
match=re.search("[^A-Zab]","ABC abc 123")
print(match)
# you don't see any character because the match is the first white space before abc
'''
Quantifiers for regular expression:
n and m refer to non-negative integers (0, 1, 2, ...), where m>n
Quantifier Meaning
{n} The preceding pattern must be found EXACTLY n times.
{n,} The preceding pattern must be found AT LEAST n times.
{,n} The preceding pattern must be found AT MOST n times.
{n,m} The preceding pattern must be found AT LEAST n but AT MOST m times.
{n,}? The ? tells the regex not to be "greedy" (see lines 211 for details)
There are alternative notations for commonly used quantifiers:
* is equivalent to {0,}, i.e. 0 or more repetitions of the preceding pattern.
+ is equivalent to {1,}, i.e. 1 or more repetitions of the preceding pattern.
? is equivalent to {0,1}, i.e. 0 or 1 repetition of the preceding pattern.
'''
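# Aside (not part of the original script): the shorthand quantifiers produce the same
# matches as their {} counterparts, e.g. for "one or more digits":
print(re.search("[0-9]{1,}","abc 1234 def").group(0))
print(re.search("[0-9]+","abc 1234 def").group(0))
# -> both print 1234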
# re.search() returns only the first match: How to get all matches?
# Alternative 1: use a loop.
text1=text
i=1
match=re.search("19[0-9]{2}",text1)
# Repeat the following commands until no more matches are found.
while match:
print("This is match number "+str(i)+": "+match.group(0))
# Check whether there are further matches after the end of the previous match
end=match.end()
text1=text1[end:]
match=re.search("19[0-9]{2}",text1)
i=i+1
# Alternative 2: use re.findall
# The syntax is identical to re.search
list_of_matches=re.findall("19[0-9]{2}",text)
print(list_of_matches)
# the individual matches can be called by list_of_matches[i], where i ranges
# from zero to the number of matches minus one.
# Remember: the first element of a list has the position 0
for i in range(0,len(list_of_matches)):
print("This is match number "+str(i+1)+" using the re.findall command: "+list_of_matches[i])
# When you read the text you will observe that there are only six years of birth
# in the text and not eight -> there are two mismatches -> adjust filter to
# get only the years of birth and not all years.
text1=text
i=1
# Check whether the word born appears before the year. The distance between
# born and the year must be smaller than or equal to 15 characters (plus the two whitespaces)
match=re.search("born .{,15} 19[0-9]{2}",text1)
while match:
print("This is match number "+str(i)+": "+match.group(0))
# Extract the year
match1=re.search("19[0-9]{2}",match.group(0))
print("The year of match number "+str(i)+" is: "+match1.group(0))
# Check whether there are further matches after the end of the previous match
end=match.end()
text1=text1[end:]
match=re.search("born .{,15} 19[0-9]{2}",text1)
i=i+1
# The quantifiers introduced above are "greedy". For example, if a pattern matches overlapping
# text parts of different length, the regex will return the longest match.
# Example: search for the first sentence in a text. You know that sentences
# end with period in this example.
text2="This is the first senctence. This is the second sentence. And so on"
# Search for a positive number of occurances of characters followed by a period.
# Remeber that the dot is \. in regex. The . will match any character.
match=re.search(".{1,}\.",text2)
print(match.group(0))
# -> the regex returns the first and second sentence.
# To get the first match that fulfils the regex, put a ? after the quantifiers.
# This makes the quantifier "non-greedy", and only the first occurrence will be matched.
match=re.search(".{1,}?\.",text2)
print(match.group(0))
# You will often have situations where there are multiple versions of the same
# pattern. How can you include all of them in one regular expression?
# Example 1: search for the word "losses" in the following sentence:
text3="X Corp's soda division returned significant losses in the last quarter. Losses will be reduced this quarter."
# the first letter of "loss" can be upper or lower case
print("Example 1: Loss and loss")
text4=text3
i=1
# A set of characters [] is matched if at least one of the components of the
# set is found in the text. This works only for a single letter/number/symbol
# but not for sequences of multiple letters/numbers/symbols.
match=re.search("[Ll]oss",text3)
while match:
end=match.end()
print("This is match number "+str(i)+": "+match.group(0))
# Check whether there are further matches after the end of the previous match
text4=text4[end:]
match=re.search("[Ll]oss",text4)
i=i+1
# Alternatively
list_of_matches=re.findall("[Ll]oss",text3)
print("Alternative using re.findall: "+str(list_of_matches))
# In this example, you could also simply perform a case-insensitive match.
print("Case-INsensitive matching using re.IGNORECASE")
text4=text3
i=1
match=re.search("loss",text3,re.IGNORECASE)
while match:
end=match.end()
print("This is match number "+str(i)+": "+match.group(0))
# Check whether there are further matches after the end of the previous match
text4=text4[end:]
match=re.search("loss",text4,re.IGNORECASE)
i=i+1
# Or equivalently
print("Case-INsensitive matching using (?i)")
text4=text3
i=1
match=re.search("(?i)loss",text3)
while match:
end=match.end()
print("This is match number "+str(i)+": "+match.group(0))
# Check whether there are further matches after the end of the previous match
text4=text4[end:]
match=re.search("(?i)loss",text4)
i=i+1
# Example 2: search for the expressions "profits declined" and "profits decreased"
# in the following sentence:
text3="X Corp's profits declined in 2010, while Y Inc.'s profits decreased the year before."
# Here, [] no longer works because we need to match terms consisting of several
# characters and [] matches only one character. -> use the OR-operator |
print("Example 2: profits declied and profits decreased - First try")
text4=text3
i=1
match=re.search("profits declined|decreased",text3)
while match:
print("This is match number "+str(i)+": "+match.group(0))
# Check whether there are further matches after the end of the previous match
end=match.end()
text4=text4[end:]
match=re.search("profits declined|decreased",text4)
i=i+1
# Problem: regex interprets the entire set of characters before the | as one
# alternative.
# Solution: use parentheses to define the boundaries.
print("Example 2: profits declined and profits decreased - Second try")
text4=text3
i=1
match=re.search("profits (declined|decreased)",text3)
while match:
print("This is match number "+str(i)+": "+match.group(0))
# Check whether there are further matches after the end of the previous match
end=match.end()
text4=text4[end:]
match=re.search("profits (declined|decreased)",text4)
i=i+1
# Alternative: does re.findall work?
list_of_matches=re.findall("profits (declined|decreased)",text3)
print(list_of_matches)
# -> No! Because there is a major difference between re.search and re.findall
# in the way they treat parentheses ().
# re.search follows the general regular expression syntax that is also used in
# other programming languages.
# To use re.findall you have to write down the full text before and after the |.
list_of_matches=re.findall("profits declined|profits decreased",text3)
print(list_of_matches)
# More information on the difference between re.search and re.findall
# Example 3: let's search for the numbers in the second part of the txt file
# and compare what the two commands do.
# Get the second part
match=re.search("Here are some numbers:",text)
text4=text[match.end():]
print(text4)
match=re.search("[0-9]{1,}([0-9]{3}|,){0,}\.{0,1}[0-9]{0,}",text4)
# What are the individual parts of this pattern?
# [0-9]{1,} There has to be at least one digit.
# ([0-9]{3}|,){0,} The first digit can be followed by combinations of three
# digits and commas (as thousand separator).
# \.{0,1} There can be zero or one period as decimal separator.
# [0-9]{0,} There can be multiple decimal places.
i=1
while match:
print("This is match number "+str(i)+": "+match.group(0))
# Check whether there are further matches after the end of the previous match
end=match.end()
text4=text4[end:]
match=re.search("[0-9]{1,}([0-9]{3}|,){0,}\.{0,1}[0-9]{0,}",text4)
i=i+1
# Can we obtain the same result by using re.findall?
match=re.search("Here are some numbers:",text)
text4=text[match.end():]
list_of_matches=re.findall("[0-9]{1,}([0-9]{3}|,){0,}\.{0,1}[0-9]{0,}",text4)
print(list_of_matches)
# Does not work!
# One has to put "?:" at the beginning of the group for the thousands, which makes it non-capturing.
# This tells re.findall to return the full match and not subpatterns.
list_of_matches=re.findall("[0-9]{1,}(?:[0-9]{3}|,){0,}\.{0,1}[0-9]{0,}",text4)
print(list_of_matches)
# TAKE AWAY: The output of re.findall does not always correspond to the matches of re.search
# Be careful when using re.findall!!!
# How to delete or substitute parts of texts?
# Alternative 1: identify the beginning and end of the matched text part and
# remove it from the overall text.
# Example delete all numbers in the text
text4=text
print("Original Text:\n"+text4)
match=re.search("[0-9]{1,}(,[0-9]{3}){0,}(\.[0-9]{1,}){0,1}",text4)
while match:
# Remove the match
text4=text4[:match.start()]+text4[match.end():]
# Check whether there are further matches in the remaining text
match=re.search("[0-9]{1,}(,[0-9]{3}){0,}(\.[0-9]{1,}){0,1}",text4)
print("Text without numbers using re.search:\n"+text4)
# Alternative 2: use re.sub (sub -> substitute)
# syntax: new_text=re.sub(pattern, replacement, old_text)
# the replacement is a plain string. Regular expression patterns are only allowed in the
# pattern, not in the replacement (backreferences to matched groups, e.g. \1, are allowed).
text4=text
text4=re.sub("[0-9]{1,}(,[0-9]{3}){0,}(\.[0-9]{1,}){0,1}","",text4)
print("Text without numbers using re.sub:\n"+text4)
# re.sub is the more efficient way.
# Furthermore, re.sub can not only delete text but also replace text.
# Example
text4=text
text4=re.sub("[0-9]{1,}(,[0-9]{3}){0,}(\.[0-9]{1,}){0,1}","NUMBER",text4)
print("Text where numbers are replaced by the word 'NUMBER':\n"+text4)
# Make sure you get the right match --> importance of word boundaries.
# When you search for a word it can happen that the word is part of a different
# longer word. For example, searching for "high" would also match "highlight".
# To avoid such mismatches you can either include word boundaries in the search
# (Alternative 1) or split the text first by word boundaries into single words
# and perform standard string search operations afterwards (Alternative 2).
# Alternative 2 does not return the individual matches but tells you for example
# the number of matches
# Example: search for the word "is"
# Alternative 1:
match=re.search("is",text)
print("Searching without word boundaries yields: '"+match.group(0)+\
"' But the surrounding text is: '"+text[match.start()-1:match.end()+1]+"'")
match=re.search("\Wis\W",text)
print("Searching with word boundaries yields: '"+match.group(0)+\
"' and the surrounding text is: '"+text[match.start()-1:match.end()+1]+"'")
# You see that the preceding and subsequent word boundaries are also matched
# and saved as the matched term. However, often you want the match to include only
# the actual word without its boundaries.
# Solution: use so-called "look ahead" and "look back" conditions.
'''
Look ahead and look behind/back conditions
Regex requires that the parts of the pattern that are classified as look ahead
or look back/behind are present in the text but does not include them in the match.
Syntax:
positive look ahead: (?=) Example: X(?=\W) requires that there is a word
boundary after X
negative look ahead: (?!) Example: X(?!\W) requires that there must NOT
be a word boundary after X.
positive look back: (?<=) Example: (?<=\W)X requires that there is a word
boundary before X
negative look back: (?<!) Example: (?<!\W)X requires that there must NOT
be a word boundary before X.
'''
match=re.search("(?<=\W)is(?=\W)",text)
print("Searching with word boundaries as look ahead and look back condition yields: '" #
+match.group(0)+"' and the surrounding text is: '"+text[match.start()-1:match.end()+1]+"'")
# Does it work also with re.findall?
list_of_matches=re.findall("\Wis\W",text)
print("Word boundaries using re.findall: "+str(list_of_matches))
list_of_matches=re.findall("(?<=\W)is(?=\W)",text)
print("Word boundaries as look ahead and look back condition using re.findall: "+str(list_of_matches))
print("In total there are "+str(len(list_of_matches))+" matches.")
# --> Yes, the approach also works with re.findall.
# Alternative 2:
# Use re.split(), which is similar to split() but more powerful.
text_split=re.split("\W",text)
print(text_split)
# Problem: there are elements in the list that are not words, e.g. ''. These
# elements are created because there can be a series of non-word characters (\W),
# e.g. ' (' in 'Balmer (born'.
# Solution: treat a series of non-word characters (\W) as a single split pattern
text_split=re.split("\W{1,}",text)
print(text_split)
# Now, you do not need to include word boundaries and can use standard string
# operations.
number_matches=text_split.count("is")
print("Using standard string operations, we get "+str(number_matches)+" matches.")
# -> same result.


@@ -0,0 +1,485 @@
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 21 09:38:32 2022
@author: Alexander Hillert, Goethe University Frankfurt
"""
'''
This script introduces you to linear models using the sklearn package.
Besides sklearn, we will use pandas to work with data sets as well as
numpy to perform computations.
The introduction consists of 10 parts:
1. linear regressions using a toy data set
2. linear regressions using a "real" data set
3. linear regressions using standardized variables
4. Ridge regression basics
5. Ridge regression with training, tuning, and testing sample
6. Ridge regression with cross-validation
7. LASSO regression basics
8. LASSO regression with training, tuning, and testing sample
9. LASSO regression with cross-validation
10. Compare the results from Ridge and LASSO
'''
import pandas as pd
import numpy as np
# For OLS regressions
from sklearn.linear_model import LinearRegression
# for Ridge regressions
from sklearn.linear_model import Ridge
# for computing mean squared errors
from sklearn.metrics import mean_squared_error
# for plotting the MSEs for different levels of Lambda
import matplotlib.pyplot as plot
# for Ridge regressions with cross-validation
from sklearn.linear_model import RidgeCV
# for LASSO regressions
from sklearn.linear_model import Lasso
# for LASSO regressions with cross-validation
from sklearn.linear_model import LassoCV
# adjust the directory to your folder!!!
directory="C:/Lehre/Machine Learning/Data/"
############################################################
# Part 1. Basics: linear regressions in Python using sklearn
############################################################
print("\nPart 1: Run an OLS regression on a sandbox data set\n")
# create a random number from a normal distribution with mean 0 and standard deviation 1.
random_number=np.random.normal(0, 1)
print("A random number is: "+str(random_number))
# you can also create a vector or matrix of random variables
# the parameter size=(# of rows, # of columns) specifies the number of rows and columns
# For example, a (10,1) vector
random_number_vector=np.random.normal(0, 1, size=(10,1))
print("The vector of random numbers is:")
print(random_number_vector)
# create the independent variable x as a vector of random numbers
x_vector=np.random.normal(0, 1, size=(10,1))
print("The vector of the independent variable x is:")
print(x_vector)
# create the dependent variable y as
# y = 2x + epsilon, where epsilon is the random error term from above
y_vector=np.dot(x_vector,2) + random_number_vector
print("The vector of the dependent variable y is:")
print(y_vector)
# perform a standard OLS regression with intercept.
# The command takes x (independent variable(s)) first and then y (dependent variable)
# Note that the default is that the intercept is included. So, strictly speaking,
# the (fit_intercept=True) option is not needed.
regression_1=LinearRegression(fit_intercept=True).fit(x_vector, y_vector)
# display the intercept and the beta coefficient on x
print("The intercept is: "+str(regression_1.intercept_))
# to get it as a scalar/number not an array, use
regression_1.intercept_[0]
print("The coefficient on x is: "+str(regression_1.coef_))
# to get it as a scalar/number not an array, use
regression_1.coef_[0][0]
# R2 of the regression
print("The R2 is: "+str(regression_1.score(x_vector, y_vector)))
###############################################################
# Part 2: linear regression using a "real" data set
###############################################################
print("\nPart 2: Run an OLS regression with a real data set\n")
# import the data for this problem
# The data set consists of 200 independent variables (x1 to x200) and
# a dependent variable (y).
# There are 1,200 observations in total. In the later parts, we will
# use the first 1,000 observations for training and the last 200 for testing.
# The data are simulated using the following process:
# y = 0.5*x1 + 0.5*x2 + ... + 0.5*x100 + random error (mean 0, std. dev. 4)
# The x101 to x200 are not directly related to y but are correlated with
# the x1 to x100. More specifically,
# x101 = 0.7*x1 + random error (mean 0, std. dev. 1)
# x102 = 0.7*x2 + random error (mean 0, std. dev. 1)
# ...
# x200 = 0.7*x100 + random error (mean 0, std. dev. 1)
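# Aside (not part of the instructor's script): a minimal sketch of how a data set with
# the structure described above could be simulated. The variable names (x_sim, y_sim)
# are made up for illustration; the actual data come from the csv file loaded below.
x_sim_first=np.random.normal(0, 1, size=(1200,100))
x_sim_second=0.7*x_sim_first+np.random.normal(0, 1, size=(1200,100))
x_sim=np.concatenate((x_sim_first, x_sim_second), axis=1)
y_sim=np.dot(x_sim_first, 0.5*np.ones((100,1)))+np.random.normal(0, 4, size=(1200,1))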
data_frame=pd.read_csv(directory+"regression_data_scikit.csv",sep=";")
# to get an idea of the data, display the first five data points
data_frame.head(5)
# split the data frame into the independent and dependent variables
# the independent variables(x1 to x200) are columns 1 to 200
x_variables=data_frame.values[:,:-1]
# the dependent variable (y) is column 201
y_variable=data_frame.values[:,-1:]
# run a standard OLS regression
regression_OLS=LinearRegression(fit_intercept=True).fit(x_variables, y_variable)
# You can double-check the results by rerunning the regression in Stata or R.
# display the intercept and the beta coefficients on x1 and x51
print("The intercept is: "+str(regression_OLS.intercept_[0]))
print("The coefficient on x_1 is: "+str(regression_OLS.coef_[0][0]))
print("The coefficient on x_51 is: "+str(regression_OLS.coef_[0][50]))
# R2 of the regression
print("The R2 is: "+str(regression_OLS.score(x_variables, y_variable)))
##################################################################
# Part 3: standardize the data to have mean zero and unit variance
# and rerun the regression
##################################################################
print("\nPart 3a.: Standardize variables\n")
# standardize x and y to have mean zero and unit variance
# axis=0 (axis=1) means that the computation is executed column (row) wise
x_variables_mean=np.mean(x_variables,axis=0)
# ddof=1 means that we use n-1 to compute the standard deviation
x_variables_standard_deviation=np.std(x_variables, axis=0, ddof=1)
x_variables_standardized=(x_variables-x_variables_mean)/x_variables_standard_deviation
# do the same exercise for y
y_variable_mean=np.mean(y_variable,axis=0)
y_variable_standard_deviation=np.std(y_variable, axis=0, ddof=1)
y_variable_standardized=(y_variable-y_variable_mean)/y_variable_standard_deviation
# rerun the regression using standardized data
regression_OLS_standardized=LinearRegression(fit_intercept=True).fit(x_variables_standardized, y_variable_standardized)
# The results are identical to a regression with standardized (beta) coefficients in Stata.
# display the intercept and the beta coefficients on x_1 and x_51
print("The intercept is: "+str(regression_OLS_standardized.intercept_[0]))
print("The coefficient on x_1 is: "+str(regression_OLS_standardized.coef_[0][0]))
print("The coefficient on x_51 is: "+str(regression_OLS_standardized.coef_[0][50]))
# R2 of the regression
print("The R2 is: "+str(regression_OLS_standardized.score(x_variables_standardized, y_variable_standardized)))
# The R2 is identical to the one from Part 2 -> good!
#######################################################################################
# CAUTION: be careful using the "normalize=True" option in the LinearRegression module!
#######################################################################################
print("\nPart 3b.: Regression with 'normalization'\n")
# Caution: normalize=True is not the same as standardizing the variables.
# With normalize=True, each column of X is rescaled before the regression by
# subtracting its mean and dividing by its L2 norm (not by its standard deviation).
# Do not confuse this option with sklearn.preprocessing.Normalizer, which works on
# the rows, i.e. rescales each observation to unit (L2) norm.
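# Added note (not part of the original script): in newer scikit-learn releases the
# normalize parameter of LinearRegression has been deprecated and eventually removed,
# so the line below may fail there. In that case the rescaling has to be done
# explicitly in a separate preprocessing step.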
regression_OLS_normalized=LinearRegression(fit_intercept=True,normalize=True).fit(x_variables, y_variable)
# display the intercept and the beta coefficient on x_1 and x_51
print("The intercept is: "+str(regression_OLS_normalized.intercept_[0]))
print("The coefficient on x_1 is: "+str(regression_OLS_normalized.coef_[0][0]))
print("The coefficient on x_51 is: "+str(regression_OLS_normalized.coef_[0][50]))
# The coefficients are different from the ones above highlighting that the
# "normalize=True" option does not do the same as "normal" standardizing
# R2 of the regression
print("The R2 is: "+str(regression_OLS_normalized.score(x_variables, y_variable)))
#######################################################################
# Part 4: Ridge regression on the full sample (no training and testing)
# This part is to learn the syntax.
# We are using the standardized variables to have the same penalty
# for a given effect of x on y.
# Remember: if the independent variables are measured on very different
# scales, the beta coefficients have different sizes (e.g., market cap in
# thousand USD vs. past stock returns as a decimal number) and, thus,
# the penalty would be applied inconsistently.
#######################################################################
print("\nPart 4: Ridge regression - learning the syntax\n")
# the parameter alpha corresponds to the penalty parameter Lambda from
# the notation that is typically used.
# the default is that the intercept is included, so you do not need the
# "intercept=True" parameter. But it is good to keep in mind what
# specification you are using.
regression_Ridge=Ridge(alpha=10,fit_intercept=True).fit(x_variables_standardized, y_variable_standardized)
# display the intercept and the beta coefficient on x1 and x51
print("The intercept is: "+str(regression_Ridge.intercept_[0]))
print("The coefficient on x_1 is: "+str(regression_Ridge.coef_[0][0]))
print("The coefficient on x_51 is: "+str(regression_Ridge.coef_[0][50]))
# R2 of the regression
print("The R2 is: "+str(regression_Ridge.score(x_variables_standardized, y_variable_standardized)))
# How to compute the mean squared error (MSE)?
# 1. get the predicted values
y_variable_standardized_predicted=regression_Ridge.predict(x_variables_standardized)
# 2. determine the MSE
print("The MSE of the prediction is: "+str(mean_squared_error(y_variable_standardized, y_variable_standardized_predicted)))
#######################################################################
# Part 5: Ridge regression using a training, tuning, and testing sample
#######################################################################
print("\nPart 5: Ridge regression - Application with training, tuning, and testing data\n")
# Create a training, tuning, and testing sample
# we split the data into a training, a tuning, and a testing set
# training data are the first 800 rows
# In the brackets, the first range (before the comma) indicates the rows, the second the columns.
x_variables_std_train=x_variables_standardized[:800,:]
y_variable_std_train=y_variable_standardized[:800,:]
# the tuning data are row 801 to 1000 -> 200 observations
x_variables_std_tune=x_variables_standardized[800:1000,:]
y_variable_std_tune=y_variable_standardized[800:1000,:]
# testing data are the last 200 rows
x_variables_std_test=x_variables_standardized[1000:,:]
y_variable_std_test=y_variable_standardized[1000:,:]
##########################
# find the optimal Lambda
##########################
# we store the MSE of the training/tuning data for each Lambda
mse_train_list=[]
mse_tune_list=[]
# Again, Lambda and Alpha refer to the same thing.
alpha_list=[]
# we iterate from 0.1 to 100 increasing Lambda=Alpha by 0.1 in each step.
alpha=0.1
while alpha<100:
# train the model
regression_Ridge_train=Ridge(alpha=alpha,fit_intercept=True).fit(x_variables_std_train, y_variable_std_train)
# add the alpha to the list of alphas
alpha_list.append(alpha)
# predict y in the training sample
y_variable_std_train_predicted=regression_Ridge_train.predict(x_variables_std_train)
# predict y in the tuning sample
y_variable_std_tune_predicted=regression_Ridge_train.predict(x_variables_std_tune)
# compute the MSE in both samples
mse_train=mean_squared_error(y_variable_std_train, y_variable_std_train_predicted)
mse_tune=mean_squared_error(y_variable_std_tune, y_variable_std_tune_predicted)
# append the MSEs to the two lists
mse_train_list.append(mse_train)
mse_tune_list.append(mse_tune)
# continue with the next alpha
alpha=alpha+0.1
########################################
# plot the MSEs for the different alphas
########################################
# MSE in the training sample
plot.scatter(alpha_list, mse_train_list)
plot.show()
# higher Lambda associated with higher MSE
# MSE in the tuning sample
plot.scatter(alpha_list, mse_tune_list)
plot.show()
# there is an optimal alpha with the lowest MSE
######################################
# determine the optimal Lambda
######################################
# what is the smallest MSE?
minimum=min(mse_tune_list)
print("The smallest MSE is "+ str(minimum))
# get the position of the minimum MSE in our list
index_min_MSE=mse_tune_list.index(minimum)
# choose the corresponding alpha
alpha_optimal=alpha_list[index_min_MSE]
print("The optimal alpha is "+str(alpha_optimal))
#############################################################
# What is the out-of-sample performance of the optimal model?
#############################################################
# take the full training data set (1000 observations, i.e., training + tuning set)
x_variables_std_train_total=np.concatenate((x_variables_std_train, x_variables_std_tune), axis=0)
y_variable_std_train_total=np.concatenate((y_variable_std_train, y_variable_std_tune), axis=0)
# train the model with the optimal Lambda on the training and tuning data
regression_Ridge_optimal=Ridge(alpha=alpha_optimal,fit_intercept=True).fit(x_variables_std_train_total, y_variable_std_train_total)
# Mean squared error
# predict y in the full training sample
y_variable_std_train_total_predicted=regression_Ridge_optimal.predict(x_variables_std_train_total)
# predict y in the testing sample
# Remember: we have not used the testing data yet. Firewall principle!!!
y_variable_std_test_predicted=regression_Ridge_optimal.predict(x_variables_std_test)
print("The MSE in the full training data is: "+str(mean_squared_error(y_variable_std_train_total, y_variable_std_train_total_predicted)))
print("The MSE in the testing data is: "+str(mean_squared_error(y_variable_std_test, y_variable_std_test_predicted)))
#############################################################
# Part 6: Ridge regression with k-fold cross-validation
# Implement the cross validation using a package
#############################################################
print("\nPart 6. Ridge regression - Using cross-validation\n")
# the default for cv is the leave-one-out cross-validation
# here we apply five-fold cross-validation
regression_Ridge_cv=RidgeCV(alphas=alpha_list, fit_intercept=True,cv=5).fit(x_variables_std_train_total,y_variable_std_train_total)
# get the optimal lambda
alpha_optimal_cv=regression_Ridge_cv.alpha_
print("The optimal alpha is "+str(alpha_optimal_cv))
# Mean squared error using the cross-validated model
# predict y in the full training sample
y_variable_std_train_total_predicted_cv=regression_Ridge_cv.predict(x_variables_std_train_total)
# predict y in the testing sample
y_variable_std_test_predicted_cv=regression_Ridge_cv.predict(x_variables_std_test)
print("The MSE in the full training data is: "+str(mean_squared_error(y_variable_std_train_total, y_variable_std_train_total_predicted_cv)))
print("The MSE in the testing data is: "+str(mean_squared_error(y_variable_std_test, y_variable_std_test_predicted_cv)))
###########################################
# Part 7: LASSO regression
# on the full sample -> to learn the syntax
###########################################
print("\nPart 7: LASSO regression - learning the syntax\n")
# the parameter alpha corresponds to the penalty parameter Lambda from
# the notation that is typically used.
# the default is that the intercept is included, so you do not need the
# "intercept=True" parameter. But it is good to keep in mind what
# specification you are using.
regression_Lasso=Lasso(alpha=0.1,fit_intercept=True).fit(x_variables_standardized, y_variable_standardized)
# display the intercept and the beta coefficient on x1 and x51
print("The intercept is: "+str(regression_Lasso.intercept_[0]))
print("The coefficient on x_1 is: "+str(regression_Lasso.coef_[0]))
print("The coefficient on x_51 is: "+str(regression_Lasso.coef_[50]))
# R2 of the regression
print("The R2 is: "+str(regression_Lasso.score(x_variables_standardized, y_variable_standardized)))
# How to compute the mean squared error (MSE)?
# 1. get the predicted values
y_variable_standardized_predicted=regression_Lasso.predict(x_variables_standardized)
# 2. determine the MSE
print("The MSE of the prediction is: "+str(mean_squared_error(y_variable_standardized, y_variable_standardized_predicted)))
####################################################
# Part 8: Create a training, tune and testing sample
####################################################
print("\nPart 8: LASSO regression - Application with training, tuning, and testing data\n")
# we use the same training, tuning, and testing data as in part 5.
# -> no need to redefine the data sets.
#################################
# find the optimal Lambda
#################################
# we store the MSE of the training/tuning data for each Lambda
mse_train_list=[]
mse_tune_list=[]
# Again, Lambda and Alpha refer to the same thing.
alpha_list=[]
# we iterate from 0.0001 to 0.25 increasing alpha by 0.0001 in each step.
alpha=0.0001
while alpha<0.25:
# train the model
regression_Lasso_train=Lasso(alpha=alpha,fit_intercept=True).fit(x_variables_std_train, y_variable_std_train)
# add the alpha to the list of alphas
alpha_list.append(alpha)
# predict y in the training sample
y_variable_std_train_predicted=regression_Lasso_train.predict(x_variables_std_train)
# predict y in the tuning sample
y_variable_std_tune_predicted=regression_Lasso_train.predict(x_variables_std_tune)
# compute the MSE in both samples
mse_train=mean_squared_error(y_variable_std_train, y_variable_std_train_predicted)
mse_tune=mean_squared_error(y_variable_std_tune, y_variable_std_tune_predicted)
# append the MSEs to the two lists
mse_train_list.append(mse_train)
mse_tune_list.append(mse_tune)
# continue with the next alpha
alpha=alpha+0.0001
########################################
# plot the MSEs for the different alphas
########################################
# MSE in the training sample
plot.scatter(alpha_list, mse_train_list)
plot.show()
# higher Lambda associated with higher MSE
# MSE in the tuning sample
plot.scatter(alpha_list, mse_tune_list)
plot.show()
# there is an optimal alpha with the lowest MSE
######################################
# determine the optimal Lambda
######################################
# what is the smallest MSE?
minimum=min(mse_tune_list)
print("The smallest MSE is "+ str(minimum))
# get the position of the minimum MSE
index_min_MSE=mse_tune_list.index(minimum)
alpha_optimal=alpha_list[index_min_MSE]
print("The optimal alpha is "+str(alpha_optimal))
#############################################################
# What is the out-of-sample performance of the optimal model?
#############################################################
# take the full training data set (1000 observations; training + tuning)
# use the same variables as in Part 5.
# train the model with the optimal Lambda on the training and tuning data
regression_Lasso_optimal=Lasso(alpha=alpha_optimal,fit_intercept=True).fit(x_variables_std_train_total, y_variable_std_train_total)
# Mean squared error
# predict y in the full training sample
y_variable_std_train_total_predicted=regression_Lasso_optimal.predict(x_variables_std_train_total)
# predict y in the testing sample
y_variable_std_test_predicted=regression_Lasso_optimal.predict(x_variables_std_test)
print("The MSE in the full training data is: "+str(mean_squared_error(y_variable_std_train_total, y_variable_std_train_total_predicted)))
print("The MSE in the testing data is: "+str(mean_squared_error(y_variable_std_test, y_variable_std_test_predicted)))
#############################################################
# Part 9: Implement the cross validation using a package
#############################################################
print("\nPart 9: LASSO regression - Using cross-validation\n")
# the default for cv in LassoCV is the 5-fold cross-validation
regression_Lasso_cv=LassoCV(alphas=alpha_list, fit_intercept=True,cv=5).fit(x_variables_std_train_total,y_variable_std_train_total)
# get the optimal lambda
alpha_optimal_cv=regression_Lasso_cv.alpha_
print("The optimal alpha is "+str(alpha_optimal_cv))
# Mean squared error using the cross-validated model
# predict y in the full training sample
y_variable_std_train_total_predicted_cv=regression_Lasso_cv.predict(x_variables_std_train_total)
# predict y in the testing sample
y_variable_std_test_predicted_cv=regression_Lasso_cv.predict(x_variables_std_test)
print("The MSE in the full training data is: "+str(mean_squared_error(y_variable_std_train_total, y_variable_std_train_total_predicted_cv)))
print("The MSE in the testing data is: "+str(mean_squared_error(y_variable_std_test, y_variable_std_test_predicted_cv)))
#####################################################################
# Part 10: Compare the betas from the Ridge and the LASSO regressions
#####################################################################
print("\nPart 10: Comparison of Ridge and LASSO coefficients\n")
# To see to what extent the results of Ridge and LASSO are similar, we
# write the coefficients from the cross-validation tasks (Parts 6 and 9)
# to a csv file.
output_file=open(directory+"comparison_coefficients_Ridge_LASSO.csv","w",encoding="utf-8")
output_file.write("index;coefficient_Ridge;coefficient_LASSO\n")
# get the list of coefficients
for i in range (0,200):
output_file.write(str(i)+';'+str(regression_Ridge_cv.coef_[0][i])+';'+str(regression_Lasso_cv.coef_[i])+'\n')
output_file.close()
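# Aside (not part of the original script): the same comparison table could also be
# written with pandas, which is already imported above; the output file name below is
# chosen for illustration only.
comparison_frame=pd.DataFrame({"coefficient_Ridge": regression_Ridge_cv.coef_[0],
                               "coefficient_LASSO": regression_Lasso_cv.coef_})
comparison_frame.to_csv(directory+"comparison_coefficients_Ridge_LASSO_pandas.csv",
                        sep=";", index_label="index")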
print("Completed!")


@@ -0,0 +1,52 @@
# -*- coding: utf-8 -*-
"""
Created on Sat Jul 17 17:09:50 2021
@author: ahillert
"""
from nltk.tokenize import sent_tokenize
print("\nExample 1\n")
text_1="The S&P 500 rose 43.44 points to 4,159.12. The Dow Jones industrial average " \
+"added 188.11 points, or 0.6 percent, to 34,084.15. The tech-heavy Nasdaq fared " \
+"better than the rest of the market, climbing 236 points, or 1.8 percent, to 13,535.74"
sentence_list_1=sent_tokenize(text_1)
for i in range(0,len(sentence_list_1)):
print("This is sentence "+str(i+1)+":\n"+sentence_list_1[i])
# -> good performance
print("\nExample 2\n")
text_2=text_1.lower()
sentence_list_2=sent_tokenize(text_2)
for i in range(0,len(sentence_list_2)):
print("This is sentence "+str(i+1)+":\n"+sentence_list_2[i])
# -> poor performance
# For the NLTK tokenizer it makes a difference whether text is lower or upper case.
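# Aside (not part of the original script): if lower-case sentences are needed, a simple
# workaround is to tokenize the original text first and lower-case the sentences afterwards.
sentence_list_2b=[sentence.lower() for sentence in sent_tokenize(text_1)]
print("Tokenize first, then lower-case:")
print(sentence_list_2b)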
print("\nExample 3\n")
text_3="On Sept. 16, 2020, the U.S. president appointed John D. Smith as head of the F. B. I. " \
+"While Jane C. Taylor became the president of the S. E. C. " \
+"On Jan. 5, 2020, J. C. Penny filed for bankruptcy. Michael T. Brown - reporting from Washington D.C."
sentence_list_3=sent_tokenize(text_3)
for i in range(0,len(sentence_list_3)):
print("This is sentence "+str(i+1)+":\n"+sentence_list_3[i])
# -> good performance
print("\nExample 4\n")
text_4=text_3.lower()
sentence_list_4=sent_tokenize(text_4)
for i in range(0,len(sentence_list_4)):
print("This is sentence "+str(i+1)+":\n"+sentence_list_4[i])


@@ -0,0 +1,137 @@
# -*- coding: utf-8 -*-
"""
Created on Tue Jul 11 17:43:45 2017
@author: Alexander Hillert, Goethe University Frankfurt
"""
# import modules
# if you need to download the nltk packages 'punkt' and 'stopwords' you can use
# the following three commands:
#import nltk
#nltk.download('punkt')
#nltk.download('stopwords')
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
################
# 1. Tokenize
################
# Create a test text to see how well nltk.tokenize performs
test_text="Microsoft Corp. announced they would acquire Yahoo! for $3.4 to prevent Google Inc. \
from taking over Software Ltd. headquartered in St. Louis. XYZ S.A. is located in the \
U.S. and run by Dr. John P. Smith, who likes short-term risk-based calculations."
# Tokenize sentences
sentence_list=sent_tokenize(test_text)
print("This is the list of sentences:")
print(sentence_list)
# looks good. Only the split after "Yahoo" is incorrect. The tool correctly
# recognizes "Dr.", "Inc.", "St.", etc. -> good performance
# Tokenize words
word_list=word_tokenize(test_text)
print("This is the list of words:")
print(word_list)
print(len(word_list))
# --> word_tokenize also includes symbols and numbers as words.
# How to delete the elements that are not real words?
word_list_1=[]
for word in word_list:
if re.search('[A-Za-z]',word):
word_list_1.append(word)
print("This is the edited list of words. There should be only 'real' words:")
print(word_list_1)
print(len(word_list_1))
# Alternative
test_text1=re.sub('[^A-Za-z\s\n]','',test_text)
word_list_2=word_tokenize(test_text1)
print("This is the edited list of words. There should be only 'real' words:")
print(word_list_2)
print(len(word_list_2))
################
# 2. Stop Words
################
example_sentence = "This is an example showing off stop word filtering."
stop_words=set(stopwords.words("english"))
print("This is the list of stop words from NLTK:")
print(stop_words)
# --> the stop words are all lower case
print(len(stop_words))
# Split example sentence into words
word_list_example=word_tokenize(example_sentence.lower())
# Create list for filtered words
word_list_filtered=[]
# filter out stop words
for word in word_list_example:
if word not in stop_words:
word_list_filtered.append(word)
print("Example sentence after stop words have been deleted:")
print(word_list_filtered)
# What does the example from above look like?
test_text_filtered=[]
# filter out stop words
for word in word_tokenize(test_text.lower()):
if word not in stop_words:
test_text_filtered.append(word)
print("Test text after stop words have been deleted:")
print(test_text_filtered)
################
# 3. Stemming
################
# create a PorterStemmer object with a short name
ps=PorterStemmer()
example_words_1=["play", "player", "players", "played", "playing"]
for word in example_words_1:
print(ps.stem(word))
# the full syntax without the abbreviation would be:
print(PorterStemmer().stem(word))
# adjectives and adverbs
example_words_2=["high", "higher", "highest", "highly", "height"]
for word in example_words_2:
print(ps.stem(word))
# --> comparatives and superlatives are not reduced to the stem/regular adjective;
# neither are adverbs
# Let's see how the stemmer deals with irregular words.
example_words_3=["good", "better", "best", "well", "God", "Goodness"]
for word in example_words_3:
print(ps.stem(word))
# --> upper case words are also transformed to lower case.
# Stem the test text from above
# Approach 1: stem word by word
test_text_stemmed=[]
for word in word_tokenize(test_text):
test_text_stemmed.append(ps.stem(word))
print("Stemming word by word: test text after it has been stemmed:")
print(test_text_stemmed)
# Alternative approach: stem entire text
test_text_stemmed=ps.stem(test_text)
print("Stemming entire document: test text after it has been stemmed:")
print(test_text_stemmed)
# -> does not work: .stem() expects a single word, so the whole text is treated as one token
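# Aside (not part of the original script): the word-by-word approach from above can be
# written more compactly as a list comprehension.
test_text_stemmed=[ps.stem(word) for word in word_tokenize(test_text)]
print("Stemming via list comprehension:")
print(test_text_stemmed)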
print("End of nltk introduction!")


@@ -0,0 +1,19 @@
This is the text for the introduction to regular expressions.
In the first example, we search for the year of birth of current and former CEOs.
These are sentences that I made up:
Microsoft's former CEO Steve Balmer (born in 1956) graduated from Harvard in 1977.
Michael Dell was born in 1965 in Houston and founded Dell Inc in 1984.
Walmart is currently run by Doug McMillon, who was born in 1966.
The following three examples are taken from the Wikipedia pages of the three people.
Steven Anthony "Steve" Ballmer (born March 24, 1956) is an American chief executive who is the former chief executive officer of Microsoft from January 2000 to February 2014, and is the current owner of the Los Angeles Clippers. Source: https://en.wikipedia.org/wiki/Steve_Ballmer, June 22, 2017.
Michael Saul Dell (born February 23, 1965) is an American business magnate, investor, philanthropist, and author. He is the founder and CEO of Dell Technologies, one of the worlds leading providers of information technology infrastructure solutions. Source: https://en.wikipedia.org/wiki/Michael_Dell, June 22, 2017.
Carl Douglas "Doug" McMillon (born October 17, 1966) is an American businessman and is the president and chief executive officer (CEO) of Wal-Mart Stores, Inc. Source: https://en.wikipedia.org/wiki/Doug_McMillon, June 22, 2017.
Here are some numbers:
1,234,567
8,901
34
56.82
539,234,353.41

File diff suppressed because it is too large

View file

@ -0,0 +1,76 @@
# -*- coding: utf-8 -*-
"""
Created on Wed Apr 13 22:43:32 2016
@author: Alexander Hillert, Goethe University Frankfurt
"""
import re
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# Open the dictionary
file_word_list=open(directory+'Complex_Words.txt','r',encoding="utf-8")
word_list=file_word_list.read()
word_list=word_list.lower()
complex_words=word_list.split()
# Open the csv file containing the list of the 200 10-Ks
input_file=open(directory+'10-K_Sample_2011Q1_Input.csv','r',encoding="utf-8")
input_text=input_file.read()
# Create output file
output_file=open(directory+'10-K_Sample_2011Q1_Output_Complex_Tone.csv','w',encoding="utf-8")
output_file.write('CIK;Filename;Number_Words;Number_Complex_Words;Percent_Complex_Words\n')
# Split the input file in separate lines
input_text_line=input_text.split("\n")
# In general, there can be empty lines in the input file. The following command
# deletes these lines.
while input_text_line.count("")>0:
input_text_line.remove("")
# Loop over all lines
for i in range(1,len(input_text_line)):
print(str(i))
# split the line into the two variables
variables=input_text_line[i].split(";")
# We need the CIK and the filename
cik=variables[0]
filename=variables[1]
filename=filename.replace('.txt','')
# Open the ith 10-K in the list
input_file_10_k=open(directory+'10-K_Sample/'+cik+"_"+filename+'_clean.txt','r',\
encoding='ascii',errors='ignore')
input_text_10_k=input_file_10_k.read()
# Use lower case letters
text=input_text_10_k.lower()
# Split the text in words to determine the total number of words
list_of_words=re.split('\W{1,}', text)
# to make sure that empty list elements do not bias the word count, we delete them.
while list_of_words.count("")>0:
list_of_words.remove("")
# Determine total number of words
word_count=len(list_of_words)
# Reset the number of complex words to zero
complex_count=0
# For each complex word, count the number of occurrences
for i in range(len(complex_words)):
complex_count=complex_count+list_of_words.count(complex_words[i])
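    # A faster alternative (a sketch): convert the dictionary word list to a set
    # and count matches in a single pass over the document's words
    # (the set could also be built once before the loop over the filings).
    complex_words_set=set(complex_words)
    complex_count_alt=sum(1 for w in list_of_words if w in complex_words_set)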
# Write cik, file name, total number of words, and number of complex words to output file
output_file.write(cik+';'+filename+'_clean.txt;'+str(word_count)+';'\
+str(complex_count)+';'+str(complex_count/word_count)+'\n')
# Close filings
input_file_10_k.close()
print("Finished")
output_file.close()
input_file.close()

View file

@ -0,0 +1,53 @@
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 29 11:07:10 2015
@author: Alexander Hillert, Goethe University Frankfurt
"""
# To determine file size we need the OS package
import os
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# Open the csv file containing the list of the 200 10-Ks
input_file=open(directory+'10-K_Sample_2011Q1_Input.csv','r',encoding="utf-8")
input_text=input_file.read()
# Create output file
output_file=open(directory+'10-K_Sample_2011Q1_File_Size.csv','w',encoding="utf-8")
output_file.write('CIK;Filename;File_size_gross;File_size_net\n')
# Split the input file in separate lines
input_text_line=input_text.split("\n")
# In general, there can be empty lines in the input file. The following command
# deletes these lines.
while input_text_line.count("")>0:
input_text_line.remove("")
# Loop over all lines
for i in range(1,len(input_text_line)):
print(str(i))
# split the line into the two variables
variables=input_text_line[i].split(";")
# We need the CIK and the filename
cik=variables[0]
filename=variables[1]
filename=filename.replace('.txt','')
# File size of the complete submission file (gross file size)
# You have to divide the result by 1024 to get the size in kilobytes.
# The file size will be affected by html code and exhibits.
size_gross=os.path.getsize(directory+'10-K_Sample/'+cik+"_"+filename+'.txt')/1024
# File size of the main text file (net file size)
# You have to divide the result by 1024 to get the size in kilobytes.
size_net=os.path.getsize(directory+'10-K_Sample/'+cik+"_"+filename+'_clean.txt')/1024
output_file.write(cik+';'+filename+';'+str(size_gross)+';'+str(size_net)+'\n')
print("Finished")
output_file.close()
input_file.close()

View file

@ -0,0 +1,167 @@
# -*- coding: utf-8 -*-
"""
Created on Tue Jul 11 09:19:54 2017
@author: Alexander Hillert, Goethe University Frankfurt
"""
# We need regular expressions and counters (->collections)
import re
import collections
# for the bigram part, the sentence tokenizer is helpful
from nltk.tokenize import sent_tokenize
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# Open the csv file containing the list of the 10-Ks from MSFT, KO, and 3M.
input_file=open(directory+'list_10-K_filings_textual_similarity.csv','r',encoding="utf-8")
input_text=input_file.read()
# Split the input file in separate lines
input_text_line=input_text.split("\n")
# In general, there can be empty lines in the input file. The following command
# deletes these lines.
while input_text_line.count("")>0:
input_text_line.remove("")
# Create an empty counter variable
words_counter=collections.Counter()
# variable is needed only for an alternative solution
words_counter1=collections.Counter()
# counter for the extra task
bigram_counter=collections.Counter()
# Loop over all lines
for i in range(1,len(input_text_line)):
print(str(i))
# split the line into the eight variables
variables=input_text_line[i].split(";")
# We need the CIK (1st column) and the filename (8th column)
cik=variables[0]
filename_parts=re.split('/',variables[7])
filename=filename_parts[3].replace('.txt','')
# Open the ith 10-K in the list; remember to specify the encoding
# The files are available in the zip file "10-K_Textual_Similarity_edited.zip".
input_file_10_k=open(directory+'10-K_Textual_Similarity/'+cik+'_'+\
filename+'_edited.txt', 'r', encoding='ascii', errors='ignore')
# if the command above does not work (error like "file not found" or "directory not found")
# please use the following command:
#input_file_10_k=open(directory+'10-K_Textual_Similarity_edited/'+cik+'_'+filename+'_edited.txt','r',encoding='ascii',errors='ignore')
# read the content from the file
input_text_10_k=input_file_10_k.read()
# use lower case only so that it does not matter whether a word is at
# the beginning of a sentence ("The") or within a sentence ("the").
# Please note that this can be problematic, e.g. "US" -> United States vs.
# us (personal pronoun)
input_text_10_k_lower=input_text_10_k.lower()
# Split text into words
list_of_words=re.split('\W{1,}',input_text_10_k_lower)
# There can be empty ("") list elements -> remove them
while list_of_words.count("")>0:
list_of_words.remove("")
# optional commands to remove words that only contain "_"
'''
for word in list_of_words:
if re.sub("[a-zA-Z]","",word)!="":
#if word.count("_")>0:
list_of_words.remove(word)
'''
# Add the words to our counter
words_counter=words_counter+collections.Counter(list_of_words)
# alternative solution: update() adds the counts in place, which is typically faster than building a new Counter with +
words_counter1.update(list_of_words)
#############################################
# optional part for the extra task on bigrams
#############################################
# create an empty list for the bigrams
bigram_list=[]
# split the text into sentences
list_of_sentences=sent_tokenize(input_text_10_k)
# create the BIGRAMS WITHIN EACH SENTENCE
for sentence in list_of_sentences:
# make the sentence lower case
sentence_lower=sentence.lower()
# split the sentence into words
list_of_words=re.split("\W{1,}",sentence_lower)
# remove empty elements
while list_of_words.count("")>0:
list_of_words.remove("")
#print("these are the words of the sentence:\n"+str(list_of_words))
# go over all potential two word combinations in the sentence.
for word_number in range(0,len(list_of_words)-1):
bigram_list.append(list_of_words[word_number]+' '+list_of_words[word_number+1])
bigram_counter=bigram_counter+collections.Counter(bigram_list)
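    # A more compact way to build one sentence's bigrams (a sketch, shown here for
    # the last sentence processed above; it is not used in the analysis):
    bigram_list_zip=[w1+' '+w2 for w1,w2 in zip(list_of_words,list_of_words[1:])]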
# end of extra task
# Close the 10-K filing
input_file_10_k.close()
input_file.close()
######################
# Top 100 single words
######################
# Open the csv file containing the 100 most frequently used words
output_file=open(directory+'Problem_12_100_most_frequent_words.csv','w',encoding="utf-8")
output_file.write("rank;word;count\n")
# Get the 100 most frequent words
top_100_words=words_counter.most_common(100)
# for the alternative solution
#top_100_words=words_counter1.most_common(100)
# Write the 100 most frequent words to the csv file.
# Remember Python starts counting at 0, while humans start at 1.
# So, the most frequent word (rank 1 in human counting) is element 0 for Python.
# Consequently, to get a consistent table, we must use the value i for the rank
# but call the element i-1.
for i in range(1,101):
output_file.write(str(i)+";"+str(top_100_words[i-1][0])+";"+\
str(top_100_words[i-1][1])+"\n")
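# Equivalent loop using enumerate (a sketch, commented out so that the rows are
# not written a second time); enumerate(..., start=1) yields human-style ranks:
#for rank,(word,count) in enumerate(top_100_words,start=1):
#    output_file.write(str(rank)+";"+word+";"+str(count)+"\n")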
# Close the csv file
output_file.close()
######################
# Extra task
# Top 100 bigrams
######################
# Open the csv file containing the 100 most frequently used BIGRAMS
output_file_bigram=open(directory+'Problem_12_100_most_frequent_bigrams.csv','w',encoding="utf-8")
output_file_bigram.write("rank;word;count\n")
# Get the 100 most frequent bigrams
top_100_bigrams=bigram_counter.most_common(100)
# Write the 100 most frequent bigrams to the csv file -> same approach as for the single words.
for i in range(1,101):
output_file_bigram.write(str(i)+";"+str(top_100_bigrams[i-1][0])+";"+\
str(top_100_bigrams[i-1][1])+"\n")
# Close the csv file
output_file_bigram.close()
print("Task done!")

View file

@ -0,0 +1,96 @@
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 29 11:07:10 2015
@author: Alexander Hillert, Goethe University Frankfurt
"""
# We need regular expressions, tokenization (to identify words), and stemming.
import re
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# Open the csv file containing the list of the 10-Ks from MSFT, KO, and 3M.
input_file=open(directory+'list_10-K_filings_textual_similarity.csv','r',encoding="utf-8")
input_text=input_file.read()
# Split the Input File in separate lines
input_text_line=input_text.split("\n")
# In general, there can be empty lines in the input file. The following command
# deletes these lines.
while input_text_line.count("")>0:
input_text_line.remove("")
# Loop over all lines
#for i in range(1,len(input_text_line)):
# for illustration filings 1 to 3 only
for i in range(1,4):
print(str(i))
# split the line into the eight variables
variables=input_text_line[i].split(";")
# We need the CIK (1st column) and the filename (8th column)
cik=variables[0]
filename_parts=re.split('/',variables[7])
filename=filename_parts[3].replace('.txt','')
# Open the ith 10-K in the list; remember to specify the encoding
input_file_10_k=open(directory+'10-K_Textual_Similarity/'+cik+'_'+filename\
+'_edited.txt', 'r', encoding='ascii', errors='ignore')
# Get the text of the 10-K
input_text_10_k=input_file_10_k.read()
# We need to tokenize the text because stem only works on a word by word basis.
# Stemming an entire document without splitting into words does not work!
# The problem is that \n gets lost in this process --> we cannot easily
# recreate the document.
# idea: replace \n by \n and some indicator that there was a line break.
# Here, I choose "LINEBREAKMARK"
input_text_10_k=input_text_10_k.replace("\n","\nLINEBREAKMARK ")
# Split text into words
# There are two alternatives.
# Alternative 1 (our standard approach):
#word_list=re.split("\W{1,}",input_text_10_k.lower())
# Alternative 2 (keeps symbols like ,;.):
word_list=word_tokenize(input_text_10_k.lower())
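    # Quick illustration of the difference between the two alternatives on a
    # made-up snippet (a sketch, commented out):
    #print(re.split("\W{1,}","risk, uncertainty."))  # -> ['risk', 'uncertainty', '']
    #print(word_tokenize("risk, uncertainty."))      # -> ['risk', ',', 'uncertainty', '.']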
# Stem the text
text_stemmed=''
for word in word_list:
# The following two cases are designed to improve the formatting of the
# output file. It is not needed for the subsequent analyses.
# Case 1: 'word' is not an actual word but a symbol. -> there should
# be no whitespace between the previous words and this symbol.
# \A and \Z indicate the beginning and end of string -> the 'word' is just
# the symbol but not a combination of letters and symbols.
if re.search("\A[\.\?!,:;']{1,}\Z",word):
text_stemmed=text_stemmed+word
# Case 2: the word is an actual word -> have a whitespace included.
else:
text_stemmed=text_stemmed+" "+PorterStemmer().stem(word)
# The simple solution (without restoring the formatting of the text) is:
#text_stemmed=text_stemmed+" "+PorterStemmer().stem(word)
# To recreate the text, we need to replace the line break indicators by \n
# Because of the stemming "LINEBREAKMARK" becomes "linebreakmark".
text_stemmed=text_stemmed.replace("linebreakmark","\n")
# Open the output file for the stemmed text
output_file_10_k=open(directory+'10-K_Textual_Similarity/'+cik+'_'+filename\
+'_stemmed.txt', 'w', encoding='ascii', errors='ignore')
output_file_10_k.write(text_stemmed)
output_file_10_k.close()
input_file_10_k.close()
input_file.close()
print("Task done!")

View file

@ -0,0 +1,287 @@
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 29 11:07:10 2015
@author: Alexander Hillert, Goethe University Frankfurt
"""
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from collections import Counter
ps=PorterStemmer()
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# Open the csv file containing the list of the 10-Ks from MSFT, KO, and 3M
input_file=open(directory+'list_10-K_filings_textual_similarity.csv','r',encoding="utf-8")
input_text=input_file.read()
# Split the input file in separate lines
input_text_line=input_text.split("\n")
# In general, there can be empty lines in the input file. The following command
# deletes these lines
while input_text_line.count("")>0:
input_text_line.remove("")
# Open the output csv file in which we write the similarities
output_file=open(directory+'list_10-K_filings_textual_similarity_jaccard.csv','w',encoding="utf-8")
# Write variable names to first line
output_file.write(input_text_line[0]+';Jaccard;Jaccard_own_stop_words;\
Jaccard_NLTK_stop_words;Jaccard_stemmed;Jaccard_stemmed_own_stop_words;\
Jaccard_stemmed_NLTK_stop_words\n')
# Read own stop word list
# This list has been created by manually selecting words from the csv-file
# 100_most_frequent_words.csv, which is created by the Python program
# "Problem_12_Most_Frequent_Words.py".
# Simply delete words you consider to be meaningless and that are frequently
# used.
stop_word_file=open(directory+'Stop_Word_List_Alexander.csv','r',encoding="utf-8")
stop_word_text=stop_word_file.read()
stop_word_line=stop_word_text.split("\n")
stop_word_line.remove("")
own_stop_words=[""]
for i in range(1,len(stop_word_line)):
stop_word=stop_word_line[i].split(";")[1]
own_stop_words.append(stop_word)
own_stop_words.remove("")
print("This is the list of my stop words:")
print(own_stop_words)
# Read NLTK stop word list
NLTK_stop_words=set(stopwords.words("english"))
print("This is the list of NLTK stop words:")
print(NLTK_stop_words)
# set default values for variables
# This is not strictly required. However, if you omit it, Spyder will flag the line
# jaccard_similarity=jaccard(word_list_edited,word_list_old_edited)
# as incorrect because word_list_old_edited is not yet defined at that point in the
# program code. In this specific example, this would not cause an error, as we do
# not enter the if condition when i=1 -> the variable is assigned before its first use.
word_list_old_edited=[]
word_list_edited=[]
word_list_old_NLTK_filtered=""
word_list_old_own_filtered=""
word_list_old_edited_stemmed=""
word_list_old_own_filtered_stemmed=""
word_list_old_NLTK_filtered_stemmed=""
#######################################################
# Define a function that computes Jaccard similarity
# As we need these operations several times, it is
# helpful to use a function.
######################################################
# beginning of the function
def jaccard(text1,text2):
counter1=Counter(text1)
counter2=Counter(text2)
intersection=counter1 & counter2
union=counter1 | counter2
return len(intersection)/len(union)
# end of the function
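# Tiny usage example for the function (a sketch with two made-up word lists):
print(jaccard(["risk","risk","increase"],["risk","decrease"]))  # -> 1/3
# Note: len() of a Counter is the number of distinct keys, so the ratio compares
# the sets of distinct words; the word frequencies themselves do not affect it.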
# Loop over all lines
for i in range(1,len(input_text_line)):
print(str(i))
# split the line into the eight variables
variables=input_text_line[i].split(";")
# We need the CIK (1st column) and the filename (8th column)
cik=variables[0]
filename_parts=re.split('/',variables[7])
filename=filename_parts[3].replace('.txt','')
# Write the information from the input file to the output file
# we do not add a line break at the end, as we must append the similarity
# score first.
output_file.write(input_text_line[i])
# Open the ith 10-K; remember to specify the encoding
input_file_10_k=open(directory+'10-K_Textual_Similarity/'+cik+'_'+filename+\
'_edited.txt', 'r', encoding='ascii', errors='ignore')
input_text_10_k=input_file_10_k.read()
# check whether the previous entry of the list is from the same firm
permco=input_text_line[i].split(";")[1]
permco_old=input_text_line[i-1].split(";")[1]
# Split text into words
word_list_edited=word_tokenize(input_text_10_k.lower())
############################################
# Sub Task 1: Jaccard for the _edited.txt
############################################
# compute Jaccard similarity if the previous filing is from the same firm
if permco==permco_old:
# the command calls the jaccard function that we have defined above.
# in the function, text1=word_list_edited and text2=word_list_old_edited.
jaccard_similarity=jaccard(word_list_edited,word_list_old_edited)
output_file.write(";"+str(jaccard_similarity))
else:
# The previous filing is not from the same firm -> cannot compute Jaccard similarity
output_file.write(";")
# Save the current word vector to a separate variable for the comparison of the next report.
word_list_old_edited=word_list_edited
############################################
# Sub Task 2: Jaccard for the _edited.txt
# AND REMOVE STOP WORDS - OWN LIST
############################################
# remove stop words using personal stop word list
word_list_own_filtered=[]
for word in word_list_edited:
if word not in own_stop_words:
word_list_own_filtered.append(word)
# compute Jaccard similarity if the previous filing is from the same firm
if permco==permco_old:
jaccard_similarity=jaccard(word_list_own_filtered,\
word_list_old_own_filtered)
output_file.write(";"+str(jaccard_similarity))
else:
# The previous filing is not from the same firm -> cannot compute Jaccard similarity
output_file.write(";")
# Save the current word vector to a separate variable for the comparison of the next report.
word_list_old_own_filtered=word_list_own_filtered
############################################
# Sub Task 3: Jaccard for the _edited.txt
# AND REMOVE STOP WORDS - NLTK LIST
############################################
# remove stop words using NLTK stop word list
word_list_NLTK_filtered=[]
for word in word_list_edited:
if word not in NLTK_stop_words:
word_list_NLTK_filtered.append(word)
# compute Jaccard similarity if the previous filing is from the same firm
if permco==permco_old:
jaccard_similarity=jaccard(word_list_NLTK_filtered,\
word_list_old_NLTK_filtered)
output_file.write(";"+str(jaccard_similarity))
else:
# The previous filing is not from the same firm -> cannot compute Jaccard similarity
output_file.write(";")
# Save the current word vector to a separate variable for the comparison of the next report.
word_list_old_NLTK_filtered=word_list_NLTK_filtered
############################################
# Sub Task 4: Jaccard for the _stemmed.txt
############################################
# Create stemmed text
word_list_edited_stemmed=[]
for word in word_list_edited:
word_list_edited_stemmed.append(ps.stem(word))
# compute Jaccard similarity if the previous filing is from the same firm
if permco==permco_old:
jaccard_similarity=jaccard(word_list_edited_stemmed,word_list_old_edited_stemmed)
output_file.write(";"+str(jaccard_similarity))
else:
# The previous filing is not from the same firm -> cannot compute Jaccard similarity
output_file.write(";")
# Save the current word vector to a separate variable for the comparison of the next report.
word_list_old_edited_stemmed=word_list_edited_stemmed
############################################
# Sub Task 5: Jaccard for the _stemmed.txt
# AND REMOVE STOP WORDS - OWN LIST
############################################
# Caution: in general, it is not clear whether you should first stem or
# first remove stop words.
# However, in this specific case, you should remove the stop words first
# and then stem, as your stop word list is based on the inflected text.
# remove stop words using personal stop word list
word_list_own_filtered=[]
for word in word_list_edited:
if word not in own_stop_words:
word_list_own_filtered.append(word)
# Create stemmed text
word_list_own_filtered_stemmed=[]
for word in word_list_own_filtered:
word_list_own_filtered_stemmed.append(ps.stem(word))
# compute Jaccard similarity if the previous filing is from the same firm
if permco==permco_old:
jaccard_similarity=jaccard(word_list_own_filtered_stemmed,\
word_list_old_own_filtered_stemmed)
output_file.write(";"+str(jaccard_similarity))
else:
# The previous filing is not from the same firm -> cannot compute Jaccard similarity
output_file.write(";")
# Save the current word vector to a separate variable for the comparison of the next report.
word_list_old_own_filtered_stemmed=word_list_own_filtered_stemmed
############################################
# Sub Task 6: Jaccard for the _stemmed.txt
# AND REMOVE STOP WORDS - NLTK LIST
############################################
# Caution: it is not clear whether you should first stem or first remove
# stop words. However, the NLTK stop word list appears to be based on inflected
# text, e.g., the word "having" is included, and "having" would be stemmed to "have".
# Hence, the stop word list itself is apparently not stemmed, and you should
# remove the stop words first and then stem.
# remove stop words using NLTK stop word list
word_list_NLTK_filtered=[]
for word in word_list_edited:
if word not in NLTK_stop_words:
word_list_NLTK_filtered.append(word)
# Create stemmed text
word_list_NLTK_filtered_stemmed=[]
for word in word_list_NLTK_filtered:
word_list_NLTK_filtered_stemmed.append(ps.stem(word))
# compute Jaccard similarity if the previous filing is from the same firm
if permco==permco_old:
jaccard_similarity=jaccard(word_list_NLTK_filtered_stemmed,\
word_list_old_NLTK_filtered_stemmed)
output_file.write(";"+str(jaccard_similarity))
else:
# The previous filing is not from the same firm -> cannot compute Jaccard similarity
output_file.write(";")
# Save the current word vector to a separate variable for the comparison of the next report.
word_list_old_NLTK_filtered_stemmed=word_list_NLTK_filtered_stemmed
# Write line break to output file
output_file.write("\n")
# Close 10-K filing
input_file_10_k.close()
input_file.close()
output_file.close()
stop_word_file.close()
print("Task done!")

View file

@ -0,0 +1,161 @@
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 21 09:38:32 2022
@author: Alexander Hillert, Goethe University Frankfurt
"""
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
# adjust the directory to your folder
directory="C:/Lehre/Machine Learning/Data/"
# import the data for this problem
# NOTE: IT MIGHT TAKE 3 TO 5 MINUTES TO OPEN THE DATA
data_frame=pd.read_csv(directory+"form_10-Ks_machine_learning_2007_2008_all_variables_v1.csv",sep=";")
# The rows of the data are the Form 10-K filings. Each line is one filing.
# The columns are the variables. After some identifying information,
# you find the word frequencies, i.e., how often a word (e.g., "the") shows up
# in a 10-K (e.g., 100 times)
# WARNING: THE DATA SET IS TOO LARGE TO BE DISPLAYED -> Variable Explorer
# and Console will crash.
# However, you can pick a small subset of the data and look at it.
# It keeps all columns/variables but only the first three observations.
data_frame_example=data_frame.head(3)
# you can click on this variable in the variable explorer without Spyder crashing.
# To see the variables included in the data use the following command
data_frame_column_names=data_frame.columns
# you can click on this variable in the variable explorer without Spyder crashing.
# This variable shows all column/variable names in a vector.
# split the data set into the training and testing data
# we use the filings from year 2007 as training data
data_frame_train=data_frame[data_frame.year==2007]
# and the filings from year 2008 as testing data
data_frame_test=data_frame[data_frame.year==2008]
# put the cumulative abnormal return around the filing date into a new variable.
# we follow Loughran and McDonald (2011) and use the CAR from t to t+4.
# training data
filing_car_train=data_frame_train["excess_ret_t0_t4"]
# testing data
filing_car_test=data_frame_test["excess_ret_t0_t4"]
# so far, you have absolute word counts. For example, "loss" is found 5 times.
# As the length of the 10-Ks can be different, we scale by the number of words
# in the 10-K.
document_length_train=data_frame_train["number_of_words"]
document_length_test=data_frame_test["number_of_words"]
# the word frequencies are our independent variables -> restrict the data frame
# to those variables and drop all variables that are not needed
data_frame_train=data_frame_train.drop(columns=["cik","year","month","link","filing_type","filing_date","excess_ret_t0_t4","number_of_words"])
data_frame_test=data_frame_test.drop(columns=["cik","year","month","link","filing_type","filing_date","excess_ret_t0_t4","number_of_words"])
# compute relative frequencies, i.e., divide the absolute word count by document length
data_frame_train=data_frame_train.div(document_length_train, axis=0)
data_frame_test=data_frame_test.div(document_length_test, axis=0)
# standardize the data frames
# training data
data_frame_train_mean=np.mean(data_frame_train,axis=0)
data_frame_train_sd=np.std(data_frame_train, axis=0, ddof=1)
data_frame_train_standardized=(data_frame_train-data_frame_train_mean)/data_frame_train_sd
# testing data
data_frame_test_mean=np.mean(data_frame_test,axis=0)
data_frame_test_sd=np.std(data_frame_test, axis=0, ddof=1)
data_frame_test_standardized=(data_frame_test-data_frame_test_mean)/data_frame_test_sd
# There can be missing values in the standardized variables.
# They arise if the word count for a specific word is always zero in the training
# or in the testing data. In this case, the standard deviation is zero ->
# division by zero -> NaN.
# We replace these missing values by zero.
# training data
data_frame_train_standardized=data_frame_train_standardized.fillna(0)
# testing data
data_frame_test_standardized=data_frame_test_standardized.fillna(0)
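# A related transformation could be obtained with scikit-learn's StandardScaler
# (a sketch, commented out; note two differences from the code above:
# StandardScaler divides by the population standard deviation, i.e., ddof=0,
# and in this sketch the testing data are scaled with the training-data statistics):
#from sklearn.preprocessing import StandardScaler
#scaler=StandardScaler().fit(data_frame_train)
#data_frame_train_standardized_alt=scaler.transform(data_frame_train)
#data_frame_test_standardized_alt=scaler.transform(data_frame_test)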
##########################
# Ridge regression
##########################
print("\nRidge regression - Using cross-validation\n")
# Regress the CARs on the word frequencies using Ridge regressions with cross-validation.
# In this regression, we use the training data.
# We use five-fold cross-validation.
# Recommendation for initial alphas/lambdas: 100000, 150000, and 200000
# The optimal alpha is at around 140000.
regression_Ridge_cv=RidgeCV(alphas=[135000,137000,140000,143000,145000], fit_intercept=True,cv=5).fit(data_frame_train_standardized,filing_car_train)
# get the optimal lambda
alpha_optimal_cv=regression_Ridge_cv.alpha_
print("The optimal alpha is "+str(alpha_optimal_cv))
# what is the R2 in the training and testing data?
print("The R2 in the training data is: "+str(regression_Ridge_cv.score(data_frame_train_standardized,filing_car_train)))
print("The R2 in the testing data is: "+str(regression_Ridge_cv.score(data_frame_test_standardized,filing_car_test)))
# Mean squared error using the cross-validated model
# predict y in the full training sample
filing_car_train_predicted_Ridge=regression_Ridge_cv.predict(data_frame_train_standardized)
# predict y in the testing sample
filing_car_test_predicted_Ridge=regression_Ridge_cv.predict(data_frame_test_standardized)
# Determine the MSE
print("The MSE in the full training data is: "+str(mean_squared_error(filing_car_train, filing_car_train_predicted_Ridge)))
print("The MSE in the testing data is: "+str(mean_squared_error(filing_car_test, filing_car_test_predicted_Ridge)))
######################
# LASSO regression
######################
print("\nLASSO regression - Using cross-validation\n")
# Regress the CARs on the word frequencies using LASSO regressions with cross-validation.
# In this regression, we use the training data.
# We use five-fold cross-validation.
# Recommendation for initial alphas/lambdas: 0.5, 1, and 1.5
# The optimal alpha is at around 0.86.
regression_Lasso_cv=LassoCV(alphas=[0.85,0.86,0.87,0.88,0.89], fit_intercept=True,cv=5).fit(data_frame_train_standardized,filing_car_train)
# get the optimal lambda
alpha_optimal_cv=regression_Lasso_cv.alpha_
print("The optimal alpha is "+str(alpha_optimal_cv))
# get the R2 in the training data
print("The R2 in the training data is: "+str(regression_Lasso_cv.score(data_frame_train_standardized,filing_car_train)))
# ... and testing data
print("The R2 in the testing data is: "+str(regression_Lasso_cv.score(data_frame_test_standardized,filing_car_test)))
# Mean squared error using the cross-validated model
# predict y in the full training sample
filing_car_train_predicted_Lasso=regression_Lasso_cv.predict(data_frame_train_standardized)
# predict y in the testing sample
filing_car_test_predicted_Lasso=regression_Lasso_cv.predict(data_frame_test_standardized)
# Determine the MSE
print("The MSE in the full training data is: "+str(mean_squared_error(filing_car_train, filing_car_train_predicted_Lasso)))
print("The MSE in the testing data is: "+str(mean_squared_error(filing_car_test, filing_car_test_predicted_Lasso)))
############################################################
# Compare the betas from the Ridge and the LASSO regressions
############################################################
output_file=open(directory+"comparison_coefficients_Ridge_LASSO_10-Ks.csv","w",encoding="utf-8")
output_file.write("index;word;coefficient_Ridge;coefficient_LASSO\n")
# get the list of coefficients
for i in range (0,len(data_frame_train.columns)):
output_file.write(str(i)+';'+data_frame_train.columns[i]+';'+str(regression_Ridge_cv.coef_[i])+';'+str(regression_Lasso_cv.coef_[i])+'\n')
output_file.close()
print("Completed!")

View file

@ -0,0 +1,121 @@
# -*- coding: utf-8 -*-
"""
Created on Mon Nov 13 21:40:57 2017
@author: Alexander Hillert, Goethe University Frankfurt
"""
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# Task 1: Open and print
# Open the Txt-file
print("\nTask 1 starts here!\n")
input_file=open(directory+'Fun_with_Python.txt','r')
input_text=input_file.read()
# Alternative with one command
input_text=open(directory+'Fun_with_Python.txt','r').read()
print(input_text)
# Task 2: Write text to output file
# Create file 'More_fun_with_Python.txt'
print("\nTask 2 starts here!\n")
output_file=open(directory+'More_fun_with_Python.txt','w')
output_file.write("Hallo\n")
output_file.write(input_text)
output_file.close()
# Task 3: loop
print("\nTask 3 starts here!\n")
# Alternative 1: While loop
i = 1
while i<=10:
print('Iteration Number: '+str(i))
i=i+1
# Example of a nested loop
j=1
while j<3:
print('Hallo')
j=j+1
# Alternative 2: For loop
for i in range(0,10):
print('Iteration Number: '+str(i))
# there is also a shorter notation: if there is no lower bound it is assumed to be zero
for i in range(10):
print('Iteration Number: '+str(i))
# Task 4: Print text line by line
# Print text line by line
print("\nTask 4 starts here!\n")
line_of_text=input_text.split('\n')
i=0
while i<len(line_of_text):
print("Line "+str(i+1)+": "+line_of_text[i])
i=i+1
# First alternative using a for loop
for i in range(0,len(line_of_text)):
print("Line "+str(i+1)+": "+line_of_text[i])
# Second alternative
# for ... in -> for each element of the list do ...
# line can be any name; it refers to the elements of the list
i=1
for line in line_of_text:
print("Line "+str(i)+": "+line)
i=i+1
# Task 5: count 'good'
# Count how often the word 'good' appears in the text
print("\nTask 5 starts here!\n")
number_good=input_text.count('good')
print(number_good)
# you can write the command in a shorter format
print(input_text.count('good'))
# Task 6a
# Print lines with the word 'good'
print("\nTask 6a starts here!\n")
for i in range(len(line_of_text)):
if line_of_text[i].count('good')>=1:
print(line_of_text[i])
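# The same selection written with a for ... in loop over the lines (a sketch;
# it prints the matching lines a second time):
for line in line_of_text:
    if line.count('good')>=1:
        print(line)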
# Task 7
# Print lines that start with the word 'This'
print("\nTask 7 starts here!\n")
print("\n'This' with a capital T.\n")
for i in range(len(line_of_text)):
if line_of_text[i].startswith('This')>=1:
print(line_of_text[i])
print("\n'this' with a lower case t.\n")
for i in range(len(line_of_text)):
if line_of_text[i].startswith('this')>=1:
print(line_of_text[i])
print("Yes, the command is case sensitive (2 vs. 0 matches)!")
# Task 8
# Replace the word 'good' by 'excellent'
print("\nTask 8 starts here!\n")
new_text=input_text.replace("good","excellent")
print(new_text)
# For illustration only
print("\nFor illustration only\n")
for i in range(len(line_of_text)):
new_line_of_text=line_of_text[i].replace('good','excellent')
# print the new line IF there is a change.
if not new_line_of_text==line_of_text[i]:
print(new_line_of_text)
input_file.close()
output_file.close()
print("DONE")

View file

@ -0,0 +1,72 @@
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 29 09:21:46 2015
@author: Alexander Hillert, Goethe Uni Frankfurt
"""
import re
# Please adjust the directory to your machine.
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# Open the txt file with the SEC filings
sec_filings_file=open(directory+'formidx_1998Q1.txt','r')
sec_filings_text=sec_filings_file.read()
# Create output file
output_file=open(directory+'SEC_Filings_Output.csv','w')
# Create first line with variable names
# I use semicolons as separator in csv files. You can also use any other symbol.
# However, you should make sure that the separator is not part of the data/text
# you write to the file.
# For example, it would be problematic if you use comma as separator and have
# company names like "AMERICAN HEALTHCORP, INC." or "AMERICAN FUNDING, INC."
output_file.write("Form_Type;Company_Name;CIK;Filing_Date;Link\n")
# Split the input file into separate lines
sec_filings_line=sec_filings_text.split("\n")
# Loop over all lines
for i in range(len(sec_filings_line)):
# Does the line refer to a form 10-K file?
# As pointed out by Loughran and McDonald (2011), many firms mislabelled
# their 10-K filings as 10-K405 filings. Thus, I included these filings
# as well.
# The condition below excludes amendments to 10-Ks ("10-K/A" and "10-K405/A").
# Depending on the research question at hand, one could include amendments as well.
# 10-KSB filings (small businesses) could also be included.
match_10k=re.search("\A10-K( |405 )",sec_filings_line[i])
if match_10k:
#if sec_filings_line[i].startswith("10-K ")==1 or sec_filings_line[i].startswith("10-K405 ")==1:
# Split the line such that the information can be saved in separate
# variables
# Each information item has a fixed length in the overview files of the
# SEC.
# Filing type: position 1 to 12
# Remember Python starts counting at 0 and does not include the upper bound
filing_type=sec_filings_line[i][:12]
# Company name: position 13 to 74
company_name=sec_filings_line[i][12:74]
# CIK: position 75 to 86
cik=sec_filings_line[i][74:86]
# Filing date: position 87 to 98
filing_date=sec_filings_line[i][86:98]
# Link: position 99 to end of line
link=sec_filings_line[i][98:]
# Is the 10-K filed between March 10 and March 20?
# The filing date is in the format "YYYY-MM-DD" (e.g. "1998-03-31")
filing_day=filing_date[8:10]
filing_month=filing_date[5:7]
# Is the Filing Month March?
if int(filing_month)==3 and int(filing_day)>=10 and int(filing_day)<=20:
# The filing meets the conditions -->
# Write output to the csv file
output_file.write(filing_type+";"+company_name+";"+cik+";"+filing_date+";"+link+"\n")
sec_filings_file.close()
output_file.close()
print("DONE")

View file

@ -0,0 +1,95 @@
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 29 11:07:10 2015
@author: Alexander Hillert, Goethe Uni Frankfurt
"""
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# We need the urllib package
import urllib.request
# To automatically create folders we need the os-module (OS: Operating System)
import os
# Define a user agent
# Information on user agents are from https://docs.python.org/3/howto/urllib2.html:
# "Some websites dislike being browsed by programs, or send different versions
# to different browsers. By default urllib identifies itself as Python-urllib/x.y
# (where x and y are the major and minor version numbers of the Python release,
# e.g. Python-urllib/2.5), which may confuse the site, or just plain not work.
# The way a browser identifies itself is through the User-Agent header.
opener = urllib.request.build_opener()
# The SEC recently rejected requests from the Python-urllib/x.y user agent (see above).
# To still automatically download files, you have different options.
# I have listed three examples below but there are many more:
# For a comprehensive list see, e.g.:
# https://developers.whatismybrowser.com/useragents/explore/software_type_specific/web-browser/
#opener.addheaders = [('User-agent', 'Mozilla')]
#opener.addheaders = [('User-agent', 'Chrome')]
opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)')]
urllib.request.install_opener(opener)
# Open the csv file from part 1 of the problem
input_file=open(directory+'SEC_Filings_Output.csv','r')
input_text=input_file.read()
# Split the Input File in separate lines
input_text_line=input_text.split("\n")
# sometimes you have empty lines after a split command.
# You can remove them using the following command
while input_text_line.count("")>0:
input_text_line.remove("")
# Create a subfolder in which the 10-K filings are saved.
# When you download a large number of filings I recommend using subfolders for
# each year or even for each year-month combination.
# The option "exist_ok=True" makes sure that you do not get an error if the
# folder already exists.
os.makedirs(directory+"10-Ks/", exist_ok=True)
# Loop over all lines of the csv file
#for i in range(1,len(input_text_line)):
# To avoid having to download hundreds of files when we discuss the solution
# the loop stops at 20. (Remember the upper bound is not included.)
for i in range(1,21):
# split the line into the five variables
variables=input_text_line[i].split(";")
# We only need the cik and the link.
# The cik is the 3rd variable. However, the numbering of lists starts
# at zero -> 2nd item of the list "variables"
# The link is the 5th variable -> 4th item of the list "variables"
cik=variables[2]
#cik=cik.replace(" ","")
cik=cik.strip()
link=variables[4]
#link=link.replace(" ","")
link=link.strip()
# Find the filename
# The link consists of different parts:
# For example: edgar/data/1000753/0000950129-98-001035.txt
link_parts=link.split("/")
# 1st part: edgar
# 2nd part: data
# 3rd part: cik
# 4th part: file name -> element with index 3 of the list
filename=link_parts[3]
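    # Worked example for the link shown above (a sketch, commented out):
    #example_parts="edgar/data/1000753/0000950129-98-001035.txt".split("/")
    # -> ['edgar', 'data', '1000753', '0000950129-98-001035.txt']
    # -> example_parts[3] is '0000950129-98-001035.txt'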
###########################################################################
############################ WARNING ######################################
# The filename does NOT uniquely identify the SEC filings as different firms (CIKs)
# may use the same filename. Thus, when you only use the filename files
# might be overwritten. To avoid this problem you need to have a unique name.
# Combining CIK and filename results in a unique identifier, as the
# filename appears only once per firm (CIK).
# -> use the combination of CIK and filename: cik_filename
###########################################################################
urllib.request.urlretrieve("http://www.sec.gov/Archives/"+link,\
directory+"10-Ks/"+cik+"_"+filename)
input_file.close()
print("DONE")

View file

@ -0,0 +1,144 @@
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 12 15:50:22 2016
@author: Alexander Hillert, Goethe University Frankfurt
"""
# Import regular expressions and BeautifulSoup
import re
from bs4 import BeautifulSoup
# Please adjust the directory to your machine.
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# Open the document
input_file=open(directory+'Exercise_4_Application_Regular_Expressions.txt','r',encoding="utf-8")
input_text=input_file.read()
#######################
# Task 1: remove tables
#######################
# Approach
# We search for tables until we find no more html tags that indicate the
# beginning of a table.
# Search for the start html-tag <TABLE>
table_match=re.search('<TABLE>', input_text)
print("This is the result of the re.search command:")
print(table_match)
while table_match:
# When we have identified a match, i.e. the start of a table, we save
# the position of the beginning of the table in the variable "start_table"
table_start_match=re.search('<TABLE>', input_text)
start_table=table_start_match.start()
# Next, we search for the corresponding html tag that indicates the end of
# the table and save the end position to the variable "end_table"
table_end_match=re.search('</TABLE>', input_text)
end_table=table_end_match.end()
# We can print the text between the start and end html tag to check whether
# the table has been identified correctly.
print("The text below is a table!\n"+input_text[start_table:end_table]+"\n")
# the text between the beginning and end of the html tags is the part which
# we would like to delete.
# Consequently, we keep the text before the beginning of the table as well
# as the text after the ending of the table.
input_text=input_text[:start_table]+input_text[end_table:]
# Next, we need to check whether there is another table in the rest of the
# text.
table_match=re.search('<TABLE>', input_text)
# As long as "table_match" exists, i.e., the regex results in a match, the loop
# will continue.
#########################
# Task 2: remove Exhibits
#########################
# Exhibits have the following structure
# <DOCUMENT>
# <TYPE>EX...
# ...
# </DOCUMENT>
exhibit_match=re.search('<TYPE>EX', input_text)
while exhibit_match:
exhibit_start_match=re.search('<TYPE>EX', input_text)
start_exhibit=exhibit_start_match.start()
# As the exhibits are at the end of the 10-K filing it would not be
# necessary to include an end position. We could also drop the entire text
# after "<TYPE>EX"
# It is important that we search for the </DOCUMENT> only after the exhibit
# started. Otherwise, we could get the end of the main document.
exhibit_end_match=re.search('</DOCUMENT>', input_text[start_exhibit:])
end_exhibit=start_exhibit+exhibit_end_match.end()
# Print the identified text to check whether the exhibit has been identified
# correctly
print("The text below is an exhibit!\n"+input_text[start_exhibit:end_exhibit]+"\n")
input_text=input_text[:start_exhibit]+input_text[end_exhibit:]
# Check whether there are further exhibits
exhibit_match=re.search('<TYPE>EX', input_text)
##########################
# Task 3: remove html code
##########################
# Alternative 1: remove html code without Beautiful Soup
text=re.sub('<[^>]{1,}>', '', input_text)
# This regex searches for a "<" followed by at least one character that must not
# equal > and is completed by >.
# You might have thought about using the following command
#text=re.sub('<.{1,}>', '', input_text)
# However, this command has a problem, as it would delete the following line
# entirely: <page> This is some text that should remain <page>
# The .{1,} would match 'page> This is some text that should remain <page', as
# regex are greedy. The [^>]{1,} avoids this problem by not allowing to match >
# Consequently, in the example only the two "<page>" would be deleted.
# You can verify this by using regex101.com (remember to check "Python" in the
# left menu of the webpage)
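# Tiny demonstration of the difference between the two patterns (a sketch, commented out):
#print(re.sub('<.{1,}>','','<page> This is some text that should remain <page>'))
# -> '' (the greedy match removes everything)
#print(re.sub('<[^>]{1,}>','','<page> This is some text that should remain <page>'))
# -> ' This is some text that should remain '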
# Alternative 2: remove html code using Beautiful Soup
html_text=BeautifulSoup(input_text, 'html.parser')
text=html_text.get_text()
########################
# Task 4: delete numbers
########################
# Alternative 1 - removing numbers step by step
# remove commas in numbers, e.g., 1,000 or 12,345,678 or 123,456,789,123,123
text=re.sub('[0-9]{1,3},([0-9]{3},){0,}[0-9]{3}','',text)
# remove dots in numbers, e.g., 34.56 or 12,345.678 (-> previous command leaves .678)
text=re.sub('[0-9]{0,}\.[0-9]{1,}','',text)
# remove the remaining numbers without commas and dots
text=re.sub('[0-9]','',text)
# Alternative 2 - removing numbers using a single regex
text=re.sub('[0-9]{1,}(,[0-9]{3}){0,}(\.[0-9]{1,}){0,1}','',text)
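# Worked example for the single regex above (a sketch, commented out):
#print(re.sub('[0-9]{1,}(,[0-9]{3}){0,}(\.[0-9]{1,}){0,1}','','1,234,567.89 and 34'))
# -> ' and '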
# Alternative 3 - removing numbers step by step but start with commas and dots
# 1. remove comma incl. the surrounding numbers
text=re.sub("[0-9],[0-9]","",text)
# 2. remove dots incl. the surrounding numbers
text=re.sub("[0-9]\.[0-9]","",text)
# 3. remove any remaining number
text=re.sub("[0-9]","",text)
########################
# Task 5: delete symbols
########################
# When analyzing tone, symbols do not matter, as they are not considered to be
# words and thus do not biased the total word count.
# However, for training purposes this task is included in the problem.
# There is no well defined list of which symbols should be deleted. So, you
# can add further symbols.
text=re.sub('\(|\)|\[|\]|\$|§|%|\*|/|·|-',' ',text)
text=re.sub('[^a-zA-Z \.,\!\?\n]','',text)
# Open the output file for the pure text
output_file=open(directory+'Exercise_4_Application_Regular_Expressions_clean.txt','w',encoding="utf-8")
output_file.write(text)
input_file.close()
output_file.close()
print("DONE")

View file

@ -0,0 +1,209 @@
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 12 15:50:22 2016
@author: Alexander Hillert, Goethe University Frankfurt
"""
import re
from bs4 import BeautifulSoup
# Please adjust the directory to your machine.
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# Open the 10-K
input_file=open(directory+'0000950130-98-001359.txt','r',encoding='ascii',errors='ignore')
input_text=input_file.read()
################################
# Remove tables
# Same approach as in Problem 4
################################
# Sometimes it is helpful to print the text parts that are deleted. In this
# example, we will print the first two tables that we delete.
i=1
table_match=re.search('<TABLE>', input_text)
while table_match:
# Search for the beginning of the table
table_start_match=re.search('<TABLE>', input_text)
start_table=table_start_match.start()
# search for the end of the table
table_end_match=re.search('</TABLE>', input_text)
end_table=table_end_match.end()
# The if condition and the printing are just for illustrative purposes.
# The commands display the first two tables that are removed from the text.
if i<=2:
print("This is the "+str(i)+". Table in the 10-K.\n"+input_text[start_table:end_table]+"\n")
i=i+1
# remove the table
input_text=input_text[:start_table]+input_text[end_table:]
# check whether there are further tables
table_match=re.search('<TABLE>', input_text)
################################
# Remove exhibits
# Same approach as in Problem 4
################################
# Exhibits have the following structure
# <DOCUMENT>
# <TYPE>EX...
# ...
# </DOCUMENT>
# Sometimes it is helpful to print the text parts that are deleted. In this
# example, we will print the first exhibit that we delete.
i=1
exhibit_match=re.search('<TYPE>EX', input_text)
while exhibit_match:
# Search for the beginning of the exhibit
exhibit_start_match=re.search('<TYPE>EX', input_text)
start_exhibit=exhibit_start_match.start()
# Search for the end of the exhibit
# CAUTION: search only in the text after the beginning of the exhibit, as
# </DOCUMENT> also appears earlier (e.g. end of main document)
exhibit_end_match=re.search('</DOCUMENT>', input_text[start_exhibit:])
end_exhibit=start_exhibit+exhibit_end_match.end()
if i<=1:
print("This is the "+str(i)+". Exhibit in the 10-K.\n"+input_text[start_exhibit:end_exhibit]+"\n")
i=i+1
# remove exhibit
input_text=input_text[:start_exhibit]+input_text[end_exhibit:]
exhibit_match=re.search('<TYPE>EX', input_text)
##################
# Remove html code
##################
html_text=BeautifulSoup(input_text, 'html.parser')
text=html_text.get_text()
############################
# Remove the Document Header
############################
# There are different possibilities how one can define the start of the main part of the text
# In general, you should delete all text that is uninformative for your analysis.
# Alternative 1:
# Search for Table of Contents. To not mistakenly match a reference to the
# table of contents somewhere in the text, we require a linebreak before and after.
# When the "Table of Contents" is centered, there will be whitespaces or tabs
# before and potentially also after
header_match=re.search('(?i)\n[\t ]{0,}table[\t ]of[\t ]contents[\t ]{0,}\n', text)
# Alternative 2:
# Search for Documents incorporated by reference.
header_match=re.search('\n[\t ]{0,}DOCUMENTS[\t ]INCORPORATED[\t ]BY[\t ]REFERENCE[\t ]{0,}\n', text)
if header_match:
# Drop the document header and keep only the rest of the text after the header.
text=text[header_match.end():]
#################################################
# Delete the text in "PART IV"
# This procedure is optional. Look at "Part IV" and decide whether you favor
# the approach. I think that the part should be dropped, as it is just a list
# of exhibits and some mandatory text required by the SEC [indicated by the
# capital letters in the "SIGNATURES" section].
#################################################
'''
# Alternative 1: go over all matches but keep only the last one
for match in re.finditer('\s{2,}PART IV\s{0,}\n', text):
print("Hallo")
# match now contains the last match
# Delete the text after the last match
text=text[:match.start()]
# Alternative 2: save the positions of all matches (more general approach)
# to use alternative 2, you have to comment out Alternative 1!
# Otherwise line 104 will create a problem when you execute Alternative 2.
list_start_matches=[]
list_end_matches=[]
for match in re.finditer('\s{2,}PART IV\s{0,}\n', text):
print(match)
list_start_matches.append(match.start())
list_end_matches.append(match.end())
# Position of last match
print(list_start_matches[len(list_start_matches)-1])
print(list_end_matches[len(list_start_matches)-1])
# Alternative 3: manual coding using a loop of re.searches
# create a copy of the text that we can edit
text_check_part_IV=text
part_IV_match=re.search('\s{2,}PART IV\s{0,}\n', text_check_part_IV)
# create two lists that we can use to save the start and end positions
# of the Part IV matches
list_start_matches_v2=[]
list_end_matches_v2=[]
# variable to save the position of the last match in the overall text
end_position_previous_match=0
while part_IV_match:
start_position_match=end_position_previous_match+part_IV_match.start()
end_position_match=end_position_previous_match+part_IV_match.end()
list_start_matches_v2.append(start_position_match)
list_end_matches_v2.append(end_position_match)
# update the information on the end of the last match
end_position_previous_match=end_position_previous_match+part_IV_match.end()
text_check_part_IV=text_check_part_IV[part_IV_match.end():]
part_IV_match=re.search('\s{2,}PART IV\s{0,}\n', text_check_part_IV)
# when you compare list_end_matches to list_end_matches_v2, you see that the two
# approaches yield the same result.
# To double check that the approaches have the same results, you could
# replace the Regex in lines 112, 124, and 142 by "\s{2,}PART [A-Z]{1,3}\s{0,}\n".
# In these case you have more matches and so you can better check that the
# two approaches have identical outcomes.
'''
'''
# Delete the text after the last match
text=text[:list_start_matches[len(list_start_matches)-1]]
'''
# Delete item numbers
# This is optional. It removes "Item 1.", "ITEM 1.", "Item 10.", "Item 7A."
text=re.sub('(?i)Item [0-9]{1,}A{0,1}(\s|\.|:|\n)','',text)
# Delete numbers
text=re.sub('[0-9]{1,}(,[0-9]{3}){0,}(\.[0-9]{1,}){0,1}','',text)
# Alternative stepwise procedure to delete numbers
# remove commas in numbers, e.g., 1,000 or 12,345,678
text=re.sub('[0-9]{1,3},([0-9]{3},){0,}[0-9]{3}','',text)
# remove dots in numbers, e.g., 34.56 or 12,345.678 (-> previous command leaves .678)
text=re.sub('[0-9]{0,}\.[0-9]{1,}','',text)
# remove the remaining numbers without commas and dots
text=re.sub('[0-9]','',text)
# Hyphens can be used to indicate that the word is continued in the next
# line. For example, "Micro-\nsoft" (\n is the line feed).
# Delete hyphens that are followed by a line feed.
text=re.sub('-\n','',text)
# Replace symbols by a whitespace.
# Extra whitespaces are not a problem.
text=re.sub('\(|\)|\[|\]|\$|§|%|\*|/|·|-',' ',text)
# Delete dots and commas that are not part of sentences, i.e. commas and dots
# that are preceded by a line break (potentially also whitespaces and tabs)
# and that are followed by a line break (again, there may
# also be whitespaces and tabs).
text=re.sub('\n[\t ]{0,}(\.|,){1,}[\t ]{0,}\n','\n',text)
# Drop single-character words
# One can argue whether one should implement this procedure. Loughran and
# McDonald argue in one of their papers in favor of it.
# To make sure that there is just one letter, we require a non-word character
# (\W) before and after. We use a positive lookbehind and a positive lookahead
# condition for this to ensure that the surrounding non-word characters do not
# get deleted as well.
text=re.sub('(?<=\W)[A-Za-z](?=\W)',' ',text)
# Open the output file for the pure text
output_file=open(directory+'0000950130-98-001359_clean.txt','w',encoding='ascii',errors='ignore')
output_file.write(text)
input_file.close()
output_file.close()
print("COMPLETED.")

View file

@ -0,0 +1,356 @@
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 29 11:07:10 2015
@author: Alexander Hillert, Goethe University Frankfurt
"""
import re
from bs4 import BeautifulSoup
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# Open the csv file containing the list of the 200 10-Ks
input_file=open(directory+'10-K_Sample_2011Q1_Input.csv','r')
input_text=input_file.read()
# Split the input file in separate lines
input_text_line=input_text.split("\n")
# In general, there can be empty lines in the input file. The following command
# deletes these lines
while input_text_line.count("")>0:
input_text_line.remove("")
print("The input file contains "+str(len(input_text_line)-1)+" non-empty lines with data.")
# We subtract 1 from the length, as the first line contains the variable names but no data.
# Loop over all lines
for i in range(1,len(input_text_line)):
# To see the progress of your program, you can print the iteration number.
print(str(i))
# split the lines of the CSV-file into the two variables
variables=input_text_line[i].split(";")
# We need the CIK and the filename to open the file
cik=variables[0]
filename=variables[1]
# Open the ith 10-K in the list
input_file_10_k=open(directory+'10-K_Sample/'+cik+'_'+filename,'r',encoding='ascii',errors='ignore')
input_text_10_k=input_file_10_k.read()
# the new file name should be "old_name_clean" -> we have to replace ".txt"
# by "_clean.txt"
filename=filename.replace('.txt','_clean.txt')
# Remove tables
variable=re.search('<TABLE>', input_text_10_k)
while variable:
variable=re.search('<TABLE>', input_text_10_k)
start_table=variable.start()
variable=re.search('</TABLE>', input_text_10_k)
end_table=variable.end()
input_text_10_k=input_text_10_k[:(start_table)]+input_text_10_k[(end_table):]
variable=re.search('<TABLE>', input_text_10_k)
####################### Begin of exhibits removal #########################
# Exhibits have the following structure
# <DOCUMENT>
# <TYPE>EX...
# ...
# </DOCUMENT>
# In the recent years, there are also exhibits with <TYPE>EXCEL
# -> as we search for "<TYPE>EX", the loop will delete <TYPE>EXCEL exhibits, too.
variable=re.search('<TYPE>EX', input_text_10_k)
while variable:
variable=re.search('<TYPE>EX', input_text_10_k)
start_exhibit=variable.start()
variable=re.search('</DOCUMENT>', input_text_10_k[start_exhibit:])
end_exhibit=start_exhibit+variable.end()
input_text_10_k=input_text_10_k[:(start_exhibit)]+input_text_10_k[(end_exhibit):]
variable=re.search('<TYPE>EX', input_text_10_k)
# In recent years, there are also XML-Exhibits.
# CAUTION: These are <TYPE>XML and not <TYPE>EX -> need separate cleaning
# Remove XML-Exhibits, which have the following structure
# <DOCUMENT>
# <TYPE>XML
# ...
# </DOCUMENT>
variable=re.search('<TYPE>XML', input_text_10_k)
while variable:
variable=re.search('<TYPE>XML', input_text_10_k)
start_exhibit=variable.start()
variable=re.search('</DOCUMENT>', input_text_10_k[start_exhibit:])
end_exhibit=start_exhibit+variable.end()
input_text_10_k=input_text_10_k[:start_exhibit]+input_text_10_k[end_exhibit:]
variable=re.search('<TYPE>XML', input_text_10_k)
# Furthermore, in recent years, there are also ZIP-Exhibits.
# CAUTION: These are <TYPE>ZIP and not <TYPE>EX -> need separate cleaning
# Remove ZIP-Exhibits, which have the following structure
# <DOCUMENT>
# <TYPE>ZIP
# ...
# </DOCUMENT>
variable=re.search('<TYPE>ZIP', input_text_10_k)
while variable:
variable=re.search('<TYPE>ZIP', input_text_10_k)
start_exhibit=variable.start()
variable=re.search('</DOCUMENT>', input_text_10_k[start_exhibit:])
end_exhibit=start_exhibit+variable.end()
input_text_10_k=input_text_10_k[:start_exhibit]+input_text_10_k[end_exhibit:]
variable=re.search('<TYPE>ZIP', input_text_10_k)
# In addition, there are many Graphic-Exhibits.
# CAUTION: These are <TYPE>GRAPHIC and not <TYPE>EX -> need separate cleaning
# Remove GRAPHIC-Exhibits, which have the following structure
# <DOCUMENT>
# <TYPE>GRAPHIC
# ...
# </DOCUMENT>
variable=re.search('<TYPE>GRAPHIC', input_text_10_k)
while variable:
variable=re.search('<TYPE>GRAPHIC', input_text_10_k)
start_exhibit=variable.start()
variable=re.search('</DOCUMENT>', input_text_10_k[start_exhibit:])
end_exhibit=start_exhibit+variable.end()
input_text_10_k=input_text_10_k[:start_exhibit]+input_text_10_k[end_exhibit:]
variable=re.search('<TYPE>GRAPHIC', input_text_10_k)
# Furthermore, there can also be Cover-Exhibits.
# CAUTION: These are <TYPE>COVER and not <TYPE>EX -> need separate cleaning
# Remove COVER-Exhibits, which have the following structure
# <DOCUMENT>
# <TYPE>COVER
# ...
# </DOCUMENT>
variable=re.search('<TYPE>COVER', input_text_10_k)
while variable:
variable=re.search('<TYPE>COVER', input_text_10_k)
start_exhibit=variable.start()
variable=re.search('</DOCUMENT>', input_text_10_k[start_exhibit:])
end_exhibit=start_exhibit+variable.end()
input_text_10_k=input_text_10_k[:start_exhibit]+input_text_10_k[end_exhibit:]
variable=re.search('<TYPE>COVER', input_text_10_k)
# Furthermore, there can also be PDF files attached.
# These attachments caused BeautifulSoup to crash on some computers.
# Remove PDFs
variable=re.search('<PDF>', input_text_10_k)
while variable:
variable=re.search('<PDF>', input_text_10_k)
start_pdf=variable.start()
variable=re.search('</PDF>', input_text_10_k[start_pdf:])
end_pdf=start_pdf+variable.end()
input_text_10_k=input_text_10_k[:(start_pdf)]+input_text_10_k[(end_pdf):]
variable=re.search('<PDF>', input_text_10_k)
######################## End of exhibits removal ##########################
# Remove Document Header - PART 1
# This condition should work for all 10-K filings as the html tags "<SEC-HEADER>"
# and "</SEC-HEADER>" are mandatory for all filings.
variable=re.search('</SEC-HEADER>', input_text_10_k)
if variable:
input_text_10_k=input_text_10_k[variable.end():]
# In some filings, firms do not use line feeds \n but <div> and </div>
# instead to indicate the start and the end of sentences.
# "Dieses allgemeine Element bewirkt nichts weiter als dass es in einer
# neuen Zeile des Fließtextes beginnt."
# see https://wiki.selfhtml.org/wiki/HTML/Textstrukturierung/div
# and
# "The <div> tag defines a division or a section in an HTML document.
# By default, browsers always place a line break before and after the <div> element."
# See: https://www.w3schools.com/tags/tag_div.asp
# It is important to replace <div> and </div> by linefeeds because otherwise
# the entire text will be in a single line and the subsequent commands do
# not work properly.
input_text_10_k=input_text_10_k.replace("<div>", "\n")
input_text_10_k=input_text_10_k.replace("</div>", "\n")
# Remove html code
html_text=BeautifulSoup(input_text_10_k, 'html.parser')
text=html_text.get_text()
# To get an idea of what the commands below are doing, it is helpful to
# write the current version of the text to a file and then compare it to the
# final file.
filename2=filename.replace('_clean.txt','_without_HtmlTablesExhibits.txt')
# Open the output file for the text without html code and without tables+exhibits
output_file_10_k=open(directory+'10-K_Sample/'+cik+'_'+filename2,'w',encoding='ascii',errors='ignore')
output_file_10_k.write(text)
output_file_10_k.close()
# Remove the Document Header - PART II
# The above command to remove the header ("</SEC-HEADER>") does not capture
# the entire header -> we need to delete further parts at the top of the filing.
# WARNING: The filters below may be specific to this sample of 10-Ks.
# Some firms have line breaks instead of whitespaces -> use "[ \n]" and not just " ".
variable=re.search('(?i)\n {0,}DOCUMENTS[ \n]INCORPORATED[ \n]BY[ \n]REFERENCE {0,}\n', text)
if variable:
text=text[variable.end():]
else:
variable=re.search('(?i)\n {0,}table of contents {0,}\n', text)
if variable:
text=text[variable.end():]
else:
variable=re.search('(?i)\n {0,}Indicate the number of shares outstanding\.{1,}', text)
if variable:
text=text[variable.end():]
else:
variable=re.search('(?i)may be deemed “forwardlooking statements”\.{1,}', text)
if variable:
text=text[variable.end():]
else:
variable=re.search('\nPART\.{1,}', text)
if variable:
text=text[variable.end():]
# Delete Item numbers
text=re.sub('(?i)Item {1,}[0-9]{1,}(A|B){0,1}(\s|\.|:|\n)','',text)
# Delete Part numbers
text=re.sub('(?i)Part (1|2|3|4|III|II|I|IV)','',text)
# Delete numbers:
text=re.sub('[0-9]{1,}(,[0-9]{3}){0,}(\.[0-9]{1,}){0,1}','',text)
# File names, e.g. exhibit.pdf or picture.jpeg should be removed
text=re.sub("[ |\n]\S{1,}\.(pdf|htm|html|doc|jpg|txt|xml)(?=[ \n\.\?!])", "", text)
# URLs --> Remove internet addresses
text=re.sub("http:/{0,2}", "", text)
text=re.sub("www\..{1,}\.[a-z]{2,4}(?=[ \n\.\?!])", "", text)
# In Part 4 of the programming chapter, we will determine the number of
# words per sentence. To be able to use the same underlying sample,
# we need to implement further corrections. These changes do not affect
# the percentage of negative/positive/etc. words.
# --> Only relevant for determining the number of sentences
# The text contains dots that do not indicate the end of a sentence.
# E.g., "Inc." and "St."
# The hyphen in the patterns below captures cases like "non-U.S.", where the
# abbreviation is preceded by a hyphen.
# Replace or remove specific abbreviations.
# This list is incomplete. In a research project you should spend more time
# on editing the data.
text=re.sub("(?i)(-|\s|\A|,)Inc\.", " Inc", text)
text=re.sub("(?i)(-|\s|\A|,)Corp\.", " Corp", text)
text=re.sub("(?i)(-|\s|\A|,)Ltd\.", " Ltd", text)
text=re.sub("(?i)(-|\s|\A|,)Co\.", " Co", text)
text=re.sub("(?i)(-|\s|\A|,)S\.A\.", " SA", text)
text=re.sub("(?i)(-|\s|\A|,)U\.S\.", " US", text)
text=re.sub("(?i)(-|\s|\A|,)Ms\.", " Ms", text)
text=re.sub("(?i)(-|\s|\A|,)Mr\.", " Mr", text)
text=re.sub("(?i)(-|\s|\A|,)No\.", " Number", text)
text=re.sub("(?i)(-|\s|\A|,)v\.s\.", " vs", text)
text=re.sub("(?i)(-|\s|\A|,)St\.", " ", text)
text=re.sub("(?i)(-|\s|\A|,)Jr\.", " ", text)
text=re.sub("(?i)(\s|\A|,)Jan\.", " January", text)
text=re.sub("(?i)(\s|\A|,)Feb\.", " February", text)
text=re.sub("(?i)(\s|\A|,)Mar\.", " March", text)
text=re.sub("(?i)(\s|\A|,)Apr\.", " April", text)
text=re.sub("(?i)(\s|\A|,)May\.", " May", text)
text=re.sub("(?i)(\s|\A|,)Jun\.", " June", text)
text=re.sub("(?i)(\s|\A|,)Jul\.", " July", text)
text=re.sub("(?i)(\s|\A|,)Aug\.", " August", text)
text=re.sub("(?i)(\s|\A|,)Sep\.", " September", text)
text=re.sub("(?i)(\s|\A|,)Oct\.", " October", text)
text=re.sub("(?i)(\s|\A|,)Nov\.", " November", text)
text=re.sub("(?i)(\s|\A|,)Dec\.", " December", text)
# The sequence capital letter -> dot -> capital letter -> dot indicates an abbreviation
# Three repetitions of capital letter and dot are also common in filings,
# so we need to check for three instances first.
text=re.sub("( |\n|,)[A-Z]\.[A-Z]\.[A-Z]\.", " ", text)
# now check for two instances
text=re.sub("( |\n|,)[A-Z]\.[A-Z]\.", " ", text)
# Dots after a single letter can indicate a middle name (e.g., Paul J. Smith)
# or an abbreviation --> also delete these.
text=re.sub("( |\n|,)[A-Z]\.", "", text)
# Hyphens can be used to indicate that the word is continued in the next
# line. For example, "Micro-\nsoft" (\n is the line feed).
# Replace hyphens followed by a line feed by a hyphen without line feed
text=re.sub('-\n','-',text)
# Delete the minus/hyphens
# "Short-term" -> "shortterm"
text=re.sub('-','',text)
# --> Only relevant for determining the number of sentences
# Delete dots and commas that are not part of sentences, i.e. commas and dots
# that are preceded by whitespace or line break and that are followed by
# whitespace or line break.
text=re.sub('\n(\.|,)\n','\n',text)
text=re.sub(' (\.|,) ',' ',text)
# Delete single character words
# One can argue whether one should implement this procedure. Loughran and
# McDonald argue in one of their papers in favor of it.
# To make sure that there is just one letter, we require that there is a non-word
# character (\W) before and after. We use a positive lookbehind and a positive
# lookahead condition for this to ensure that the surrounding non-word characters
# do not get deleted as well.
text=re.sub('(?i)(?<=\W)[a-z](?=\W)',' ',text)
# There are sentences that are in upper case letters. However, these are not
# "real" sentences. Examples: "RESTRICTIONS ON TRANSFER OF NOTE."
# or "THIS NOTE AND THE RIGHTS AND OBLIGATIONS EVIDENCED HEREBY ARE
# SUBORDINATED TO THE PRIOR PAYMENT OF CERTAIN OBLIGATIONS [...]"
# We save the edited text in a new variable
text_edited=text
# Split text in sentences
list_sentences=re.split('\.|!|\?', text)
# iterate the list of all sentences
for j in range(0,len(list_sentences)):
# Determine the number of upper case letters
upper_letters=len(re.findall('[A-Z]',list_sentences[j]))
# Determine the number of all letters
total_letters=len(re.findall('[A-Za-z]',list_sentences[j]))
# If there is at least one letter calculate the fraction of upper case letters
if total_letters>0:
ratio=upper_letters/total_letters
# If the fraction of upper case letters is larger than 0.9 delete
# the sentence from the text.
if ratio>0.9:
text_edited=text_edited.replace(list_sentences[j]+'.','')
text_edited=text_edited.replace(list_sentences[j]+'!','')
text_edited=text_edited.replace(list_sentences[j]+'?','')
# --> Only relevant for determining the number of sentences
# There are a few cases where a dot follows a dot or where a linefeed
# separates two dots. --> delete the second dot.
text_edited=text_edited.replace('..','.')
text_edited=text_edited.replace('.\n.','.')
# The following commands do not influence the subsequent textual analysis.
# The only purpose is to display the output in a nicer format.
# Replace lines that contain only whitespaces by a line feed.
text_edited=re.sub('\n {1,}\n','\n',text_edited)
# Replace multiple line feeds by one line feed.
text_edited=re.sub('\n{2,}','\n',text_edited)
# Open the output file for the pure text
output_file_10_k=open(directory+'10-K_Sample/'+cik+'_'+filename,'w',encoding='ascii',errors='ignore')
output_file_10_k.write(text_edited)
output_file_10_k.close()
input_file_10_k.close()
input_file.close()
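# A minimal illustrative sketch (not from the course files): the exhibit and
# table removal above repeats the same while-loop pattern for each tag. One
# possible way to factor it into a single helper function is shown below; the
# function name and the toy example are chosen here for illustration only.
import re

def remove_sections(text, start_tag, end_tag):
    # repeatedly cut everything from start_tag up to and including end_tag
    match_start = re.search(start_tag, text)
    while match_start:
        match_end = re.search(end_tag, text[match_start.start():])
        if not match_end:
            break  # unbalanced tags: stop instead of crashing
        text = text[:match_start.start()] + text[match_start.start()+match_end.end():]
        match_start = re.search(start_tag, text)
    return text

example_text = "keep<TABLE>drop</TABLE>keep"
print(remove_sections(example_text, '<TABLE>', '</TABLE>'))  # -> "keepkeep"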

View file

@ -0,0 +1,114 @@
# -*- coding: utf-8 -*-
"""
Created on Wed Apr 13 22:43:32 2016
@author: Alexander Hillert, Goethe University Frankfurt
"""
import re
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# Open the dictionary
# The dictionary has been obtained from Bill McDonald's webpage
# http://www3.nd.edu/~mcdonald/Word_Lists.html
# --> LoughranMcDonald_MasterDictionary_2014.xlsx
# --> select negative words and copy them to a txt file
file_word_list=open(directory+'LMD_Neg.txt','r',encoding="utf-8")
word_list=file_word_list.read()
# The LMD words are all in upper case
word_list=word_list.lower()
negative_words=word_list.split('\n')
# Open the csv file containing the list of the 200 10-Ks
input_file=open(directory+'10-K_Sample_2011Q1_Input.csv','r',encoding="utf-8")
input_text=input_file.read()
# Split the input file in separate lines
input_text_line=input_text.split("\n")
# In general, there can be empty lines in the input file. The following command
# deletes these lines.
while input_text_line.count("")>0:
input_text_line.remove("")
# Create output file
output_file=open(directory+'10-K_Sample_2011Q1_Output_Negative_Tone.csv','w',encoding="utf-8")
# Write variable names to the first line of the output file
output_file.write('CIK;Filename;Number_Words;Number_Negative_Words;\
Percentage_Negative_Words\n')
# Loop over all lines of the csv file
for i in range(1,len(input_text_line)):
#for i in range(1,10):
# If the execution of your scripts takes some time, printing the loop iterator
# gives you an impression of the overall progress made.
print(str(i))
# split the line into the two variables
variables=input_text_line[i].split(";")
# We need the CIK (1st column) and the filename (2nd column)
cik=variables[0]
filename=variables[1]
# modify file name to open the edited files
filename=filename.replace('.txt','')
# Open the ith 10-Ks in the list
input_file_10_k=open(directory+'10-K_Sample/'+cik+'_'+filename+'_clean.txt','r',\
encoding='ascii',errors='ignore')
# if the command above does not work (error like "file not found" or "directory not found")
# please use the following command:
#input_file_10_k=open(directory+'10-K_Sample_clean/'+cik+'_'+filename+'_clean.txt','r',encoding='ascii',errors='ignore')
input_text_10_k=input_file_10_k.read()
# Use lower case letters, too
# It is important that the formatting (lower case vs. upper case) of the word list
# and the document is identical. Remember that you have typically lower and upper case
# letters in documents -> modify text.
text=input_text_10_k.lower()
# Split the text in single words to determine the total number of words
# \W is a non-word character: "Matches any character which is not a Unicode
# word character." (Python documentation)
# this is equivalent to [^a-zA-Z0-9_], i.e. no lower case letters, no upper
# case letters, no numbers, and no underscore.
list_of_words=re.split('\W{1,}', text)
# to make sure that empty list elements do not bias the word count, we delete them.
while list_of_words.count("")>0:
list_of_words.remove("")
# It is important that you treat multiple "\W" as one. Otherwise you are left
# with elements in the list that are not actual words.
# Determine the total number of words
word_count=len(list_of_words)
# Reset the number of negative words to zero
negative_count=0
# For each negative word, count the number of occurrences
for j in range(len(negative_words)):
# the command "list_of_words.count(negative_words[i])" only matches if there
# is exact overlap between the ith negative word and the words in the list.
# For example the following two commands:
# list_of_words=["abandon","abandoned","abandonment"]
# list_of_words.count("abandon")
# yields 1 match
# In contrast,
# text_of_words="abandon abandoned abandonment"
# text_of_words.count("abandon")
# yields 3. Thus, you have to split the text into individual words!
negative_count=negative_count+list_of_words.count(negative_words[j])
# Get the percentage of negative words
percentage_negative=negative_count/word_count
# Write cik, file name, total number of words, number of negative words,
# and the percentage of negative words to output file.
output_file.write(cik+';'+filename+'_clean.txt;'+str(word_count)+';'\
+str(negative_count)+';'+str(percentage_negative)+'\n')
# Close filings
input_file_10_k.close()
print("Finished")
output_file.close()
input_file.close()
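# A minimal illustrative sketch (not from the course files): instead of calling
# list.count() once per dictionary word, one could build a Counter of the
# document once and sum the frequencies of the negative words. The toy text and
# toy word list below are made up for illustration.
import collections
import re
toy_text = "the loss was abandoned but not a loss of hope"
toy_negative_words = ["loss", "abandoned", "failure"]
toy_counter = collections.Counter(word for word in re.split('\W{1,}', toy_text.lower()) if word != "")
toy_negative_count = sum(toy_counter[word] for word in toy_negative_words)
print(toy_negative_count)  # -> 3 (two times "loss", one time "abandoned")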

View file

@ -0,0 +1,130 @@
# -*- coding: utf-8 -*-
"""
Created on Wed Apr 13 22:43:32 2016
@author: Alexander Hillert
"""
import re
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# Open the dictionary
# The dictionary is obtained from Bill McDonald's webpage
# http://www3.nd.edu/~mcdonald/Word_Lists.html
# --> LoughranMcDonald_MasterDictionary_2014.xlsx
# --> select positive words and copy them to a txt file
file_word_list=open(directory+'LMD_Pos.txt','r',encoding="utf-8")
word_list=file_word_list.read()
word_list=word_list.lower()
positive_words=word_list.split()
# Open the csv file containing the list of the 200 10-Ks
input_file=open(directory+'10-K_Sample_2011Q1_Input.csv','r',encoding="utf-8")
input_text=input_file.read()
# Split the Input File in separate lines
input_text_line=input_text.split("\n")
# In general, there can be empty lines in the input file. The following command
# deletes these lines.
while input_text_line.count("")>0:
input_text_line.remove("")
# Create output file
output_file=open(directory+'10-K_Sample_2011Q1_Output_Positive_Tone.csv','w',encoding="utf-8")
# Write variable names to the first line of the output file
output_file.write('CIK;Filename;Number_Words;Number_Pos_Words;Number_Pos_Words_adj;'\
+'Percent_Pos_Words;Percent_Pos_Words_adj\n')
# Iterate the list of the 200 10-K filings
# (the empty lines have already been removed above)
#for i in range(1,len(input_text_line)):
for i in range(1,20): # For illustration only
# If the execution of your scripts takes some time, printing the iterator
# gives you an impression of the overall progress
print(str(i))
# split the line into the two variables
variables=input_text_line[i].split(";")
# We need the CIK (1st column) and the filename (2nd column)
cik=variables[0]
filename=variables[1]
# modify file name to open the edited files
filename=filename.replace('.txt','')
# Open the ith 10-K in the list
input_file_10_k=open(directory+'/10-K_Sample/'+cik+"_"+filename+'_clean.txt','r',\
encoding='ascii',errors='ignore')
# if the command above does not work (error like "file not found" or "directory not found")
# please use the following command:
#input_file_10_k=open(directory+'10-K_Sample_clean/'+cik+'_'+filename+'_clean.txt','r',encoding='ascii',errors='ignore')
input_text_10_k=input_file_10_k.read()
# Use lower case letters, too
# It is important that the formatting (lower case vs. upper case) of the word list
# and the document are identical. Remember that you have typically lower and upper case
# letters in documents -> modify text
text=input_text_10_k.lower()
# Split the text in single words to determine the total number of words
list_of_words=re.split('\W{1,}', text)
# to make sure that empty list elements do not bias the word count, we delete them.
while list_of_words.count("")>0:
list_of_words.remove("")
# Determine total number of words
word_count=len(list_of_words)
# Reset the number of positive words and positive words adj. for negations to zero
positive_count=0
positive_count_adj=0
# For each positive word, count the number of occurrences
for j in range(len(positive_words)):
# standard count operation without controlling for negations
positive_words_found=list_of_words.count(positive_words[j])
# Loughran and McDonald (2011, JF, p.44): "We account for simple negation
# only for Fin-Pos words. Simple negation is taken to be observations
# of one of six words (no, not, none, neither, never, nobody) occurring
# within three words preceding a positive word."
# When we have identified positive words we need to search for negations
while positive_words_found>0:
# identify the position of the matched positive word in the list of all words
position_of_word=list_of_words.index(positive_words[j])
# identify the three words before the positive word and add them to a list
# the \ continues the command on the next line
list_negation=[list_of_words[max(0,position_of_word-3)],\
list_of_words[max(0,position_of_word-2)],list_of_words[max(0,position_of_word-1)]]
# check whether one of the three words in list_negation is a negation
negation_found=list_negation.count('no')+list_negation.count('not')+\
list_negation.count('none')+list_negation.count('neither')+\
list_negation.count('never')+list_negation.count('nobody')
if negation_found==0:
# no negation
positive_count_adj=positive_count_adj+1
positive_count=positive_count+1
else:
# negation
positive_count=positive_count+1
# delete the matched positive words in the original document
list_of_words[position_of_word]=''
# check whether there are further matches of the jth positive word
positive_words_found=list_of_words.count(positive_words[j])
# Write cik, file name, total number of words, and number of positive
# and adjusted positive words to the output file
output_file.write(cik+';'+filename+'_clean.txt;'+str(word_count)+';'+\
str(positive_count)+';'+str(positive_count_adj)+';'+str(positive_count/word_count)+\
';'+str(positive_count_adj/word_count)+'\n')
# Close filings
input_file_10_k.close()
print("Finished")
output_file.close()
input_file.close()
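# A minimal illustrative sketch (not from the course files) of the negation rule
# applied above: a positive word is only counted as adjusted-positive if none of
# the six negation words occurs within the three preceding words. The toy
# sentence and the positive word are made up for illustration.
toy_negations = ['no', 'not', 'none', 'neither', 'never', 'nobody']
toy_words = "the outlook is not great but the team is great".split()
toy_positive_word = "great"
toy_adjusted_count = 0
for toy_position, toy_word in enumerate(toy_words):
    if toy_word == toy_positive_word:
        toy_window = toy_words[max(0, toy_position-3):toy_position]
        if not any(negation in toy_window for negation in toy_negations):
            toy_adjusted_count = toy_adjusted_count + 1
print(toy_adjusted_count)  # -> 1: the first "great" is negated by "not", the second is not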

View file

@ -0,0 +1,111 @@
# -*- coding: utf-8 -*-
"""
Created on Wed Apr 13 22:43:32 2016
@author: Alexander Hillert, Goethe University Frankfurt
"""
# We split the text into words and sentences using regular expressions
import re
# For comparison, we also include the NLTK tokenizer
from nltk.tokenize import sent_tokenize
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# Open the csv file containing the list of the 200 10-Ks
input_file=open(directory+'10-K_Sample_2011Q1_Input.csv','r',encoding="utf-8")
input_text=input_file.read()
# Create output file
output_file=open(directory+'10-K_Sample_2011Q1_Output_WPS.csv','w',encoding="utf-8")
# Write variable names to the first line of the output file
output_file.write('CIK;Filename;Number_Words;Number_of_Sentences;'\
'Number_of_Sentences_1;Number_of_Sentences_2;Number_of_Sentences_false;'\
'Number_of_Sentences_NLTK;WPS;WPS_1;WPS_2;WPS_false;WPS_NLTK\n')
# Split the Input File in separate lines
input_text_line=input_text.split("\n")
# In general, there can be empty lines in the input file. The following command
# deletes these lines.
while input_text_line.count("")>0:
input_text_line.remove("")
# Loop over all lines
for i in range(1,len(input_text_line)):
print(str(i))
# split the line into the two variables
variables=input_text_line[i].split(";")
# We need the CIK and the filename
cik=variables[0]
filename=variables[1]
filename=filename.replace('.txt','')
# Open the ith 10-K in the list
input_file_10_k=open(directory+'10-K_Sample/'+cik+"_"+filename+'_clean.txt','r',\
encoding='ascii',errors='ignore')
text=input_file_10_k.read()
# Determine number of sentences and number of words
# Split the text in words to determine the total number of words
list_of_words=re.split('\W{1,}', text)
# to make sure that empty list elements do not bias the word count, we delete them.
while list_of_words.count("")>0:
list_of_words.remove("")
# Determine total number of words
word_count=len(list_of_words)
# Split the text by symbols that indicate the end of a sentence
# to determine the total number of sentences
list_of_sentences=re.split('[\.!\?]{1,}', text)
while list_of_sentences.count("")>0:
list_of_sentences.remove("")
# Alternative 1:
list_of_sentences_1=re.split('(?:\.|!|\?){1,}', text)
while list_of_sentences_1.count("")>0:
list_of_sentences_1.remove("")
# Alternative 2:
list_of_sentences_2=re.split('\.{1,}|!{1,}|\?{1,}', text)
while list_of_sentences_2.count("")>0:
list_of_sentences_2.remove("")
# Incorrect approach:
# re.split splits the string by the occurrences of the pattern.
# If capturing parentheses, i.e. (), are used in the pattern, then the text
# of all groups in the pattern is also returned as part of the resulting list.
# See https://docs.python.org/3/library/re.html#re.split for details
list_of_sentences_false=re.split('(\.|!|\?){1,}', text)
while list_of_sentences_false.count("")>0:
list_of_sentences_false.remove("")
# For comparison, we also include the NLTK tokenizer
list_of_sentences_nltk=sent_tokenize(text)
# Determine total number of sentences
sentence_count=len(list_of_sentences)
sentence_count_1=len(list_of_sentences_1)
sentence_count_2=len(list_of_sentences_2)
sentence_count_false=len(list_of_sentences_false)
sentence_count_nltk=len(list_of_sentences_nltk)
# Ratio of # of words over # of sentences
wps=word_count/sentence_count
wps_1=word_count/sentence_count_1
wps_2=word_count/sentence_count_2
wps_false=word_count/sentence_count_false
wps_nltk=word_count/sentence_count_nltk
# Write cik, file name, total number of words, total number of sentences,
# and WPS to the output file
output_file.write(cik+';'+filename+'_clean.txt;'+str(word_count)+';'+\
str(sentence_count)+';'+str(sentence_count_1)+';'+str(sentence_count_2)+';'+\
str(sentence_count_false)+';'+str(sentence_count_nltk)+';'+str(wps)+';'+\
str(wps_1)+';'+str(wps_2)+';'+str(wps_false)+';'+str(wps_nltk)+'\n')
# Close filing
input_file_10_k.close()
print("Finished")
output_file.close()
input_file.close()
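# A minimal illustrative sketch (not from the course files) of why the
# "incorrect approach" above inflates the sentence count: with capturing
# parentheses, re.split also returns the matched punctuation as extra list
# elements. The toy text is made up for illustration.
import re
toy_text = "First sentence. Second sentence! Third sentence?"
print(re.split('(?:\.|!|\?){1,}', toy_text))  # three sentences plus a trailing empty string
print(re.split('(\.|!|\?){1,}', toy_text))    # the punctuation marks show up as additional elements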

View file

@ -0,0 +1,189 @@
# -*- coding: utf-8 -*-
"""
Created on Sat Jul 15 21:56:41 2017
@author: Alexander Hillert, Goethe University Frankfurt
"""
import nltk
import random
import collections
import re
# We will use the NLTK Corpus containing 2,000 Movie Reviews of which 1,000
# are positive and the other 1,000 are negative.
# if you do not have the movie review corpus yet, download it:
nltk.download("movie_reviews")
from nltk.corpus import movie_reviews
# Create a list that contains the tuples of document and category.
# Category is "positive" or "negative"
documents = []
# For all categories
for category in movie_reviews.categories():
print("Category: "+str(category))
# for all reviews (identified by file ID) in the respective category
for file_ID in movie_reviews.fileids(category):
# You have to put two parentheses to indicate that you want to add a tuple.
documents.append((list(movie_reviews.words(file_ID)),category))
# Print the first element (i.e. tuple) of documents.
print(documents[0])
# print the words of the first movie review
print(documents[0][0])
# print the first word of the first movie review
print(documents[0][0][0])
# print the classification of the first movie review
print(documents[0][1])
# print the classification of the 1000th review (the last negative one)
print(documents[999][1])
# print the classification of the 1001st review (the first positive one)
print(documents[1000][1])
# The default order of the reviews is first all negative reviews and then all positive ones.
# Later we will build a training and a testing set. As we need to have positive and negative
# reviews in both sets, we randomly shuffle the documents.
random.shuffle(documents)
# Create a list of all words.
all_words = []
for word in movie_reviews.words():
# We use lower case words
#all_words.append(word.lower())
if re.search("\A[a-z]",word.lower()):
# check whether the word is actually a word, i.e., whether it contains
# at least one letter
#if re.search("[a-z]",word.lower()):
# We use lower case words
all_words.append(word.lower())
# What are the most frequently used words in the movie reviews?
# Alternative 1:
# FreqDist sorts words from the most frequently used word to the least frequently used word.
all_words_approach_1 = nltk.FreqDist(all_words)
print("Alternative 1: the top 15 words are: "+str(all_words_approach_1.most_common(15)))
# Alternative 2:
# We can also determine the most frequent words by using Counters as we did
# in Problem 12 --> transform the list of all words into a Counter
all_words_approach_2=collections.Counter(all_words)
top_15_words=all_words_approach_2.most_common(15)
print("Alternative 2: the top 15 words are: "+str(top_15_words))
# -> identical results -> perfect.
# Search for a word and see how often it appears.
print("The word 'stupid' appears "+str(all_words_approach_1["stupid"])+" in the movie reviews.")
# alternatively
print("The word 'stupid' appears "+str(all_words_approach_2["stupid"])+" in the movie reviews.")
# How can we restrict the set of words that we use for training the Naive Bayes algorithm?
# -> create a list that only contains the top 3000 words
# get the top 3000 words
# Approach 1 using the NLTK FreqDist from above
i=0
top_3000_words=all_words_approach_1.most_common(3000)
list_top_3000_words_approach_1=[]
while i<3000:
list_top_3000_words_approach_1.append(top_3000_words[i][0])
i=i+1
# Approach 2 using Counters from above
i=0
top_3000_words=all_words_approach_2.most_common(3000)
list_top_3000_words_approach_2=[]
while i<3000:
list_top_3000_words_approach_2.append(top_3000_words[i][0])
i=i+1
# select the list of approach 1 or 2
word_features=list_top_3000_words_approach_1
# We need to identify the words we want to use for classification in the documents.
# We define a function for that.
def find_features(document):
words = set(document)
features = {}
# loop over all the words we consider for the classification
for word in word_features:
# The expression returns either true or false
features[word] = (word in words)
return features
# To get an idea of what the function find_features() does, let's print the features
# for one review.
print((find_features(movie_reviews.words('neg/cv000_29416.txt'))))
feature_set = [(find_features(review), category) for (review, category) in documents]
# What does the feature set look like?
print(feature_set[0])
# -> it is still a tuple
print(feature_set[0][0])
# the first element contains the 3000 words we use for classification, each with "True" or "False"
# depending on whether the word appears in the review
print(feature_set[0][1])
# This is the information on whether the review is positive or negative
# Define the training and testing set
# The training set comprises the first 1900 reviews and the testing set the last 100 reviews.
training_set=feature_set[:1900]
testing_set=feature_set[1900:]
# First we have to train the Naive Bayes Classifier.
# It will determine which of the words from word_features appear mostly in positive
# reviews and which appear mostly in negative reviews.
classifier=nltk.NaiveBayesClassifier.train(training_set)
# The following command prints the 20 words that best discriminate between
# positive and negative reviews.
classifier.show_most_informative_features(20)
# Let's classify the first element of feature_set
# The input for the classification needs to be the list of words with True or False
print(classifier.classify(feature_set[0][0]))
print("The review is actually: "+str(feature_set[0][1]))
# classify the 100 reports from the testing set
# they have the position 1900 to 2000 in the feature set.
i=1900
classified_set=[]
while i<2000:
classified_set.append(classifier.classify(feature_set[i][0]))
i=i+1
# Compare classification result with actual category
i=0
# In this list we save tuples of [predicted category, actual category]
comparison=[]
# In this list we simply save "accurate" and "inaccurate"
comparison_2=[]
while i<100:
comparison.append([classified_set[i],feature_set[i+1900][1]])
# If the predicted and actual classification match -> accurate
if comparison[i][0]==comparison[i][1]:
comparison_2.append("accurate")
else:
comparison_2.append("inaccurate")
i=i+1
print(comparison)
# We need the number of accurate and inaccurate classifications
comparison_counter=collections.Counter(comparison_2)
print(comparison_counter)
# NLTK can compute the accuracy directly
# What is the accuracy for the testing set?
print("Naive Bayes accuracy (in percent):", (nltk.classify.accuracy(classifier, testing_set))*100)
# Same value as from our own calculations -> perfect!
# What is the accuracy for the training set?
print("Naive Bayes accuracy in training data (in percent):", (nltk.classify.accuracy(classifier, training_set))*100)
# Higher than in the testing dataset -> expected.
print("completed!")

View file

@ -0,0 +1,55 @@
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 29 11:07:10 2015
@author: Alexander Hillert, Goethe University Frankfurt
"""
# To determine file size we need the OS package
import os
# Please adjust the directory to your machine.
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# Open the csv file containing the list of the 200 10-Ks
input_file=open(directory+'10-K_Sample_2011Q1_Input.csv','r',encoding="utf-8")
input_text=input_file.read()
# Create output file
output_file=open(directory+'10-K_Sample_2011Q1_File_Size.csv','w',encoding="utf-8")
output_file.write('CIK;Filename;File_size_gross;File_size_net\n')
# Split the input file in separate lines
input_text_line=input_text.split("\n")
# In general, there can be empty lines in the input file. The following command
# deletes these lines.
while input_text_line.count("")>0:
input_text_line.remove("")
# Loop over all lines
for i in range(1,len(input_text_line)):
print(str(i))
# split the line into the two variables
variables=input_text_line[i].split(";")
# We need the CIK and the filename
cik=variables[0]
filename=variables[1]
filename=filename.replace('.txt','')
# File size of the complete submission file (gross file size)
# You have to divide the result by 1024 to get the size in kilobytes
# The file size will be affected by html code and exhibits.
# APPLY THE COMMAND THAT IS SHOWN ON SLIDE 62.
size_gross=XXX/1024
# File size of the main text file (net file size)
# You have to divide the result by 1024 to get the size in kilobytes
size_net=XXX/1024 # SAME COMMAND AS FOR GROSS FILE SIZE BUT APPLIED TO THE _clean.txt
output_file.write(cik+';'+filename+';'+str(size_gross)+';'+str(size_net)+'\n')
print("Finished")
output_file.close()
input_file.close()
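# The slide referenced above is not reproduced here. A minimal illustrative
# sketch, assuming the intended command is os.path.getsize(), which returns a
# file's size in bytes; a temporary file is created below purely for illustration.
import os
import tempfile
with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False) as toy_file:
    toy_file.write("some sample content")
    toy_path = toy_file.name
toy_size_kb = os.path.getsize(toy_path)/1024
print(str(toy_size_kb) + " KB")
os.remove(toy_path)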

View file

@ -0,0 +1,150 @@
# -*- coding: utf-8 -*-
"""
Created on Tue Jul 11 09:19:54 2017
@author: Alexander Hillert, Goethe University Frankfurt
"""
# We need regular expressions and counters (->collections)
import re
import collections
# for the bigram part, the sentence tokenizer is helpful
from nltk.tokenize import sent_tokenize
# Please adjust the directory to your machine.
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# Open the csv file containing the list of the 10-Ks from MSFT, KO, and 3M.
input_file=open(directory+'list_10-K_filings_textual_similarity.csv','r',encoding="utf-8")
input_text=input_file.read()
# Split the input file in separate lines
input_text_line=input_text.split("\n")
# In general, there can be empty lines in the input file. The following command
# deletes these lines.
while input_text_line.count("")>0:
input_text_line.remove("")
# Create an empty counter variable
words_counter=collections.Counter()
# counter for the extra task
bigram_counter=collections.Counter()
# Loop over all lines
for i in range(1,len(input_text_line)):
print(str(i))
# split the line into the eight variables
variables=input_text_line[i].split(";")
# We need the CIK (1st column) and the filename (8th column)
cik=variables[0]
filename_parts=re.split('/',variables[7])
filename=filename_parts[3].replace('.txt','')
# Open the ith 10-K in the list; remember to specify the encoding
# The files are available in the zip file "10-K_Textual_Similarity_edited.zip".
input_file_10_k=open(directory+'10-K_Textual_Similarity_edited/'+cik+'_'+\
filename+'_edited.txt', 'r', encoding='ascii', errors='ignore')
# if the command above does not work (error like "file not found" or "directory not found")
# please use the following command:
#input_file_10_k=open(directory+'10-K_Textual_Similarity/'+cik+'_'+filename+'_edited.txt','r',encoding='ascii',errors='ignore')
# read the content from the file
input_text_10_k=input_file_10_k.read()
# THINK ABOUT WHETHER WE SHOULD USE LOWER OR UPPER CASE CONSISTENTLY!
input_text_10_k=
# Split text into words
list_of_words=re.split('\W{1,}',input_text_10_k)
# Remember: there can be empty list elements!
# Make sure that empty list elements do not bias the word count -> delete them!
# You can use an approach similar to the one in lines 24 and 25.
COMMANDS TO BE ADDED
# Add the words to our counter
words_counter=words_counter+XXXX # COMPLETE THIS COMMAND
#############################################
# optional part for the extra task on bigrams
#############################################
# create an empty list for the bigrams
'''
bigram_list=[]
# split the text into sentences
list_of_sentences=XXX
# create the bigrams IN EACH SENTENCE
for sentence in list_of_sentences:
# split the sentence into words
list_of_words=XXX
# remove empty elements
while list_of_words.count("")>0:
list_of_words.remove("")
# go over all potential two word combinations in the sentence.
for word_number in range(XXX,YYY):
# add the bigram (two words connected by whitespace) to the list
bigram_list.append(WORD_1 + " " + WORD_2)
# same command as in line 70
bigram_counter=bigram_counter+XXX
# end of extra task
'''
# Close the 10-K filing
input_file_10_k.close()
input_file.close()
######################
# Top 100 single words
######################
# Open the csv file containing the 100 most frequently used words
output_file=open(directory+'Problem_12_100_most_frequent_words.csv','w',encoding="utf-8",errors="ignore")
output_file.write("rank;word;count\n")
# Get the 100 most frequent words
top_100_words=words_counter.XXXX # COMPLETE THIS COMMAND
# Write the 100 most frequent words to the csv file
# REMEMBER: Python starts counting at 0, while humans start at 1.
# So, the most frequent word (rank 1 in human counting) is element 0 for Python.
for i in range(1,101):
output_file.write(str(i)+";"+XXXX (-> word)+";"+XXXX (-> the frequency of the word)+"\n") # COMPLETE THIS COMMAND
# Close the csv file
output_file.close()
######################
# Extra task
# Top 100 bigrams
######################
'''
# Open the csv file containing the 100 most frequently used BIGRAMS
output_file_bigram=open(directory+'Problem_12_100_most_frequent_bigrams.csv','w',encoding="utf-8")
output_file_bigram.write("rank;word;count\n")
# Get the 100 most frequent bigrams: same command as above
top_100_bigrams=bigram_counter.XXX
# Write the 100 most frequent bigrams to the csv file.
# same logic as above
for i in range(1,101):
output_file_bigram.write(str(i)+";"+XXXX (-> word)+";"+XXXX (-> the frequency of the word)+"\n") # COMPLETE THIS COMMAND
# Close the csv file
output_file_bigram.close()
'''
print("Task done!")

View file

@ -0,0 +1,83 @@
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 29 11:07:10 2015
@author: Alexander Hillert, Goethe University Frankfurt
"""
# We need regular expressions and stemming.
import re
from nltk.stem import PorterStemmer
# Depending on how you would like to split the text into words, you may need tokenize.
from nltk.tokenize import word_tokenize
# Please adjust the directory to your machine.
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# Open the csv file containing the list of the 10-Ks from MSFT, KO, and 3M
input_file=open(directory+'list_10-K_filings_textual_similarity.csv','r',encoding="utf-8")
input_text=input_file.read()
# Split the Input File in separate lines
input_text_line=input_text.split("\n")
# In general, there can be empty lines in the input file. The following command
# deletes these lines.
while input_text_line.count("")>0:
input_text_line.remove("")
# Loop over all lines
for i in range(1,len(input_text_line)):
print(str(i))
# split the line into the eight variables
variables=input_text_line[i].split(";")
# We need the CIK (1st column) and the filename (8th column)
cik=variables[0]
filename_parts=re.split('/',variables[7])
filename=filename_parts[3].replace('.txt','')
# Open the ith 10-K in the list; remember to specify the encoding
input_file_10_k=open(directory+'10-K_Textual_Similarity_edited/'+cik+'_'+filename\
+'_edited.txt', 'r', encoding='ascii', errors='ignore')
# if the command above does not work (error like "file not found" or "directory not found")
# please use the following command:
#input_file_10_k=open(directory+'10-K_Textual_Similarity/'+cik+'_'+filename+'_edited.txt','r',encoding='ascii',errors='ignore')
# Get the text of the 10-K
input_text_10_k=input_file_10_k.read()
# We need to tokenize the text because stemming only works on a word-by-word basis.
# Stemming an entire document without splitting into words does not work!
# The problem is that \n gets lost in this process --> we cannot easily
# recreate the document.
# Solution: replace \n by \n and some indicator that there was a line break.
# For example replace("\n","\nHereWasALinebreak")
input_text_10_k=input_text_10_k.replace("\n",XXXX)
# Split text into words
word_list=XXXX
# Stem the text from above
text_stemmed=''
# LOOP ALL WORDS, STEM THEM AND RECONNECT THEM.
# WARNING: WHEN RECONNECTING WORDS YOU NEED TO INCLUDE A WHITESPACE BETWEEN
# THE WORDS. OTHERWISE, THE TEXT GETS MESSED UP.
for word in word_list:
text_stemmed=text_stemmed+XXX # TO BE COMPLETED
# To recreate the text, we need to replace the line break indicators by \n.
# WARNING: PAY ATTENTION TO UPPER/LOWER CASE, IT CAN CHANGE.
text_stemmed=text_stemmed.replace(XXXX,XXXX) # UNDO THE TRANSFORMATION FROM LINE 56.
# Open the output file for the stemmed text
output_file_10_k=open(directory+'10-K_Textual_Similarity_edited/'+cik+'_'+filename\
+'_stemmed.txt', 'w', encoding='ascii', errors='ignore')
output_file_10_k.write(text_stemmed)
output_file_10_k.close()
input_file_10_k.close()
input_file.close()
print("Task done!")

View file

@ -0,0 +1,101 @@
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 29 11:07:10 2015
@author: Alexander Hillert, Goethe University Frankfurt
"""
# For the full task, we need a large set of packages:
# regular expression, stemming, stop words, tokenization, and counters.
import re
#from nltk.tokenize import word_tokenize # NOT needed for the base comparison
#from nltk.corpus import stopwords # NOT needed for the base comparison
#from nltk.stem import PorterStemmer # NOT needed for the base comparison
from collections import Counter
#ps=PorterStemmer() # NOT needed for the base comparison
# Please adjust the directory to your machine.
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# Open the csv file containing the list of the 10-Ks from MSFT, KO, and 3M
input_file=open(directory+'list_10-K_filings_textual_similarity.csv','r',encoding="utf-8")
input_text=input_file.read()
# Split the input file in separate lines
input_text_line=input_text.split("\n")
# In general, there can be empty lines in the input file. The following command
# deletes these lines.
while input_text_line.count("")>0:
input_text_line.remove("")
# Open the output csv file in which we write the similarities
output_file=open(directory+'list_10-K_filings_textual_similarity_jaccard.csv','w',encoding="utf-8")
# Write variable names to first line
output_file.write(input_text_line[0]+';Jaccard\n')
# set default values for variables
word_list_old_edited=""
word_list_edited=""
# Loop over all lines
for i in range(1,len(input_text_line)):
print(str(i))
# split the line into the eight variables
variables=input_text_line[i].split(";")
# We need the CIK (1st column) and the filename (8th column)
cik=variables[0]
filename_parts=re.split('/',variables[7])
filename=filename_parts[3].replace('.txt','')
# Open the ith 10-K; remember to specify the encoding
input_file_10_k=open(directory+'10-K_Textual_Similarity_edited/'+cik+'_'+filename+\
'_edited.txt', 'r', encoding='ascii', errors='ignore')
# if the command above does not work (error like "file not found" or "directory not found")
# please use the following command:
#input_file_10_k=open(directory+'10-K_Textual_Similarity/'+cik+'_'+filename+'_edited.txt','r',encoding='ascii',errors='ignore')
input_text_10_k=input_file_10_k.read()
# Split text into words
word_list_edited=re.split("\W{1,}",input_text_10_k.lower())
# Alternative using tokenize
#word_list_edited=word_tokenize(input_text_10_k.lower())
# check whether the previous entry of the list is from the same firm
permco=input_text_line[i].split(";")[1]
permco_old=input_text_line[i-1].split(";")[1]
############################################
# Sub Task 1: Jaccard for the _edited.txt
############################################
# compute Jaccard similarity if the previous filing is from the same firm
if permco==permco_old:
counter_current_10k=Counter(XXX)
counter_previous_10k=Counter(XXX)
intersection=XXX see "Introduction_Container_Datatypes.py" (at the end of the file)
union=XXXX see "Introduction_Container_Datatypes.py" (at the end of the file)
jaccard_similarity=XXXx # ELEMENTS IN INTERSECTION / # ELEMENTS IN UNION
output_file.write(input_text_line[i]+";"+str(jaccard_similarity)+"\n")
else:
# The previous filing is not from the same firm -> cannot compute Jaccard similarity
output_file.write(input_text_line[i]+";"+"\n")
# Save the current word vector to a separate variable for the comparison of the next report.
word_list_old_edited=word_list_edited
# Close 10-K filing
input_file_10_k.close()
input_file.close()
output_file.close()
print("Task done!")

View file

@ -0,0 +1,159 @@
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 21 09:38:32 2022
@author: Alexander Hillert, Goethe University Frankfurt
"""
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
# adjust the directory to your folder
directory="C:/Lehre/Machine Learning/Data/"
# import the data for this problem
# NOTE: IT MIGHT TAKE 3 TO 5 MINUTES TO OPEN THE DATA
data_frame=pd.read_csv(directory+"form_10-Ks_machine_learning_2007_2008_all_variables_v1.csv",sep=";")
# The rows of the data are the Form 10-K filings. Each line is one filing.
# The columns are the variables. After some identifying information,
# you find the word frequencies, i.e., how often a word (e.g., "the") shows up
# in a 10-K (e.g., 100 times)
# WARNING: THE DATA SET IS TOO LARGE TO BE DISPLAYED -> Variable Explorer
# and Console will crash.
# However, you can pick a small subset of the data and look at it.
# It lists all columns/variables and the first three observations.
data_frame_example=data_frame.head(3)
# you can click on this variable in the variable explorer without Spyder crashing.
# To see the variables included in the data use the following command
data_frame_column_names=data_frame.columns
# you can click on this variable in the variable explorer without Spyder crashing.
# This variable shows all column/variable names in a vector.
# split the data set into the training and testing data
# we use the filings from year 2007 as training data
data_frame_train=data_frame[data_frame.year==2007]
# and the filings from year 2008 as testing data
data_frame_test=data_frame[data_frame.year==2008]
# put the cumulative abnormal return around the filing date into a new variable.
# we follow Loughran and McDonald (2011) and use the CAR from t to t+4.
# training data
filing_car_train=data_frame_train["excess_ret_t0_t4"]
# testing data
filing_car_test=data_frame_test["excess_ret_t0_t4"]
# so far, you have absolute word counts. For example, "loss" is found 5 times.
# As the length of the 10-Ks can be different, we scale by the number of words
# in the 10-K.
document_length_train=data_frame_train["number_of_words"]
document_length_test=data_frame_test["number_of_words"]
# the word frequencies are our independent variables -> restrict the data frame
# to those variables and drop all variables that are not needed
data_frame_train=data_frame_train.drop(columns=["cik","year","month","link","filing_type","filing_date","excess_ret_t0_t4","number_of_words"])
data_frame_test=data_frame_test.drop(columns=["cik","year","month","link","filing_type","filing_date","excess_ret_t0_t4","number_of_words"])
# compute relative frequencies, i.e., divide the absolute word count by document length
data_frame_train=data_frame_train.div(document_length_train, axis=0)
data_frame_test=data_frame_test.div(document_length_test, axis=0)
# standardize the data frames
# training data
data_frame_train_mean=TO BE COMPLETED
data_frame_train_sd=TO BE COMPLETED
data_frame_train_standardized=TO BE COMPLETED
# testing data
data_frame_test_mean=TO BE COMPLETED
data_frame_test_sd=TO BE COMPLETED
data_frame_test_standardized=TO BE COMPLETED
# There can be missing values in the standardized variables.
# They arise if the word count for a specific word is always zero in the training
# or in the testing data. In this case, the standard deviation is zero ->
# division by zero -> NaN.
# We replace these missing values by zero.
# training data
data_frame_train_standardized=data_frame_train_standardized.fillna(0)
# testing data
data_frame_test_standardized=data_frame_test_standardized.fillna(0)
##########################
# Ridge regression
##########################
print("\nRidge regression - Using cross-validation\n")
# Regress the CARs on the word frequencies using Ridge regressions with cross-validation.
# In this regression, we use the training data.
# We use five-fold cross-validation.
# Recommendation for initial alphas/lambdas: 100000, 150000, and 200000
regression_Ridge_cv=RidgeCV(alphas=TO BE COMPLETED, fit_intercept=True,cv=5).fit(TO BE COMPLETED)
# get the optimal lambda
alpha_optimal_cv=TO BE COMPLETED
print("The optimal alpha is "+str(alpha_optimal_cv))
# what is the R2 in the training and testing data?
print("The R2 in the training data is: "+str(regression_Ridge_cv.TO BE COMPLETED))
print("The R2 in the testing data is: "+str(regression_Ridge_cv.TO BE COMPLETED))
# Mean squared error using the cross-validated model
# predict y in the full training sample
filing_car_train_predicted_Ridge=regression_Ridge_cv.TO BE COMPLETED
# predict y in the testing sample
filing_car_test_predicted_Ridge=regression_Ridge_cv.TO BE COMPLETED
# Determine the MSE
print("The MSE in the full training data is: "+str(mean_squared_error(TO BE COMPLETED)))
print("The MSE in the testing data is: "+str(mean_squared_error(TO BE COMPLETED)))
######################
# LASSO regression
######################
print("\nLASSO regression - Using cross-validation\n")
# Regress the CARs on the word frequencies using LASSO regressions with cross-validation.
# In this regression, we use the training data.
# We use five-fold cross-validation.
# Recommendation for initial alphas/lambdas: 0.5, 1, and 1.5
regression_Lasso_cv=LassoCV(alphas=TO BE COMPLETED, fit_intercept=True,cv=5).fit(TO BE COMPLETED)
# get the optimal lambda
alpha_optimal_cv=TO BE COMPLETED
print("The optimal alpha is "+str(alpha_optimal_cv))
# get the R2 in the training data
print("The R2 in the training data is: "+str(regression_Lasso_cv.TO BE COMPLETED))
# ... and testing data
print("The R2 in the testing data is: "+str(regression_Lasso_cv.TO BE COMPLETED))
# Mean squared error using the cross-validated model
# predict y in the full training sample
filing_car_train_predicted_Lasso=regression_Lasso_cv.TO BE COMPLETED
# predict y in the testing sample
filing_car_test_predicted_Lasso=regression_Lasso_cv.TO BE COMPLETED
# Determine the MSE
print("The MSE in the full training data is: "+str(mean_squared_error(TO BE COMPLETED)))
print("The MSE in the testing data is: "+str(mean_squared_error(TO BE COMPLETED)))
############################################################
# Compare the betas from the Ridge and the LASSO regressions
############################################################
output_file=open(directory+"comparison_coefficients_Ridge_LASSO_10-Ks.csv","w",encoding="utf-8")
output_file.write("index;word;coefficient_Ridge;coefficient_LASSO\n")
# get the list of coefficients
for i in range (0,len(data_frame_train.columns)):
output_file.write(str(i)+';'+data_frame_train.columns[i]+';'+str(regression_Ridge_cv.coef_[i])+';'+str(regression_Lasso_cv.coef_[i])+'\n')
output_file.close()
print("Completed!")

View file

@ -0,0 +1,88 @@
# -*- coding: utf-8 -*-
"""
Created on Fri Feb 15 21:37:53 2019
@author: Alexander Hillert, Goethe University Frankfurt
"""
# Please adjust the directory to your machine.
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# It is important to use a single forward slash / but not a single backslash \.
# For MAC users: your directory will usually start with "/Users/". For example:
#directory="/Users/FirstnameLastname/Textual Analysis/Programming/Files/"
# open the Fun_with_Python text file
input_file=open(directory+"Fun_with_Python.txt","r")
###################################
# Programming Problem 1
###################################
# Task 1: open the file 'Fun_with_Python.txt' in Spyder and print its content
# The file can be found in our data folder
# get the text from the file
input_text= TO BE COMPLETED
# print the content, i.e., the text of the file (previous line)
print(TO BE COMPLETED)
# See slide 7
# Task 2: Write the content of 'Fun_with_Python.txt' to a new text file
# with the name 'More_fun_with_Python.txt'.
# ENTER YOUR COMMANDS HERE
# See slide 8.
# REMEMBER to close your file. If you do not close the new txt file, its content
# will not be saved to the hard drive. You will find an empty txt in your file manager.
# Task 3: Write a loop that prints some text (whatever you like) ten times.
# ENTER YOUR COMMANDS HERE
# See slide 9.
# You have several options. While loop, for X in range() loop, etc.
# Task 4: Print the text of the "Fun_with_Python" file line by line!
# ENTER YOUR COMMANDS HERE
# See slide 10.
# You need a loop (Task 3) and in each iteration of the loop have Python print
# a line of text.
# Task 5: Count how often the word 'good' appears in the document 'Fun_with_Python.txt'!
# ENTER YOUR COMMANDS HERE
# See slide 11.
# Task 6a: Now, print only the lines that contain the word 'good'!
# ENTER YOUR COMMANDS HERE
# See also slide 12.
# You can use the line-by-line printing from Task 4 and combine it with the command ".count()" from Task 5
# and add the if condition from slide 12.
# If condition: for each line check whether the specific line contains the word "good".
# Task 7: print only the lines that start with the word 'This'!
# ENTER YOUR COMMANDS HERE
# See slide 15.
# This is very similar to task 6. You only need to modify the if condition a bit.
# Task 8a: Replace the word "good" by "excellent" and display the new text!
# See slide 16.
# ENTER YOUR COMMANDS HERE
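# A minimal illustrative sketch (not from the course files) of the read / write /
# count pattern asked for above, run on a small file created here purely for
# illustration so that it does not depend on the course data folder.
toy_output = open("toy_fun.txt", "w")
toy_output.write("Python is good.\nThis line is also good.\nThis one is not.\n")
toy_output.close()
toy_input = open("toy_fun.txt", "r")
toy_text = toy_input.read()
print(toy_text)                    # print the content of the file
print(toy_text.count("good"))      # count a word -> 2
for toy_line in toy_text.split("\n"):
    if toy_line.count("good") > 0:
        print(toy_line)            # print only the lines that contain the word
toy_input.close()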

View file

@ -0,0 +1,72 @@
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 29 09:21:46 2015
@author: Alexander Hillert, Goethe Uni Frankfurt
"""
# Please adjust the directory to your machine.
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# It is important to use a single forward slash / but not a single backslash \.
# For MAC users: your directory will usually start with "/Users/". For example:
#directory="/Users/FirstnameLastname/Textual Analysis/Programming/Files/"
# Open the txt file with the SEC filings
sec_filings_file=open(directory+'formidx_1998Q1.txt','r')
sec_filings_text=sec_filings_file.read()
# Create output file
output_file=open(directory+'SEC_Filings_Output.csv','w')
# Create first line with variable names
# I use semicolons as separator in csv files. You can also use any other symbol.
# However, you should make sure that the separator is not part of the data/text
# you write to the file.
# For example, it would be problematic if you use comma as separator and have
# company names like "AMERICAN HEALTHCORP, INC." or "AMERICAN FUNDING, INC."
output_file.write("Form_Type;Company_Name;CIK;Filing_Date;Link\n")
# Split the input file in separate lines
# DO THE LINE SPLIT
sec_filings_line=
# Loop over all lines
# you can get the number of lines by computing the length of the list of lines,
# i.e. by determining the length of sec_filings_line.
for / while : # COMPLETE LOOP
# Does the line refer to a form 10-K file?
if : # USE AN IF CONDITION TO TEST THIS -> see TASKS 7 and 8 of PROBLEM 1
# Split the line such that the information can be saved in separate
# variables
# Each information item has a fixed length in the overview files of the
# SEC.
# SEE SLIDE 18 FOR INFORMATION ON THE LENGTH OF THE SEPARATE COLUMNS.
# COMPLETE THE COMMANDS BELOW
filing_type=
company_name=
cik=
filing_date=
link=
# Is the 10-K filed between March 10 and March 20?
filing_day=
filing_month=
# Is the Filing Month March?
if : # COMPLETE THE IF-CONDITION
# Is the Filing Day between 10 and 20?
if : # COMPLETE THE IF-CONDITION
# The filing meets the conditions -->
# Write output to the csv file
output_file.write(filing_type+";"+company_name+";"+cik+";"+filing_date+";"+link+"\n")
# Close your input and output file in the end
sec_filings_file.close()
output_file.close()
print("DONE")

View file

@ -0,0 +1,103 @@
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 29 11:07:10 2015
@author: Alexander Hillert, Goethe Uni Frankfurt
"""
# Please adjust the directory to your machine.
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# It is important to use a single forward slash / but not a single backslash \.
# For MAC users: your directory will usually start with "/Users/". For example:
#directory="/Users/FirstnameLastname/Textual Analysis/Programming/Files/"
# We need the urllib package for the download.
import urllib.request
# To automatically create folders, we need the os-module (OS: Operating System)
import os
###############################################################################
# Technical issue
# As of March 2021, the SEC no longer accepts requests by the standard urllib settings
# you have to make some adjustments
###############################################################################
# Define a user agent
# Information on user agents are from https://docs.python.org/3/howto/urllib2.html:
# "Some websites dislike being browsed by programs, or send different versions
# to different browsers. By default urllib identifies itself as Python-urllib/x.y
# (where x and y are the major and minor version numbers of the Python release,
# e.g. Python-urllib/2.5), which may confuse the site, or just plain not work.
# The way a browser identifies itself is through the User-Agent header."
opener = urllib.request.build_opener()
# The SEC recently rejected requests from the Python-urllib/x.y user agent (see above)
# To still automatically download files, you have different options.
# I have listed three examples below but there are many more:
# For a comprehensive list see, e.g.:
# https://developers.whatismybrowser.com/useragents/explore/software_type_specific/web-browser/
#opener.addheaders = [('User-agent', 'Mozilla')]
#opener.addheaders = [('User-agent', 'Chrome')]
opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)')]
urllib.request.install_opener(opener)
# END of the technical issues
# Open the csv file from part 1 of the problem
input_file=open(directory+'SEC_Filings_Output.csv','r')
input_text=input_file.read()
# Split the input file into separate lines
input_text_line=input_text.split("\n")
# Create a subfolder in which the 10-K filings are saved.
# When you download a large number of filings, I recommend using subfolders for
# each year or even for each year-month-day combination.
# In this problem, a single subfolder is fine.
os.makedirs( COMPLETE THE COMMAND )
# See slide 18 for information on the os.-commands!
# IN GENERAL, IF YOU SEE AN UNKNOWN COMMAND, GOOGLE IT TO GET INFORMATION.
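# For illustration (a sketch; the folder name '10-K_Filings' is an assumption):
# os.makedirs(directory+'10-K_Filings/', exist_ok=True)
# The argument exist_ok=True avoids an error if the folder already exists.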
# Loop over all lines of the csv file
# Like in part 1 of the problem, you can get the number of lines by computing
# the length of the list of lines, i.e. by determining the length of input_text_line.
for / while: # COMPLETE THE LOOP
# split the line into the five variables
# THE ; IS THE SEPARATOR IN THE CSV -> USE THE split() COMMAND
variables=
# We only need the cik and the link to download the file.
# The cik is the 3rd variable.
# The link is the 5th variable
cik=
link=
# identify the filename
# The link consists of different parts:
# For example: edgar/data/1000753/0000950129-98-001035.txt
link_parts= # USE A SPLIT
# 1st part: edgar
# 2nd part: data
# 3rd part: cik
# 4th part: file name -> see next line
filename=link_parts[FILL IN THE NUMBER HERE]
###########################################################################
############################ WARNING ######################################
# The filename does NOT uniquely identify the SEC filings, as different firms (CIKs)
# may use the same filename. Thus, if you only use the filename, files
# might be overwritten. To avoid this problem, you need a unique name.
# Combining CIK and filename results in a unique identifier, as the
# filename appears only once per firm (CIK).
# -> use the combination of CIK and filename: cik_filename
###########################################################################
urllib.request.urlretrieve(TO BE COMPLETED)
# See slide 19 for information on the urllib.-commands.
# Close your input file
input_file.close()
print("DONE")

View file

@ -0,0 +1,121 @@
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 12 15:50:22 2016
@author: Alexander Hillert, Goethe University Frankfurt
"""
# Import regular expressions and BeautifulSoup
import re
from bs4 import BeautifulSoup
# Please adjust the directory to your machine.
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# It is important to use a single forward slash / but not a single backslash \.
# For MAC users: your directory will usually start with "/Users/". For example:
#directory="/Users/FirstnameLastname/Textual Analysis/Programming/Files/"
# Open the document
input_file=open(directory+'Exercise_4_Application_Regular_Expressions.txt','r',encoding="utf-8")
input_text=input_file.read()
#######################
# Task 1: remove tables
#######################
# Approach
# We search for tables until we find no more html tags that indicate the
# beginning of a table.
# Search for the start html-tag <TABLE>
table_match=re.search(TO BE COMPLETED, input_text)
while : # YOU NEED A LOOP THAT SEARCHES FOR TABLES
# When we have identified a match, i.e. the start of a table, we save
# the position of the beginning of the table in the variable "start_table"
table_start_match=re.search(XXX, input_text)
start_table=table_start_match.start()
# Next, we search for the corresponding html tag that indicates the end of
# the table and save the end position to the variable "end_table"
# REPEAT THE COMMANDS ABOVE FOR THE END OF TABLE
table_end_match=
end_table=
# We can print the text between the start and end html tag to check whether
# the table has been identified correctly.
print("The text below is a table!\n"+input_text[start_table:end_table])
# the text between the beginning and end of the html tags is the part which
# we would like to delete.
# Consequently, we keep the text before the beginning of the table as well
# as the text after the ending of the table.
input_text=TO BE COMPLETED
# Next, we need to check whether there is another table in the rest of the
# text.
table_match=re.search(SAME COMMAND AS IN LINE 27, input_text)
# As long as "table_match" exists, i.e. we regex result in a match, the loop
# will continue.
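# A possible sketch of the table-removal loop (not the official solution). It
# assumes the table tags are written in upper case, i.e. <TABLE> and </TABLE>;
# add the flag re.IGNORECASE if the case varies in your documents.
'''
table_match=re.search('<TABLE>', input_text)
while table_match:
    table_start_match=re.search('<TABLE>', input_text)
    start_table=table_start_match.start()
    table_end_match=re.search('</TABLE>', input_text)
    end_table=table_end_match.end()
    print("The text below is a table!\n"+input_text[start_table:end_table])
    input_text=input_text[:start_table]+input_text[end_table:]
    table_match=re.search('<TABLE>', input_text)
'''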
#########################
# Task 2: remove Exhibits
#########################
# Exhibits have the following structure
# <DOCUMENT>
# <TYPE>EX...
# ...
# </DOCUMENT>
# THE APPROACH IS THE SAME AS THE SEARCH FOR TABLES ABOVE
exhibit_match=re.search(, input_text)
while :
# get the beginning of the exhibit
exhibit_start_match=
start_exhibit=
# As the exhibits are at the end of the 10-K filing it would not be
# necessary to include an end position. We could also drop the entire text
# after "<TYPE>EX"
# However, for completeness, we will define an end
exhibit_end_match=
end_exhibit=
# Print the identified text to check whether the exhibit has been identified
# correctly
print("The text below is a exhibit!\n"+input_text[start_exhibit:end_exhibit])
input_text=TO BE COMPLETED
# Check whether there are further exhibits
exhibit_match=re.search(SAME COMMAND AS IN LINE 65, input_text)
##########################
# Task 3: remove html code
##########################
# Alternative 1: remove html code without Beautiful Soup
text=re.sub(TO BE COMPLETED, '', input_text)
# Use a regex that searches for a "<" followed by at least one character that must not
# equal > and is completed by >.
# Alternative 2: remove html code using Beautiful Soup
html_text=BeautifulSoup(TO BE COMPLETED)
text=html_text.TO BE COMPLETED
########################
# Task 4: delete numbers
########################
# YOU MAY NEED MULTIPLE COMMANDS TO DELETE ALL NUMBERS
# Remember that you can have different formats, e.g., 1,234.56 or 0.12 or 1,234,567
text=re.sub(TO BE COMPLETED,'',text)
########################
# Task 5: delete symbols
########################
text=re.sub(TO BE COMPLETED,'',text)
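# A possible sketch for Tasks 3 to 5 (not the official solution). The regexes
# for numbers and symbols are assumptions about which characters to keep;
# adjust them to your needs.
'''
# Task 3, Alternative 1: strip html tags with a regex
text=re.sub('<[^>]{1,}>', '', input_text)
# Task 3, Alternative 2: strip html tags with Beautiful Soup
html_text=BeautifulSoup(input_text, 'html.parser')
text=html_text.get_text()
# Task 4: delete numbers in formats like 1,234.56, 0.12, or 1,234,567
text=re.sub('[0-9]{1,}(,[0-9]{3})*(\.[0-9]{1,})?', '', text)
# Task 5: delete symbols, i.e. everything that is neither a letter,
# whitespace, nor basic sentence punctuation
text=re.sub('[^a-zA-Z\s\.,;:!\?]', ' ', text)
'''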
# Open the output file for the pure text
output_file=open(directory+'Exercise_4_Application_Regular_Expressions_clean.txt','w',encoding="utf-8")
output_file.write(text)
# close all files
input_file.close()
output_file.close()
print("DONE")

View file

@ -0,0 +1,164 @@
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 12 15:50:22 2016
@author: Alexander Hillert, Goethe University Frankfurt
"""
import re
from bs4 import BeautifulSoup
# Please adjust the directory to your machine.
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# It is important to use a single forward slash / but not a single backslash \.
# For MAC users: your directory will usually start with "/Users/". For example:
#directory="/Users/FirstnameLastname/Textual Analysis/Programming/Files/"
# Open the 10-K
input_file=open(directory+'0000950130-98-001359.txt','r',encoding='ascii',errors='ignore')
input_text=input_file.read()
################################
# Remove tables
# Same approach as in Problem 4
################################
# Sometimes it is helpful to print the text parts that are deleted. In this
# example, we will print the first two tables that we delete.
i=1
table_match=re.search(ENTER THE REGEX, input_text)
while table_match:
# Search for the beginning of the table
table_start_match=re.search(REGEX FOR BEGINNING OF TABLE, input_text)
start_table=
# search for the end of the table
table_end_match=REGEX FOR END OF TABLE
end_table=
# The if condition and the printing are just for illustrative purposes.
# The commands display the first two tables that are removed from the text.
if i<=2:
print("This is the "+str(i)+". Table in the 10-K.\n"+input_text[start_table:end_table]+"\n")
i=i+1
# remove the table from the original text
input_text=TO BE COMPLETED
# check whether there are further tables
# same command as in line 24
table_match=re.search(XXXXXXX, input_text)
################################
# Remove exhibits
# Same approach as in Problem 4
################################
# Exhibits have the following structure
# <DOCUMENT>
# <TYPE>EX...
# ...
# </DOCUMENT>
# Sometimes it is helpful to print the text parts that are deleted. In this
# example, we will print the first exhibit that we delete.
i=1
exhibit_match=re.search(ENTER THE REGEX, input_text)
while exhibit_match:
# Search for the beginning of the exhibit
exhibit_start_match=re.search(REGEX FOR BEGINNING OF EXHIBIT, input_text)
start_exhibit=
# Search for the end of the exhibit
# CAUTION: search only in the text after the beginning of the exhibit, as
# the end-term also appears earlier (e.g. end of main document)
exhibit_end_match=re.search(REGEX FOR END OF EXHIBIT, input_text[START OF EXHIBIT UNTIL END OF TEXT])
end_exhibit=
if i<=1:
print("This is the "+str(i)+". Exhibit in the 10-K.\n"+input_text[start_exhibit:end_exhibit]+"\n")
i=i+1
# remove exhibit from the original text
input_text=
# check whether there are further exhibits
# same command as in line 55
exhibit_match=re.search(XXXXXXX, input_text)
##################
# Remove html code
##################
# you can use BeautifulSoup for simplicity
html_text=BeautifulSoup(input_text, 'html.parser')
text=html_text.get_text()
############################
# Remove the Document Header
############################
# There are different possibilities for how one can define the start of the main part of the text
# In general, you should delete all text that is uninformative for your analysis.
header_match=re.search(END OF DOCUMENT HEADER, text)
if header_match:
# Drop the document header and keep only the rest of the text after the header.
text=text[XXXXXXXXXXXXXXX]
#################################################
# Delete the text in "PART IV"
# This procedure is optional. Look at "Part IV" and decide whether you favor
# the approach. I think that the part should be dropped, as it is just a list
# of exhibits and some mandatory text required by the SEC [indicated by the
# capital letters in the "SIGNATURES" section].
#################################################
'''
# Alternative 1: go over all matches but keep only the last one
for match in re.finditer('\s{2,}PART IV\s{0,}\n', text):
pass
# match now contains the last match.
# Delete the text after the last match
text=text[:match.start()]
# Alternative 2: save the positions of all matches (more general approach)
list_start_matches=[]
list_end_matches=[]
for match in re.finditer('\s{2,}PART IV\s{0,}\n', text):
list_start_matches.append(match.start())
list_end_matches.append(match.end())
# Position of last match
print(list_start_matches[len(list_start_matches)-1])
print(list_end_matches[len(list_start_matches)-1])
# Delete the text after the last match
text=text[:list_start_matches[len(list_start_matches)-1]]
'''
# Delete item numbers
# This is optional. It removes "Item 1.", "ITEM 1.", "Item 10.", "Item 7A."
text=re.sub(TO BE COMPLETED,'',text)
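# For illustration, one possible pattern (an assumption; verify it on your text):
# text=re.sub('(?i)item\s{1,}[0-9]{1,2}[AB]?\.', '', text)
# (?i) makes the match case-insensitive, so it covers "Item 1." and "ITEM 1.";
# the optional [AB] covers item numbers such as "Item 7A.".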
# Delete numbers
# You can use the code from Problem 4.
text=re.sub(TO BE COMPLETED,'',text)
# Hyphens can be used to indicate that the word is continued in the next
# line. For example, "Micro-\nsoft" (\n is the line feed).
# Delete hyphens that are followed by a line feed.
text=re.sub(TO BE COMPLETED,'',text)
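# A minimal sketch: text=re.sub('-\n', '', text)
# This merges "Micro-\nsoft" back into "Microsoft".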
# Delete symbols
# You can use the code from Problem 4.
text=re.sub(TO BE COMPLETED,'',text)
# Delete dots and commas that are not part of sentences, i.e. commas and dots
# that are preceded by whitespace or line break and that are followed by
# whitespace or line break.
text=re.sub('\n(\.|,)\n','\n',text)
# Drop single-character words
# One can argue whether one should implement this procedure. Loughran and
# McDonald argue in one of their papers in favor of it.
# To make sure that there is just one letter, we require a non-word character
# (\W) before and after it. We use a positive lookbehind and a positive
# lookahead for these conditions to ensure that the surrounding characters
# do not get deleted as well.
text=re.sub(TO BE COMPLETED,' ',text)
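# A possible sketch (an assumption, not the official solution); the surrounding
# non-word characters are only conditions and are therefore not removed:
# text=re.sub('(?<=\W)[a-zA-Z](?=\W)', ' ', text)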
# Open the output file for the pure text
output_file=open(directory+'0000950130-98-001359_clean.txt','w',encoding='ascii',errors='ignore')
output_file.write(text)
input_file.close()
output_file.close()
print("COMPLETED.")

View file

@ -0,0 +1,117 @@
# -*- coding: utf-8 -*-
"""
Created on Wed Apr 13 22:43:32 2016
@author: Alexander Hillert, Goethe University Frankfurt
"""
import re
# Please adjust the directory to your machine.
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# Open the dictionary
# The dictionary has been obtained from Bill McDonald's webpage
# http://www3.nd.edu/~mcdonald/Word_Lists.html
# --> LoughranMcDonald_MasterDictionary_2014.xlsx
# --> select negative words and copy them to a txt file
file_word_list=open(directory+'LMD_Neg.txt','r',encoding="utf-8")
word_list=file_word_list.read()
# LOOK AT THE FILE. ARE THE WORDS IN UPPER OR IN LOWER CASE?
# MAKE SURE THAT YOU USE A CONSISTENT FORMAT FOR THE TEXT AND THE DICTIONARY.
# THE COMMANDS ARE .lower() AND .upper().
# CREATE A LIST OF NEGATIVE WORDS -> SPLIT THE TEXT
negative_words=word_list.XXXX
# Open the csv file containing the list of the 200 10-Ks
input_file=open(directory+'10-K_Sample_2011Q1_Input.csv','r',encoding="utf-8")
input_text=input_file.read()
# Split the input file into separate lines
input_text_line=input_text.split("\n")
# In general, there can be empty lines in the input file. The following command
# deletes these lines.
while input_text_line.count("")>0:
input_text_line.remove("")
# Create output file
output_file=open(directory+'10-K_Sample_2011Q1_Output_Negative_Tone.csv','w',encoding="utf-8")
# Write variable names to the first line of the output file
output_file.write('CIK;Filename;Number_Words;Number_Negative_Words;\
Percentage_Negative_Words\n')
# Loop over all lines of the csv file
for i in range(1,len(input_text_line)):
# If the execution of your scripts takes some time, printing the loop iterator
# gives you an impression of the overall progress made.
print(str(i))
# split the line into the two variables
variables=input_text_line[i].split(";")
# We need the CIK (1st column) and the filename (2nd column)
cik=variables[0]
filename=variables[1]
# modify file name to open the edited files
filename=filename.replace('.txt','')
# Open the ith 10-Ks in the list
input_file_10_k=open(directory+'10-K_Sample_clean/'+cik+'_'+filename+'_clean.txt','r',\
encoding='ascii',errors='ignore')
input_text_10_k=input_file_10_k.read()
# CONVERT THE TEXT TO UPPER OR LOWER CASE (see comment above)
# It is important that the formatting (lower case vs. upper case) of the word list
# and the document is identical. Remember that you typically have lower and upper case
# letters in documents -> modify text
text=input_text_10_k.XXXXXX
# Split the text into words to determine the total number of words
# LOOK AT THE REGEX INTRODUCTION FOR A SUITABLE SPLIT VARIABLE.
list_of_words=re.split(XXXXX, text)
# ARE THERE EMPTY ELEMENTS IN THE LIST OF WORDS?
# Make sure that empty list elements do not bias the word count -> delete them!
# You can use an approach similar to the one in lines 37 and 38.
COMMANDS TO BE ADDED
# Determine the total number of words
# COUNT THE NUMBER OF ELEMENTS IN list_of_words
word_count=XXXX
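# A possible sketch (assuming non-word characters, \W, as the word separators,
# as in the regex introduction):
# list_of_words=re.split('\W{1,}', text)
# while list_of_words.count("")>0:
#     list_of_words.remove("")
# word_count=len(list_of_words)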
# Reset the number of negative words to zero
negative_count=0
# For each negative word, count the number of occurrences
for j in range(len(negative_words)):
HERE YOU NEED TO COUNT HOW OFTEN THE jth NEGATIVE WORD IS FOUND IN THE TEXT.
COMPARE THE TWO CASES BELOW -> EXECUTE THE COMMANDS (see lines below) IN
THE COMMAND LINE AND COMPARE THE RESULTS.
WHICH ALTERNATIVE IS THE RIGHT APPROACH?
ALTERNATIVE 1:
list_of_words=["abandon","abandoned","abandonment"]
list_of_words.count("abandon")
ALTERNATIVE 2:
text_of_words="abandon abandoned abandonment"
text_of_words.count("abandon")
ADD THE CORRECT COUNT OF NEGATIVE WORD j TO YOUR OVERALL COUNT.
negative_count=negative_count+XXXXX
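# Note: Alternative 1 is the suitable approach. The substring-based count
# ("abandon abandoned abandonment".count("abandon") returns 3) also hits words
# that merely contain the negative word, whereas the list-based count
# (["abandon","abandoned","abandonment"].count("abandon") returns 1) counts
# exact word matches only. A minimal sketch:
# negative_words_found=list_of_words.count(negative_words[j])
# negative_count=negative_count+negative_words_found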
# Get the percentage of negative words
percentage_negative=negative_count/word_count
# Write cik, file name, total number of words, number of negative words,
# and the percentage of negative words to output file.
output_file.write(cik+';'+filename+'_clean.txt;'+str(word_count)+';'\
+str(negative_count)+';'+str(percentage_negative)+'\n')
# Close filings
input_file_10_k.close()
print("Finished")
output_file.close()
input_file.close()

View file

@ -0,0 +1,131 @@
# -*- coding: utf-8 -*-
"""
Created on Wed Apr 13 22:43:32 2016
@author: Alexander Hillert
"""
import re
# Please adjust the directory to your machine.
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# Open the dictionary
# The dictionary is obtained from Bill McDonald's webpage
# http://www3.nd.edu/~mcdonald/Word_Lists.html
# --> LoughranMcDonald_MasterDictionary_2014.xlsx
# --> select positive words and copy them to a txt file
file_word_list=open(directory+'LMD_Pos.txt','r',encoding="utf-8")
word_list=file_word_list.read()
# LIKE IN PROBLEM 7, YOU HAVE TO APPLY A CONSISTENT FORMAT TO BOTH THE LMD-WORDS
# AND THE TEXT OF THE 10-Ks.
positive_words=word_list.split()
# Open the csv file containing the list of the 200 10-Ks
input_file=open(directory+'10-K_Sample_2011Q1_Input.csv','r',encoding="utf-8")
input_text=input_file.read()
# Split the input file into separate lines
input_text_line=input_text.split("\n")
# In general, there can be empty lines in the input file. The following command
# deletes these lines.
while input_text_line.count("")>0:
input_text_line.remove("")
# Create output file
output_file=open(directory+'10-K_Sample_2011Q1_Output_Positive_Tone.csv','w',encoding="utf-8")
# Write variable names to the first line of the output file
output_file.write('CIK;Filename;Number_Words;Number_Pos_Words;Number_Pos_Words_adj;'\
+'Percent_Pos_Words;Percent_Pos_Words_adj\n')
# Iterate the list of the 200 10-K filings
for i in range(1,len(input_text_line)):
# If the execution of your scripts takes some time, printing the iterator
# gives you an impression of the overall progress made.
print(str(i))
# split the line into the two variables
variables=input_text_line[i].split(";")
# We need the CIK (1st column) and the filename (2nd column)
cik=variables[0]
filename=variables[1]
# modify file name to open the edited files
filename=filename.replace('.txt','')
# Open the ith 10-K in the list
input_file_10_k=open(directory+'/10-K_Sample_clean/'+cik+"_"+filename+'_clean.txt','r',\
encoding='ascii',errors='ignore')
input_text_10_k=input_file_10_k.read()
# It is important that the formatting (lower case vs. upper case) of the word list
# and the document is identical. Remember that you typically have lower and upper case
# letters in documents -> modify text
text=XXXX # CONSISTENT FORMAT
# Split the text into single words to determine the total number of words
list_of_words=re.split(XXXX, text) # USE THE SAME COMMAND AS IN PROBLEM 7
# ARE THERE EMPTY ELEMENTS IN THE LIST OF WORDS?
# Make sure that empty list elements do not bias the word count -> delete them!
# You can use an approach similar to the one in lines 34 and 35.
COMMANDS TO BE ADDED
# Determine total number of words
word_count=XXXX # SAME COMMAND AS IN PROBLEM 7
# Reset the number of positive words and positive words adj. for negations to zero.
positive_count=0
positive_count_adj=0
# For each positive word, count the number of occurrences
for j in range(len(positive_words)):
# standard count operation without controlling for negations
positive_words_found=list_of_words.count(positive_words[j])
# Loughran and McDonald (2011, JF, p.44): "We account for simple negation
# only for Fin-Pos words. Simple negation is taken to be observations
# of one of six words (no, not, none, neither, never, nobody) occurring
# within three words preceding a positive word."
# When we have identified positive words we need to search for negations
while positive_words_found>0:
# identify the position of the matched positive word in the list of all words
position_of_word=list_of_words.XXXXX # THE COMMAND .index() IS HELPFUL HERE
# identify the three words before the positive word and add them to a list
list_negation=[3_WORDS_BEFORE_MATCH,2_WORDS_BEFORE_MATCH,1_WORD_BEFORE_MATCH]
# REPLACE THE THREE PLACEHOLDERS BY THE CORRESPONDING ELEMENTS OF list_of_words
# check whether one of the three words in list_negation is a negation
negation_found=list_negation.count('no')+list_negation.count('not')+XXXX TO BE COMPLETED
if negation_found==0:
# no negation
positive_count_adj=positive_count_adj+1
positive_count=positive_count+1
else:
# negation
positive_count=positive_count+1
# delete the matched positive words in the original document
list_of_words[position_of_word]=XXX
# THIS OPERATION IS IMPORTANT BECAUSE OTHERWISE WE WILL GET AN ENDLESS LOOP
# check whether there are further matches of the jth positive word
positive_words_found=list_of_words.count(positive_words[j])
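# A possible sketch of the negation adjustment (not the official solution).
# It assumes the matched positive word is not among the first three words of
# the document; otherwise the negative indices would wrap around to the end
# of the list.
# position_of_word=list_of_words.index(positive_words[j])
# list_negation=[list_of_words[position_of_word-3],
#                list_of_words[position_of_word-2],
#                list_of_words[position_of_word-1]]
# negation_found=(list_negation.count('no')+list_negation.count('not')
#                 +list_negation.count('none')+list_negation.count('neither')
#                 +list_negation.count('never')+list_negation.count('nobody'))
# ...
# list_of_words[position_of_word]=""   # overwrite the match to avoid an endless loop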
# Write cik, file name, total number of words, and number of positive
# and adjusted positive words to the output file
output_file.write(cik+';'+filename+'_clean.txt;'+str(word_count)+';'+\
str(positive_count)+';'+str(positive_count_adj)+';'+str(positive_count/word_count)+\
';'+str(positive_count_adj/word_count)+'\n')
# Close filings
input_file_10_k.close()
print("Finished")
output_file.close()
input_file.close()

View file

@ -0,0 +1,70 @@
# -*- coding: utf-8 -*-
"""
Created on Wed Apr 13 22:43:32 2016
@author: Alexander Hillert, Goethe University Frankfurt
"""
# We split the text into words and sentences using regular expressions
import re
directory="C:/Lehre/Textual Analysis/Programming/Files/"
# Open the csv file containing the list of the 200 10-Ks
input_file=open(directory+'10-K_Sample_2011Q1_Input.csv','r',encoding="utf-8")
input_text=input_file.read()
# Create output file
output_file=open(directory+'10-K_Sample_2011Q1_Output_WPS.csv','w',encoding="utf-8")
# Write variable names to the first line of the output file
output_file.write('CIK;Filename;Number_Words;Number_of_Sentences;WPS\n')
# Split the input file into separate lines
input_text_line=input_text.split("\n")
# In general, there can be empty lines in the input file. The following command
# deletes these lines.
while input_text_line.count("")>0:
input_text_line.remove("")
# Loop over all lines
for i in range(1,len(input_text_line)):
print(str(i))
# split the line into the two variables
variables=input_text_line[i].split(";")
# We need the CIK and the filename
cik=variables[0]
filename=variables[1]
filename=filename.replace('.txt','')
# Open the ith 10-K in the list
input_file_10_k=open(directory+'10-K_Sample_clean/'+cik+"_"+filename+'_clean.txt','r',\
encoding='ascii',errors='ignore')
text=input_file_10_k.read()
# Determine number of sentences and number of words
# DETERMINE THE NUMBER OF WORDS; YOU KNOW THE COMMAND FROM PROBLEMS 7 AND 8.
list_of_words=re.split(XXX, text)
# Determine total number of words
word_count=XXX
# Split the text by symbols that indicate the end of a sentence
# to determine the total number of sentences
list_of_sentences=re.split(XXX, text)
# Determine total number of sentences
sentence_count=XXX
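# A possible sketch (the word separator and the set of sentence-ending symbols
# are assumptions):
# list_of_words=re.split('\W{1,}', text)
# while list_of_words.count("")>0:
#     list_of_words.remove("")
# word_count=len(list_of_words)
# list_of_sentences=re.split('[\.!\?]{1,}', text)
# sentence_count=len(list_of_sentences)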
# Ratio of # of words over # of sentences
wps=word_count/sentence_count
# Write cik, file name, total number of words, total number of sentences,
# and WPS to the output file
output_file.write(cik+';'+filename+'_clean.txt;'+str(word_count)+';'+\
str(sentence_count)+';'+str(wps)+'\n')
# Close filing
input_file_10_k.close()
print("Finished")
output_file.close()
input_file.close()