Add programming files

- add the code files provided by the instructor
- the programming/files folder with the data files is NOT included here due to its size
- add a .gitignore file to exclude the data files' folder

This commit is contained in:
parent 65aae9d4f9
commit a37c87d9c8

38 changed files with 6416 additions and 0 deletions
lectures/programming/introductions/NLTK_introduction.py (137 lines, normal file)
@@ -0,0 +1,137 @@
# -*- coding: utf-8 -*-
"""
Created on Tue Jul 11 17:43:45 2017

@author: Alexander Hillert, Goethe University Frankfurt
"""

# import modules
# if you need to download the nltk packages 'punkt' and 'stopwords', you can
# use the following three commands:
#import nltk
#nltk.download('punkt')
#nltk.download('stopwords')
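# Optional (a sketch): the downloads can also be guarded so they only run
# when a resource is actually missing:
#import nltk
#try:
#    nltk.data.find('tokenizers/punkt')
#except LookupError:
#    nltk.download('punkt')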

from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re

################
# 1. Tokenize
################
# Create a test text to see how well nltk.tokenize performs
test_text="Microsoft Corp. announced they would acquire Yahoo! for $3.4 to prevent Google Inc. \
from taking over Software Ltd. headquartered in St. Louis. XYZ S.A. is located in the \
U.S. and run by Dr. John P. Smith, who likes short-term risk-based calculations."

# Tokenize sentences
sentence_list=sent_tokenize(test_text)
print("This is the list of sentences:")
print(sentence_list)
# Looks good. Only the split after "Yahoo!" is incorrect. The tool correctly
# recognizes "Dr.", "Inc.", "St.", etc. -> good performance

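# If a corpus contains unusual abbreviations, the underlying Punkt tokenizer
# accepts a custom abbreviation list. A minimal sketch of the mechanism (the
# abbreviations here are illustrative; a tokenizer built only from these
# parameters no longer knows the standard abbreviations):
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters
punkt_params=PunktParameters()
punkt_params.abbrev_types={'s.a', 'ltd'}
print(PunktSentenceTokenizer(punkt_params).tokenize(test_text))
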
# Tokenize words
word_list=word_tokenize(test_text)
print("This is the list of words:")
print(word_list)
print(len(word_list))
# --> word_tokenize also includes symbols and numbers as words.

# How to delete the elements that are not real words?
word_list_1=[]
for word in word_list:
    if re.search('[A-Za-z]',word):
        word_list_1.append(word)
print("This is the edited list of words. There should be only 'real' words:")
print(word_list_1)
print(len(word_list_1))
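# The same filter written as a list comprehension (an equivalent sketch):
word_list_1b=[word for word in word_list if re.search('[A-Za-z]',word)]
print(len(word_list_1b))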

# Alternative
test_text1=re.sub(r'[^A-Za-z\s\n]','',test_text)
word_list_2=word_tokenize(test_text1)
print("This is the edited list of words. There should be only 'real' words:")
print(word_list_2)
print(len(word_list_2))
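# Note that the two approaches treat hyphens differently: re.sub() deletes the
# hyphen ("short-term" -> "shortterm"), while the filter above keeps the token
# "short-term" intact. NLTK's RegexpTokenizer combines tokenizing and
# filtering in one step; a sketch (the pattern is an illustrative choice):
from nltk.tokenize import RegexpTokenizer
print(RegexpTokenizer(r"[A-Za-z]+(?:-[A-Za-z]+)*").tokenize(test_text))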


################
# 2. Stop Words
################
example_sentence = "This is an example showing off stop word filtering."
stop_words=set(stopwords.words("english"))
print("This is the list of stop words from NLTK:")
print(stop_words)
# --> the stop words are all lower case
print(len(stop_words))
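# If needed, the set can be extended with custom, domain-specific stop words
# (the words here are purely illustrative):
#stop_words.update({"would", "also"})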

# Split example sentence into words
word_list_example=word_tokenize(example_sentence.lower())
# Create list for filtered words
word_list_filtered=[]

# filter out stop words
for word in word_list_example:
    if word not in stop_words:
        word_list_filtered.append(word)

print("Example sentence after stop words have been deleted:")
print(word_list_filtered)

# What does the test text from above look like after stop word removal?
test_text_filtered=[]

# filter out stop words
for word in word_tokenize(test_text.lower()):
    if word not in stop_words:
        test_text_filtered.append(word)

print("Test text after stop words have been deleted:")
print(test_text_filtered)
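# Both filters from above can also be combined in a single list comprehension
# (a compact sketch of the same logic):
test_text_filtered_2=[word for word in word_tokenize(test_text.lower())
    if word not in stop_words and re.search('[A-Za-z]',word)]
print(test_text_filtered_2)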


################
# 3. Stemming
################
# define a short name (abbreviation) for the stemmer
ps=PorterStemmer()

example_words_1=["play", "player", "players", "played", "playing"]

for word in example_words_1:
    print(ps.stem(word))
    # the full syntax without the abbreviation would be:
    print(PorterStemmer().stem(word))

# adjectives and adverbs
example_words_2=["high", "higher", "highest", "highly", "height"]
for word in example_words_2:
    print(ps.stem(word))
# --> the comparative and superlative are not reduced to the stem/regular
# adjective; neither are adverbs

# Let's see how the stemmer deals with irregular words.
example_words_3=["good", "better", "best", "well", "God", "Goodness"]
for word in example_words_3:
    print(ps.stem(word))
# --> irregular forms like "better" are not reduced to their base word;
# upper case words are also transformed to lower case.
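# Lemmatization handles some irregular forms that stemming misses; a minimal
# sketch (requires the 'wordnet' resource, e.g. via nltk.download('wordnet')):
#from nltk.stem import WordNetLemmatizer
#print(WordNetLemmatizer().lemmatize("better", pos="a"))  # -> good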

# Stem the test text from above
# Approach 1: stem word by word
test_text_stemmed=[]
for word in word_tokenize(test_text):
    test_text_stemmed.append(ps.stem(word))

print("Stemming word by word: test text after it has been stemmed:")
print(test_text_stemmed)

# Alternative approach: stem the entire text in one call
test_text_stemmed=ps.stem(test_text)
print("Stemming entire document: test text after it has been stemmed:")
print(test_text_stemmed)
# -> does not work: stem() treats its input as one single word, so the text
# is merely converted to lower case.
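# A working way to stem a whole document (a sketch): stem token by token and
# rejoin with spaces (the original spacing and punctuation layout is lost):
print(" ".join(ps.stem(word) for word in word_tokenize(test_text)))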
print("End of nltk introduction!")