Add programming files

- add the code files provided by the instructor
- the programming/files folder with the data files is NOT included here due to its size
- add a .gitignore file to exclude the data files' folder

This commit is contained in:
parent 65aae9d4f9
commit a37c87d9c8

38 changed files with 6416 additions and 0 deletions
lectures/programming/introductions/NLTK_introduction.py (137 lines, normal file)
@@ -0,0 +1,137 @@
# -*- coding: utf-8 -*-
"""
Created on Tue Jul 11 17:43:45 2017

@author: Alexander Hillert, Goethe University Frankfurt
"""

# import modules
# if you need to download the nltk packages 'punkt' and 'stopwords', you can
# use the following three commands:
#import nltk
#nltk.download('punkt')
#nltk.download('stopwords')
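# Optional (a sketch): the downloads can also be guarded so they only run
# when a resource is actually missing:
#import nltk
#try:
#    nltk.data.find('tokenizers/punkt')
#except LookupError:
#    nltk.download('punkt')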

from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re

################
# 1. Tokenize
################
# Create a test text to see how well nltk.tokenize performs
test_text="Microsoft Corp. announced they would acquire Yahoo! for $3.4 to prevent Google Inc. \
from taking over Software Ltd. headquartered in St. Louis. XYZ S.A. is located in the \
U.S. and run by Dr. John P. Smith, who likes short-term risk-based calculations."

# Tokenize sentences
sentence_list=sent_tokenize(test_text)
print("This is the list of sentences:")
print(sentence_list)
# Looks good. Only the split after "Yahoo!" is incorrect. The tool correctly
# recognizes "Dr.", "Inc.", "St.", etc. -> good performance

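# If a corpus contains unusual abbreviations, the underlying Punkt tokenizer
# accepts a custom abbreviation list. A minimal sketch of the mechanism (the
# abbreviations here are illustrative; a tokenizer built only from these
# parameters no longer knows the standard abbreviations):
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters
punkt_params=PunktParameters()
punkt_params.abbrev_types={'s.a', 'ltd'}
print(PunktSentenceTokenizer(punkt_params).tokenize(test_text))
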
# Tokenize words
word_list=word_tokenize(test_text)
print("This is the list of words:")
print(word_list)
print(len(word_list))
# --> word_tokenize also includes symbols and numbers as words.

# How to delete the elements that are not real words?
word_list_1=[]
for word in word_list:
    if re.search('[A-Za-z]',word):
        word_list_1.append(word)
print("This is the edited list of words. There should be only 'real' words:")
print(word_list_1)
print(len(word_list_1))
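# The same filter written as a list comprehension (an equivalent sketch):
word_list_1b=[word for word in word_list if re.search('[A-Za-z]',word)]
print(len(word_list_1b))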

# Alternative
test_text1=re.sub(r'[^A-Za-z\s\n]','',test_text)
word_list_2=word_tokenize(test_text1)
print("This is the edited list of words. There should be only 'real' words:")
print(word_list_2)
print(len(word_list_2))
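# Note that the two approaches treat hyphens differently: re.sub() deletes the
# hyphen ("short-term" -> "shortterm"), while the filter above keeps the token
# "short-term" intact. NLTK's RegexpTokenizer combines tokenizing and
# filtering in one step; a sketch (the pattern is an illustrative choice):
from nltk.tokenize import RegexpTokenizer
print(RegexpTokenizer(r"[A-Za-z]+(?:-[A-Za-z]+)*").tokenize(test_text))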


################
# 2. Stop Words
################
example_sentence = "This is an example showing off stop word filtering."
stop_words=set(stopwords.words("english"))
print("This is the list of stop words from NLTK:")
print(stop_words)
# --> the stop words are all lower case
print(len(stop_words))
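# If needed, the set can be extended with custom, domain-specific stop words
# (the words here are purely illustrative):
#stop_words.update({"would", "also"})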

# Split example sentence into words
word_list_example=word_tokenize(example_sentence.lower())
# Create list for filtered words
word_list_filtered=[]

# filter out stop words
for word in word_list_example:
    if word not in stop_words:
        word_list_filtered.append(word)

print("Example sentence after stop words have been deleted:")
print(word_list_filtered)

# What does the test text from above look like after stop word removal?
test_text_filtered=[]

# filter out stop words
for word in word_tokenize(test_text.lower()):
    if word not in stop_words:
        test_text_filtered.append(word)

print("Test text after stop words have been deleted:")
print(test_text_filtered)
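# Both filters from above can also be combined in a single list comprehension
# (a compact sketch of the same logic):
test_text_filtered_2=[word for word in word_tokenize(test_text.lower())
    if word not in stop_words and re.search('[A-Za-z]',word)]
print(test_text_filtered_2)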


################
# 3. Stemming
################
# define a short name (abbreviation) for the stemmer
ps=PorterStemmer()

example_words_1=["play", "player", "players", "played", "playing"]

for word in example_words_1:
    print(ps.stem(word))
    # the full syntax without the abbreviation would be:
    print(PorterStemmer().stem(word))

# adjectives and adverbs
example_words_2=["high", "higher", "highest", "highly", "height"]
for word in example_words_2:
    print(ps.stem(word))
# --> the comparative and superlative are not reduced to the stem/regular
# adjective; neither are adverbs

# Let's see how the stemmer deals with irregular words.
example_words_3=["good", "better", "best", "well", "God", "Goodness"]
for word in example_words_3:
    print(ps.stem(word))
# --> irregular forms like "better" are not reduced to their base word;
# upper case words are also transformed to lower case.
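# Lemmatization handles some irregular forms that stemming misses; a minimal
# sketch (requires the 'wordnet' resource, e.g. via nltk.download('wordnet')):
#from nltk.stem import WordNetLemmatizer
#print(WordNetLemmatizer().lemmatize("better", pos="a"))  # -> good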

# Stem the test text from above
# Approach 1: stem word by word
test_text_stemmed=[]
for word in word_tokenize(test_text):
    test_text_stemmed.append(ps.stem(word))

print("Stemming word by word: test text after it has been stemmed:")
print(test_text_stemmed)

# Alternative approach: stem the entire text in one call
test_text_stemmed=ps.stem(test_text)
print("Stemming entire document: test text after it has been stemmed:")
print(test_text_stemmed)
# -> does not work: stem() treats its input as one single word, so the text
# is merely converted to lower case.
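# A working way to stem a whole document (a sketch): stem token by token and
# rejoin with spaces (the original spacing and punctuation layout is lost):
print(" ".join(ps.stem(word) for word in word_tokenize(test_text)))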
print("End of nltk introduction!")