# -*- coding: utf-8 -*-
"""
Created on Tue Jul 11 17:43:45 2017

@author: Alexander Hillert, Goethe University Frankfurt
"""

# import modules
# if you need to download the nltk packages 'punkt' and 'stopwords' you can use
# the following three commands:
#import nltk
#nltk.download('punkt')
#nltk.download('stopwords')

from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re

################
# 1. Tokenize
################
# Create a test text to see how well nltk.tokenize performs
test_text="Microsoft Corp. announced they would acquire Yahoo! for $3.4 to prevent Google Inc. \
|
|
from taking over Software Ltd. headerquartered in St. Louis. XYZ S.A. is located in the \
|
|
U.S. and run by Dr. John P. Smith, who likes short-term risk-based calculations."
|
|
|
|
# Tokenize sentences
sentence_list=sent_tokenize(test_text)
print("This is the list of sentences:")
print(sentence_list)
# Looks good. Only the split after "Yahoo!" is incorrect. The tool correctly
# recognizes "Corp.", "Dr.", "Inc.", etc. -> good performance
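# The split happens because the "!" in "Yahoo!" is treated as sentence-final
# punctuation; the Punkt model recognizes common abbreviations but has no
# notion of "Yahoo!" as a company name.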

# Tokenize words
word_list=word_tokenize(test_text)
print("This is the list of words:")
print(word_list)
print(len(word_list))
# --> word_tokenize also includes symbols and numbers as words.

# How to delete the elements that are not real words?
word_list_1=[]
for word in word_list:
    if re.search('[A-Za-z]',word):
        word_list_1.append(word)
print("This is the edited list of words. There should be only 'real' words:")
print(word_list_1)
print(len(word_list_1))
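
# A stricter built-in alternative (a sketch, not part of the original
# script): str.isalpha() keeps only tokens consisting purely of letters,
# so unlike the regex above it also drops hyphenated tokens such as
# "short-term" and dotted abbreviations such as "U.S.".
word_list_1b=[word for word in word_list if word.isalpha()]
print(word_list_1b)
print(len(word_list_1b))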

# Alternative
test_text1=re.sub(r'[^A-Za-z\s\n]','',test_text)
word_list_2=word_tokenize(test_text1)
print("This is the edited list of words. There should be only 'real' words:")
print(word_list_2)
print(len(word_list_2))
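
# The two approaches are not fully equivalent (a sketch illustrating the
# difference): the regex substitution rewrites tokens rather than dropping
# them, e.g. "short-term" becomes "shortterm" and "U.S." becomes "US",
# while "$3.4" is deleted entirely.
print(re.sub(r'[^A-Za-z\s\n]','',"short-term U.S. $3.4"))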


################
# 2. Stop Words
################
example_sentence = "This is an example showing off stop word filtering."
stop_words=set(stopwords.words("english"))
print("This is the list of stop words from NLTK:")
print(stop_words)
# --> the stop words are all lower case
print(len(stop_words))
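
# stopwords.words() returns a list; it is wrapped in set() here because
# membership tests ("word in stop_words") are effectively constant time
# on a set, which matters when filtering long documents word by word.
print("the" in stop_words)    # -> True
print("stock" in stop_words)  # -> False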

# Split example sentence into words
word_list_example=word_tokenize(example_sentence.lower())
# Create list for filtered words
word_list_filtered=[]

# filter out stop words
for word in word_list_example:
    if word not in stop_words:
        word_list_filtered.append(word)

print("Example sentence after stop words have been deleted:")
print(word_list_filtered)
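
# The same filter written as a one-line list comprehension (a sketch,
# equivalent to the loop above):
word_list_filtered_alt=[word for word in word_list_example if word not in stop_words]
print(word_list_filtered_alt==word_list_filtered)  # -> True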

# What does the example from above look like?
test_text_filtered=[]

# filter out stop words
for word in word_tokenize(test_text.lower()):
    if word not in stop_words:
        test_text_filtered.append(word)

print("Test text after stop words have been deleted:")
print(test_text_filtered)
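
# Punctuation tokens like "," and "." survive the stop word filter because
# they are not in the stop word list. Combining the stop word filter with
# the real-word check from Section 1 (a sketch):
test_text_clean=[word for word in word_tokenize(test_text.lower())
                 if word not in stop_words and re.search('[A-Za-z]',word)]
print(test_text_clean)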


################
# 3. Stemming
################
# define an abbreviation
ps=PorterStemmer()

example_words_1=["play", "player", "players", "played", "playing"]

for word in example_words_1:
    print(ps.stem(word))
    # the full syntax without the abbreviation would be:
    print(PorterStemmer().stem(word))

# adjectives and adverbs
example_words_2=["high", "higher", "highest", "highly", "height"]
for word in example_words_2:
    print(ps.stem(word))
# --> comparative and superlative are not reduced to the stem/regular adjective;
# neither are adverbs

# Let's see how the stemmer deals with irregular words.
example_words_3=["good", "better", "best", "well", "God", "Goodness"]
for word in example_words_3:
    print(ps.stem(word))
# --> upper case words are also transformed to lower case.
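
# For irregular forms ("better", "best") the Porter stemmer returns the
# word unchanged. Lemmatization can handle some of these cases (a sketch,
# not part of the original script; requires nltk.download('wordnet')):
from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()
# pos="a" tells the lemmatizer to treat the word as an adjective
print(lemmatizer.lemmatize("better", pos="a"))   # -> good
print(lemmatizer.lemmatize("players", pos="n"))  # -> player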

# Stem the test text from above
# Approach 1: stem word by word
test_text_stemmed=[]
for word in word_tokenize(test_text):
    test_text_stemmed.append(ps.stem(word))

print("Stemming word by word: test text after it has been stemmed:")
print(test_text_stemmed)

# Alternative approach: stem the entire text in one call
test_text_stemmed=ps.stem(test_text)
print("Stemming entire document: test text after it has been stemmed:")
print(test_text_stemmed)
# -> does not work: stem() treats the entire string as one single word,
#    so the text is only lower-cased and no word-by-word stemming occurs.
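
# To stem a whole document, stem token by token and re-join the stems
# (a sketch; note that re-joining with spaces does not restore the
# original spacing around punctuation):
test_text_stemmed_joined=" ".join(ps.stem(word) for word in word_tokenize(test_text))
print(test_text_stemmed_joined)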

print("End of nltk introduction!")