# -*- coding: utf-8 -*-
"""
Created on Tue Jul 11 17:43:45 2017

@author: Alexander Hillert, Goethe University Frankfurt
"""

# import modules
# if you need to download the nltk packages 'punkt' and 'stopwords' you can use
# the following three commands:
#import nltk
#nltk.download('punkt')
#nltk.download('stopwords')

from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re

################
# 1. Tokenize
################
# Create a test text to see how well nltk.tokenize performs
test_text="Microsoft Corp. announced they would acquire Yahoo! for $3.4 to prevent Google Inc. \
|
|
from taking over Software Ltd. headerquartered in St. Louis. XYZ S.A. is located in the \
|
|
U.S. and run by Dr. John P. Smith, who likes short-term risk-based calculations."
|
|
|
|
# Tokenize sentences
sentence_list=sent_tokenize(test_text)
print("This is the list of sentences:")
print(sentence_list)
# Looks good. Only the split after "Yahoo!" is incorrect. The tool correctly
# recognizes "Corp.", "Dr.", "Inc.", etc. -> good performance
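# The split happens because the "!" in "Yahoo!" is treated as sentence-final
# punctuation; the Punkt model recognizes common abbreviations but has no
# notion of "Yahoo!" as a company name.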

# Tokenize words
word_list=word_tokenize(test_text)
print("This is the list of words:")
print(word_list)
print(len(word_list))
# --> word_tokenize also includes symbols and numbers as words.

# How to delete the elements that are not real words?
word_list_1=[]
for word in word_list:
    if re.search('[A-Za-z]',word):
        word_list_1.append(word)
print("This is the edited list of words. There should be only 'real' words:")
print(word_list_1)
print(len(word_list_1))
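
# A stricter built-in alternative (a sketch, not part of the original
# script): str.isalpha() keeps only tokens consisting purely of letters,
# so unlike the regex above it also drops hyphenated tokens such as
# "short-term" and dotted abbreviations such as "U.S.".
word_list_1b=[word for word in word_list if word.isalpha()]
print(word_list_1b)
print(len(word_list_1b))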

# Alternative
test_text1=re.sub(r'[^A-Za-z\s\n]','',test_text)
word_list_2=word_tokenize(test_text1)
print("This is the edited list of words. There should be only 'real' words:")
print(word_list_2)
print(len(word_list_2))
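
# The two approaches are not fully equivalent (a sketch illustrating the
# difference): the regex substitution rewrites tokens rather than dropping
# them, e.g. "short-term" becomes "shortterm" and "U.S." becomes "US",
# while "$3.4" is deleted entirely.
print(re.sub(r'[^A-Za-z\s\n]','',"short-term U.S. $3.4"))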


################
# 2. Stop Words
################
example_sentence = "This is an example showing off stop word filtering."
stop_words=set(stopwords.words("english"))
print("This is the list of stop words from NLTK:")
print(stop_words)
# --> the stop words are all lower case
print(len(stop_words))
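
# stopwords.words() returns a list; it is wrapped in set() here because
# membership tests ("word in stop_words") are effectively constant time
# on a set, which matters when filtering long documents word by word.
print("the" in stop_words)    # -> True
print("stock" in stop_words)  # -> False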

# Split example sentence into words
word_list_example=word_tokenize(example_sentence.lower())
# Create list for filtered words
word_list_filtered=[]

# filter out stop words
for word in word_list_example:
    if word not in stop_words:
        word_list_filtered.append(word)

print("Example sentence after stop words have been deleted:")
print(word_list_filtered)
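
# The same filter written as a one-line list comprehension (a sketch,
# equivalent to the loop above):
word_list_filtered_alt=[word for word in word_list_example if word not in stop_words]
print(word_list_filtered_alt==word_list_filtered)  # -> True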

# What does the example from above look like?
test_text_filtered=[]

# filter out stop words
for word in word_tokenize(test_text.lower()):
    if word not in stop_words:
        test_text_filtered.append(word)

print("Test text after stop words have been deleted:")
print(test_text_filtered)
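
# Punctuation tokens like "," and "." survive the stop word filter because
# they are not in the stop word list. Combining the stop word filter with
# the real-word check from Section 1 (a sketch):
test_text_clean=[word for word in word_tokenize(test_text.lower())
                 if word not in stop_words and re.search('[A-Za-z]',word)]
print(test_text_clean)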


################
# 3. Stemming
################
# define an abbreviation
ps=PorterStemmer()

example_words_1=["play", "player", "players", "played", "playing"]

for word in example_words_1:
    print(ps.stem(word))
    # the full syntax without the abbreviation would be:
    print(PorterStemmer().stem(word))

# adjectives and adverbs
example_words_2=["high", "higher", "highest", "highly", "height"]
for word in example_words_2:
    print(ps.stem(word))
# --> comparative and superlative are not reduced to the stem/regular adjective;
# neither are adverbs

# Let's see how the stemmer deals with irregular words.
example_words_3=["good", "better", "best", "well", "God", "Goodness"]
for word in example_words_3:
    print(ps.stem(word))
# --> upper case words are also transformed to lower case.
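
# For irregular forms ("better", "best") the Porter stemmer returns the
# word unchanged. Lemmatization can handle some of these cases (a sketch,
# not part of the original script; requires nltk.download('wordnet')):
from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()
# pos="a" tells the lemmatizer to treat the word as an adjective
print(lemmatizer.lemmatize("better", pos="a"))   # -> good
print(lemmatizer.lemmatize("players", pos="n"))  # -> player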

# Stem the test text from above
# Approach 1: stem word by word
test_text_stemmed=[]
for word in word_tokenize(test_text):
    test_text_stemmed.append(ps.stem(word))

print("Stemming word by word: test text after it has been stemmed:")
print(test_text_stemmed)

# Alternative approach: stem the entire text in one call
test_text_stemmed=ps.stem(test_text)
print("Stemming entire document: test text after it has been stemmed:")
print(test_text_stemmed)
# -> does not work: stem() treats the entire string as one single word,
#    so the text is only lower-cased and no word-by-word stemming occurs.
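
# To stem a whole document, stem token by token and re-join the stems
# (a sketch; note that re-joining with spaces does not restore the
# original spacing around punctuation):
test_text_stemmed_joined=" ".join(ps.stem(word) for word in word_tokenize(test_text))
print(test_text_stemmed_joined)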

print("End of nltk introduction!")