# -*- coding: utf-8 -*-
"""
Created on Tue Jul 11 17:43:45 2017
@author: Alexander Hillert, Goethe University Frankfurt
"""
# import modules
# if you need to download the nltk packages 'punkt' and 'stopwords' you can use
# the following three commands:
#import nltk
#nltk.download('punkt')
#nltk.download('stopwords')
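# As a sketch, the downloads can also be guarded so they only run when the
# resources are missing (the resource paths follow NLTK's data layout):
#import nltk
#for resource, path in [('punkt', 'tokenizers/punkt'), ('stopwords', 'corpora/stopwords')]:
#    try:
#        nltk.data.find(path)
#    except LookupError:
#        nltk.download(resource)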
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
################
# 1. Tokenize
################
# Create a test text to see how well nltk.tokenize performs
test_text="Microsoft Corp. announced they would acquire Yahoo! for $3.4 to prevent Google Inc. \
from taking over Software Ltd. headerquartered in St. Louis. XYZ S.A. is located in the \
U.S. and run by Dr. John P. Smith, who likes short-term risk-based calculations."
# Tokenize sentences
sentence_list=sent_tokenize(test_text)
print("This is the list of sentences:")
print(sentence_list)
# looks good. Only the split after "Yahoo" is incorrect. The tool correctly
# recognizes "Corp.", "Dr.", "Inc.", etc. -> good performance
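# As a sketch, the underlying Punkt tokenizer can also be given additional
# abbreviations if the pre-trained model misses some (the abbreviation set
# below is an illustrative assumption; abbreviations are listed lower case
# and without the period). Note that this cannot fix the "Yahoo!" split,
# which is caused by the exclamation mark:
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters
punkt_params=PunktParameters()
punkt_params.abbrev_types={'corp', 'inc', 'ltd', 'dr', 'st'}
custom_tokenizer=PunktSentenceTokenizer(punkt_params)
print(custom_tokenizer.tokenize(test_text))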
# Tokenize words
word_list=word_tokenize(test_text)
print("This is the list of words:")
print(word_list)
print(len(word_list))
# --> word_tokenize also includes symbols and numbers as words.
# How can we remove the elements that are not real words?
word_list_1=[]
for word in word_list:
    if re.search('[A-Za-z]',word):
        word_list_1.append(word)
print("This is the edited list of words. There should be only 'real' words:")
print(word_list_1)
print(len(word_list_1))
# Alternative
test_text1=re.sub(r'[^A-Za-z\s\n]','',test_text)
word_list_2=word_tokenize(test_text1)
print("This is the edited list of words. There should be only 'real' words:")
print(word_list_2)
print(len(word_list_2))
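# The filtering loop above can also be written as a list comprehension, which
# is the more idiomatic form (a sketch; the variable name is illustrative):
word_list_3=[word for word in word_list if re.search('[A-Za-z]',word)]
print(word_list_3)
print(len(word_list_3))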
################
# 2. Stop Words
################
example_sentence = "This is an example showing off stop word filtering."
stop_words=set(stopwords.words("english"))
print("This is the list of stop words from NLTK:")
print(stop_words)
# --> the stop words are all lower case
print(len(stop_words))
# Split example sentence into words
word_list_example=word_tokenize(example_sentence.lower())
# Create list for filtered words
word_list_filtered=[]
# filter out stop words
for word in word_list_example:
    if word not in stop_words:
        word_list_filtered.append(word)
print("Example sentence after stop words have been deleted:")
print(word_list_filtered)
# What does the test text from above look like?
test_text_filtered=[]
# filter out stop words
for word in word_tokenize(test_text.lower()):
    if word not in stop_words:
        test_text_filtered.append(word)
print("Test text after stop words have been deleted:")
print(test_text_filtered)
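# As a sketch, the stop word filter and the letter filter from Section 1 can
# be combined in one list comprehension (variable name is illustrative):
test_text_clean=[word for word in word_tokenize(test_text.lower())
                 if word not in stop_words and re.search('[A-Za-z]',word)]
print(test_text_clean)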
################
# 3. Stemming
################
# define an abbreviation
ps=PorterStemmer()
example_words_1=["play", "player", "players", "played", "playing"]
for word in example_words_1:
    print(ps.stem(word))
# the full syntax without the abbreviation would be (applied here to the
# last value of word from the loop above):
print(PorterStemmer().stem(word))
# adjectives and adverbs
example_words_2=["high", "higher", "highest", "highly", "height"]
for word in example_words_2:
    print(ps.stem(word))
# --> comparative and superlative are not reduced to the stem/regular adjective
# neither are adverbs
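# For comparison, NLTK also provides the newer SnowballStemmer ("Porter2"),
# which treats some of these forms differently; a brief sketch:
from nltk.stem.snowball import SnowballStemmer
sb=SnowballStemmer("english")
for word in example_words_2:
    print(sb.stem(word))
# e.g., "highly" should be reduced to "high" here, unlike with the original
# Porter stemmer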
# Let's see how the stemmer deals with irregular words.
example_words_3=["good", "better", "best", "well", "God", "Goodness"]
for word in example_words_3:
    print(ps.stem(word))
# --> upper case words are also transformed to lower case.
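# Stemmers cannot map irregular forms like "better" back to "good"; a
# lemmatizer can. A minimal sketch using NLTK's WordNetLemmatizer (assumes
# nltk.download('wordnet') has been run; pos="a" marks the word as an adjective):
#from nltk.stem import WordNetLemmatizer
#print(WordNetLemmatizer().lemmatize("better", pos="a"))  # -> good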
# Stem the test text from above
# Approach 1: stem word by word
test_text_stemmed=[]
for word in word_tokenize(test_text):
    test_text_stemmed.append(ps.stem(word))
print("Stemming word by word: test text after it has been stemmed:")
print(test_text_stemmed)
# Alternative approach: stem entire text
test_text_stemmed=ps.stem(test_text)
print("Stemming entire document: test text after it has been stemmed:")
print(test_text_stemmed)
# -> does not work: stem() treats the entire text as one single word
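# A working alternative is to tokenize first, stem token by token, and rejoin;
# a sketch (original spacing around punctuation is not restored):
test_text_restemmed=' '.join(ps.stem(word) for word in word_tokenize(test_text))
print("Stemming via tokenize-stem-join:")
print(test_text_restemmed)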
print("End of nltk introduction!")