# Provenance: instructor-provided code file (commit a37c87d9c8, repository of
# Alexander Hess); the accompanying data-file folder is excluded via .gitignore.
# -*- coding: utf-8 -*-
"""
Created on Sat Jul 15 21:56:41 2017

@author: Alexander Hillert, Goethe University Frankfurt
"""
# Standard-library imports.
import collections
import random
import re

# Third-party: NLTK ships the movie_reviews corpus used throughout this
# script (2,000 reviews: 1,000 positive and 1,000 negative).
import nltk

# Download the movie review corpus if it is not available locally yet.
nltk.download("movie_reviews")

from nltk.corpus import movie_reviews
# Build a list of (word_list, category) tuples, one tuple per review.
documents = []
for cat in movie_reviews.categories():
    print("Category: " + str(cat))
    # Walk over every review (identified by file ID) in this category.
    for fid in movie_reviews.fileids(cat):
        # Double parentheses: a single tuple is appended, not two arguments.
        documents.append((list(movie_reviews.words(fid)), cat))

# Inspect the collected data.
print(documents[0])        # first (words, category) tuple
print(documents[0][0])     # word list of the first review
print(documents[0][0][0])  # first word of the first review
print(documents[0][1])     # category label of the first review
print(documents[999][1])   # 1000th review: the last negative one
print(documents[1000][1])  # 1001st review: the first positive one

# The default corpus order is all negative reviews first, then all positive
# ones.  Shuffle so that the training and testing sets built later both
# contain reviews of both classes.
random.shuffle(documents)
# Collect all words (lower-cased) from the corpus, keeping only tokens that
# start with a letter, i.e. dropping punctuation-only and number-only tokens.
# FIX: the pattern must be a raw string -- "\A" inside a plain string literal
# is an invalid escape sequence (a SyntaxWarning in modern Python).
# r"\A[a-z]" anchors the match at the start of the token.
word_pattern = re.compile(r"\A[a-z]")
all_words = [
    word.lower()
    for word in movie_reviews.words()
    if word_pattern.search(word.lower())
]
# What are the most frequently used words in the movie reviews?
# Two equivalent approaches are shown.

# Alternative 1: nltk.FreqDist sorts words from the most frequently used
# word to the least frequently used word.
all_words_approach_1 = nltk.FreqDist(all_words)
print("Alternative 1: the top 15 words are: " + str(all_words_approach_1.most_common(15)))

# Alternative 2: collections.Counter (as in Problem 12) yields the
# identical ranking.
all_words_approach_2 = collections.Counter(all_words)
top_15_words = all_words_approach_2.most_common(15)
print("Alternative 2: the top 15 words are: " + str(top_15_words))
# -> identical results -> perfect.

# Look up how often an individual word appears.
# FIX: the original message was missing the word "times".
print("The word 'stupid' appears " + str(all_words_approach_1["stupid"]) + " times in the movie reviews.")
# alternatively
print("The word 'stupid' appears " + str(all_words_approach_2["stupid"]) + " times in the movie reviews.")
# Restrict the set of words used to train the Naive Bayes algorithm to the
# 3,000 most frequent words.

# Approach 1 using the nltk.FreqDist from above.  A list comprehension over
# the (word, count) pairs replaces the original index-based while loop.
top_3000_words = all_words_approach_1.most_common(3000)
list_top_3000_words_approach_1 = [word for word, count in top_3000_words]

# Approach 2 using the collections.Counter from above.
top_3000_words = all_words_approach_2.most_common(3000)
list_top_3000_words_approach_2 = [word for word, count in top_3000_words]

# Select the list of approach 1 (approach 2 is equivalent).
word_features = list_top_3000_words_approach_1
# We need to identify, for every review, which of the selected feature words
# it contains.  find_features() does exactly that.
def find_features(document, feature_words=None):
    """Return a dict mapping each feature word to True/False membership.

    Parameters
    ----------
    document : iterable of str
        The word tokens of one review.
    feature_words : iterable of str, optional
        The words to test for.  Defaults to the module-level word_features
        list (the 3,000 most frequent corpus words), preserving the
        original behavior.

    Returns
    -------
    dict
        {word: bool} -- True when the word occurs in document.
    """
    if feature_words is None:
        # Fall back to the module-level feature list (original behavior).
        feature_words = word_features
    # Membership tests against a set are O(1) instead of O(n) on a list.
    words = set(document)
    return {word: (word in words) for word in feature_words}
# Illustrate what find_features() produces by printing the features of a
# single review.
print((find_features(movie_reviews.words('neg/cv000_29416.txt'))))

# Turn every (word_list, category) pair into a (feature_dict, category) pair.
feature_set = [(find_features(words), label) for (words, label) in documents]
# What does the feature set look like?
print(feature_set[0])     # still a tuple
print(feature_set[0][0])  # dict: the 3,000 feature words -> True/False
print(feature_set[0][1])  # whether the review is positive or negative

# Define the training and testing sets: the first 1,900 reviews train the
# classifier, the last 100 evaluate it.
training_set = feature_set[:1900]
testing_set = feature_set[1900:]
# Train the Naive Bayes classifier: it determines which of the words from
# word_features appear mostly in positive reviews and which appear mostly
# in negative reviews.
classifier = nltk.NaiveBayesClassifier.train(training_set)

# The 20 words that best discriminate between positive and negative reviews.
classifier.show_most_informative_features(20)

# Classify the first element of feature_set; the classifier expects the
# word -> True/False feature dict as input.
print(classifier.classify(feature_set[0][0]))
print(f"The review is actually: {feature_set[0][1]}")
# Classify the 100 reviews of the testing set; they occupy positions
# 1900-1999 of feature_set.  A list comprehension replaces the original
# index-based while loop.
classified_set = [
    classifier.classify(features)
    for features, label in feature_set[1900:2000]
]
# Compare the classification result with the actual category.
# comparison holds [predicted category, actual category] pairs;
# comparison_2 holds the verdicts "accurate"/"inaccurate".
comparison = []
comparison_2 = []
# zip pairs each prediction with its matching test review; this replaces
# the original index-based while loop.
for predicted, (features, actual) in zip(classified_set, feature_set[1900:2000]):
    comparison.append([predicted, actual])
    # If the predicted and actual classification match -> accurate.
    comparison_2.append("accurate" if predicted == actual else "inaccurate")

print(comparison)

# Tally the number of accurate and inaccurate classifications.
comparison_counter = collections.Counter(comparison_2)
print(comparison_counter)
# NLTK can compute the accuracy directly.
# Accuracy on the testing set -- same value as our own tally above.
print("Naive Bayes accuracy (in percent):", (nltk.classify.accuracy(classifier, testing_set)) * 100)

# Accuracy on the training set -- higher than on the testing set, which is
# expected.
print("Naive Bayes accuracy in training data (in percent):", (nltk.classify.accuracy(classifier, training_set)) * 100)

print("completed!")