Add programming files
- Add the code files provided by the instructor. The programming/files folder with the data files is NOT included here due to its size; a .gitignore file is added to exclude the data files folder.
This commit is contained in:
parent
65aae9d4f9
commit
a37c87d9c8
38 changed files with 6416 additions and 0 deletions
189
lectures/programming/templates/NLTK_Sentiment_Analysis.py
Normal file
189
lectures/programming/templates/NLTK_Sentiment_Analysis.py
Normal file
|
|
@ -0,0 +1,189 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Sat Jul 15 21:56:41 2017
|
||||
|
||||
@author: Alexander Hillert, Goethe University Frankfurt
|
||||
"""
|
||||
|
||||
import nltk
|
||||
import random
|
||||
import collections
|
||||
import re
|
||||
|
||||
# The NLTK movie-review corpus holds 2,000 reviews: 1,000 positive
# and 1,000 negative. Download it in case it is not present yet.
nltk.download("movie_reviews")

from nltk.corpus import movie_reviews

# Collect one (word_list, label) tuple per review, where label is the
# review's sentiment category.
documents = []
for label in movie_reviews.categories():
    print("Category: "+str(label))
    # All reviews of this category, identified by their file ID.
    for review_id in movie_reviews.fileids(label):
        # Double parentheses: we append a tuple, not two arguments.
        documents.append((list(movie_reviews.words(review_id)),label))

# Inspect the structure: the first tuple of documents ...
print(documents[0])
# ... the word list of the first review ...
print(documents[0][0])
# ... and the very first word of that review.
print(documents[0][0][0])

# The label of the first review.
print(documents[0][1])

# Label of the 1000th review (the last negative one) ...
print(documents[999][1])
# ... and of the 1001st review (the first positive one).
print(documents[1000][1])

# The reviews come ordered: all negative ones first, then all positive
# ones. We will later split into training and testing sets, and both
# must contain each class, so shuffle the documents randomly.
random.shuffle(documents)
|
||||
|
||||
# Build the list of all lower-cased words of the corpus, keeping only
# tokens that begin with a letter (drops punctuation-only and
# number-only tokens).
# FIX: the pattern must be a raw string — "\A" inside a plain string
# literal is an invalid escape sequence (SyntaxWarning on modern
# Python, and slated to become an error). r"\A" anchors the match at
# the start of the token.
_starts_with_letter = re.compile(r"\A[a-z]")
all_words = [word.lower() for word in movie_reviews.words()
             if _starts_with_letter.search(word.lower())]
|
||||
|
||||
|
||||
# What are the most frequently used words in the movie reviews?

# Alternative 1: nltk.FreqDist sorts words from the most frequently
# used word to the least frequently used word.
all_words_approach_1 = nltk.FreqDist(all_words)
print("Alternative 1: the top 15 words are: "+str(all_words_approach_1.most_common(15)))

# Alternative 2: a collections.Counter (as in Problem 12) gives the
# same ranking.
all_words_approach_2=collections.Counter(all_words)
top_15_words=all_words_approach_2.most_common(15)
print("Alternative 2: the top 15 words are: "+str(top_15_words))
# -> identical results -> perfect.

# Look up how often a specific word appears.
# FIX: the original message read "appears X in the movie reviews" —
# the word "times" was missing from the printed sentence.
print("The word 'stupid' appears "+str(all_words_approach_1["stupid"])+" times in the movie reviews.")
# alternatively
print("The word 'stupid' appears "+str(all_words_approach_2["stupid"])+" times in the movie reviews.")
|
||||
|
||||
# Restrict the words used for training the Naive Bayes algorithm:
# keep only the top 3,000 words.
# most_common(3000) returns (word, count) tuples; we keep the words.

# Approach 1: using the nltk.FreqDist from above.
list_top_3000_words_approach_1 = [word for word, count in
                                  all_words_approach_1.most_common(3000)]

# Approach 2: using the collections.Counter from above.
list_top_3000_words_approach_2 = [word for word, count in
                                  all_words_approach_2.most_common(3000)]

# Select the list of approach 1 or 2.
word_features=list_top_3000_words_approach_1
|
||||
|
||||
# We need to identify the classification words in each document.
def find_features(document, feature_words=None):
    """Map each feature word to True/False depending on whether it
    occurs in the document.

    Parameters:
        document: iterable of words (one review).
        feature_words: the candidate words to check for; defaults to
            the module-level word_features list (the top-3000 words).

    Returns:
        dict mapping each feature word to a bool.
    """
    if feature_words is None:
        feature_words = word_features
    # A set gives O(1) membership tests inside the loop below.
    words = set(document)
    # (word in words) is True or False for every candidate word.
    return {word: (word in words) for word in feature_words}
|
||||
|
||||
# Illustrate what find_features() produces, using a single review.
print((find_features(movie_reviews.words('neg/cv000_29416.txt'))))


# Turn every (word_list, label) tuple into a (features, label) tuple.
feature_set = [(find_features(words), label) for words, label in documents]

# Inspect the structure of feature_set.
print(feature_set[0])
# -> still a tuple
print(feature_set[0][0])
# first element: the 3000 classification words mapped to True/False,
# depending on whether they occur in the review
print(feature_set[0][1])
# second element: whether the review is positive or negative

# Split the data: the first 1,900 reviews form the training set, the
# remaining 100 reviews form the testing set.
training_set=feature_set[:1900]
testing_set=feature_set[1900:]
|
||||
|
||||
# Train the Naive Bayes classifier. It learns which of the words in
# word_features appear mostly in positive reviews and which appear
# mostly in negative reviews.
classifier=nltk.NaiveBayesClassifier.train(training_set)
# Print the 20 words that best discriminate between positive and
# negative reviews.
classifier.show_most_informative_features(20)

# Classify a single example: the first element of feature_set.
# The classifier input is the dict of word -> True/False features.
print(classifier.classify(feature_set[0][0]))
print("The review is actually: "+str(feature_set[0][1]))

# Classify the 100 reviews of the testing set
# (positions 1900-1999 of feature_set).
classified_set=[]
for features, actual in testing_set:
    classified_set.append(classifier.classify(features))

# Compare each prediction with the actual category.
# comparison holds [predicted, actual] pairs; comparison_2 simply
# records "accurate" or "inaccurate" per review.
comparison=[]
comparison_2=[]
for predicted, (features, actual) in zip(classified_set, testing_set):
    comparison.append([predicted, actual])
    # Predicted and actual classification match -> accurate.
    if predicted==actual:
        comparison_2.append("accurate")
    else:
        comparison_2.append("inaccurate")

print(comparison)
# Count the accurate and inaccurate classifications.
comparison_counter=collections.Counter(comparison_2)
print(comparison_counter)

# NLTK can compute the accuracy directly.
# Accuracy on the testing set:
print("Naive Bayes accuracy (in percent):", (nltk.classify.accuracy(classifier, testing_set))*100)
# Same value as from our own calculations -> perfect!

# Accuracy on the training set:
print("Naive Bayes accuracy in training data (in percent):", (nltk.classify.accuracy(classifier, training_set))*100)
# Higher than in the testing dataset -> expected.

print("completed!")
|
||||
Loading…
Add table
Add a link
Reference in a new issue