Solve Part 2: Problem 2
This commit is contained in:
parent
d0edac1a1b
commit
e20469be2d
188 changed files with 63431 additions and 0 deletions
213
exam/part2_problems2n3/problem_2_code.py
Normal file
213
exam/part2_problems2n3/problem_2_code.py
Normal file
|
|
@ -0,0 +1,213 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Fri Jul 29 10:42:03 2022
|
||||
|
||||
@author: Alexander Hillert, Goethe University
|
||||
"""
|
||||
|
||||
# import packages
|
||||
import re
|
||||
|
||||
# define working directory
# adjust it to your computer
directory = "/home/alexander/repos/whu-textual-analysis/exam/part2_problems2n3/"


# Open the dictionary
# It is the 2018 version of the LM (2011) dictionary.
# The file is read inside a context manager so that the handle is closed
# right after reading instead of staying open for the rest of the script.
with open(
    directory + "LMD_pos_master_dictionary_2018.txt", "r", encoding="utf-8"
) as file_word_list:
    word_list = file_word_list.read()
# use a consistent case format
word_list = word_list.lower()
# create the list of positive words (one entry per whitespace-separated token)
positive_words = word_list.split()


# Create output file according to the exam instructions
# The handle is intentionally kept open: the rows are written by the loop
# further below and the file is closed at the end of the script.
output_csv_file = open(
    directory + "Problem_2a_Percentage_Positive_Words.csv", "w", encoding="utf-8"
)
# Write variable names to the first line of the output file
# 1) Call-ID
# 2) Answer-ID
# 3) Total number of words in the answer
# 4) The number of positive words in the answer
# 5) The percentage of positive words in the answer
# 6) the text of the answer
# The header must end with a line break; otherwise the first data row would
# be appended to the header line and the csv could not be parsed.
output_csv_file.write(
    "call_id;answer_id;n_total_words;n_positive_words;f_positive_words;answer_text\n"
)


# Contractions are expanded so that the negation check below can see the
# negating word ("not") as a separate token. They are applied in this order.
contraction_expansions = (
    ("ain't", "am not"),
    ("aren't", "are not"),
    ("couldn't", "could not"),
    ("didn't", "did not"),
    ("doesn't", "does not"),
    ("don't", "do not"),
    ("dunno", "do not know"),
    ("can't", "can not"),
    ("cannot", "can not"),
    ("hadn't", "had not"),
    ("hasn't", "has not"),
    ("haven't", "have not"),
    ("isn't", "is not"),
    ("mustn't", "must not"),
    ("needn't", "need not"),
    ("shouldn't", "should not"),
    ("wasn't", "was not"),
    ("weren't", "were not"),
    ("won't", "will not"),
    ("wouldn't", "would not"),
)

# Phrases identified in Part b) of the problem: they contain positive words
# but are removed before counting (see the placeholder comments below).
phrases_to_remove = (
    "good morning",
    "good afternoon",
    "good evening",
    "great thanks",
)

# Loughran and McDonald (2011, JF, p.44): "We account for simple negation
# only for Fin-Pos words. Simple negation is taken to be observations
# of one of six words (no, not, none, neither, never, nobody) occurring
# within three words preceding a positive word.
negation_words = frozenset({"no", "not", "none", "neither", "never", "nobody"})

# A set gives constant-time membership tests for the positive-word lookup.
positive_word_set = set(positive_words)

# Compile the patterns once; raw strings avoid invalid escape sequences.
answer_marker_pattern = re.compile("answer_[0-9]+:")
whitespace_pattern = re.compile(r"\s+")
non_word_pattern = re.compile(r"\W+")

# The try/finally makes sure the output file is closed (and therefore
# flushed) even if one of the answer files cannot be opened.
try:
    # Iterate over the 60 answer files
    for i in range(1, 61):
        # If you want you can print the progress of your script
        print(i)

        # Open the ith answer file and read its text; the context manager
        # closes the file again right after reading.
        with open(
            directory + "/Problem_2_3_Sample_QandA/" + str(i) + "_answers.txt",
            "r",
            encoding="utf-8",
            errors="ignore",
        ) as input_file_answer:
            input_text_answer = input_file_answer.read()

        # use a consistent case format
        input_text_answer = input_text_answer.lower()

        # Split the text into individual answers, drop empty elements and
        # clean the whitespace of the remaining ones a bit.
        # Only elements that are empty BEFORE stripping are dropped, so an
        # answer consisting of whitespace only still keeps its answer ID.
        answer_list = [
            segment.strip()
            for segment in answer_marker_pattern.split(input_text_answer)
            if segment != ""
        ]

        # iterate all answers of the ith call
        for j, answer in enumerate(answer_list, start=1):

            # Preprocessing steps according to the exam instructions and hints
            # The semicolon is the csv delimiter, so it is replaced first.
            answer = answer.replace(";", " SEMICOLON")

            # take care of contractions
            for contraction, expansion in contraction_expansions:
                answer = answer.replace(contraction, expansion)

            ######### Begin of the placeholder #########
            # Here is the placeholder for the further editing steps that you
            # should identify by looking at the file from Part b) of this problem.
            # Having created a first file in Part b), you will see that the measurement
            # of positive tone can be improved.
            # Please add these commands here and then return to part 2b)
            # See also the exam instructions.

            # Take care of the 2b) problem
            for phrase in phrases_to_remove:
                answer = answer.replace(phrase, " ")

            # Another approach could be to remove answers that are too short further below

            ######### End of the placeholder ########

            answer = whitespace_pattern.sub(" ", answer)

            # Split the text in words and drop empty elements
            list_of_words = [word for word in non_word_pattern.split(answer) if word]

            # Determine total number of words
            word_count = len(list_of_words)

            # Reset the number of positive words to zero
            positive_count = 0

            # Walk through the answer once. For every positive word implement
            # the LM (2011) negation check on the (up to) three preceding words.
            for position, word in enumerate(list_of_words):
                if word not in positive_word_set:
                    continue
                preceding_words = list_of_words[max(0, position - 3):position]
                if negation_words.isdisjoint(preceding_words):
                    # no negation
                    positive_count = positive_count + 1

            # compute the percentage of positive words adjusted for negations
            # it could be that the total number of words of an answer is zero.
            if word_count > 0:
                percentage_positive = positive_count / word_count
            else:
                percentage_positive = "NA"

            # Remove line breaks of the text that you write to the csv.
            # Line breaks would mess up your output file.
            # Both steps were already applied to the answer above; they are
            # repeated here as a safeguard and chained so that the second
            # step does not discard the result of the first one.
            answer_text_print = whitespace_pattern.sub(" ", answer)
            # replace the symbol that you use as delimiter, e.g., semicolon
            answer_text_print = answer_text_print.replace(";", "SEMICOLON")

            # Write the call-ID, answer-ID, total number of words, number of positive words
            # adjusted for negations, percentage of positive words adjusted for negations,
            # and the edited answer text to the output file
            output_csv_file.write(
                f"{i};{j};{word_count};{positive_count};"
                f"{percentage_positive};{answer_text_print}\n"
            )

    print("Finished")
finally:
    # Close files
    output_csv_file.close()
Loading…
Add table
Add a link
Reference in a new issue