# -*- coding: utf-8 -*-
"""
Created on Fri Jul 29 10:42:03 2022

@author: Alexander Hillert, Goethe University

Problem 2a: for every answer in the sample Q&A files, compute the total
number of words, the number of positive words (Loughran/McDonald dictionary,
adjusted for simple negation) and the resulting percentage of positive words,
and write one row per answer to a semicolon-delimited csv file.
"""

# import packages
import os
import re

# define working directory
# adjust it to your computer
directory = "/home/alexander/repos/whu-textual-analysis/exam/part2_problems2n3/"

# File and folder names relative to the working directory.
# The dictionary is the 2018 version of the LM (2011) dictionary.
_DICTIONARY_FILE = "LMD_pos_master_dictionary_2018.txt"
_ANSWER_FOLDER = "Problem_2_3_Sample_QandA"
_OUTPUT_FILE = "Problem_2a_Percentage_Positive_Words.csv"

# Variable names for the first line of the output file
# 1) Call-ID
# 2) Answer-ID
# 3) Total number of words in the answer
# 4) The number of positive words in the answer
# 5) The percentage of positive words in the answer
# 6) the text of the answer
_CSV_HEADER = (
    "call_id;answer_id;n_total_words;n_positive_words;f_positive_words;answer_text"
)

# Contractions and their expanded forms. Expanding them turns the "n't" into
# an explicit "not", which the negation check below can then detect.
# NOTE: the entry for "hadn't" used to be spelled "had't" and never matched.
_CONTRACTIONS = (
    ("ain't", "am not"),
    ("aren't", "are not"),
    ("couldn't", "could not"),
    ("didn't", "did not"),
    ("doesn't", "does not"),
    ("don't", "do not"),
    ("dunno", "do not know"),
    ("can't", "can not"),
    ("cannot", "can not"),
    ("hadn't", "had not"),
    ("hasn't", "has not"),
    ("haven't", "have not"),
    ("isn't", "is not"),
    ("mustn't", "must not"),
    ("needn't", "need not"),
    ("shouldn't", "should not"),
    ("wasn't", "was not"),
    ("weren't", "were not"),
    ("won't", "will not"),
    ("wouldn't", "would not"),
)

# Phrases identified in Part 2b) that contain positive dictionary words but
# carry no tone (greetings and courtesy). They are blanked out before counting.
# Another approach could be to remove answers that are too short.
_COURTESY_PHRASES = (
    "good morning",
    "good afternoon",
    "good evening",
    "great thanks",
)

# Loughran and McDonald (2011, JF, p.44): "We account for simple negation
# only for Fin-Pos words. Simple negation is taken to be observations
# of one of six words (no, not, none, neither, never, nobody) occurring
# within three words preceding a positive word."
_NEGATION_WORDS = frozenset({"no", "not", "none", "neither", "never", "nobody"})
_NEGATION_WINDOW = 3

# Regular expressions are compiled once because they are used for every answer.
_ANSWER_MARKER = re.compile("answer_[0-9]+:")
_WHITESPACE = re.compile(r"\s+")
_NON_WORD = re.compile(r"\W{1,}")


def _split_answers(text):
    """Split the text of one call into its individual, stripped answers."""
    parts = _ANSWER_MARKER.split(text)
    # Empty elements (e.g. the one before the first marker) are dropped before
    # stripping, so a whitespace-only answer is kept and later reported as "NA".
    return [part.strip() for part in parts if part != ""]


def _preprocess_answer(answer):
    """Apply the editing steps of the exam instructions to one answer."""
    # The semicolon is the csv delimiter, so it must not appear in the text.
    # NOTE: the placeholder survives the word split and is counted as a word.
    answer = answer.replace(";", " SEMICOLON")
    # take care of contractions
    for contraction, expanded in _CONTRACTIONS:
        answer = answer.replace(contraction, expanded)
    # take care of the 2b) problem
    for phrase in _COURTESY_PHRASES:
        answer = answer.replace(phrase, " ")
    # collapse line breaks, tabs and repeated blanks into single blanks
    return _WHITESPACE.sub(" ", answer)


def _split_into_words(text):
    """Split the text on non-word characters and drop empty elements."""
    return [word for word in _NON_WORD.split(text) if word != ""]


def _count_positive_words(list_of_words, positive_words):
    """Count the positive words that are not preceded by a negation.

    A positive word is not counted when one of the three words before it is
    one of the six LM (2011) negation words.
    """
    positive_count = 0
    for position, word in enumerate(list_of_words):
        if word not in positive_words:
            continue
        # the (up to) three words before the positive word
        start = max(0, position - _NEGATION_WINDOW)
        preceding_words = list_of_words[start:position]
        if _NEGATION_WORDS.isdisjoint(preceding_words):
            positive_count = positive_count + 1
    return positive_count


def main(base_dir=directory, n_calls=60):
    """Compute the percentage of positive words for all answers.

    Reads the dictionary and the answer files "1_answers.txt" to
    "<n_calls>_answers.txt" from base_dir and writes the csv file into
    base_dir. The progress (call number) is printed to the console.

    base_dir: working directory that contains the dictionary and the
        folder with the answer files.
    n_calls: number of answer files to process.
    """
    # Open the dictionary, use a consistent case format and create the
    # collection of positive words. A set makes the lookup per word cheap.
    dictionary_path = os.path.join(base_dir, _DICTIONARY_FILE)
    with open(dictionary_path, "r", encoding="utf-8") as file_word_list:
        positive_words = frozenset(file_word_list.read().lower().split())

    # Create output file according to the exam instructions
    output_path = os.path.join(base_dir, _OUTPUT_FILE)
    with open(output_path, "w", encoding="utf-8") as output_csv_file:
        # The header needs its own line; otherwise the first data row would
        # be appended to it.
        output_csv_file.write(_CSV_HEADER + "\n")

        # Iterate over the answer files
        for i in range(1, n_calls + 1):
            # print the progress of the script
            print(str(i))

            # Open the ith answer file and read the text in a consistent case
            answer_path = os.path.join(
                base_dir, _ANSWER_FOLDER, str(i) + "_answers.txt"
            )
            with open(
                answer_path, "r", encoding="utf-8", errors="ignore"
            ) as input_file_answer:
                input_text_answer = input_file_answer.read().lower()

            # iterate all answers of the ith call
            answers = _split_answers(input_text_answer)
            for j, raw_answer in enumerate(answers, start=1):
                answer = _preprocess_answer(raw_answer)
                list_of_words = _split_into_words(answer)

                # Determine total number of words and positive words
                word_count = len(list_of_words)
                positive_count = _count_positive_words(
                    list_of_words, positive_words
                )

                # compute the percentage of positive words adjusted for
                # negations; the total number of words can be zero.
                if word_count > 0:
                    percentage_positive = positive_count / word_count
                else:
                    percentage_positive = "NA"

                # Line breaks and the delimiter would mess up the output
                # file. The preprocessing already removed them; this keeps
                # the guarantee local to the place where the row is written.
                answer_text_print = _WHITESPACE.sub(" ", answer)
                answer_text_print = answer_text_print.replace(";", "SEMICOLON")

                # Write the call-ID, answer-ID, total number of words, number
                # of positive words adjusted for negations, percentage of
                # positive words adjusted for negations, and the edited
                # answer text to the output file
                row = [
                    str(i),
                    str(j),
                    str(word_count),
                    str(positive_count),
                    str(percentage_positive),
                    answer_text_print,
                ]
                output_csv_file.write(";".join(row) + "\n")

    print("Finished")


if __name__ == "__main__":
    main()