From 6e12086e0bda7bf972c319af2f51a402e7c07c80 Mon Sep 17 00:00:00 2001 From: Alexander Hess Date: Fri, 5 Aug 2022 09:36:26 +0200 Subject: [PATCH] Solve Part 2: Problem 3 --- .../Problem_3a_Market-related_Questions.csv | 61 +++++ .../Problem_3b_Most_Frequent_Trigrams.csv | 31 +++ exam/part2_problems2n3/problem_3_code.py | 254 ++++++++++++++++++ 3 files changed, 346 insertions(+) create mode 100644 exam/part2_problems2n3/Problem_3a_Market-related_Questions.csv create mode 100644 exam/part2_problems2n3/Problem_3b_Most_Frequent_Trigrams.csv create mode 100644 exam/part2_problems2n3/problem_3_code.py diff --git a/exam/part2_problems2n3/Problem_3a_Market-related_Questions.csv b/exam/part2_problems2n3/Problem_3a_Market-related_Questions.csv new file mode 100644 index 0000000..18a4ab0 --- /dev/null +++ b/exam/part2_problems2n3/Problem_3a_Market-related_Questions.csv @@ -0,0 +1,61 @@ +Call_ID;Number_Questions;Number_Market_Questions;Percetage_Market_Questions +1;56;2;0.03571428571428571 +2;60;3;0.05 +3;88;5;0.056818181818181816 +4;60;3;0.05 +5;74;4;0.05405405405405406 +6;60;10;0.16666666666666666 +7;59;5;0.0847457627118644 +8;48;5;0.10416666666666667 +9;47;5;0.10638297872340426 +10;28;0;0.0 +11;39;8;0.20512820512820512 +12;31;4;0.12903225806451613 +13;38;7;0.18421052631578946 +14;37;7;0.1891891891891892 +15;39;4;0.10256410256410256 +16;23;4;0.17391304347826086 +17;43;4;0.09302325581395349 +18;30;5;0.16666666666666666 +19;24;1;0.041666666666666664 +20;34;4;0.11764705882352941 +21;16;0;0.0 +22;13;2;0.15384615384615385 +23;13;3;0.23076923076923078 +24;21;5;0.23809523809523808 +25;9;3;0.3333333333333333 +26;16;3;0.1875 +27;16;5;0.3125 +28;21;2;0.09523809523809523 +29;16;3;0.1875 +30;23;2;0.08695652173913043 +31;15;1;0.06666666666666667 +32;17;4;0.23529411764705882 +33;20;5;0.25 +34;18;1;0.05555555555555555 +35;12;3;0.25 +36;16;6;0.375 +37;19;4;0.21052631578947367 +38;12;2;0.16666666666666666 +39;14;4;0.2857142857142857 +40;17;5;0.29411764705882354 +41;14;2;0.14285714285714285 
+42;25;1;0.04 +43;15;0;0.0 +44;18;1;0.05555555555555555 +45;19;0;0.0 +46;12;1;0.08333333333333333 +47;13;2;0.15384615384615385 +48;16;0;0.0 +49;14;0;0.0 +50;23;0;0.0 +51;14;1;0.07142857142857142 +52;14;0;0.0 +53;11;1;0.09090909090909091 +54;20;0;0.0 +55;19;2;0.10526315789473684 +56;16;0;0.0 +57;15;1;0.06666666666666667 +58;13;2;0.15384615384615385 +59;16;0;0.0 +60;14;0;0.0 diff --git a/exam/part2_problems2n3/Problem_3b_Most_Frequent_Trigrams.csv b/exam/part2_problems2n3/Problem_3b_Most_Frequent_Trigrams.csv new file mode 100644 index 0000000..cb92967 --- /dev/null +++ b/exam/part2_problems2n3/Problem_3b_Most_Frequent_Trigrams.csv @@ -0,0 +1,31 @@ +Rank;Trigram;Frequency +1;long term growth;9 +2;Coke Zero Sugar;7 +3;We gained share;5 +4;commercial real estate;5 +5;back half year;5 +6;positive price mix;5 +7;course couple years;4 +8;Coca Cola European;4 +9;mid single digits;3 +10;high single digits;3 +11;fourth quarter year;3 +12;And expect continue;3 +13;brand Coca Cola;3 +14;juice juice drinks;3 +15;quarter full year;3 +16;In terms China;3 +17;volume versus price;3 +18;Cola European partners;3 +19;pack price architecture;3 +20;value beverage category;3 +21;Investor Day give;2 +22;risk weighted assets;2 +23;repo short term;2 +24;bunch different things;2 +25;long term prospects;2 +26;couple years ago;2 +27;respect market share;2 +28;risk adjusted return;2 +29;emerging developing markets;2 +30;repricing taking place;2 diff --git a/exam/part2_problems2n3/problem_3_code.py b/exam/part2_problems2n3/problem_3_code.py new file mode 100644 index 0000000..02f6c37 --- /dev/null +++ b/exam/part2_problems2n3/problem_3_code.py @@ -0,0 +1,254 @@ +# -*- coding: utf-8 -*- +""" +Created on Sun Jul 31 14:37:49 2022 + +@author: Alexander Hillert, Goethe University +""" + +# import packages +import re +import nltk +import collections + +# define working directory +# adjust it to your computer +directory = "/home/alexander/repos/whu-textual-analysis/exam/part2_problems2n3/" + +# 
============================================================================= +# Part A: Identifying the answers to market-related sentences +# ============================================================================= + +# Create output file +output_csv_file_3a = open( + directory + "Problem_3a_Market-related_Questions.csv", "w", encoding="utf-8" +) +# Write variable names to the first line of the output file +# 1) Call-ID +# 2) Number of questions in the call +# 3) The number of market-related questions +# 4) The percentage of market-related questions +output_csv_file_3a.write( + "Call_ID;Number_Questions;Number_Market_Questions;Percetage_Market_Questions\n" +) + +# create a text variable to store managers answers to market-related questions +answers_market_questions = "" + +# Iterate over the 60 questions and answer files respectively +for i in range(1, 61): + + # If the execution of your scripts takes some time, printing the iterator + # gives you an impression of the overall progress + print(str(i)) + + # reset variables + market_question_count = 0 + + # Open the ith question file + # IF YOU HAVE PROBLEMS OPENING THE FILES DOUBLE-CHECK THE DIRECTORY AND FOLDER NAME + input_file_question = open( + directory + "Problem_2_3_Sample_QandA/" + str(i) + "_questions.txt", + "r", + encoding="utf-8", + errors="ignore", + ) + # read the text from the question file + input_text_question = input_file_question.read() + + # To identify managements' answer to a market-related question, also open the + # answer files and create a list of the individual answers. + # the jth list element in the answer list will correspond to the jth list + # element in the question list. 
+ # Open the ith answer file + input_file_answer = open( + directory + "Problem_2_3_Sample_QandA/" + str(i) + "_answers.txt", + "r", + encoding="utf-8", + errors="ignore", + ) + input_text_answer = input_file_answer.read() + + # Split the text into individual questions + question_list = re.split("Question_[0-9]+:", input_text_question) + question_list = [x.strip() for x in question_list] + # Check whether there are empty questions, if so remove them + while question_list.count("") > 0: + question_list.remove("") + + # get the total number of questions + number_questions = len(question_list) + + # Split the text into individual answers + answer_list = re.split("Answer_[0-9]+:", input_text_answer) + answer_list = [x.strip() for x in answer_list] + # Check whether there are empty questions, if so remove them + while answer_list.count("") > 0: + answer_list.remove("") + + # search for the term market/markets in each analyst question + # iterate over the list of questions + for j in range(number_questions): + question_id = j + 1 + + # it might be helpful to get the text of a question to a new variable + # of course, you can also work with the jth element of the question list. + question_text = question_list[j] + + # search for market/markets in the list of words + + # remember that searching for a word in a text is NOT the same as searching + # for a word in a list. Make sure that you only count actual matches!!! + # ADD necessary commands here + question_list_of_words = re.split("\W{1,}", question_text) + # Are there upper case letters? Are there lower case letters? + # Remember to use a consistent format of the text and the search term. + # USE A SET FOR FASTER SEARCH + question_set_of_words = set(x.lower() for x in question_list_of_words) + + if "market" in question_set_of_words or "markets" in question_set_of_words: + # it is a market-related question + market_question_count += 1 + + # For Part b) you need the text of the answers to market-related + # questions. 
So, we identify the corresponding answer. + # question j relates to answer j. + # --> pick the right element from the answer list + market_answer = answer_list[j] + + # add the text of answer j to the total text of all answers + answers_market_questions = answers_market_questions + "\n" + market_answer + + # compute the percentage of market-related questions + pct_mkt_questions = market_question_count / number_questions + + # Write the call-ID, the total number of questions, the number of market questions, + # and the percentage of market questions to the output file + output_csv_file_3a.write( + str(i) + + ";" + + str(number_questions) + + ";" + + str(market_question_count) + + ";" + + str(pct_mkt_questions) + + "\n" + ) + +# close files +output_csv_file_3a.close() + +print("Part a) of Problem 3 completed.") + +# ============================================================================= +# Part B: Most frequent trigrams in the answers to market-related questions +# ============================================================================= + +# import english stopwords +nltk.download("stopwords") +from nltk.corpus import stopwords + +NLTK_stop_words = set(stopwords.words("english")) + +# import sentence tokenizer +# even though we discussed the weaknesses of the tokenizer in class, for this +# text corpus it is fine to use the tokenizer. 
nltk.download("punkt")
from nltk.tokenize import sent_tokenize

# Collects every trigram (three consecutive non-stop words within one
# sentence) observed in the managers' answers to market-related questions.
trigram_list = []

# Output file with the 30 most frequent trigrams:
# 1) rank of the trigram ranging from 1 to 30
# 2) the trigram itself
# 3) frequency of the trigram
output_csv_file_3b = open(
    directory + "Problem_3b_Most_Frequent_Trigrams.csv", "w", encoding="utf-8"
)
output_csv_file_3b.write("Rank;Trigram;Frequency\n")

# Pre-compile all cleaning patterns once instead of re-parsing them for
# every sentence.  Applied in this exact order (it matters: the generic
# "\d" pass mops up digits the earlier, more specific passes left behind).
NUMBER_PATTERNS = [
    re.compile(r"\$\d[\.,]\d"),  # e.g. "$1.5"
    re.compile(r"\$\d"),         # e.g. "$5"
    re.compile(r"\d[\.,]\d"),    # e.g. "1.5" / "1,5"
    re.compile(r"\d[$%]"),       # e.g. "5%"
    re.compile(r"\d"),           # any remaining digit
]
# NOTE(review): because the match consumes the neighbouring blank, alternating
# single-letter words (e.g. in "a b c") are only partially removed.  Kept
# byte-identical to preserve the committed output; confirm before changing.
SINGLE_LETTER = re.compile(r"(?:^| )\w(?:$| )")
MULTI_SPACE = re.compile(r"\s+")
WORD_SPLIT = re.compile(r"\W+")

# Hoist the stop word lookup out of the per-word loop: the original called
# stopwords.words() for EVERY word, rebuilding the full all-language stop
# word list and scanning it linearly each time.  Membership in a frozenset
# of that very same list is equivalent but O(1) and computed once.
# NOTE(review): stopwords.words() covers ALL languages, and its entries are
# lower case while the sentences are never lower-cased, so capitalised stop
# words ("We", "And", "In") survive -- this matches the committed trigram
# CSV, but NLTK_stop_words (English) plus lower-casing was probably the
# intent; confirm before changing, as it alters the output.
all_language_stopwords = frozenset(stopwords.words())

# The managers' answers to market-related questions are stored in the text
# variable "answers_market_questions" (built in Part A).  Split it into
# sentences and iterate them directly.
for sentence in sent_tokenize(answers_market_questions):
    # remove numbers in all their forms (dollar amounts, decimals, percents)
    for pattern in NUMBER_PATTERNS:
        sentence = pattern.sub(" ", sentence)
    # delete single-letter words
    sentence = SINGLE_LETTER.sub(" ", sentence).strip()
    # collapse runs of whitespace to a single blank
    sentence = MULTI_SPACE.sub(" ", sentence)

    # split into words, dropping empty fragments and stop words
    words = [
        w
        for w in WORD_SPLIT.split(sentence)
        if w != "" and w not in all_language_stopwords
    ]

    # A sentence with k >= 3 remaining words yields k - 2 trigrams; for
    # k < 3 the range below is empty, so no explicit length check is needed.
    for n in range(len(words) - 2):
        # three consecutive words joined by single blanks
        trigram_list.append(" ".join(words[n : n + 3]))

# Count all trigrams and keep the 30 most frequent ones.
trigram_counter = collections.Counter(trigram_list)
top_30_trigrams = trigram_counter.most_common(30)

# Write rank (1-based, human counting), trigram, and frequency.  enumerate()
# also copes gracefully with fewer than 30 distinct trigrams, where the
# original fixed range(1, 31) raised an IndexError.
for rank, (trigram, frequency) in enumerate(top_30_trigrams, start=1):
    output_csv_file_3b.write(f"{rank};{trigram};{frequency}\n")

# close files
output_csv_file_3b.close()

print("Part b) of the Problem has also been completed.")