Solve Part 2: Problem 3
This commit is contained in:
parent
e20469be2d
commit
6e12086e0b
3 changed files with 346 additions and 0 deletions
|
@ -0,0 +1,61 @@
|
||||||
|
Call_ID;Number_Questions;Number_Market_Questions;Percentage_Market_Questions
|
||||||
|
1;56;2;0.03571428571428571
|
||||||
|
2;60;3;0.05
|
||||||
|
3;88;5;0.056818181818181816
|
||||||
|
4;60;3;0.05
|
||||||
|
5;74;4;0.05405405405405406
|
||||||
|
6;60;10;0.16666666666666666
|
||||||
|
7;59;5;0.0847457627118644
|
||||||
|
8;48;5;0.10416666666666667
|
||||||
|
9;47;5;0.10638297872340426
|
||||||
|
10;28;0;0.0
|
||||||
|
11;39;8;0.20512820512820512
|
||||||
|
12;31;4;0.12903225806451613
|
||||||
|
13;38;7;0.18421052631578946
|
||||||
|
14;37;7;0.1891891891891892
|
||||||
|
15;39;4;0.10256410256410256
|
||||||
|
16;23;4;0.17391304347826086
|
||||||
|
17;43;4;0.09302325581395349
|
||||||
|
18;30;5;0.16666666666666666
|
||||||
|
19;24;1;0.041666666666666664
|
||||||
|
20;34;4;0.11764705882352941
|
||||||
|
21;16;0;0.0
|
||||||
|
22;13;2;0.15384615384615385
|
||||||
|
23;13;3;0.23076923076923078
|
||||||
|
24;21;5;0.23809523809523808
|
||||||
|
25;9;3;0.3333333333333333
|
||||||
|
26;16;3;0.1875
|
||||||
|
27;16;5;0.3125
|
||||||
|
28;21;2;0.09523809523809523
|
||||||
|
29;16;3;0.1875
|
||||||
|
30;23;2;0.08695652173913043
|
||||||
|
31;15;1;0.06666666666666667
|
||||||
|
32;17;4;0.23529411764705882
|
||||||
|
33;20;5;0.25
|
||||||
|
34;18;1;0.05555555555555555
|
||||||
|
35;12;3;0.25
|
||||||
|
36;16;6;0.375
|
||||||
|
37;19;4;0.21052631578947367
|
||||||
|
38;12;2;0.16666666666666666
|
||||||
|
39;14;4;0.2857142857142857
|
||||||
|
40;17;5;0.29411764705882354
|
||||||
|
41;14;2;0.14285714285714285
|
||||||
|
42;25;1;0.04
|
||||||
|
43;15;0;0.0
|
||||||
|
44;18;1;0.05555555555555555
|
||||||
|
45;19;0;0.0
|
||||||
|
46;12;1;0.08333333333333333
|
||||||
|
47;13;2;0.15384615384615385
|
||||||
|
48;16;0;0.0
|
||||||
|
49;14;0;0.0
|
||||||
|
50;23;0;0.0
|
||||||
|
51;14;1;0.07142857142857142
|
||||||
|
52;14;0;0.0
|
||||||
|
53;11;1;0.09090909090909091
|
||||||
|
54;20;0;0.0
|
||||||
|
55;19;2;0.10526315789473684
|
||||||
|
56;16;0;0.0
|
||||||
|
57;15;1;0.06666666666666667
|
||||||
|
58;13;2;0.15384615384615385
|
||||||
|
59;16;0;0.0
|
||||||
|
60;14;0;0.0
|
|
31
exam/part2_problems2n3/Problem_3b_Most_Frequent_Trigrams.csv
Normal file
31
exam/part2_problems2n3/Problem_3b_Most_Frequent_Trigrams.csv
Normal file
|
@ -0,0 +1,31 @@
|
||||||
|
Rank;Trigram;Frequency
|
||||||
|
1;long term growth;9
|
||||||
|
2;Coke Zero Sugar;7
|
||||||
|
3;We gained share;5
|
||||||
|
4;commercial real estate;5
|
||||||
|
5;back half year;5
|
||||||
|
6;positive price mix;5
|
||||||
|
7;course couple years;4
|
||||||
|
8;Coca Cola European;4
|
||||||
|
9;mid single digits;3
|
||||||
|
10;high single digits;3
|
||||||
|
11;fourth quarter year;3
|
||||||
|
12;And expect continue;3
|
||||||
|
13;brand Coca Cola;3
|
||||||
|
14;juice juice drinks;3
|
||||||
|
15;quarter full year;3
|
||||||
|
16;In terms China;3
|
||||||
|
17;volume versus price;3
|
||||||
|
18;Cola European partners;3
|
||||||
|
19;pack price architecture;3
|
||||||
|
20;value beverage category;3
|
||||||
|
21;Investor Day give;2
|
||||||
|
22;risk weighted assets;2
|
||||||
|
23;repo short term;2
|
||||||
|
24;bunch different things;2
|
||||||
|
25;long term prospects;2
|
||||||
|
26;couple years ago;2
|
||||||
|
27;respect market share;2
|
||||||
|
28;risk adjusted return;2
|
||||||
|
29;emerging developing markets;2
|
||||||
|
30;repricing taking place;2
|
|
254
exam/part2_problems2n3/problem_3_code.py
Normal file
254
exam/part2_problems2n3/problem_3_code.py
Normal file
|
@ -0,0 +1,254 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
Created on Sun Jul 31 14:37:49 2022
|
||||||
|
|
||||||
|
@author: Alexander Hillert, Goethe University
|
||||||
|
"""
|
||||||
|
|
||||||
|
# import packages
|
||||||
|
import re
|
||||||
|
import nltk
|
||||||
|
import collections
|
||||||
|
|
||||||
|
# define working directory
# adjust it to your computer
directory = "/home/alexander/repos/whu-textual-analysis/exam/part2_problems2n3/"


# =============================================================================
# Part A: Identifying the answers to market-related questions
# =============================================================================

# Create output file. One line per call with:
# 1) Call-ID
# 2) Number of questions in the call
# 3) The number of market-related questions
# 4) The percentage of market-related questions
output_csv_file_3a = open(
    directory + "Problem_3a_Market-related_Questions.csv", "w", encoding="utf-8"
)
# NOTE(review): header spelling fixed from "Percetage" to "Percentage";
# any downstream consumer referencing the misspelled column must be updated.
output_csv_file_3a.write(
    "Call_ID;Number_Questions;Number_Market_Questions;Percentage_Market_Questions\n"
)

# text variable accumulating managers' answers to market-related questions;
# it is consumed by Part B further below.
answers_market_questions = ""

# Iterate over the 60 question and answer files respectively
for i in range(1, 61):
    # If the execution takes some time, printing the iterator gives an
    # impression of the overall progress
    print(str(i))

    # number of market-related questions found in call i
    market_question_count = 0

    # Open and read the ith question file.
    # IF YOU HAVE PROBLEMS OPENING THE FILES DOUBLE-CHECK THE DIRECTORY AND
    # FOLDER NAME. 'with' guarantees the handle is closed even on errors
    # (the original leaked both file handles).
    with open(
        directory + "Problem_2_3_Sample_QandA/" + str(i) + "_questions.txt",
        "r",
        encoding="utf-8",
        errors="ignore",
    ) as input_file_question:
        input_text_question = input_file_question.read()

    # Also read the ith answer file: the jth element of the answer list
    # corresponds to the jth element of the question list.
    with open(
        directory + "Problem_2_3_Sample_QandA/" + str(i) + "_answers.txt",
        "r",
        encoding="utf-8",
        errors="ignore",
    ) as input_file_answer:
        input_text_answer = input_file_answer.read()

    # Split the text into individual questions; strip and drop empty elements
    # (raw strings for the regex patterns avoid invalid-escape warnings).
    question_list = [
        q.strip() for q in re.split(r"Question_[0-9]+:", input_text_question)
    ]
    question_list = [q for q in question_list if q != ""]

    # get the total number of questions
    number_questions = len(question_list)

    # Split the text into individual answers; strip and drop empty elements
    answer_list = [a.strip() for a in re.split(r"Answer_[0-9]+:", input_text_answer)]
    answer_list = [a for a in answer_list if a != ""]

    # search for the term market/markets in each analyst question
    for j in range(number_questions):
        question_text = question_list[j]

        # Searching for a word in a text is NOT the same as searching for a
        # word in a list: split into words so that only actual matches count.
        # Lower-case everything for a consistent comparison format, and use
        # a set for O(1) membership tests.
        question_set_of_words = set(
            word.lower() for word in re.split(r"\W{1,}", question_text)
        )

        if "market" in question_set_of_words or "markets" in question_set_of_words:
            # it is a market-related question
            market_question_count += 1

            # For Part b) we need the text of the managers' answer:
            # question j relates to answer j --> pick the jth element
            # from the answer list and append it to the total text.
            answers_market_questions = answers_market_questions + "\n" + answer_list[j]

    # compute the percentage of market-related questions
    pct_mkt_questions = market_question_count / number_questions

    # Write the call-ID, the total number of questions, the number of market
    # questions, and the percentage of market questions to the output file
    output_csv_file_3a.write(
        str(i)
        + ";"
        + str(number_questions)
        + ";"
        + str(market_question_count)
        + ";"
        + str(pct_mkt_questions)
        + "\n"
    )

# close files
output_csv_file_3a.close()

print("Part a) of Problem 3 completed.")
|
||||||
|
|
||||||
|
# =============================================================================
# Part B: Most frequent trigrams in the answers to market-related questions
# =============================================================================

# import english stopwords
nltk.download("stopwords")
from nltk.corpus import stopwords

# NOTE(review): this English stopword set is defined but was never used below;
# the filter actually applies stopwords.words() (ALL languages). Kept that
# behavior to reproduce the published results — confirm which list is intended.
NLTK_stop_words = set(stopwords.words("english"))

# import sentence tokenizer
# even though we discussed the weaknesses of the tokenizer in class, for this
# text corpus it is fine to use the tokenizer.
nltk.download("punkt")
from nltk.tokenize import sent_tokenize

# list collecting every trigram observed in the answers
trigram_list = []

# Hoisted out of the word loop: the original called stopwords.words() once
# PER WORD, re-building the full multi-language list each time. Building the
# set once preserves the exact filtering behavior at a fraction of the cost.
all_language_stop_words = set(stopwords.words())


# Create output file with:
# 1) rank of the trigram ranging from 1 to 30
# 2) trigram
# 3) frequency of the trigram
output_csv_file_3b = open(
    directory + "Problem_3b_Most_Frequent_Trigrams.csv", "w", encoding="utf-8"
)
output_csv_file_3b.write("Rank;Trigram;Frequency\n")

# the managers' answers to market-related questions are stored in the text
# variable "answers_market_questions" (built in Part A above)

# split the entire answer text into single sentences
list_sentences = sent_tokenize(answers_market_questions)

# iterate over all sentences
for sentence in list_sentences:
    # NOTE(review): case is NOT normalized here, so capitalized stop words
    # (e.g. "We", "And", "In") survive the lower-case stopword filter; this
    # matches the original output but may not be intended — confirm.

    # remove numbers (all kinds of forms); raw strings avoid invalid-escape
    # warnings on modern Python
    sentence = re.sub(r"\$\d[\.,]\d", " ", sentence)
    sentence = re.sub(r"\$\d", " ", sentence)
    sentence = re.sub(r"\d[\.,]\d", " ", sentence)
    sentence = re.sub(r"\d[$%]", " ", sentence)
    sentence = re.sub(r"\d", " ", sentence)
    # delete single letter words
    sentence = re.sub(r"(?:^| )\w(?:$| )", " ", sentence).strip()
    # remove subsequent whitespace
    sentence = re.sub(r"\s{1,}", " ", sentence)

    # split the sentence into words and drop empty elements
    list_of_words = [w for w in re.split(r"\W{1,}", sentence) if w != ""]

    # remove stopwords
    list_of_nonstop_words = [
        w for w in list_of_words if w not in all_language_stop_words
    ]

    # Go over all potential three-word combinations in the sentence: k
    # remaining words yield k - 2 trigrams (the range is empty when fewer
    # than three words remain, so no extra length check is needed).
    for n in range(len(list_of_nonstop_words) - 2):
        # join the three words of the trigram with single whitespaces
        trigram_list.append(" ".join(list_of_nonstop_words[n : n + 3]))

# count how often each trigram occurs
trigram_counter = collections.Counter(trigram_list)

# Get the (up to) 30 most frequent trigrams
top_30_trigrams = trigram_counter.most_common(30)

# Write the most frequent trigrams to the csv file.
# enumerate(..., start=1) produces the human-readable rank directly, and
# iterating over the actual result list (instead of a fixed range(1, 31))
# avoids an IndexError when fewer than 30 distinct trigrams exist.
for rank, (trigram, frequency) in enumerate(top_30_trigrams, start=1):
    output_csv_file_3b.write(str(rank) + ";" + trigram + ";" + str(frequency) + "\n")

# close files
output_csv_file_3b.close()

print("Part b) of the Problem has also been completed.")
|
Loading…
Reference in a new issue