Solve Part 2: Problem 1

2022-08-05 04:54:12 +02:00 · 2022-08-05 04:54:12 +02:00 · d0edac1a1b
commit d0edac1a1b
parent 0d654bda9d
244 changed files with 63294 additions and 0 deletions
--- a/exam/part2_problem1/problem1_code.py
+++ b/exam/part2_problem1/problem1_code.py
@ -0,0 +1,500 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Sat Jul 30 15:20:32 2022
+
+@author: Alexander Hillert, Goethe University
+"""
+
+# import packages
+import re
+import textwrap
+
+# define working directory
+# adjust it to your computer
+directory = "/home/alexander/repos/whu-textual-analysis/exam/part2_problem1/"
+
+
+# =============================================================================
+# Part A: Creating an Overview File on the Call Participants
+# =============================================================================
+
+# Create the output file. Its first line holds the variable names:
+# 1) Call-ID
+# 2) Filename
+# 3) Fiscal Quarter
+# 4) Fiscal Year
+# 5) Date of the call in the format YYYYMMDD
+# 6) Time of the call, e.g., 05:00 PM GMT
+# 7) number of non-corporate call participants
+# 8) the names of all corporate participants and their positions -> each item
+#    is written in a separate column
+csv_path = directory + "Problem_1_Overview_Calls.csv"
+output_csv_file = open(csv_path, "w", encoding="utf-8")
+# There can be up to 4 corporate participants -> 4 name/position column pairs
+participant_columns = "".join(
+    ";Name_" + str(slot) + ";Position_" + str(slot) for slot in range(1, 5)
+)
+output_csv_file.write(
+    "ID;Filename;Fiscal_Quarter;Fiscal_Year;Date;Time;Analysts"
+    + participant_columns
+    + "\n"
+)
+
+# Read the overview file "Overview_File_Problem_1.csv"; each of its lines
+# refers to one earnings call
+overview_file = open(directory + "Overview_File_Problem_1.csv", "r", encoding="utf-8")
+overview_text = overview_file.read()
+# One list element per line; empty lines (such as the last one) are dropped
+list_earnings_calls = [line for line in overview_text.split("\n") if line != ""]
+
+
+# Section markers and regular expressions. None of them depends on the
+# individual call, so they are built once, before the loop over the calls.
+# Raw strings keep the regex escapes (\[ and \() away from Python's own
+# string-escape handling.
+
+# full-width rule line that frames the speaker names in the transcripts
+rule_line = "--------------------------------------------------------------------------------"
+rule_pattern = re.compile(rule_line)
+
+# the header ends where the list of corporate participants starts
+corporates_pattern = re.compile(
+    textwrap.dedent(
+        r"""
+        ================================================================================
+        Corporate Participants
+        ================================================================================
+        """
+    ).strip()
+)
+
+# start of the list of non-corporate participants (analysts)
+# NOTE(review): "Participiants" is spelled exactly as in the original pattern;
+# presumably it mirrors the transcripts -- check the data before "correcting" it.
+participants_pattern = re.compile(
+    textwrap.dedent(
+        r"""
+        ================================================================================
+        Conference Call Participiants
+        ================================================================================
+        """
+    ).strip()
+)
+
+# start of the presentation (= end of the list of analysts)
+presentation_pattern = re.compile(
+    textwrap.dedent(
+        r"""
+        ================================================================================
+        Presentation
+        """
+    ).strip()
+)
+
+# start of the Q and A part (= end of the presentation)
+qanda_pattern = re.compile(
+    textwrap.dedent(
+        r"""
+        ================================================================================
+        Questions and Answers
+        """
+    ).strip()
+)
+
+# beginning of an operator statement
+operator_pattern = re.compile(
+    textwrap.dedent(
+        r"""
+        --------------------------------------------------------------------------------
+        Operator    \[.*\]
+        --------------------------------------------------------------------------------
+        """
+    ).strip()
+)
+
+# information on a speaker in the presentation, e.g.,
+# -------------------------------------------------------------------------
+# Deborah Crawford,  Facebook, Inc. - Director of IR    [2]
+# -------------------------------------------------------------------------
+presentation_speaker_pattern = re.compile(
+    textwrap.dedent(
+        r"""
+        --------------------------------------------------------------------------------
+        .*, .* - .*   \[.*\]
+        --------------------------------------------------------------------------------
+        """
+    ).strip()
+)
+
+# the definitions that follow the Q and A part
+definitions_pattern = re.compile(
+    textwrap.dedent(
+        r"""
+        --------------------------------------------------------------------------------
+        Definitions
+        --------------------------------------------------------------------------------
+        """
+    ).strip()
+)
+
+# speaker line of one Q and A statement. The Q and A text is split at
+# "\n\n----------", which consumes the first 10 dashes of the upper rule line,
+# so that line is 10 dashes shorter here than the lower one.
+qanda_speaker_pattern = re.compile(
+    textwrap.dedent(
+        r"""
+        ----------------------------------------------------------------------
+        (.*),? .*  \[.*\]
+        --------------------------------------------------------------------------------
+        (.*)
+        """
+    ).strip()
+)
+
+# fiscal quarter and year, e.g., "Q1 2013"
+fiscal_period_pattern = re.compile(r"Q([1-4]{1}) ([0-9]{4})")
+
+# date and time of the call, e.g., "04/17/2013 08:30 AM GMT"
+# NOTE(review): the time zone is only matched by its first letter and is not
+# part of the captured time; the column description at the top of Part A shows
+# "05:00 PM GMT" -- confirm whether the zone should be written as well.
+date_time_pattern = re.compile(
+    r"([0-9]{2}/[0-9]{2}/[0-9]{4}) ([0-9]{2}:[0-9]{2} [A-Z]{2}) [A-Z]"
+)
+
+# technical remarks like "(inaudible)" or "(technical difficulty)".
+# Non-greedy on purpose: a greedy ".*" would also delete all the text between
+# the first "(" and the last ")" of a line.
+remark_pattern = re.compile(r"\(.*?\)")
+
+
+# iterate all earnings conference calls
+# index 0 is skipped -- presumably the header line of the overview file
+for i in range(1, len(list_earnings_calls)):
+
+    # a list of manager names for part b)
+    manager_name_list = []
+
+    # get the ID and the filename of each earnings call
+    call_information = list_earnings_calls[i].split(";")
+    call_id = call_information[0]
+    filename = call_information[1]
+
+    # read the call transcript; the file is closed again right after reading
+    with open(
+        directory + "Problem_1_Sample/" + filename, "r", encoding="utf-8"
+    ) as call_file:
+        call_text = call_file.read()
+
+    # Get information on the call
+    # For example:
+    # Q1 2013 Bank of America Corporation Earnings Conference Call
+    # 04/17/2013 08:30 AM GMT
+
+    # the header ends where the list of corporate participants starts
+    match_corporates = corporates_pattern.search(call_text)
+    if not match_corporates:  # sanity check to verify no record ends up here
+        raise RuntimeError(
+            "could not split: no 'Corporate Participants' section in " + filename
+        )
+    header_text = call_text[: match_corporates.start()].strip()
+
+    # get the fiscal quarter and year from the header text
+    match_fiscal_quarter = fiscal_period_pattern.search(header_text)
+    if not match_fiscal_quarter:  # sanity check
+        raise RuntimeError(
+            "could not split: no fiscal quarter/year in the header of " + filename
+        )
+    fiscal_quarter = match_fiscal_quarter.group(1)
+    fiscal_year = match_fiscal_quarter.group(2)
+
+    # get date and time of the call
+    match_date = date_time_pattern.search(header_text)
+    if not match_date:  # sanity check
+        raise RuntimeError(
+            "could not split: no call date/time in the header of " + filename
+        )
+    date = match_date.group(1)
+    # the transcript shows the date as MM/DD/YYYY (see the example above);
+    # the output file needs YYYYMMDD, so the parts are rearranged
+    year = date[-4:]
+    month = date[:2]
+    day = date[3:5]
+    date_formatted = year + month + day
+    call_time = match_date.group(2)
+
+    # count the number of analysts
+    # the relevant text part starts with, for example,
+    # ================================================================================
+    # Conference Call Participiants
+    # ================================================================================
+    #
+    # * Chris Mutascio
+    #   Keefe, Bruyette & Woods - Analyst
+    # * Thomas Laturneau
+    #   FBR - Analyst
+
+    # and ends with the beginning of the presentation
+    # ================================================================================
+    # Presentation
+    # --------------------------------------------------------------------------------
+    match_participants = participants_pattern.search(call_text)
+    match_presentation = presentation_pattern.search(call_text)
+    # both boundaries are needed to get the text in between
+    if not (match_participants and match_presentation):  # sanity check
+        raise RuntimeError(
+            "could not split: participant list or presentation not found in "
+            + filename
+        )
+    participants_text = call_text[
+        match_participants.end() : match_presentation.start()
+    ]
+
+    # split the text of the participants such that each element refers to one
+    # analyst; name and affiliation are joined with ", ". Empty elements
+    # (e.g. the text before the first " * ") are removed.
+    analyst_list = [
+        entry
+        for entry in (
+            x.replace("\n   ", ", ").strip() for x in participants_text.split(" * ")
+        )
+        if entry != ""
+    ]
+    # the number of analysts is the length of the analyst list
+    number_analysts = len(analyst_list)
+
+    # get the names of the corporate participants and their positions
+    # the corporate participants are listed between their own heading and the
+    # list of non-corporate participants
+    corporates_text = call_text[match_corporates.end() : match_participants.start()]
+    # like before, one element refers to one corporate participant
+    corporates_list = [
+        entry
+        for entry in (
+            x.replace("\n   ", ", ").strip() for x in corporates_text.split(" * ")
+        )
+        if entry != ""
+    ]
+
+    # collect the call information for the output file
+    csv_fields = [
+        str(call_id),
+        filename,
+        fiscal_quarter,
+        fiscal_year,
+        date_formatted,
+        call_time,
+        str(number_analysts),
+    ]
+
+    # now, add the information on the corporate participants
+    for manager_entry in corporates_list:
+        # an entry looks like "<name>, <company> - <position>":
+        # the name is the text before the first ", ",
+        # the position is the text after the last " - "
+        manager_name = manager_entry.split(", ")[0]
+        manager_position = manager_entry.split(" - ")[-1]
+
+        # for part b) of the problem it is helpful to have a list of all
+        # manager names. With this list, we can identify whether a statement
+        # comes from a manager (-> answer) or from an analyst (-> question)
+        manager_name_list.append(manager_name)
+
+        csv_fields.append(manager_name)
+        csv_fields.append(manager_position)
+
+    output_csv_file.write(";".join(csv_fields) + "\n")
+
+    print("For earnings call " + str(i) + " part a) has been completed.")
+
+    # =========================================================================
+    # Part B: Extracting the Call Segments
+    # =========================================================================
+
+    # identify the presentation
+    # its beginning has already been identified in part a);
+    # it ends where the Q and A part begins
+    # ================================================================================
+    # Questions and Answers
+    # --------------------------------------------------------------------------------
+    match_qanda = qanda_pattern.search(call_text)
+    if not match_qanda:  # sanity check
+        raise RuntimeError(
+            "could not split: no 'Questions and Answers' section in " + filename
+        )
+    presentation_text = call_text[match_presentation.end() : match_qanda.start()]
+
+    # drop operator statements
+    match_operator = operator_pattern.search(presentation_text)
+    while match_operator:
+        # the operator statement ends at the next rule line; search for it
+        # only after the beginning of the operator statement
+        match_next_rule = rule_pattern.search(presentation_text, match_operator.end())
+        if match_next_rule:
+            operator_end = match_next_rule.start()
+        else:
+            # no further speaker follows -> the statement runs to the end
+            operator_end = len(presentation_text)
+
+        # keep the text before the operator statement and the text after
+        presentation_text = (
+            presentation_text[: match_operator.start()]
+            + presentation_text[operator_end:]
+        )
+
+        # check whether there is another operator statement
+        match_operator = operator_pattern.search(presentation_text)
+
+    # sometimes there are technical remarks like "(inaudible)",
+    # "(corrected by company after the call)", or "(technical difficulty)"
+    # -> drop those
+    presentation_text = remark_pattern.sub("", presentation_text)
+
+    # drop the information on the speakers but NOT the text of the speakers
+    match_speaker = presentation_speaker_pattern.search(presentation_text)
+    while match_speaker:
+        presentation_text = (
+            presentation_text[: match_speaker.start()]
+            + presentation_text[match_speaker.end() :].strip()
+        )
+        # check whether there is another speaker name
+        match_speaker = presentation_speaker_pattern.search(presentation_text)
+
+    # common beginning of the three output filenames of this call
+    # make sure that the folder "Problem_1_Conference_Call_Segments" exists.
+    segment_path = directory + "Problem_1_Conference_Call_Segments/call_" + str(call_id)
+
+    # write the text of the presentation to an output file
+    with open(
+        segment_path + "_presentation.txt", "w", encoding="utf-8"
+    ) as output_file_presentation:
+        output_file_presentation.write(presentation_text)
+
+    # -------------------------------------------------------------------------
+    # identify questions and answers
+    # -------------------------------------------------------------------------
+    # the Q and A part starts right after its heading
+    qanda_text = call_text[match_qanda.end() :]
+
+    # the earnings call transcript ends with definitions
+    # -> keep the text before the definitions
+    match_definitions = definitions_pattern.search(qanda_text)
+    if not match_definitions:  # sanity check
+        raise RuntimeError("could not split: no 'Definitions' section in " + filename)
+    qanda_text = qanda_text[: match_definitions.start()]
+
+    # split the Q and A part by speaker
+    # the split consumes the first 10 dashes of each upper rule line
+    qanda_list = re.split("\n\n----------", qanda_text)
+
+    # the formatted answers and questions of this call, in order of appearance
+    answer_parts = []
+    question_parts = []
+
+    # go over all statements obtained from the previous split and decide
+    # whether the speaker is an analyst (-> question)
+    # or a corporate participant (-> answer)
+    for speaker_text_part in qanda_list:
+
+        # identify the speaker name. For example
+        # --------------------------------------------------------------------------------
+        # Bruce Thompson,  Bank of America Corporation - CFO    [3]
+        # --------------------------------------------------------------------------------
+        #
+        match_speaker = qanda_speaker_pattern.search(speaker_text_part)
+        if not match_speaker:  # sanity check
+            raise RuntimeError(
+                "could not split: no speaker line in a Q and A statement of "
+                + filename
+            )
+
+        # in the example above, we need to get "Bruce Thompson"
+        speaker_name = match_speaker.group(1).strip()
+        # the greedy first group may still contain company and position
+        # -> keep only the text before the first comma
+        if "," in speaker_name:
+            speaker_name = speaker_name.split(",")[0]
+
+        # drop operator statements
+        if "Operator" in speaker_name:
+            continue
+
+        # the statement of the speaker is the text after the last rule line
+        text = speaker_text_part.split(rule_line)[-1].strip()
+
+        # drop technical remarks like "(inaudible)"
+        text = remark_pattern.sub("", text)
+
+        # the numbering and layout follow problem 2's structure
+        if speaker_name in manager_name_list:
+            # the speaker is a corporate participant -> management answer
+            answer_parts.append(
+                "Answer_" + str(len(answer_parts) + 1) + ":\n\n    " + text + "\n\n"
+            )
+        else:
+            # operators were skipped above -> it is an analyst question
+            question_parts.append(
+                "Question_"
+                + str(len(question_parts) + 1)
+                + ":\n\n    "
+                + text
+                + "\n\n"
+            )
+
+    # write the texts to output files
+    # make sure that the subfolder exists.
+    with open(
+        segment_path + "_answers.txt", "w", encoding="utf-8"
+    ) as output_file_answers:
+        output_file_answers.write("".join(answer_parts))
+    with open(
+        segment_path + "_questions.txt", "w", encoding="utf-8"
+    ) as output_file_questions:
+        output_file_questions.write("".join(question_parts))
+
+
+# Close the overview file and the output CSV file
+for open_file in (overview_file, output_csv_file):
+    open_file.close()
+
+print("Problem 1 completed.")