# -*- coding: utf-8 -*-
"""
Created on Sat Jul 30 15:20:32 2022

@author: Alexander Hillert, Goethe University
"""

# import packages
import re
import textwrap

# Working directory that holds the input files and receives the output.
# Adjust it to your computer.
directory = "/home/alexander/repos/whu-textual-analysis/exam/part2_problem1/"


# =============================================================================
# Part A: Creating an Overview File on the Call Participants
# =============================================================================

# Create the output file. The handle stays open because the per-call loop
# further down appends one row per earnings call; it is closed at the very
# end of the script.
output_csv_file = open(
    directory + "Problem_1_Overview_Calls.csv", "w", encoding="utf-8"
)

# Column names written to the first line of the output file:
# 1) Call-ID
# 2) Filename
# 3) Fiscal Quarter
# 4) Fiscal Year
# 5) Date of the call in the format YYYYMMDD
# 6) Time of the call, e.g., 05:00 PM GMT
# 7) Number of non-corporate call participants
# 8) The names of all corporate participants and their positions -> each item
#    is written to a separate column
header_line = "ID;Filename;Fiscal_Quarter;Fiscal_Year;Date;Time;Analysts"
# There can be up to 4 corporate participants, i.e. 4 name/position pairs.
for i in range(1, 5):
    header_line = header_line + ";Name_" + str(i) + ";Position_" + str(i)
output_csv_file.write(header_line + "\n")

# Open the overview file "Overview_File_Problem_1.csv"; each of its lines
# describes one earnings call.
overview_file = open(directory + "Overview_File_Problem_1.csv", "r", encoding="utf-8")
overview_text = overview_file.read()
# Splitting on "\n" leaves empty strings behind (e.g. for the final line
# break) -> keep only the non-empty lines.
list_earnings_calls = [line for line in overview_text.split("\n") if line != ""]


# -----------------------------------------------------------------------------
# Layout markers of the transcripts
# -----------------------------------------------------------------------------
# The rule lines are matched literally against the transcript text, so they are
# spelled out verbatim. All patterns are loop-invariant and therefore built
# once here instead of once per earnings call.
RULE_DOUBLE = "================================================================================"
RULE_SINGLE = "--------------------------------------------------------------------------------"
# Shorter rule used for the speaker heading of a Q&A statement: the Q&A text is
# split on "\n\n----------", and that split consumes the first dashes of the
# rule that opens each statement.
RULE_SINGLE_CUT = "----------------------------------------------------------------------"

# Section headings (plain text; they contain no regex metacharacters).
CORPORATES_HEADING = re.compile(
    RULE_DOUBLE + "\nCorporate Participants\n" + RULE_DOUBLE
)
# NOTE: "Participiants" is the spelling the pattern has always searched for;
# it is runtime text and must not be "corrected".
PARTICIPANTS_HEADING = re.compile(
    RULE_DOUBLE + "\nConference Call Participiants\n" + RULE_DOUBLE
)
PRESENTATION_HEADING = re.compile(RULE_DOUBLE + "\nPresentation")
QANDA_HEADING = re.compile(RULE_DOUBLE + "\nQuestions and Answers")
DEFINITIONS_HEADING = re.compile(RULE_SINGLE + "\nDefinitions\n" + RULE_SINGLE)

# Speaker headings. Raw strings keep "\[" from being read as an (invalid)
# string escape; the resulting patterns are unchanged.
OPERATOR_HEADING = re.compile(
    RULE_SINGLE + "\n" + r"Operator \[.*\]" + "\n" + RULE_SINGLE
)
SPEAKER_HEADING = re.compile(
    RULE_SINGLE + "\n" + r".*, .* - .* \[.*\]" + "\n" + RULE_SINGLE
)
QANDA_SPEAKER = re.compile(
    RULE_SINGLE_CUT + "\n" + r"(.*),? .* \[.*\]" + "\n" + RULE_SINGLE + "\n" + r"(.*)"
)

# Header details, e.g. "Q1 2013 ..." and "04/17/2013 08:30 AM GMT".
FISCAL_PERIOD = re.compile(r"Q([1-4]{1}) ([0-9]{4})")
CALL_DATE_TIME = re.compile(
    r"([0-9]{2}/[0-9]{2}/[0-9]{4}) ([0-9]{2}:[0-9]{2} [A-Z]{2}) [A-Z]"
)

# Technical remarks such as "(inaudible)" or "(technical difficulty)".
# Non-greedy on purpose: the greedy form "\(.*\)" removes everything from the
# first "(" to the last ")" of a line, i.e. also the spoken text between two
# remarks.
REMARK = re.compile(r"\(.*?\)")


def _require(match):
    """Return *match*; raise RuntimeError if a mandatory marker was not found."""
    if not match:
        # sanity check to verify no record ends up here
        raise RuntimeError("could not split")
    return match


def _split_people(section_text):
    """Split a participant section into one entry per person.

    The section is cut at every " * "; inside an entry each "\\n " is turned
    into ", ". Entries that are empty after stripping are dropped.
    """
    entries = (x.replace("\n ", ", ").strip() for x in section_text.split(" * "))
    return [entry for entry in entries if entry != ""]


def _drop_operator_statements(text):
    """Remove every operator heading together with the statement below it.

    A statement runs from its heading up to (not including) the next rule
    line. If no rule line follows, the statement runs to the end of *text*;
    previously that case crashed with an AttributeError on None.
    """
    match_operator = OPERATOR_HEADING.search(text)
    while match_operator:
        next_rule = text.find(RULE_SINGLE, match_operator.end())
        if next_rule == -1:
            text = text[: match_operator.start()]
        else:
            text = text[: match_operator.start()] + text[next_rule:]
        # check whether there is another operator statement
        match_operator = OPERATOR_HEADING.search(text)
    return text


def _drop_speaker_headings(text):
    """Remove the three-line speaker headings but keep what the speaker said."""
    match_speaker = SPEAKER_HEADING.search(text)
    while match_speaker:
        text = text[: match_speaker.start()] + text[match_speaker.end() :].strip()
        # check whether there is another speaker heading
        match_speaker = SPEAKER_HEADING.search(text)
    return text


# iterate all earnings conference calls
# (index 0 is skipped -- presumably the header row of the overview file)
for i in range(1, len(list_earnings_calls)):

    # get the ID and the filename of the earnings call
    call_information = list_earnings_calls[i].split(";")
    call_id = call_information[0]
    filename = call_information[1]

    # read the call transcript; "with" closes the file even if one of the
    # sanity checks below aborts the script
    with open(
        directory + "Problem_1_Sample/" + filename, "r", encoding="utf-8"
    ) as call_file:
        call_text = call_file.read()

    # -------------------------------------------------------------------------
    # Part A: one overview row per call
    # -------------------------------------------------------------------------

    # The header holds the information on the call, for example:
    #   Q1 2013 Bank of America Corporation Earnings Conference Call
    #   04/17/2013 08:30 AM GMT
    # It ends where the list of corporate participants starts.
    match_corporates = _require(CORPORATES_HEADING.search(call_text))
    header_text = call_text[: match_corporates.start()].strip()

    # fiscal quarter and fiscal year
    match_fiscal_quarter = _require(FISCAL_PERIOD.search(header_text))
    fiscal_quarter = match_fiscal_quarter.group(1)
    fiscal_year = match_fiscal_quarter.group(2)

    # date and time of the call
    match_date = _require(CALL_DATE_TIME.search(header_text))
    date = match_date.group(1)
    # The transcript writes the date as MM/DD/YYYY (see the example above:
    # 04/17/2013), while the output file needs YYYYMMDD. The earlier slicing
    # swapped month and day and therefore produced YYYYDDMM.
    month, day, year = date.split("/")
    date_formatted = year + month + day
    # NOTE(review): the column description at the top of the script gives
    # "05:00 PM GMT" as an example, but only the "08:30 AM" part is captured
    # here (the time zone is matched, not kept) -- confirm which one is wanted.
    time = match_date.group(2)

    # The non-corporate participants are listed between the heading
    #   ====...====
    #   Conference Call Participiants
    #   ====...====
    # and the beginning of the presentation
    #   ====...====
    #   Presentation
    match_participants = PARTICIPANTS_HEADING.search(call_text)
    match_presentation = PRESENTATION_HEADING.search(call_text)
    if not (match_participants and match_presentation):
        # sanity check to verify no record ends up here
        raise RuntimeError("could not split")
    participants_text = call_text[
        match_participants.end() : match_presentation.start()
    ]

    # one list element per analyst -> the length is the number of analysts
    analyst_list = _split_people(participants_text)
    number_analysts = len(analyst_list)

    # the corporate participants are listed between their own heading and the
    # heading of the non-corporate participants
    corporates_text = call_text[match_corporates.end() : match_participants.start()]
    corporates_list = _split_people(corporates_text)

    # assemble the output row: call information first ...
    csv_row = ";".join(
        [
            str(call_id),
            filename,
            fiscal_quarter,
            fiscal_year,
            date_formatted,
            time,
            str(number_analysts),
        ]
    )

    # ... followed by name and position of every corporate participant.
    # The names are collected as well: Part B uses them to tell management
    # answers from analyst questions.
    manager_name_list = []
    for manager_entry in corporates_list:
        # the name is the text before the first ", ",
        # the position is the text after the last " - "
        manager_name = manager_entry.split(", ")[0]
        manager_position = manager_entry.split(" - ")[-1]
        manager_name_list.append(manager_name)
        csv_row = csv_row + ";" + manager_name + ";" + manager_position

    output_csv_file.write(csv_row + "\n")

    print("For earnings call " + str(i) + " part a) has been completed.")

    # =========================================================================
    # Part B: Extracting the Call Segments
    # =========================================================================

    # The presentation starts after its heading (found in Part A) and ends
    # where the Q and A part begins:
    #   ====...====
    #   Questions and Answers
    # The marker is now checked like all the others instead of failing with
    # an AttributeError on None.
    match_qanda = _require(QANDA_HEADING.search(call_text))
    presentation_text = call_text[match_presentation.end() : match_qanda.start()]

    # clean-up, in this order: operator statements, technical remarks,
    # speaker headings
    presentation_text = _drop_operator_statements(presentation_text)
    presentation_text = REMARK.sub("", presentation_text)
    presentation_text = _drop_speaker_headings(presentation_text)

    # write the text of the presentation to an output file
    # make sure that the folder "Problem_1_Conference_Call_Segments" exists.
    segment_path = (
        directory + "Problem_1_Conference_Call_Segments/call_" + str(call_id)
    )
    with open(
        segment_path + "_presentation.txt", "w", encoding="utf-8"
    ) as output_file_presentation:
        output_file_presentation.write(presentation_text)

    # -------------------------------------------------------------------------
    # identify questions and answers
    # -------------------------------------------------------------------------
    qanda_text = call_text[match_qanda.end() :]

    # the earnings call transcript ends with definitions
    # -> keep only the text before them
    match_definitions = _require(DEFINITIONS_HEADING.search(qanda_text))
    qanda_text = qanda_text[: match_definitions.start()]

    # split the Q and A part by speaker; the split consumes the first dashes
    # of each heading rule, which is why QANDA_SPEAKER uses the shorter rule
    qanda_list = re.split("\n\n----------", qanda_text)

    # Collect the statements in lists and join them once at the end. The
    # running number of a statement is its position in the respective list.
    answer_parts = []
    question_parts = []

    # decide for every statement whether the speaker is an analyst
    # (-> question) or a corporate participant (-> answer)
    for speaker_text_part in qanda_list:

        # identify the speaker heading, for example
        # --------------------------------------------------------------------
        # Bruce Thompson, Bank of America Corporation - CFO [3]
        # --------------------------------------------------------------------
        match_speaker = _require(QANDA_SPEAKER.search(speaker_text_part))

        # in the example above we need "Bruce Thompson": the first group is
        # greedy and can still contain the company, so cut at the first comma
        speaker_name = match_speaker.group(1).strip()
        if "," in speaker_name:
            speaker_name = speaker_name.split(",")[0]

        # drop operator statements
        if "Operator" in speaker_name:
            continue

        # the statement itself is the text after the last rule line
        text = speaker_text_part.split(RULE_SINGLE)[-1].strip()
        # drop technical remarks like "(inaudible)"
        text = REMARK.sub("", text)

        if speaker_name in manager_name_list:
            # listed as corporate participant -> it is a management answer
            answer_parts.append(
                "Answer_" + str(len(answer_parts) + 1) + ":\n\n " + text + "\n\n"
            )  # adjusted to make it look like problem 2's structure
        else:
            # operators were skipped above -> it is an analyst question
            question_parts.append(
                "Question_" + str(len(question_parts) + 1) + ":\n\n " + text + "\n\n"
            )  # adjusted to make it look like problem 2's structure

    answer_text = "".join(answer_parts)
    question_text = "".join(question_parts)

    # write the texts to output files
    # make sure that the subfolder exists.
    with open(
        segment_path + "_answers.txt", "w", encoding="utf-8"
    ) as output_file_answers:
        output_file_answers.write(answer_text)
    with open(
        segment_path + "_questions.txt", "w", encoding="utf-8"
    ) as output_file_questions:
        output_file_questions.write(question_text)


# All calls are processed -> release the overview input file and the CSV
# output file that were kept open for the whole run.
for open_handle in (overview_file, output_csv_file):
    open_handle.close()

print("Problem 1 completed.")