Add the files for the take-home exam
This commit is contained in:
parent
a37c87d9c8
commit
0d654bda9d
248 changed files with 102406 additions and 0 deletions
371
exam/original-files/Problem_1_template.py
Normal file
371
exam/original-files/Problem_1_template.py
Normal file
|
|
@ -0,0 +1,371 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Sat Jul 30 15:20:32 2022
|
||||
|
||||
@author: Alexander Hillert, Goethe University
|
||||
"""
|
||||
|
||||
# import packages
|
||||
import re
|
||||
|
||||
# define working directory
|
||||
# adjust it to your computer
|
||||
directory = "YOUR DIRECTORY"
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Part A: Creating an Overview File on the Call Participants
|
||||
# =============================================================================
|
||||
|
||||
# Create output file
|
||||
output_csv_file=open(directory+'Problem_1_Overview_Calls.csv','w',encoding="utf-8")
|
||||
# Write variable names to the first line of the output file
|
||||
# 1) Call-ID
|
||||
# 2) Filename
|
||||
# 3) Fiscal Quarter
|
||||
# 4) Fiscal Year
|
||||
# 5) Date of the call in the format YYYYMMDD
|
||||
# 6) Time of the call, e.g., 05:00 PM GMT
|
||||
# 7) number of non-corporate call participants
|
||||
# 8) the names of all corporate participants and their positions -> each item
|
||||
# should be written in a seperate column
|
||||
output_csv_file.write('ID;Filename;Fiscal_Quarter;Fiscal_Year;Date;Time;\
|
||||
#Analysts')
|
||||
# There can be up to 4 corporate particiapnts
|
||||
for i in range(1,5):
|
||||
output_csv_file.write(';Name_'+str(i)+';Position_'+str(i))
|
||||
output_csv_file.write('\n')
|
||||
|
||||
# Open the overfiew file "Overview_File_Problem_1.csv" to call the earnings calls
|
||||
overview_file=open(directory+'Overview_File_Problem_1.csv','r',encoding="utf-8")
|
||||
overview_text=overview_file.read()
|
||||
list_earnings_calls=overview_text.split("\n")
|
||||
# The last line is empty -> drop it
|
||||
while list_earnings_calls.count("")>0:
|
||||
list_earnings_calls.remove("")
|
||||
|
||||
|
||||
|
||||
# iterate all earnings conference calls
|
||||
for i in range(1, len(list_earnings_calls)):
|
||||
|
||||
# reset the variables
|
||||
fiscal_quarter=""
|
||||
fiscal_year=""
|
||||
date=""
|
||||
time=""
|
||||
|
||||
# we split the entire transcripts into three parts
|
||||
# its header
|
||||
header_text=""
|
||||
# the list of non-corporate participants
|
||||
participants_text=""
|
||||
# the list of corporate participants
|
||||
corporates_text=""
|
||||
|
||||
# the number of analysts joining the call
|
||||
number_analysts=0
|
||||
|
||||
# variables for manager name and position
|
||||
manager_name=""
|
||||
manager_position=""
|
||||
manager_position_edited=""
|
||||
|
||||
# a list of manager names for part b)
|
||||
manager_name_list=[]
|
||||
|
||||
# get the filename of each earnings call
|
||||
call_information=list_earnings_calls[i].split(";")
|
||||
call_id=call_information[0]
|
||||
filename=call_information[1]
|
||||
|
||||
# open the call transcript
|
||||
call_file=open(directory+'Problem_1_Sample/'+filename,'r',encoding="utf-8")
|
||||
call_text=call_file.read()
|
||||
|
||||
# Get information on the call
|
||||
# FOr example:
|
||||
# Q1 2013 Bank of America Corporation Earnings Conference Call
|
||||
# 04/17/2013 08:30 AM GMT
|
||||
|
||||
# the header ends where the list of corporate particpants starts
|
||||
match_corporates=re.search(TO BE COMPLETED,call_text)
|
||||
if match_corporates:
|
||||
header_text=call_text[TO BE COMPLETED]
|
||||
|
||||
|
||||
# get the fiscal quarter and year from the header text
|
||||
match_fiscal_quarter=re.search(TO BE COMPLETED,header_text)
|
||||
if match_fiscal_quarter:
|
||||
fiscal_quarter=match_fiscal_quarter.group(0)
|
||||
match_fiscal_year=re.search(TO BE COMPLETED,header_text)
|
||||
if match_fiscal_year:
|
||||
fiscal_year=match_fiscal_year.group(0)
|
||||
|
||||
# get date and time of the call
|
||||
# date
|
||||
match_date=re.search(TO BE COMPLETED,header_text)
|
||||
if match_date:
|
||||
date=match_date.group(0)
|
||||
# the date in the output file should be formatted as YYYYMMDD
|
||||
# so, you need to rearrange the date text
|
||||
year=date[TO BE COMPLETED]
|
||||
month=date[TO BE COMPLETED]
|
||||
day=date[TO BE COMPLETED]
|
||||
date_formatted=year+month+day
|
||||
# time
|
||||
match_time=re.search(TO BE COMPLETED,header_text)
|
||||
if match_time:
|
||||
time=match_time.group(0)
|
||||
|
||||
|
||||
# count the number of analysts
|
||||
# the relevant text part starts with, for example,
|
||||
# ================================================================================
|
||||
# Conference Call Participiants
|
||||
# ================================================================================
|
||||
#
|
||||
# * Chris Mutascio
|
||||
# Keefe, Bruyette & Woods - Analyst
|
||||
# * Thomas Laturneau
|
||||
# FBR - Analyst
|
||||
|
||||
# and ends with the beginning of the presentation
|
||||
# ================================================================================
|
||||
# Presentation
|
||||
# --------------------------------------------------------------------------------
|
||||
|
||||
match_participants=re.search(TO BE COMPLETED,call_text)
|
||||
match_presentation=re.search(TO BE COMPLETED,call_text)
|
||||
# if you find both boundaries
|
||||
if match_participants and match_presentation:
|
||||
# get the text in between
|
||||
participants_text=call_text[TO BE COMPLETED]
|
||||
|
||||
# split the text of the participants that you have just identified
|
||||
# in a way that each element refers to one analyst.
|
||||
analyst_list=participants_text.split(TO BE COMPLETED)
|
||||
# depending on how you split, you might need re.split()
|
||||
|
||||
# check whether you get empty elements and/or elements that do not refer
|
||||
# to analysts -> remove them
|
||||
while TO BE COMPLETED>0:
|
||||
TO BE COMPLETED
|
||||
|
||||
# after these steps and checks, the number of analysts is the length of your analyst list
|
||||
number_analysts=TO BE COMPLETED
|
||||
|
||||
|
||||
# get the names of the corporate participants and their position
|
||||
# remember that you already have the beginning of corporate participants
|
||||
# see above at around line 90
|
||||
# the corporate participants come before the list of non-corporate participants
|
||||
corporates_text=call_text[TO BE COMPLETED]
|
||||
# like before, split this text such that one element refers to one corporate participant
|
||||
corporates_list=corporates_text.split(TO BE COMPLETED)
|
||||
# check whether you get empty elements and/or elements that do not refer
|
||||
# to corporate participants -> remove them
|
||||
while TO BE COMPLETED>0:
|
||||
TO BE COMPLETED
|
||||
|
||||
|
||||
# write the call information to the output file
|
||||
output_csv_file.write(str(call_id)+";"+filename+";"+fiscal_quarter+";"+fiscal_year+";"\
|
||||
+date_formatted+";"+time+";"+str(number_analysts))
|
||||
|
||||
# now, we need to add the information on the corporate participants
|
||||
# go over all corporate participants
|
||||
for j in range(len(corporates_list)):
|
||||
# depending on how you split the text of corporate participants,
|
||||
# one element of your list could contain the name of the mangager in the first
|
||||
# line and their position in the second line.
|
||||
# ADJUST THE FOLLOWING COMMANDS IF YOU USED A DIFFERENT SPLIT.
|
||||
|
||||
# split each element of the list of corporate participants further
|
||||
# into name and position
|
||||
manager_entry=corporates_list[j]
|
||||
manager_entry_parts=manager_entry.split(TO BE COMPLETED)
|
||||
manager_name=manager_entry_parts[TO BE COMPLETED]
|
||||
|
||||
# for part b) of the problem it is helpful to have a list of all
|
||||
# manager names. With this list, we can identify whether a statement
|
||||
# comes from a managers (-> answer) or from an analyst (-> question)
|
||||
manager_name_list.append(manager_name)
|
||||
|
||||
|
||||
manager_position=manager_entry_parts[TO BE COMPLETED]
|
||||
# Like before, the template assumes a very specific type of split here
|
||||
# So depending on your approach, you might need to change the commands below.
|
||||
# the position is just the text part after " - "
|
||||
# For example
|
||||
# Bank of America Corporation - CEO
|
||||
# the position is "CEO"
|
||||
manager_position_edited=re.TO BE COMPLETED
|
||||
|
||||
# write the manager names and positions to the output file
|
||||
output_csv_file.write(";"+manager_name+";"+manager_position_edited)
|
||||
|
||||
output_csv_file.write("\n")
|
||||
|
||||
|
||||
print("For earnings call "+str(i)+" part a) has been completed.")
|
||||
|
||||
# =========================================================================
|
||||
# Part B: Extracting the Call Segments
|
||||
# =========================================================================
|
||||
|
||||
# set variables
|
||||
presentation_text=""
|
||||
qanda_text=""
|
||||
qanda_list=[]
|
||||
question_text=""
|
||||
answer_text=""
|
||||
|
||||
|
||||
# identify the presentation
|
||||
# the begin of the presentation has already been identified above
|
||||
# see at around line 140
|
||||
#
|
||||
# the presentation ends where the Q and A part begins
|
||||
# ================================================================================
|
||||
# Questions and Answers
|
||||
# --------------------------------------------------------------------------------
|
||||
match_qanda=re.search(TO BE COMPLETED,call_text)
|
||||
presentation_text=call_text[TO BE COMPLETED]
|
||||
|
||||
# drop operator statements
|
||||
# search for the beginning of an operator statement
|
||||
match_operator=re.search(TO BE COMPLETED,presentation_text)
|
||||
while match_operator:
|
||||
match_operator_start=match_operator.start()
|
||||
# search for the end of the operator statement
|
||||
# Hint: search only after the beginning of the operator statement
|
||||
# Hint 2: remember to keep track of your coordinates (.start() and .end())
|
||||
match_operator_end=re.search(TO BE COMPLETED,TO BE COMPLETED)
|
||||
|
||||
# keep the text before the operator statement and the text after
|
||||
# the approach is similar to removing tables (see Problem 4 and 5 from class)
|
||||
presentation_text=presentation_text[TO BE COMPLETED]
|
||||
|
||||
# check whether there is another match
|
||||
match_operator=re.search(TO BE COMPLETED,presentation_text)
|
||||
|
||||
# sometimes there are technical remarks like "(inaudible)", "(corrected by company after the call)",
|
||||
# or "(technical difficulty)" -> drop those
|
||||
TO BE COMPLETED
|
||||
# there are several ways to approach this editing step (e.g., re.sub())
|
||||
|
||||
|
||||
# drop information on the speakers, e.g.,
|
||||
# -------------------------------------------------------------------------
|
||||
# Deborah Crawford, Facebook, Inc. - Director of IR [2]
|
||||
# -------------------------------------------------------------------------
|
||||
match_speaker=re.search(TO BE COMPLETED,presentation_text)
|
||||
while match_speaker:
|
||||
# the task is similar to the Operator statement but be careful
|
||||
# to only remove the speaker name but NOT the text of the speaker.
|
||||
presentation_text=presentation_text TO BE COMPLETED
|
||||
# check whether there is another speaker name
|
||||
match_speaker=re.search(TO BE COMPLETED,presentation_text)
|
||||
|
||||
|
||||
# write the text of the presentation to an output file
|
||||
# make sure that the folder "Problem_1_Conference_Call_Segments" exists.
|
||||
output_file_presentation=open(directory+'Problem_1_Conference_Call_Segments/call_'+str(call_id)+'_presentation.txt',"w",encoding='utf-8')
|
||||
output_file_presentation.write(presentation_text)
|
||||
|
||||
# Close file
|
||||
output_file_presentation.close()
|
||||
|
||||
|
||||
# -------------------------------------------------------------------------
|
||||
# identify questions and answers
|
||||
# -------------------------------------------------------------------------
|
||||
# you already have the start of the Q&A section (see at around lines 235)
|
||||
qanda_text=call_text[match_qanda.end():]
|
||||
|
||||
# the earnings call transcript ends with definitions
|
||||
# remove these/keep the text before the definitions
|
||||
match_definitions=re.search("\n-{1,}\nDefinitions\n-{1,}\n",qanda_text)
|
||||
if match_definitions:
|
||||
# keep the text before
|
||||
qanda_text=qanda_text[TO BE COMPLETED]
|
||||
|
||||
# split the Q and A part by speaker
|
||||
qanda_list=re.split(TO BE COMPLETED,qanda_text)
|
||||
|
||||
# variables to count the number of answers
|
||||
answer_counter=1
|
||||
# and questions
|
||||
question_counter=1
|
||||
|
||||
# go over all speakers/statements that you obtained from the previous split
|
||||
# you now have to decide whether the speaker is an analyst (-> question)
|
||||
# or a corporate participant (-> answer)
|
||||
for k in range(TO BE COMPLETED):
|
||||
|
||||
# identify the speaker name to check whether it is a corporate participant.
|
||||
# For example
|
||||
# --------------------------------------------------------------------------------
|
||||
# Bruce Thompson, Bank of America Corporation - CFO [3]
|
||||
# --------------------------------------------------------------------------------
|
||||
#
|
||||
speaker_text_part=qanda_list[k]
|
||||
# split the text part of the kth speaker
|
||||
# into his*her name and the rest
|
||||
# NOTE: re.search() and re.sub() are also nice ways to accomplish the goal
|
||||
speaker_text_sub_parts=re.split(TO BE COMPLETED,qanda_list[k])
|
||||
# get the name of the speaker from the previous split
|
||||
# in the example above, we need to get "Bruce Thompson"
|
||||
speaker_name=speaker_text_sub_parts[TO BE COMPLETED]
|
||||
# depending on your split, you might need some further editing to
|
||||
# get onyl the name ("Bruce Thompson") without any additional information.
|
||||
|
||||
|
||||
# the second part of speaker_text_sub_parts is (probably) the statement
|
||||
# of the speaker (again, it depends on your split)
|
||||
text=speaker_text_sub_parts[TO BE COMPLETED]
|
||||
|
||||
# sometimes there are technical remarks like "(inaudible)", "(corrected by company after the call)",
|
||||
# or "(technical difficulty)" -> drop those
|
||||
text=TO BE COMPLETED
|
||||
# there are several ways to approach this editing step (e.g., re.sub())
|
||||
|
||||
# check whether the speaker name is in the manager list from part a) (see at around line 195)
|
||||
if speaker_name in manager_name_list:
|
||||
# the name of the speaker is in the list of corporate participants
|
||||
# -> it is a management answer
|
||||
|
||||
answer_text=answer_text+"Answer_"+str(answer_counter)+":\n"+text+"\n"
|
||||
answer_counter=answer_counter+1
|
||||
|
||||
else:
|
||||
# it is either an analyst question or an operator statement
|
||||
# be careful to check the condition below. depending on how your
|
||||
# speaker names look like, you may need .count() and/or re.search() instead of .startswith()
|
||||
if speaker_name.startswith("Operator") or TO BE COMPLETED:
|
||||
pass
|
||||
|
||||
else:
|
||||
# it is an analyst question
|
||||
question_text=question_text+"Question_"+str(question_counter)+":\n"+text+"\n"
|
||||
question_counter=question_counter+1
|
||||
|
||||
# write the texts to output files
|
||||
# make sure that the subfolder exists.
|
||||
output_file_answers=open(directory+'Problem_1_Conference_Call_Segments/call_'+str(call_id)+'_answers.txt',"w",encoding='utf-8')
|
||||
output_file_questions=open(directory+'Problem_1_Conference_Call_Segments/call_'+str(call_id)+'_questions.txt',"w",encoding='utf-8')
|
||||
output_file_answers.write(answer_text)
|
||||
output_file_questions.write(question_text)
|
||||
|
||||
# Close files
|
||||
output_file_answers.close()
|
||||
output_file_questions.close()
|
||||
call_file.close()
|
||||
|
||||
|
||||
# Close files
|
||||
overview_file.close()
|
||||
output_csv_file.close()
|
||||
|
||||
print("Problem 1 completed.")
|
||||
Loading…
Add table
Add a link
Reference in a new issue