162 lines
7.5 KiB
Python
162 lines
7.5 KiB
Python
|
# -*- coding: utf-8 -*-
|
||
|
"""
|
||
|
Created on Mon Mar 21 09:38:32 2022
|
||
|
|
||
|
@author: Alexander Hillert, Goethe University Frankfurt
|
||
|
"""
|
||
|
|
||
|
import pandas as pd
|
||
|
import numpy as np
|
||
|
from sklearn.metrics import mean_squared_error
|
||
|
from sklearn.linear_model import RidgeCV
|
||
|
from sklearn.linear_model import LassoCV
|
||
|
|
||
|
|
||
|
# Folder that contains the input data (and later receives the output file).
# Adjust this path to your own folder.
directory = "C:/Lehre/Machine Learning/Data/"

# Load the data for this problem (semicolon-separated CSV).
# NOTE: IT MIGHT TAKE 3 TO 5 MINUTES TO OPEN THE DATA
# Each row is one Form 10-K filing. The columns are the variables: after some
# identifying information come the word frequencies, i.e., how often a word
# (e.g., "the") shows up in a 10-K (e.g., 100 times).
data_frame = pd.read_csv(
    f"{directory}form_10-Ks_machine_learning_2007_2008_all_variables_v1.csv",
    sep=";",
)

# WARNING: THE DATA SET IS TOO LARGE TO BE DISPLAYED -> Variable Explorer
# and Console will crash.
# A small subset can be inspected safely instead: the frame below lists all
# columns (= variables) for the first three observations only.
# You can click on this variable in the Variable Explorer without Spyder
# crashing.
data_frame_example = data_frame.head(n=3)

# All column/variable names of the data, collected in one vector.
# This variable is also safe to open in the Variable Explorer.
data_frame_column_names = data_frame.columns
# Columns that are not word counts; they are removed before the regressions
# so that only the word frequencies remain as independent variables.
non_word_columns = [
    "cik",
    "year",
    "month",
    "link",
    "filing_type",
    "filing_date",
    "excess_ret_t0_t4",
    "number_of_words",
]

# ---- training data: filings from year 2007 ----
data_frame_train = data_frame[data_frame["year"].eq(2007)]
# Dependent variable: the cumulative abnormal return around the filing date.
# We follow Loughran and McDonald (2011) and use the CAR from t to t+4.
filing_car_train = data_frame_train["excess_ret_t0_t4"]
# Number of words in each 10-K, used below to scale the absolute word counts.
document_length_train = data_frame_train["number_of_words"]
# Keep only the word-count columns ...
data_frame_train = data_frame_train.drop(columns=non_word_columns)
# ... and turn absolute counts (e.g., "loss" is found 5 times) into relative
# frequencies by dividing each row by the length of its document, because
# 10-Ks can differ in length.
data_frame_train = data_frame_train.div(document_length_train, axis="index")

# ---- testing data: filings from year 2008 (same steps as above) ----
data_frame_test = data_frame[data_frame["year"].eq(2008)]
filing_car_test = data_frame_test["excess_ret_t0_t4"]
document_length_test = data_frame_test["number_of_words"]
data_frame_test = data_frame_test.drop(columns=non_word_columns)
data_frame_test = data_frame_test.div(document_length_test, axis="index")
# Standardize the data frames (subtract the mean, divide by the standard
# deviation, column by column).
#
# The scaling parameters are estimated on the TRAINING data only and are then
# applied to both samples. The regressions below are fitted on features that
# were scaled with the training moments, so the testing features must be put
# on exactly the same scale; scaling them with their own mean/sd would feed
# the fitted coefficients differently scaled inputs and would use information
# from the testing sample.
# NOTE: the training-side numbers are not affected by this; only the R2 and
# MSE reported for the testing data depend on how the testing data is scaled.

# training data
data_frame_train_mean = data_frame_train.mean(axis=0)
data_frame_train_sd = data_frame_train.std(axis=0, ddof=1)
data_frame_train_standardized = (data_frame_train - data_frame_train_mean) / data_frame_train_sd

# testing data
# The testing-sample moments are still computed so that they remain available
# for inspection, but they are NOT used for the scaling.
data_frame_test_mean = data_frame_test.mean(axis=0)
data_frame_test_sd = data_frame_test.std(axis=0, ddof=1)
data_frame_test_standardized = (data_frame_test - data_frame_train_mean) / data_frame_train_sd

# There can be missing values in the standardized variables.
# They arise if the word count for a specific word is always zero in the
# training data. In this case, the standard deviation is zero ->
# division by zero -> NaN (0/0).
# We replace these missing values by zero.
# training data
data_frame_train_standardized = data_frame_train_standardized.fillna(0)
# testing data
# A word that never varies in the training data but does show up in the
# testing data yields a non-zero numerator over a zero standard deviation,
# i.e. +/-inf instead of NaN. These entries are set to zero as well, so the
# column is treated the same way as in the training data.
data_frame_test_standardized = data_frame_test_standardized.replace([np.inf, -np.inf], np.nan).fillna(0)
##########################
# Ridge regression
##########################
print("\nRidge regression - Using cross-validation\n")

# Regress the CARs on the word frequencies with a Ridge regression whose
# penalty is chosen by five-fold cross-validation on the training data.
# Recommendation for initial alphas/lambdas: 100000, 150000, and 200000
# The optimal alpha is at around 140000, hence the narrow grid below.
regression_Ridge_cv = RidgeCV(
    alphas=[135000, 137000, 140000, 143000, 145000],
    fit_intercept=True,
    cv=5,
)
regression_Ridge_cv.fit(data_frame_train_standardized, filing_car_train)

# The penalty (lambda) selected by the cross-validation.
alpha_optimal_cv = regression_Ridge_cv.alpha_
print(f"The optimal alpha is {alpha_optimal_cv!s}")

# R2 of the cross-validated model in the training and in the testing data.
r2_train_Ridge = regression_Ridge_cv.score(data_frame_train_standardized, filing_car_train)
print(f"The R2 in the training data is: {r2_train_Ridge!s}")
r2_test_Ridge = regression_Ridge_cv.score(data_frame_test_standardized, filing_car_test)
print(f"The R2 in the testing data is: {r2_test_Ridge!s}")

# Mean squared error of the cross-validated model.
# Predicted y in the full training sample ...
filing_car_train_predicted_Ridge = regression_Ridge_cv.predict(data_frame_train_standardized)
# ... and in the testing sample.
filing_car_test_predicted_Ridge = regression_Ridge_cv.predict(data_frame_test_standardized)

mse_train_Ridge = mean_squared_error(y_true=filing_car_train, y_pred=filing_car_train_predicted_Ridge)
print(f"The MSE in the full training data is: {mse_train_Ridge!s}")
mse_test_Ridge = mean_squared_error(y_true=filing_car_test, y_pred=filing_car_test_predicted_Ridge)
print(f"The MSE in the testing data is: {mse_test_Ridge!s}")
######################
# LASSO regression
######################
print("\nLASSO regression - Using cross-validation\n")

# Regress the CARs on the word frequencies with a LASSO regression whose
# penalty is chosen by five-fold cross-validation on the training data.
# Recommendation for initial alphas/lambdas: 0.5, 1, and 1.5
# The optimal alpha is at around 0.86, hence the narrow grid below.
regression_Lasso_cv = LassoCV(
    alphas=[0.85, 0.86, 0.87, 0.88, 0.89],
    fit_intercept=True,
    cv=5,
)
regression_Lasso_cv.fit(data_frame_train_standardized, filing_car_train)

# The penalty (lambda) selected by the cross-validation.
alpha_optimal_cv = regression_Lasso_cv.alpha_
print(f"The optimal alpha is {alpha_optimal_cv!s}")

# R2 of the cross-validated model in the training data ...
r2_train_Lasso = regression_Lasso_cv.score(data_frame_train_standardized, filing_car_train)
print(f"The R2 in the training data is: {r2_train_Lasso!s}")
# ... and in the testing data.
r2_test_Lasso = regression_Lasso_cv.score(data_frame_test_standardized, filing_car_test)
print(f"The R2 in the testing data is: {r2_test_Lasso!s}")

# Mean squared error of the cross-validated model.
# Predicted y in the full training sample ...
filing_car_train_predicted_Lasso = regression_Lasso_cv.predict(data_frame_train_standardized)
# ... and in the testing sample.
filing_car_test_predicted_Lasso = regression_Lasso_cv.predict(data_frame_test_standardized)

mse_train_Lasso = mean_squared_error(y_true=filing_car_train, y_pred=filing_car_train_predicted_Lasso)
print(f"The MSE in the full training data is: {mse_train_Lasso!s}")
mse_test_Lasso = mean_squared_error(y_true=filing_car_test, y_pred=filing_car_test_predicted_Lasso)
print(f"The MSE in the testing data is: {mse_test_Lasso!s}")
############################################################
# Compare the betas from the Ridge and the LASSO regressions
############################################################
# Write a semicolon-separated file with one line per word: a running index,
# the word (= column name of the training data) and its coefficient from the
# Ridge and from the LASSO regression.
# The "with" statement makes sure the file is closed even if one of the
# writes raises an error.
with open(directory+"comparison_coefficients_Ridge_LASSO_10-Ks.csv","w",encoding="utf-8") as output_file:
    output_file.write("index;word;coefficient_Ridge;coefficient_LASSO\n")

    # Walk through the words and the two coefficient vectors in lockstep;
    # all three have one entry per column of the training data.
    for i, (word, coefficient_Ridge, coefficient_LASSO) in enumerate(
        zip(data_frame_train.columns, regression_Ridge_cv.coef_, regression_Lasso_cv.coef_)
    ):
        output_file.write(f"{i!s};{word!s};{coefficient_Ridge!s};{coefficient_LASSO!s}\n")

print("Completed!")