# -*- coding: utf-8 -*-
"""
Ridge and LASSO regressions of filing-period returns on 10-K word frequencies.

The script loads the word counts of Form 10-K filings, fits cross-validated
Ridge and LASSO regressions of the cumulative abnormal return (CAR) on the
standardized relative word frequencies using the 2007 filings, evaluates both
models on the 2008 filings, and writes the coefficients of both models to a
CSV file for comparison.

Created on Mon Mar 21 09:38:32 2022

@author: Alexander Hillert, Goethe University Frankfurt
"""

import os

import numpy as np
import pandas as pd
from sklearn.linear_model import LassoCV
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_squared_error

# adjust the directory to your folder
directory = "C:/Lehre/Machine Learning/Data/"

# Columns that are not word frequencies (identifying information, the
# dependent variable, and the document length). They are removed from the
# data frames before the regressions.
NON_FEATURE_COLUMNS = [
    "cik",
    "year",
    "month",
    "link",
    "filing_type",
    "filing_date",
    "excess_ret_t0_t4",
    "number_of_words",
]

# import the data for this problem
# NOTE: IT MIGHT TAKE 3 TO 5 MINUTES TO OPEN THE DATA
data_frame = pd.read_csv(
    os.path.join(
        directory,
        "form_10-Ks_machine_learning_2007_2008_all_variables_v1.csv",
    ),
    sep=";",
)
# The rows of the data are the Form 10-K filings. Each line is one filing.
# The columns are the variables. After some identifying information,
# you find the word frequencies, i.e., how often a word (e.g., "the") shows up
# in a 10-K (e.g., 100 times).

# WARNING: THE DATA SET IS TOO LARGE TO BE DISPLAYED -> Variable Explorer
# and Console will crash.
# However, you can pick a small subset of the data and look at it.
# It lists all columns=variables and the first three observations.
data_frame_example = data_frame.head(3)
# you can click on this variable in the variable explorer without Spyder crashing.

# To see the variables included in the data use the following command
data_frame_column_names = data_frame.columns
# you can click on this variable in the variable explorer without Spyder crashing.
# This variable shows all column/variable names in a vector.

# split the data set into the training and testing data
# we use the filings from year 2007 as training data
data_frame_train = data_frame[data_frame.year == 2007]
# and the filings from year 2008 as testing data
data_frame_test = data_frame[data_frame.year == 2008]

# put the cumulative abnormal return around the filing date into a new variable.
# we follow Loughran and McDonald (2011) and use the CAR from t to t+4.
# training data
filing_car_train = data_frame_train["excess_ret_t0_t4"]
# testing data
filing_car_test = data_frame_test["excess_ret_t0_t4"]

# so far, you have absolute word counts. For example, "loss" is found 5 times.
# As the length of the 10-Ks can be different, we scale by the number of words
# in the 10-K.
document_length_train = data_frame_train["number_of_words"]
document_length_test = data_frame_test["number_of_words"]

# the word frequencies are our independent variables -> restrict the data frame
# to those variables and drop all variables that are not needed
data_frame_train = data_frame_train.drop(columns=NON_FEATURE_COLUMNS)
data_frame_test = data_frame_test.drop(columns=NON_FEATURE_COLUMNS)

# compute relative frequencies, i.e., divide the absolute word count by
# document length (axis=0 -> each row is divided by its own document length)
data_frame_train = data_frame_train.div(document_length_train, axis=0)
data_frame_test = data_frame_test.div(document_length_test, axis=0)

# standardize the data frames
# training data
data_frame_train_mean = np.mean(data_frame_train, axis=0)
data_frame_train_sd = np.std(data_frame_train, axis=0, ddof=1)
data_frame_train_standardized = (
    (data_frame_train - data_frame_train_mean) / data_frame_train_sd
)
# testing data
# NOTE(review): the testing data is standardized with its OWN mean and
# standard deviation. A common alternative is to reuse the training-sample
# statistics so that both samples are on the scale the model was fitted on.
# The original behavior is kept here -- confirm that this is intended.
data_frame_test_mean = np.mean(data_frame_test, axis=0)
data_frame_test_sd = np.std(data_frame_test, axis=0, ddof=1)
data_frame_test_standardized = (
    (data_frame_test - data_frame_test_mean) / data_frame_test_sd
)

# There can be missing values in the standardized variables.
# They arise if the word count for a specific word is always zero in the
# training or in the testing data. In this case, the standard deviation is
# zero -> division by zero -> NaN.
# We replace these missing values by zero.
# training data
data_frame_train_standardized = data_frame_train_standardized.fillna(0)
# testing data
data_frame_test_standardized = data_frame_test_standardized.fillna(0)

##########################
# Ridge regression
##########################
print("\nRidge regression - Using cross-validation\n")
# Regress the CARs on the word frequencies using Ridge regressions with
# cross-validation.
# In this regression, we use the training data.
# We use five-fold cross-validation.
# Recommendation for initial alphas/lambdas: 100000, 150000, and 200000
# The optimal alpha is at around 140000.
regression_Ridge_cv = RidgeCV(
    alphas=[135000, 137000, 140000, 143000, 145000],
    fit_intercept=True,
    cv=5,
).fit(data_frame_train_standardized, filing_car_train)

# get the optimal lambda
alpha_optimal_cv = regression_Ridge_cv.alpha_
print("The optimal alpha is " + str(alpha_optimal_cv))

# what is the R2 in the training and testing data?
print("The R2 in the training data is: " + str(
    regression_Ridge_cv.score(data_frame_train_standardized, filing_car_train)))
print("The R2 in the testing data is: " + str(
    regression_Ridge_cv.score(data_frame_test_standardized, filing_car_test)))

# Mean squared error using the cross-validated model
# predict y in the full training sample
filing_car_train_predicted_Ridge = regression_Ridge_cv.predict(
    data_frame_train_standardized)
# predict y in the testing sample
filing_car_test_predicted_Ridge = regression_Ridge_cv.predict(
    data_frame_test_standardized)
# Determine the MSE
print("The MSE in the full training data is: " + str(
    mean_squared_error(filing_car_train, filing_car_train_predicted_Ridge)))
print("The MSE in the testing data is: " + str(
    mean_squared_error(filing_car_test, filing_car_test_predicted_Ridge)))

######################
# LASSO regression
######################
print("\nLASSO regression - Using cross-validation\n")
# Regress the CARs on the word frequencies using LASSO regressions with
# cross-validation.
# In this regression, we use the training data.
# We use five-fold cross-validation.
# Recommendation for initial alphas/lambdas: 0.5, 1, and 1.5
# The optimal alpha is at around 0.86.
regression_Lasso_cv = LassoCV(
    alphas=[0.85, 0.86, 0.87, 0.88, 0.89],
    fit_intercept=True,
    cv=5,
).fit(data_frame_train_standardized, filing_car_train)

# get the optimal lambda
# (this overwrites the Ridge value stored under the same name above)
alpha_optimal_cv = regression_Lasso_cv.alpha_
print("The optimal alpha is " + str(alpha_optimal_cv))

# get the R2 in the training data
print("The R2 in the training data is: " + str(
    regression_Lasso_cv.score(data_frame_train_standardized, filing_car_train)))
# ... and testing data
print("The R2 in the testing data is: " + str(
    regression_Lasso_cv.score(data_frame_test_standardized, filing_car_test)))

# Mean squared error using the cross-validated model
# predict y in the full training sample
filing_car_train_predicted_Lasso = regression_Lasso_cv.predict(
    data_frame_train_standardized)
# predict y in the testing sample
filing_car_test_predicted_Lasso = regression_Lasso_cv.predict(
    data_frame_test_standardized)
# Determine the MSE
print("The MSE in the full training data is: " + str(
    mean_squared_error(filing_car_train, filing_car_train_predicted_Lasso)))
print("The MSE in the testing data is: " + str(
    mean_squared_error(filing_car_test, filing_car_test_predicted_Lasso)))

############################################################
# Compare the betas from the Ridge and the LASSO regressions
############################################################
# One output row per word: running index, the word itself, and its
# coefficient in the Ridge and in the LASSO model.
# The context manager closes the file even if a write fails.
output_path = os.path.join(
    directory, "comparison_coefficients_Ridge_LASSO_10-Ks.csv")
with open(output_path, "w", encoding="utf-8") as output_file:
    output_file.write("index;word;coefficient_Ridge;coefficient_LASSO\n")
    coefficient_rows = zip(
        data_frame_train.columns,
        regression_Ridge_cv.coef_,
        regression_Lasso_cv.coef_,
    )
    for i, (word, coefficient_ridge, coefficient_lasso) in enumerate(
            coefficient_rows):
        output_file.write(";".join([
            str(i),
            word,
            str(coefficient_ridge),
            str(coefficient_lasso),
        ]) + "\n")

print("Completed!")