# -*- coding: utf-8 -*-
"""
Created on Mon Mar 21 09:38:32 2022

@author: Alexander Hillert, Goethe University Frankfurt
"""

'''
This script introduces you to linear models using the sklearn package.
Besides sklearn, we will use pandas to work with data sets as well as
numpy to perform computations.

The introduction consists of 10 parts:
1. linear regressions using a toy data set
2. linear regressions using a "real" data set
3. linear regressions using standardized variables
4. Ridge regression basics
5. Ridge regression with training, tuning, and testing sample
6. Ridge regression with cross-validation
7. LASSO regression basics
8. LASSO regression with training, tuning, and testing sample
9. LASSO regression with cross-validation
10. Compare the results from Ridge and LASSO
'''

import pandas as pd
import numpy as np
# for OLS regressions
from sklearn.linear_model import LinearRegression
# for Ridge regressions
from sklearn.linear_model import Ridge
# for computing mean squared errors
from sklearn.metrics import mean_squared_error
# for plotting the MSEs for different levels of Lambda
import matplotlib.pyplot as plot
# for Ridge regressions with cross-validation
from sklearn.linear_model import RidgeCV
# for LASSO regressions
from sklearn.linear_model import Lasso
# for LASSO regressions with cross-validation
from sklearn.linear_model import LassoCV

# adjust the directory to your folder!!!
directory="C:/Lehre/Machine Learning/Data/"

############################################################
# Part 1. Basics: linear regressions in Python using sklearn
############################################################
print("\nPart 1: Run an OLS regression on a sandbox data set\n")

# create a random number from a normal distribution with mean 0 and standard deviation 1.
random_number=np.random.normal(0, 1)
print("A random number is: "+str(random_number))

# you can also create a vector or matrix of random variables.
# the parameter size=(# of rows, # of columns) specifies the number of rows and columns.
# For example, a (10,1) vector
random_number_vector=np.random.normal(0, 1, size=(10,1))
print("The vector of random numbers is:")
print(random_number_vector)

# create the independent variable x as a vector of random numbers
x_vector=np.random.normal(0, 1, size=(10,1))
print("The vector of the independent variable x is:")
print(x_vector)

# create the dependent variable y as
# y = 2x + epsilon, where epsilon is the vector of random errors from above
y_vector=2*x_vector + random_number_vector
print("The vector of the dependent variable y is:")
print(y_vector)

# perform a standard OLS regression with intercept.
# The fit command takes x (independent variable(s)) first and then y (dependent variable).
# Note that the intercept is included by default. So, strictly speaking,
# the (fit_intercept=True) option is not needed.
regression_1=LinearRegression(fit_intercept=True).fit(x_vector, y_vector)

# display the intercept and the beta coefficient on x
print("The intercept is: "+str(regression_1.intercept_))
# to get it as a scalar/number, not an array, use
regression_1.intercept_[0]

print("The coefficient on x is: "+str(regression_1.coef_))
# to get it as a scalar/number, not an array, use
regression_1.coef_[0][0]

# R2 of the regression
print("The R2 is: "+str(regression_1.score(x_vector, y_vector)))

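# ------------------------------------------------------------------
# Added sketch (not part of the original script): as a sanity check,
# the same estimates can be computed by hand from the normal equations,
# beta = (X'X)^(-1) X'y, where X contains a column of ones for the
# intercept. This illustrates what LinearRegression solves internally.
# ------------------------------------------------------------------
X_design=np.hstack([np.ones((10,1)), x_vector])
beta_manual=np.linalg.inv(X_design.T @ X_design) @ (X_design.T @ y_vector)
print("Manual OLS estimates (intercept, slope): "+str(beta_manual.ravel()))
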
###############################################################
# Part 2: linear regression using a "real" data set
###############################################################
print("\nPart 2: Run an OLS regression with a real data set\n")

# import the data for this problem
# The data set consists of 200 independent variables (x1 to x200) and
# a dependent variable (y).
# There are 1,200 observations in total. In the later parts, we will
# use the first 1,000 observations for training and the last 200 for testing.
# The data are simulated using the following process:
# y = 0.5*x1 + 0.5*x2 + ... + 0.5*x100 + random error (mean 0, std. dev. 4)
# The variables x101 to x200 are not directly related to y but are
# correlated with x1 to x100. More specifically,
# x101 = 0.7*x1 + random error (mean 0, std. dev. 1)
# x102 = 0.7*x2 + random error (mean 0, std. dev. 1)
# ...
# x200 = 0.7*x100 + random error (mean 0, std. dev. 1)
data_frame=pd.read_csv(directory+"regression_data_scikit.csv",sep=";")

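# ------------------------------------------------------------------
# Added sketch: the instructor's csv file is not distributed with the
# code. If you do not have it, you can simulate a data set with the
# documented structure (an assumption: this mirrors the data-generating
# process described above, not the exact file, so all numbers below
# will differ). Uncomment to use it instead of the csv file.
# ------------------------------------------------------------------
# x_base=np.random.normal(0, 1, size=(1200,100))
# x_corr=0.7*x_base+np.random.normal(0, 1, size=(1200,100))
# y_sim=0.5*x_base.sum(axis=1, keepdims=True)+np.random.normal(0, 4, size=(1200,1))
# data_frame=pd.DataFrame(np.hstack([x_base, x_corr, y_sim]),
#                         columns=["x"+str(i) for i in range(1,201)]+["y"])
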
# to get an idea about the data, display the first five observations
print(data_frame.head(5))

# split the data frame into the independent and dependent variables
# the independent variables (x1 to x200) are columns 1 to 200
x_variables=data_frame.values[:,:-1]
# the dependent variable (y) is column 201
y_variable=data_frame.values[:,-1:]

# run a standard OLS regression
regression_OLS=LinearRegression(fit_intercept=True).fit(x_variables, y_variable)
# You can double-check the results by rerunning the regression in Stata or R.

# display the intercept and the beta coefficients on x1 and x51
print("The intercept is: "+str(regression_OLS.intercept_[0]))
print("The coefficient on x_1 is: "+str(regression_OLS.coef_[0][0]))
print("The coefficient on x_51 is: "+str(regression_OLS.coef_[0][50]))

# R2 of the regression
print("The R2 is: "+str(regression_OLS.score(x_variables, y_variable)))

##################################################################
# Part 3: standardize the data to have mean zero and unit variance
# and rerun the regression
##################################################################
print("\nPart 3a.: Standardize variables\n")

# standardize x and y to have mean zero and unit variance
# axis=0 (axis=1) means that the computation is executed column-wise (row-wise)
x_variables_mean=np.mean(x_variables,axis=0)
# ddof=1 means that we use n-1 to compute the standard deviation
x_variables_standard_deviation=np.std(x_variables, axis=0, ddof=1)
x_variables_standardized=(x_variables-x_variables_mean)/x_variables_standard_deviation

# do the same exercise for y
y_variable_mean=np.mean(y_variable,axis=0)
y_variable_standard_deviation=np.std(y_variable, axis=0, ddof=1)
y_variable_standardized=(y_variable-y_variable_mean)/y_variable_standard_deviation

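# ------------------------------------------------------------------
# Added sketch: sklearn offers StandardScaler for the same task. One
# caveat: StandardScaler divides by the population standard deviation
# (ddof=0), so its output differs marginally from the manual ddof=1
# version above; with 1,200 observations the difference is negligible.
# ------------------------------------------------------------------
from sklearn.preprocessing import StandardScaler
x_variables_standardized_alt=StandardScaler().fit_transform(x_variables)
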
# rerun the regression using standardized data
regression_OLS_standardized=LinearRegression(fit_intercept=True).fit(x_variables_standardized, y_variable_standardized)
# the results are identical to a regression in Stata with beta coefficients.

# display the intercept and the beta coefficients on x_1 and x_51
print("The intercept is: "+str(regression_OLS_standardized.intercept_[0]))
print("The coefficient on x_1 is: "+str(regression_OLS_standardized.coef_[0][0]))
print("The coefficient on x_51 is: "+str(regression_OLS_standardized.coef_[0][50]))

# R2 of the regression
print("The R2 is: "+str(regression_OLS_standardized.score(x_variables_standardized, y_variable_standardized)))
# The R2 is identical to the one from Part 2 -> good!

#######################################################################################
# CAUTION: be careful using the "normalize=True" option in the LinearRegression module!
#######################################################################################
print("\nPart 3b.: Regression with 'normalization'\n")
# "normalize=True" is NOT the same as standardizing: it centers each
# regressor and then divides it by its L2 norm (the square root of the
# sum of its squared values), not by its standard deviation.
# (Do not confuse it with sklearn.preprocessing.Normalizer either, which
# rescales each row/observation to unit norm.)
# Note: this option was deprecated in scikit-learn 1.0 and removed in
# version 1.2, so the line below only runs on older scikit-learn versions.
regression_OLS_normalized=LinearRegression(fit_intercept=True,normalize=True).fit(x_variables, y_variable)

# display the intercept and the beta coefficients on x_1 and x_51
print("The intercept is: "+str(regression_OLS_normalized.intercept_[0]))
print("The coefficient on x_1 is: "+str(regression_OLS_normalized.coef_[0][0]))
print("The coefficient on x_51 is: "+str(regression_OLS_normalized.coef_[0][50]))
# The coefficients differ from the ones above, highlighting that the
# "normalize=True" option does not do the same as "normal" standardizing.

# R2 of the regression
print("The R2 is: "+str(regression_OLS_normalized.score(x_variables, y_variable)))

#######################################################################
# Part 4: Ridge regression on the full sample (no training and testing)
# This part is to learn the syntax.
# We are using the standardized variables to have the same penalty
# for a given effect of x on y.
# Remember: if the independent variables are measured on very different
# scales, the beta coefficients have different sizes (e.g., market cap in
# thousand USD vs. past stock returns as a decimal number) and, thus,
# the penalty would be applied inconsistently.
#######################################################################
print("\nPart 4: Ridge regression - learning the syntax\n")

# the parameter alpha corresponds to the penalty parameter Lambda from
# the notation that is typically used.
# the intercept is included by default, so you do not need the
# "fit_intercept=True" parameter. But it is good to keep in mind what
# specification you are using.
regression_Ridge=Ridge(alpha=10,fit_intercept=True).fit(x_variables_standardized, y_variable_standardized)

# display the intercept and the beta coefficients on x1 and x51
print("The intercept is: "+str(regression_Ridge.intercept_[0]))
print("The coefficient on x_1 is: "+str(regression_Ridge.coef_[0][0]))
print("The coefficient on x_51 is: "+str(regression_Ridge.coef_[0][50]))

# R2 of the regression
print("The R2 is: "+str(regression_Ridge.score(x_variables_standardized, y_variable_standardized)))

# How to compute the mean squared error (MSE)?
# 1. get the predicted values
y_variable_standardized_predicted=regression_Ridge.predict(x_variables_standardized)
# 2. determine the MSE
print("The MSE of the prediction is: "+str(mean_squared_error(y_variable_standardized, y_variable_standardized_predicted)))

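# ------------------------------------------------------------------
# Added sketch: sklearn's Ridge minimizes ||y - Xw||^2 + alpha*||w||^2.
# Because the standardized variables have mean zero, the intercept is
# essentially zero, and the coefficients can be verified with the
# closed-form solution w = (X'X + alpha*I)^(-1) X'y.
# ------------------------------------------------------------------
ridge_manual=np.linalg.inv(
    x_variables_standardized.T @ x_variables_standardized
    + 10*np.eye(200)
) @ (x_variables_standardized.T @ y_variable_standardized)
print("Manual Ridge coefficient on x_1: "+str(ridge_manual[0][0]))
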
#######################################################################
# Part 5: Ridge regression using a training, tuning, and testing sample
#######################################################################
print("\nPart 5: Ridge regression - Application with training, tuning, and testing data\n")

# Create a training, tuning, and testing sample
# we split the data into a training, a tuning, and a testing set
# the training data are the first 800 rows
# In the brackets, the first range (before the comma) indicates the rows, the second the columns.
x_variables_std_train=x_variables_standardized[:800,:]
y_variable_std_train=y_variable_standardized[:800,:]
# the tuning data are rows 801 to 1000 -> 200 observations
x_variables_std_tune=x_variables_standardized[800:1000,:]
y_variable_std_tune=y_variable_standardized[800:1000,:]
# the testing data are the last 200 rows
x_variables_std_test=x_variables_standardized[1000:,:]
y_variable_std_test=y_variable_standardized[1000:,:]

##########################
# find the optimal Lambda
##########################
# we store the MSE of the training/tuning data for each Lambda
mse_train_list=[]
mse_tune_list=[]
# Again, Lambda and Alpha refer to the same thing.
alpha_list=[]

# we iterate from 0.1 to 100, increasing Lambda=Alpha by 0.1 in each step.
alpha=0.1
while alpha<100:
    # train the model
    regression_Ridge_train=Ridge(alpha=alpha,fit_intercept=True).fit(x_variables_std_train, y_variable_std_train)
    # add the alpha to the list of alphas
    alpha_list.append(alpha)
    # predict y in the training sample
    y_variable_std_train_predicted=regression_Ridge_train.predict(x_variables_std_train)
    # predict y in the tuning sample
    y_variable_std_tune_predicted=regression_Ridge_train.predict(x_variables_std_tune)
    # compute the MSE in both samples
    mse_train=mean_squared_error(y_variable_std_train, y_variable_std_train_predicted)
    mse_tune=mean_squared_error(y_variable_std_tune, y_variable_std_tune_predicted)
    # append the MSEs to the two lists
    mse_train_list.append(mse_train)
    mse_tune_list.append(mse_tune)
    # continue with the next alpha
    alpha=alpha+0.1

########################################
# plot the MSEs for the different alphas
########################################
# MSE in the training sample
plot.scatter(alpha_list, mse_train_list)
plot.show()
# a higher Lambda is associated with a higher MSE

# MSE in the tuning sample
plot.scatter(alpha_list, mse_tune_list)
plot.show()
# there is an optimal alpha with the lowest MSE

######################################
# determine the optimal Lambda
######################################
# what is the smallest MSE?
minimum=min(mse_tune_list)
print("The smallest MSE is "+str(minimum))
# get the position of the minimum MSE in our list
index_min_MSE=mse_tune_list.index(minimum)
# choose the corresponding alpha
alpha_optimal=alpha_list[index_min_MSE]
print("The optimal alpha is "+str(alpha_optimal))

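# ------------------------------------------------------------------
# Added sketch: the same lookup can be done in one step with numpy's
# argmin, which returns the index of the smallest element directly.
# ------------------------------------------------------------------
print("Check via np.argmin: "+str(alpha_list[int(np.argmin(mse_tune_list))]))
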
#############################################################
# What is the out-of-sample performance of the optimal model?
#############################################################
# take the full training data set (1,000 observations, i.e., training + tuning set)
x_variables_std_train_total=np.concatenate((x_variables_std_train, x_variables_std_tune), axis=0)
y_variable_std_train_total=np.concatenate((y_variable_std_train, y_variable_std_tune), axis=0)
# train the model with the optimal Lambda on the training and tuning data
regression_Ridge_optimal=Ridge(alpha=alpha_optimal,fit_intercept=True).fit(x_variables_std_train_total, y_variable_std_train_total)

# Mean squared error
# predict y in the full training sample
y_variable_std_train_total_predicted=regression_Ridge_optimal.predict(x_variables_std_train_total)
# predict y in the testing sample
# Remember: we have not used the testing data yet. Firewall principle!!!
y_variable_std_test_predicted=regression_Ridge_optimal.predict(x_variables_std_test)

print("The MSE in the full training data is: "+str(mean_squared_error(y_variable_std_train_total, y_variable_std_train_total_predicted)))
print("The MSE in the testing data is: "+str(mean_squared_error(y_variable_std_test, y_variable_std_test_predicted)))

#############################################################
# Part 6: Ridge regression with k-fold cross-validation
# Implement the cross-validation using a package
#############################################################
print("\nPart 6. Ridge regression - Using cross-validation\n")

# the default for cv in RidgeCV is leave-one-out cross-validation
# here we apply five-fold cross-validation
regression_Ridge_cv=RidgeCV(alphas=alpha_list, fit_intercept=True,cv=5).fit(x_variables_std_train_total,y_variable_std_train_total)

# get the optimal lambda
alpha_optimal_cv=regression_Ridge_cv.alpha_
print("The optimal alpha is "+str(alpha_optimal_cv))

# Mean squared error using the cross-validated model
# predict y in the full training sample
y_variable_std_train_total_predicted_cv=regression_Ridge_cv.predict(x_variables_std_train_total)
# predict y in the testing sample
y_variable_std_test_predicted_cv=regression_Ridge_cv.predict(x_variables_std_test)

print("The MSE in the full training data is: "+str(mean_squared_error(y_variable_std_train_total, y_variable_std_train_total_predicted_cv)))
print("The MSE in the testing data is: "+str(mean_squared_error(y_variable_std_test, y_variable_std_test_predicted_cv)))

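# ------------------------------------------------------------------
# Added sketch: the same five-fold search can be written by hand with
# cross_val_score, which shows what RidgeCV automates. The scoring
# option "neg_mean_squared_error" returns negative MSEs, so the best
# alpha has the largest mean score. Left commented out because looping
# over roughly 1,000 alphas with 5 folds each is slow.
# ------------------------------------------------------------------
# from sklearn.model_selection import cross_val_score
# cv_scores=[cross_val_score(Ridge(alpha=a), x_variables_std_train_total,
#                            y_variable_std_train_total.ravel(), cv=5,
#                            scoring="neg_mean_squared_error").mean()
#            for a in alpha_list]
# print("Optimal alpha by hand: "+str(alpha_list[int(np.argmax(cv_scores))]))
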
###########################################
# Part 7: LASSO regression
# on the full sample -> to learn the syntax
###########################################

print("\nPart 7: LASSO regression - learning the syntax\n")
# the parameter alpha corresponds to the penalty parameter Lambda from
# the notation that is typically used.
# the intercept is included by default, so you do not need the
# "fit_intercept=True" parameter. But it is good to keep in mind what
# specification you are using.
regression_Lasso=Lasso(alpha=0.1,fit_intercept=True).fit(x_variables_standardized, y_variable_standardized)

# display the intercept and the beta coefficients on x1 and x51
# note: unlike Ridge, Lasso stores coef_ as a one-dimensional array
print("The intercept is: "+str(regression_Lasso.intercept_[0]))
print("The coefficient on x_1 is: "+str(regression_Lasso.coef_[0]))
print("The coefficient on x_51 is: "+str(regression_Lasso.coef_[50]))

# R2 of the regression
print("The R2 is: "+str(regression_Lasso.score(x_variables_standardized, y_variable_standardized)))

# How to compute the mean squared error (MSE)?
# 1. get the predicted values
y_variable_standardized_predicted=regression_Lasso.predict(x_variables_standardized)
# 2. determine the MSE
print("The MSE of the prediction is: "+str(mean_squared_error(y_variable_standardized, y_variable_standardized_predicted)))

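# ------------------------------------------------------------------
# Added sketch: a key property of LASSO is that it sets some
# coefficients exactly to zero (variable selection), which Ridge
# never does. A quick count makes this visible.
# ------------------------------------------------------------------
print("Number of LASSO coefficients set exactly to zero: "
      +str(int(np.sum(regression_Lasso.coef_==0))))
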
########################################################
# Part 8: Create a training, tuning, and testing sample
########################################################
print("\nPart 8: LASSO regression - Application with training, tuning, and testing data\n")
# we use the same training, tuning, and testing data as in Part 5.
# -> no need to redefine the data sets.

#################################
# find the optimal Lambda
#################################
# we store the MSE of the training/tuning data for each Lambda
mse_train_list=[]
mse_tune_list=[]
# Again, Lambda and Alpha refer to the same thing.
alpha_list=[]

# we iterate from 0.0001 to 0.25, increasing alpha by 0.0001 in each step.
alpha=0.0001
while alpha<0.25:
    # train the model
    regression_Lasso_train=Lasso(alpha=alpha,fit_intercept=True).fit(x_variables_std_train, y_variable_std_train)
    # add the alpha to the list of alphas
    alpha_list.append(alpha)
    # predict y in the training sample
    y_variable_std_train_predicted=regression_Lasso_train.predict(x_variables_std_train)
    # predict y in the tuning sample
    y_variable_std_tune_predicted=regression_Lasso_train.predict(x_variables_std_tune)
    # compute the MSE in both samples
    mse_train=mean_squared_error(y_variable_std_train, y_variable_std_train_predicted)
    mse_tune=mean_squared_error(y_variable_std_tune, y_variable_std_tune_predicted)
    # append the MSEs to the two lists
    mse_train_list.append(mse_train)
    mse_tune_list.append(mse_tune)
    # continue with the next alpha
    alpha=alpha+0.0001

########################################
# plot the MSEs for the different alphas
########################################

# MSE in the training sample
plot.scatter(alpha_list, mse_train_list)
plot.show()
# a higher Lambda is associated with a higher MSE

# MSE in the tuning sample
plot.scatter(alpha_list, mse_tune_list)
plot.show()
# there is an optimal alpha with the lowest MSE

######################################
# determine the optimal Lambda
######################################
# what is the smallest MSE?
minimum=min(mse_tune_list)
print("The smallest MSE is "+str(minimum))
# get the position of the minimum MSE
index_min_MSE=mse_tune_list.index(minimum)
alpha_optimal=alpha_list[index_min_MSE]

print("The optimal alpha is "+str(alpha_optimal))

#############################################################
# What is the out-of-sample performance of the optimal model?
#############################################################
# take the full training data set (1,000 observations; training + tuning)
# use the same variables as in Part 5.

# train the model with the optimal Lambda on the training and tuning data
regression_Lasso_optimal=Lasso(alpha=alpha_optimal,fit_intercept=True).fit(x_variables_std_train_total, y_variable_std_train_total)

# Mean squared error
# predict y in the full training sample
y_variable_std_train_total_predicted=regression_Lasso_optimal.predict(x_variables_std_train_total)
# predict y in the testing sample
y_variable_std_test_predicted=regression_Lasso_optimal.predict(x_variables_std_test)

print("The MSE in the full training data is: "+str(mean_squared_error(y_variable_std_train_total, y_variable_std_train_total_predicted)))
print("The MSE in the testing data is: "+str(mean_squared_error(y_variable_std_test, y_variable_std_test_predicted)))

#############################################################
# Part 9: Implement the cross-validation using a package
#############################################################
print("\nPart 9: LASSO regression - Using cross-validation\n")

# the default for cv in LassoCV is five-fold cross-validation
regression_Lasso_cv=LassoCV(alphas=alpha_list, fit_intercept=True,cv=5).fit(x_variables_std_train_total,y_variable_std_train_total)

# get the optimal lambda
alpha_optimal_cv=regression_Lasso_cv.alpha_
print("The optimal alpha is "+str(alpha_optimal_cv))

# Mean squared error using the cross-validated model
# predict y in the full training sample
y_variable_std_train_total_predicted_cv=regression_Lasso_cv.predict(x_variables_std_train_total)
# predict y in the testing sample
y_variable_std_test_predicted_cv=regression_Lasso_cv.predict(x_variables_std_test)

print("The MSE in the full training data is: "+str(mean_squared_error(y_variable_std_train_total, y_variable_std_train_total_predicted_cv)))
print("The MSE in the testing data is: "+str(mean_squared_error(y_variable_std_test, y_variable_std_test_predicted_cv)))

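# ------------------------------------------------------------------
# Added sketch: LassoCV stores the cross-validation errors in
# mse_path_ (one row per alpha, one column per fold), with the
# corresponding alpha grid in alphas_. Averaging across folds
# reproduces the MSE curve from Part 8, now via cross-validation.
# ------------------------------------------------------------------
plot.scatter(regression_Lasso_cv.alphas_, regression_Lasso_cv.mse_path_.mean(axis=1))
plot.show()
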
#####################################################################
# Part 10: Compare the betas from the Ridge and the LASSO regressions
#####################################################################
print("\nPart 10: Comparison of Ridge and LASSO coefficients\n")
# To see to what extent the results of Ridge and LASSO are similar, we
# write the coefficients from the cross-validation tasks (Parts 6 and 9)
# to a csv file.

output_file=open(directory+"comparison_coefficients_Ridge_LASSO.csv","w",encoding="utf-8")
output_file.write("index;coefficient_Ridge;coefficient_LASSO\n")

# write the list of coefficients
# note: RidgeCV stores coef_ as a (1,200) array, LassoCV as a (200,) array
for i in range(0,200):
    output_file.write(str(i)+';'+str(regression_Ridge_cv.coef_[0][i])+';'+str(regression_Lasso_cv.coef_[i])+'\n')

output_file.close()

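# ------------------------------------------------------------------
# Added sketch: the same comparison is a one-liner with pandas (the
# file name below is illustrative), and a quick zero count summarizes
# the key difference between the two methods: LASSO zeroes out
# coefficients, Ridge only shrinks them.
# ------------------------------------------------------------------
comparison=pd.DataFrame({"coefficient_Ridge": regression_Ridge_cv.coef_[0],
                         "coefficient_LASSO": regression_Lasso_cv.coef_})
comparison.to_csv(directory+"comparison_coefficients_pandas.csv", sep=";")
print("Zero coefficients - Ridge: "+str(int((comparison["coefficient_Ridge"]==0).sum()))
      +", LASSO: "+str(int((comparison["coefficient_LASSO"]==0).sum())))
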
print("Completed!")