Add feature transformations (e.g. Box-Cox)

This commit is contained in:
Alexander Hess 2018-09-03 15:57:24 +02:00
commit 069691cca1
7 changed files with 5025 additions and 89 deletions

View file

@ -30,7 +30,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"2018-09-02 18:50:50 CEST\n",
"2018-09-03 15:32:42 CEST\n",
"\n",
"CPython 3.6.5\n",
"IPython 6.5.0\n",
@ -93,7 +93,7 @@
" NUMERIC_VARIABLES, # groups continuous and discrete\n",
" ORDINAL_COLUMNS,\n",
" ORDINAL_VARIABLES,\n",
" TARGET_VARIABLE, # = Sale Price\n",
" TARGET_VARIABLES, # = Sale Price\n",
" correct_column_names,\n",
" print_column_list,\n",
" update_column_descriptions,\n",
@ -200,7 +200,7 @@
"# order as in the encoded description file.\n",
"# Note that the target variable \"SalePrice\"\n",
"# is not in the description file.\n",
"df = df[ALL_VARIABLES + TARGET_VARIABLE]"
"df = df[ALL_VARIABLES + TARGET_VARIABLES]"
]
},
{
@ -267,7 +267,7 @@
"outputs": [],
"source": [
"# Show that all \"continuous\" variables come as integers.\n",
"for column in NUMERIC_VARIABLES + TARGET_VARIABLE:\n",
"for column in NUMERIC_VARIABLES + TARGET_VARIABLES:\n",
" not_null = df[column].notnull()\n",
" mask = (\n",
" df.loc[not_null, column].astype(np.int64)\n",
@ -2238,7 +2238,7 @@
"metadata": {},
"outputs": [],
"source": [
"remaining_columns = sorted(set(ALL_VARIABLES) - set(missing_a_lot)) + TARGET_VARIABLE\n",
"remaining_columns = sorted(set(ALL_VARIABLES) - set(missing_a_lot)) + TARGET_VARIABLES\n",
"mask = df[remaining_columns].isnull().any(axis=1)\n",
"assert (100 * mask.sum() / df.shape[0]) < 1.1 # percent\n",
"df = df[~mask]"
@ -2288,7 +2288,7 @@
"update_column_descriptions(df.columns)\n",
"# Without any more missing data, cast all numeric\n",
"# columns as floats or integers respectively.\n",
"for column in CONTINUOUS_VARIABLES + TARGET_VARIABLE:\n",
"for column in CONTINUOUS_VARIABLES + TARGET_VARIABLES:\n",
" df[column] = df[column].astype(np.float64)\n",
"for column in DISCRETE_VARIABLES:\n",
" df[column] = df[column].astype(np.int64)"