Add a TARGET_VARIABLE constant to refer to the Sale Price in a slightly cleaner way
This commit is contained in:
parent
488fb69da9
commit
1ef28ab3a1
3 changed files with 2908 additions and 2905 deletions
|
@ -30,7 +30,7 @@
|
||||||
"name": "stdout",
|
"name": "stdout",
|
||||||
"output_type": "stream",
|
"output_type": "stream",
|
||||||
"text": [
|
"text": [
|
||||||
"2018-09-01 16:51:42 CEST\n",
|
"2018-09-02 18:50:50 CEST\n",
|
||||||
"\n",
|
"\n",
|
||||||
"CPython 3.6.5\n",
|
"CPython 3.6.5\n",
|
||||||
"IPython 6.5.0\n",
|
"IPython 6.5.0\n",
|
||||||
|
@ -67,7 +67,7 @@
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"The *utils.py* module defines helper dictionaries and lists that help with parsing the data types correctly, look up column descriptions, and refer to groups of data columns.\n",
|
"The *utils.py* module defines helper dictionaries, lists, and functions that help with parsing the data types correctly, looking up column descriptions, and referring to groups of data columns.\n",
|
||||||
"\n",
|
"\n",
|
||||||
"**Note:** the suffix \\_*COLUMNS* indicates a dictionary with all meta information on the provided data file and \\_*VARIABLES* a list with only the column names (i.e., the keys of the respective \\_*COLUMNS* dictionary)."
|
"**Note:** the suffix \\_*COLUMNS* indicates a dictionary with all meta information on the provided data file and \\_*VARIABLES* a list with only the column names (i.e., the keys of the respective \\_*COLUMNS* dictionary)."
|
||||||
]
|
]
|
||||||
|
@ -93,6 +93,7 @@
|
||||||
" NUMERIC_VARIABLES, # groups continuous and discrete\n",
|
" NUMERIC_VARIABLES, # groups continuous and discrete\n",
|
||||||
" ORDINAL_COLUMNS,\n",
|
" ORDINAL_COLUMNS,\n",
|
||||||
" ORDINAL_VARIABLES,\n",
|
" ORDINAL_VARIABLES,\n",
|
||||||
|
" TARGET_VARIABLE, # = Sale Price\n",
|
||||||
" correct_column_names,\n",
|
" correct_column_names,\n",
|
||||||
" print_column_list,\n",
|
" print_column_list,\n",
|
||||||
" update_column_descriptions,\n",
|
" update_column_descriptions,\n",
|
||||||
|
@ -199,7 +200,7 @@
|
||||||
"# order as in the encoded description file.\n",
|
"# order as in the encoded description file.\n",
|
||||||
"# Note that the target variable \"SalePrice\"\n",
|
"# Note that the target variable \"SalePrice\"\n",
|
||||||
"# is not in the description file.\n",
|
"# is not in the description file.\n",
|
||||||
"df = df[ALL_VARIABLES + [\"SalePrice\"]]"
|
"df = df[ALL_VARIABLES + TARGET_VARIABLE]"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -266,7 +267,7 @@
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# Show that all \"continuous\" variables come as integers.\n",
|
"# Show that all \"continuous\" variables come as integers.\n",
|
||||||
"for column in NUMERIC_VARIABLES + [\"SalePrice\"]:\n",
|
"for column in NUMERIC_VARIABLES + TARGET_VARIABLE:\n",
|
||||||
" not_null = df[column].notnull()\n",
|
" not_null = df[column].notnull()\n",
|
||||||
" mask = (\n",
|
" mask = (\n",
|
||||||
" df.loc[not_null, column].astype(np.int64)\n",
|
" df.loc[not_null, column].astype(np.int64)\n",
|
||||||
|
@ -2237,7 +2238,7 @@
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"remaining_columns = sorted(set(ALL_VARIABLES) - set(missing_a_lot)) + [\"SalePrice\"]\n",
|
"remaining_columns = sorted(set(ALL_VARIABLES) - set(missing_a_lot)) + TARGET_VARIABLE\n",
|
||||||
"mask = df[remaining_columns].isnull().any(axis=1)\n",
|
"mask = df[remaining_columns].isnull().any(axis=1)\n",
|
||||||
"assert (100 * mask.sum() / df.shape[0]) < 1.1 # percent\n",
|
"assert (100 * mask.sum() / df.shape[0]) < 1.1 # percent\n",
|
||||||
"df = df[~mask]"
|
"df = df[~mask]"
|
||||||
|
@ -2287,7 +2288,7 @@
|
||||||
"update_column_descriptions(df.columns)\n",
|
"update_column_descriptions(df.columns)\n",
|
||||||
"# Without any more missing data, cast all numeric\n",
|
"# Without any more missing data, cast all numeric\n",
|
||||||
"# columns as floats or integers respectively.\n",
|
"# columns as floats or integers respectively.\n",
|
||||||
"for column in CONTINUOUS_VARIABLES + [\"SalePrice\"]:\n",
|
"for column in CONTINUOUS_VARIABLES + TARGET_VARIABLE:\n",
|
||||||
" df[column] = df[column].astype(np.float64)\n",
|
" df[column] = df[column].astype(np.float64)\n",
|
||||||
"for column in DISCRETE_VARIABLES:\n",
|
"for column in DISCRETE_VARIABLES:\n",
|
||||||
" df[column] = df[column].astype(np.int64)"
|
" df[column] = df[column].astype(np.int64)"
|
||||||
|
|
5796
data_clean.csv
5796
data_clean.csv
File diff suppressed because it is too large
Load diff
4
utils.py
4
utils.py
|
@ -28,12 +28,14 @@ Implementation Note:
|
||||||
|
|
||||||
import re
|
import re
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
import requests
|
import requests
|
||||||
import tabulate
|
import tabulate
|
||||||
|
|
||||||
|
|
||||||
INDEX_COLUMNS = ["Order", "PID"]
|
INDEX_COLUMNS = ["Order", "PID"]
|
||||||
LABEL_TYPES = ["nominal", "ordinal"]
|
LABEL_TYPES = ["nominal", "ordinal"]
|
||||||
|
TARGET_VARIABLE = ["SalePrice"]
|
||||||
# Note that these dictionaries and lists are not actually constants but
|
# Note that these dictionaries and lists are not actually constants but
|
||||||
# filled in during import time which makes them "near"-constant.
|
# filled in during import time which makes them "near"-constant.
|
||||||
ALL_COLUMNS = {}
|
ALL_COLUMNS = {}
|
||||||
|
@ -100,7 +102,7 @@ def _extract_meta_data(lines):
|
||||||
# The two ID columns and the target variable "SalePrice"
|
# The two ID columns and the target variable "SalePrice"
|
||||||
# are not put into the helper dicts / lists as they are
|
# are not put into the helper dicts / lists as they are
|
||||||
# treated separately in the modelling anyway.
|
# treated separately in the modelling anyway.
|
||||||
non_feature_columns = INDEX_COLUMNS + ["SalePrice"]
|
non_feature_columns = INDEX_COLUMNS + TARGET_VARIABLE
|
||||||
|
|
||||||
for line in lines:
|
for line in lines:
|
||||||
# Process the next variable in the list.
|
# Process the next variable in the list.
|
||||||
|
|
Loading…
Reference in a new issue