Add a TARGET_VARIABLE constant to refer to the Sale Price in a slightly cleaner way

2018-09-02 18:54:33 +02:00 · 2018-09-02 18:54:33 +02:00 · 1ef28ab3a1
commit 1ef28ab3a1
parent 488fb69da9
3 changed files with 2908 additions and 2905 deletions
--- a/1_data_cleaning.ipynb
+++ b/1_data_cleaning.ipynb
@ -30,7 +30,7 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "2018-09-01 16:51:42 CEST\n",
+      "2018-09-02 18:50:50 CEST\n",
      "\n",
      "CPython 3.6.5\n",
      "IPython 6.5.0\n",
@ -67,7 +67,7 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "The *utils.py* module defines helper dictionaries and lists that help with parsing the data types correctly, look up column descriptions, and refer to groups of data columns.\n",
+    "The *utils.py* module defines helper dictionaries, lists, and functions that help with parsing the data types correctly, looking up column descriptions, and referring to groups of data columns.\n",
    "\n",
    "**Note:** the suffix \\_*COLUMNS* indicates a dictionary with all meta information on the provided data file and \\_*VARIABLES* a list with only the column names (i.e., the keys of the respective \\_*COLUMNS* dictionary)."
   ]
@ -93,6 +93,7 @@
    "    NUMERIC_VARIABLES,  # groups continuous and discrete\n",
    "    ORDINAL_COLUMNS,\n",
    "    ORDINAL_VARIABLES,\n",
+    "    TARGET_VARIABLE,  # = Sale Price\n",
    "    correct_column_names,\n",
    "    print_column_list,\n",
    "    update_column_descriptions,\n",
@ -199,7 +200,7 @@
    "# order as in the encoded description file.\n",
    "# Note that the target variable \"SalePrice\"\n",
    "# is not in the description file.\n",
-    "df = df[ALL_VARIABLES + [\"SalePrice\"]]"
+    "df = df[ALL_VARIABLES + TARGET_VARIABLE]"
   ]
  },
  {
@ -266,7 +267,7 @@
   "outputs": [],
   "source": [
    "# Show that all \"continuous\" variables come as integers.\n",
-    "for column in NUMERIC_VARIABLES + [\"SalePrice\"]:\n",
+    "for column in NUMERIC_VARIABLES + TARGET_VARIABLE:\n",
    "    not_null = df[column].notnull()\n",
    "    mask = (\n",
    "        df.loc[not_null, column].astype(np.int64)\n",
@ -2237,7 +2238,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "remaining_columns = sorted(set(ALL_VARIABLES) - set(missing_a_lot)) + [\"SalePrice\"]\n",
+    "remaining_columns = sorted(set(ALL_VARIABLES) - set(missing_a_lot)) + TARGET_VARIABLE\n",
    "mask = df[remaining_columns].isnull().any(axis=1)\n",
    "assert (100 * mask.sum() / df.shape[0]) < 1.1  # percent\n",
    "df = df[~mask]"
@ -2287,7 +2288,7 @@
    "update_column_descriptions(df.columns)\n",
    "# Without any more missing data, cast all numeric\n",
    "# columns as floats or integers respectively.\n",
-    "for column in CONTINUOUS_VARIABLES + [\"SalePrice\"]:\n",
+    "for column in CONTINUOUS_VARIABLES + TARGET_VARIABLE:\n",
    "    df[column] = df[column].astype(np.float64)\n",
    "for column in DISCRETE_VARIABLES:\n",
    "    df[column] = df[column].astype(np.int64)"
--- a/data_clean.csv
+++ b/data_clean.csv
--- a/utils.py
+++ b/utils.py
@ -28,12 +28,14 @@ Implementation Note:

 import re

+import pandas as pd
 import requests
 import tabulate


 INDEX_COLUMNS = ["Order", "PID"]
 LABEL_TYPES = ["nominal", "ordinal"]
+TARGET_VARIABLE = ["SalePrice"]
 # Note that these dictionaries and lists are not actually constants but are
 # filled in at import time, which makes them "near"-constant.
 ALL_COLUMNS = {}
@ -100,7 +102,7 @@ def _extract_meta_data(lines):
    # The two ID columns and the target variable "SalePrice"
    # are not put into the helper dicts / lists as they are
    # treated separately in the modelling anyway.
-    non_feature_columns = INDEX_COLUMNS + ["SalePrice"]
+    non_feature_columns = INDEX_COLUMNS + TARGET_VARIABLE

    for line in lines:
        # Process the next variable in the list.