Add INDEX_COLUMNS constant for more readability

2018-08-29 13:59:07 +02:00 · 2018-08-29 13:59:07 +02:00 · d5012946c2
commit d5012946c2
parent 38c4dd5aef
2 changed files with 11 additions and 9 deletions
--- a/1_data_cleaning.ipynb
+++ b/1_data_cleaning.ipynb
@ -30,7 +30,7 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "2018-08-29 13:46:44 CEST\n",
+      "2018-08-29 14:00:36 CEST\n",
      "\n",
      "CPython 3.6.5\n",
      "IPython 6.5.0\n",
@ -82,15 +82,16 @@
    "from utils import (\n",
    "    ALL_COLUMNS,\n",
    "    ALL_VARIABLES,\n",
-    "    LABEL_COLUMNS,  # groups nominal and ordinal\n",
-    "    NUMERIC_VARIABLES,  # groups continuous and discrete\n",
    "    CONTINUOUS_COLUMNS,\n",
    "    CONTINUOUS_VARIABLES,\n",
    "    DISCRETE_COLUMNS,\n",
    "    DISCRETE_VARIABLES,\n",
+    "    INDEX_COLUMNS,\n",
+    "    LABEL_COLUMNS,  # groups nominal and ordinal\n",
    "    LABEL_TYPES,\n",
    "    NOMINAL_COLUMNS,\n",
    "    NOMINAL_VARIABLES,\n",
+    "    NUMERIC_VARIABLES,  # groups continuous and discrete\n",
    "    ORDINAL_COLUMNS,\n",
    "    ORDINAL_VARIABLES,\n",
    "    correct_column_names,\n",
@ -184,7 +185,7 @@
    "correct_column_names(df.columns)\n",
    "# Use a compound index and keep both\n",
    "# identifying columns in the DataFrame.\n",
-    "df = df.set_index([\"Order\", \"PID\"])\n",
+    "df = df.set_index(INDEX_COLUMNS)\n",
    "# Put the provided columns into the same\n",
    "# order as in the encoded description file.\n",
    "# Note that the target variable \"SalePrice\"\n",
--- a/utils.py
+++ b/utils.py
@ -2,8 +2,8 @@

 This module uses the information available on the publication homepage and
 defines a nested dictionary `ALL_COLUMNS` that can be used to decode the data
-in the accompanying AmesHousing.xls file in the data folder. For convenience,
-`ALL_VARIABLES` provides a list of only the column names.
+in the accompanying Excel file. For convenience, `ALL_VARIABLES` provides a
+list of only the column names.

 Furthermore, six helper dictionaries `CONTINUOUS_COLUMNS`, `DISCRETE_COLUMNS`,
 `NUMERIC_COLUMNS`, `NOMINAL_COLUMNS`, `ORDINAL_COLUMNS`, and `LABEL_COLUMNS`
@ -14,8 +14,8 @@ ordinal columns. For each of the six dictionaries, a list of the actual column
 names is created with the same name and the suffix "_VARIABLES" instead of
 "_COLUMNS", e.g., "CONTINUOUS_VARIABLES" instead of "CONTINUOUS_COLUMNS".

-Lastly, the LABEL_TYPES list can be used to quickly check types in a readable
-way.
+Lastly, the INDEX_COLUMNS and LABEL_TYPES lists can be used to refer to the
+actual values in a more readable way.

 Source:
    https://www.amstat.org/publications/jse/v19n3/decock/DataDocumentation.txt
@ -31,6 +31,7 @@ import re
 import requests


+INDEX_COLUMNS = ["Order", "PID"]
 LABEL_TYPES = ["nominal", "ordinal"]
 # Note that these dictionaries and lists are not actually constants but
 # filled in during import time which makes them "near"-constant.
@ -98,7 +99,7 @@ def _extract_meta_data(lines):
    # The two ID columns and the target variable "SalePrice"
    # are not put into the helper dicts / lists as they are
    # treated separately in the modelling anyway.
-    non_feature_columns = ["Order", "PID", "SalePrice"]
+    non_feature_columns = INDEX_COLUMNS + ["SalePrice"]

    for line in lines:
        # Process the next variable in the list.