Add INDEX_COLUMNS constant for better readability
This commit is contained in:
parent
38c4dd5aef
commit
d5012946c2
2 changed files with 11 additions and 9 deletions
|
@ -30,7 +30,7 @@
|
|||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"2018-08-29 13:46:44 CEST\n",
|
||||
"2018-08-29 14:00:36 CEST\n",
|
||||
"\n",
|
||||
"CPython 3.6.5\n",
|
||||
"IPython 6.5.0\n",
|
||||
|
@ -82,15 +82,16 @@
|
|||
"from utils import (\n",
|
||||
" ALL_COLUMNS,\n",
|
||||
" ALL_VARIABLES,\n",
|
||||
" LABEL_COLUMNS, # groups nominal and ordinal\n",
|
||||
" NUMERIC_VARIABLES, # groups continuous and discrete\n",
|
||||
" CONTINUOUS_COLUMNS,\n",
|
||||
" CONTINUOUS_VARIABLES,\n",
|
||||
" DISCRETE_COLUMNS,\n",
|
||||
" DISCRETE_VARIABLES,\n",
|
||||
" INDEX_COLUMNS,\n",
|
||||
" LABEL_COLUMNS, # groups nominal and ordinal\n",
|
||||
" LABEL_TYPES,\n",
|
||||
" NOMINAL_COLUMNS,\n",
|
||||
" NOMINAL_VARIABLES,\n",
|
||||
" NUMERIC_VARIABLES, # groups continuous and discrete\n",
|
||||
" ORDINAL_COLUMNS,\n",
|
||||
" ORDINAL_VARIABLES,\n",
|
||||
" correct_column_names,\n",
|
||||
|
@ -184,7 +185,7 @@
|
|||
"correct_column_names(df.columns)\n",
|
||||
"# Use a compound index and keep both\n",
|
||||
"# identifying columns in the DataFrame.\n",
|
||||
"df = df.set_index([\"Order\", \"PID\"])\n",
|
||||
"df = df.set_index(INDEX_COLUMNS)\n",
|
||||
"# Put the provided columns into the same\n",
|
||||
"# order as in the encoded description file.\n",
|
||||
"# Note that the target variable \"SalePrice\"\n",
|
||||
|
|
11
utils.py
11
utils.py
|
@ -2,8 +2,8 @@
|
|||
|
||||
This module uses the information available on the publication homepage and
|
||||
defines a nested dictionary `ALL_COLUMNS` that can be used to decode the data
|
||||
in the accompanying AmesHousing.xls file in the data folder. For convenience,
|
||||
`ALL_VARIABLES` provides a list of only the column names.
|
||||
in the accompanying Excel file. For convenience, `ALL_VARIABLES` provides a
|
||||
list of only the column names.
|
||||
|
||||
Furthermore, six helper dictionaries `CONTINUOUS_COLUMNS`, `DISCRETE_COLUMNS`,
|
||||
`NUMERIC_COLUMNS`, `NOMINAL_COLUMNS`, `ORDINAL_COLUMNS`, and `LABEL_COLUMNS`
|
||||
|
@ -14,8 +14,8 @@ ordinal columns. For each of the six dictionaries, a list of the actual column
|
|||
names is created with the same name and the suffix "_VARIABLES" instead of
|
||||
"_COLUMNS", e.g., "CONTINUOUS_VARIABLES" instead of "CONTINUOUS_COLUMNS".
|
||||
|
||||
Lastly, the LABEL_TYPES list can be used to quickly check types in a readable
|
||||
way.
|
||||
Lastly, the INDEX_COLUMNS and LABEL_TYPES lists can be used to refer to the
|
||||
actual values in a more readable way.
|
||||
|
||||
Source:
|
||||
https://www.amstat.org/publications/jse/v19n3/decock/DataDocumentation.txt
|
||||
|
@ -31,6 +31,7 @@ import re
|
|||
import requests
|
||||
|
||||
|
||||
INDEX_COLUMNS = ["Order", "PID"]
|
||||
LABEL_TYPES = ["nominal", "ordinal"]
|
||||
# Note that these dictionaries and lists are not actually constants but
|
||||
# filled in during import time which makes them "near"-constant.
|
||||
|
@ -98,7 +99,7 @@ def _extract_meta_data(lines):
|
|||
# The two ID columns and the target variable "SalePrice"
|
||||
# are not put into the helper dicts / lists as they are
|
||||
# treated separately in the modelling anyway.
|
||||
non_feature_columns = ["Order", "PID", "SalePrice"]
|
||||
non_feature_columns = INDEX_COLUMNS + ["SalePrice"]
|
||||
|
||||
for line in lines:
|
||||
# Process the next variable in the list.
|
||||
|
|
Loading…
Reference in a new issue