Add INDEX_COLUMNS constant for more readability
This commit is contained in:
parent
38c4dd5aef
commit
d5012946c2
2 changed files with 11 additions and 9 deletions
|
@ -30,7 +30,7 @@
|
||||||
"name": "stdout",
|
"name": "stdout",
|
||||||
"output_type": "stream",
|
"output_type": "stream",
|
||||||
"text": [
|
"text": [
|
||||||
"2018-08-29 13:46:44 CEST\n",
|
"2018-08-29 14:00:36 CEST\n",
|
||||||
"\n",
|
"\n",
|
||||||
"CPython 3.6.5\n",
|
"CPython 3.6.5\n",
|
||||||
"IPython 6.5.0\n",
|
"IPython 6.5.0\n",
|
||||||
|
@ -82,15 +82,16 @@
|
||||||
"from utils import (\n",
|
"from utils import (\n",
|
||||||
" ALL_COLUMNS,\n",
|
" ALL_COLUMNS,\n",
|
||||||
" ALL_VARIABLES,\n",
|
" ALL_VARIABLES,\n",
|
||||||
" LABEL_COLUMNS, # groups nominal and ordinal\n",
|
|
||||||
" NUMERIC_VARIABLES, # groups continuous and discrete\n",
|
|
||||||
" CONTINUOUS_COLUMNS,\n",
|
" CONTINUOUS_COLUMNS,\n",
|
||||||
" CONTINUOUS_VARIABLES,\n",
|
" CONTINUOUS_VARIABLES,\n",
|
||||||
" DISCRETE_COLUMNS,\n",
|
" DISCRETE_COLUMNS,\n",
|
||||||
" DISCRETE_VARIABLES,\n",
|
" DISCRETE_VARIABLES,\n",
|
||||||
|
" INDEX_COLUMNS,\n",
|
||||||
|
" LABEL_COLUMNS, # groups nominal and ordinal\n",
|
||||||
" LABEL_TYPES,\n",
|
" LABEL_TYPES,\n",
|
||||||
" NOMINAL_COLUMNS,\n",
|
" NOMINAL_COLUMNS,\n",
|
||||||
" NOMINAL_VARIABLES,\n",
|
" NOMINAL_VARIABLES,\n",
|
||||||
|
" NUMERIC_VARIABLES, # groups continuous and discrete\n",
|
||||||
" ORDINAL_COLUMNS,\n",
|
" ORDINAL_COLUMNS,\n",
|
||||||
" ORDINAL_VARIABLES,\n",
|
" ORDINAL_VARIABLES,\n",
|
||||||
" correct_column_names,\n",
|
" correct_column_names,\n",
|
||||||
|
@ -184,7 +185,7 @@
|
||||||
"correct_column_names(df.columns)\n",
|
"correct_column_names(df.columns)\n",
|
||||||
"# Use a compound index and keep both\n",
|
"# Use a compound index and keep both\n",
|
||||||
"# identifying columns in the DataFrame.\n",
|
"# identifying columns in the DataFrame.\n",
|
||||||
"df = df.set_index([\"Order\", \"PID\"])\n",
|
"df = df.set_index(INDEX_COLUMNS)\n",
|
||||||
"# Put the provided columns into the same\n",
|
"# Put the provided columns into the same\n",
|
||||||
"# order as in the encoded description file.\n",
|
"# order as in the encoded description file.\n",
|
||||||
"# Note that the target variable \"SalePrice\"\n",
|
"# Note that the target variable \"SalePrice\"\n",
|
||||||
|
|
11
utils.py
11
utils.py
|
@ -2,8 +2,8 @@
|
||||||
|
|
||||||
This module uses the information available on the publication homepage and
|
This module uses the information available on the publication homepage and
|
||||||
defines a nested dictionary `ALL_COLUMNS` that can be used to decode the data
|
defines a nested dictionary `ALL_COLUMNS` that can be used to decode the data
|
||||||
in the accompanying AmesHousing.xls file in the data folder. For convenience,
|
in the accompanying Excel file. For convenience, `ALL_VARIABLES` provides a
|
||||||
`ALL_VARIABLES` provides a list of only the column names.
|
list of only the column names.
|
||||||
|
|
||||||
Furthermore, six helper dictionaries `CONTINUOUS_COLUMNS`, `DISCRETE_COLUMNS`,
|
Furthermore, six helper dictionaries `CONTINUOUS_COLUMNS`, `DISCRETE_COLUMNS`,
|
||||||
`NUMERIC_COLUMNS`, `NOMINAL_COLUMNS`, `ORDINAL_COLUMNS`, and `LABEL_COLUMNS`
|
`NUMERIC_COLUMNS`, `NOMINAL_COLUMNS`, `ORDINAL_COLUMNS`, and `LABEL_COLUMNS`
|
||||||
|
@ -14,8 +14,8 @@ ordinal columns. For each of the six dictionaries, a list of the actual column
|
||||||
names is created with the same name and the suffix "_VARIABLES" instead of
|
names is created with the same name and the suffix "_VARIABLES" instead of
|
||||||
"_COLUMNS", e.g., "CONTINUOUS_VARIABLES" instead of "CONTINUOUS_COLUMNS".
|
"_COLUMNS", e.g., "CONTINUOUS_VARIABLES" instead of "CONTINUOUS_COLUMNS".
|
||||||
|
|
||||||
Lastly, the LABEL_TYPES list can be used to quickly check types in a readable
|
Lastly, the INDEX_COLUMNS and LABEL_TYPES lists can be used to refer to the
|
||||||
way.
|
actual values in a more readable way.
|
||||||
|
|
||||||
Source:
|
Source:
|
||||||
https://www.amstat.org/publications/jse/v19n3/decock/DataDocumentation.txt
|
https://www.amstat.org/publications/jse/v19n3/decock/DataDocumentation.txt
|
||||||
|
@ -31,6 +31,7 @@ import re
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
|
|
||||||
|
INDEX_COLUMNS = ["Order", "PID"]
|
||||||
LABEL_TYPES = ["nominal", "ordinal"]
|
LABEL_TYPES = ["nominal", "ordinal"]
|
||||||
# Note that these dictionaries and lists are not actually constants but
|
# Note that these dictionaries and lists are not actually constants but
|
||||||
# filled in during import time which makes them "near"-constant.
|
# filled in during import time which makes them "near"-constant.
|
||||||
|
@ -98,7 +99,7 @@ def _extract_meta_data(lines):
|
||||||
# The two ID columns and the target variable "SalePrice"
|
# The two ID columns and the target variable "SalePrice"
|
||||||
# are not put into the helper dicts / lists as they are
|
# are not put into the helper dicts / lists as they are
|
||||||
# treated separately in the modelling anyway.
|
# treated separately in the modelling anyway.
|
||||||
non_feature_columns = ["Order", "PID", "SalePrice"]
|
non_feature_columns = INDEX_COLUMNS + ["SalePrice"]
|
||||||
|
|
||||||
for line in lines:
|
for line in lines:
|
||||||
# Process the next variable in the list.
|
# Process the next variable in the list.
|
||||||
|
|
Loading…
Reference in a new issue