2018-08-29 02:54:44 +02:00
|
|
|
"""Description of the Ames Housing dataset.
|
|
|
|
|
|
|
|
This module uses the information available on the publication homepage and
|
|
|
|
defines a nested dictionary `ALL_COLUMNS` that can be used to decode the data
|
2018-08-29 13:59:07 +02:00
|
|
|
in the accompanying Excel file. For convenience, `ALL_VARIABLES` provides a
|
|
|
|
list of only the column names.
|
2018-08-29 02:54:44 +02:00
|
|
|
|
|
|
|
Furthermore, six helper dictionaries `CONTINUOUS_COLUMNS`, `DISCRETE_COLUMNS`,
|
|
|
|
`NUMERIC_COLUMNS`, `NOMINAL_COLUMNS`, `ORDINAL_COLUMNS`, and `LABEL_COLUMNS`
|
|
|
|
are defined that provide just the subset of the columns with the corresponding
|
|
|
|
data types. Note that the numeric dictionary unifies the continuous and
|
|
|
|
discrete data columns while the label dictionary unifies the nominal and
|
|
|
|
ordinal columns. For each of the six dictionaries, a list of the actual column
|
|
|
|
names is created with the same name and the suffix "_VARIABLES" instead of
|
|
|
|
"_COLUMNS", e.g., "CONTINUOUS_VARIABLES" instead of "CONTINUOUS_COLUMNS".
|
|
|
|
|
2018-08-29 13:59:07 +02:00
|
|
|
Lastly, the INDEX_COLUMNS and LABEL_TYPES lists can be used to refer to the
|
|
|
|
actual values in a more readable way.
|
2018-08-29 02:54:44 +02:00
|
|
|
|
|
|
|
Source:
|
|
|
|
https://www.amstat.org/publications/jse/v19n3/decock/DataDocumentation.txt
|
|
|
|
|
|
|
|
Implementation Note:
|
|
|
|
This file defines the "constants" it exports dynamically. This is a bit
|
|
|
|
advanced but intentional!
|
|
|
|
"""
|
2018-09-01 16:52:46 +02:00
|
|
|
# pragma pylint:disable=global-statement
|
2018-08-29 02:54:44 +02:00
|
|
|
|
|
|
|
import re
|
|
|
|
|
2018-09-02 18:54:33 +02:00
|
|
|
import pandas as pd
|
2018-08-29 02:54:44 +02:00
|
|
|
import requests
|
2018-09-01 16:52:46 +02:00
|
|
|
import tabulate
|
2018-08-29 02:54:44 +02:00
|
|
|
|
|
|
|
|
2018-08-29 13:59:07 +02:00
|
|
|
# Fixed names used throughout the module: the two index columns, the types
# that mark a column as a label column, and the target variable.
INDEX_COLUMNS = ["Order", "PID"]
LABEL_TYPES = ["nominal", "ordinal"]
TARGET_VARIABLES = ["SalePrice"]

# The containers below are only "near"-constants: they start out empty and
# are filled in during import time.

# Dictionaries keyed by column name.
ALL_COLUMNS = {}
CONTINUOUS_COLUMNS = {}
DISCRETE_COLUMNS = {}
NUMERIC_COLUMNS = {}
NOMINAL_COLUMNS = {}
ORDINAL_COLUMNS = {}
LABEL_COLUMNS = {}

# Lists holding just the column names of the corresponding dictionaries.
ALL_VARIABLES = []
CONTINUOUS_VARIABLES = []
DISCRETE_VARIABLES = []
NUMERIC_VARIABLES = []
NOMINAL_VARIABLES = []
ORDINAL_VARIABLES = []
LABEL_VARIABLES = []
|
|
|
|
|
|
|
|
|
|
|
|
def _get_lines():
    """Obtain the non-empty lines of the data description file.

    The file is read from the local cache "data_documentation.txt". If no
    cached file exists, it is downloaded from the publication homepage and
    written to the cache first.

    Returns:
        A list with the stripped, non-empty lines found at the line indexes
        13 up to (excluding) 545 of the file, i.e., without header and footer.

    Raises:
        requests.HTTPError: If the download is answered with an error status
            code. Nothing is cached in that case.
    """
    # Read cached data file.
    try:
        with open("data_documentation.txt", "r", encoding="utf-8") as file:
            lines = file.readlines()
    # If there is no cached file, obtain it from the original source.
    except FileNotFoundError:
        response = requests.get(
            "https://www.amstat.org/publications"
            "/jse/v19n3/decock/DataDocumentation.txt",
            timeout=30,  # Do not hang forever on an unresponsive server.
        )
        # An error page must neither be cached nor parsed as documentation.
        response.raise_for_status()
        # Cache the retrieved file. The line endings are written verbatim
        # (newline="") because translating the "\r\n" sequences a second
        # time would add empty lines on platforms that translate newlines,
        # which would shift the header / footer offsets used below.
        with open(
            "data_documentation.txt", "w", encoding="utf-8", newline=""
        ) as file:
            file.write(response.text)
        lines = response.text.split("\r\n")
    # Remove header, footer, and empty lines.
    # NOTE(review): the first argument of replace() renders as a plain blank
    # in the reviewed source, which would make the call a no-op. It is
    # assumed to be a non-breaking space and spelled explicitly - confirm.
    lines = [x.replace("\xa0", " ").strip() for x in lines[13:545]]
    return [x for x in lines if x != ""]
|
|
|
|
|
|
|
|
|
|
|
|
def _extract_meta_data(lines):
    """Extract variables and their realizations from the given lines.

    This function parses the lines from the data documentation file and
    writes the results into the global dictionary ALL_COLUMNS that is exported
    by this module.

    A line can be a variable consisting of:
    - the name of the variable / column,
    - the variable's type (continuous, discrete, nominal, or ordinal), and
    - a text description of the variable.

    A line can also be a realization of a label column consisting of:
    - the encoding,
    - and the description.

    Every variable becomes an entry in ALL_COLUMNS with the keys "type"
    (lower-cased) and "description". Label variables additionally get a
    "lookups" dictionary (encoding -> description), and ordinal variables an
    "order" list with the encodings in the order in which they are given.

    Implementation note:
    As the lines come in order, a realization always belongs to the last
    line representing a variable. Lines that are neither a variable nor a
    realization of a label variable are skipped. This includes lines before
    the first variable, lines following a non-label or non-feature variable,
    and lines without a tab.
    """
    variable = re.compile(r"^(.*)(?:[\s]+)\(([\w]*)\)(?:\t)?: (.*)$")
    realization = re.compile(r"^(.*)\t(.*)$")
    # The two ID columns and the target variable "SalePrice"
    # are not put into the helper dicts / lists as they are
    # treated separately in the modelling anyways.
    non_feature_columns = INDEX_COLUMNS + TARGET_VARIABLES
    # The entry of the label variable that upcoming realization lines belong
    # to. It is None whenever the last variable seen does not take any.
    current = None

    for line in lines:
        # Process the next variable in the list.
        match = variable.match(line)
        if match:
            name, type_, description = match.groups()
            # Skip the non-feature columns (that are always non-label columns).
            if name in non_feature_columns:
                current = None
                continue
            type_ = type_.lower()
            # Create an entry for the next variable in the list.
            entry = {"type": type_, "description": description}
            current = None
            # Only if the variable is a label type, a lookup table is needed.
            if type_ in LABEL_TYPES:
                entry["lookups"] = {}
                # Ordinal variables also store the order of their realizations
                # exactly as defined in the data description file.
                if type_ == "ordinal":
                    entry["order"] = []
                current = entry
            ALL_COLUMNS[name] = entry
            continue
        # Without a label variable to attach to, the line carries no data.
        if current is None:
            continue
        # Add label realizations to a previously found label variable.
        match = realization.match(line)
        if match is None:
            continue
        code, description = match.groups()
        code = code.strip()
        current["lookups"][code] = description
        if "order" in current:
            current["order"].append(code)
|
|
|
|
|
|
|
|
|
|
|
|
def _populate_dicts_and_lists():
    """Rebuild all "secondary" dictionaries and lists from ALL_COLUMNS.

    ALL_COLUMNS is the "main" dictionary. Each of the other *_COLUMNS
    dictionaries holds the entries of one type (or the union of two
    types), and each *_VARIABLES list holds the sorted keys of its
    dictionary.

    None of the global containers is re-assigned; they are emptied and
    re-filled in place so that references held elsewhere, e.g., in the
    Jupyter notebooks, stay alive.
    """

    def columns_of(type_):
        """Select the entries of ALL_COLUMNS that have the given type."""
        return {
            name: meta
            for name, meta in ALL_COLUMNS.items()
            if meta["type"] == type_
        }

    def refill(mapping, names, content):
        """Swap a dictionary's content in place and mirror its sorted keys."""
        mapping.clear()
        mapping.update(content)
        names[:] = sorted(mapping)

    ALL_VARIABLES[:] = sorted(ALL_COLUMNS)
    refill(CONTINUOUS_COLUMNS, CONTINUOUS_VARIABLES, columns_of("continuous"))
    refill(DISCRETE_COLUMNS, DISCRETE_VARIABLES, columns_of("discrete"))
    # The numeric columns unify the continuous and the discrete ones.
    refill(
        NUMERIC_COLUMNS,
        NUMERIC_VARIABLES,
        {**CONTINUOUS_COLUMNS, **DISCRETE_COLUMNS},
    )
    refill(NOMINAL_COLUMNS, NOMINAL_VARIABLES, columns_of("nominal"))
    refill(ORDINAL_COLUMNS, ORDINAL_VARIABLES, columns_of("ordinal"))
    # The label columns unify the nominal and the ordinal ones.
    refill(
        LABEL_COLUMNS,
        LABEL_VARIABLES,
        {**NOMINAL_COLUMNS, **ORDINAL_COLUMNS},
    )
|
|
|
|
|
|
|
|
|
|
|
|
def _rename_column(old_name, new_name):
    """Change the name of a column in ALL_COLUMNS.

    The entry stored under `old_name` is moved to the key `new_name`; an
    entry already stored under `new_name` is overwritten. The "secondary"
    dictionaries and lists are not touched.

    Raises:
        KeyError: If there is no column called `old_name`.
    """
    # The old key is removed before the new one is set. That way, "renaming"
    # a column to its current name keeps the entry instead of deleting it.
    ALL_COLUMNS[new_name] = ALL_COLUMNS.pop(old_name)
|
|
|
|
|
|
|
|
|
2018-09-01 16:52:46 +02:00
|
|
|
def correct_column_names(data_columns, *, repopulate=True):
    """Align the column names of the description file with the data's names.

    In rare cases, a variable's name in the data description file differs
    slightly from the column's name in the data, i.e., it is truncated or
    spaces or dashes are placed differently.

    Every described column (taken from ALL_VARIABLES) that is missing in
    `data_columns` is renamed in ALL_COLUMNS to the first data column that
    is regarded as the same column. With `repopulate` set to True (the
    default), the change is propagated to all "secondary" dictionaries and
    lists.

    Returns:
        A dictionary mapping the old (description file) names to the new
        (data) names.
    """

    def is_same_column(desc_name, data_name):
        """Check if a data column matches a described column."""
        data_without_spaces = data_name.replace(" ", "")
        return (
            # Column name was truncated in description file.
            data_name.startswith(desc_name)
            # Spaces between words in Excel were removed.
            or data_without_spaces == desc_name
            # Spaces between words in description file were removed.
            or data_without_spaces == desc_name.replace(" ", "")
            # Dashes in description file were removed.
            or data_name == desc_name.replace("-", "")
        )

    renamed = {}
    for desc_column in ALL_VARIABLES:
        if desc_column in data_columns:
            continue
        # Only the first matching data column is used.
        for data_column in data_columns:
            if is_same_column(desc_column, data_column):
                _rename_column(desc_column, data_column)
                renamed[desc_column] = data_column
                break
    # Propagate the change to all "secondary" dictionaries and lists.
    if repopulate:
        _populate_dicts_and_lists()
    return renamed
|
2018-08-29 02:54:44 +02:00
|
|
|
|
|
|
|
|
2018-09-01 16:52:46 +02:00
|
|
|
def update_column_descriptions(columns_to_be_kept, *, correct_columns=False):
    """Drop discarded columns from all the module's exported data structures.

    After dropping some columns from the DataFrame, these removals must be
    propagated to the helper data structures defined in this module. Every
    key of ALL_COLUMNS that is not among `columns_to_be_kept` is deleted,
    and the "secondary" dictionaries and lists are rebuilt afterwards.

    If `correct_columns` is True, the column names are first aligned with
    `columns_to_be_kept` via correct_column_names().

    Returns a dictionary of all the columns with changed names (empty if
    `correct_columns` is False).
    """
    renamed = (
        correct_column_names(columns_to_be_kept, repopulate=False)
        if correct_columns
        else {}
    )
    for column in set(ALL_COLUMNS).difference(columns_to_be_kept):
        del ALL_COLUMNS[column]
    # Propagate the change to all "secondary" dictionaries and lists.
    _populate_dicts_and_lists()
    return renamed
|
2018-08-29 11:20:47 +02:00
|
|
|
|
|
|
|
|
2018-09-01 16:52:46 +02:00
|
|
|
def print_column_list(subset=None):
    """Print (a subset of) the data's column headers.

    The columns are printed in sorted order as a plain table, each one
    followed by its description from ALL_COLUMNS. Columns in `subset` that
    are not listed in ALL_VARIABLES are printed with an empty description.

    Note that this function is built to handle both *_COLUMNS dicts and
    *_VARIABLES lists. If no `subset` is given, all columns in ALL_VARIABLES
    are printed.
    """
    described = set(ALL_VARIABLES)
    requested = described if subset is None else set(subset)
    # Handle variables without description separately. The set is computed
    # on every path (it is simply empty without a subset) so that the
    # default call does not fail on an undefined name.
    without_desc = requested - described
    columns = [
        (name, ALL_COLUMNS[name]["description"])
        for name in requested - without_desc
    ]
    columns.extend((name, "") for name in without_desc)
    print(tabulate.tabulate(sorted(columns), tablefmt="plain"))
|
2018-09-01 16:52:46 +02:00
|
|
|
|
|
|
|
|
2018-09-02 23:25:07 +02:00
|
|
|
def load_clean_data(subset=None, ordinal_encoded=False):
    """Return the clean project data as a pandas DataFrame.

    The data are read from "data_clean.csv" and every column is cast to its
    correct type: continuous columns and the target become floats, discrete
    columns integers, and nominal / ordinal columns (un)ordered Categoricals.

    The optional `subset` is a list of columns to be returned, and with
    `ordinal_encoded` set to True the ordinal columns come already encoded
    into ordered integers.

    The target variable "SalePrice" is always included as the last column.

    Implementation Notes:

    All columns are first read in as generic objects. Then, the column names
    in the global dicts and lists are updated to reflect the slightly
    different column names (between data and description files). Only after
    that can the numeric columns be cast correctly.

    Strings like "NA" represent actual label values here. Therefore, pandas'
    default conversion of such strings into missing values is turned off.

    As the column names differ slightly from the data description file, the
    subsetting is done after loading the CSV, and columns mentioned in the
    subset under their description file name are translated to the name
    used in the data.
    """
    # pragma pylint:disable=invalid-name
    df = pd.read_csv(
        "data_clean.csv",
        index_col=INDEX_COLUMNS,
        dtype=object,
        na_values="",  # There are no missing values in the clean data file.
        keep_default_na=False,  # "NA" strings are casted as actual values.
    )
    # Remove columns that are in the description but not in the data file.
    renamed = update_column_descriptions(df.columns, correct_columns=True)

    # Cast the numeric types correctly.
    numeric_casts = (
        (CONTINUOUS_VARIABLES + TARGET_VARIABLES, float),
        (DISCRETE_VARIABLES, int),
    )
    for names, numeric_type in numeric_casts:
        for name in names:
            df[name] = df[name].astype(numeric_type)

    # Cast the label types as Categoricals.
    make_dtype = pd.api.types.CategoricalDtype
    for name, meta in NOMINAL_COLUMNS.items():
        df[name] = df[name].astype(
            make_dtype(meta["lookups"].keys(), ordered=False)
        )
    for name, meta in ORDINAL_COLUMNS.items():
        # The categories are the stored realizations in reversed order.
        df[name] = df[name].astype(
            make_dtype(reversed(meta["order"]), ordered=True)
        )

    # Mirror the renaming and dropping of columns
    # for the provided list of columns.
    # Note that the target variable goes last.
    if subset is not None:
        wanted = set(subset)
        wanted.discard("SalePrice")
        for old_name, new_name in renamed.items():
            if old_name in wanted:
                wanted = (wanted - {old_name}) | {new_name}
        kept = sorted(wanted.intersection(df.columns))
        df = df[kept + TARGET_VARIABLES]

    # Use integer encoding for ordinal variables.
    return encode_ordinals(df) if ordinal_encoded else df
|
|
|
|
|
|
|
|
|
|
|
|
def encode_ordinals(df):
    """Return a copy of `df` with ordinal labels replaced by integer codes.

    Only the columns listed in ORDINAL_VARIABLES are converted (via their
    categorical codes); all other columns and the passed in DataFrame
    itself are left untouched.
    """
    # pragma pylint:disable=invalid-name
    encoded = df.copy()
    ordinal_columns = [
        name for name in encoded.columns if name in ORDINAL_VARIABLES
    ]
    for name in ordinal_columns:
        encoded[name] = encoded[name].cat.codes
    return encoded
|
|
|
|
|
|
|
|
|
2018-08-29 02:54:44 +02:00
|
|
|
# This code is executed once during import time and
# populates all the "constants" directly or indirectly:
# first, ALL_COLUMNS is filled from the lines of the data description file;
# then, all other dictionaries and lists are derived from ALL_COLUMNS.
_extract_meta_data(_get_lines())
_populate_dicts_and_lists()
|