Add a simple version of predictive models

Alexander Hess 2018-09-05 19:44:00 +02:00
parent 848456f6c6
commit 04403b7603
4 changed files with 6416 additions and 30 deletions

4_predictive_models.ipynb (new file, 6367 lines)

File diff suppressed because it is too large.

Pipfile

@@ -17,6 +17,7 @@ seaborn = "*"
 missingno = "*"
 sklearn = "*"
 statsmodels = "*"
+tqdm = "*"
 
 [dev-packages]
 black = "*"
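Note: tqdm is a progress-bar library. A minimal sketch of how the new dependency might be used in the notebook (the loop and its values are made up for illustration):

```python
from tqdm import tqdm

# Wrapping any iterable in tqdm() renders a progress bar while iterating.
# Hypothetical hyperparameter sweep; not taken from the notebook itself.
for alpha in tqdm([0.001, 0.01, 0.1, 1.0]):
    pass  # e.g., fit and cross-validate one model per value
```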

Pipfile.lock (generated, 10 lines changed)

@ -1,7 +1,7 @@
{ {
"_meta": { "_meta": {
"hash": { "hash": {
"sha256": "9179f1a05ed984d241ed8fcfa9c66c135100dbc2137f22627ac15977ced9ce87" "sha256": "cd46097e2ebd23accf453936aef1b898dfbf46ff9f2642f494da0e041b7a5992"
}, },
"pipfile-spec": 6, "pipfile-spec": 6,
"requires": { "requires": {
@@ -624,6 +624,14 @@
             "markers": "python_version != '3.3.*' and python_version >= '2.7' and python_version != '3.1.*' and python_version != '3.0.*' and python_version != '3.2.*'",
             "version": "==5.1"
         },
+        "tqdm": {
+            "hashes": [
+                "sha256:5ef526702c0d265d5a960a3b27f3971fac13c26cf0fb819294bfa71fc6026c88",
+                "sha256:a3364bd83ce4777320b862e3c8a93d7da91e20a95f06ef79bed7dd71c654cafa"
+            ],
+            "index": "pypi",
+            "version": "==4.25.0"
+        },
         "traitlets": {
             "hashes": [
                 "sha256:9c4bd2d267b7153df9152698efb1050a5d84982d3384a37b2c1f7723ba3e7835",

utils.py

@@ -24,10 +24,10 @@ Implementation Note:
 This file defines the "constants" it exports dynamically. This is a bit
 advanced but intentional!
 """
-# pragma pylint:disable=global-statement
 
 import re
 
+import numpy as np
 import pandas as pd
 import requests
 import tabulate
@@ -140,19 +140,6 @@ def _populate_dicts_and_lists():
     dictionaries and lists are considered derived from it and thus considered
     "secondary".
     """
-    global ALL_VARIABLES
-    global CONTINUOUS_COLUMNS
-    global CONTINUOUS_VARIABLES
-    global DISCRETE_COLUMNS
-    global DISCRETE_VARIABLES
-    global NUMERIC_COLUMNS
-    global NUMERIC_VARIABLES
-    global NOMINAL_COLUMNS
-    global NOMINAL_VARIABLES
-    global ORDINAL_COLUMNS
-    global ORDINAL_VARIABLES
-    global LABEL_COLUMNS
-    global LABEL_VARIABLES
     # The global data structures are not re-assigned to so as to keep all
     # references in the Jupyter notebooks alive. Instead, they are emptied
     # and re-filled.
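Note: the removed `global` statements were never required. As the retained comment explains, the module-level containers are emptied and re-filled in place rather than re-assigned, and in-place mutation does not rebind a name. A standalone sketch of the principle, with made-up names:

```python
# Mutating a module-level container in place needs no `global` declaration,
# and every outside reference keeps seeing the same (updated) object.
COLUMNS = {"a": 1}
VARIABLES = ["a"]

def refresh():
    COLUMNS.clear()             # same dict object, emptied in place
    COLUMNS.update({"b": 2})    # ... and re-filled
    VARIABLES[:] = ["b"]        # slice assignment mutates the same list

refresh()
print(COLUMNS, VARIABLES)  # {'b': 2} ['b']
```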
@@ -175,6 +162,11 @@
         }
     )
     DISCRETE_VARIABLES[:] = sorted(DISCRETE_COLUMNS)
+    FACTOR_VARIABLES[:] = [
+        key
+        for (key, value) in ALL_COLUMNS.items()
+        if value["type"] == "factor"
+    ]
     NUMERIC_COLUMNS.clear()
     NUMERIC_COLUMNS.update({**CONTINUOUS_COLUMNS, **DISCRETE_COLUMNS})
     NUMERIC_VARIABLES[:] = sorted(NUMERIC_COLUMNS)
@@ -203,7 +195,6 @@ def _populate_dicts_and_lists():
 
 def _rename_column(old_name, new_name):
     """Change the name of a column."""
-    global ALL_COLUMNS
     ALL_COLUMNS[new_name] = ALL_COLUMNS[old_name]
     del ALL_COLUMNS[old_name]
@@ -249,7 +240,6 @@ def update_column_descriptions(columns_to_be_kept, *, correct_columns=False):
     After dropping some columns from the DataFrame, these removals must be
     propagated to the helper data structures defined in this module.
     """
-    global ALL_COLUMNS
     if correct_columns:
         correct_column_names(columns_to_be_kept, repopulate=False)
     columns_to_be_removed = list(set(ALL_COLUMNS) - set(columns_to_be_kept))
@@ -279,17 +269,15 @@ def print_column_list(subset=None):
     print(tabulate.tabulate(sorted(columns), tablefmt="plain"))
 
 
-def load_clean_data(*, path=None, ordinal_encoded=False):
+def load_clean_data(path=None):
     """Return the clean project data as a pandas DataFrame.
 
     This utility function ensures that each column is cast to its correct type.
-    It takes the following optional keyword-only arguments:
-    - 'path': path to the clean CSV file (defaults to "data/data_clean.csv")
-    - 'ordinal_encoded': can be set to True to obtain the ordinal columns that
-      are already encoded into ordered integers
+    It takes an optional path argument to a clean CSV file (defaults to
+    "data/data_clean.csv").
 
-    The target variable "SalePrice" is always included as the last column.
+    The target variables are always included as the last columns.
 
     Implementation Notes:
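Note: with the `ordinal_encoded` flag removed, integer-encoding the ordinals becomes an explicit second step via the module's public `encode_ordinals`. A hedged usage sketch (assuming the module is importable as `utils`):

```python
import utils  # assumed import name for the module shown here

df = utils.load_clean_data()        # columns cast, targets as last columns
df_enc = utils.encode_ordinals(df)  # ordinal categories -> integer codes
```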
@@ -331,22 +319,30 @@ def load_clean_data(*, path=None, ordinal_encoded=False):
     derived_columns = set(df.columns) - set(ALL_VARIABLES + TARGET_VARIABLES)
     if derived_columns:
         for column in derived_columns:
+            # All derived variables are numeric (including factors).
+            df[column] = df[column].astype(float)
             # Check if the derived variable is a target variable.
             for target in TARGET_VARIABLES[:]:
                 if column.startswith(target):
-                    df[column] = df[column].astype(float)
                     TARGET_VARIABLES.append(column)
                     break
             else:
-                df[column] = df[column].astype(float)
+                is_int = (df[column] == df[column].astype(int)).all()
+                n_unique = len(df[column].unique())
+                if is_int & (n_unique == 2):
+                    df[column] = df[column].astype(int)
+                    type_ = "factor"
+                elif is_int & (n_unique < 150):
+                    df[column] = df[column].astype(int)
+                    type_ = "discrete"
+                else:
+                    df[column] = df[column].astype(float)
+                    type_ = "continuous"
                 ALL_COLUMNS[column] = {
-                    "type": "continuous",
+                    "type": type_,
                     "description": "derived variable",
                 }
         _populate_dicts_and_lists()
-    # Use integer encoding for ordinal variables.
-    if ordinal_encoded:
-        df = encode_ordinals(df)
     return df
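Note: the new else-branch classifies a derived column by two checks: whether every value survives a round-trip through int, and how many unique values it has (exactly 2 gives "factor", fewer than 150 gives "discrete", anything else "continuous"). A self-contained sketch of the same heuristic (the sample Series are made up; the 150 threshold is taken from the diff):

```python
import pandas as pd

def infer_type(s: pd.Series) -> str:
    """Illustrative re-implementation of the heuristic above."""
    is_int = (s == s.astype(int)).all()
    n_unique = len(s.unique())
    if is_int and n_unique == 2:
        return "factor"      # binary indicator
    if is_int and n_unique < 150:
        return "discrete"    # integer-valued with few levels
    return "continuous"

print(infer_type(pd.Series([0.0, 1.0, 1.0])))       # factor
print(infer_type(pd.Series([1.0, 2.0, 3.0, 7.0])))  # discrete
print(infer_type(pd.Series([1.5, 2.3, 3.1])))       # continuous
```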
@@ -356,10 +352,24 @@ def encode_ordinals(df):
     df = df.copy()
     for column in df.columns:
         if column in ORDINAL_VARIABLES:
-            df[column] = df[column].cat.codes
+            df[column] = df[column].cat.codes.astype(int)
     return df
 
 
+def bias_score(y_true, y_pred):
+    """Determine the bias of a prediction."""
+    assert y_true.shape == y_pred.shape
+    assert y_true.ndim == 1
+    return np.mean(y_pred - y_true)
+
+
+def max_deviation(y_true, y_pred):
+    """Determine the maximum deviation of a prediction."""
+    assert y_true.shape == y_pred.shape
+    assert y_true.ndim == 1
+    return np.max(np.abs(y_pred - y_true))
+
+
 # This code is executed once during import time and
 # populates all the "constants" directly or indirectly.
 _extract_meta_data(_get_lines())
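Note on the two new metrics: `bias_score` averages signed errors, so over- and under-predictions cancel out, while `max_deviation` reports the single worst error; together they complement symmetric metrics such as MAE. A hedged usage example (import path and numbers are made up):

```python
import numpy as np

from utils import bias_score, max_deviation  # assumed import path

y_true = np.array([200_000.0, 150_000.0, 310_000.0])
y_pred = np.array([195_000.0, 160_000.0, 305_000.0])

print(bias_score(y_true, y_pred))     # 0.0: the +/- errors cancel out
print(max_deviation(y_true, y_pred))  # 10000.0: the worst single error
```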