Add a simple version of predictive models

Alexander Hess 2018-09-05 19:44:00 +02:00
parent 848456f6c6
commit 04403b7603
4 changed files with 6416 additions and 30 deletions

4_predictive_models.ipynb (new file, 6367 lines added)

File diff suppressed because it is too large.

Pipfile

@@ -17,6 +17,7 @@ seaborn = "*"
 missingno = "*"
 sklearn = "*"
 statsmodels = "*"
+tqdm = "*"

 [dev-packages]
 black = "*"

Pipfile.lock (generated, 10 lines changed)

@@ -1,7 +1,7 @@
 {
     "_meta": {
         "hash": {
-            "sha256": "9179f1a05ed984d241ed8fcfa9c66c135100dbc2137f22627ac15977ced9ce87"
+            "sha256": "cd46097e2ebd23accf453936aef1b898dfbf46ff9f2642f494da0e041b7a5992"
         },
         "pipfile-spec": 6,
         "requires": {
@@ -624,6 +624,14 @@
             "markers": "python_version != '3.3.*' and python_version >= '2.7' and python_version != '3.1.*' and python_version != '3.0.*' and python_version != '3.2.*'",
             "version": "==5.1"
         },
+        "tqdm": {
+            "hashes": [
+                "sha256:5ef526702c0d265d5a960a3b27f3971fac13c26cf0fb819294bfa71fc6026c88",
+                "sha256:a3364bd83ce4777320b862e3c8a93d7da91e20a95f06ef79bed7dd71c654cafa"
+            ],
+            "index": "pypi",
+            "version": "==4.25.0"
+        },
         "traitlets": {
             "hashes": [
                 "sha256:9c4bd2d267b7153df9152698efb1050a5d84982d3384a37b2c1f7723ba3e7835",


@@ -24,10 +24,10 @@ Implementation Note:
     This file defines the "constants" it exports dynamically. This is a bit
     advanced but intentional!
 """
-# pragma pylint:disable=global-statement
 import re

+import numpy as np
 import pandas as pd
 import requests
 import tabulate
@@ -140,19 +140,6 @@ def _populate_dicts_and_lists():
     dictionaries and lists are considered derived from it and thus considered
     "secondary".
     """
-    global ALL_VARIABLES
-    global CONTINUOUS_COLUMNS
-    global CONTINUOUS_VARIABLES
-    global DISCRETE_COLUMNS
-    global DISCRETE_VARIABLES
-    global NUMERIC_COLUMNS
-    global NUMERIC_VARIABLES
-    global NOMINAL_COLUMNS
-    global NOMINAL_VARIABLES
-    global ORDINAL_COLUMNS
-    global ORDINAL_VARIABLES
-    global LABEL_COLUMNS
-    global LABEL_VARIABLES
     # The global data structures are not re-assigned to so as to keep all
     # references in the Jupyter notebooks alive. Instead, they are emptied
     # and re-filled.
@@ -175,6 +162,11 @@ def _populate_dicts_and_lists():
         }
     )
     DISCRETE_VARIABLES[:] = sorted(DISCRETE_COLUMNS)
+    FACTOR_VARIABLES[:] = [
+        key
+        for (key, value) in ALL_COLUMNS.items()
+        if value["type"] == "factor"
+    ]
     NUMERIC_COLUMNS.clear()
     NUMERIC_COLUMNS.update({**CONTINUOUS_COLUMNS, **DISCRETE_COLUMNS})
     NUMERIC_VARIABLES[:] = sorted(NUMERIC_COLUMNS)
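The new FACTOR_VARIABLES list is filled via slice assignment for the reason
stated in the comment retained above: the notebooks import these module-level
lists, and only in-place mutation keeps those imported references current. A
minimal sketch of the difference (hypothetical names):

NUMERIC_VARIABLES = ["a", "b"]

alias = NUMERIC_VARIABLES     # e.g., a name imported into a notebook

NUMERIC_VARIABLES[:] = ["c"]  # slice assignment mutates the same list object
assert alias == ["c"]         # the alias sees the update

NUMERIC_VARIABLES = ["d"]     # rebinding creates a new object instead ...
assert alias == ["c"]         # ... so the alias would go stale

This is also why the global statements and the matching pylint pragma could be
dropped in this commit: global is only required to rebind a module-level name,
not to mutate the object it refers to.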
@@ -203,7 +195,6 @@


 def _rename_column(old_name, new_name):
     """Change the name of a column."""
-    global ALL_COLUMNS
     ALL_COLUMNS[new_name] = ALL_COLUMNS[old_name]
     del ALL_COLUMNS[old_name]
@@ -249,7 +240,6 @@ def update_column_descriptions(columns_to_be_kept, *, correct_columns=False):
     After dropping some columns from the DataFrame, these removals must be
     propagated to the helper data structures defined in this module.
     """
-    global ALL_COLUMNS
     if correct_columns:
         correct_column_names(columns_to_be_kept, repopulate=False)
     columns_to_be_removed = list(set(ALL_COLUMNS) - set(columns_to_be_kept))
@@ -279,17 +269,15 @@ def print_column_list(subset=None):
     print(tabulate.tabulate(sorted(columns), tablefmt="plain"))


-def load_clean_data(*, path=None, ordinal_encoded=False):
+def load_clean_data(path=None):
     """Return the clean project data as a pandas DataFrame.

     This utility function ensures that each column is cast to its correct type.

-    It takes the following optional keyword-only arguments:
-    - 'path': path to the clean CSV file (defaults to "data/data_clean.csv")
-    - 'ordinal_encoded': can be set to True to obtain the ordinal columns that
-      are already encoded into ordered integers
+    It takes an optional path argument to a clean CSV file (defaults to
+    "data/data_clean.csv").

-    The target variable "SalePrice" is always included as the last column.
+    The target variables are always included as the last columns.

     Implementation Notes:
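In notebook code, the simplified interface reads as below (a sketch, assuming
the module is importable as utils; the dropped ordinal_encoded flag becomes an
explicit call to encode_ordinals, which is adjusted in the last hunk):

import utils  # assumed import name for this module

df = utils.load_clean_data()    # every column cast to its documented type
df = utils.encode_ordinals(df)  # replaces load_clean_data(ordinal_encoded=True)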
@@ -331,22 +319,30 @@ def load_clean_data(*, path=None, ordinal_encoded=False):
     derived_columns = set(df.columns) - set(ALL_VARIABLES + TARGET_VARIABLES)
     if derived_columns:
         for column in derived_columns:
-            # All derived variables are numeric (including factors).
-            df[column] = df[column].astype(float)
+            # Check if the derived variable is a target variable.
+            for target in TARGET_VARIABLES[:]:
+                if column.startswith(target):
+                    df[column] = df[column].astype(float)
+                    TARGET_VARIABLES.append(column)
+                    break
+            else:
+                df[column] = df[column].astype(float)
+                is_int = (df[column] == df[column].astype(int)).all()
+                n_unique = len(df[column].unique())
+                if is_int & (n_unique == 2):
+                    df[column] = df[column].astype(int)
+                    type_ = "factor"
+                elif is_int & (n_unique < 150):
+                    df[column] = df[column].astype(int)
+                    type_ = "discrete"
+                else:
+                    df[column] = df[column].astype(float)
+                    type_ = "continuous"
             ALL_COLUMNS[column] = {
-                "type": "continuous",
+                "type": type_,
                 "description": "derived variable",
             }
         _populate_dicts_and_lists()

-    # Use integer encoding for ordinal variables.
-    if ordinal_encoded:
-        df = encode_ordinals(df)
-
     return df
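The new branch encodes a simple type-inference rule for derived columns:
integer-valued with exactly two distinct values is treated as a factor,
integer-valued with fewer than 150 distinct values as discrete, and anything
else as continuous. The same rule as a standalone sketch (hypothetical helper,
for illustration only):

import pandas as pd

def infer_derived_type(s: pd.Series) -> str:
    """Mirror the inference rule applied to derived columns above."""
    s = s.astype(float)
    is_int = (s == s.astype(int)).all()
    if is_int and s.nunique() == 2:
        return "factor"
    if is_int and s.nunique() < 150:
        return "discrete"
    return "continuous"

assert infer_derived_type(pd.Series([0.0, 1.0, 0.0])) == "factor"
assert infer_derived_type(pd.Series([1.0, 2.0, 3.0])) == "discrete"
assert infer_derived_type(pd.Series([1.5, 2.0, 3.0])) == "continuous"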
@@ -356,10 +352,24 @@ def encode_ordinals(df):
     df = df.copy()
     for column in df.columns:
         if column in ORDINAL_VARIABLES:
-            df[column] = df[column].cat.codes
+            df[column] = df[column].cat.codes.astype(int)
     return df


+def bias_score(y_true, y_pred):
+    """Determine the bias of a prediction."""
+    assert y_true.shape == y_pred.shape
+    assert y_true.ndim == 1
+    return np.mean(y_pred - y_true)
+
+
+def max_deviation(y_true, y_pred):
+    """Determine the maximum deviation of a prediction."""
+    assert y_true.shape == y_pred.shape
+    assert y_true.ndim == 1
+    return np.max(np.abs(y_pred - y_true))
+
+
 # This code is executed once during import time and
 # populates all the "constants" directly or indirectly.
 _extract_meta_data(_get_lines())
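Both new metrics follow scikit-learn's (y_true, y_pred) argument convention,
so they could be wrapped with sklearn.metrics.make_scorer for use in
cross-validation. A quick worked example (hypothetical numbers):

import numpy as np

y_true = np.array([100.0, 200.0, 300.0])
y_pred = np.array([110.0, 190.0, 330.0])

print(bias_score(y_true, y_pred))     # (10 - 10 + 30) / 3 = 10.0, over-predicts on average
print(max_deviation(y_true, y_pred))  # max(|10|, |-10|, |30|) = 30.0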