Add a simple version of predictive models

2018-09-05 19:44:00 +02:00 · 2018-09-05 19:44:00 +02:00 · 04403b7603
commit 04403b7603
parent 848456f6c6
4 changed files with 6416 additions and 30 deletions
--- a/4_predictive_models.ipynb
+++ b/4_predictive_models.ipynb
--- a/1
+++ b/1
@ -17,6 +17,7 @@ seaborn = "*"
 missingno = "*"
 sklearn = "*"
 statsmodels = "*"
 tqdm = "*"
 [dev-packages]
 black = "*"
--- a/Pipfile.lock
+++ b/Pipfile.lock
@ -1,7 +1,7 @@
 {
    "_meta": {
        "hash": {
-            "sha256": "9179f1a05ed984d241ed8fcfa9c66c135100dbc2137f22627ac15977ced9ce87"
+            "sha256": "cd46097e2ebd23accf453936aef1b898dfbf46ff9f2642f494da0e041b7a5992"
        },
        "pipfile-spec": 6,
        "requires": {
@ -624,6 +624,14 @@
            "markers": "python_version != '3.3.*' and python_version >= '2.7' and python_version != '3.1.*' and python_version != '3.0.*' and python_version != '3.2.*'",
            "version": "==5.1"
        },
        "tqdm": {
            "hashes": [
                "sha256:5ef526702c0d265d5a960a3b27f3971fac13c26cf0fb819294bfa71fc6026c88",
                "sha256:a3364bd83ce4777320b862e3c8a93d7da91e20a95f06ef79bed7dd71c654cafa"
            ],
            "index": "pypi",
            "version": "==4.25.0"
        },
        "traitlets": {
            "hashes": [
                "sha256:9c4bd2d267b7153df9152698efb1050a5d84982d3384a37b2c1f7723ba3e7835",
--- a/utils.py
+++ b/utils.py
@ -24,10 +24,10 @@ Implementation Note:
    This file defines the "constants" it exports dynamically. This is a bit
    advanced but intentional!
 """
 # pragma pylint:disable=global-statement
 import re
 import numpy as np
 import pandas as pd
 import requests
 import tabulate
@ -140,19 +140,6 @@ def _populate_dicts_and_lists():
    dictionaries and lists are considered derived from it and thus considered
    "secondary".
    """
    global ALL_VARIABLES
    global CONTINUOUS_COLUMNS
    global CONTINUOUS_VARIABLES
    global DISCRETE_COLUMNS
    global DISCRETE_VARIABLES
    global NUMERIC_COLUMNS
    global NUMERIC_VARIABLES
    global NOMINAL_COLUMNS
    global NOMINAL_VARIABLES
    global ORDINAL_COLUMNS
    global ORDINAL_VARIABLES
    global LABEL_COLUMNS
    global LABEL_VARIABLES
    # The global data structures are not re-assigned to so as to keep all
    # references in the Jupyter notebooks alive. Instead, they are emptied
    # and re-filled.
@ -175,6 +162,11 @@ def _populate_dicts_and_lists():
        }
    )
    DISCRETE_VARIABLES[:] = sorted(DISCRETE_COLUMNS)
    FACTOR_VARIABLES[:] = [
        key
        for (key, value) in ALL_COLUMNS.items()
        if value["type"] == "factor"
    ]
    NUMERIC_COLUMNS.clear()
    NUMERIC_COLUMNS.update({**CONTINUOUS_COLUMNS, **DISCRETE_COLUMNS})
    NUMERIC_VARIABLES[:] = sorted(NUMERIC_COLUMNS)
@ -203,7 +195,6 @@ def _populate_dicts_and_lists():
 def _rename_column(old_name, new_name):
    """Change the name of a column."""
    global ALL_COLUMNS
    ALL_COLUMNS[new_name] = ALL_COLUMNS[old_name]
    del ALL_COLUMNS[old_name]
@ -249,7 +240,6 @@ def update_column_descriptions(columns_to_be_kept, *, correct_columns=False):
    After dropping some columns from the DataFrame, these removals must be
    propagated to the helper data structures defined in this module.
    """
    global ALL_COLUMNS
    if correct_columns:
        correct_column_names(columns_to_be_kept, repopulate=False)
    columns_to_be_removed = list(set(ALL_COLUMNS) - set(columns_to_be_kept))
@ -279,17 +269,15 @@ def print_column_list(subset=None):
    print(tabulate.tabulate(sorted(columns), tablefmt="plain"))
-def load_clean_data(*, path=None, ordinal_encoded=False):
+def load_clean_data(path=None):
    """Return the clean project data as a pandas DataFrame.
    This utility function ensures that each column is cast to its correct type.
-    It takes the following optional keyword-only arguments:
+    It takes an optional path argument to a clean CSV file (defaults to
-    - 'path': path to the clean CSV file (defaults to "data/data_clean.csv")
+    "data/data_clean.csv").
    - 'ordinal_encoded': can be set to True to obtain the ordinal columns that
        are already encoded into ordered integers
-    The target variable "SalePrice" is always included as the last column.
+    The target variables are always included as the last columns.
    Implementation Notes:
@ -331,22 +319,30 @@ def load_clean_data(*, path=None, ordinal_encoded=False):
    derived_columns = set(df.columns) - set(ALL_VARIABLES + TARGET_VARIABLES)
    if derived_columns:
        for column in derived_columns:
            # All derived variables are numeric (including factors).
            df[column] = df[column].astype(float)
            # Check if the derived variable is a target variable.
            for target in TARGET_VARIABLES[:]:
                if column.startswith(target):
                    df[column] = df[column].astype(float)
                    TARGET_VARIABLES.append(column)
                    break
            else:
                df[column] = df[column].astype(float)
                is_int = (df[column] == df[column].astype(int)).all()
                n_unique = len(df[column].unique())
                if is_int & (n_unique == 2):
                    df[column] = df[column].astype(int)
                    type_ = "factor"
                elif is_int & (n_unique < 150):
                    df[column] = df[column].astype(int)
                    type_ = "discrete"
                else:
                    df[column] = df[column].astype(float)
                    type_ = "continuous"
                ALL_COLUMNS[column] = {
-                    "type": "continuous",
+                    "type": type_,
                    "description": "derived variable",
                }
        _populate_dicts_and_lists()
    # Use integer encoding for ordinal variables.
    if ordinal_encoded:
        df = encode_ordinals(df)
    return df
@ -356,10 +352,24 @@ def encode_ordinals(df):
    df = df.copy()
    for column in df.columns:
        if column in ORDINAL_VARIABLES:
-            df[column] = df[column].cat.codes
+            df[column] = df[column].cat.codes.astype(int)
    return df
 def bias_score(y_true, y_pred):
    """Determine the bias of a prediction."""
    assert y_true.shape == y_pred.shape
    assert y_true.ndim == 1
    return np.mean(y_pred - y_true)
 def max_deviation(y_true, y_pred):
    """Determine the maximum deviation of a prediction."""
    assert y_true.shape == y_pred.shape
    assert y_true.ndim == 1
    return np.max(np.abs(y_pred - y_true))
 # This code is executed once during import time and
 # populates all the "constants" directly or indirectly.
 _extract_meta_data(_get_lines())