Add a simple version of predictive models
This commit is contained in:
parent
848456f6c6
commit
04403b7603
4 changed files with 6416 additions and 30 deletions
6367
4_predictive_models.ipynb
Normal file
6367
4_predictive_models.ipynb
Normal file
File diff suppressed because it is too large
Load diff
1
Pipfile
1
Pipfile
|
@ -17,6 +17,7 @@ seaborn = "*"
|
||||||
missingno = "*"
|
missingno = "*"
|
||||||
sklearn = "*"
|
sklearn = "*"
|
||||||
statsmodels = "*"
|
statsmodels = "*"
|
||||||
|
tqdm = "*"
|
||||||
|
|
||||||
[dev-packages]
|
[dev-packages]
|
||||||
black = "*"
|
black = "*"
|
||||||
|
|
10
Pipfile.lock
generated
10
Pipfile.lock
generated
|
@ -1,7 +1,7 @@
|
||||||
{
|
{
|
||||||
"_meta": {
|
"_meta": {
|
||||||
"hash": {
|
"hash": {
|
||||||
"sha256": "9179f1a05ed984d241ed8fcfa9c66c135100dbc2137f22627ac15977ced9ce87"
|
"sha256": "cd46097e2ebd23accf453936aef1b898dfbf46ff9f2642f494da0e041b7a5992"
|
||||||
},
|
},
|
||||||
"pipfile-spec": 6,
|
"pipfile-spec": 6,
|
||||||
"requires": {
|
"requires": {
|
||||||
|
@ -624,6 +624,14 @@
|
||||||
"markers": "python_version != '3.3.*' and python_version >= '2.7' and python_version != '3.1.*' and python_version != '3.0.*' and python_version != '3.2.*'",
|
"markers": "python_version != '3.3.*' and python_version >= '2.7' and python_version != '3.1.*' and python_version != '3.0.*' and python_version != '3.2.*'",
|
||||||
"version": "==5.1"
|
"version": "==5.1"
|
||||||
},
|
},
|
||||||
|
"tqdm": {
|
||||||
|
"hashes": [
|
||||||
|
"sha256:5ef526702c0d265d5a960a3b27f3971fac13c26cf0fb819294bfa71fc6026c88",
|
||||||
|
"sha256:a3364bd83ce4777320b862e3c8a93d7da91e20a95f06ef79bed7dd71c654cafa"
|
||||||
|
],
|
||||||
|
"index": "pypi",
|
||||||
|
"version": "==4.25.0"
|
||||||
|
},
|
||||||
"traitlets": {
|
"traitlets": {
|
||||||
"hashes": [
|
"hashes": [
|
||||||
"sha256:9c4bd2d267b7153df9152698efb1050a5d84982d3384a37b2c1f7723ba3e7835",
|
"sha256:9c4bd2d267b7153df9152698efb1050a5d84982d3384a37b2c1f7723ba3e7835",
|
||||||
|
|
68
utils.py
68
utils.py
|
@ -24,10 +24,10 @@ Implementation Note:
|
||||||
This file defines the "constants" it exports dynamically. This is a bit
|
This file defines the "constants" it exports dynamically. This is a bit
|
||||||
advanced but intentional!
|
advanced but intentional!
|
||||||
"""
|
"""
|
||||||
# pragma pylint:disable=global-statement
|
|
||||||
|
|
||||||
import re
|
import re
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import requests
|
import requests
|
||||||
import tabulate
|
import tabulate
|
||||||
|
@ -140,19 +140,6 @@ def _populate_dicts_and_lists():
|
||||||
dictionaries and lists are considered derived from it and thus considered
|
dictionaries and lists are considered derived from it and thus considered
|
||||||
"secondary".
|
"secondary".
|
||||||
"""
|
"""
|
||||||
global ALL_VARIABLES
|
|
||||||
global CONTINUOUS_COLUMNS
|
|
||||||
global CONTINUOUS_VARIABLES
|
|
||||||
global DISCRETE_COLUMNS
|
|
||||||
global DISCRETE_VARIABLES
|
|
||||||
global NUMERIC_COLUMNS
|
|
||||||
global NUMERIC_VARIABLES
|
|
||||||
global NOMINAL_COLUMNS
|
|
||||||
global NOMINAL_VARIABLES
|
|
||||||
global ORDINAL_COLUMNS
|
|
||||||
global ORDINAL_VARIABLES
|
|
||||||
global LABEL_COLUMNS
|
|
||||||
global LABEL_VARIABLES
|
|
||||||
# The global data structures are not re-assigned to so as to keep all
|
# The global data structures are not re-assigned to so as to keep all
|
||||||
# references in the Jupyter notebooks alive. Instead, they are emptied
|
# references in the Jupyter notebooks alive. Instead, they are emptied
|
||||||
# and re-filled.
|
# and re-filled.
|
||||||
|
@ -175,6 +162,11 @@ def _populate_dicts_and_lists():
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
DISCRETE_VARIABLES[:] = sorted(DISCRETE_COLUMNS)
|
DISCRETE_VARIABLES[:] = sorted(DISCRETE_COLUMNS)
|
||||||
|
FACTOR_VARIABLES[:] = [
|
||||||
|
key
|
||||||
|
for (key, value) in ALL_COLUMNS.items()
|
||||||
|
if value["type"] == "factor"
|
||||||
|
]
|
||||||
NUMERIC_COLUMNS.clear()
|
NUMERIC_COLUMNS.clear()
|
||||||
NUMERIC_COLUMNS.update({**CONTINUOUS_COLUMNS, **DISCRETE_COLUMNS})
|
NUMERIC_COLUMNS.update({**CONTINUOUS_COLUMNS, **DISCRETE_COLUMNS})
|
||||||
NUMERIC_VARIABLES[:] = sorted(NUMERIC_COLUMNS)
|
NUMERIC_VARIABLES[:] = sorted(NUMERIC_COLUMNS)
|
||||||
|
@ -203,7 +195,6 @@ def _populate_dicts_and_lists():
|
||||||
|
|
||||||
def _rename_column(old_name, new_name):
    """Change the name of a column.

    The entry stored under old_name in ALL_COLUMNS is moved to new_name.
    ALL_COLUMNS is mutated in place (never re-assigned), so no global
    statement is needed.

    Raises:
        KeyError: if old_name is not a key of ALL_COLUMNS.
    """
    # Remove first, then insert: assigning first and deleting afterwards
    # would silently drop the entry whenever old_name == new_name.
    ALL_COLUMNS[new_name] = ALL_COLUMNS.pop(old_name)
|
||||||
|
|
||||||
|
@ -249,7 +240,6 @@ def update_column_descriptions(columns_to_be_kept, *, correct_columns=False):
|
||||||
After dropping some columns from the DataFrame, these removals must be
|
After dropping some columns from the DataFrame, these removals must be
|
||||||
propagated to the helper data structures defined in this module.
|
propagated to the helper data structures defined in this module.
|
||||||
"""
|
"""
|
||||||
global ALL_COLUMNS
|
|
||||||
if correct_columns:
|
if correct_columns:
|
||||||
correct_column_names(columns_to_be_kept, repopulate=False)
|
correct_column_names(columns_to_be_kept, repopulate=False)
|
||||||
columns_to_be_removed = list(set(ALL_COLUMNS) - set(columns_to_be_kept))
|
columns_to_be_removed = list(set(ALL_COLUMNS) - set(columns_to_be_kept))
|
||||||
|
@ -279,17 +269,15 @@ def print_column_list(subset=None):
|
||||||
print(tabulate.tabulate(sorted(columns), tablefmt="plain"))
|
print(tabulate.tabulate(sorted(columns), tablefmt="plain"))
|
||||||
|
|
||||||
|
|
||||||
def load_clean_data(*, path=None, ordinal_encoded=False):
|
def load_clean_data(path=None):
|
||||||
"""Return the clean project data as a pandas DataFrame.
|
"""Return the clean project data as a pandas DataFrame.
|
||||||
|
|
||||||
This utility function ensures that each column is cast to its correct type.
|
This utility function ensures that each column is cast to its correct type.
|
||||||
|
|
||||||
It takes the following optional keyword-only arguments:
|
It takes an optional path argument to a clean CSV file (defaults to
|
||||||
- 'path': path to the clean CSV file (defaults to "data/data_clean.csv")
|
"data/data_clean.csv").
|
||||||
- 'ordinal_encoded': can be set to True to obtain the ordinal columns that
|
|
||||||
are already encoded into ordered integers
|
|
||||||
|
|
||||||
The target variable "SalePrice" is always included as the last column.
|
The target variables are always included as the last columns.
|
||||||
|
|
||||||
Implementation Notes:
|
Implementation Notes:
|
||||||
|
|
||||||
|
@ -331,22 +319,30 @@ def load_clean_data(*, path=None, ordinal_encoded=False):
|
||||||
derived_columns = set(df.columns) - set(ALL_VARIABLES + TARGET_VARIABLES)
|
derived_columns = set(df.columns) - set(ALL_VARIABLES + TARGET_VARIABLES)
|
||||||
if derived_columns:
|
if derived_columns:
|
||||||
for column in derived_columns:
|
for column in derived_columns:
|
||||||
# All derived variables are numeric (including factors).
|
|
||||||
df[column] = df[column].astype(float)
|
|
||||||
# Check if the derived variable is a target variable.
|
# Check if the derived variable is a target variable.
|
||||||
for target in TARGET_VARIABLES[:]:
|
for target in TARGET_VARIABLES[:]:
|
||||||
if column.startswith(target):
|
if column.startswith(target):
|
||||||
|
df[column] = df[column].astype(float)
|
||||||
TARGET_VARIABLES.append(column)
|
TARGET_VARIABLES.append(column)
|
||||||
break
|
break
|
||||||
else:
|
else:
|
||||||
|
df[column] = df[column].astype(float)
|
||||||
|
is_int = (df[column] == df[column].astype(int)).all()
|
||||||
|
n_unique = len(df[column].unique())
|
||||||
|
if is_int & (n_unique == 2):
|
||||||
|
df[column] = df[column].astype(int)
|
||||||
|
type_ = "factor"
|
||||||
|
elif is_int & (n_unique < 150):
|
||||||
|
df[column] = df[column].astype(int)
|
||||||
|
type_ = "discrete"
|
||||||
|
else:
|
||||||
|
df[column] = df[column].astype(float)
|
||||||
|
type_ = "continuous"
|
||||||
ALL_COLUMNS[column] = {
|
ALL_COLUMNS[column] = {
|
||||||
"type": "continuous",
|
"type": type_,
|
||||||
"description": "derived variable",
|
"description": "derived variable",
|
||||||
}
|
}
|
||||||
_populate_dicts_and_lists()
|
_populate_dicts_and_lists()
|
||||||
# Use integer encoding for ordinal variables.
|
|
||||||
if ordinal_encoded:
|
|
||||||
df = encode_ordinals(df)
|
|
||||||
return df
|
return df
|
||||||
|
|
||||||
|
|
||||||
|
@ -356,10 +352,24 @@ def encode_ordinals(df):
|
||||||
df = df.copy()
|
df = df.copy()
|
||||||
for column in df.columns:
|
for column in df.columns:
|
||||||
if column in ORDINAL_VARIABLES:
|
if column in ORDINAL_VARIABLES:
|
||||||
df[column] = df[column].cat.codes
|
df[column] = df[column].cat.codes.astype(int)
|
||||||
return df
|
return df
|
||||||
|
|
||||||
|
|
||||||
|
def bias_score(y_true, y_pred):
    """Determine the bias of a prediction.

    The bias is the mean signed error, i.e. mean(y_pred - y_true). A
    positive value means the predictions are too high on average, a
    negative value that they are too low.

    Args:
        y_true: one-dimensional array-like with the observed values.
        y_pred: one-dimensional array-like with the predicted values;
            must have the same shape as y_true.

    Returns:
        The mean of the element-wise differences.

    Raises:
        ValueError: if the shapes differ or the inputs are not
            one-dimensional.
    """
    # Work on plain arrays so that lists are accepted and pandas objects
    # are compared by position instead of being re-aligned on their index.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    # Raise explicitly: assert statements are stripped when Python runs
    # with -O, which would silently disable the validation.
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"shape mismatch: y_true {y_true.shape} vs. y_pred {y_pred.shape}"
        )
    if y_true.ndim != 1:
        raise ValueError(
            f"expected one-dimensional inputs, got {y_true.ndim} dimensions"
        )
    return np.mean(y_pred - y_true)
|
||||||
|
|
||||||
|
|
||||||
|
def max_deviation(y_true, y_pred):
    """Determine the maximum deviation of a prediction.

    The maximum deviation is the largest absolute error, i.e.
    max(|y_pred - y_true|).

    Args:
        y_true: one-dimensional array-like with the observed values.
        y_pred: one-dimensional array-like with the predicted values;
            must have the same shape as y_true.

    Returns:
        The largest absolute element-wise difference.

    Raises:
        ValueError: if the shapes differ or the inputs are not
            one-dimensional.
    """
    # Work on plain arrays so that lists are accepted and pandas objects
    # are compared by position instead of being re-aligned on their index.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    # Raise explicitly: assert statements are stripped when Python runs
    # with -O, which would silently disable the validation.
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"shape mismatch: y_true {y_true.shape} vs. y_pred {y_pred.shape}"
        )
    if y_true.ndim != 1:
        raise ValueError(
            f"expected one-dimensional inputs, got {y_true.ndim} dimensions"
        )
    return np.max(np.abs(y_pred - y_true))
|
||||||
|
|
||||||
|
|
||||||
# This code is executed once during import time and
|
# This code is executed once during import time and
|
||||||
# populates all the "constants" directly or indirectly.
|
# populates all the "constants" directly or indirectly.
|
||||||
_extract_meta_data(_get_lines())
|
_extract_meta_data(_get_lines())
|
||||||
|
|
Loading…
Reference in a new issue