Analyse the nominal predictors and add factor variables
parent 387a495a80
commit 9c684bc866

6 changed files with 7946 additions and 40 deletions
3_descriptive_visualizations.ipynb  (new file, +4995)
  File diff suppressed because one or more lines are too long.
Pipfile  (+1)

@@ -16,6 +16,7 @@ matplotlib = "*"
 seaborn = "*"
 missingno = "*"
 sklearn = "*"
+statsmodels = "*"
 
 [dev-packages]
 black = "*"
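The new statsmodels dependency (together with the patsy entry it pulls into
Pipfile.lock below) matches the commit's goal of analysing nominal
predictors. A minimal sketch of that kind of use, on made-up data rather
than code from this commit:

    import pandas as pd
    import statsmodels.formula.api as smf

    # Toy stand-in for the project data.
    toy = pd.DataFrame({
        "SalePrice": [208500.0, 181500.0, 223500.0, 140000.0, 250000.0, 143000.0],
        "nhood": ["CollgCr", "Veenker", "CollgCr", "Crawfor", "Veenker", "Crawfor"],
    })
    # C(...) makes patsy dummy-encode the nominal predictor before the OLS fit.
    model = smf.ols("SalePrice ~ C(nhood)", data=toy).fit()
    print(model.params)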
Pipfile.lock  (generated, 37 changed lines)

@@ -1,7 +1,7 @@
 {
     "_meta": {
         "hash": {
-            "sha256": "36a1b0a9371a1a4fdd5fc9035120f57e601b4d9f2fd4ddd279b14e95c2063498"
+            "sha256": "9179f1a05ed984d241ed8fcfa9c66c135100dbc2137f22627ac15977ced9ce87"
         },
         "pipfile-spec": 6,
         "requires": {

@@ -335,6 +335,13 @@
             ],
             "version": "==0.3.1"
         },
+        "patsy": {
+            "hashes": [
+                "sha256:14269536ecedaae0a5a2f300faac7d0afa1cc47aa98c561f54ba7300d0ec4011",
+                "sha256:e05f38d5c38c8d216f0cc2b765b1069b433c92d628b954fb2fee68d13e42883b"
+            ],
+            "version": "==0.5.0"
+        },
         "pexpect": {
             "hashes": [
                 "sha256:2a8e88259839571d1251d278476f3eec5db26deb73a70be5ed5dc5435e418aba",

@@ -555,6 +562,34 @@
             "index": "pypi",
             "version": "==0.0"
         },
+        "statsmodels": {
+            "hashes": [
+                "sha256:0fd6af8db18b776c81c8fba54de20e9ec2f11b9310871b6b666d8805e3cf5ece",
+                "sha256:18844bbd95fcf62885d195571334762533ae16de182e1032ccc1595a98ffffb4",
+                "sha256:27e87cc6cd390fce8f44df225dadf589e1df6272f36b267ccdece2a9c4f52938",
+                "sha256:2902f5eef49fc38c112ffd8168dd76f7ae27f6cb5aa735cf55bc887b49aaec6e",
+                "sha256:31c2e26436a992e66355c0b3ef4b7c9714a0aa8375952d24f0593ac7c417b1e9",
+                "sha256:5d91ad30b8e20a45f583077ffeb4352be01955033f3dcd09bc06c30be1d29e8f",
+                "sha256:5de3d525b9a8679cd6c0f7f7c8cb8508275ab86cc3c1a140b2dc6b6390adb943",
+                "sha256:6461f93a842c649922c2c9a9bc9d9c4834110b89de8c4af196a791ab8f42ba3b",
+                "sha256:78d1b40c18d41f6c683c1c184be146264a782d409a89d8ed6c78acd1e1c11659",
+                "sha256:7c1a7cf557139f4bcbf97172268a8001156e42a7eeccca04d15c0cb7c3491ada",
+                "sha256:8532885c5778f94dae7ad83c4ac3f6916d4c8eb294f47ecefe2f0d3b967e6a16",
+                "sha256:95d35b33a301ded560662c733780ce58b37e218d122bb1b9c14e216aa9d42a2a",
+                "sha256:b48e283ba171698dca3989c0c03e6f25d3f431640383d926235d26ce48f3891c",
+                "sha256:b4b4b25c0e4228b1d33098894c3b29f4546e45afb29b333582cbaa5e16f38f3c",
+                "sha256:c06fd4af98f4c7ab61c9a79fd051ad4d7247991a691c3b4883c611029bac30a2",
+                "sha256:d2003c70c854f35a6446a465c61c994486039feb2fd47345a1e9984e95d55878",
+                "sha256:d7182803cdb09f1f17a335c0eae71d84905da9b0bc35c3d2c2379745f33096d9",
+                "sha256:d9b85bd98e90a02f2192084a85c857465e40e508629ac922242dba70731d0449",
+                "sha256:e2d9fd696e2d1523386d0f64f115352acbfaf59d5ca4c681c23ea064393a2ac4",
+                "sha256:ede078fdc9af857ed454d1e9e51831b2d577255c794d4044ecc332d40f3e3b36",
+                "sha256:f512afa7bc10b848aaacab5dfff6f61255142dd3a5581f82980c12745b0b6cd3",
+                "sha256:fbf789cc6d3fadca4350fa87e5f710ad2628e1fdff71bf8f853ecd49599ebe23"
+            ],
+            "index": "pypi",
+            "version": "==0.9.0"
+        },
         "tabulate": {
             "hashes": [
                 "sha256:e4ca13f26d0a6be2a2915428dc21e732f1e44dad7f76d7030b2ef1ec251cf7f2"
data/data_clean_with_transformations_and_factors.csv  (new file, +2884)
  File diff suppressed because it is too large.
data/interesting_variables.json  (new file, +1)

@@ -0,0 +1 @@
+["has 2nd Flr", "has Fireplace", "has Garage", "nhood_Blmngtn", "nhood_Blueste", "nhood_BrDale", "nhood_BrkSide", "nhood_ClearCr", "nhood_CollgCr", "nhood_Crawfor", "nhood_Edwards", "nhood_Gilbert", "nhood_Greens", "nhood_GrnHill", "nhood_IDOTRR", "nhood_Landmrk", "nhood_MeadowV", "nhood_Mitchel", "nhood_Names", "nhood_NoRidge", "nhood_NPkVill", "nhood_NridgHt", "nhood_NWAmes", "nhood_OldTown", "nhood_SWISU", "nhood_Sawyer", "nhood_SawyerW", "nhood_Somerst", "nhood_StoneBr", "nhood_Timber", "nhood_Veenker", "build_type_1Fam", "build_type_2Fam", "build_type_Twnhs", "air_cond", "major_street", "new_home", "remodeled", "years_since_built", "years_since_remodeled", "recently_built", "recently_remodeled"]
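The file is a flat JSON list of the factor and derived variable names. A
hypothetical pairing with the new CSV (illustrative only; it assumes every
listed name is a column of that file):

    import json

    import pandas as pd

    with open("data/interesting_variables.json") as fp:
        interesting = json.load(fp)

    df = pd.read_csv(
        "data/data_clean_with_transformations_and_factors.csv",
        index_col=["Order", "PID"],  # same as INDEX_COLUMNS in utils.py
    )
    # Keep the flagged variables plus the target.
    print(df[interesting + ["SalePrice"]].shape)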
utils.py  (68 changed lines)
@@ -33,6 +33,7 @@ import requests
 import tabulate
 
 
+FACTOR_VARIABLES = []
 INDEX_COLUMNS = ["Order", "PID"]
 LABEL_TYPES = ["nominal", "ordinal"]
 TARGET_VARIABLES = ["SalePrice"]
@@ -213,40 +214,33 @@ def correct_column_names(data_columns, *, repopulate=True):
     In rare cases, the variable name in the data description file was slightly
     changed, i.e., a dash or a space needs to be removed.
 
-    This function adjusts the keys in all the dictionaries and lists and
-    returns a dictionary summarizing the name changes.
+    This function adjusts the keys in all the dictionaries and lists.
     """
-    renamed = {}
     for desc_column in ALL_VARIABLES:
         if desc_column not in data_columns:
             for data_column in data_columns:
                 # Column name was truncated in description file.
                 if data_column.startswith(desc_column):
                     _rename_column(desc_column, data_column)
-                    renamed[desc_column] = data_column
                     break
                 # Spaces between words in Excel were removed.
                 adj_data_column = data_column.replace(" ", "")
                 if adj_data_column == desc_column:
                     _rename_column(desc_column, data_column)
-                    renamed[desc_column] = data_column
                     break
                 # Spaces between words in description file were removed.
                 adj_desc_column = desc_column.replace(" ", "")
                 if adj_data_column == adj_desc_column:
                     _rename_column(desc_column, data_column)
-                    renamed[desc_column] = data_column
                     break
                 # Dashes in description file were removed.
                 adj_desc_column = desc_column.replace("-", "")
                 if data_column == adj_desc_column:
                     _rename_column(desc_column, data_column)
-                    renamed[desc_column] = data_column
                     break
     # Propagate the change to all "secondary" dictionaries and lists.
     if repopulate:
         _populate_dicts_and_lists()
-    return renamed
 
 
 def update_column_descriptions(columns_to_be_kept, *, correct_columns=False):
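For intuition, the four matching rules in the loop above cover mismatches
like these (the names here are made-up illustrations, not checked against
the data set):

    # Truncated in the description file: the data column extends the name.
    assert "Kitchen Qual".startswith("Kitchen")
    # Spaces between words were dropped on the Excel side.
    assert "SaleCondition" == "Sale Condition".replace(" ", "")
    # A dash in the description file does not appear in the data column.
    assert "SaleCondition" == "Sale-Condition".replace("-", "")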
@@ -254,20 +248,15 @@ def update_column_descriptions(columns_to_be_kept, *, correct_columns=False):
 
     After dropping some columns from the DataFrame, these removals must be
     propagated to the helper data structures defined in this module.
-
-    Returns a dictionary of all the columns with changed names.
     """
     global ALL_COLUMNS
     if correct_columns:
-        renamed = correct_column_names(columns_to_be_kept, repopulate=False)
-    else:
-        renamed = {}
+        correct_column_names(columns_to_be_kept, repopulate=False)
     columns_to_be_removed = list(set(ALL_COLUMNS) - set(columns_to_be_kept))
     for column in columns_to_be_removed:
         del ALL_COLUMNS[column]
     # Propagate the change to all "secondary" dictionaries and lists.
     _populate_dicts_and_lists()
-    return renamed
 
 
 def print_column_list(subset=None):
@@ -285,19 +274,20 @@
         subset -= without_desc
     columns = [(c, ALL_COLUMNS[c]["description"]) for c in subset]
     if without_desc:
-        for c in sorted(without_desc):
-            columns.append((c, ''))
+        for column in sorted(without_desc):
+            columns.append((column, ""))
     print(tabulate.tabulate(sorted(columns), tablefmt="plain"))
 
 
-def load_clean_data(subset=None, ordinal_encoded=False):
+def load_clean_data(*, path=None, ordinal_encoded=False):
     """Return the clean project data as a pandas DataFrame.
 
     This utility function ensures that each column is cast to its correct type.
 
-    It takes as an optional 'subset' argument a list of columns and
-    'ordinal_encoded' can be set to True to obtain the ordinal columns already
-    encoded into ordered integers.
+    It takes the following optional keyword-only arguments:
+    - 'path': path to the clean CSV file (defaults to "data/data_clean.csv")
+    - 'ordinal_encoded': can be set to True to obtain the ordinal columns that
+      are already encoded into ordered integers
 
     The target variable "SalePrice" is always included as the last column.
 
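A usage sketch for the reworked signature (the calls are illustrative; the
second one points at the CSV added in this commit):

    from utils import load_clean_data

    # Reads "data/data_clean.csv" by default.
    df = load_clean_data()
    # Load the derived data set with ordinals encoded as ordered integers.
    df_factors = load_clean_data(
        path="data/data_clean_with_transformations_and_factors.csv",
        ordinal_encoded=True,
    )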
@@ -310,22 +300,17 @@ def load_clean_data(subset=None, ordinal_encoded=False):
 
     Another difficulty is that some values, e.g., "NA" strings are cast as
     np.NaN / None by pandas although they represent actual label values.
-
-    As column names come in slightly different form compared to the data
-    description file, the subsetting can only be done after loading the CSV
-    and some work needs to be put in to figure out if a column mentioned in the
-    subset was renamed.
     """
     # pragma pylint:disable=invalid-name
     df = pd.read_csv(
-        "data/data_clean.csv",
+        "data/data_clean.csv" if path is None else path,
         index_col=INDEX_COLUMNS,
         dtype=object,
         na_values="",  # There are no missing values in the clean data file.
         keep_default_na=False,  # "NA" strings are casted as actual values.
     )
     # Remove columns that are in the description but not in the data file.
-    renamed = update_column_descriptions(df.columns, correct_columns=True)
+    update_column_descriptions(df.columns, correct_columns=True)
     # Cast the numeric types correctly.
     for column in CONTINUOUS_VARIABLES + TARGET_VARIABLES:
         df[column] = df[column].astype(float)
@@ -342,18 +327,23 @@ def load_clean_data(subset=None, ordinal_encoded=False):
             reversed(mapping["order"]), ordered=True
         )
         df[column] = df[column].astype(labels)
-    # Mirror the renaming and dropping of columns
-    # for the provided list of columns.
-    # Note that the target variable goes last.
-    if subset is not None:
-        subset = set(subset)
-        subset.discard("SalePrice")
-        for old_name, new_name in renamed.items():
-            if old_name in subset:
-                subset.remove(old_name)
-                subset.add(new_name)
-        subset = sorted(set(df.columns) & subset)
-        df = df[subset + TARGET_VARIABLES]
+    # After the raw data cleaning, several derived variables were created.
+    derived_columns = set(df.columns) - set(ALL_VARIABLES + TARGET_VARIABLES)
+    if derived_columns:
+        for column in derived_columns:
+            # All derived variables are numeric (including factors).
+            df[column] = df[column].astype(float)
+            # Check if the derived variable is a target variable.
+            for target in TARGET_VARIABLES[:]:
+                if column.startswith(target):
+                    TARGET_VARIABLES.append(column)
+                    break
+            else:
+                ALL_COLUMNS[column] = {
+                    "type": "continuous",
+                    "description": "derived variable",
+                }
+        _populate_dicts_and_lists()
     # Use integer encoding for ordinal variables.
     if ordinal_encoded:
         df = encode_ordinals(df)
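The added branch leans on Python's for-else: the else block runs only when
the loop finishes without hitting break, i.e., when a derived column matches
no target variable and is registered as a generic continuous column instead.
The pattern in isolation:

    # for-else: the else clause fires only because break is never reached.
    for prefix in ["SalePrice"]:
        if "years_since_built".startswith(prefix):
            break
    else:
        print("no match -> treat as derived continuous column")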