Analyse the nominal predictors and add factor variables

Alexander Hess 2018-09-05 00:48:12 +02:00
parent 387a495a80
commit 9c684bc866
6 changed files with 7946 additions and 40 deletions

File diff suppressed because one or more lines are too long

Pipfile

@@ -16,6 +16,7 @@ matplotlib = "*"
 seaborn = "*"
 missingno = "*"
 sklearn = "*"
+statsmodels = "*"
 
 [dev-packages]
 black = "*"
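The Pipfile gains only the statsmodels dependency; patsy shows up in the lock file below because statsmodels pulls it in. As a rough sketch of what the new dependency is for (illustrative only, assuming the factor columns listed further below have been merged into the clean data set; this is not the notebook's actual analysis):

import pandas as pd
import statsmodels.api as sm

# Assumption: "data/data_clean.csv" already contains the derived 0/1
# factor columns such as "has Garage" and "air_cond".
df = pd.read_csv("data/data_clean.csv")
X = sm.add_constant(df[["has Garage", "has Fireplace", "air_cond"]].astype(float))
model = sm.OLS(df["SalePrice"].astype(float), X).fit()
print(model.summary())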

Pipfile.lock generated

@@ -1,7 +1,7 @@
 {
     "_meta": {
         "hash": {
-            "sha256": "36a1b0a9371a1a4fdd5fc9035120f57e601b4d9f2fd4ddd279b14e95c2063498"
+            "sha256": "9179f1a05ed984d241ed8fcfa9c66c135100dbc2137f22627ac15977ced9ce87"
         },
         "pipfile-spec": 6,
         "requires": {
@@ -335,6 +335,13 @@
         ],
         "version": "==0.3.1"
     },
+    "patsy": {
+        "hashes": [
+            "sha256:14269536ecedaae0a5a2f300faac7d0afa1cc47aa98c561f54ba7300d0ec4011",
+            "sha256:e05f38d5c38c8d216f0cc2b765b1069b433c92d628b954fb2fee68d13e42883b"
+        ],
+        "version": "==0.5.0"
+    },
     "pexpect": {
         "hashes": [
             "sha256:2a8e88259839571d1251d278476f3eec5db26deb73a70be5ed5dc5435e418aba",
@@ -555,6 +562,34 @@
         "index": "pypi",
         "version": "==0.0"
     },
+    "statsmodels": {
+        "hashes": [
+            "sha256:0fd6af8db18b776c81c8fba54de20e9ec2f11b9310871b6b666d8805e3cf5ece",
+            "sha256:18844bbd95fcf62885d195571334762533ae16de182e1032ccc1595a98ffffb4",
+            "sha256:27e87cc6cd390fce8f44df225dadf589e1df6272f36b267ccdece2a9c4f52938",
+            "sha256:2902f5eef49fc38c112ffd8168dd76f7ae27f6cb5aa735cf55bc887b49aaec6e",
+            "sha256:31c2e26436a992e66355c0b3ef4b7c9714a0aa8375952d24f0593ac7c417b1e9",
+            "sha256:5d91ad30b8e20a45f583077ffeb4352be01955033f3dcd09bc06c30be1d29e8f",
+            "sha256:5de3d525b9a8679cd6c0f7f7c8cb8508275ab86cc3c1a140b2dc6b6390adb943",
+            "sha256:6461f93a842c649922c2c9a9bc9d9c4834110b89de8c4af196a791ab8f42ba3b",
+            "sha256:78d1b40c18d41f6c683c1c184be146264a782d409a89d8ed6c78acd1e1c11659",
+            "sha256:7c1a7cf557139f4bcbf97172268a8001156e42a7eeccca04d15c0cb7c3491ada",
+            "sha256:8532885c5778f94dae7ad83c4ac3f6916d4c8eb294f47ecefe2f0d3b967e6a16",
+            "sha256:95d35b33a301ded560662c733780ce58b37e218d122bb1b9c14e216aa9d42a2a",
+            "sha256:b48e283ba171698dca3989c0c03e6f25d3f431640383d926235d26ce48f3891c",
+            "sha256:b4b4b25c0e4228b1d33098894c3b29f4546e45afb29b333582cbaa5e16f38f3c",
+            "sha256:c06fd4af98f4c7ab61c9a79fd051ad4d7247991a691c3b4883c611029bac30a2",
+            "sha256:d2003c70c854f35a6446a465c61c994486039feb2fd47345a1e9984e95d55878",
+            "sha256:d7182803cdb09f1f17a335c0eae71d84905da9b0bc35c3d2c2379745f33096d9",
+            "sha256:d9b85bd98e90a02f2192084a85c857465e40e508629ac922242dba70731d0449",
+            "sha256:e2d9fd696e2d1523386d0f64f115352acbfaf59d5ca4c681c23ea064393a2ac4",
+            "sha256:ede078fdc9af857ed454d1e9e51831b2d577255c794d4044ecc332d40f3e3b36",
+            "sha256:f512afa7bc10b848aaacab5dfff6f61255142dd3a5581f82980c12745b0b6cd3",
+            "sha256:fbf789cc6d3fadca4350fa87e5f710ad2628e1fdff71bf8f853ecd49599ebe23"
+        ],
+        "index": "pypi",
+        "version": "==0.9.0"
+    },
     "tabulate": {
         "hashes": [
             "sha256:e4ca13f26d0a6be2a2915428dc21e732f1e44dad7f76d7030b2ef1ec251cf7f2"

File diff suppressed because it is too large


@@ -0,0 +1 @@
+["has 2nd Flr", "has Fireplace", "has Garage", "nhood_Blmngtn", "nhood_Blueste", "nhood_BrDale", "nhood_BrkSide", "nhood_ClearCr", "nhood_CollgCr", "nhood_Crawfor", "nhood_Edwards", "nhood_Gilbert", "nhood_Greens", "nhood_GrnHill", "nhood_IDOTRR", "nhood_Landmrk", "nhood_MeadowV", "nhood_Mitchel", "nhood_Names", "nhood_NoRidge", "nhood_NPkVill", "nhood_NridgHt", "nhood_NWAmes", "nhood_OldTown", "nhood_SWISU", "nhood_Sawyer", "nhood_SawyerW", "nhood_Somerst", "nhood_StoneBr", "nhood_Timber", "nhood_Veenker", "build_type_1Fam", "build_type_2Fam", "build_type_Twnhs", "air_cond", "major_street", "new_home", "remodeled", "years_since_built", "years_since_remodeled", "recently_built", "recently_remodeled"]


@@ -33,6 +33,7 @@ import requests
 import tabulate
 
 
+FACTOR_VARIABLES = []
 INDEX_COLUMNS = ["Order", "PID"]
 LABEL_TYPES = ["nominal", "ordinal"]
 TARGET_VARIABLES = ["SalePrice"]
@@ -213,40 +214,33 @@ def correct_column_names(data_columns, *, repopulate=True):
     In rare cases, the variable name in the data description file was slightly
     changed, i.e., a dash or a space needs to be removed.
 
-    This function adjusts the keys in all the dictionaries and lists and
-    returns a dictionary summarizing the name changes.
+    This function adjusts the keys in all the dictionaries and lists.
     """
-    renamed = {}
     for desc_column in ALL_VARIABLES:
         if desc_column not in data_columns:
             for data_column in data_columns:
                 # Column name was truncated in description file.
                 if data_column.startswith(desc_column):
                     _rename_column(desc_column, data_column)
-                    renamed[desc_column] = data_column
                     break
                 # Spaces between words in Excel were removed.
                 adj_data_column = data_column.replace(" ", "")
                 if adj_data_column == desc_column:
                     _rename_column(desc_column, data_column)
-                    renamed[desc_column] = data_column
                     break
                 # Spaces between words in description file were removed.
                 adj_desc_column = desc_column.replace(" ", "")
                 if adj_data_column == adj_desc_column:
                     _rename_column(desc_column, data_column)
-                    renamed[desc_column] = data_column
                     break
                 # Dashes in description file were removed.
                 adj_desc_column = desc_column.replace("-", "")
                 if data_column == adj_desc_column:
                     _rename_column(desc_column, data_column)
-                    renamed[desc_column] = data_column
                     break
     # Propagate the change to all "secondary" dictionaries and lists.
     if repopulate:
         _populate_dicts_and_lists()
-    return renamed
 
 
 def update_column_descriptions(columns_to_be_kept, *, correct_columns=False):
@@ -254,20 +248,15 @@ def update_column_descriptions(columns_to_be_kept, *, correct_columns=False):
     After dropping some columns from the DataFrame, these removals must be
     propagated to the helper data structures defined in this module.
-
-    Returns a dictionary of all the columns with changed names.
     """
     global ALL_COLUMNS
     if correct_columns:
-        renamed = correct_column_names(columns_to_be_kept, repopulate=False)
-    else:
-        renamed = {}
+        correct_column_names(columns_to_be_kept, repopulate=False)
     columns_to_be_removed = list(set(ALL_COLUMNS) - set(columns_to_be_kept))
     for column in columns_to_be_removed:
         del ALL_COLUMNS[column]
     # Propagate the change to all "secondary" dictionaries and lists.
     _populate_dicts_and_lists()
-    return renamed
 
 
 def print_column_list(subset=None):
@@ -285,19 +274,20 @@ def print_column_list(subset=None):
         subset -= without_desc
     columns = [(c, ALL_COLUMNS[c]["description"]) for c in subset]
     if without_desc:
-        for c in sorted(without_desc):
-            columns.append((c, ''))
+        for column in sorted(without_desc):
+            columns.append((column, ""))
     print(tabulate.tabulate(sorted(columns), tablefmt="plain"))
 
 
-def load_clean_data(subset=None, ordinal_encoded=False):
+def load_clean_data(*, path=None, ordinal_encoded=False):
     """Return the clean project data as a pandas DataFrame.
 
     This utility function ensures that each column is cast to its correct type.
 
-    It takes as an optional 'subset' argument a list of columns and
-    'ordinal_encoded' can be set to True to obtain the ordinal columns already
-    encoded into ordered integers.
+    It takes the following optional keyword-only arguments:
+    - 'path': path to the clean CSV file (defaults to "data/data_clean.csv")
+    - 'ordinal_encoded': can be set to True to obtain the ordinal columns that
+      are already encoded into ordered integers
 
     The target variable "SalePrice" is always included as the last column.
@@ -310,22 +300,17 @@ def load_clean_data(subset=None, ordinal_encoded=False):
 
     Another difficulty is that some values, e.g., "NA" strings are cast as
     np.NaN / None by pandas although they represent actual label values.
-
-    As column names come in slightly different form compared to the data
-    description file, the subsetting can only be done after loading the CSV
-    and some work needs to be put in to figure out if a column mentioned in the
-    subset was renamed.
     """
     # pragma pylint:disable=invalid-name
     df = pd.read_csv(
-        "data/data_clean.csv",
+        "data/data_clean.csv" if path is None else path,
         index_col=INDEX_COLUMNS,
         dtype=object,
         na_values="",  # There are no missing values in the clean data file.
         keep_default_na=False,  # "NA" strings are casted as actual values.
     )
     # Remove columns that are in the description but not in the data file.
-    renamed = update_column_descriptions(df.columns, correct_columns=True)
+    update_column_descriptions(df.columns, correct_columns=True)
     # Cast the numeric types correctly.
     for column in CONTINUOUS_VARIABLES + TARGET_VARIABLES:
         df[column] = df[column].astype(float)
@@ -342,18 +327,23 @@ def load_clean_data(subset=None, ordinal_encoded=False):
             reversed(mapping["order"]), ordered=True
         )
         df[column] = df[column].astype(labels)
-    # Mirror the renaming and dropping of columns
-    # for the provided list of columns.
-    # Note that the target variable goes last.
-    if subset is not None:
-        subset = set(subset)
-        subset.discard("SalePrice")
-        for old_name, new_name in renamed.items():
-            if old_name in subset:
-                subset.remove(old_name)
-                subset.add(new_name)
-        subset = sorted(set(df.columns) & subset)
-        df = df[subset + TARGET_VARIABLES]
+    # After the raw data cleaning, several derived variables were created.
+    derived_columns = set(df.columns) - set(ALL_VARIABLES + TARGET_VARIABLES)
+    if derived_columns:
+        for column in derived_columns:
+            # All derived variables are numeric (including factors).
+            df[column] = df[column].astype(float)
+            # Check if the derived variable is a target variable.
+            for target in TARGET_VARIABLES[:]:
+                if column.startswith(target):
+                    TARGET_VARIABLES.append(column)
+                    break
+            else:
+                ALL_COLUMNS[column] = {
+                    "type": "continuous",
+                    "description": "derived variable",
+                }
+        _populate_dicts_and_lists()
     # Use integer encoding for ordinal variables.
     if ordinal_encoded:
         df = encode_ordinals(df)
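Taken together, these module changes drop the old 'subset' mechanism from load_clean_data: every column in the CSV that is not a known variable is now treated as a derived variable, cast to float, and either appended to TARGET_VARIABLES (if its name starts with a target's name) or registered as a continuous column. A hypothetical usage sketch; the module name "utils" is an assumption, as the file name is not shown in this diff:

from utils import load_clean_data  # module name is an assumption

# Load everything; any column subsetting now happens at the call site.
df = load_clean_data(ordinal_encoded=True)
# Derived factor columns arrive as floats alongside the original variables.
print(df[["has Garage", "nhood_OldTown", "SalePrice"]].head())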