Analyse the nominal predictors and add factor variables
commit 9c684bc866
parent 387a495a80
6 changed files with 7946 additions and 40 deletions
3_descriptive_visualizations.ipynb (new file, 4995 changed lines)
File diff suppressed because one or more lines are too long
Pipfile (1 changed line)

@@ -16,6 +16,7 @@ matplotlib = "*"
 seaborn = "*"
 missingno = "*"
 sklearn = "*"
+statsmodels = "*"
 
 [dev-packages]
 black = "*"
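The newly added statsmodels package (together with its patsy dependency, pinned below in Pipfile.lock) is presumably what the notebook uses to analyse the nominal predictors. As an illustrative sketch only -- none of this exact code appears in the diff, and the "Neighborhood" column name is an assumption based on the Ames housing data -- a patsy formula can dummy-encode a nominal column inside an OLS fit:

import pandas as pd
import statsmodels.formula.api as smf

# Illustrative sketch; the real analysis lives in the suppressed notebook.
df = pd.read_csv("data/data_clean.csv", index_col=["Order", "PID"])

# C(...) tells patsy to expand the nominal predictor into dummy variables.
model = smf.ols("SalePrice ~ C(Neighborhood)", data=df).fit()
print(model.summary())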
Pipfile.lock (generated, 37 changed lines)

@@ -1,7 +1,7 @@
 {
     "_meta": {
         "hash": {
-            "sha256": "36a1b0a9371a1a4fdd5fc9035120f57e601b4d9f2fd4ddd279b14e95c2063498"
+            "sha256": "9179f1a05ed984d241ed8fcfa9c66c135100dbc2137f22627ac15977ced9ce87"
         },
         "pipfile-spec": 6,
         "requires": {
@@ -335,6 +335,13 @@
         ],
         "version": "==0.3.1"
     },
+    "patsy": {
+        "hashes": [
+            "sha256:14269536ecedaae0a5a2f300faac7d0afa1cc47aa98c561f54ba7300d0ec4011",
+            "sha256:e05f38d5c38c8d216f0cc2b765b1069b433c92d628b954fb2fee68d13e42883b"
+        ],
+        "version": "==0.5.0"
+    },
     "pexpect": {
         "hashes": [
             "sha256:2a8e88259839571d1251d278476f3eec5db26deb73a70be5ed5dc5435e418aba",
@@ -555,6 +562,34 @@
         "index": "pypi",
         "version": "==0.0"
     },
+    "statsmodels": {
+        "hashes": [
+            "sha256:0fd6af8db18b776c81c8fba54de20e9ec2f11b9310871b6b666d8805e3cf5ece",
+            "sha256:18844bbd95fcf62885d195571334762533ae16de182e1032ccc1595a98ffffb4",
+            "sha256:27e87cc6cd390fce8f44df225dadf589e1df6272f36b267ccdece2a9c4f52938",
+            "sha256:2902f5eef49fc38c112ffd8168dd76f7ae27f6cb5aa735cf55bc887b49aaec6e",
+            "sha256:31c2e26436a992e66355c0b3ef4b7c9714a0aa8375952d24f0593ac7c417b1e9",
+            "sha256:5d91ad30b8e20a45f583077ffeb4352be01955033f3dcd09bc06c30be1d29e8f",
+            "sha256:5de3d525b9a8679cd6c0f7f7c8cb8508275ab86cc3c1a140b2dc6b6390adb943",
+            "sha256:6461f93a842c649922c2c9a9bc9d9c4834110b89de8c4af196a791ab8f42ba3b",
+            "sha256:78d1b40c18d41f6c683c1c184be146264a782d409a89d8ed6c78acd1e1c11659",
+            "sha256:7c1a7cf557139f4bcbf97172268a8001156e42a7eeccca04d15c0cb7c3491ada",
+            "sha256:8532885c5778f94dae7ad83c4ac3f6916d4c8eb294f47ecefe2f0d3b967e6a16",
+            "sha256:95d35b33a301ded560662c733780ce58b37e218d122bb1b9c14e216aa9d42a2a",
+            "sha256:b48e283ba171698dca3989c0c03e6f25d3f431640383d926235d26ce48f3891c",
+            "sha256:b4b4b25c0e4228b1d33098894c3b29f4546e45afb29b333582cbaa5e16f38f3c",
+            "sha256:c06fd4af98f4c7ab61c9a79fd051ad4d7247991a691c3b4883c611029bac30a2",
+            "sha256:d2003c70c854f35a6446a465c61c994486039feb2fd47345a1e9984e95d55878",
+            "sha256:d7182803cdb09f1f17a335c0eae71d84905da9b0bc35c3d2c2379745f33096d9",
+            "sha256:d9b85bd98e90a02f2192084a85c857465e40e508629ac922242dba70731d0449",
+            "sha256:e2d9fd696e2d1523386d0f64f115352acbfaf59d5ca4c681c23ea064393a2ac4",
+            "sha256:ede078fdc9af857ed454d1e9e51831b2d577255c794d4044ecc332d40f3e3b36",
+            "sha256:f512afa7bc10b848aaacab5dfff6f61255142dd3a5581f82980c12745b0b6cd3",
+            "sha256:fbf789cc6d3fadca4350fa87e5f710ad2628e1fdff71bf8f853ecd49599ebe23"
+        ],
+        "index": "pypi",
+        "version": "==0.9.0"
+    },
     "tabulate": {
         "hashes": [
             "sha256:e4ca13f26d0a6be2a2915428dc21e732f1e44dad7f76d7030b2ef1ec251cf7f2"
data/data_clean_with_transformations_and_factors.csv (new file, 2884 changed lines)
File diff suppressed because it is too large
data/interesting_variables.json (new file, 1 changed line)

@@ -0,0 +1 @@
+["has 2nd Flr", "has Fireplace", "has Garage", "nhood_Blmngtn", "nhood_Blueste", "nhood_BrDale", "nhood_BrkSide", "nhood_ClearCr", "nhood_CollgCr", "nhood_Crawfor", "nhood_Edwards", "nhood_Gilbert", "nhood_Greens", "nhood_GrnHill", "nhood_IDOTRR", "nhood_Landmrk", "nhood_MeadowV", "nhood_Mitchel", "nhood_Names", "nhood_NoRidge", "nhood_NPkVill", "nhood_NridgHt", "nhood_NWAmes", "nhood_OldTown", "nhood_SWISU", "nhood_Sawyer", "nhood_SawyerW", "nhood_Somerst", "nhood_StoneBr", "nhood_Timber", "nhood_Veenker", "build_type_1Fam", "build_type_2Fam", "build_type_Twnhs", "air_cond", "major_street", "new_home", "remodeled", "years_since_built", "years_since_remodeled", "recently_built", "recently_remodeled"]
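This file lists the factor columns kept for later notebooks: one-hot dummies for the neighborhood (nhood_*) and building type (build_type_*) nominal predictors, plus binary indicators such as "has Fireplace" and "new_home". The derivation itself happens in the suppressed notebook; a minimal sketch of how such columns could be produced with pandas -- the source column names "Neighborhood" and "Fireplaces" are assumptions based on the Ames data -- might look like:

import json

import pandas as pd

# Illustrative sketch only; the actual code is in the suppressed notebook.
df = pd.read_csv("data/data_clean.csv", index_col=["Order", "PID"])

# One-hot encode a nominal predictor into 0/1 factor columns.
dummies = pd.get_dummies(df["Neighborhood"], prefix="nhood")

# Derive a binary indicator from a count column.
df["has Fireplace"] = (df["Fireplaces"] > 0).astype(float)

# Persist the enriched data and the list of factor columns.
df = pd.concat([df, dummies], axis=1)
df.to_csv("data/data_clean_with_transformations_and_factors.csv")
with open("data/interesting_variables.json", "w") as file:
    json.dump(sorted(dummies.columns) + ["has Fireplace"], file)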
utils.py (68 changed lines)
@@ -33,6 +33,7 @@ import requests
 import tabulate
 
 
+FACTOR_VARIABLES = []
 INDEX_COLUMNS = ["Order", "PID"]
 LABEL_TYPES = ["nominal", "ordinal"]
 TARGET_VARIABLES = ["SalePrice"]
@@ -213,40 +214,33 @@ def correct_column_names(data_columns, *, repopulate=True):
     In rare cases, the variable name in the data description file was slightly
     changed, i.e., a dash or a space needs to be removed.
 
-    This function adjusts the keys in all the dictionaries and lists and
-    returns a dictionary summarizing the name changes.
+    This function adjusts the keys in all the dictionaries and lists.
     """
-    renamed = {}
     for desc_column in ALL_VARIABLES:
         if desc_column not in data_columns:
             for data_column in data_columns:
                 # Column name was truncated in description file.
                 if data_column.startswith(desc_column):
                     _rename_column(desc_column, data_column)
-                    renamed[desc_column] = data_column
                     break
                 # Spaces between words in Excel were removed.
                 adj_data_column = data_column.replace(" ", "")
                 if adj_data_column == desc_column:
                     _rename_column(desc_column, data_column)
-                    renamed[desc_column] = data_column
                     break
                 # Spaces between words in description file were removed.
                 adj_desc_column = desc_column.replace(" ", "")
                 if adj_data_column == adj_desc_column:
                     _rename_column(desc_column, data_column)
-                    renamed[desc_column] = data_column
                     break
                 # Dashes in description file were removed.
                 adj_desc_column = desc_column.replace("-", "")
                 if data_column == adj_desc_column:
                     _rename_column(desc_column, data_column)
-                    renamed[desc_column] = data_column
                     break
     # Propagate the change to all "secondary" dictionaries and lists.
     if repopulate:
         _populate_dicts_and_lists()
-    return renamed
 
 
 def update_column_descriptions(columns_to_be_kept, *, correct_columns=False):
@@ -254,20 +248,15 @@ def update_column_descriptions(columns_to_be_kept, *, correct_columns=False):
 
     After dropping some columns from the DataFrame, these removals must be
     propagated to the helper data structures defined in this module.
-
-    Returns a dictionary of all the columns with changed names.
     """
     global ALL_COLUMNS
     if correct_columns:
-        renamed = correct_column_names(columns_to_be_kept, repopulate=False)
-    else:
-        renamed = {}
+        correct_column_names(columns_to_be_kept, repopulate=False)
     columns_to_be_removed = list(set(ALL_COLUMNS) - set(columns_to_be_kept))
     for column in columns_to_be_removed:
         del ALL_COLUMNS[column]
     # Propagate the change to all "secondary" dictionaries and lists.
     _populate_dicts_and_lists()
-    return renamed
 
 
 def print_column_list(subset=None):
@@ -285,19 +274,20 @@
         subset -= without_desc
     columns = [(c, ALL_COLUMNS[c]["description"]) for c in subset]
     if without_desc:
-        for c in sorted(without_desc):
-            columns.append((c, ''))
+        for column in sorted(without_desc):
+            columns.append((column, ""))
     print(tabulate.tabulate(sorted(columns), tablefmt="plain"))
 
 
-def load_clean_data(subset=None, ordinal_encoded=False):
+def load_clean_data(*, path=None, ordinal_encoded=False):
     """Return the clean project data as a pandas DataFrame.
 
     This utility function ensures that each column is cast to its correct type.
 
-    It takes as an optional 'subset' argument a list of columns and
-    'ordinal_encoded' can be set to True to obtain the ordinal columns already
-    encoded into ordered integers.
+    It takes the following optional keyword-only arguments:
+    - 'path': path to the clean CSV file (defaults to "data/data_clean.csv")
+    - 'ordinal_encoded': can be set to True to obtain the ordinal columns that
+      are already encoded into ordered integers
 
     The target variable "SalePrice" is always included as the last column.
 
@@ -310,22 +300,17 @@ def load_clean_data(subset=None, ordinal_encoded=False):
 
     Another difficulty is that some values, e.g., "NA" strings are cast as
     np.NaN / None by pandas although they represent actual label values.
-
-    As column names come in slightly different form compared to the data
-    description file, the subsetting can only be done after loading the CSV
-    and some work needs to be put in to figure out if a column mentioned in the
-    subset was renamed.
     """
     # pragma pylint:disable=invalid-name
     df = pd.read_csv(
-        "data/data_clean.csv",
+        "data/data_clean.csv" if path is None else path,
         index_col=INDEX_COLUMNS,
         dtype=object,
         na_values="",  # There are no missing values in the clean data file.
         keep_default_na=False,  # "NA" strings are casted as actual values.
     )
     # Remove columns that are in the description but not in the data file.
-    renamed = update_column_descriptions(df.columns, correct_columns=True)
+    update_column_descriptions(df.columns, correct_columns=True)
     # Cast the numeric types correctly.
     for column in CONTINUOUS_VARIABLES + TARGET_VARIABLES:
         df[column] = df[column].astype(float)
@@ -342,18 +327,23 @@ def load_clean_data(subset=None, ordinal_encoded=False):
                 reversed(mapping["order"]), ordered=True
             )
             df[column] = df[column].astype(labels)
-    # Mirror the renaming and dropping of columns
-    # for the provided list of columns.
-    # Note that the target variable goes last.
-    if subset is not None:
-        subset = set(subset)
-        subset.discard("SalePrice")
-        for old_name, new_name in renamed.items():
-            if old_name in subset:
-                subset.remove(old_name)
-                subset.add(new_name)
-        subset = sorted(set(df.columns) & subset)
-        df = df[subset + TARGET_VARIABLES]
+    # After the raw data cleaning, several derived variables were created.
+    derived_columns = set(df.columns) - set(ALL_VARIABLES + TARGET_VARIABLES)
+    if derived_columns:
+        for column in derived_columns:
+            # All derived variables are numeric (including factors).
+            df[column] = df[column].astype(float)
+            # Check if the derived variable is a target variable.
+            for target in TARGET_VARIABLES[:]:
+                if column.startswith(target):
+                    TARGET_VARIABLES.append(column)
+                    break
+            else:
+                ALL_COLUMNS[column] = {
+                    "type": "continuous",
+                    "description": "derived variable",
+                }
+        _populate_dicts_and_lists()
     # Use integer encoding for ordinal variables.
     if ordinal_encoded:
         df = encode_ordinals(df)
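Taken together, the utils.py changes drop the old 'subset' machinery (and the 'renamed' bookkeeping it required) in favour of auto-detecting derived columns, and add a keyword-only 'path' so the same loader can read the new CSV with the factor variables. A small usage sketch of the new signature:

import utils

# Default behaviour: reads "data/data_clean.csv" with correct dtypes.
df = utils.load_clean_data()

# Load the enriched file; derived columns (e.g., the nhood_* factors)
# are detected, cast to float, and registered automatically.
df = utils.load_clean_data(
    path="data/data_clean_with_transformations_and_factors.csv",
    ordinal_encoded=True,  # ordinal labels become ordered integers
)
print(df.columns[-1])  # per the docstring, "SalePrice" always goes last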