Analyse the nominal predictors and add factor variables
commit 9c684bc866
parent 387a495a80
6 changed files with 7946 additions and 40 deletions
3_descriptive_visualizations.ipynb (new file, 4995 changed lines)
File diff suppressed because one or more lines are too long
Pipfile (1 changed line)

@@ -16,6 +16,7 @@ matplotlib = "*"
 seaborn = "*"
 missingno = "*"
 sklearn = "*"
+statsmodels = "*"
 
 [dev-packages]
 black = "*"
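The newly added statsmodels package (together with its patsy dependency, pinned below in Pipfile.lock) is presumably what the notebook uses to analyse the nominal predictors. As an illustrative sketch only -- none of this exact code appears in the diff, and the "Neighborhood" column name is an assumption based on the Ames housing data -- a patsy formula can dummy-encode a nominal column inside an OLS fit:

import pandas as pd
import statsmodels.formula.api as smf

# Illustrative sketch; the real analysis lives in the suppressed notebook.
df = pd.read_csv("data/data_clean.csv", index_col=["Order", "PID"])

# C(...) tells patsy to expand the nominal predictor into dummy variables.
model = smf.ols("SalePrice ~ C(Neighborhood)", data=df).fit()
print(model.summary())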
Pipfile.lock (generated, 37 changed lines)

@@ -1,7 +1,7 @@
 {
     "_meta": {
         "hash": {
-            "sha256": "36a1b0a9371a1a4fdd5fc9035120f57e601b4d9f2fd4ddd279b14e95c2063498"
+            "sha256": "9179f1a05ed984d241ed8fcfa9c66c135100dbc2137f22627ac15977ced9ce87"
         },
         "pipfile-spec": 6,
         "requires": {
@@ -335,6 +335,13 @@
         ],
         "version": "==0.3.1"
     },
+    "patsy": {
+        "hashes": [
+            "sha256:14269536ecedaae0a5a2f300faac7d0afa1cc47aa98c561f54ba7300d0ec4011",
+            "sha256:e05f38d5c38c8d216f0cc2b765b1069b433c92d628b954fb2fee68d13e42883b"
+        ],
+        "version": "==0.5.0"
+    },
     "pexpect": {
         "hashes": [
             "sha256:2a8e88259839571d1251d278476f3eec5db26deb73a70be5ed5dc5435e418aba",
@@ -555,6 +562,34 @@
         "index": "pypi",
         "version": "==0.0"
     },
+    "statsmodels": {
+        "hashes": [
+            "sha256:0fd6af8db18b776c81c8fba54de20e9ec2f11b9310871b6b666d8805e3cf5ece",
+            "sha256:18844bbd95fcf62885d195571334762533ae16de182e1032ccc1595a98ffffb4",
+            "sha256:27e87cc6cd390fce8f44df225dadf589e1df6272f36b267ccdece2a9c4f52938",
+            "sha256:2902f5eef49fc38c112ffd8168dd76f7ae27f6cb5aa735cf55bc887b49aaec6e",
+            "sha256:31c2e26436a992e66355c0b3ef4b7c9714a0aa8375952d24f0593ac7c417b1e9",
+            "sha256:5d91ad30b8e20a45f583077ffeb4352be01955033f3dcd09bc06c30be1d29e8f",
+            "sha256:5de3d525b9a8679cd6c0f7f7c8cb8508275ab86cc3c1a140b2dc6b6390adb943",
+            "sha256:6461f93a842c649922c2c9a9bc9d9c4834110b89de8c4af196a791ab8f42ba3b",
+            "sha256:78d1b40c18d41f6c683c1c184be146264a782d409a89d8ed6c78acd1e1c11659",
+            "sha256:7c1a7cf557139f4bcbf97172268a8001156e42a7eeccca04d15c0cb7c3491ada",
+            "sha256:8532885c5778f94dae7ad83c4ac3f6916d4c8eb294f47ecefe2f0d3b967e6a16",
+            "sha256:95d35b33a301ded560662c733780ce58b37e218d122bb1b9c14e216aa9d42a2a",
+            "sha256:b48e283ba171698dca3989c0c03e6f25d3f431640383d926235d26ce48f3891c",
+            "sha256:b4b4b25c0e4228b1d33098894c3b29f4546e45afb29b333582cbaa5e16f38f3c",
+            "sha256:c06fd4af98f4c7ab61c9a79fd051ad4d7247991a691c3b4883c611029bac30a2",
+            "sha256:d2003c70c854f35a6446a465c61c994486039feb2fd47345a1e9984e95d55878",
+            "sha256:d7182803cdb09f1f17a335c0eae71d84905da9b0bc35c3d2c2379745f33096d9",
+            "sha256:d9b85bd98e90a02f2192084a85c857465e40e508629ac922242dba70731d0449",
+            "sha256:e2d9fd696e2d1523386d0f64f115352acbfaf59d5ca4c681c23ea064393a2ac4",
+            "sha256:ede078fdc9af857ed454d1e9e51831b2d577255c794d4044ecc332d40f3e3b36",
+            "sha256:f512afa7bc10b848aaacab5dfff6f61255142dd3a5581f82980c12745b0b6cd3",
+            "sha256:fbf789cc6d3fadca4350fa87e5f710ad2628e1fdff71bf8f853ecd49599ebe23"
+        ],
+        "index": "pypi",
+        "version": "==0.9.0"
+    },
     "tabulate": {
         "hashes": [
             "sha256:e4ca13f26d0a6be2a2915428dc21e732f1e44dad7f76d7030b2ef1ec251cf7f2"
data/data_clean_with_transformations_and_factors.csv (new file, 2884 changed lines)
File diff suppressed because it is too large
data/interesting_variables.json (new file, 1 changed line)

@@ -0,0 +1 @@
+["has 2nd Flr", "has Fireplace", "has Garage", "nhood_Blmngtn", "nhood_Blueste", "nhood_BrDale", "nhood_BrkSide", "nhood_ClearCr", "nhood_CollgCr", "nhood_Crawfor", "nhood_Edwards", "nhood_Gilbert", "nhood_Greens", "nhood_GrnHill", "nhood_IDOTRR", "nhood_Landmrk", "nhood_MeadowV", "nhood_Mitchel", "nhood_Names", "nhood_NoRidge", "nhood_NPkVill", "nhood_NridgHt", "nhood_NWAmes", "nhood_OldTown", "nhood_SWISU", "nhood_Sawyer", "nhood_SawyerW", "nhood_Somerst", "nhood_StoneBr", "nhood_Timber", "nhood_Veenker", "build_type_1Fam", "build_type_2Fam", "build_type_Twnhs", "air_cond", "major_street", "new_home", "remodeled", "years_since_built", "years_since_remodeled", "recently_built", "recently_remodeled"]
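This file lists the factor columns kept for later notebooks: one-hot dummies for the neighborhood (nhood_*) and building type (build_type_*) nominal predictors, plus binary indicators such as "has Fireplace" and "new_home". The derivation itself happens in the suppressed notebook; a minimal sketch of how such columns could be produced with pandas -- the source column names "Neighborhood" and "Fireplaces" are assumptions based on the Ames data -- might look like:

import json

import pandas as pd

# Illustrative sketch only; the actual code is in the suppressed notebook.
df = pd.read_csv("data/data_clean.csv", index_col=["Order", "PID"])

# One-hot encode a nominal predictor into 0/1 factor columns.
dummies = pd.get_dummies(df["Neighborhood"], prefix="nhood")

# Derive a binary indicator from a count column.
df["has Fireplace"] = (df["Fireplaces"] > 0).astype(float)

# Persist the enriched data and the list of factor columns.
df = pd.concat([df, dummies], axis=1)
df.to_csv("data/data_clean_with_transformations_and_factors.csv")
with open("data/interesting_variables.json", "w") as file:
    json.dump(sorted(dummies.columns) + ["has Fireplace"], file)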
utils.py (68 changed lines)
@@ -33,6 +33,7 @@ import requests
 import tabulate
 
 
+FACTOR_VARIABLES = []
 INDEX_COLUMNS = ["Order", "PID"]
 LABEL_TYPES = ["nominal", "ordinal"]
 TARGET_VARIABLES = ["SalePrice"]
@@ -213,40 +214,33 @@ def correct_column_names(data_columns, *, repopulate=True):
     In rare cases, the variable name in the data description file was slightly
     changed, i.e., a dash or a space needs to be removed.
 
-    This function adjusts the keys in all the dictionaries and lists and
-    returns a dictionary summarizing the name changes.
+    This function adjusts the keys in all the dictionaries and lists.
     """
-    renamed = {}
     for desc_column in ALL_VARIABLES:
         if desc_column not in data_columns:
             for data_column in data_columns:
                 # Column name was truncated in description file.
                 if data_column.startswith(desc_column):
                     _rename_column(desc_column, data_column)
-                    renamed[desc_column] = data_column
                     break
                 # Spaces between words in Excel were removed.
                 adj_data_column = data_column.replace(" ", "")
                 if adj_data_column == desc_column:
                     _rename_column(desc_column, data_column)
-                    renamed[desc_column] = data_column
                     break
                 # Spaces between words in description file were removed.
                 adj_desc_column = desc_column.replace(" ", "")
                 if adj_data_column == adj_desc_column:
                     _rename_column(desc_column, data_column)
-                    renamed[desc_column] = data_column
                     break
                 # Dashes in description file were removed.
                 adj_desc_column = desc_column.replace("-", "")
                 if data_column == adj_desc_column:
                     _rename_column(desc_column, data_column)
-                    renamed[desc_column] = data_column
                     break
     # Propagate the change to all "secondary" dictionaries and lists.
     if repopulate:
         _populate_dicts_and_lists()
-    return renamed
 
 
 def update_column_descriptions(columns_to_be_kept, *, correct_columns=False):
@@ -254,20 +248,15 @@ def update_column_descriptions(columns_to_be_kept, *, correct_columns=False):
 
     After dropping some columns from the DataFrame, these removals must be
     propagated to the helper data structures defined in this module.
-
-    Returns a dictionary of all the columns with changed names.
     """
     global ALL_COLUMNS
     if correct_columns:
-        renamed = correct_column_names(columns_to_be_kept, repopulate=False)
-    else:
-        renamed = {}
+        correct_column_names(columns_to_be_kept, repopulate=False)
     columns_to_be_removed = list(set(ALL_COLUMNS) - set(columns_to_be_kept))
     for column in columns_to_be_removed:
         del ALL_COLUMNS[column]
     # Propagate the change to all "secondary" dictionaries and lists.
     _populate_dicts_and_lists()
-    return renamed
 
 
 def print_column_list(subset=None):
@@ -285,19 +274,20 @@
         subset -= without_desc
     columns = [(c, ALL_COLUMNS[c]["description"]) for c in subset]
     if without_desc:
-        for c in sorted(without_desc):
-            columns.append((c, ''))
+        for column in sorted(without_desc):
+            columns.append((column, ""))
     print(tabulate.tabulate(sorted(columns), tablefmt="plain"))
 
 
-def load_clean_data(subset=None, ordinal_encoded=False):
+def load_clean_data(*, path=None, ordinal_encoded=False):
     """Return the clean project data as a pandas DataFrame.
 
     This utility function ensures that each column is cast to its correct type.
 
-    It takes as an optional 'subset' argument a list of columns and
-    'ordinal_encoded' can be set to True to obtain the ordinal columns already
-    encoded into ordered integers.
+    It takes the following optional keyword-only arguments:
+    - 'path': path to the clean CSV file (defaults to "data/data_clean.csv")
+    - 'ordinal_encoded': can be set to True to obtain the ordinal columns that
+      are already encoded into ordered integers
 
     The target variable "SalePrice" is always included as the last column.
 
@@ -310,22 +300,17 @@ def load_clean_data(subset=None, ordinal_encoded=False):
 
     Another difficulty is that some values, e.g., "NA" strings are cast as
     np.NaN / None by pandas although they represent actual label values.
-
-    As column names come in slightly different form compared to the data
-    description file, the subsetting can only be done after loading the CSV
-    and some work needs to be put in to figure out if a column mentioned in the
-    subset was renamed.
     """
     # pragma pylint:disable=invalid-name
     df = pd.read_csv(
-        "data/data_clean.csv",
+        "data/data_clean.csv" if path is None else path,
         index_col=INDEX_COLUMNS,
         dtype=object,
         na_values="",  # There are no missing values in the clean data file.
         keep_default_na=False,  # "NA" strings are casted as actual values.
     )
     # Remove columns that are in the description but not in the data file.
-    renamed = update_column_descriptions(df.columns, correct_columns=True)
+    update_column_descriptions(df.columns, correct_columns=True)
     # Cast the numeric types correctly.
     for column in CONTINUOUS_VARIABLES + TARGET_VARIABLES:
         df[column] = df[column].astype(float)
@@ -342,18 +327,23 @@ def load_clean_data(subset=None, ordinal_encoded=False):
                 reversed(mapping["order"]), ordered=True
             )
             df[column] = df[column].astype(labels)
-    # Mirror the renaming and dropping of columns
-    # for the provided list of columns.
-    # Note that the target variable goes last.
-    if subset is not None:
-        subset = set(subset)
-        subset.discard("SalePrice")
-        for old_name, new_name in renamed.items():
-            if old_name in subset:
-                subset.remove(old_name)
-                subset.add(new_name)
-        subset = sorted(set(df.columns) & subset)
-        df = df[subset + TARGET_VARIABLES]
+    # After the raw data cleaning, several derived variables were created.
+    derived_columns = set(df.columns) - set(ALL_VARIABLES + TARGET_VARIABLES)
+    if derived_columns:
+        for column in derived_columns:
+            # All derived variables are numeric (including factors).
+            df[column] = df[column].astype(float)
+            # Check if the derived variable is a target variable.
+            for target in TARGET_VARIABLES[:]:
+                if column.startswith(target):
+                    TARGET_VARIABLES.append(column)
+                    break
+            else:
+                ALL_COLUMNS[column] = {
+                    "type": "continuous",
+                    "description": "derived variable",
+                }
+        _populate_dicts_and_lists()
     # Use integer encoding for ordinal variables.
     if ordinal_encoded:
         df = encode_ordinals(df)
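Taken together, the utils.py changes drop the old 'subset' machinery (and the 'renamed' bookkeeping it required) in favour of auto-detecting derived columns, and add a keyword-only 'path' so the same loader can read the new CSV with the factor variables. A small usage sketch of the new signature:

import utils

# Default behaviour: reads "data/data_clean.csv" with correct dtypes.
df = utils.load_clean_data()

# Load the enriched file; derived columns (e.g., the nhood_* factors)
# are detected, cast to float, and registered automatically.
df = utils.load_clean_data(
    path="data/data_clean_with_transformations_and_factors.csv",
    ordinal_encoded=True,  # ordinal labels become ordered integers
)
print(df.columns[-1])  # per the docstring, "SalePrice" always goes last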