From 441f121350bc5a11d2af740ba55498cb9a479c87 Mon Sep 17 00:00:00 2001 From: Alexander Hess Date: Wed, 29 Aug 2018 11:20:47 +0200 Subject: [PATCH] Add utility function to update helper dicts / lists --- 1_data_cleaning.ipynb | 28 +++++++++------------------- cleaning_utils.py | 20 ++++++++++++++++---- 2 files changed, 25 insertions(+), 23 deletions(-) diff --git a/1_data_cleaning.ipynb b/1_data_cleaning.ipynb index 451a26c..b89c90e 100644 --- a/1_data_cleaning.ipynb +++ b/1_data_cleaning.ipynb @@ -30,7 +30,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "2018-08-29 10:31:14 CEST\n", + "2018-08-29 11:19:59 CEST\n", "\n", "CPython 3.6.5\n", "IPython 6.5.0\n", @@ -94,6 +94,7 @@ " ORDINAL_COLUMNS,\n", " ORDINAL_VARIABLES,\n", " correct_column_names,\n", + " update_column_descriptions,\n", ")" ] }, @@ -2175,19 +2176,8 @@ "metadata": {}, "outputs": [], "source": [ - "# Also remove the discarded columns from the helper lists.\n", - "ALL_VARIABLES = sorted(set(ALL_VARIABLES) - set(missing_a_lot))\n", - "CONTINUOUS_VARIABLES = sorted(set(CONTINUOUS_VARIABLES) - set(missing_a_lot))\n", - "DISCRETE_VARIABLES = sorted(set(DISCRETE_VARIABLES) - set(missing_a_lot))\n", - "NUMERIC_VARIABLES = sorted(set(NUMERIC_VARIABLES) - set(missing_a_lot))" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "metadata": {}, - "outputs": [], - "source": [ + "# Remove the discarded columns from the helper dictionaries / lists.\n", + "update_column_descriptions(df.columns)\n", "# Without any more missing data, convert\n", "# the discrete columns to the correct data type.\n", "for column in DISCRETE_VARIABLES:\n", @@ -2210,7 +2200,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 34, "metadata": {}, "outputs": [ { @@ -2219,7 +2209,7 @@ "(2898, 78)" ] }, - "execution_count": 35, + "execution_count": 34, "metadata": {}, "output_type": "execute_result" } @@ -2230,7 +2220,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 35, "metadata": {}, "outputs": [ { @@ -2970,7 +2960,7 @@ "5 527105010 1997 1998 2010 189900 " ] }, - "execution_count": 36, + "execution_count": 35, "metadata": {}, "output_type": "execute_result" } @@ -2981,7 +2971,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 36, "metadata": {}, "outputs": [], "source": [ diff --git a/cleaning_utils.py b/cleaning_utils.py index 57b9116..2927378 100644 --- a/cleaning_utils.py +++ b/cleaning_utils.py @@ -108,9 +108,7 @@ def _extract_meta_data(lines): # Skip the non-feature columns (that are always non-label columns). if name in non_feature_columns: continue - # name = name.strip() type_ = type_.lower() - # description = description.strip() # Create an entry for the next variable in the list. ALL_COLUMNS[name] = {"type": type_, "description": description} # Only if the variable is a label type, a lookup table is needed. @@ -118,7 +116,7 @@ def _extract_meta_data(lines): ALL_COLUMNS[name].update({"lookups": {}}) # Ordinal variables also store the order of their realizations # exactly as defined in the data description file. - if type_ == 'ordinal': + if type_ == "ordinal": ALL_COLUMNS[name].update({"order": []}) # Add label realizations to a previously found label variable. elif type_ in LABEL_TYPES: @@ -126,7 +124,7 @@ def _extract_meta_data(lines): code, description = match.groups() code = code.strip() ALL_COLUMNS[name]["lookups"][code] = description - if type_ == 'ordinal': + if type_ == "ordinal": ALL_COLUMNS[name]["order"].append(code) @@ -239,6 +237,20 @@ def correct_column_names(data_columns): _populate_dicts_and_lists() +def update_column_descriptions(columns_to_be_kept): + """Remove discarded columns for all the module's exported data structures. + + After dropping some columns from the DataFrame, these removals must be + propagated to the helper data structures defined in this module. + """ + global ALL_COLUMNS + columns_to_be_removed = list(set(ALL_COLUMNS) - set(columns_to_be_kept)) + for column in columns_to_be_removed: + del ALL_COLUMNS[column] + # Propagate the change to all "secondary" dictionaries and lists. + _populate_dicts_and_lists() + + # This code is executed once during import time and # populates all the "constants" directly or indirectly. _extract_meta_data(_get_lines())