Add utility function to update helper dicts / lists

2018-08-29 11:20:47 +02:00 · 2018-08-29 11:20:47 +02:00 · 441f121350
commit 441f121350
parent 85bb52e318
2 changed files with 25 additions and 23 deletions
--- a/1_data_cleaning.ipynb
+++ b/1_data_cleaning.ipynb
@ -30,7 +30,7 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "2018-08-29 10:31:14 CEST\n",
+      "2018-08-29 11:19:59 CEST\n",
      "\n",
      "CPython 3.6.5\n",
      "IPython 6.5.0\n",
@ -94,6 +94,7 @@
    "    ORDINAL_COLUMNS,\n",
    "    ORDINAL_VARIABLES,\n",
    "    correct_column_names,\n",
+    "    update_column_descriptions,\n",
    ")"
   ]
  },
@ -2175,19 +2176,8 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "# Also remove the discarded columns from the helper lists.\n",
-    "ALL_VARIABLES = sorted(set(ALL_VARIABLES) - set(missing_a_lot))\n",
-    "CONTINUOUS_VARIABLES = sorted(set(CONTINUOUS_VARIABLES) - set(missing_a_lot))\n",
-    "DISCRETE_VARIABLES = sorted(set(DISCRETE_VARIABLES) - set(missing_a_lot))\n",
-    "NUMERIC_VARIABLES = sorted(set(NUMERIC_VARIABLES) - set(missing_a_lot))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 34,
-   "metadata": {},
-   "outputs": [],
-   "source": [
+    "# Remove the discarded columns from the helper dictionaries / lists.\n",
+    "update_column_descriptions(df.columns)\n",
    "# Without any more missing data, convert\n",
    "# the discrete columns to the correct data type.\n",
    "for column in DISCRETE_VARIABLES:\n",
@ -2210,7 +2200,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 35,
+   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
@ -2219,7 +2209,7 @@
       "(2898, 78)"
      ]
     },
-     "execution_count": 35,
+     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
@ -2230,7 +2220,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 36,
+   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
@ -2970,7 +2960,7 @@
       "5     527105010        1997            1998     2010     189900  "
      ]
     },
-     "execution_count": 36,
+     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
@ -2981,7 +2971,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 37,
+   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
--- a/cleaning_utils.py
+++ b/cleaning_utils.py
@ -108,9 +108,7 @@ def _extract_meta_data(lines):
            # Skip the non-feature columns (that are always non-label columns).
            if name in non_feature_columns:
                continue
-            # name = name.strip()
            type_ = type_.lower()
-            # description = description.strip()
            # Create an entry for the next variable in the list.
            ALL_COLUMNS[name] = {"type": type_, "description": description}
            # Only if the variable is a label type, a lookup table is needed.
@ -118,7 +116,7 @@ def _extract_meta_data(lines):
                ALL_COLUMNS[name].update({"lookups": {}})
            # Ordinal variables also store the order of their realizations
            # exactly as defined in the data description file.
-            if type_ == 'ordinal':
+            if type_ == "ordinal":
                ALL_COLUMNS[name].update({"order": []})
        # Add label realizations to a previously found label variable.
        elif type_ in LABEL_TYPES:
@ -126,7 +124,7 @@ def _extract_meta_data(lines):
            code, description = match.groups()
            code = code.strip()
            ALL_COLUMNS[name]["lookups"][code] = description
-            if type_ == 'ordinal':
+            if type_ == "ordinal":
                ALL_COLUMNS[name]["order"].append(code)


@ -239,6 +237,20 @@ def correct_column_names(data_columns):
    _populate_dicts_and_lists()


+def update_column_descriptions(columns_to_be_kept):
+    """Remove discarded columns from the module's exported data structures.
+
+    Call this after dropping columns from the DataFrame: every ALL_COLUMNS
+    entry not listed in columns_to_be_kept is deleted and the change propagated.
+    """
+    global ALL_COLUMNS
+    columns_to_be_removed = list(set(ALL_COLUMNS) - set(columns_to_be_kept))
+    for column in columns_to_be_removed:
+        del ALL_COLUMNS[column]
+    # Propagate the change to all "secondary" dictionaries and lists.
+    _populate_dicts_and_lists()
+
+
 # This code is executed once during import time and
 # populates all the "constants" directly or indirectly.
 _extract_meta_data(_get_lines())