Add utility function to update helper dicts / lists

2018-08-29 11:20:47 +02:00 · 2018-08-29 11:20:47 +02:00 · 441f121350
commit 441f121350
parent 85bb52e318
2 changed files with 25 additions and 23 deletions
--- a/1_data_cleaning.ipynb
+++ b/1_data_cleaning.ipynb
@ -30,7 +30,7 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "2018-08-29 10:31:14 CEST\n",
+      "2018-08-29 11:19:59 CEST\n",
      "\n",
      "CPython 3.6.5\n",
      "IPython 6.5.0\n",
@ -94,6 +94,7 @@
    "    ORDINAL_COLUMNS,\n",
    "    ORDINAL_VARIABLES,\n",
    "    correct_column_names,\n",
    "    update_column_descriptions,\n",
    ")"
   ]
  },
@ -2175,19 +2176,8 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "# Also remove the discarded columns from the helper lists.\n",
+    "# Remove the discarded columns from the helper dictionaries / lists.\n",
-    "ALL_VARIABLES = sorted(set(ALL_VARIABLES) - set(missing_a_lot))\n",
+    "update_column_descriptions(df.columns)\n",
    "CONTINUOUS_VARIABLES = sorted(set(CONTINUOUS_VARIABLES) - set(missing_a_lot))\n",
    "DISCRETE_VARIABLES = sorted(set(DISCRETE_VARIABLES) - set(missing_a_lot))\n",
    "NUMERIC_VARIABLES = sorted(set(NUMERIC_VARIABLES) - set(missing_a_lot))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Without any more missing data, convert\n",
    "# the discrete columns to the correct data type.\n",
    "for column in DISCRETE_VARIABLES:\n",
@ -2210,7 +2200,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 35,
+   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
@ -2219,7 +2209,7 @@
       "(2898, 78)"
      ]
     },
-     "execution_count": 35,
+     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
@ -2230,7 +2220,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 36,
+   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
@ -2970,7 +2960,7 @@
       "5     527105010        1997            1998     2010     189900  "
      ]
     },
-     "execution_count": 36,
+     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
@ -2981,7 +2971,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 37,
+   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
--- a/cleaning_utils.py
+++ b/cleaning_utils.py
@ -108,9 +108,7 @@ def _extract_meta_data(lines):
            # Skip the non-feature columns (that are always non-label columns).
            if name in non_feature_columns:
                continue
            # name = name.strip()
            type_ = type_.lower()
            # description = description.strip()
            # Create an entry for the next variable in the list.
            ALL_COLUMNS[name] = {"type": type_, "description": description}
            # Only if the variable is a label type, a lookup table is needed.
@ -118,7 +116,7 @@ def _extract_meta_data(lines):
                ALL_COLUMNS[name].update({"lookups": {}})
            # Ordinal variables also store the order of their realizations
            # exactly as defined in the data description file.
-            if type_ == 'ordinal':
+            if type_ == "ordinal":
                ALL_COLUMNS[name].update({"order": []})
        # Add label realizations to a previously found label variable.
        elif type_ in LABEL_TYPES:
@ -126,7 +124,7 @@ def _extract_meta_data(lines):
            code, description = match.groups()
            code = code.strip()
            ALL_COLUMNS[name]["lookups"][code] = description
-            if type_ == 'ordinal':
+            if type_ == "ordinal":
                ALL_COLUMNS[name]["order"].append(code)
@ -239,6 +237,20 @@ def correct_column_names(data_columns):
    _populate_dicts_and_lists()
 def update_column_descriptions(columns_to_be_kept):
    """Discard every column that is not listed in columns_to_be_kept.

    Call this after dropping columns from the DataFrame so that the
    removals are propagated to the helper data structures this module
    exports.
    """
    retained = set(columns_to_be_kept)
    # Collect the stale names up front: ALL_COLUMNS is mutated in place and
    # a dictionary must not change size while it is being iterated over.
    stale = [name for name in ALL_COLUMNS if name not in retained]
    for name in stale:
        del ALL_COLUMNS[name]
    # Let all "secondary" dictionaries and lists pick up the change.
    _populate_dicts_and_lists()
 # Module-level side effect: this call runs once, at import time, and
 # populates all the "constants" of this module directly or indirectly.
 _extract_meta_data(_get_lines())