Refactor code into a function that prints a column list

2018-09-01 16:52:46 +02:00 · 2018-09-01 16:52:46 +02:00 · 488fb69da9
commit 488fb69da9
parent ffeed18376
2 changed files with 42 additions and 22 deletions
--- a/1_data_cleaning.ipynb
+++ b/1_data_cleaning.ipynb
@ -30,7 +30,7 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "2018-08-29 17:18:17 CEST\n",
+      "2018-09-01 16:51:42 CEST\n",
      "\n",
      "CPython 3.6.5\n",
      "IPython 6.5.0\n",
@ -60,9 +60,7 @@
   "source": [
    "import missingno as msno\n",
    "import numpy as np\n",
-    "import pandas as pd\n",
-    "\n",
-    "from tabulate import tabulate"
+    "import pandas as pd"
   ]
  },
  {
@ -96,6 +94,7 @@
    "    ORDINAL_COLUMNS,\n",
    "    ORDINAL_VARIABLES,\n",
    "    correct_column_names,\n",
+    "    print_column_list,\n",
    "    update_column_descriptions,\n",
    ")"
   ]
@ -383,8 +382,7 @@
    }
   ],
   "source": [
-    "table = ((key, value[\"description\"]) for (key, value) in CONTINUOUS_COLUMNS.items())\n",
-    "print(tabulate(sorted(table), tablefmt=\"plain\"))"
+    "print_column_list(CONTINUOUS_COLUMNS)"
   ]
  },
  {
@ -749,8 +747,7 @@
    }
   ],
   "source": [
-    "table = ((key, value[\"description\"]) for (key, value) in DISCRETE_COLUMNS.items())\n",
-    "print(tabulate(sorted(table), tablefmt=\"plain\"))"
+    "print_column_list(DISCRETE_COLUMNS)"
   ]
  },
  {
@ -1081,8 +1078,7 @@
    }
   ],
   "source": [
-    "table = ((key, value[\"description\"]) for (key, value) in NOMINAL_COLUMNS.items())\n",
-    "print(tabulate(sorted(table), tablefmt=\"plain\"))"
+    "print_column_list(NOMINAL_COLUMNS)"
   ]
  },
  {
@ -1635,8 +1631,7 @@
    }
   ],
   "source": [
-    "table = ((key, value[\"description\"]) for (key, value) in ORDINAL_COLUMNS.items())\n",
-    "print(tabulate(sorted(table), tablefmt=\"plain\"))"
+    "print_column_list(ORDINAL_COLUMNS)"
   ]
  },
  {
@ -2146,7 +2141,9 @@
       "<Figure size 1800x720 with 2 Axes>"
      ]
     },
-     "metadata": {},
+     "metadata": {
+      "needs_background": "light"
+     },
     "output_type": "display_data"
    }
   ],
@ -2166,7 +2163,9 @@
       "<Figure size 1800x720 with 2 Axes>"
      ]
     },
-     "metadata": {},
+     "metadata": {
+      "needs_background": "light"
+     },
     "output_type": "display_data"
    }
   ],
@ -2186,7 +2185,9 @@
       "<Figure size 1800x720 with 2 Axes>"
      ]
     },
-     "metadata": {},
+     "metadata": {
+      "needs_background": "light"
+     },
     "output_type": "display_data"
    }
   ],
@ -2206,7 +2207,9 @@
       "<Figure size 1800x720 with 2 Axes>"
      ]
     },
-     "metadata": {},
+     "metadata": {
+      "needs_background": "light"
+     },
     "output_type": "display_data"
    }
   ],
@ -2262,8 +2265,7 @@
    }
   ],
   "source": [
-    "table = ((col, ALL_COLUMNS[col][\"description\"]) for col in sorted(missing_a_lot))\n",
-    "print(tabulate(table, tablefmt=\"plain\"))"
+    "print_column_list(missing_a_lot)"
   ]
  },
  {
--- a/utils.py
+++ b/utils.py
@ -24,11 +24,12 @@ Implementation Note:
    This file defines the "constants" it exports dynamically. This is a bit
    advanced but intentional!
 """
-# pragma pylint:disable=W0603
+# pragma pylint:disable=global-statement

 import re

 import requests
+import tabulate


 INDEX_COLUMNS = ["Order", "PID"]
@ -204,7 +205,7 @@ def _rename_column(old_name, new_name):
    del ALL_COLUMNS[old_name]


-def correct_column_names(data_columns):
+def correct_column_names(data_columns, *, repopulate=True):
    """Cross-check the column names between data and description file.

    In rare cases, the variable name in the data description file was slightly
@ -235,16 +236,19 @@ def correct_column_names(data_columns):
                    _rename_column(desc_column, data_column)
                    break
    # Propagate the change to all "secondary" dictionaries and lists.
-    _populate_dicts_and_lists()
+    if repopulate:
+        _populate_dicts_and_lists()


-def update_column_descriptions(columns_to_be_kept):
+def update_column_descriptions(columns_to_be_kept, *, correct_columns=False):
    """Remove discarded columns for all the module's exported data structures.

    After dropping some columns from the DataFrame, these removals must be
    propagated to the helper data structures defined in this module.
    """
    global ALL_COLUMNS
+    if correct_columns:
+        correct_column_names(columns_to_be_kept, repopulate=False)
    columns_to_be_removed = list(set(ALL_COLUMNS) - set(columns_to_be_kept))
    for column in columns_to_be_removed:
        del ALL_COLUMNS[column]
@ -252,6 +256,20 @@ def update_column_descriptions(columns_to_be_kept):
    _populate_dicts_and_lists()


+def print_column_list(subset=None):
+    """Print the sorted names and descriptions of (a subset of) the columns.
+
+    subset may be a *_COLUMNS dict or a *_VARIABLES list (default: all);
+    a ValueError is raised if it has names that are not in ALL_VARIABLES.
+    """
+    subset = ALL_VARIABLES if subset is None else subset
+    unknown = set(subset) - set(ALL_VARIABLES)
+    if unknown:  # raise explicitly as asserts vanish under -O
+        raise ValueError("unknown columns: {}".format(sorted(unknown)))
+    columns = sorted((c, ALL_COLUMNS[c]["description"]) for c in subset)
+    print(tabulate.tabulate(columns, tablefmt="plain"))
+
+
 # This code is executed once during import time and
 # populates all the "constants" directly or indirectly.
 _extract_meta_data(_get_lines())