From 488fb69da9923a5ad002f42fe4fc0708fe9e997d Mon Sep 17 00:00:00 2001 From: Alexander Hess Date: Sat, 1 Sep 2018 16:52:46 +0200 Subject: [PATCH] Re-factor code into function to print a column list --- 1_data_cleaning.ipynb | 38 ++++++++++++++++++++------------------ utils.py | 26 ++++++++++++++++++++++---- 2 files changed, 42 insertions(+), 22 deletions(-) diff --git a/1_data_cleaning.ipynb b/1_data_cleaning.ipynb index 18e187c..1091d57 100644 --- a/1_data_cleaning.ipynb +++ b/1_data_cleaning.ipynb @@ -30,7 +30,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "2018-08-29 17:18:17 CEST\n", + "2018-09-01 16:51:42 CEST\n", "\n", "CPython 3.6.5\n", "IPython 6.5.0\n", @@ -60,9 +60,7 @@ "source": [ "import missingno as msno\n", "import numpy as np\n", - "import pandas as pd\n", - "\n", - "from tabulate import tabulate" + "import pandas as pd" ] }, { @@ -96,6 +94,7 @@ " ORDINAL_COLUMNS,\n", " ORDINAL_VARIABLES,\n", " correct_column_names,\n", + " print_column_list,\n", " update_column_descriptions,\n", ")" ] @@ -383,8 +382,7 @@ } ], "source": [ - "table = ((key, value[\"description\"]) for (key, value) in CONTINUOUS_COLUMNS.items())\n", - "print(tabulate(sorted(table), tablefmt=\"plain\"))" + "print_column_list(CONTINUOUS_COLUMNS)" ] }, { @@ -749,8 +747,7 @@ } ], "source": [ - "table = ((key, value[\"description\"]) for (key, value) in DISCRETE_COLUMNS.items())\n", - "print(tabulate(sorted(table), tablefmt=\"plain\"))" + "print_column_list(DISCRETE_COLUMNS)" ] }, { @@ -1081,8 +1078,7 @@ } ], "source": [ - "table = ((key, value[\"description\"]) for (key, value) in NOMINAL_COLUMNS.items())\n", - "print(tabulate(sorted(table), tablefmt=\"plain\"))" + "print_column_list(NOMINAL_COLUMNS)" ] }, { @@ -1635,8 +1631,7 @@ } ], "source": [ - "table = ((key, value[\"description\"]) for (key, value) in ORDINAL_COLUMNS.items())\n", - "print(tabulate(sorted(table), tablefmt=\"plain\"))" + "print_column_list(ORDINAL_COLUMNS)" ] }, { @@ -2146,7 +2141,9 @@ "
" ] }, - "metadata": {}, + "metadata": { + "needs_background": "light" + }, "output_type": "display_data" } ], @@ -2166,7 +2163,9 @@ "
" ] }, - "metadata": {}, + "metadata": { + "needs_background": "light" + }, "output_type": "display_data" } ], @@ -2186,7 +2185,9 @@ "
" ] }, - "metadata": {}, + "metadata": { + "needs_background": "light" + }, "output_type": "display_data" } ], @@ -2206,7 +2207,9 @@ "
" ] }, - "metadata": {}, + "metadata": { + "needs_background": "light" + }, "output_type": "display_data" } ], @@ -2262,8 +2265,7 @@ } ], "source": [ - "table = ((col, ALL_COLUMNS[col][\"description\"]) for col in sorted(missing_a_lot))\n", - "print(tabulate(table, tablefmt=\"plain\"))" + "print_column_list(missing_a_lot)" ] }, { diff --git a/utils.py b/utils.py index e408604..a96b153 100644 --- a/utils.py +++ b/utils.py @@ -24,11 +24,12 @@ Implementation Note: This file defines the "constants" it exports dynamically. This is a bit advanced but intentional! """ -# pragma pylint:disable=W0603 +# pragma pylint:disable=global-statement import re import requests +import tabulate INDEX_COLUMNS = ["Order", "PID"] @@ -204,7 +205,7 @@ def _rename_column(old_name, new_name): del ALL_COLUMNS[old_name] -def correct_column_names(data_columns): +def correct_column_names(data_columns, *, repopulate=True): """Cross-check the column names between data and description file. In rare cases, the variable name in the data description file was slightly @@ -235,16 +236,19 @@ def correct_column_names(data_columns): _rename_column(desc_column, data_column) break # Propagate the change to all "secondary" dictionaries and lists. - _populate_dicts_and_lists() + if repopulate: + _populate_dicts_and_lists() -def update_column_descriptions(columns_to_be_kept): +def update_column_descriptions(columns_to_be_kept, *, correct_columns=False): """Remove discarded columns for all the module's exported data structures. After dropping some columns from the DataFrame, these removals must be propagated to the helper data structures defined in this module. """ global ALL_COLUMNS + if correct_columns: + correct_column_names(columns_to_be_kept, repopulate=False) columns_to_be_removed = list(set(ALL_COLUMNS) - set(columns_to_be_kept)) for column in columns_to_be_removed: del ALL_COLUMNS[column] @@ -252,6 +256,20 @@ def update_column_descriptions(columns_to_be_kept): _populate_dicts_and_lists() +def print_column_list(subset=None): + """Print (a subset of) the data's column headers. + + Note that this function is built to handle both *_COLUMNS dicts and + *_VARIABLES lists. + """ + if subset is None: + subset = ALL_VARIABLES + else: + assert set(list(subset)) <= set(list(ALL_VARIABLES)) + columns = sorted((c, ALL_COLUMNS[c]["description"]) for c in subset) + print(tabulate.tabulate(columns, tablefmt="plain")) + + # This code is executed once during import time and # populates all the "constants" directly or indirectly. _extract_meta_data(_get_lines())