Add utility function to update helper dicts / lists

This commit is contained in:
Alexander Hess 2018-08-29 11:20:47 +02:00
parent 85bb52e318
commit 441f121350
2 changed files with 25 additions and 23 deletions

View file

@ -30,7 +30,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"2018-08-29 10:31:14 CEST\n",
"2018-08-29 11:19:59 CEST\n",
"\n",
"CPython 3.6.5\n",
"IPython 6.5.0\n",
@ -94,6 +94,7 @@
" ORDINAL_COLUMNS,\n",
" ORDINAL_VARIABLES,\n",
" correct_column_names,\n",
" update_column_descriptions,\n",
")"
]
},
@ -2175,19 +2176,8 @@
"metadata": {},
"outputs": [],
"source": [
"# Also remove the discarded columns from the helper lists.\n",
"ALL_VARIABLES = sorted(set(ALL_VARIABLES) - set(missing_a_lot))\n",
"CONTINUOUS_VARIABLES = sorted(set(CONTINUOUS_VARIABLES) - set(missing_a_lot))\n",
"DISCRETE_VARIABLES = sorted(set(DISCRETE_VARIABLES) - set(missing_a_lot))\n",
"NUMERIC_VARIABLES = sorted(set(NUMERIC_VARIABLES) - set(missing_a_lot))"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"# Remove the discarded columns from the helper dictionaries / lists.\n",
"update_column_descriptions(df.columns)\n",
"# Without any more missing data, convert\n",
"# the discrete columns to the correct data type.\n",
"for column in DISCRETE_VARIABLES:\n",
@ -2210,7 +2200,7 @@
},
{
"cell_type": "code",
"execution_count": 35,
"execution_count": 34,
"metadata": {},
"outputs": [
{
@ -2219,7 +2209,7 @@
"(2898, 78)"
]
},
"execution_count": 35,
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
@ -2230,7 +2220,7 @@
},
{
"cell_type": "code",
"execution_count": 36,
"execution_count": 35,
"metadata": {},
"outputs": [
{
@ -2970,7 +2960,7 @@
"5 527105010 1997 1998 2010 189900 "
]
},
"execution_count": 36,
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
@ -2981,7 +2971,7 @@
},
{
"cell_type": "code",
"execution_count": 37,
"execution_count": 36,
"metadata": {},
"outputs": [],
"source": [

View file

@ -108,9 +108,7 @@ def _extract_meta_data(lines):
# Skip the non-feature columns (that are always non-label columns).
if name in non_feature_columns:
continue
# name = name.strip()
type_ = type_.lower()
# description = description.strip()
# Create an entry for the next variable in the list.
ALL_COLUMNS[name] = {"type": type_, "description": description}
# Only if the variable is a label type, a lookup table is needed.
@ -118,7 +116,7 @@ def _extract_meta_data(lines):
ALL_COLUMNS[name].update({"lookups": {}})
# Ordinal variables also store the order of their realizations
# exactly as defined in the data description file.
if type_ == 'ordinal':
if type_ == "ordinal":
ALL_COLUMNS[name].update({"order": []})
# Add label realizations to a previously found label variable.
elif type_ in LABEL_TYPES:
@ -126,7 +124,7 @@ def _extract_meta_data(lines):
code, description = match.groups()
code = code.strip()
ALL_COLUMNS[name]["lookups"][code] = description
if type_ == 'ordinal':
if type_ == "ordinal":
ALL_COLUMNS[name]["order"].append(code)
@ -239,6 +237,20 @@ def correct_column_names(data_columns):
_populate_dicts_and_lists()
def update_column_descriptions(columns_to_be_kept):
    """Drop discarded columns from the module's exported data structures.

    Once columns have been dropped from the DataFrame, the same removals
    have to be mirrored in the helper data structures of this module.

    Every key of ALL_COLUMNS that is not contained in columns_to_be_kept
    is deleted from ALL_COLUMNS in place.
    """
    retained = set(columns_to_be_kept)
    # Collect the obsolete keys first so that the dictionary
    # is not modified while it is being iterated over.
    obsolete = [name for name in ALL_COLUMNS if name not in retained]
    for name in obsolete:
        ALL_COLUMNS.pop(name)
    # Let all "secondary" dictionaries and lists pick up the change.
    _populate_dicts_and_lists()
# Module-level side effect: this call is executed once during import time.
# _extract_meta_data() creates the entries in ALL_COLUMNS from the lines
# returned by _get_lines(); all other "constants" are thereby populated
# directly or indirectly.
_extract_meta_data(_get_lines())