Add utility function to update helper dicts / lists
This commit is contained in:
parent
85bb52e318
commit
441f121350
2 changed files with 25 additions and 23 deletions
|
@ -30,7 +30,7 @@
|
|||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"2018-08-29 10:31:14 CEST\n",
|
||||
"2018-08-29 11:19:59 CEST\n",
|
||||
"\n",
|
||||
"CPython 3.6.5\n",
|
||||
"IPython 6.5.0\n",
|
||||
|
@ -94,6 +94,7 @@
|
|||
" ORDINAL_COLUMNS,\n",
|
||||
" ORDINAL_VARIABLES,\n",
|
||||
" correct_column_names,\n",
|
||||
" update_column_descriptions,\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
|
@ -2175,19 +2176,8 @@
|
|||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Also remove the discarded columns from the helper lists.\n",
|
||||
"ALL_VARIABLES = sorted(set(ALL_VARIABLES) - set(missing_a_lot))\n",
|
||||
"CONTINUOUS_VARIABLES = sorted(set(CONTINUOUS_VARIABLES) - set(missing_a_lot))\n",
|
||||
"DISCRETE_VARIABLES = sorted(set(DISCRETE_VARIABLES) - set(missing_a_lot))\n",
|
||||
"NUMERIC_VARIABLES = sorted(set(NUMERIC_VARIABLES) - set(missing_a_lot))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 34,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Remove the discarded columns from the helper dictionaries / lists.\n",
|
||||
"update_column_descriptions(df.columns)\n",
|
||||
"# Without any more missing data, convert\n",
|
||||
"# the discrete columns to the correct data type.\n",
|
||||
"for column in DISCRETE_VARIABLES:\n",
|
||||
|
@ -2210,7 +2200,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 35,
|
||||
"execution_count": 34,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
|
@ -2219,7 +2209,7 @@
|
|||
"(2898, 78)"
|
||||
]
|
||||
},
|
||||
"execution_count": 35,
|
||||
"execution_count": 34,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
@ -2230,7 +2220,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 36,
|
||||
"execution_count": 35,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
|
@ -2970,7 +2960,7 @@
|
|||
"5 527105010 1997 1998 2010 189900 "
|
||||
]
|
||||
},
|
||||
"execution_count": 36,
|
||||
"execution_count": 35,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
@ -2981,7 +2971,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 37,
|
||||
"execution_count": 36,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
|
|
@ -108,9 +108,7 @@ def _extract_meta_data(lines):
|
|||
# Skip the non-feature columns (that are always non-label columns).
|
||||
if name in non_feature_columns:
|
||||
continue
|
||||
# name = name.strip()
|
||||
type_ = type_.lower()
|
||||
# description = description.strip()
|
||||
# Create an entry for the next variable in the list.
|
||||
ALL_COLUMNS[name] = {"type": type_, "description": description}
|
||||
# Only if the variable is a label type, a lookup table is needed.
|
||||
|
@ -118,7 +116,7 @@ def _extract_meta_data(lines):
|
|||
ALL_COLUMNS[name].update({"lookups": {}})
|
||||
# Ordinal variables also store the order of their realizations
|
||||
# exactly as defined in the data description file.
|
||||
if type_ == 'ordinal':
|
||||
if type_ == "ordinal":
|
||||
ALL_COLUMNS[name].update({"order": []})
|
||||
# Add label realizations to a previously found label variable.
|
||||
elif type_ in LABEL_TYPES:
|
||||
|
@ -126,7 +124,7 @@ def _extract_meta_data(lines):
|
|||
code, description = match.groups()
|
||||
code = code.strip()
|
||||
ALL_COLUMNS[name]["lookups"][code] = description
|
||||
if type_ == 'ordinal':
|
||||
if type_ == "ordinal":
|
||||
ALL_COLUMNS[name]["order"].append(code)
|
||||
|
||||
|
||||
|
@ -239,6 +237,20 @@ def correct_column_names(data_columns):
|
|||
_populate_dicts_and_lists()
|
||||
|
||||
|
||||
def update_column_descriptions(columns_to_be_kept):
    """Remove discarded columns from all the module's exported data structures.

    After dropping some columns from the DataFrame, these removals must be
    propagated to the helper data structures defined in this module.

    columns_to_be_kept is an iterable of the column names that are still in
    use; every key of ALL_COLUMNS that is not contained in it gets deleted.
    Names in columns_to_be_kept that are unknown to ALL_COLUMNS are ignored.
    """
    # ALL_COLUMNS is only mutated in place and never rebound,
    # so no "global" declaration is required here.
    for column in set(ALL_COLUMNS) - set(columns_to_be_kept):
        del ALL_COLUMNS[column]
    # Propagate the change to all "secondary" dictionaries and lists.
    # NOTE(review): callers that obtained the secondary objects via
    # "from ... import NAME" presumably only see the update if the helper
    # mutates them in place instead of rebinding them -- confirm.
    _populate_dicts_and_lists()
|
||||
|
||||
|
||||
# This code is executed once during import time and
|
||||
# populates all the "constants" directly or indirectly.
|
||||
_extract_meta_data(_get_lines())
|
||||
|
|
Loading…
Reference in a new issue