Add utility function to update helper dicts / lists
This commit is contained in:
parent
85bb52e318
commit
441f121350
2 changed files with 25 additions and 23 deletions
|
@ -30,7 +30,7 @@
|
||||||
"name": "stdout",
|
"name": "stdout",
|
||||||
"output_type": "stream",
|
"output_type": "stream",
|
||||||
"text": [
|
"text": [
|
||||||
"2018-08-29 10:31:14 CEST\n",
|
"2018-08-29 11:19:59 CEST\n",
|
||||||
"\n",
|
"\n",
|
||||||
"CPython 3.6.5\n",
|
"CPython 3.6.5\n",
|
||||||
"IPython 6.5.0\n",
|
"IPython 6.5.0\n",
|
||||||
|
@ -94,6 +94,7 @@
|
||||||
" ORDINAL_COLUMNS,\n",
|
" ORDINAL_COLUMNS,\n",
|
||||||
" ORDINAL_VARIABLES,\n",
|
" ORDINAL_VARIABLES,\n",
|
||||||
" correct_column_names,\n",
|
" correct_column_names,\n",
|
||||||
|
" update_column_descriptions,\n",
|
||||||
")"
|
")"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
@ -2175,19 +2176,8 @@
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# Also remove the discarded columns from the helper lists.\n",
|
"# Remove the discarded columns from the helper dictionaries / lists.\n",
|
||||||
"ALL_VARIABLES = sorted(set(ALL_VARIABLES) - set(missing_a_lot))\n",
|
"update_column_descriptions(df.columns)\n",
|
||||||
"CONTINUOUS_VARIABLES = sorted(set(CONTINUOUS_VARIABLES) - set(missing_a_lot))\n",
|
|
||||||
"DISCRETE_VARIABLES = sorted(set(DISCRETE_VARIABLES) - set(missing_a_lot))\n",
|
|
||||||
"NUMERIC_VARIABLES = sorted(set(NUMERIC_VARIABLES) - set(missing_a_lot))"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 34,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"# Without any more missing data, convert\n",
|
"# Without any more missing data, convert\n",
|
||||||
"# the discrete columns to the correct data type.\n",
|
"# the discrete columns to the correct data type.\n",
|
||||||
"for column in DISCRETE_VARIABLES:\n",
|
"for column in DISCRETE_VARIABLES:\n",
|
||||||
|
@ -2210,7 +2200,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 35,
|
"execution_count": 34,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
|
@ -2219,7 +2209,7 @@
|
||||||
"(2898, 78)"
|
"(2898, 78)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"execution_count": 35,
|
"execution_count": 34,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"output_type": "execute_result"
|
"output_type": "execute_result"
|
||||||
}
|
}
|
||||||
|
@ -2230,7 +2220,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 36,
|
"execution_count": 35,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
|
@ -2970,7 +2960,7 @@
|
||||||
"5 527105010 1997 1998 2010 189900 "
|
"5 527105010 1997 1998 2010 189900 "
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"execution_count": 36,
|
"execution_count": 35,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"output_type": "execute_result"
|
"output_type": "execute_result"
|
||||||
}
|
}
|
||||||
|
@ -2981,7 +2971,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 37,
|
"execution_count": 36,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
|
|
|
@ -108,9 +108,7 @@ def _extract_meta_data(lines):
|
||||||
# Skip the non-feature columns (that are always non-label columns).
|
# Skip the non-feature columns (that are always non-label columns).
|
||||||
if name in non_feature_columns:
|
if name in non_feature_columns:
|
||||||
continue
|
continue
|
||||||
# name = name.strip()
|
|
||||||
type_ = type_.lower()
|
type_ = type_.lower()
|
||||||
# description = description.strip()
|
|
||||||
# Create an entry for the next variable in the list.
|
# Create an entry for the next variable in the list.
|
||||||
ALL_COLUMNS[name] = {"type": type_, "description": description}
|
ALL_COLUMNS[name] = {"type": type_, "description": description}
|
||||||
# Only if the variable is a label type, a lookup table is needed.
|
# Only if the variable is a label type, a lookup table is needed.
|
||||||
|
@ -118,7 +116,7 @@ def _extract_meta_data(lines):
|
||||||
ALL_COLUMNS[name].update({"lookups": {}})
|
ALL_COLUMNS[name].update({"lookups": {}})
|
||||||
# Ordinal variables also store the order of their realizations
|
# Ordinal variables also store the order of their realizations
|
||||||
# exactly as defined in the data description file.
|
# exactly as defined in the data description file.
|
||||||
if type_ == 'ordinal':
|
if type_ == "ordinal":
|
||||||
ALL_COLUMNS[name].update({"order": []})
|
ALL_COLUMNS[name].update({"order": []})
|
||||||
# Add label realizations to a previously found label variable.
|
# Add label realizations to a previously found label variable.
|
||||||
elif type_ in LABEL_TYPES:
|
elif type_ in LABEL_TYPES:
|
||||||
|
@ -126,7 +124,7 @@ def _extract_meta_data(lines):
|
||||||
code, description = match.groups()
|
code, description = match.groups()
|
||||||
code = code.strip()
|
code = code.strip()
|
||||||
ALL_COLUMNS[name]["lookups"][code] = description
|
ALL_COLUMNS[name]["lookups"][code] = description
|
||||||
if type_ == 'ordinal':
|
if type_ == "ordinal":
|
||||||
ALL_COLUMNS[name]["order"].append(code)
|
ALL_COLUMNS[name]["order"].append(code)
|
||||||
|
|
||||||
|
|
||||||
|
@ -239,6 +237,20 @@ def correct_column_names(data_columns):
|
||||||
_populate_dicts_and_lists()
|
_populate_dicts_and_lists()
|
||||||
|
|
||||||
|
|
||||||
|
def update_column_descriptions(columns_to_be_kept):
    """Drop discarded columns from the module's exported data structures.

    Once columns have been removed from the DataFrame, call this function
    with the remaining column names so that the helper data structures
    defined in this module reflect the same removals.

    Args:
        columns_to_be_kept: iterable of the column names that remain; every
            key of ALL_COLUMNS that is not contained in it gets deleted.
    """
    # ALL_COLUMNS is only mutated in place (never rebound), hence no
    # `global` declaration is required.
    for obsolete in set(ALL_COLUMNS).difference(columns_to_be_kept):
        del ALL_COLUMNS[obsolete]
    # Propagate the removals to all "secondary" dictionaries and lists.
    # NOTE(review): callers that imported those secondary names directly
    # (e.g. `from module import DISCRETE_VARIABLES`) presumably only see the
    # update if the helper mutates them in place -- confirm.
    _populate_dicts_and_lists()
|
||||||
|
|
||||||
|
|
||||||
# This code is executed once during import time and
|
# This code is executed once during import time and
|
||||||
# populates all the "constants" directly or indirectly.
|
# populates all the "constants" directly or indirectly.
|
||||||
_extract_meta_data(_get_lines())
|
_extract_meta_data(_get_lines())
|
||||||
|
|
Loading…
Reference in a new issue