Add feature transformations (e.g. Box-Cox)

This commit is contained in:
Alexander Hess 2018-09-03 15:57:24 +02:00
parent e662960fde
commit 069691cca1
7 changed files with 5025 additions and 89 deletions

View file

@ -30,7 +30,7 @@
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"2018-09-02 18:50:50 CEST\n", "2018-09-03 15:32:42 CEST\n",
"\n", "\n",
"CPython 3.6.5\n", "CPython 3.6.5\n",
"IPython 6.5.0\n", "IPython 6.5.0\n",
@ -93,7 +93,7 @@
" NUMERIC_VARIABLES, # groups continuous and discrete\n", " NUMERIC_VARIABLES, # groups continuous and discrete\n",
" ORDINAL_COLUMNS,\n", " ORDINAL_COLUMNS,\n",
" ORDINAL_VARIABLES,\n", " ORDINAL_VARIABLES,\n",
" TARGET_VARIABLE, # = Sale Price\n", " TARGET_VARIABLES, # = Sale Price\n",
" correct_column_names,\n", " correct_column_names,\n",
" print_column_list,\n", " print_column_list,\n",
" update_column_descriptions,\n", " update_column_descriptions,\n",
@ -200,7 +200,7 @@
"# order as in the encoded description file.\n", "# order as in the encoded description file.\n",
"# Note that the target variable \"SalePrice\"\n", "# Note that the target variable \"SalePrice\"\n",
"# is not in the description file.\n", "# is not in the description file.\n",
"df = df[ALL_VARIABLES + TARGET_VARIABLE]" "df = df[ALL_VARIABLES + TARGET_VARIABLES]"
] ]
}, },
{ {
@ -267,7 +267,7 @@
"outputs": [], "outputs": [],
"source": [ "source": [
"# Show that all \"continuous\" variables come as integers.\n", "# Show that all \"continuous\" variables come as integers.\n",
"for column in NUMERIC_VARIABLES + TARGET_VARIABLE:\n", "for column in NUMERIC_VARIABLES + TARGET_VARIABLES:\n",
" not_null = df[column].notnull()\n", " not_null = df[column].notnull()\n",
" mask = (\n", " mask = (\n",
" df.loc[not_null, column].astype(np.int64)\n", " df.loc[not_null, column].astype(np.int64)\n",
@ -2238,7 +2238,7 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"remaining_columns = sorted(set(ALL_VARIABLES) - set(missing_a_lot)) + TARGET_VARIABLE\n", "remaining_columns = sorted(set(ALL_VARIABLES) - set(missing_a_lot)) + TARGET_VARIABLES\n",
"mask = df[remaining_columns].isnull().any(axis=1)\n", "mask = df[remaining_columns].isnull().any(axis=1)\n",
"assert (100 * mask.sum() / df.shape[0]) < 1.1 # percent\n", "assert (100 * mask.sum() / df.shape[0]) < 1.1 # percent\n",
"df = df[~mask]" "df = df[~mask]"
@ -2288,7 +2288,7 @@
"update_column_descriptions(df.columns)\n", "update_column_descriptions(df.columns)\n",
"# Without any more missing data, cast all numeric\n", "# Without any more missing data, cast all numeric\n",
"# columns as floats or integers respectively.\n", "# columns as floats or integers respectively.\n",
"for column in CONTINUOUS_VARIABLES + TARGET_VARIABLE:\n", "for column in CONTINUOUS_VARIABLES + TARGET_VARIABLES:\n",
" df[column] = df[column].astype(np.float64)\n", " df[column] = df[column].astype(np.float64)\n",
"for column in DISCRETE_VARIABLES:\n", "for column in DISCRETE_VARIABLES:\n",
" df[column] = df[column].astype(np.int64)" " df[column] = df[column].astype(np.int64)"

File diff suppressed because one or more lines are too long

View file

@ -12,8 +12,10 @@ xlrd = "*"
xlwt = "*" xlwt = "*"
tabulate = "*" tabulate = "*"
requests = "*" requests = "*"
matplotlib = "*"
seaborn = "*" seaborn = "*"
missingno = "*" missingno = "*"
sklearn = "*"
[dev-packages] [dev-packages]
black = "*" black = "*"

45
Pipfile.lock generated
View file

@ -1,7 +1,7 @@
{ {
"_meta": { "_meta": {
"hash": { "hash": {
"sha256": "b9e10abf6d574d2d423a53aca049fcee7b8f1c730088a0079da637e77d22c2c0" "sha256": "36a1b0a9371a1a4fdd5fc9035120f57e601b4d9f2fd4ddd279b14e95c2063498"
}, },
"pipfile-spec": 6, "pipfile-spec": 6,
"requires": { "requires": {
@ -224,7 +224,7 @@
"sha256:c6e83ca4e70dc5d3185ee15544a66fbf25aff6f242d6102c2358cee240963785", "sha256:c6e83ca4e70dc5d3185ee15544a66fbf25aff6f242d6102c2358cee240963785",
"sha256:e041d42e6cf4a363aa544e6affedc7ce21c71232147c9f206044e7dfb034bc94" "sha256:e041d42e6cf4a363aa544e6affedc7ce21c71232147c9f206044e7dfb034bc94"
], ],
"markers": "python_version != '3.0.*' and python_version != '3.2.*' and python_version != '3.3.*' and python_version != '3.1.*' and python_version >= '2.7'", "index": "pypi",
"version": "==3.0.0rc2" "version": "==3.0.0rc2"
}, },
"missingno": { "missingno": {
@ -452,6 +452,40 @@
"index": "pypi", "index": "pypi",
"version": "==2.19.1" "version": "==2.19.1"
}, },
"scikit-learn": {
"hashes": [
"sha256:0aa0ac9a47459f5a5f078b345e56b15cf7a3126ecddbd5c22dfbe3f898884f90",
"sha256:10543e70ab8500deb2e21e8ffddba83680585dc8346b773acec8fbbcf4d7f878",
"sha256:131c09fe20e25db4262d54279513950c9dee1c7ad45564404a753e588014b970",
"sha256:1829ebc8d86285429e310db61c0d3d3983756f2d13373e6d446c115c926c937d",
"sha256:31f6fc497e01de726488828e312606ff87bf19cba54f51a731852f1ef4113242",
"sha256:3c94c99e9411c54850077e576a7fe9a4443b061a1a11b66a506be51ca9083b5b",
"sha256:605380170c080f620bcd34c6e8489bd721cfa8c478854b092c9505076d069d42",
"sha256:64301ce9014b8082c0eae4636edb49381e9d5357a3a39ed80151453ed0f8b519",
"sha256:68c2e5df3cc3f681b15271325016d3fc1ba23c65b1361da0d764606050002397",
"sha256:72ece165f2262a262e702e51a06b661fb86f3d30c20fa20f09f0f30097656244",
"sha256:760292b61931d36963df3c3657b0292da3f348974235e0af55cd09c5db0c1280",
"sha256:7a15dfaae603bc78632d0bb2809bb781b67461f689dc92da9bca44551dd011cf",
"sha256:7cb3c857ffcb349795947f2938969ced4271ac72bdbece55c00a0817392c2d15",
"sha256:83946e287749e9ddf853fc188d624e2c9280e88ce7cbb3b0d71980483e022fab",
"sha256:85a93c849344a1acf63a9ec3f97c4a4067497daf63ca985e7244d75a185c11a6",
"sha256:9f83e5353fbde280c0e8302f4b78fe3795f36be67ad02ff4ceb84d6e7d3001fa",
"sha256:a0c8c06582b01b7b8c568a426483e09e47ca91dce585e7d4b78c66bab5f19d4f",
"sha256:a9b2d9329a6802a38461b7ff81d9de9c218e18835ff7a196dcf93d85937f7d24",
"sha256:ac45ae4963e2a5119e2d88f1e8e5cbfac43851c6aa9f6477d5bd452e9dcd97f4",
"sha256:b5baee191ff940f4d536d370f1481a1819dbc5b9443e2e28593922979fac9aaf",
"sha256:b723f603169d86f4cac8ed47453ab5cb07db46ea31a103960f6bacc4c7605634",
"sha256:b9158ba32ba8d97f747ab40f29f6df7088b70d0aaea7763b4a88f9ef8ede198c",
"sha256:ce00d619768594b7d0013c3f5a8405236df8730474479324de0328e73c812536",
"sha256:db85a112e187409ee464675f6d3703f5a81b26dc30e155929a950f85c672335c",
"sha256:df0ea014e791dcec8b55def33189dc000a3d6b635a10739317fea058a427e634",
"sha256:f0454968f13611972feefc0d3f86ffc59cee98e3c2f545f2abae7f338a74f7ee",
"sha256:f4119d45b770703cd86f7f7b1fd5416415cd5ff089f3e88b417ab016749f5f3d",
"sha256:f575e6572db703a94418cd8e880f131a62b8767f298d531eaf38e98be8abfe9a"
],
"markers": "python_version != '3.2.*' and python_version != '3.1.*' and python_version != '3.3.*' and python_version != '3.0.*' and python_version >= '2.7'",
"version": "==0.20rc1"
},
"scipy": { "scipy": {
"hashes": [ "hashes": [
"sha256:0611ee97296265af4a21164a5323f8c1b4e8e15c582d3dfa7610825900136bb7", "sha256:0611ee97296265af4a21164a5323f8c1b4e8e15c582d3dfa7610825900136bb7",
@ -514,6 +548,13 @@
], ],
"version": "==1.11.0" "version": "==1.11.0"
}, },
"sklearn": {
"hashes": [
"sha256:e23001573aa194b834122d2b9562459bf5ae494a2d59ca6b8aa22c85a44c0e31"
],
"index": "pypi",
"version": "==0.0"
},
"tabulate": { "tabulate": {
"hashes": [ "hashes": [
"sha256:e4ca13f26d0a6be2a2915428dc21e732f1e44dad7f76d7030b2ef1ec251cf7f2" "sha256:e4ca13f26d0a6be2a2915428dc21e732f1e44dad7f76d7030b2ef1ec251cf7f2"

File diff suppressed because it is too large Load diff

View file

@ -35,7 +35,7 @@ import tabulate
INDEX_COLUMNS = ["Order", "PID"] INDEX_COLUMNS = ["Order", "PID"]
LABEL_TYPES = ["nominal", "ordinal"] LABEL_TYPES = ["nominal", "ordinal"]
TARGET_VARIABLE = ["SalePrice"] TARGET_VARIABLES = ["SalePrice"]
# Note that these dictionaries and lists are not actually constants but # Note that these dictionaries and lists are not actually constants but
# filled in during import time which makes them "near"-constant. # filled in during import time which makes them "near"-constant.
ALL_COLUMNS = {} ALL_COLUMNS = {}
@ -102,7 +102,7 @@ def _extract_meta_data(lines):
# The two ID columns and the target variable "SalePrice" # The two ID columns and the target variable "SalePrice"
# are not put into the helper dicts / lists as they are # are not put into the helper dicts / lists as they are
# treated separately in the modelling anyways. # treated separately in the modelling anyways.
non_feature_columns = INDEX_COLUMNS + TARGET_VARIABLE non_feature_columns = INDEX_COLUMNS + TARGET_VARIABLES
for line in lines: for line in lines:
# Process the next variable in the list. # Process the next variable in the list.
@ -279,9 +279,15 @@ def print_column_list(subset=None):
if subset is None: if subset is None:
subset = ALL_VARIABLES subset = ALL_VARIABLES
else: else:
assert set(list(subset)) <= set(list(ALL_VARIABLES)) subset = set(subset)
columns = sorted((c, ALL_COLUMNS[c]["description"]) for c in subset) # Handle variables without description separately.
print(tabulate.tabulate(columns, tablefmt="plain")) without_desc = subset - set(ALL_VARIABLES)
subset -= without_desc
columns = [(c, ALL_COLUMNS[c]["description"]) for c in subset]
if without_desc:
for c in sorted(without_desc):
columns.append((c, ''))
print(tabulate.tabulate(sorted(columns), tablefmt="plain"))
def load_clean_data(subset=None, ordinal_encoded=False): def load_clean_data(subset=None, ordinal_encoded=False):
@ -321,7 +327,7 @@ def load_clean_data(subset=None, ordinal_encoded=False):
# Remove columns that are in the description but not in the data file. # Remove columns that are in the description but not in the data file.
renamed = update_column_descriptions(df.columns, correct_columns=True) renamed = update_column_descriptions(df.columns, correct_columns=True)
# Cast the numeric types correctly. # Cast the numeric types correctly.
for column in CONTINUOUS_VARIABLES + TARGET_VARIABLE: for column in CONTINUOUS_VARIABLES + TARGET_VARIABLES:
df[column] = df[column].astype(float) df[column] = df[column].astype(float)
for column in DISCRETE_VARIABLES: for column in DISCRETE_VARIABLES:
df[column] = df[column].astype(int) df[column] = df[column].astype(int)
@ -347,7 +353,7 @@ def load_clean_data(subset=None, ordinal_encoded=False):
subset.remove(old_name) subset.remove(old_name)
subset.add(new_name) subset.add(new_name)
subset = sorted(set(df.columns) & subset) subset = sorted(set(df.columns) & subset)
df = df[subset + TARGET_VARIABLE] df = df[subset + TARGET_VARIABLES]
# Use integer encoding for ordinal variables. # Use integer encoding for ordinal variables.
if ordinal_encoded: if ordinal_encoded:
df = encode_ordinals(df) df = encode_ordinals(df)

View file

@ -1 +1 @@
{"weakly_correlated": ["1st Flr SF", "Bsmt Exposure", "BsmtFin SF 1", "BsmtFin Type 1", "Fireplace Qu", "Fireplaces", "Full Bath", "Garage Area", "Garage Cond", "Garage Finish", "Garage Qual", "Half Bath", "Heating QC", "Mas Vnr Area", "Paved Drive", "TotRms AbvGrd", "Total Bsmt SF", "Year Remod/Add"], "strongly_correlated": ["Bsmt Qual", "Exter Qual", "Garage Cars", "Gr Liv Area", "Kitchen Qual", "Overall Qual", "Year Built"]} {"weakly_correlated": ["1st Flr SF", "Bsmt Exposure", "BsmtFin SF 1", "BsmtFin Type 1", "Fireplace Qu", "Fireplaces", "Full Bath", "Garage Area", "Garage Cond", "Garage Finish", "Garage Qual", "Half Bath", "Heating QC", "Mas Vnr Area", "Paved Drive", "TotRms AbvGrd", "TotRms AbvGrd (box-cox-0.0)", "Total Bsmt SF", "Total Porch SF", "Wood Deck SF", "Year Remod/Add"], "strongly_correlated": ["Bsmt Qual", "Exter Qual", "Garage Cars", "Gr Liv Area", "Kitchen Qual", "Overall Qual", "Total Bath", "Total Bath (box-cox-0.5)", "Year Built"]}