Add feature transformations (e.g. Box-Cox)

This commit is contained in:
Alexander Hess 2018-09-03 15:57:24 +02:00
parent e662960fde
commit 069691cca1
7 changed files with 5025 additions and 89 deletions

View file

@ -30,7 +30,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"2018-09-02 18:50:50 CEST\n",
"2018-09-03 15:32:42 CEST\n",
"\n",
"CPython 3.6.5\n",
"IPython 6.5.0\n",
@ -93,7 +93,7 @@
" NUMERIC_VARIABLES, # groups continuous and discrete\n",
" ORDINAL_COLUMNS,\n",
" ORDINAL_VARIABLES,\n",
" TARGET_VARIABLE, # = Sale Price\n",
" TARGET_VARIABLES, # = Sale Price\n",
" correct_column_names,\n",
" print_column_list,\n",
" update_column_descriptions,\n",
@ -200,7 +200,7 @@
"# order as in the encoded description file.\n",
"# Note that the target variable \"SalePrice\"\n",
"# is not in the description file.\n",
"df = df[ALL_VARIABLES + TARGET_VARIABLE]"
"df = df[ALL_VARIABLES + TARGET_VARIABLES]"
]
},
{
@ -267,7 +267,7 @@
"outputs": [],
"source": [
"# Show that all \"continuous\" variables come as integers.\n",
"for column in NUMERIC_VARIABLES + TARGET_VARIABLE:\n",
"for column in NUMERIC_VARIABLES + TARGET_VARIABLES:\n",
" not_null = df[column].notnull()\n",
" mask = (\n",
" df.loc[not_null, column].astype(np.int64)\n",
@ -2238,7 +2238,7 @@
"metadata": {},
"outputs": [],
"source": [
"remaining_columns = sorted(set(ALL_VARIABLES) - set(missing_a_lot)) + TARGET_VARIABLE\n",
"remaining_columns = sorted(set(ALL_VARIABLES) - set(missing_a_lot)) + TARGET_VARIABLES\n",
"mask = df[remaining_columns].isnull().any(axis=1)\n",
"assert (100 * mask.sum() / df.shape[0]) < 1.1 # percent\n",
"df = df[~mask]"
@ -2288,7 +2288,7 @@
"update_column_descriptions(df.columns)\n",
"# Without any more missing data, cast all numeric\n",
"# columns as floats or integers respectively.\n",
"for column in CONTINUOUS_VARIABLES + TARGET_VARIABLE:\n",
"for column in CONTINUOUS_VARIABLES + TARGET_VARIABLES:\n",
" df[column] = df[column].astype(np.float64)\n",
"for column in DISCRETE_VARIABLES:\n",
" df[column] = df[column].astype(np.int64)"

File diff suppressed because one or more lines are too long

View file

@ -12,8 +12,10 @@ xlrd = "*"
xlwt = "*"
tabulate = "*"
requests = "*"
matplotlib = "*"
seaborn = "*"
missingno = "*"
sklearn = "*"
[dev-packages]
black = "*"

45
Pipfile.lock generated
View file

@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
"sha256": "b9e10abf6d574d2d423a53aca049fcee7b8f1c730088a0079da637e77d22c2c0"
"sha256": "36a1b0a9371a1a4fdd5fc9035120f57e601b4d9f2fd4ddd279b14e95c2063498"
},
"pipfile-spec": 6,
"requires": {
@ -224,7 +224,7 @@
"sha256:c6e83ca4e70dc5d3185ee15544a66fbf25aff6f242d6102c2358cee240963785",
"sha256:e041d42e6cf4a363aa544e6affedc7ce21c71232147c9f206044e7dfb034bc94"
],
"markers": "python_version != '3.0.*' and python_version != '3.2.*' and python_version != '3.3.*' and python_version != '3.1.*' and python_version >= '2.7'",
"index": "pypi",
"version": "==3.0.0rc2"
},
"missingno": {
@ -452,6 +452,40 @@
"index": "pypi",
"version": "==2.19.1"
},
"scikit-learn": {
"hashes": [
"sha256:0aa0ac9a47459f5a5f078b345e56b15cf7a3126ecddbd5c22dfbe3f898884f90",
"sha256:10543e70ab8500deb2e21e8ffddba83680585dc8346b773acec8fbbcf4d7f878",
"sha256:131c09fe20e25db4262d54279513950c9dee1c7ad45564404a753e588014b970",
"sha256:1829ebc8d86285429e310db61c0d3d3983756f2d13373e6d446c115c926c937d",
"sha256:31f6fc497e01de726488828e312606ff87bf19cba54f51a731852f1ef4113242",
"sha256:3c94c99e9411c54850077e576a7fe9a4443b061a1a11b66a506be51ca9083b5b",
"sha256:605380170c080f620bcd34c6e8489bd721cfa8c478854b092c9505076d069d42",
"sha256:64301ce9014b8082c0eae4636edb49381e9d5357a3a39ed80151453ed0f8b519",
"sha256:68c2e5df3cc3f681b15271325016d3fc1ba23c65b1361da0d764606050002397",
"sha256:72ece165f2262a262e702e51a06b661fb86f3d30c20fa20f09f0f30097656244",
"sha256:760292b61931d36963df3c3657b0292da3f348974235e0af55cd09c5db0c1280",
"sha256:7a15dfaae603bc78632d0bb2809bb781b67461f689dc92da9bca44551dd011cf",
"sha256:7cb3c857ffcb349795947f2938969ced4271ac72bdbece55c00a0817392c2d15",
"sha256:83946e287749e9ddf853fc188d624e2c9280e88ce7cbb3b0d71980483e022fab",
"sha256:85a93c849344a1acf63a9ec3f97c4a4067497daf63ca985e7244d75a185c11a6",
"sha256:9f83e5353fbde280c0e8302f4b78fe3795f36be67ad02ff4ceb84d6e7d3001fa",
"sha256:a0c8c06582b01b7b8c568a426483e09e47ca91dce585e7d4b78c66bab5f19d4f",
"sha256:a9b2d9329a6802a38461b7ff81d9de9c218e18835ff7a196dcf93d85937f7d24",
"sha256:ac45ae4963e2a5119e2d88f1e8e5cbfac43851c6aa9f6477d5bd452e9dcd97f4",
"sha256:b5baee191ff940f4d536d370f1481a1819dbc5b9443e2e28593922979fac9aaf",
"sha256:b723f603169d86f4cac8ed47453ab5cb07db46ea31a103960f6bacc4c7605634",
"sha256:b9158ba32ba8d97f747ab40f29f6df7088b70d0aaea7763b4a88f9ef8ede198c",
"sha256:ce00d619768594b7d0013c3f5a8405236df8730474479324de0328e73c812536",
"sha256:db85a112e187409ee464675f6d3703f5a81b26dc30e155929a950f85c672335c",
"sha256:df0ea014e791dcec8b55def33189dc000a3d6b635a10739317fea058a427e634",
"sha256:f0454968f13611972feefc0d3f86ffc59cee98e3c2f545f2abae7f338a74f7ee",
"sha256:f4119d45b770703cd86f7f7b1fd5416415cd5ff089f3e88b417ab016749f5f3d",
"sha256:f575e6572db703a94418cd8e880f131a62b8767f298d531eaf38e98be8abfe9a"
],
"markers": "python_version != '3.2.*' and python_version != '3.1.*' and python_version != '3.3.*' and python_version != '3.0.*' and python_version >= '2.7'",
"version": "==0.20rc1"
},
"scipy": {
"hashes": [
"sha256:0611ee97296265af4a21164a5323f8c1b4e8e15c582d3dfa7610825900136bb7",
@ -514,6 +548,13 @@
],
"version": "==1.11.0"
},
"sklearn": {
"hashes": [
"sha256:e23001573aa194b834122d2b9562459bf5ae494a2d59ca6b8aa22c85a44c0e31"
],
"index": "pypi",
"version": "==0.0"
},
"tabulate": {
"hashes": [
"sha256:e4ca13f26d0a6be2a2915428dc21e732f1e44dad7f76d7030b2ef1ec251cf7f2"

File diff suppressed because it is too large Load diff

View file

@ -35,7 +35,7 @@ import tabulate
INDEX_COLUMNS = ["Order", "PID"]
LABEL_TYPES = ["nominal", "ordinal"]
TARGET_VARIABLE = ["SalePrice"]
TARGET_VARIABLES = ["SalePrice"]
# Note that these dictionaries and lists are not actually constants but
# filled in during import time which makes them "near"-constant.
ALL_COLUMNS = {}
@ -102,7 +102,7 @@ def _extract_meta_data(lines):
# The two ID columns and the target variable "SalePrice"
# are not put into the helper dicts / lists as they are
# treated separately in the modelling anyways.
non_feature_columns = INDEX_COLUMNS + TARGET_VARIABLE
non_feature_columns = INDEX_COLUMNS + TARGET_VARIABLES
for line in lines:
# Process the next variable in the list.
@ -279,9 +279,15 @@ def print_column_list(subset=None):
if subset is None:
subset = ALL_VARIABLES
else:
assert set(list(subset)) <= set(list(ALL_VARIABLES))
columns = sorted((c, ALL_COLUMNS[c]["description"]) for c in subset)
print(tabulate.tabulate(columns, tablefmt="plain"))
subset = set(subset)
# Handle variables without description separately.
without_desc = subset - set(ALL_VARIABLES)
subset -= without_desc
columns = [(c, ALL_COLUMNS[c]["description"]) for c in subset]
if without_desc:
for c in sorted(without_desc):
columns.append((c, ''))
print(tabulate.tabulate(sorted(columns), tablefmt="plain"))
def load_clean_data(subset=None, ordinal_encoded=False):
@ -321,7 +327,7 @@ def load_clean_data(subset=None, ordinal_encoded=False):
# Remove columns that are in the description but not in the data file.
renamed = update_column_descriptions(df.columns, correct_columns=True)
# Cast the numeric types correctly.
for column in CONTINUOUS_VARIABLES + TARGET_VARIABLE:
for column in CONTINUOUS_VARIABLES + TARGET_VARIABLES:
df[column] = df[column].astype(float)
for column in DISCRETE_VARIABLES:
df[column] = df[column].astype(int)
@ -347,7 +353,7 @@ def load_clean_data(subset=None, ordinal_encoded=False):
subset.remove(old_name)
subset.add(new_name)
subset = sorted(set(df.columns) & subset)
df = df[subset + TARGET_VARIABLE]
df = df[subset + TARGET_VARIABLES]
# Use integer encoding for ordinal variables.
if ordinal_encoded:
df = encode_ordinals(df)

View file

@ -1 +1 @@
{"weakly_correlated": ["1st Flr SF", "Bsmt Exposure", "BsmtFin SF 1", "BsmtFin Type 1", "Fireplace Qu", "Fireplaces", "Full Bath", "Garage Area", "Garage Cond", "Garage Finish", "Garage Qual", "Half Bath", "Heating QC", "Mas Vnr Area", "Paved Drive", "TotRms AbvGrd", "Total Bsmt SF", "Year Remod/Add"], "strongly_correlated": ["Bsmt Qual", "Exter Qual", "Garage Cars", "Gr Liv Area", "Kitchen Qual", "Overall Qual", "Year Built"]}
{"weakly_correlated": ["1st Flr SF", "Bsmt Exposure", "BsmtFin SF 1", "BsmtFin Type 1", "Fireplace Qu", "Fireplaces", "Full Bath", "Garage Area", "Garage Cond", "Garage Finish", "Garage Qual", "Half Bath", "Heating QC", "Mas Vnr Area", "Paved Drive", "TotRms AbvGrd", "TotRms AbvGrd (box-cox-0.0)", "Total Bsmt SF", "Total Porch SF", "Wood Deck SF", "Year Remod/Add"], "strongly_correlated": ["Bsmt Qual", "Exter Qual", "Garage Cars", "Gr Liv Area", "Kitchen Qual", "Overall Qual", "Total Bath", "Total Bath (box-cox-0.5)", "Year Built"]}