Add feature transformations (e.g. Box-Cox)
This commit is contained in:
parent
e662960fde
commit
069691cca1
7 changed files with 5025 additions and 89 deletions
|
@ -30,7 +30,7 @@
|
|||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"2018-09-02 18:50:50 CEST\n",
|
||||
"2018-09-03 15:32:42 CEST\n",
|
||||
"\n",
|
||||
"CPython 3.6.5\n",
|
||||
"IPython 6.5.0\n",
|
||||
|
@ -93,7 +93,7 @@
|
|||
" NUMERIC_VARIABLES, # groups continuous and discrete\n",
|
||||
" ORDINAL_COLUMNS,\n",
|
||||
" ORDINAL_VARIABLES,\n",
|
||||
" TARGET_VARIABLE, # = Sale Price\n",
|
||||
" TARGET_VARIABLES, # = Sale Price\n",
|
||||
" correct_column_names,\n",
|
||||
" print_column_list,\n",
|
||||
" update_column_descriptions,\n",
|
||||
|
@ -200,7 +200,7 @@
|
|||
"# order as in the encoded description file.\n",
|
||||
"# Note that the target variable \"SalePrice\"\n",
|
||||
"# is not in the description file.\n",
|
||||
"df = df[ALL_VARIABLES + TARGET_VARIABLE]"
|
||||
"df = df[ALL_VARIABLES + TARGET_VARIABLES]"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -267,7 +267,7 @@
|
|||
"outputs": [],
|
||||
"source": [
|
||||
"# Show that all \"continuous\" variables come as integers.\n",
|
||||
"for column in NUMERIC_VARIABLES + TARGET_VARIABLE:\n",
|
||||
"for column in NUMERIC_VARIABLES + TARGET_VARIABLES:\n",
|
||||
" not_null = df[column].notnull()\n",
|
||||
" mask = (\n",
|
||||
" df.loc[not_null, column].astype(np.int64)\n",
|
||||
|
@ -2238,7 +2238,7 @@
|
|||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"remaining_columns = sorted(set(ALL_VARIABLES) - set(missing_a_lot)) + TARGET_VARIABLE\n",
|
||||
"remaining_columns = sorted(set(ALL_VARIABLES) - set(missing_a_lot)) + TARGET_VARIABLES\n",
|
||||
"mask = df[remaining_columns].isnull().any(axis=1)\n",
|
||||
"assert (100 * mask.sum() / df.shape[0]) < 1.1 # percent\n",
|
||||
"df = df[~mask]"
|
||||
|
@ -2288,7 +2288,7 @@
|
|||
"update_column_descriptions(df.columns)\n",
|
||||
"# Without any more missing data, cast all numeric\n",
|
||||
"# columns as floats or integers respectively.\n",
|
||||
"for column in CONTINUOUS_VARIABLES + TARGET_VARIABLE:\n",
|
||||
"for column in CONTINUOUS_VARIABLES + TARGET_VARIABLES:\n",
|
||||
" df[column] = df[column].astype(np.float64)\n",
|
||||
"for column in DISCRETE_VARIABLES:\n",
|
||||
" df[column] = df[column].astype(np.int64)"
|
||||
|
|
File diff suppressed because one or more lines are too long
2
Pipfile
2
Pipfile
|
@ -12,8 +12,10 @@ xlrd = "*"
|
|||
xlwt = "*"
|
||||
tabulate = "*"
|
||||
requests = "*"
|
||||
matplotlib = "*"
|
||||
seaborn = "*"
|
||||
missingno = "*"
|
||||
sklearn = "*"
|
||||
|
||||
[dev-packages]
|
||||
black = "*"
|
||||
|
|
45
Pipfile.lock
generated
45
Pipfile.lock
generated
|
@ -1,7 +1,7 @@
|
|||
{
|
||||
"_meta": {
|
||||
"hash": {
|
||||
"sha256": "b9e10abf6d574d2d423a53aca049fcee7b8f1c730088a0079da637e77d22c2c0"
|
||||
"sha256": "36a1b0a9371a1a4fdd5fc9035120f57e601b4d9f2fd4ddd279b14e95c2063498"
|
||||
},
|
||||
"pipfile-spec": 6,
|
||||
"requires": {
|
||||
|
@ -224,7 +224,7 @@
|
|||
"sha256:c6e83ca4e70dc5d3185ee15544a66fbf25aff6f242d6102c2358cee240963785",
|
||||
"sha256:e041d42e6cf4a363aa544e6affedc7ce21c71232147c9f206044e7dfb034bc94"
|
||||
],
|
||||
"markers": "python_version != '3.0.*' and python_version != '3.2.*' and python_version != '3.3.*' and python_version != '3.1.*' and python_version >= '2.7'",
|
||||
"index": "pypi",
|
||||
"version": "==3.0.0rc2"
|
||||
},
|
||||
"missingno": {
|
||||
|
@ -452,6 +452,40 @@
|
|||
"index": "pypi",
|
||||
"version": "==2.19.1"
|
||||
},
|
||||
"scikit-learn": {
|
||||
"hashes": [
|
||||
"sha256:0aa0ac9a47459f5a5f078b345e56b15cf7a3126ecddbd5c22dfbe3f898884f90",
|
||||
"sha256:10543e70ab8500deb2e21e8ffddba83680585dc8346b773acec8fbbcf4d7f878",
|
||||
"sha256:131c09fe20e25db4262d54279513950c9dee1c7ad45564404a753e588014b970",
|
||||
"sha256:1829ebc8d86285429e310db61c0d3d3983756f2d13373e6d446c115c926c937d",
|
||||
"sha256:31f6fc497e01de726488828e312606ff87bf19cba54f51a731852f1ef4113242",
|
||||
"sha256:3c94c99e9411c54850077e576a7fe9a4443b061a1a11b66a506be51ca9083b5b",
|
||||
"sha256:605380170c080f620bcd34c6e8489bd721cfa8c478854b092c9505076d069d42",
|
||||
"sha256:64301ce9014b8082c0eae4636edb49381e9d5357a3a39ed80151453ed0f8b519",
|
||||
"sha256:68c2e5df3cc3f681b15271325016d3fc1ba23c65b1361da0d764606050002397",
|
||||
"sha256:72ece165f2262a262e702e51a06b661fb86f3d30c20fa20f09f0f30097656244",
|
||||
"sha256:760292b61931d36963df3c3657b0292da3f348974235e0af55cd09c5db0c1280",
|
||||
"sha256:7a15dfaae603bc78632d0bb2809bb781b67461f689dc92da9bca44551dd011cf",
|
||||
"sha256:7cb3c857ffcb349795947f2938969ced4271ac72bdbece55c00a0817392c2d15",
|
||||
"sha256:83946e287749e9ddf853fc188d624e2c9280e88ce7cbb3b0d71980483e022fab",
|
||||
"sha256:85a93c849344a1acf63a9ec3f97c4a4067497daf63ca985e7244d75a185c11a6",
|
||||
"sha256:9f83e5353fbde280c0e8302f4b78fe3795f36be67ad02ff4ceb84d6e7d3001fa",
|
||||
"sha256:a0c8c06582b01b7b8c568a426483e09e47ca91dce585e7d4b78c66bab5f19d4f",
|
||||
"sha256:a9b2d9329a6802a38461b7ff81d9de9c218e18835ff7a196dcf93d85937f7d24",
|
||||
"sha256:ac45ae4963e2a5119e2d88f1e8e5cbfac43851c6aa9f6477d5bd452e9dcd97f4",
|
||||
"sha256:b5baee191ff940f4d536d370f1481a1819dbc5b9443e2e28593922979fac9aaf",
|
||||
"sha256:b723f603169d86f4cac8ed47453ab5cb07db46ea31a103960f6bacc4c7605634",
|
||||
"sha256:b9158ba32ba8d97f747ab40f29f6df7088b70d0aaea7763b4a88f9ef8ede198c",
|
||||
"sha256:ce00d619768594b7d0013c3f5a8405236df8730474479324de0328e73c812536",
|
||||
"sha256:db85a112e187409ee464675f6d3703f5a81b26dc30e155929a950f85c672335c",
|
||||
"sha256:df0ea014e791dcec8b55def33189dc000a3d6b635a10739317fea058a427e634",
|
||||
"sha256:f0454968f13611972feefc0d3f86ffc59cee98e3c2f545f2abae7f338a74f7ee",
|
||||
"sha256:f4119d45b770703cd86f7f7b1fd5416415cd5ff089f3e88b417ab016749f5f3d",
|
||||
"sha256:f575e6572db703a94418cd8e880f131a62b8767f298d531eaf38e98be8abfe9a"
|
||||
],
|
||||
"markers": "python_version != '3.2.*' and python_version != '3.1.*' and python_version != '3.3.*' and python_version != '3.0.*' and python_version >= '2.7'",
|
||||
"version": "==0.20rc1"
|
||||
},
|
||||
"scipy": {
|
||||
"hashes": [
|
||||
"sha256:0611ee97296265af4a21164a5323f8c1b4e8e15c582d3dfa7610825900136bb7",
|
||||
|
@ -514,6 +548,13 @@
|
|||
],
|
||||
"version": "==1.11.0"
|
||||
},
|
||||
"sklearn": {
|
||||
"hashes": [
|
||||
"sha256:e23001573aa194b834122d2b9562459bf5ae494a2d59ca6b8aa22c85a44c0e31"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==0.0"
|
||||
},
|
||||
"tabulate": {
|
||||
"hashes": [
|
||||
"sha256:e4ca13f26d0a6be2a2915428dc21e732f1e44dad7f76d7030b2ef1ec251cf7f2"
|
||||
|
|
2899
data_clean_with_transformations.csv
Normal file
2899
data_clean_with_transformations.csv
Normal file
File diff suppressed because it is too large
Load diff
20
utils.py
20
utils.py
|
@ -35,7 +35,7 @@ import tabulate
|
|||
|
||||
INDEX_COLUMNS = ["Order", "PID"]
|
||||
LABEL_TYPES = ["nominal", "ordinal"]
|
||||
TARGET_VARIABLE = ["SalePrice"]
|
||||
TARGET_VARIABLES = ["SalePrice"]
|
||||
# Note that these dictionaries and lists are not actually constants but
|
||||
# filled in during import time which makes them "near"-constant.
|
||||
ALL_COLUMNS = {}
|
||||
|
@ -102,7 +102,7 @@ def _extract_meta_data(lines):
|
|||
# The two ID columns and the target variable "SalePrice"
|
||||
# are not put into the helper dicts / lists as they are
|
||||
# treated separately in the modelling anyways.
|
||||
non_feature_columns = INDEX_COLUMNS + TARGET_VARIABLE
|
||||
non_feature_columns = INDEX_COLUMNS + TARGET_VARIABLES
|
||||
|
||||
for line in lines:
|
||||
# Process the next variable in the list.
|
||||
|
@ -279,9 +279,15 @@ def print_column_list(subset=None):
|
|||
if subset is None:
|
||||
subset = ALL_VARIABLES
|
||||
else:
|
||||
assert set(list(subset)) <= set(list(ALL_VARIABLES))
|
||||
columns = sorted((c, ALL_COLUMNS[c]["description"]) for c in subset)
|
||||
print(tabulate.tabulate(columns, tablefmt="plain"))
|
||||
subset = set(subset)
|
||||
# Handle variables without description separately.
|
||||
without_desc = subset - set(ALL_VARIABLES)
|
||||
subset -= without_desc
|
||||
columns = [(c, ALL_COLUMNS[c]["description"]) for c in subset]
|
||||
if without_desc:
|
||||
for c in sorted(without_desc):
|
||||
columns.append((c, ''))
|
||||
print(tabulate.tabulate(sorted(columns), tablefmt="plain"))
|
||||
|
||||
|
||||
def load_clean_data(subset=None, ordinal_encoded=False):
|
||||
|
@ -321,7 +327,7 @@ def load_clean_data(subset=None, ordinal_encoded=False):
|
|||
# Remove columns that are in the description but not in the data file.
|
||||
renamed = update_column_descriptions(df.columns, correct_columns=True)
|
||||
# Cast the numeric types correctly.
|
||||
for column in CONTINUOUS_VARIABLES + TARGET_VARIABLE:
|
||||
for column in CONTINUOUS_VARIABLES + TARGET_VARIABLES:
|
||||
df[column] = df[column].astype(float)
|
||||
for column in DISCRETE_VARIABLES:
|
||||
df[column] = df[column].astype(int)
|
||||
|
@ -347,7 +353,7 @@ def load_clean_data(subset=None, ordinal_encoded=False):
|
|||
subset.remove(old_name)
|
||||
subset.add(new_name)
|
||||
subset = sorted(set(df.columns) & subset)
|
||||
df = df[subset + TARGET_VARIABLE]
|
||||
df = df[subset + TARGET_VARIABLES]
|
||||
# Use integer encoding for ordinal variables.
|
||||
if ordinal_encoded:
|
||||
df = encode_ordinals(df)
|
||||
|
|
|
@ -1 +1 @@
|
|||
{"weakly_correlated": ["1st Flr SF", "Bsmt Exposure", "BsmtFin SF 1", "BsmtFin Type 1", "Fireplace Qu", "Fireplaces", "Full Bath", "Garage Area", "Garage Cond", "Garage Finish", "Garage Qual", "Half Bath", "Heating QC", "Mas Vnr Area", "Paved Drive", "TotRms AbvGrd", "Total Bsmt SF", "Year Remod/Add"], "strongly_correlated": ["Bsmt Qual", "Exter Qual", "Garage Cars", "Gr Liv Area", "Kitchen Qual", "Overall Qual", "Year Built"]}
|
||||
{"weakly_correlated": ["1st Flr SF", "Bsmt Exposure", "BsmtFin SF 1", "BsmtFin Type 1", "Fireplace Qu", "Fireplaces", "Full Bath", "Garage Area", "Garage Cond", "Garage Finish", "Garage Qual", "Half Bath", "Heating QC", "Mas Vnr Area", "Paved Drive", "TotRms AbvGrd", "TotRms AbvGrd (box-cox-0.0)", "Total Bsmt SF", "Total Porch SF", "Wood Deck SF", "Year Remod/Add"], "strongly_correlated": ["Bsmt Qual", "Exter Qual", "Garage Cars", "Gr Liv Area", "Kitchen Qual", "Overall Qual", "Total Bath", "Total Bath (box-cox-0.5)", "Year Built"]}
|
Loading…
Reference in a new issue