Consider negatively and uncorrelated variables
This commit is contained in:
parent
a4b8100492
commit
387a495a80
3 changed files with 82 additions and 26 deletions
|
|
@ -27,7 +27,7 @@
|
|||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"2018-09-03 18:03:41 CEST\n",
|
||||
"2018-09-04 23:28:55 CEST\n",
|
||||
"\n",
|
||||
"CPython 3.6.5\n",
|
||||
"IPython 6.5.0\n",
|
||||
|
|
@ -1075,7 +1075,7 @@
|
|||
"- **continuous** variables are assumed to be linearly related with the target and each other or not: use **Pearson's correlation coefficient**\n",
|
||||
"- **discrete** (because of the low number of distinct realizations as seen in the data cleaning notebook) and **ordinal** (low number of distinct realizations as well) variables are assumed to be related in a monotonic way with the target and each other or not: use **Spearman's rank correlation coefficient**\n",
|
||||
"\n",
|
||||
"Furthermore, for a **naive feature selection** a \"rule of thumb\" classification in *weak* and *strong* correlation is applied to the predictor variables. The identified variables will be used in the prediction modelling part to speed up the feature selection. A correlation between 0.33 and 0.66 is considered *weak* while a correlation above 0.66 is considered *strong*. Correlations are calculated for **each** target variable (i.e., raw \"SalePrice\" and Box-Cox transformation thereof)."
|
||||
"Furthermore, for a **naive feature selection** a \"rule of thumb\" classification into *weak* and *strong* correlation is applied to the predictor variables. The identified variables will be used in the prediction modelling part to speed up the feature selection. A correlation between 0.33 and 0.66 is considered *weak* while a correlation above 0.66 is considered *strong* (these thresholds refer to the absolute value of the correlation). Correlations are calculated for **each** target variable (i.e., raw \"SalePrice\" and Box-Cox transformation thereof). Correlations with an absolute value below 0.1 are considered \"uncorrelated\"."
|
||||
]
|
||||
},
|
||||
{
|
||||
|
|
@ -1085,7 +1085,8 @@
|
|||
"outputs": [],
|
||||
"source": [
|
||||
"strong = 0.66\n",
|
||||
"weak = 0.33"
|
||||
"weak = 0.33\n",
|
||||
"uncorrelated = 0.1"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
|
@ -1177,26 +1178,50 @@
|
|||
"source": [
|
||||
"pearson_weakly_correlated = set()\n",
|
||||
"pearson_strongly_correlated = set()\n",
|
||||
"pearson_uncorrelated = set()\n",
|
||||
"# Iterate over the raw and transformed target.\n",
|
||||
"for target in TARGET_VARIABLES:\n",
|
||||
" corrs = pearson.loc[target].drop(TARGET_VARIABLES)\n",
|
||||
" corrs = pearson.loc[target].drop(TARGET_VARIABLES).abs()\n",
|
||||
" pearson_weakly_correlated |= set(corrs[(weak < corrs) & (corrs <= strong)].index)\n",
|
||||
" pearson_strongly_correlated |= set(corrs[(strong < corrs)].index)\n",
|
||||
"# Show that no contradiction exists between weak and strong classification.\n",
|
||||
"assert pearson_weakly_correlated & pearson_strongly_correlated == set()"
|
||||
" pearson_uncorrelated |= set(corrs[(corrs < uncorrelated)].index)\n",
|
||||
"# Show that no contradiction exists between the classifications.\n",
|
||||
"assert pearson_weakly_correlated & pearson_strongly_correlated == set()\n",
|
||||
"assert pearson_weakly_correlated & pearson_uncorrelated == set()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Show the continuous variables that are weakly and strongly correlated with the sales price."
|
||||
"Show the continuous variables that are uncorrelated, weakly correlated, or strongly correlated with the sales price."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 23,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"3Ssn Porch Three season porch area in square feet\n",
|
||||
"BsmtFin SF 2 Type 2 finished square feet\n",
|
||||
"Low Qual Fin SF Low quality finished square feet (all floors)\n",
|
||||
"Misc Val $Value of miscellaneous feature\n",
|
||||
"Pool Area Pool area in square feet\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print_column_list(pearson_uncorrelated)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 24,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
|
|
@ -1218,7 +1243,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 24,
|
||||
"execution_count": 25,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
|
|
@ -1244,7 +1269,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 25,
|
||||
"execution_count": 26,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
|
@ -1254,7 +1279,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 26,
|
||||
"execution_count": 27,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
|
|
@ -1281,31 +1306,58 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 27,
|
||||
"execution_count": 28,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"spearman_weakly_correlated = set()\n",
|
||||
"spearman_strongly_correlated = set()\n",
|
||||
"spearman_uncorrelated = set()\n",
|
||||
"# Iterate over the raw and transformed target.\n",
|
||||
"for target in TARGET_VARIABLES:\n",
|
||||
" corrs = spearman.loc[target].drop(TARGET_VARIABLES)\n",
|
||||
" corrs = spearman.loc[target].drop(TARGET_VARIABLES).abs()\n",
|
||||
" spearman_weakly_correlated |= set(corrs[(weak < corrs) & (corrs <= strong)].index)\n",
|
||||
" spearman_strongly_correlated |= set(corrs[(strong < corrs)].index)\n",
|
||||
"# Show that no contradiction exists between weak and strong classification.\n",
|
||||
"assert spearman_weakly_correlated & spearman_strongly_correlated == set()"
|
||||
" spearman_uncorrelated |= set(corrs[(corrs < uncorrelated)].index)\n",
|
||||
"# Show that no contradiction exists between the classifications.\n",
|
||||
"assert spearman_weakly_correlated & spearman_strongly_correlated == set()\n",
|
||||
"assert spearman_weakly_correlated & spearman_uncorrelated == set()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Show the discrete and ordinal variables that are weakly and strongly correlated with the sales price."
|
||||
"Show the discrete and ordinal variables that are uncorrelated, weakly correlated, or strongly correlated with the sales price."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 28,
|
||||
"execution_count": 29,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Bsmt Half Bath Basement half bathrooms\n",
|
||||
"BsmtFin Type 2 Rating of basement finished area (if multiple types)\n",
|
||||
"Exter Cond Evaluates the present condition of the material on the exterior\n",
|
||||
"Land Slope Slope of property\n",
|
||||
"Mo Sold Month Sold (MM)\n",
|
||||
"Pool QC Pool quality\n",
|
||||
"Utilities Type of utilities available\n",
|
||||
"Yr Sold Year Sold (YYYY)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print_column_list(spearman_uncorrelated)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 30,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
|
|
@ -1322,6 +1374,7 @@
|
|||
"Garage Qual Garage quality\n",
|
||||
"Half Bath Half baths above grade\n",
|
||||
"Heating QC Heating quality and condition\n",
|
||||
"Lot Shape General shape of property\n",
|
||||
"Paved Drive Paved driveway\n",
|
||||
"TotRms AbvGrd Total rooms above grade (does not include bathrooms)\n",
|
||||
"Year Remod/Add Remodel date (same as construction date if no remodeling or additions)\n"
|
||||
|
|
@ -1334,7 +1387,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 29,
|
||||
"execution_count": 31,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
|
|
@ -1373,12 +1426,15 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 30,
|
||||
"execution_count": 32,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"with open(\"data/weakly_and_strongly_correlated_variables.json\", \"w\") as file:\n",
|
||||
"with open(\"data/correlated_variables.json\", \"w\") as file:\n",
|
||||
" file.write(json.dumps({\n",
|
||||
" \"uncorrelated\": sorted(\n",
|
||||
" list(pearson_uncorrelated) + list(spearman_uncorrelated)\n",
|
||||
" ),\n",
|
||||
" \"weakly_correlated\": sorted(\n",
|
||||
" list(pearson_weakly_correlated) + list(spearman_weakly_correlated)\n",
|
||||
" ),\n",
|
||||
|
|
@ -1399,7 +1455,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 31,
|
||||
"execution_count": 33,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
|
@ -1419,7 +1475,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 32,
|
||||
"execution_count": 34,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
|
|
@ -1428,7 +1484,7 @@
|
|||
"(2898, 81)"
|
||||
]
|
||||
},
|
||||
"execution_count": 32,
|
||||
"execution_count": 34,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
|
@ -1439,7 +1495,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 33,
|
||||
"execution_count": 35,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
|
|
@ -2208,7 +2264,7 @@
|
|||
"5 527105010 189900.0 12.154253 "
|
||||
]
|
||||
},
|
||||
"execution_count": 33,
|
||||
"execution_count": 35,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
|
@ -2219,7 +2275,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 34,
|
||||
"execution_count": 36,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue