Cast all numeric fields correctly as floats or integers
This commit is contained in:
parent
d5012946c2
commit
563a442ad9
2 changed files with 2914 additions and 2912 deletions
|
@ -30,7 +30,7 @@
|
||||||
"name": "stdout",
|
"name": "stdout",
|
||||||
"output_type": "stream",
|
"output_type": "stream",
|
||||||
"text": [
|
"text": [
|
||||||
"2018-08-29 14:00:36 CEST\n",
|
"2018-08-29 14:32:09 CEST\n",
|
||||||
"\n",
|
"\n",
|
||||||
"CPython 3.6.5\n",
|
"CPython 3.6.5\n",
|
||||||
"IPython 6.5.0\n",
|
"IPython 6.5.0\n",
|
||||||
|
@ -133,7 +133,7 @@
|
||||||
"source": [
|
"source": [
|
||||||
"The original data are available for [download](https://www.amstat.org/publications/jse/v19n3/decock/AmesHousing.xls) and a detailed description of the data types for each column can be found [here](https://www.amstat.org/publications/jse/v19n3/decock/DataDocumentation.txt). These metadata go into the `dtype` argument of the `read_excel` function below to parse the data correctly. There are four different generic data types defined, which are cast as follows:\n",
|
"The original data are available for [download](https://www.amstat.org/publications/jse/v19n3/decock/AmesHousing.xls) and a detailed description of the data types for each column can be found [here](https://www.amstat.org/publications/jse/v19n3/decock/DataDocumentation.txt). These metadata go into the `dtype` argument of the `read_excel` function below to parse the data correctly. There are four different generic data types defined, which are cast as follows:\n",
|
||||||
"\n",
|
"\n",
|
||||||
"- continous -> np.float64\n",
|
"- continuous -> np.float64\n",
|
||||||
"- discrete -> actually np.int64 but np.float64 because of missing values\n",
|
"- discrete -> actually np.int64 but np.float64 because of missing values\n",
|
||||||
"- nominal -> object (str)\n",
|
"- nominal -> object (str)\n",
|
||||||
"- ordinal -> object (str), the order can be looked up in the above mentioned *ALL_COLUMNS* dictionary\n",
|
"- ordinal -> object (str), the order can be looked up in the above mentioned *ALL_COLUMNS* dictionary\n",
|
||||||
|
@ -2179,8 +2179,10 @@
|
||||||
"source": [
|
"source": [
|
||||||
"# Remove the discarded columns from the helper dictionaries / lists.\n",
|
"# Remove the discarded columns from the helper dictionaries / lists.\n",
|
||||||
"update_column_descriptions(df.columns)\n",
|
"update_column_descriptions(df.columns)\n",
|
||||||
"# Without any more missing data, convert\n",
|
"# Without any more missing data, cast all numeric\n",
|
||||||
"# the discrete columns to the correct data type.\n",
|
"# columns as floats or integers respectively.\n",
|
||||||
|
"for column in CONTINUOUS_VARIABLES:\n",
|
||||||
|
" df[column] = df[column].astype(np.float64)\n",
|
||||||
"for column in DISCRETE_VARIABLES:\n",
|
"for column in DISCRETE_VARIABLES:\n",
|
||||||
" df[column] = df[column].astype(np.int64)"
|
" df[column] = df[column].astype(np.int64)"
|
||||||
]
|
]
|
||||||
|
@ -2414,7 +2416,7 @@
|
||||||
" <th>526301100</th>\n",
|
" <th>526301100</th>\n",
|
||||||
" <td>1656.0</td>\n",
|
" <td>1656.0</td>\n",
|
||||||
" <td>0.0</td>\n",
|
" <td>0.0</td>\n",
|
||||||
" <td>0</td>\n",
|
" <td>0.0</td>\n",
|
||||||
" <td>NA</td>\n",
|
" <td>NA</td>\n",
|
||||||
" <td>3</td>\n",
|
" <td>3</td>\n",
|
||||||
" <td>1Fam</td>\n",
|
" <td>1Fam</td>\n",
|
||||||
|
@ -2496,7 +2498,7 @@
|
||||||
" <th>526350040</th>\n",
|
" <th>526350040</th>\n",
|
||||||
" <td>896.0</td>\n",
|
" <td>896.0</td>\n",
|
||||||
" <td>0.0</td>\n",
|
" <td>0.0</td>\n",
|
||||||
" <td>0</td>\n",
|
" <td>0.0</td>\n",
|
||||||
" <td>NA</td>\n",
|
" <td>NA</td>\n",
|
||||||
" <td>2</td>\n",
|
" <td>2</td>\n",
|
||||||
" <td>1Fam</td>\n",
|
" <td>1Fam</td>\n",
|
||||||
|
@ -2578,7 +2580,7 @@
|
||||||
" <th>526351010</th>\n",
|
" <th>526351010</th>\n",
|
||||||
" <td>1329.0</td>\n",
|
" <td>1329.0</td>\n",
|
||||||
" <td>0.0</td>\n",
|
" <td>0.0</td>\n",
|
||||||
" <td>0</td>\n",
|
" <td>0.0</td>\n",
|
||||||
" <td>NA</td>\n",
|
" <td>NA</td>\n",
|
||||||
" <td>3</td>\n",
|
" <td>3</td>\n",
|
||||||
" <td>1Fam</td>\n",
|
" <td>1Fam</td>\n",
|
||||||
|
@ -2660,7 +2662,7 @@
|
||||||
" <th>526353030</th>\n",
|
" <th>526353030</th>\n",
|
||||||
" <td>2110.0</td>\n",
|
" <td>2110.0</td>\n",
|
||||||
" <td>0.0</td>\n",
|
" <td>0.0</td>\n",
|
||||||
" <td>0</td>\n",
|
" <td>0.0</td>\n",
|
||||||
" <td>NA</td>\n",
|
" <td>NA</td>\n",
|
||||||
" <td>3</td>\n",
|
" <td>3</td>\n",
|
||||||
" <td>1Fam</td>\n",
|
" <td>1Fam</td>\n",
|
||||||
|
@ -2742,7 +2744,7 @@
|
||||||
" <th>527105010</th>\n",
|
" <th>527105010</th>\n",
|
||||||
" <td>928.0</td>\n",
|
" <td>928.0</td>\n",
|
||||||
" <td>701.0</td>\n",
|
" <td>701.0</td>\n",
|
||||||
" <td>0</td>\n",
|
" <td>0.0</td>\n",
|
||||||
" <td>NA</td>\n",
|
" <td>NA</td>\n",
|
||||||
" <td>3</td>\n",
|
" <td>3</td>\n",
|
||||||
" <td>1Fam</td>\n",
|
" <td>1Fam</td>\n",
|
||||||
|
@ -2826,11 +2828,11 @@
|
||||||
"text/plain": [
|
"text/plain": [
|
||||||
" 1st Flr SF 2nd Flr SF 3Ssn Porch Alley Bedroom AbvGr \\\n",
|
" 1st Flr SF 2nd Flr SF 3Ssn Porch Alley Bedroom AbvGr \\\n",
|
||||||
"Order PID \n",
|
"Order PID \n",
|
||||||
"1 526301100 1656.0 0.0 0 NA 3 \n",
|
"1 526301100 1656.0 0.0 0.0 NA 3 \n",
|
||||||
"2 526350040 896.0 0.0 0 NA 2 \n",
|
"2 526350040 896.0 0.0 0.0 NA 2 \n",
|
||||||
"3 526351010 1329.0 0.0 0 NA 3 \n",
|
"3 526351010 1329.0 0.0 0.0 NA 3 \n",
|
||||||
"4 526353030 2110.0 0.0 0 NA 3 \n",
|
"4 526353030 2110.0 0.0 0.0 NA 3 \n",
|
||||||
"5 527105010 928.0 701.0 0 NA 3 \n",
|
"5 527105010 928.0 701.0 0.0 NA 3 \n",
|
||||||
"\n",
|
"\n",
|
||||||
" Bldg Type Bsmt Cond Bsmt Exposure Bsmt Full Bath \\\n",
|
" Bldg Type Bsmt Cond Bsmt Exposure Bsmt Full Bath \\\n",
|
||||||
"Order PID \n",
|
"Order PID \n",
|
||||||
|
|
5796
data_clean.csv
5796
data_clean.csv
File diff suppressed because it is too large
Load diff
Loading…
Reference in a new issue