Cast all numeric fields correctly as floats or integers
This commit is contained in:
parent
d5012946c2
commit
563a442ad9
2 changed files with 2914 additions and 2912 deletions
|
@ -30,7 +30,7 @@
|
|||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"2018-08-29 14:00:36 CEST\n",
|
||||
"2018-08-29 14:32:09 CEST\n",
|
||||
"\n",
|
||||
"CPython 3.6.5\n",
|
||||
"IPython 6.5.0\n",
|
||||
|
@ -133,7 +133,7 @@
|
|||
"source": [
|
||||
"The original data are available for [download](https://www.amstat.org/publications/jse/v19n3/decock/AmesHousing.xls) and a detailed description of the data types for each column can be found [here](https://www.amstat.org/publications/jse/v19n3/decock/DataDocumentation.txt). These metadata go into the `dtype` argument of the `read_excel` function below to parse the data correctly. There are four different generic data types defined that are cast as follows:\n",
|
||||
"\n",
|
||||
"- continous -> np.float64\n",
|
||||
"- continuous -> np.float64\n",
|
||||
"- discrete -> np.float64 for now (conceptually np.int64, but missing values require a float dtype)\n",
|
||||
"- nominal -> object (str)\n",
|
||||
"- ordinal -> object (str), the order can be looked up in the above-mentioned *ALL_COLUMNS* dictionary\n",
|
||||
|
@ -2179,8 +2179,10 @@
|
|||
"source": [
|
||||
"# Remove the discarded columns from the helper dictionaries / lists.\n",
|
||||
"update_column_descriptions(df.columns)\n",
|
||||
"# Without any more missing data, convert\n",
|
||||
"# the discrete columns to the correct data type.\n",
|
||||
"# Without any more missing data, cast all numeric\n",
|
||||
"# columns as floats or integers respectively.\n",
|
||||
"for column in CONTINUOUS_VARIABLES:\n",
|
||||
" df[column] = df[column].astype(np.float64)\n",
|
||||
"for column in DISCRETE_VARIABLES:\n",
|
||||
" df[column] = df[column].astype(np.int64)"
|
||||
]
|
||||
|
@ -2414,7 +2416,7 @@
|
|||
" <th>526301100</th>\n",
|
||||
" <td>1656.0</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>NA</td>\n",
|
||||
" <td>3</td>\n",
|
||||
" <td>1Fam</td>\n",
|
||||
|
@ -2496,7 +2498,7 @@
|
|||
" <th>526350040</th>\n",
|
||||
" <td>896.0</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>NA</td>\n",
|
||||
" <td>2</td>\n",
|
||||
" <td>1Fam</td>\n",
|
||||
|
@ -2578,7 +2580,7 @@
|
|||
" <th>526351010</th>\n",
|
||||
" <td>1329.0</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>NA</td>\n",
|
||||
" <td>3</td>\n",
|
||||
" <td>1Fam</td>\n",
|
||||
|
@ -2660,7 +2662,7 @@
|
|||
" <th>526353030</th>\n",
|
||||
" <td>2110.0</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>NA</td>\n",
|
||||
" <td>3</td>\n",
|
||||
" <td>1Fam</td>\n",
|
||||
|
@ -2742,7 +2744,7 @@
|
|||
" <th>527105010</th>\n",
|
||||
" <td>928.0</td>\n",
|
||||
" <td>701.0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td>NA</td>\n",
|
||||
" <td>3</td>\n",
|
||||
" <td>1Fam</td>\n",
|
||||
|
@ -2826,11 +2828,11 @@
|
|||
"text/plain": [
|
||||
" 1st Flr SF 2nd Flr SF 3Ssn Porch Alley Bedroom AbvGr \\\n",
|
||||
"Order PID \n",
|
||||
"1 526301100 1656.0 0.0 0 NA 3 \n",
|
||||
"2 526350040 896.0 0.0 0 NA 2 \n",
|
||||
"3 526351010 1329.0 0.0 0 NA 3 \n",
|
||||
"4 526353030 2110.0 0.0 0 NA 3 \n",
|
||||
"5 527105010 928.0 701.0 0 NA 3 \n",
|
||||
"1 526301100 1656.0 0.0 0.0 NA 3 \n",
|
||||
"2 526350040 896.0 0.0 0.0 NA 2 \n",
|
||||
"3 526351010 1329.0 0.0 0.0 NA 3 \n",
|
||||
"4 526353030 2110.0 0.0 0.0 NA 3 \n",
|
||||
"5 527105010 928.0 701.0 0.0 NA 3 \n",
|
||||
"\n",
|
||||
" Bldg Type Bsmt Cond Bsmt Exposure Bsmt Full Bath \\\n",
|
||||
"Order PID \n",
|
||||
|
|
5796
data_clean.csv
5796
data_clean.csv
File diff suppressed because it is too large
Load diff
Loading…
Reference in a new issue