Cast all numeric fields correctly as floats or integers

2018-08-29 14:09:10 +02:00 · 2018-08-29 14:09:10 +02:00 · 563a442ad9
commit 563a442ad9
parent d5012946c2
2 changed files with 2914 additions and 2912 deletions
--- a/1_data_cleaning.ipynb
+++ b/1_data_cleaning.ipynb
@ -30,7 +30,7 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "2018-08-29 14:00:36 CEST\n",
+      "2018-08-29 14:32:09 CEST\n",
      "\n",
      "CPython 3.6.5\n",
      "IPython 6.5.0\n",
@ -133,7 +133,7 @@
   "source": [
    "The original data are available for [download](https://www.amstat.org/publications/jse/v19n3/decock/AmesHousing.xls) and a detailed description of the data types for each column can be found [here](https://www.amstat.org/publications/jse/v19n3/decock/DataDocumentation.txt). These metadata go into the `dtype` argument of the `read_excel` function below to parse the data correctly. There are four different generic data types defined that are cast as follows:\n",
    "\n",
-    "- continous -> np.float64\n",
+    "- continuous -> np.float64\n",
    "- discrete -> actually np.int64 but np.float64 because of missing values\n",
    "- nominal -> object (str)\n",
    "- ordinal -> object (str), the order can be looked up in the above mentioned *ALL_COLUMNS* dictionary\n",
@ -2179,8 +2179,10 @@
   "source": [
    "# Remove the discarded columns from the helper dictionaries / lists.\n",
    "update_column_descriptions(df.columns)\n",
-    "# Without any more missing data, convert\n",
+    "# Without any more missing data, cast all numeric\n",
-    "# the discrete columns to the correct data type.\n",
+    "# columns as floats or integers respectively.\n",
    "for column in CONTINUOUS_VARIABLES:\n",
    "    df[column] = df[column].astype(np.float64)\n",
    "for column in DISCRETE_VARIABLES:\n",
    "    df[column] = df[column].astype(np.int64)"
   ]
@ -2414,7 +2416,7 @@
       "      <th>526301100</th>\n",
       "      <td>1656.0</td>\n",
       "      <td>0.0</td>\n",
-       "      <td>0</td>\n",
+       "      <td>0.0</td>\n",
       "      <td>NA</td>\n",
       "      <td>3</td>\n",
       "      <td>1Fam</td>\n",
@ -2496,7 +2498,7 @@
       "      <th>526350040</th>\n",
       "      <td>896.0</td>\n",
       "      <td>0.0</td>\n",
-       "      <td>0</td>\n",
+       "      <td>0.0</td>\n",
       "      <td>NA</td>\n",
       "      <td>2</td>\n",
       "      <td>1Fam</td>\n",
@ -2578,7 +2580,7 @@
       "      <th>526351010</th>\n",
       "      <td>1329.0</td>\n",
       "      <td>0.0</td>\n",
-       "      <td>0</td>\n",
+       "      <td>0.0</td>\n",
       "      <td>NA</td>\n",
       "      <td>3</td>\n",
       "      <td>1Fam</td>\n",
@ -2660,7 +2662,7 @@
       "      <th>526353030</th>\n",
       "      <td>2110.0</td>\n",
       "      <td>0.0</td>\n",
-       "      <td>0</td>\n",
+       "      <td>0.0</td>\n",
       "      <td>NA</td>\n",
       "      <td>3</td>\n",
       "      <td>1Fam</td>\n",
@ -2742,7 +2744,7 @@
       "      <th>527105010</th>\n",
       "      <td>928.0</td>\n",
       "      <td>701.0</td>\n",
-       "      <td>0</td>\n",
+       "      <td>0.0</td>\n",
       "      <td>NA</td>\n",
       "      <td>3</td>\n",
       "      <td>1Fam</td>\n",
@ -2826,11 +2828,11 @@
      "text/plain": [
       "                 1st Flr SF  2nd Flr SF  3Ssn Porch Alley  Bedroom AbvGr  \\\n",
       "Order PID                                                                  \n",
-       "1     526301100      1656.0         0.0           0    NA              3   \n",
+       "1     526301100      1656.0         0.0         0.0    NA              3   \n",
-       "2     526350040       896.0         0.0           0    NA              2   \n",
+       "2     526350040       896.0         0.0         0.0    NA              2   \n",
-       "3     526351010      1329.0         0.0           0    NA              3   \n",
+       "3     526351010      1329.0         0.0         0.0    NA              3   \n",
-       "4     526353030      2110.0         0.0           0    NA              3   \n",
+       "4     526353030      2110.0         0.0         0.0    NA              3   \n",
-       "5     527105010       928.0       701.0           0    NA              3   \n",
+       "5     527105010       928.0       701.0         0.0    NA              3   \n",
       "\n",
       "                Bldg Type Bsmt Cond Bsmt Exposure  Bsmt Full Bath  \\\n",
       "Order PID                                                           \n",
--- a/data_clean.csv
+++ b/data_clean.csv