From 1a210cc79969d5a48b4de47f0be19d66b30b6d21 Mon Sep 17 00:00:00 2001 From: Alexander Hess Date: Sun, 26 Aug 2018 00:58:33 +0200 Subject: [PATCH] Create notebook for the second application of tidying --- ...tiple_variables_stored_in_one_column.ipynb | 730 ++++++++++++++++++ 1 file changed, 730 insertions(+) create mode 100644 2_multiple_variables_stored_in_one_column.ipynb diff --git a/2_multiple_variables_stored_in_one_column.ipynb b/2_multiple_variables_stored_in_one_column.ipynb new file mode 100644 index 0000000..d4bd4a8 --- /dev/null +++ b/2_multiple_variables_stored_in_one_column.ipynb @@ -0,0 +1,730 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Multiple Variables stored in one Column\n", + "\n", + "This notebook shows how multiple variables stored in the same column can be isolated." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## \"Housekeeping\"" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2018-08-26 11:50:39 CEST\n", + "\n", + "CPython 3.6.5\n", + "IPython 6.5.0\n", + "\n", + "numpy 1.15.1\n", + "pandas 0.23.4\n" + ] + } + ], + "source": [ + "%load_ext watermark\n", + "%watermark -d -t -v -z -p numpy,pandas" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Example: Tuberculosis" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load the Data\n", + "\n", + "Select the same columns as in the paper and name them accordingly."
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "columns = ['iso2', 'year'] + [\n", + "    'new_sp_' + sex + age\n", + "    for sex in ('m', 'f')\n", + "    for age in ('014', '1524', '2534', '3544', '4554', '5564', '65', 'u')\n", + "]\n", + "tb = pd.read_csv('data/tb.csv', usecols=columns)\n", + "# Drop the shared 'new_sp_' prefix and give the country code a descriptive name.\n", + "rename = {'iso2': 'country'}\n", + "rename.update({old: old[len('new_sp_'):] for old in columns if old.startswith('new_sp_')})\n", + "tb = tb.rename(columns=rename)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Messy Data\n", + "\n", + "The data are assumed to be provided as below. Except for the *country* and *year* columns, the remaining columns are actually joint realizations of two variables **sex** and **age**." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
countryyearm014m1524m2534m3544m4554m5564m65muf014f1524f2534f3544f4554f5564f65fu
10AD20000.00.01.00.00.00.00.0NaNNaNNaNNaNNaNNaNNaNNaNNaN
36AE20002.04.04.06.05.012.010.0NaN3.016.01.03.00.00.04.0NaN
60AF200052.0228.0183.0149.0129.094.080.0NaN93.0414.0565.0339.0205.099.036.0NaN
87AG20000.00.00.00.00.00.01.0NaN1.01.01.00.00.00.00.0NaN
136AL20002.019.021.014.024.019.016.0NaN3.011.010.08.08.05.011.0NaN
165AM20002.0152.0130.0131.063.026.021.0NaN1.024.027.024.08.08.04.0NaN
178AN20000.00.01.02.00.00.00.0NaN0.00.01.00.00.01.00.0NaN
207AO2000186.0999.01003.0912.0482.0312.0194.0NaN247.01142.01091.0844.0417.0200.0120.0NaN
236AR200097.0278.0594.0402.0419.0368.0330.0NaN121.0544.0479.0262.0230.0179.0216.0NaN
265AS2000NaNNaNNaNNaN1.01.0NaNNaNNaNNaNNaNNaN1.0NaNNaNNaN
\n", + "
" + ], + "text/plain": [ + " country year m014 m1524 m2534 m3544 m4554 m5564 m65 mu \\\n", + "10 AD 2000 0.0 0.0 1.0 0.0 0.0 0.0 0.0 NaN \n", + "36 AE 2000 2.0 4.0 4.0 6.0 5.0 12.0 10.0 NaN \n", + "60 AF 2000 52.0 228.0 183.0 149.0 129.0 94.0 80.0 NaN \n", + "87 AG 2000 0.0 0.0 0.0 0.0 0.0 0.0 1.0 NaN \n", + "136 AL 2000 2.0 19.0 21.0 14.0 24.0 19.0 16.0 NaN \n", + "165 AM 2000 2.0 152.0 130.0 131.0 63.0 26.0 21.0 NaN \n", + "178 AN 2000 0.0 0.0 1.0 2.0 0.0 0.0 0.0 NaN \n", + "207 AO 2000 186.0 999.0 1003.0 912.0 482.0 312.0 194.0 NaN \n", + "236 AR 2000 97.0 278.0 594.0 402.0 419.0 368.0 330.0 NaN \n", + "265 AS 2000 NaN NaN NaN NaN 1.0 1.0 NaN NaN \n", + "\n", + " f014 f1524 f2534 f3544 f4554 f5564 f65 fu \n", + "10 NaN NaN NaN NaN NaN NaN NaN NaN \n", + "36 3.0 16.0 1.0 3.0 0.0 0.0 4.0 NaN \n", + "60 93.0 414.0 565.0 339.0 205.0 99.0 36.0 NaN \n", + "87 1.0 1.0 1.0 0.0 0.0 0.0 0.0 NaN \n", + "136 3.0 11.0 10.0 8.0 8.0 5.0 11.0 NaN \n", + "165 1.0 24.0 27.0 24.0 8.0 8.0 4.0 NaN \n", + "178 0.0 0.0 1.0 0.0 0.0 1.0 0.0 NaN \n", + "207 247.0 1142.0 1091.0 844.0 417.0 200.0 120.0 NaN \n", + "236 121.0 544.0 479.0 262.0 230.0 179.0 216.0 NaN \n", + "265 NaN NaN NaN NaN 1.0 NaN NaN NaN " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tb[(tb['year'] == 2000)].head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Molten Data\n", + "\n", + "As in the previous notebook the [*pd.melt*](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.melt.html) function can be used to un-pivot the columns. As before, pandas keeps rows for columns with missing data that are then discarded (then, without any more missing values, the column's data type is casted as integer). Furthermore, the resulting *molten* dataset is sorted as in the paper." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "molten_tb = pd.melt(tb, id_vars=['country', 'year'], var_name='column', value_name='cases')\n", + "molten_tb = molten_tb.dropna(subset=['cases']).astype({'cases': int})  # no NaNs left, so the counts can be stored as integers\n", + "molten_tb = molten_tb.sort_values(['country', 'year', 'column'])\n", + "molten_tb = molten_tb.reset_index(drop=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
countryyearcolumncases
49AD2000m0140
50AD2000m15240
51AD2000m25341
52AD2000m35440
53AD2000m45540
54AD2000m55640
55AD2000m650
165AE2000f0143
166AE2000f152416
167AE2000f25341
\n", + "
" + ], + "text/plain": [ + " country year column cases\n", + "49 AD 2000 m014 0\n", + "50 AD 2000 m1524 0\n", + "51 AD 2000 m2534 1\n", + "52 AD 2000 m3544 0\n", + "53 AD 2000 m4554 0\n", + "54 AD 2000 m5564 0\n", + "55 AD 2000 m65 0\n", + "165 AE 2000 f014 3\n", + "166 AE 2000 f1524 16\n", + "167 AE 2000 f2534 1" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "molten_tb[(molten_tb['year'] == 2000)].head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Tidy Data\n", + "\n", + "Using the [*pd.Series.str.extract*](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.str.extract.html) method the two variables are isolated. The age labels are renamed as in the paper." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "tidy_tb = molten_tb[['country', 'year', 'cases']]\n", + "tidy_tb[['sex', 'age']] = molten_tb['column'].str.extract(r'(f|m)(.*)')\n", + "tidy_tb['age'] = tidy_tb['age'].map({\n", + " '014': '0-14', '1524': '15-24', '2534': '25-34',\n", + " '3544': '35-44', '4554': '45-54', '5564': '55-64',\n", + " '65': '65+', 'u': 'unknown'\n", + "})\n", + "tidy_tb = tidy_tb[['country', 'year', 'sex', 'age', 'cases']]" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
countryyearsexagecases
49AD2000m0-140
50AD2000m15-240
51AD2000m25-341
52AD2000m35-440
53AD2000m45-540
54AD2000m55-640
55AD2000m65+0
165AE2000f0-143
166AE2000f15-2416
167AE2000f25-341
\n", + "
" + ], + "text/plain": [ + " country year sex age cases\n", + "49 AD 2000 m 0-14 0\n", + "50 AD 2000 m 15-24 0\n", + "51 AD 2000 m 25-34 1\n", + "52 AD 2000 m 35-44 0\n", + "53 AD 2000 m 45-54 0\n", + "54 AD 2000 m 55-64 0\n", + "55 AD 2000 m 65+ 0\n", + "165 AE 2000 f 0-14 3\n", + "166 AE 2000 f 15-24 16\n", + "167 AE 2000 f 25-34 1" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tidy_tb[(tidy_tb['year'] == 2000)].head(10)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}