diff --git a/2_multiple_variables_stored_in_one_column.ipynb b/2_multiple_variables_stored_in_one_column.ipynb
new file mode 100644
index 0000000..d4bd4a8
--- /dev/null
+++ b/2_multiple_variables_stored_in_one_column.ipynb
@@ -0,0 +1,730 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Multiple Variables stored in one Column\n",
+ "\n",
+ "This notebook shows how multiple variables stored in the same column can be isolated."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## \"Housekeeping\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "2018-08-26 11:50:39 CEST\n",
+ "\n",
+ "CPython 3.6.5\n",
+ "IPython 6.5.0\n",
+ "\n",
+ "numpy 1.15.1\n",
+ "pandas 0.23.4\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Log the run's date, time and timezone plus the Python, IPython, numpy and pandas versions.\n",
+ "# (No space after the %: newer IPython releases no longer accept '% magic'.)\n",
+ "%load_ext watermark\n",
+ "%watermark -d -t -v -z -p numpy,pandas"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Example: Tuberculosis"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Load the Data\n",
+ "\n",
+ "Select the same columns as in the paper and name them accordingly."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Identifier columns followed by one 'new_sp_<sex><age>' count column per\n",
+ "# sex/age group: all male groups first, then all female groups.\n",
+ "columns = ['iso2', 'year'] + [\n",
+ "    'new_sp_' + sex + age\n",
+ "    for sex in ('m', 'f')\n",
+ "    for age in ('014', '1524', '2534', '3544', '4554', '5564', '65', 'u')\n",
+ "]\n",
+ "# NOTE(review): read_csv parses the literal string 'NA' as missing by default;\n",
+ "# confirm that no iso2 code (e.g. 'NA') is lost when loading.\n",
+ "tb = pd.read_csv('data/tb.csv', usecols=columns)\n",
+ "\n",
+ "# Rename iso2 to country and strip the 'new_sp_' prefix from the count columns.\n",
+ "rename = {'iso2': 'country'}\n",
+ "rename.update(\n",
+ "    (name, name[len('new_sp_'):])\n",
+ "    for name in columns if name.startswith('new_sp_')\n",
+ ")\n",
+ "tb = tb.rename(columns=rename)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Messy Data\n",
+ "\n",
+ "The data are assumed to be provided as below. Apart from the *country* and *year* columns, each column is actually a joint realization of two variables: **sex** and **age**."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " country | \n",
+ " year | \n",
+ " m014 | \n",
+ " m1524 | \n",
+ " m2534 | \n",
+ " m3544 | \n",
+ " m4554 | \n",
+ " m5564 | \n",
+ " m65 | \n",
+ " mu | \n",
+ " f014 | \n",
+ " f1524 | \n",
+ " f2534 | \n",
+ " f3544 | \n",
+ " f4554 | \n",
+ " f5564 | \n",
+ " f65 | \n",
+ " fu | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 10 | \n",
+ " AD | \n",
+ " 2000 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | 36 | \n",
+ " AE | \n",
+ " 2000 | \n",
+ " 2.0 | \n",
+ " 4.0 | \n",
+ " 4.0 | \n",
+ " 6.0 | \n",
+ " 5.0 | \n",
+ " 12.0 | \n",
+ " 10.0 | \n",
+ " NaN | \n",
+ " 3.0 | \n",
+ " 16.0 | \n",
+ " 1.0 | \n",
+ " 3.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 4.0 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | 60 | \n",
+ " AF | \n",
+ " 2000 | \n",
+ " 52.0 | \n",
+ " 228.0 | \n",
+ " 183.0 | \n",
+ " 149.0 | \n",
+ " 129.0 | \n",
+ " 94.0 | \n",
+ " 80.0 | \n",
+ " NaN | \n",
+ " 93.0 | \n",
+ " 414.0 | \n",
+ " 565.0 | \n",
+ " 339.0 | \n",
+ " 205.0 | \n",
+ " 99.0 | \n",
+ " 36.0 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | 87 | \n",
+ " AG | \n",
+ " 2000 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " NaN | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | 136 | \n",
+ " AL | \n",
+ " 2000 | \n",
+ " 2.0 | \n",
+ " 19.0 | \n",
+ " 21.0 | \n",
+ " 14.0 | \n",
+ " 24.0 | \n",
+ " 19.0 | \n",
+ " 16.0 | \n",
+ " NaN | \n",
+ " 3.0 | \n",
+ " 11.0 | \n",
+ " 10.0 | \n",
+ " 8.0 | \n",
+ " 8.0 | \n",
+ " 5.0 | \n",
+ " 11.0 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | 165 | \n",
+ " AM | \n",
+ " 2000 | \n",
+ " 2.0 | \n",
+ " 152.0 | \n",
+ " 130.0 | \n",
+ " 131.0 | \n",
+ " 63.0 | \n",
+ " 26.0 | \n",
+ " 21.0 | \n",
+ " NaN | \n",
+ " 1.0 | \n",
+ " 24.0 | \n",
+ " 27.0 | \n",
+ " 24.0 | \n",
+ " 8.0 | \n",
+ " 8.0 | \n",
+ " 4.0 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | 178 | \n",
+ " AN | \n",
+ " 2000 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 2.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " NaN | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 1.0 | \n",
+ " 0.0 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | 207 | \n",
+ " AO | \n",
+ " 2000 | \n",
+ " 186.0 | \n",
+ " 999.0 | \n",
+ " 1003.0 | \n",
+ " 912.0 | \n",
+ " 482.0 | \n",
+ " 312.0 | \n",
+ " 194.0 | \n",
+ " NaN | \n",
+ " 247.0 | \n",
+ " 1142.0 | \n",
+ " 1091.0 | \n",
+ " 844.0 | \n",
+ " 417.0 | \n",
+ " 200.0 | \n",
+ " 120.0 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | 236 | \n",
+ " AR | \n",
+ " 2000 | \n",
+ " 97.0 | \n",
+ " 278.0 | \n",
+ " 594.0 | \n",
+ " 402.0 | \n",
+ " 419.0 | \n",
+ " 368.0 | \n",
+ " 330.0 | \n",
+ " NaN | \n",
+ " 121.0 | \n",
+ " 544.0 | \n",
+ " 479.0 | \n",
+ " 262.0 | \n",
+ " 230.0 | \n",
+ " 179.0 | \n",
+ " 216.0 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | 265 | \n",
+ " AS | \n",
+ " 2000 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 1.0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " country year m014 m1524 m2534 m3544 m4554 m5564 m65 mu \\\n",
+ "10 AD 2000 0.0 0.0 1.0 0.0 0.0 0.0 0.0 NaN \n",
+ "36 AE 2000 2.0 4.0 4.0 6.0 5.0 12.0 10.0 NaN \n",
+ "60 AF 2000 52.0 228.0 183.0 149.0 129.0 94.0 80.0 NaN \n",
+ "87 AG 2000 0.0 0.0 0.0 0.0 0.0 0.0 1.0 NaN \n",
+ "136 AL 2000 2.0 19.0 21.0 14.0 24.0 19.0 16.0 NaN \n",
+ "165 AM 2000 2.0 152.0 130.0 131.0 63.0 26.0 21.0 NaN \n",
+ "178 AN 2000 0.0 0.0 1.0 2.0 0.0 0.0 0.0 NaN \n",
+ "207 AO 2000 186.0 999.0 1003.0 912.0 482.0 312.0 194.0 NaN \n",
+ "236 AR 2000 97.0 278.0 594.0 402.0 419.0 368.0 330.0 NaN \n",
+ "265 AS 2000 NaN NaN NaN NaN 1.0 1.0 NaN NaN \n",
+ "\n",
+ " f014 f1524 f2534 f3544 f4554 f5564 f65 fu \n",
+ "10 NaN NaN NaN NaN NaN NaN NaN NaN \n",
+ "36 3.0 16.0 1.0 3.0 0.0 0.0 4.0 NaN \n",
+ "60 93.0 414.0 565.0 339.0 205.0 99.0 36.0 NaN \n",
+ "87 1.0 1.0 1.0 0.0 0.0 0.0 0.0 NaN \n",
+ "136 3.0 11.0 10.0 8.0 8.0 5.0 11.0 NaN \n",
+ "165 1.0 24.0 27.0 24.0 8.0 8.0 4.0 NaN \n",
+ "178 0.0 0.0 1.0 0.0 0.0 1.0 0.0 NaN \n",
+ "207 247.0 1142.0 1091.0 844.0 417.0 200.0 120.0 NaN \n",
+ "236 121.0 544.0 479.0 262.0 230.0 179.0 216.0 NaN \n",
+ "265 NaN NaN NaN NaN 1.0 NaN NaN NaN "
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# First ten rows recorded for the year 2000.\n",
+ "tb.loc[tb['year'] == 2000].head(10)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Molten Data\n",
+ "\n",
+ "As in the previous notebook, the [*pd.melt*](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.melt.html) function can be used to un-pivot the columns. As before, pandas keeps the rows with missing values; they are discarded here, and the *cases* column, then free of missing values, is cast to integer. Furthermore, the resulting *molten* dataset is sorted as in the paper."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Un-pivot the sex/age columns into (column, cases) pairs, drop the pairs\n",
+ "# without a case count, store the counts as integers, and sort the result.\n",
+ "molten_tb = (\n",
+ "    pd.melt(tb, id_vars=['country', 'year'], var_name='column', value_name='cases')\n",
+ "    .dropna(subset=['cases'])\n",
+ "    .astype({'cases': int})\n",
+ "    .sort_values(['country', 'year', 'column'])\n",
+ "    .reset_index(drop=True)\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " country | \n",
+ " year | \n",
+ " column | \n",
+ " cases | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 49 | \n",
+ " AD | \n",
+ " 2000 | \n",
+ " m014 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 50 | \n",
+ " AD | \n",
+ " 2000 | \n",
+ " m1524 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 51 | \n",
+ " AD | \n",
+ " 2000 | \n",
+ " m2534 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 52 | \n",
+ " AD | \n",
+ " 2000 | \n",
+ " m3544 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 53 | \n",
+ " AD | \n",
+ " 2000 | \n",
+ " m4554 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 54 | \n",
+ " AD | \n",
+ " 2000 | \n",
+ " m5564 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 55 | \n",
+ " AD | \n",
+ " 2000 | \n",
+ " m65 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 165 | \n",
+ " AE | \n",
+ " 2000 | \n",
+ " f014 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " | 166 | \n",
+ " AE | \n",
+ " 2000 | \n",
+ " f1524 | \n",
+ " 16 | \n",
+ "
\n",
+ " \n",
+ " | 167 | \n",
+ " AE | \n",
+ " 2000 | \n",
+ " f2534 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " country year column cases\n",
+ "49 AD 2000 m014 0\n",
+ "50 AD 2000 m1524 0\n",
+ "51 AD 2000 m2534 1\n",
+ "52 AD 2000 m3544 0\n",
+ "53 AD 2000 m4554 0\n",
+ "54 AD 2000 m5564 0\n",
+ "55 AD 2000 m65 0\n",
+ "165 AE 2000 f014 3\n",
+ "166 AE 2000 f1524 16\n",
+ "167 AE 2000 f2534 1"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# First ten molten rows recorded for the year 2000.\n",
+ "molten_tb.loc[molten_tb['year'] == 2000].head(10)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Tidy Data\n",
+ "\n",
+ "Using the [*pd.Series.str.extract*](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.str.extract.html) method the two variables are isolated. The age labels are renamed as in the paper."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Work on an explicit copy: adding columns to a plain column selection of\n",
+ "# molten_tb is what pandas reports as SettingWithCopyWarning.\n",
+ "tidy_tb = molten_tb[['country', 'year', 'cases']].copy()\n",
+ "# Split the combined label: group 1 is the sex (f or m), group 2 the age code.\n",
+ "# expand=True makes extract return a DataFrame with one column per group.\n",
+ "tidy_tb[['sex', 'age']] = molten_tb['column'].str.extract(r'(f|m)(.*)', expand=True)\n",
+ "# Replace the age codes by readable labels.\n",
+ "tidy_tb['age'] = tidy_tb['age'].map({\n",
+ "    '014': '0-14', '1524': '15-24', '2534': '25-34',\n",
+ "    '3544': '35-44', '4554': '45-54', '5564': '55-64',\n",
+ "    '65': '65+', 'u': 'unknown'\n",
+ "})\n",
+ "# Put the identifying variables first and the measured value last.\n",
+ "tidy_tb = tidy_tb[['country', 'year', 'sex', 'age', 'cases']]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " country | \n",
+ " year | \n",
+ " sex | \n",
+ " age | \n",
+ " cases | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 49 | \n",
+ " AD | \n",
+ " 2000 | \n",
+ " m | \n",
+ " 0-14 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 50 | \n",
+ " AD | \n",
+ " 2000 | \n",
+ " m | \n",
+ " 15-24 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 51 | \n",
+ " AD | \n",
+ " 2000 | \n",
+ " m | \n",
+ " 25-34 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 52 | \n",
+ " AD | \n",
+ " 2000 | \n",
+ " m | \n",
+ " 35-44 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 53 | \n",
+ " AD | \n",
+ " 2000 | \n",
+ " m | \n",
+ " 45-54 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 54 | \n",
+ " AD | \n",
+ " 2000 | \n",
+ " m | \n",
+ " 55-64 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 55 | \n",
+ " AD | \n",
+ " 2000 | \n",
+ " m | \n",
+ " 65+ | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " | 165 | \n",
+ " AE | \n",
+ " 2000 | \n",
+ " f | \n",
+ " 0-14 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " | 166 | \n",
+ " AE | \n",
+ " 2000 | \n",
+ " f | \n",
+ " 15-24 | \n",
+ " 16 | \n",
+ "
\n",
+ " \n",
+ " | 167 | \n",
+ " AE | \n",
+ " 2000 | \n",
+ " f | \n",
+ " 25-34 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " country year sex age cases\n",
+ "49 AD 2000 m 0-14 0\n",
+ "50 AD 2000 m 15-24 0\n",
+ "51 AD 2000 m 25-34 1\n",
+ "52 AD 2000 m 35-44 0\n",
+ "53 AD 2000 m 45-54 0\n",
+ "54 AD 2000 m 55-64 0\n",
+ "55 AD 2000 m 65+ 0\n",
+ "165 AE 2000 f 0-14 3\n",
+ "166 AE 2000 f 15-24 16\n",
+ "167 AE 2000 f 25-34 1"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# First ten tidy rows recorded for the year 2000.\n",
+ "tidy_tb.loc[tidy_tb['year'] == 2000].head(10)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.5"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}