1
0
Fork 0

Update the project for 2020

- replace pipenv with poetry
- update the README.md:
  * streamline the text
  * update links to notebooks with nbviewer
  * update installation notes with poetry info
- streamline the notebooks:
  * use backticks in MarkDown cells to make references to
    columns in DataFrames clearer
  * blacken all code cells
- add MIT license
- ignore .venv/ and .python-version
This commit is contained in:
Alexander Hess 2020-08-26 00:07:58 +02:00
commit a3a17236a2
Signed by: alexander
GPG key ID: 344EA5AB10D868E0
13 changed files with 1975 additions and 1158 deletions

3
.gitignore vendored
View file

@ -1,2 +1,3 @@
.ipynb_checkpoints/
.python-version
.venv/

View file

@ -6,7 +6,7 @@
"source": [
"# Column Headers are Values, not Variable Names\n",
"\n",
"This notebook shows two examples of how column headers display values. These type of messy datasets have practical use in two types of settings:\n",
"This notebook shows two examples of how column headers display values. These types of messy datasets have practical use in two kinds of settings:\n",
"\n",
"1. Presentations\n",
"2. Recordings of regularly spaced observations over time"
@ -23,24 +23,9 @@
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2018-08-26 14:39:56 CEST\n",
"\n",
"CPython 3.6.5\n",
"IPython 6.5.0\n",
"\n",
"numpy 1.15.1\n",
"pandas 0.23.4\n"
]
}
],
"outputs": [],
"source": [
"% load_ext watermark\n",
"% watermark -d -t -v -z -p numpy,pandas"
"%load_ext lab_black"
]
},
{
@ -90,32 +75,35 @@
"metadata": {},
"outputs": [],
"source": [
"columns = ['q16', 'reltrad', 'income']\n",
"columns = [\"q16\", \"reltrad\", \"income\"]\n",
"encodings = {}\n",
"\n",
"# For sake of simplicity all data cleaning operations\n",
"# For the sake of simplicity, all data cleaning operations\n",
"# are done within the for-loop for all columns.\n",
"with spss.SavHeaderReader('data/pew.sav') as pew:\n",
" for c in columns:\n",
" encodings[c] = {\n",
" int(k): (\n",
" re.sub(r'\\(.*\\)', '', (\n",
" v.decode('iso-8859-1')\n",
" .replace('\\x92', \"'\")\n",
" .replace(' Churches', '')\n",
" .replace('Less than $10,000', '<$10k')\n",
" .replace('10 to under $20,000', '$10-20k')\n",
" .replace('20 to under $30,000', '$20-30k')\n",
" .replace('30 to under $40,000', '$30-40k')\n",
" .replace('40 to under $50,000', '$40-50k')\n",
" .replace('50 to under $75,000', '$50-75k')\n",
" .replace('75 to under $100,000', '$75-100k')\n",
" .replace('100 to under $150,000', '$100-150k')\n",
" .replace('$150,000 or more', '>150k')\n",
"with spss.SavHeaderReader(\"data/pew.sav\") as pew:\n",
" for column in columns:\n",
" encodings[column] = {\n",
" int(key): (\n",
" re.sub(\n",
" r\"\\(.*\\)\",\n",
" \"\",\n",
" (\n",
" value.decode(\"iso-8859-1\")\n",
" .replace(\"\\x92\", \"'\")\n",
" .replace(\" Churches\", \"\")\n",
" .replace(\"Less than $10,000\", \"<$10k\")\n",
" .replace(\"10 to under $20,000\", \"$10-20k\")\n",
" .replace(\"20 to under $30,000\", \"$20-30k\")\n",
" .replace(\"30 to under $40,000\", \"$30-40k\")\n",
" .replace(\"40 to under $50,000\", \"$40-50k\")\n",
" .replace(\"50 to under $75,000\", \"$50-75k\")\n",
" .replace(\"75 to under $100,000\", \"$75-100k\")\n",
" .replace(\"100 to under $150,000\", \"$100-150k\")\n",
" .replace(\"$150,000 or more\", \">150k\")\n",
" ),\n",
" ).strip()\n",
" )\n",
" for (k, v) in pew.all().valueLabels[c.encode()].items()\n",
" for (key, value) in pew.all().valueLabels[column.encode()].items()\n",
" }"
]
},
@ -132,25 +120,36 @@
"metadata": {},
"outputs": [],
"source": [
"with spss.SavReader('data/pew.sav', selectVars=[c.encode() for c in columns]) as pew:\n",
"with spss.SavReader(\n",
" \"data/pew.sav\", selectVars=[column.encode() for column in columns]\n",
") as pew:\n",
" pew = list(pew)\n",
"\n",
"# Use the above encodings to map the numeric data\n",
"# to the actual labels.\n",
"pew = pd.DataFrame(pew, columns=columns, dtype=int)\n",
"for c in columns:\n",
" pew[c] = pew[c].map(encodings[c])\n",
"for column in columns:\n",
" pew[column] = pew[column].map(encodings[column])\n",
"\n",
"for v in ('Atheist', 'Agnostic'):\n",
" pew.loc[(pew['q16'] == v), 'reltrad'] = v\n",
"for value in (\"Atheist\", \"Agnostic\"):\n",
" pew.loc[(pew[\"q16\"] == value), \"reltrad\"] = value\n",
"\n",
"income_columns = ['<$10k', '$10-20k', '$20-30k', '$30-40k', '$40-50k', '$50-75k',\n",
" '$75-100k', '$100-150k', '>150k', 'Don\\'t know/Refused']\n",
"\n",
"pew = pew.groupby(['reltrad', 'income']).size().unstack('income')\n",
"income_columns = [\n",
" \"<$10k\",\n",
" \"$10-20k\",\n",
" \"$20-30k\",\n",
" \"$30-40k\",\n",
" \"$40-50k\",\n",
" \"$50-75k\",\n",
" \"$75-100k\",\n",
" \"$100-150k\",\n",
" \">150k\",\n",
" \"Don't know/Refused\",\n",
"]\n",
"\n",
"pew = pew.groupby([\"reltrad\", \"income\"]).size().unstack(\"income\")\n",
"pew = pew[income_columns]\n",
"pew.index.name = 'religion'"
"pew.index.name = \"religion\""
]
},
{
@ -426,9 +425,9 @@
"\n",
"> This dataset has **three** variables, **religion**, **income** and **frequency**. To tidy it, we need to **melt**, or stack it. In other words, we need to turn columns into rows.\n",
"\n",
"pandas provides a [pd.melt](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.melt.html) function to un-pivot the dataset.\n",
"pandas provides a [pd.melt()](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.melt.html) function to un-pivot the dataset.\n",
"\n",
"**Notes:** *reset_index()* transforms the religion index column into a data column (*pd.melt()* needs that). Further, the resulting table is sorted implicitly by the *religion* column. To get to the same ordering as in the paper, the molten table is explicitly sorted."
"**Notes:** `.reset_index()` transforms the religion index column into a data column (`pd.melt()` needs that). Further, the resulting table is sorted implicitly by the `\"religion\"` column. To get to the same ordering as in the paper, the molten table is explicitly sorted."
]
},
{
@ -437,7 +436,7 @@
"metadata": {},
"outputs": [],
"source": [
"molten_pew = pd.melt(pew.reset_index(), id_vars=['religion'], value_name='frequency')"
"molten_pew = pd.melt(pew.reset_index(), id_vars=[\"religion\"], value_name=\"frequency\")"
]
},
{
@ -448,8 +447,8 @@
"source": [
"# Create a ordered column for the income labels.\n",
"income_dtype = pd.api.types.CategoricalDtype(income_columns, ordered=True)\n",
"molten_pew['income'] = molten_pew['income'].astype(income_dtype)\n",
"molten_pew = molten_pew.sort_values(['religion', 'income']).reset_index(drop=True)"
"molten_pew[\"income\"] = molten_pew[\"income\"].astype(income_dtype)\n",
"molten_pew = molten_pew.sort_values([\"religion\", \"income\"]).reset_index(drop=True)"
]
},
{
@ -616,37 +615,40 @@
"outputs": [],
"source": [
"# Usage of \"1st\", \"2nd\", \"3rd\" should be forbidden by law :)\n",
"usecols = ['artist.inverted', 'track', 'time', 'date.entered'] + (\n",
" [f'x{i}st.week' for i in range(1, 76, 10) if i != 11]\n",
" + [f'x{i}nd.week' for i in range(2, 76, 10) if i != 12]\n",
" + [f'x{i}rd.week' for i in range(3, 76, 10) if i != 13]\n",
" + [f'x{i}th.week' for i in range(1, 76) if (i % 10) not in (1, 2, 3)]\n",
" + [f'x11th.week', f'x12th.week', f'x13th.week']\n",
"usecols = [\"artist.inverted\", \"track\", \"time\", \"date.entered\"] + (\n",
" [f\"x{i}st.week\" for i in range(1, 76, 10) if i != 11]\n",
" + [f\"x{i}nd.week\" for i in range(2, 76, 10) if i != 12]\n",
" + [f\"x{i}rd.week\" for i in range(3, 76, 10) if i != 13]\n",
" + [f\"x{i}th.week\" for i in range(1, 76) if (i % 10) not in (1, 2, 3)]\n",
" + [f\"x11th.week\", f\"x12th.week\", f\"x13th.week\"]\n",
")\n",
"billboard = pd.read_csv(\n",
" \"data/billboard.csv\",\n",
" encoding=\"iso-8859-1\",\n",
" parse_dates=[\"date.entered\"],\n",
" usecols=usecols,\n",
")\n",
"\n",
"billboard = pd.read_csv('data/billboard.csv', encoding='iso-8859-1',\n",
" parse_dates=['date.entered'], usecols=usecols)\n",
"\n",
"billboard = billboard.assign(year=lambda x: x['date.entered'].dt.year)\n",
"billboard = billboard.assign(year=lambda x: x[\"date.entered\"].dt.year)\n",
"\n",
"# Rename the week columns.\n",
"week_columns = {\n",
" c: ('wk' + re.sub(r'[^\\d]+', '', c))\n",
" for c in billboard.columns\n",
" if c.endswith('.week')\n",
" column: (\"wk\" + re.sub(r\"[^\\d]+\", \"\", column))\n",
" for column in billboard.columns\n",
" if column.endswith(\".week\")\n",
"}\n",
"billboard = billboard.rename(columns={'artist.inverted': 'artist', **week_columns})\n",
"billboard = billboard.rename(columns={\"artist.inverted\": \"artist\", **week_columns})\n",
"\n",
"# Ensure the columns' order is the same as in the paper.\n",
"columns = ['year', 'artist', 'track', 'time', 'date.entered'] + [\n",
" f'wk{i}' for i in range(1, 76)\n",
"columns = [\"year\", \"artist\", \"track\", \"time\", \"date.entered\"] + [\n",
" f\"wk{i}\" for i in range(1, 76)\n",
"]\n",
"billboard = billboard[columns]\n",
"\n",
"# Ensure the rows' order is similar as in the paper.\n",
"# For unknown reasons the exact ordering as in the paper cannot be reconstructed.\n",
"billboard = billboard[billboard['year'] == 2000]\n",
"billboard = billboard.sort_values(['artist', 'track'])"
"billboard = billboard[billboard[\"year\"] == 2000]\n",
"billboard = billboard.sort_values([\"artist\", \"track\"])"
]
},
{
@ -986,17 +988,17 @@
"14 2000 Aaliyah Try Again \n",
"200 2000 Adams, Yolanda Open My Heart \n",
"\n",
" time date.entered wk1 wk2 wk3 wk4 wk5 ... wk66 wk67 wk68 \\\n",
"246 4:22 2000-02-26 87 82.0 72.0 77.0 87.0 ... NaN NaN NaN \n",
"287 3:15 2000-09-02 91 87.0 92.0 NaN NaN ... NaN NaN NaN \n",
"24 3:53 2000-04-08 81 70.0 68.0 67.0 66.0 ... NaN NaN NaN \n",
"193 4:24 2000-10-21 76 76.0 72.0 69.0 67.0 ... NaN NaN NaN \n",
"69 3:35 2000-04-15 57 34.0 25.0 17.0 17.0 ... NaN NaN NaN \n",
"22 3:24 2000-08-19 51 39.0 34.0 26.0 26.0 ... NaN NaN NaN \n",
"304 3:44 2000-07-08 97 97.0 96.0 95.0 100.0 ... NaN NaN NaN \n",
"135 4:15 2000-01-29 84 62.0 51.0 41.0 38.0 ... NaN NaN NaN \n",
"14 4:03 2000-03-18 59 53.0 38.0 28.0 21.0 ... NaN NaN NaN \n",
"200 5:30 2000-08-26 76 76.0 74.0 69.0 68.0 ... NaN NaN NaN \n",
" time date.entered wk1 wk2 wk3 wk4 wk5 ... wk66 wk67 wk68 \\\n",
"246 4:22 2000-02-26 87 82.0 72.0 77.0 87.0 ... NaN NaN NaN \n",
"287 3:15 2000-09-02 91 87.0 92.0 NaN NaN ... NaN NaN NaN \n",
"24 3:53 2000-04-08 81 70.0 68.0 67.0 66.0 ... NaN NaN NaN \n",
"193 4:24 2000-10-21 76 76.0 72.0 69.0 67.0 ... NaN NaN NaN \n",
"69 3:35 2000-04-15 57 34.0 25.0 17.0 17.0 ... NaN NaN NaN \n",
"22 3:24 2000-08-19 51 39.0 34.0 26.0 26.0 ... NaN NaN NaN \n",
"304 3:44 2000-07-08 97 97.0 96.0 95.0 100.0 ... NaN NaN NaN \n",
"135 4:15 2000-01-29 84 62.0 51.0 41.0 38.0 ... NaN NaN NaN \n",
"14 4:03 2000-03-18 59 53.0 38.0 28.0 21.0 ... NaN NaN NaN \n",
"200 5:30 2000-08-26 76 76.0 74.0 69.0 68.0 ... NaN NaN NaN \n",
"\n",
" wk69 wk70 wk71 wk72 wk73 wk74 wk75 \n",
"246 NaN NaN NaN NaN NaN NaN NaN \n",
@ -1028,7 +1030,7 @@
"source": [
"### \"Tidy\" Data\n",
"\n",
"As before the *pd.melt* function is used to transform the data from \"wide\" to \"long\" form."
"As before, the `pd.melt()` function is used to transform the data from \"wide\" to \"long\" form."
]
},
{
@ -1039,9 +1041,9 @@
"source": [
"molten_billboard = pd.melt(\n",
" billboard,\n",
" id_vars=['year', 'artist', 'track', 'time', 'date.entered'],\n",
" var_name='week',\n",
" value_name='rank',\n",
" id_vars=[\"year\", \"artist\", \"track\", \"time\", \"date.entered\"],\n",
" var_name=\"week\",\n",
" value_name=\"rank\",\n",
")"
]
},
@ -1049,7 +1051,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"In contrast to R, pandas keeps (unnecessary) rows for weeks where the song was already out of the charts. These are discarded. Also, a new column `\"date\"` indicating when exactly a particular song was at a certain rank in the charts is added."
]
},
{
@ -1059,24 +1061,23 @@
"outputs": [],
"source": [
"# pandas keeps \"wide\" variables that had missing values as rows.\n",
"molten_billboard = molten_billboard[molten_billboard['rank'].notnull()]\n",
"molten_billboard = molten_billboard[molten_billboard[\"rank\"].notnull()]\n",
"\n",
"# Cast as integer after missing values are removed.\n",
"molten_billboard['week'] = molten_billboard['week'].map(lambda x: int(x[2:]))\n",
"molten_billboard['rank'] = molten_billboard['rank'].map(int)\n",
"molten_billboard[\"week\"] = molten_billboard[\"week\"].map(lambda x: int(x[2:]))\n",
"molten_billboard[\"rank\"] = molten_billboard[\"rank\"].map(int)\n",
"\n",
"# Calculate the actual week from the date of first entering the list.\n",
"molten_billboard = molten_billboard.assign(\n",
" date=lambda x: x['date.entered'] + (x['week'] - 1) * datetime.timedelta(weeks=1)\n",
" date=lambda x: x[\"date.entered\"] + (x[\"week\"] - 1) * datetime.timedelta(weeks=1)\n",
")\n",
"\n",
"# Sort rows and columns as in the paper.\n",
"molten_billboard = molten_billboard[\n",
" ['year', 'artist', 'time', 'track', 'date', 'week', 'rank']\n",
" [\"year\", \"artist\", \"time\", \"track\", \"date\", \"week\", \"rank\"]\n",
"]\n",
"molten_billboard = (\n",
" molten_billboard.sort_values(['artist', 'track', 'week']).reset_index(drop=True)\n",
")"
"molten_billboard = molten_billboard.sort_values([\"artist\", \"track\", \"week\"])\n",
"molten_billboard = molten_billboard.reset_index(drop=True)"
]
},
{
@ -1336,7 +1337,7 @@
"metadata": {},
"outputs": [],
"source": [
"molten_billboard.to_csv('data/billboard_cleaned.csv', index=False)"
"molten_billboard.to_csv(\"data/billboard_cleaned.csv\", index=False)"
]
}
],
@ -1356,9 +1357,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
"version": "3.7.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
"nbformat_minor": 4
}

View file

@ -20,24 +20,9 @@
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2018-08-26 11:50:39 CEST\n",
"\n",
"CPython 3.6.5\n",
"IPython 6.5.0\n",
"\n",
"numpy 1.15.1\n",
"pandas 0.23.4\n"
]
}
],
"outputs": [],
"source": [
"% load_ext watermark\n",
"% watermark -d -t -v -z -p numpy,pandas"
"%load_ext lab_black"
]
},
{
@ -71,15 +56,30 @@
"metadata": {},
"outputs": [],
"source": [
"columns = ['iso2', 'year',\n",
" 'new_sp_m014', 'new_sp_m1524', 'new_sp_m2534', 'new_sp_m3544',\n",
" 'new_sp_m4554', 'new_sp_m5564', 'new_sp_m65', 'new_sp_mu',\n",
" 'new_sp_f014', 'new_sp_f1524', 'new_sp_f2534', 'new_sp_f3544',\n",
" 'new_sp_f4554', 'new_sp_f5564', 'new_sp_f65', 'new_sp_fu']\n",
"tb = pd.read_csv('data/tb.csv', usecols=columns)\n",
"columns = [\n",
" \"iso2\",\n",
" \"year\",\n",
" \"new_sp_m014\",\n",
" \"new_sp_m1524\",\n",
" \"new_sp_m2534\",\n",
" \"new_sp_m3544\",\n",
" \"new_sp_m4554\",\n",
" \"new_sp_m5564\",\n",
" \"new_sp_m65\",\n",
" \"new_sp_mu\",\n",
" \"new_sp_f014\",\n",
" \"new_sp_f1524\",\n",
" \"new_sp_f2534\",\n",
" \"new_sp_f3544\",\n",
" \"new_sp_f4554\",\n",
" \"new_sp_f5564\",\n",
" \"new_sp_f65\",\n",
" \"new_sp_fu\",\n",
"]\n",
"tb = pd.read_csv(\"data/tb.csv\", usecols=columns)\n",
"\n",
"rename = {c: c[7:] for c in columns if c.startswith('new_sp_')}\n",
"rename = {'iso2': 'country', **rename}\n",
"rename = {column: column[7:] for column in columns if column.startswith(\"new_sp_\")}\n",
"rename = {\"iso2\": \"country\", **rename}\n",
"tb = tb.rename(columns=rename)"
]
},
@ -89,7 +89,7 @@
"source": [
"### Messy Data\n",
"\n",
"The data are assumed to be provided as below. Except for the *country* and *year* columns, the remaining columns are actually joint realizations of two variables **sex** and **age**."
"The data are assumed to be provided as below. Except for the `\"country\"` and `\"year\"` columns, the remaining columns are actually joint realizations of two variables `\"sex\"` and `\"age\"`."
]
},
{
@ -385,7 +385,7 @@
}
],
"source": [
"tb[(tb['year'] == 2000)].head(10)"
"tb[(tb[\"year\"] == 2000)].head(10)"
]
},
{
@ -394,7 +394,7 @@
"source": [
"### Molten Data\n",
"\n",
"As in the previous notebook the [*pd.melt*](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.melt.html) function can be used to un-pivot the columns. As before, pandas keeps rows for columns with missing data that are then discarded (then, without any more missing values, the column's data type is casted as integer). Furthermore, the resulting *molten* dataset is sorted as in the paper."
"As in the previous notebook, the [pd.melt()](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.melt.html) function can be used to un-pivot the columns. As before, pandas keeps rows for columns with missing data that are discarded. Then, without any more missing values, the column's data type is cast as `int`. Furthermore, the resulting *molten* dataset is sorted as in the paper."
]
},
{
@ -403,10 +403,12 @@
"metadata": {},
"outputs": [],
"source": [
"molten_tb = pd.melt(tb, id_vars=['country', 'year'], var_name='column', value_name='cases')\n",
"molten_tb = molten_tb[molten_tb['cases'].notnull()]\n",
"molten_tb['cases'] = molten_tb['cases'].astype(int)\n",
"molten_tb = molten_tb.sort_values(['country', 'year', 'column']).reset_index(drop=True)"
"molten_tb = pd.melt(\n",
" tb, id_vars=[\"country\", \"year\"], var_name=\"column\", value_name=\"cases\"\n",
")\n",
"molten_tb = molten_tb[molten_tb[\"cases\"].notnull()]\n",
"molten_tb[\"cases\"] = molten_tb[\"cases\"].astype(int)\n",
"molten_tb = molten_tb.sort_values([\"country\", \"year\", \"column\"]).reset_index(drop=True)"
]
},
{
@ -536,7 +538,7 @@
}
],
"source": [
"molten_tb[(molten_tb['year'] == 2000)].head(10)"
"molten_tb[(molten_tb[\"year\"] == 2000)].head(10)"
]
},
{
@ -545,7 +547,7 @@
"source": [
"### Tidy Data\n",
"\n",
"Using the [*pd.Series.str.extract*](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.str.extract.html) method the two variables are isolated. The age labels are renamed as in the paper."
"Using the [pd.Series.str.extract()](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.str.extract.html) method the two variables are isolated. The age labels are renamed as in the paper."
]
},
{
@ -554,14 +556,21 @@
"metadata": {},
"outputs": [],
"source": [
"tidy_tb = molten_tb[['country', 'year', 'cases']]\n",
"tidy_tb[['sex', 'age']] = molten_tb['column'].str.extract(r'(f|m)(.*)')\n",
"tidy_tb['age'] = tidy_tb['age'].map({\n",
" '014': '0-14', '1524': '15-24', '2534': '25-34',\n",
" '3544': '35-44', '4554': '45-54', '5564': '55-64',\n",
" '65': '65+', 'u': 'unknown'\n",
"})\n",
"tidy_tb = tidy_tb[['country', 'year', 'sex', 'age', 'cases']]"
"tidy_tb = molten_tb[[\"country\", \"year\", \"cases\"]]\n",
"tidy_tb[[\"sex\", \"age\"]] = molten_tb[\"column\"].str.extract(r\"(f|m)(.*)\")\n",
"tidy_tb[\"age\"] = tidy_tb[\"age\"].map(\n",
" {\n",
" \"014\": \"0-14\",\n",
" \"1524\": \"15-24\",\n",
" \"2534\": \"25-34\",\n",
" \"3544\": \"35-44\",\n",
" \"4554\": \"45-54\",\n",
" \"5564\": \"55-64\",\n",
" \"65\": \"65+\",\n",
" \"u\": \"unknown\",\n",
" }\n",
")\n",
"tidy_tb = tidy_tb[[\"country\", \"year\", \"sex\", \"age\", \"cases\"]]"
]
},
{
@ -702,7 +711,7 @@
}
],
"source": [
"tidy_tb[(tidy_tb['year'] == 2000)].head(10)"
"tidy_tb[(tidy_tb[\"year\"] == 2000)].head(10)"
]
}
],
@ -722,9 +731,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
"version": "3.7.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
"nbformat_minor": 4
}

View file

@ -18,24 +18,9 @@
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2018-08-26 12:56:31 CEST\n",
"\n",
"CPython 3.6.5\n",
"IPython 6.5.0\n",
"\n",
"numpy 1.15.1\n",
"pandas 0.23.4\n"
]
}
],
"outputs": [],
"source": [
"% load_ext watermark\n",
"% watermark -d -t -v -z -p numpy,pandas"
"%load_ext lab_black"
]
},
{
@ -54,7 +39,7 @@
"metadata": {},
"outputs": [],
"source": [
"pd.set_option('display.max_columns', 40)"
"pd.set_option(\"display.max_columns\", 40)"
]
},
{
@ -83,38 +68,46 @@
"source": [
"# Extract the data as one column and\n",
"# use string slicing to obtain groups of columns.\n",
"weather = pd.read_csv('data/weather.txt', header=None, sep='^')\n",
"weather = pd.read_csv(\"data/weather.txt\", header=None, sep=\"^\")\n",
"\n",
"# First, remove the weird character seperators,\n",
"# then split the columns by whitespace, and\n",
"# finally name them appropriately.\n",
"days = (\n",
" weather[0]\n",
" .map(lambda x: x[21:]).str.replace('OI', ' ')\n",
" .str.replace('OS', ' ').str.replace('SI', ' ').str.replace('I', ' ')\n",
" .str.replace('S', ' ').str.replace('B', ' ').str.replace('D', ' ')\n",
" .map(str.lstrip).str.split(r'\\s+', expand=True)\n",
")[list(range(31))].rename(columns={i: f'd{i+1}' for i in range(31)})\n",
" .map(lambda x: x[21:])\n",
" .str.replace(\"OI\", \" \")\n",
" .str.replace(\"OS\", \" \")\n",
" .str.replace(\"SI\", \" \")\n",
" .str.replace(\"I\", \" \")\n",
" .str.replace(\"S\", \" \")\n",
" .str.replace(\"B\", \" \")\n",
" .str.replace(\"D\", \" \")\n",
" .map(str.lstrip)\n",
" .str.split(r\"\\s+\", expand=True)\n",
")[list(range(31))].rename(columns={i: f\"d{i+1}\" for i in range(31)})\n",
"\n",
"# The non-temperature columns can be extracted as simple slices.\n",
"weather = pd.DataFrame(data={\n",
" 'id': weather[0].map(lambda x: x[:11]),\n",
" 'year': weather[0].map(lambda x: x[11:15]).astype(int),\n",
" 'month': weather[0].map(lambda x: x[15:17]).astype(int),\n",
" 'element': weather[0].map(lambda x: x[17:21]).str.lower(),\n",
"})\n",
"weather = pd.DataFrame(\n",
" data={\n",
" \"id\": weather[0].map(lambda x: x[:11]),\n",
" \"year\": weather[0].map(lambda x: x[11:15]).astype(int),\n",
" \"month\": weather[0].map(lambda x: x[15:17]).astype(int),\n",
" \"element\": weather[0].map(lambda x: x[17:21]).str.lower(),\n",
" }\n",
")\n",
"\n",
"# The temperatures were stored as whole integers\n",
"# with -9999 indicating missing values.\n",
"for i in range(1, 32):\n",
" weather[f'd{i}'] = days[f'd{i}'].astype(float) / 10\n",
" weather[f\"d{i}\"] = days[f\"d{i}\"].astype(float) / 10\n",
"weather = weather.replace(-999.9, np.NaN)\n",
"\n",
"# Discard the non-temperature observations and\n",
"# sort the dataset as in the paper.\n",
"weather = (\n",
" weather[weather['element'].isin(['tmax', 'tmin'])]\n",
" .sort_values(['id', 'year', 'month', 'element'])\n",
" weather[weather[\"element\"].isin([\"tmax\", \"tmin\"])]\n",
" .sort_values([\"id\", \"year\", \"month\", \"element\"])\n",
" .reset_index(drop=True)\n",
")"
]
@ -128,8 +121,7 @@
"Below is a dataset assumed to have been provided like this as \"raw\", i.e., the data analyst did not do the above parsing work but some third party instead.\n",
"\n",
"> The most complicated form of messy data occurs when variables are stored in both rows and columns. Table 11 shows daily weather data from the Global Historical Climatology Network for one weather station (MX17004) in Mexico for five months in 2010. It has variables in\n",
"individual columns (*id*, *year*, *month*), spread across columns (day, d1d31) and across rows (*tmin*, *tmax*) (minimum and maximum temperature). Months with less than 31 days have\n",
"structural missing values for the last day(s) of the month. The *element* column is not a variable; it stores the names of variables."
individual columns (`\"id\"`, `\"year\"`, `\"month\"`), spread across columns (day, `\"d1\"`–`\"d31\"`) and across rows (`\"tmin\"` and `\"tmax\"` for the minimum and maximum temperatures). Months with less than 31 days have missing values for the last day(s) of the month. The `\"element\"` column is not a variable: it stores the *names* of variables."
]
},
{
@ -624,7 +616,7 @@
}
],
"source": [
"weather[(weather['year'] == 2010)].head(10)"
"weather[(weather[\"year\"] == 2010)].head(10)"
]
},
{
@ -638,7 +630,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"> To tidy this dataset we first melt it with colvars *id*, *year*, *month* and the column that contains variable names, *element* [...]. For presentation, we have dropped the missing values, making them implicit rather than explicit. This is permissible because we know how many days are in each month and can easily reconstruct the explicit missing values."
"> To tidy this dataset we first melt it with colvars `\"id\"`, `\"year\"`, `\"month\"`, and the column that contains the actual variable names, `\"element\"` [...]. For presentation, we have dropped the missing values, making them implicit rather than explicit. This is permissible because we know how many days are in each month and can easily reconstruct the explicit missing values."
]
},
{
@ -649,27 +641,25 @@
"source": [
"# Melt the dataset and extract a date column.\n",
"molten_weather = (\n",
" pd.melt(weather, id_vars=['id', 'year', 'month', 'element'], var_name='day')\n",
" .assign(day=lambda x: x['day'].str.extract('(\\d+)').astype(int))\n",
" .assign(date=lambda x: pd.to_datetime(x[['year', 'month', 'day']], errors='coerce'))\n",
")[['id', 'date', 'element', 'value']]\n",
" pd.melt(weather, id_vars=[\"id\", \"year\", \"month\", \"element\"], var_name=\"day\")\n",
" .assign(day=lambda x: x[\"day\"].str.extract(\"(\\d+)\").astype(int))\n",
" .assign(date=lambda x: pd.to_datetime(x[[\"year\", \"month\", \"day\"]], errors=\"coerce\"))\n",
")\n",
"molten_weather = molten_weather[[\"id\", \"date\", \"element\", \"value\"]]\n",
"\n",
"# Make the missing values implicit.\n",
"molten_weather = molten_weather[molten_weather['value'].notnull()]\n",
"molten_weather = molten_weather[molten_weather[\"value\"].notnull()]\n",
"\n",
"# Sort the data as in the paper.\n",
"molten_weather = (\n",
" molten_weather\n",
" .sort_values(['id', 'date', 'element'])\n",
" .reset_index(drop=True)\n",
")"
"molten_weather = molten_weather.sort_values([\"id\", \"date\", \"element\"])\n",
"molten_weather = molten_weather.reset_index(drop=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"> This dataset is mostly tidy, but we have two variables stored in rows: *tmin* and *tmax*, the type of observation."
"> This dataset is mostly tidy, but we have two variables stored in rows: `\"tmin\"` and `\"tmax\"`, the type of observation."
]
},
{
@ -799,7 +789,7 @@
}
],
"source": [
"molten_weather[(molten_weather['date'].dt.year == 2010)].head(10)"
"molten_weather[(molten_weather[\"date\"].dt.year == 2010)].head(10)"
]
},
{
@ -815,7 +805,7 @@
"source": [
"> Fixing this requires the cast, or unstack, operation. This performs the inverse of melting by rotating the element variable back out into the columns\n",
"\n",
"Note that [pd.DataFrame.unstack](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.unstack.html) method uses a DataFrame's index as columns to unstack over."
"Below, [pd.DataFrame.unstack()](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.unstack.html) uses a DataFrame's index as columns to unstack over."
]
},
{
@ -824,7 +814,7 @@
"metadata": {},
"outputs": [],
"source": [
"tidy_weather = molten_weather.set_index(['id', 'date', 'element']).unstack()\n",
"tidy_weather = molten_weather.set_index([\"id\", \"date\", \"element\"]).unstack()\n",
"\n",
"# Make the column headers look as in the paper.\n",
"tidy_weather.columns = tidy_weather.columns.droplevel(0)\n",
@ -966,7 +956,7 @@
}
],
"source": [
"tidy_weather[(tidy_weather['date'].dt.year == 2010)].head(10)"
"tidy_weather[(tidy_weather[\"date\"].dt.year == 2010)].head(10)"
]
}
],
@ -986,9 +976,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
"version": "3.7.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
"nbformat_minor": 4
}

View file

@ -20,24 +20,9 @@
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2018-08-26 15:32:47 CEST\n",
"\n",
"CPython 3.6.5\n",
"IPython 6.5.0\n",
"\n",
"numpy 1.15.1\n",
"pandas 0.23.4\n"
]
}
],
"outputs": [],
"source": [
"% load_ext watermark\n",
"% watermark -d -t -v -z -p numpy,pandas"
"%load_ext lab_black"
]
},
{
@ -71,7 +56,7 @@
"metadata": {},
"outputs": [],
"source": [
"billboard = pd.read_csv('data/billboard_cleaned.csv')"
"billboard = pd.read_csv(\"data/billboard_cleaned.csv\")"
]
},
{
@ -81,7 +66,7 @@
"### Messy Data\n",
"\n",
"> The Billboard dataset described in Table 8 actually contains observations on two types of\n",
"observational units: the *song* and its *rank* in each week. This manifests itself through the duplication of facts about the song: *artist* and *time* are repeated for every song in each *week*."
"observational units: the **song** and its **rank** in each week. This manifests itself through the duplication of facts about the song: `\"artist\"` and `\"time\"` are repeated for every song in each `\"week\"`."
]
},
{
@ -325,9 +310,9 @@
"source": [
"### Tidy Data\n",
"\n",
"> The billboard dataset needs to be broken down into two datasets: a **song** dataset which stores *artist*, *song name* and *time*, and a **ranking** dataset which gives the *rank* of the song in each *week*.\n",
"> The billboard dataset needs to be broken down into two datasets: a **song** dataset which stores `\"artist\"`, `\"song name\"` and `\"time\"`, and a **ranking** dataset which gives the `\"rank\"` of the song in each `\"week\"`.\n",
"\n",
"Transforming data columns into index columns is enough in pandas to obtain unique tuples from several columns. So no real \"function\" is needed to tidy up the dataset."
"Transforming data columns into index columns is enough in pandas to obtain unique `tuple`s from several columns. So, no real \"function\" is needed to tidy up the dataset."
]
},
{
@ -338,26 +323,25 @@
"source": [
"# Get the unique combinations for the song DataFrame and\n",
"# \"store\" them in the original dataset for reuse.\n",
"billboard = billboard.set_index(['artist', 'track', 'time'])\n",
"billboard = billboard.set_index([\"artist\", \"track\", \"time\"])\n",
"\n",
"# Create the song DataFrame.\n",
"songs = pd.DataFrame.from_records(\n",
" columns=['id', 'artist', 'track', 'time'],\n",
" columns=[\"id\", \"artist\", \"track\", \"time\"],\n",
" data=[ # Combine enumerate with tuple unpacking\n",
" (a + 1, b, c, d) # to create the ID column.\n",
" for (a, (b, c, d))\n",
" in enumerate(billboard.index.unique())\n",
" for (a, (b, c, d)) in enumerate(billboard.index.unique())\n",
" ],\n",
")\n",
"\n",
"# Take the date and rank columns from the original dataset\n",
"# and use the implicit index alignment to assign the songs' IDs.\n",
"ranking = billboard[['date', 'rank']].copy()\n",
"ranking['id'] = songs.set_index(['artist', 'track', 'time'])\n",
"ranking = billboard[[\"date\", \"rank\"]].copy()\n",
"ranking[\"id\"] = songs.set_index([\"artist\", \"track\", \"time\"])\n",
"\n",
"# Use the song ID as the index as in the paper.\n",
"ranking = ranking.reset_index(drop=True).set_index('id')\n",
"songs = songs.set_index('id')"
"ranking = ranking.reset_index(drop=True).set_index(\"id\")\n",
"songs = songs.set_index(\"id\")"
]
},
{
@ -700,9 +684,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
"version": "3.7.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
"nbformat_minor": 4
}

View file

@ -6,7 +6,7 @@
"source": [
"# One Type in multiple Tables\n",
"\n",
"The repository with the original R code does not provide code for this case but only refers to other projects that cannot be replicated any more (source website not available any more)."
"The repository with the original R code does not provide code for this case but only refers to other projects that cannot be replicated any more (because the source website is *not* available any more)."
]
},
{
@ -47,9 +47,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
"version": "3.7.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
"nbformat_minor": 4
}

File diff suppressed because one or more lines are too long

19
LICENSE.txt Normal file
View file

@ -0,0 +1,19 @@
Copyright (c) 2018-2020 Alexander Hess [alexander@webartifex.biz]
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

24
Pipfile
View file

@ -1,24 +0,0 @@
[[source]]
url = "https://pypi.org/simple"
verify_ssl = true
name = "pypi"
[packages]
pandas = "*"
jupyter = "*"
watermark = "*"
savreaderwriter = "*"
"rpy2" = "==2.8.*"
matplotlib = "*"
seaborn = "*"
sklearn = "*"
[dev-packages]
black = "*"
blackcellmagic = "*"
[requires]
python_version = "3.6"
[pipenv]
allow_prereleases = true

758
Pipfile.lock generated
View file

@ -1,758 +0,0 @@
{
"_meta": {
"hash": {
"sha256": "9fc4c60d75aac99be98f4bd18fa6b1bf507d093c96a4c639901a1d0746a83ace"
},
"pipfile-spec": 6,
"requires": {
"python_version": "3.6"
},
"sources": [
{
"name": "pypi",
"url": "https://pypi.org/simple",
"verify_ssl": true
}
]
},
"default": {
"attrs": {
"hashes": [
"sha256:4b90b09eeeb9b88c35bc642cbac057e45a5fd85367b985bd2809c62b7b939265",
"sha256:e0d0eb91441a3b53dab4d9b743eafc1ac44476296a2053b6ca3af0b139faf87b"
],
"version": "==18.1.0"
},
"backcall": {
"hashes": [
"sha256:38ecd85be2c1e78f77fd91700c76e14667dc21e2713b63876c0eb901196e01e4",
"sha256:bbbf4b1e5cd2bdb08f915895b51081c041bac22394fdfcfdfbe9f14b77c08bf2"
],
"version": "==0.1.0"
},
"bleach": {
"hashes": [
"sha256:0ee95f6167129859c5dce9b1ca291ebdb5d8cd7e382ca0e237dfd0dad63f63d8",
"sha256:24754b9a7d530bf30ce7cbc805bc6cce785660b4a10ff3a43633728438c105ab"
],
"version": "==2.1.4"
},
"cycler": {
"hashes": [
"sha256:1d8a5ae1ff6c5cf9b93e8811e581232ad8920aeec647c37316ceac982b08cb2d",
"sha256:cd7b2d1018258d7247a71425e9f26463dfb444d411c39569972f4ce586b0c9d8"
],
"version": "==0.10.0"
},
"decorator": {
"hashes": [
"sha256:2c51dff8ef3c447388fe5e4453d24a2bf128d3a4c32af3fabef1f01c6851ab82",
"sha256:c39efa13fbdeb4506c476c9b3babf6a718da943dab7811c206005a4a956c080c"
],
"version": "==4.3.0"
},
"entrypoints": {
"hashes": [
"sha256:10ad569bb245e7e2ba425285b9fa3e8178a0dc92fc53b1e1c553805e15a8825b",
"sha256:d2d587dde06f99545fb13a383d2cd336a8ff1f359c5839ce3a64c917d10c029f"
],
"markers": "python_version >= '2.7'",
"version": "==0.2.3"
},
"html5lib": {
"hashes": [
"sha256:20b159aa3badc9d5ee8f5c647e5efd02ed2a66ab8d354930bd9ff139fc1dc0a3",
"sha256:66cb0dcfdbbc4f9c3ba1a63fdb511ffdbd4f513b2b6d81b80cd26ce6b3fb3736"
],
"version": "==1.0.1"
},
"ipykernel": {
"hashes": [
"sha256:395f020610e33ffa0b0c9c0cd1a1d927d51ab9aa9f30a7ae36bb0c908a33e89c",
"sha256:935941dba29d856eee34b8b5261d971bd5012547239ed73ddfff099143748c37",
"sha256:c091449dd0fad7710ddd9c4a06e8b9e15277da306590bc07a3a1afa6b4453c8f"
],
"version": "==4.8.2"
},
"ipython": {
"hashes": [
"sha256:007dcd929c14631f83daff35df0147ea51d1af420da303fd078343878bd5fb62",
"sha256:b0f2ef9eada4a68ef63ee10b6dde4f35c840035c50fd24265f8052c98947d5a4"
],
"markers": "python_version >= '3.3'",
"version": "==6.5.0"
},
"ipython-genutils": {
"hashes": [
"sha256:72dd37233799e619666c9f639a9da83c34013a73e8bbc79a7a6348d93c61fab8",
"sha256:eb2e116e75ecef9d4d228fdc66af54269afa26ab4463042e33785b887c628ba8"
],
"version": "==0.2.0"
},
"ipywidgets": {
"hashes": [
"sha256:100f4ea495e1fa2c1dfeabb68641af2302e65e877003f910be4e29f3aa68a0b2",
"sha256:fd24a66d82f2ea49e281da7714a7c656340d3ec24dff376b17590fa59469b817"
],
"version": "==7.4.0"
},
"jedi": {
"hashes": [
"sha256:b409ed0f6913a701ed474a614a3bb46e6953639033e31f769ca7581da5bd1ec1",
"sha256:c254b135fb39ad76e78d4d8f92765ebc9bf92cbc76f49e97ade1d5f5121e1f6f"
],
"version": "==0.12.1"
},
"jinja2": {
"hashes": [
"sha256:74c935a1b8bb9a3947c50a54766a969d4846290e1e788ea44c1392163723c3bd",
"sha256:f84be1bb0040caca4cea721fcbbbbd61f9be9464ca236387158b0feea01914a4"
],
"version": "==2.10"
},
"jsonschema": {
"hashes": [
"sha256:1ae9fe07aec50f07fc546a4432c93e2f23deeaa23bb67aef42d75d8aa496849b",
"sha256:aef58a18d83e4c5ea117d7ae1ba4238a6a84654fee6d0f32fd335ded63a1626e"
],
"version": "==3.0.0a2"
},
"jupyter": {
"hashes": [
"sha256:3e1f86076bbb7c8c207829390305a2b1fe836d471ed54be66a3b8c41e7f46cc7",
"sha256:5b290f93b98ffbc21c0c7e749f054b3267782166d72fa5e3ed1ed4eaf34a2b78",
"sha256:d9dc4b3318f310e34c82951ea5d6683f67bed7def4b259fafbfe4f1beb1d8e5f"
],
"index": "pypi",
"version": "==1.0.0"
},
"jupyter-client": {
"hashes": [
"sha256:27befcf0446b01e29853014d6a902dd101ad7d7f94e2252b1adca17c3466b761",
"sha256:59e6d791e22a8002ad0e80b78c6fd6deecab4f9e1b1aa1a22f4213de271b29ea"
],
"version": "==5.2.3"
},
"jupyter-console": {
"hashes": [
"sha256:3f928b817fc82cda95e431eb4c2b5eb21be5c483c2b43f424761a966bb808094",
"sha256:545dedd3aaaa355148093c5609f0229aeb121b4852995c2accfa64fe3e0e55cd"
],
"version": "==5.2.0"
},
"jupyter-core": {
"hashes": [
"sha256:927d713ffa616ea11972534411544589976b2493fc7e09ad946e010aa7eb9970",
"sha256:ba70754aa680300306c699790128f6fbd8c306ee5927976cbe48adacf240c0b7"
],
"version": "==4.4.0"
},
"kiwisolver": {
"hashes": [
"sha256:0ee4ed8b3ae8f5f712b0aa9ebd2858b5b232f1b9a96b0943dceb34df2a223bc3",
"sha256:0f7f532f3c94e99545a29f4c3f05637f4d2713e7fd91b4dd8abfc18340b86cd5",
"sha256:1a078f5dd7e99317098f0e0d490257fd0349d79363e8c923d5bb76428f318421",
"sha256:1aa0b55a0eb1bd3fa82e704f44fb8f16e26702af1a073cc5030eea399e617b56",
"sha256:2874060b91e131ceeff00574b7c2140749c9355817a4ed498e82a4ffa308ecbc",
"sha256:379d97783ba8d2934d52221c833407f20ca287b36d949b4bba6c75274bcf6363",
"sha256:3b791ddf2aefc56382aadc26ea5b352e86a2921e4e85c31c1f770f527eb06ce4",
"sha256:4329008a167fac233e398e8a600d1b91539dc33c5a3eadee84c0d4b04d4494fa",
"sha256:45813e0873bbb679334a161b28cb9606d9665e70561fd6caa8863e279b5e464b",
"sha256:53a5b27e6b5717bdc0125338a822605084054c80f382051fb945d2c0e6899a20",
"sha256:574f24b9805cb1c72d02b9f7749aa0cc0b81aa82571be5201aa1453190390ae5",
"sha256:66f82819ff47fa67a11540da96966fb9245504b7f496034f534b81cacf333861",
"sha256:79e5fe3ccd5144ae80777e12973027bd2f4f5e3ae8eb286cabe787bed9780138",
"sha256:83410258eb886f3456714eea4d4304db3a1fc8624623fc3f38a487ab36c0f653",
"sha256:8b6a7b596ce1d2a6d93c3562f1178ebd3b7bb445b3b0dd33b09f9255e312a965",
"sha256:9576cb63897fbfa69df60f994082c3f4b8e6adb49cccb60efb2a80a208e6f996",
"sha256:95a25d9f3449046ecbe9065be8f8380c03c56081bc5d41fe0fb964aaa30b2195",
"sha256:a424f048bebc4476620e77f3e4d1f282920cef9bc376ba16d0b8fe97eec87cde",
"sha256:aaec1cfd94f4f3e9a25e144d5b0ed1eb8a9596ec36d7318a504d813412563a85",
"sha256:acb673eecbae089ea3be3dcf75bfe45fc8d4dcdc951e27d8691887963cf421c7",
"sha256:b15bc8d2c2848a4a7c04f76c9b3dc3561e95d4dabc6b4f24bfabe5fd81a0b14f",
"sha256:b1c240d565e977d80c0083404c01e4d59c5772c977fae2c483f100567f50847b",
"sha256:c595693de998461bcd49b8d20568c8870b3209b8ea323b2a7b0ea86d85864694",
"sha256:ce3be5d520b4d2c3e5eeb4cd2ef62b9b9ab8ac6b6fedbaa0e39cdb6f50644278",
"sha256:e0f910f84b35c36a3513b96d816e6442ae138862257ae18a0019d2fc67b041dc",
"sha256:ea36e19ac0a483eea239320aef0bd40702404ff8c7e42179a2d9d36c5afcb55c",
"sha256:efabbcd4f406b532206b8801058c8bab9e79645b9880329253ae3322b7b02cd5",
"sha256:f923406e6b32c86309261b8195e24e18b6a8801df0cfc7814ac44017bfcb3939"
],
"markers": "python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.1.*' and python_version != '3.3.*' and python_version != '3.2.*'",
"version": "==1.0.1"
},
"markupsafe": {
"hashes": [
"sha256:a6be69091dac236ea9c6bc7d012beab42010fa914c459791d627dad4910eb665"
],
"version": "==1.0"
},
"matplotlib": {
"hashes": [
"sha256:0ba8e3ec1b0feddc6b068fe70dc38dcf2917e301ad8d2b3f848c14ad463a4157",
"sha256:10a48e33e64dbd95f0776ba162f379c5cc55301c2d155506e79ce0c26b52f2ce",
"sha256:1376535fe731adbba55ab9e48896de226b7e89dbb55390c5fbd8f7161b7ae3be",
"sha256:16f0f8ba22df1e2c9f06c87088de45742322fde282a93b5c744c0f969cf7932e",
"sha256:1c6c999f2212858021329537f8e0f98f3f29086ec3683511dd1ecec84409f51d",
"sha256:2316dc177fc7b3d8848b49365498de0c385b4c9bba511edddd24c34fbe3d37a4",
"sha256:3398bfb533482bf21974cecf28224dd23784ad4e4848be582903f7a2436ec12e",
"sha256:3477cb1e1061b34210acc43d20050be8444478ff50b8adfac5fe2b45fc97df01",
"sha256:4259ea7cb2c238355ee13275eddd261d869cefbdeb18a65f35459589d6d17def",
"sha256:4addcf93234b6122f530f90f485fd3d00d158911fbc1ed24db3fa66cd49fe565",
"sha256:50c0e24bcbce9c54346f4a2f4e97b0ed111f0413ac3fe9954061ae1c8aa7021f",
"sha256:62ed7597d9e54db6e133420d779c642503c25eba390e1178d85dfb2ba0d05948",
"sha256:69f6d51e41a17f6a5f70c56bb10b8ded9f299609204495a7fa2782a3a755ffc5",
"sha256:6d232e49b74e3d2db22c63c25a9a0166d965e87e2b057f795487f1f244b61d9d",
"sha256:7355bf757ecacd5f0ac9dd9523c8e1a1103faadf8d33c22664178e17533f8ce5",
"sha256:886b1045c5105631f10c1cbc999f910e44d33af3e9c7efd68c2123efc06ab636",
"sha256:9e1f353edd7fc7e5e9101abd5bc0201946f77a1b59e0da49095086c03db856ed",
"sha256:b3a343dfcbe296dbe0f26c731beee72a792ff948407e6979524298ae7bc3234e",
"sha256:d93675af09ca497a25f4f8d62f3313cf0f21e45427a87487049fe84898b99909",
"sha256:e2409ef9d37804dfb566f39c962e6ed70f281ff516b8131b3e6b4e6442711ff1",
"sha256:f8b653b0f89938ba72e92ab080c2f3aa24c1b72e2f61add22880cd1b9a6e3cdd"
],
"index": "pypi",
"version": "==2.2.3"
},
"mistune": {
"hashes": [
"sha256:b4c512ce2fc99e5a62eb95a4aba4b73e5f90264115c40b70a21e1f7d4e0eac91",
"sha256:bc10c33bfdcaa4e749b779f62f60d6e12f8215c46a292d05e486b869ae306619"
],
"version": "==0.8.3"
},
"nbconvert": {
"hashes": [
"sha256:12b1a4671d4463ab73af6e4cbcc965b62254e05d182cd54995dda0d0ef9e2db9",
"sha256:260d390b989a647575b8ecae2cd06a9eaead10d396733d6e50185d5ebd08996e"
],
"version": "==5.3.1"
},
"nbformat": {
"hashes": [
"sha256:b9a0dbdbd45bb034f4f8893cafd6f652ea08c8c1674ba83f2dc55d3955743b0b",
"sha256:f7494ef0df60766b7cabe0a3651556345a963b74dbc16bc7c18479041170d402"
],
"version": "==4.4.0"
},
"notebook": {
"hashes": [
"sha256:66dd59e76e755584ae9450eb015c39f55d4bb1d8ec68f2c694d2b3cba7bf5c7e",
"sha256:e2c8e931cc19db4f8c63e6a396efbc13a228b2cb5b2919df011b946f28239a08"
],
"version": "==5.6.0"
},
"numpy": {
"hashes": [
"sha256:1c362ad12dd09a43b348bb28dd2295dd9cdf77f41f0f45965e04ba97f525b864",
"sha256:2156a06bd407918df4ac0122df6497a9c137432118f585e5b17d543e593d1587",
"sha256:24e4149c38489b51fc774b1e1faa9103e82f73344d7a00ba66f6845ab4769f3f",
"sha256:340ec1697d9bb3a9c464028af7a54245298502e91178bddb4c37626d36e197b7",
"sha256:35db8d419345caa4eeaa65cd63f34a15208acd87530a30f0bc25fc84f55c8c80",
"sha256:361370e9b7f5e44c41eee29f2bb5cb3b755abb4b038bce6d6cbe08db7ff9cb74",
"sha256:36e8dcd1813ca92ce7e4299120cee6c03adad33d89b54862c1b1a100443ac399",
"sha256:378378973546ecc1dfaf9e24c160d683dd04df871ecd2dcc86ce658ca20f92c0",
"sha256:419e6faee16097124ee627ed31572c7e80a1070efa25260b78097cca240e219a",
"sha256:4287104c24e6a09b9b418761a1e7b1bbde65105f110690ca46a23600a3c606b8",
"sha256:549f3e9778b148a47f4fb4682955ed88057eb627c9fe5467f33507c536deda9d",
"sha256:5e359e9c531075220785603e5966eef20ccae9b3b6b8a06fdfb66c084361ce92",
"sha256:5ee7f3dbbdba0da75dec7e94bd7a2b10fe57a83e1b38e678200a6ad8e7b14fdc",
"sha256:62d55e96ec7b117d3d5e618c15efcf769e70a6effaee5842857b64fb4883887a",
"sha256:719b6789acb2bc86ea9b33a701d7c43dc2fc56d95107fd3c5b0a8230164d4dfb",
"sha256:7a70f2b60d48828cba94a54a8776b61a9c2657a803d47f5785f8062e3a9c7c55",
"sha256:7b9e37f194f8bcdca8e9e6af92e2cbad79e360542effc2dd6b98d63955d8d8a3",
"sha256:83b8fc18261b70f45bece2d392537c93dc81eb6c539a16c9ac994c47fc79f09a",
"sha256:9473ad28375710ab18378e72b59422399b27e957e9339c413bf00793b4b12df0",
"sha256:95b085b253080e5d09f7826f5e27dce067bae813a132023a77b739614a29de6e",
"sha256:98b86c62c08c2e5dc98a9c856d4a95329d11b1c6058cb9b5191d5ea6891acd09",
"sha256:a3bd01d6d3ed3d7c06d7f9979ba5d68281f15383fafd53b81aa44b9191047cf8",
"sha256:c81a6afc1d2531a9ada50b58f8c36197f8418ef3d0611d4c1d7af93fdcda764f",
"sha256:ce75ed495a746e3e78cfa22a77096b3bff2eda995616cb7a542047f233091268",
"sha256:dae8618c0bcbfcf6cf91350f8abcdd84158323711566a8c5892b5c7f832af76f",
"sha256:df0b02c6705c5d1c25cc35c7b5d6b6f9b3b30833f9d178843397ae55ecc2eebb",
"sha256:e3660744cda0d94b90141cdd0db9308b958a372cfeee8d7188fdf5ad9108ea82",
"sha256:f2362d0ca3e16c37782c1054d7972b8ad2729169567e3f0f4e5dd3cdf85f188e"
],
"markers": "python_version != '3.2.*' and python_version != '3.0.*' and python_version != '3.1.*' and python_version >= '2.7' and python_version != '3.3.*'",
"version": "==1.15.1"
},
"pandas": {
"hashes": [
"sha256:11975fad9edbdb55f1a560d96f91830e83e29bed6ad5ebf506abda09818eaf60",
"sha256:12e13d127ca1b585dd6f6840d3fe3fa6e46c36a6afe2dbc5cb0b57032c902e31",
"sha256:1c87fcb201e1e06f66e23a61a5fea9eeebfe7204a66d99df24600e3f05168051",
"sha256:242e9900de758e137304ad4b5663c2eff0d798c2c3b891250bd0bd97144579da",
"sha256:26c903d0ae1542890cb9abadb4adcb18f356b14c2df46e4ff657ae640e3ac9e7",
"sha256:2e1e88f9d3e5f107b65b59cd29f141995597b035d17cc5537e58142038942e1a",
"sha256:31b7a48b344c14691a8e92765d4023f88902ba3e96e2e4d0364d3453cdfd50db",
"sha256:4fd07a932b4352f8a8973761ab4e84f965bf81cc750fb38e04f01088ab901cb8",
"sha256:5b24ca47acf69222e82530e89111dd9d14f9b970ab2cd3a1c2c78f0c4fbba4f4",
"sha256:647b3b916cc8f6aeba240c8171be3ab799c3c1b2ea179a3be0bd2712c4237553",
"sha256:66b060946046ca27c0e03e9bec9bba3e0b918bafff84c425ca2cc2e157ce121e",
"sha256:6efa9fa6e1434141df8872d0fa4226fc301b17aacf37429193f9d70b426ea28f",
"sha256:be4715c9d8367e51dbe6bc6d05e205b1ae234f0dc5465931014aa1c4af44c1ba",
"sha256:bea90da782d8e945fccfc958585210d23de374fa9294a9481ed2abcef637ebfc",
"sha256:d785fc08d6f4207437e900ffead930a61e634c5e4f980ba6d3dc03c9581748c7",
"sha256:de9559287c4fe8da56e8c3878d2374abc19d1ba2b807bfa7553e912a8e5ba87c",
"sha256:f4f98b190bb918ac0bc0e3dd2ab74ff3573da9f43106f6dba6385406912ec00f",
"sha256:f71f1a7e2d03758f6e957896ed696254e2bc83110ddbc6942018f1a232dd9dad",
"sha256:fb944c8f0b0ab5c1f7846c686bc4cdf8cde7224655c12edcd59d5212cd57bec0"
],
"index": "pypi",
"version": "==0.23.4"
},
"pandocfilters": {
"hashes": [
"sha256:b3dd70e169bb5449e6bc6ff96aea89c5eea8c5f6ab5e207fc2f521a2cf4a0da9"
],
"version": "==1.4.2"
},
"parso": {
"hashes": [
"sha256:35704a43a3c113cce4de228ddb39aab374b8004f4f2407d070b6a2ca784ce8a2",
"sha256:895c63e93b94ac1e1690f5fdd40b65f07c8171e3e53cbd7793b5b96c0e0a7f24"
],
"version": "==0.3.1"
},
"pexpect": {
"hashes": [
"sha256:2a8e88259839571d1251d278476f3eec5db26deb73a70be5ed5dc5435e418aba",
"sha256:3fbd41d4caf27fa4a377bfd16fef87271099463e6fa73e92a52f92dfee5d425b"
],
"markers": "sys_platform != 'win32'",
"version": "==4.6.0"
},
"pickleshare": {
"hashes": [
"sha256:84a9257227dfdd6fe1b4be1319096c20eb85ff1e82c7932f36efccfe1b09737b",
"sha256:c9a2541f25aeabc070f12f452e1f2a8eae2abd51e1cd19e8430402bdf4c1d8b5"
],
"version": "==0.7.4"
},
"prometheus-client": {
"hashes": [
"sha256:17bc24c09431644f7c65d7bce9f4237252308070b6395d6d8e87767afe867e24"
],
"version": "==0.3.1"
},
"prompt-toolkit": {
"hashes": [
"sha256:1df952620eccb399c53ebb359cc7d9a8d3a9538cb34c5a1344bdbeb29fbcc381",
"sha256:3f473ae040ddaa52b52f97f6b4a493cfa9f5920c255a12dc56a7d34397a398a4",
"sha256:858588f1983ca497f1cf4ffde01d978a3ea02b01c8a26a8bbc5cd2e66d816917"
],
"version": "==1.0.15"
},
"ptyprocess": {
"hashes": [
"sha256:923f299cc5ad920c68f2bc0bc98b75b9f838b93b599941a6b63ddbc2476394c0",
"sha256:d7cc528d76e76342423ca640335bd3633420dc1366f258cb31d05e865ef5ca1f"
],
"markers": "os_name != 'nt'",
"version": "==0.6.0"
},
"pygments": {
"hashes": [
"sha256:78f3f434bcc5d6ee09020f92ba487f95ba50f1e3ef83ae96b9d5ffa1bab25c5d",
"sha256:dbae1046def0efb574852fab9e90209b23f556367b5a320c0bcb871c77c3e8cc"
],
"version": "==2.2.0"
},
"pyparsing": {
"hashes": [
"sha256:0832bcf47acd283788593e7a0f542407bd9550a55a8a8435214a1960e04bcb04",
"sha256:fee43f17a9c4087e7ed1605bd6df994c6173c1e977d7ade7b651292fab2bd010"
],
"version": "==2.2.0"
},
"pyrsistent": {
"hashes": [
"sha256:4024f838472cba9ea1ccbc638e0bcafec2efda28594a9905177ec365f1a95fea"
],
"version": "==0.14.4"
},
"python-dateutil": {
"hashes": [
"sha256:1adb80e7a782c12e52ef9a8182bebeb73f1d7e24e374397af06fb4956c8dc5c0",
"sha256:e27001de32f627c22380a688bcc43ce83504a7bc5da472209b4c70f02829f0b8"
],
"version": "==2.7.3"
},
"pytz": {
"hashes": [
"sha256:a061aa0a9e06881eb8b3b2b43f05b9439d6583c206d0a6c340ff72a7b6669053",
"sha256:ffb9ef1de172603304d9d2819af6f5ece76f2e85ec10692a524dd876e72bf277"
],
"version": "==2018.5"
},
"pyzmq": {
"hashes": [
"sha256:25a0715c8f69cf72f67cfe5a68a3f3ed391c67c063d2257bec0fe7fc2c7f08f8",
"sha256:2bab63759632c6b9e0d5bf19cc63c3b01df267d660e0abcf230cf0afaa966349",
"sha256:30ab49d99b24bf0908ebe1cdfa421720bfab6f93174e4883075b7ff38cc555ba",
"sha256:32c7ca9fc547a91e3c26fc6080b6982e46e79819e706eb414dd78f635a65d946",
"sha256:41219ae72b3cc86d97557fe5b1ef5d1adc1057292ec597b50050874a970a39cf",
"sha256:4b8c48a9a13cea8f1f16622f9bd46127108af14cd26150461e3eab71e0de3e46",
"sha256:55724997b4a929c0d01b43c95051318e26ddbae23565018e138ae2dc60187e59",
"sha256:65f0a4afae59d4fc0aad54a917ab599162613a761b760ba167d66cc646ac3786",
"sha256:6f88591a8b246f5c285ee6ce5c1bf4f6bd8464b7f090b1333a446b6240a68d40",
"sha256:75022a4c60dcd8765bb9ca32f6de75a0ec83b0d96e0309dc479f4c7b21f26cb7",
"sha256:76ea493bfab18dcb090d825f3662b5612e2def73dffc196d51a5194b0294a81d",
"sha256:7b60c045b80709e4e3c085bab9b691e71761b44c2b42dbb047b8b498e7bc16b3",
"sha256:8e6af2f736734aef8ed6f278f9f552ec7f37b1a6b98e59b887484a840757f67d",
"sha256:9ac2298e486524331e26390eac14e4627effd3f8e001d4266ed9d8f1d2d31cce",
"sha256:9ba650f493a9bc1f24feca1d90fce0e5dd41088a252ac9840131dfbdbf3815ca",
"sha256:a02a4a385e394e46012dc83d2e8fd6523f039bb52997c1c34a2e0dd49ed839c1",
"sha256:a3ceee84114d9f5711fa0f4db9c652af0e4636c89eabc9b7f03a3882569dd1ed",
"sha256:a72b82ac1910f2cf61a49139f4974f994984475f771b0faa730839607eeedddf",
"sha256:ab136ac51027e7c484c53138a0fab4a8a51e80d05162eb7b1585583bcfdbad27",
"sha256:c095b224300bcac61e6c445e27f9046981b1ac20d891b2f1714da89d34c637c8",
"sha256:c5cc52d16c06dc2521340d69adda78a8e1031705924e103c0eb8fc8af861d810",
"sha256:d612e9833a89e8177f8c1dc68d7b4ff98d3186cd331acd616b01bbdab67d3a7b",
"sha256:e828376a23c66c6fe90dcea24b4b72cd774f555a6ee94081670872918df87a19",
"sha256:e9767c7ab2eb552796440168d5c6e23a99ecaade08dda16266d43ad461730192",
"sha256:ebf8b800d42d217e4710d1582b0c8bff20cdcb4faad7c7213e52644034300924"
],
"markers": "python_version != '3.2*' and python_version != '3.1*' and python_version != '3.0*' and python_version >= '2.7'",
"version": "==17.1.2"
},
"qtconsole": {
"hashes": [
"sha256:298431d376d71a02eb1a04fe6e72dd4beb82b83423d58b17d532e0af838e62fa",
"sha256:7870b19e6a6b0ab3acc09ee65463c0ca7568b3a01a6902d7c4e1ed2c4fc4e176"
],
"version": "==4.4.1"
},
"rpy2": {
"hashes": [
"sha256:004d13734a7b9a85cbc1e7a93ec87df741e28db1273ab5b0d9efaac04a9c5f98"
],
"index": "pypi",
"version": "==2.8.6"
},
"savreaderwriter": {
"hashes": [
"sha256:868fe96db95706eb17168f9ccb5d5827e3bf9e7f11bb6ab6b47970654d980e89"
],
"index": "pypi",
"version": "==3.4.2"
},
"scikit-learn": {
"hashes": [
"sha256:0a718b5ffbd5053fb3f9e1a2e20b7c4f256dd8035e246b907d3117d20bac0260",
"sha256:1725540b754a9967778e9385e1ee2c8db50d5ab70ed835c9f5e36002ffabc169",
"sha256:3e3ce307d7c5c5811658ba8686b24b571a8244eaafe707665ad601f400d5ce98",
"sha256:42ad71502237c9fe300ecf157f5a394df717789a2dde541dd7034b539c70bdcc",
"sha256:42cba716db197e0d1670e2fc13c4cc4a86d5c5358120ccfee6ec427b154e74ff",
"sha256:47b4090b7686642e41176becb7c42ef3cc665d7ee0db5e7ea5d307ec9779327e",
"sha256:51d99a08c8bf689cf60c9d8dca6e3d3e5f6d762def85ad735dcea11fb528a89b",
"sha256:5f7577fbb2399a4712e96cf0e786638168940a876c33735a1b5d5a86ba4b1370",
"sha256:66bfc2b6b15db1725d03ea657ec9184ff09dcbf1ecd834ef85f2edc2c9cbba97",
"sha256:69a34d389d9ca4687ad00af4e11d53686771f484c37366f68617ef656bab16ab",
"sha256:75297f3dd6685f01555f1bb75846995d45650af417280b69c81bf11b6987aed5",
"sha256:9ebb38ab1d0ee143982aed561811903ac6c1abb512ae2b9019b3b65bde63ffb9",
"sha256:a402c1484fe65df42d5dbc22a58e0695fe3afe2b0b229aee2a09c6d60ba8e5c2",
"sha256:aad6b9aac1617bd7efa0450643888bbd3410679a94bc8680d9863825686ef369",
"sha256:ad4db28d3dc16c01df75ed6efb72524537de3839a5d179fcf94094359fc72ec5",
"sha256:b276739a5f863ccacb61999a3067d0895ee291c95502929b2ae56ea1f882e888",
"sha256:b3dc88c4d2bcb26ffc5afe16d053ae28317d7d1de083651defcd5453a04f1563",
"sha256:b3e4681253e95da5aa5c231889a32b084fd997962bf8beda6f796bf422f734b2",
"sha256:c3d852d49d6c1710089d4513702099fa6f8e1aebfedf222319d80c47b0a195f8",
"sha256:c6612e7e43988b8b5e1957150449493a55f9c059de641083df7a964f86f2d1e7",
"sha256:c69e5c6051366a6ac9600d730276db939b1a205e42504ec0b8371f154b0058db",
"sha256:ce121baa8e85ec27c3065281657dcd78adaab7dcb046c7fe96ad4e5a9dcb6610",
"sha256:ed2a9a9bea6ec443b7effe5695c9c168b7bf9a67df6d880729760feda871b6a3",
"sha256:efd842d70b87e3ef3429c3149840b9189d4441ca951ab0cec62c94a964e219d9",
"sha256:f1428af5c381f6eef30ffbc7e047b7c713d4efa5d7bf5e57b62b3fc8d387044b",
"sha256:f6c7bf8cd4de1640b760b47f4d28deb26dbbf9acbe0194cdff54a898e190d872",
"sha256:f8329ac2160ad8bbbac6a507374685ceca3f24ca427fa9ee61a501280e1972d9",
"sha256:fefba2a43b92f8393366093b60efbe984a72a2b41cce16b4002005e4104ef938"
],
"version": "==0.19.2"
},
"scipy": {
"hashes": [
"sha256:0611ee97296265af4a21164a5323f8c1b4e8e15c582d3dfa7610825900136bb7",
"sha256:08237eda23fd8e4e54838258b124f1cd141379a5f281b0a234ca99b38918c07a",
"sha256:0e645dbfc03f279e1946cf07c9c754c2a1859cb4a41c5f70b25f6b3a586b6dbd",
"sha256:0e9bb7efe5f051ea7212555b290e784b82f21ffd0f655405ac4f87e288b730b3",
"sha256:108c16640849e5827e7d51023efb3bd79244098c3f21e4897a1007720cb7ce37",
"sha256:340ef70f5b0f4e2b4b43c8c8061165911bc6b2ad16f8de85d9774545e2c47463",
"sha256:3ad73dfc6f82e494195144bd3a129c7241e761179b7cb5c07b9a0ede99c686f3",
"sha256:3b243c77a822cd034dad53058d7c2abf80062aa6f4a32e9799c95d6391558631",
"sha256:404a00314e85eca9d46b80929571b938e97a143b4f2ddc2b2b3c91a4c4ead9c5",
"sha256:423b3ff76957d29d1cce1bc0d62ebaf9a3fdfaf62344e3fdec14619bb7b5ad3a",
"sha256:42d9149a2fff7affdd352d157fa5717033767857c11bd55aa4a519a44343dfef",
"sha256:625f25a6b7d795e8830cb70439453c9f163e6870e710ec99eba5722775b318f3",
"sha256:698c6409da58686f2df3d6f815491fd5b4c2de6817a45379517c92366eea208f",
"sha256:729f8f8363d32cebcb946de278324ab43d28096f36593be6281ca1ee86ce6559",
"sha256:8190770146a4c8ed5d330d5b5ad1c76251c63349d25c96b3094875b930c44692",
"sha256:878352408424dffaa695ffedf2f9f92844e116686923ed9aa8626fc30d32cfd1",
"sha256:8b984f0821577d889f3c7ca8445564175fb4ac7c7f9659b7c60bef95b2b70e76",
"sha256:8f841bbc21d3dad2111a94c490fb0a591b8612ffea86b8e5571746ae76a3deac",
"sha256:c22b27371b3866c92796e5d7907e914f0e58a36d3222c5d436ddd3f0e354227a",
"sha256:d0cdd5658b49a722783b8b4f61a6f1f9c75042d0e29a30ccb6cacc9b25f6d9e2",
"sha256:d40dc7f494b06dcee0d303e51a00451b2da6119acbeaccf8369f2d29e28917ac",
"sha256:d8491d4784aceb1f100ddb8e31239c54e4afab8d607928a9f7ef2469ec35ae01",
"sha256:dfc5080c38dde3f43d8fbb9c0539a7839683475226cf83e4b24363b227dfe552",
"sha256:e24e22c8d98d3c704bb3410bce9b69e122a8de487ad3dbfe9985d154e5c03a40",
"sha256:e7a01e53163818d56eabddcafdc2090e9daba178aad05516b20c6591c4811020",
"sha256:ee677635393414930541a096fc8e61634304bb0153e4e02b75685b11eba14cae",
"sha256:f0521af1b722265d824d6ad055acfe9bd3341765735c44b5a4d0069e189a0f40",
"sha256:f25c281f12c0da726c6ed00535ca5d1622ec755c30a3f8eafef26cf43fede694"
],
"markers": "python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.1.*' and python_version != '3.3.*' and python_version != '3.2.*'",
"version": "==1.1.0"
},
"seaborn": {
"hashes": [
"sha256:42e627b24e849c2d3bbfd059e00005f6afbc4a76e4895baf44ae23fe8a4b09a5",
"sha256:76c83f794ca320fb6b23a7c6192d5e185a5fcf4758966a0c0a54baee46d41e2f"
],
"index": "pypi",
"version": "==0.9.0"
},
"send2trash": {
"hashes": [
"sha256:60001cc07d707fe247c94f74ca6ac0d3255aabcb930529690897ca2a39db28b2",
"sha256:f1691922577b6fa12821234aeb57599d887c4900b9ca537948d2dac34aea888b"
],
"version": "==1.5.0"
},
"simplegeneric": {
"hashes": [
"sha256:dc972e06094b9af5b855b3df4a646395e43d1c9d0d39ed345b7393560d0b9173"
],
"version": "==0.8.1"
},
"six": {
"hashes": [
"sha256:70e8a77beed4562e7f14fe23a786b54f6296e34344c23bc42f07b15018ff98e9",
"sha256:832dc0e10feb1aa2c68dcc57dbb658f1c7e65b9b61af69048abc87a2db00a0eb"
],
"version": "==1.11.0"
},
"sklearn": {
"hashes": [
"sha256:e23001573aa194b834122d2b9562459bf5ae494a2d59ca6b8aa22c85a44c0e31"
],
"index": "pypi",
"version": "==0.0"
},
"terminado": {
"hashes": [
"sha256:55abf9ade563b8f9be1f34e4233c7b7bde726059947a593322e8a553cc4c067a",
"sha256:65011551baff97f5414c67018e908110693143cfbaeb16831b743fe7cad8b927"
],
"version": "==0.8.1"
},
"testpath": {
"hashes": [
"sha256:039fa6a6c9fd3488f8336d23aebbfead5fa602c4a47d49d83845f55a595ec1b4",
"sha256:0d5337839c788da5900df70f8e01015aec141aa3fe7936cb0d0a2953f7ac7609"
],
"version": "==0.3.1"
},
"tornado": {
"hashes": [
"sha256:1c0816fc32b7d31b98781bd8ebc7a9726d7dce67407dc353a2e66e697e138448",
"sha256:4f66a2172cb947387193ca4c2c3e19131f1c70fa8be470ddbbd9317fd0801582",
"sha256:5327ba1a6c694e0149e7d9126426b3704b1d9d520852a3e4aa9fc8fe989e4046",
"sha256:6a7e8657618268bb007646b9eae7661d0b57f13efc94faa33cd2588eae5912c9",
"sha256:a9b14804783a1d77c0bd6c66f7a9b1196cbddfbdf8bceb64683c5ae60bd1ec6f",
"sha256:c58757e37c4a3172949c99099d4d5106e4d7b63aa0617f9bb24bfbff712c7866",
"sha256:d8984742ce86c0855cccecd5c6f54a9f7532c983947cff06f3a0e2115b47f85c"
],
"markers": "python_version != '3.3.*' and python_version >= '2.7' and python_version != '3.1.*' and python_version != '3.0.*' and python_version != '3.2.*'",
"version": "==5.1"
},
"traitlets": {
"hashes": [
"sha256:9c4bd2d267b7153df9152698efb1050a5d84982d3384a37b2c1f7723ba3e7835",
"sha256:c6cb5e6f57c5a9bdaa40fa71ce7b4af30298fbab9ece9815b5d995ab6217c7d9"
],
"version": "==4.3.2"
},
"watermark": {
"hashes": [
"sha256:1530bf12a729ab701458cb1d8365621688c2757b0b1ef1d426fe0f8bfec0b61e",
"sha256:7bdc31a0ab6e80968a3d79507ea993fbf8a422eb7a0f6277db9d1e54011e7342"
],
"index": "pypi",
"version": "==1.6.1"
},
"wcwidth": {
"hashes": [
"sha256:3df37372226d6e63e1b1e1eda15c594bca98a22d33a23832a90998faa96bc65e",
"sha256:f4ebe71925af7b40a864553f761ed559b43544f8f71746c2d756c7fe788ade7c"
],
"version": "==0.1.7"
},
"webencodings": {
"hashes": [
"sha256:a0af1213f3c2226497a97e2b3aa01a7e4bee4f403f95be16fc9acd2947514a78",
"sha256:b36a1c245f2d304965eb4e0a82848379241dc04b865afcc4aab16748587e1923"
],
"version": "==0.5.1"
},
"widgetsnbextension": {
"hashes": [
"sha256:7e8fc9688d4fb68c96537ce00604cf8d3bbf48bd348f2c4dfb91174c308b1e10",
"sha256:c9d6e426a1d79d132b57b93b368feba2c66eb7b0fd34bdb901716b4b88e94497"
],
"version": "==3.4.0"
}
},
"develop": {
"appdirs": {
"hashes": [
"sha256:9e5896d1372858f8dd3344faf4e5014d21849c756c8d5701f78f8a103b372d92",
"sha256:d8b24664561d0d34ddfaec54636d502d7cea6e29c3eaf68f3df6180863e2166e"
],
"version": "==1.4.3"
},
"attrs": {
"hashes": [
"sha256:4b90b09eeeb9b88c35bc642cbac057e45a5fd85367b985bd2809c62b7b939265",
"sha256:e0d0eb91441a3b53dab4d9b743eafc1ac44476296a2053b6ca3af0b139faf87b"
],
"version": "==18.1.0"
},
"backcall": {
"hashes": [
"sha256:38ecd85be2c1e78f77fd91700c76e14667dc21e2713b63876c0eb901196e01e4",
"sha256:bbbf4b1e5cd2bdb08f915895b51081c041bac22394fdfcfdfbe9f14b77c08bf2"
],
"version": "==0.1.0"
},
"black": {
"hashes": [
"sha256:22158b89c1a6b4eb333a1e65e791a3f8b998cf3b11ae094adb2570f31f769a44",
"sha256:4b475bbd528acce094c503a3d2dbc2d05a4075f6d0ef7d9e7514518e14cc5191"
],
"index": "pypi",
"version": "==18.6b4"
},
"blackcellmagic": {
"hashes": [
"sha256:6b3804c8851591804bcdc5635c8d55b6b2a50874df63ba40a20d258cf79049b0"
],
"index": "pypi",
"version": "==0.0.1"
},
"click": {
"hashes": [
"sha256:29f99fc6125fbc931b758dc053b3114e55c77a6e4c6c3a2674a2dc986016381d",
"sha256:f15516df478d5a56180fbf80e68f206010e6d160fc39fa508b65e035fd75130b"
],
"version": "==6.7"
},
"decorator": {
"hashes": [
"sha256:2c51dff8ef3c447388fe5e4453d24a2bf128d3a4c32af3fabef1f01c6851ab82",
"sha256:c39efa13fbdeb4506c476c9b3babf6a718da943dab7811c206005a4a956c080c"
],
"version": "==4.3.0"
},
"ipython": {
"hashes": [
"sha256:007dcd929c14631f83daff35df0147ea51d1af420da303fd078343878bd5fb62",
"sha256:b0f2ef9eada4a68ef63ee10b6dde4f35c840035c50fd24265f8052c98947d5a4"
],
"markers": "python_version >= '3.3'",
"version": "==6.5.0"
},
"ipython-genutils": {
"hashes": [
"sha256:72dd37233799e619666c9f639a9da83c34013a73e8bbc79a7a6348d93c61fab8",
"sha256:eb2e116e75ecef9d4d228fdc66af54269afa26ab4463042e33785b887c628ba8"
],
"version": "==0.2.0"
},
"jedi": {
"hashes": [
"sha256:b409ed0f6913a701ed474a614a3bb46e6953639033e31f769ca7581da5bd1ec1",
"sha256:c254b135fb39ad76e78d4d8f92765ebc9bf92cbc76f49e97ade1d5f5121e1f6f"
],
"version": "==0.12.1"
},
"parso": {
"hashes": [
"sha256:35704a43a3c113cce4de228ddb39aab374b8004f4f2407d070b6a2ca784ce8a2",
"sha256:895c63e93b94ac1e1690f5fdd40b65f07c8171e3e53cbd7793b5b96c0e0a7f24"
],
"version": "==0.3.1"
},
"pexpect": {
"hashes": [
"sha256:2a8e88259839571d1251d278476f3eec5db26deb73a70be5ed5dc5435e418aba",
"sha256:3fbd41d4caf27fa4a377bfd16fef87271099463e6fa73e92a52f92dfee5d425b"
],
"markers": "sys_platform != 'win32'",
"version": "==4.6.0"
},
"pickleshare": {
"hashes": [
"sha256:84a9257227dfdd6fe1b4be1319096c20eb85ff1e82c7932f36efccfe1b09737b",
"sha256:c9a2541f25aeabc070f12f452e1f2a8eae2abd51e1cd19e8430402bdf4c1d8b5"
],
"version": "==0.7.4"
},
"prompt-toolkit": {
"hashes": [
"sha256:1df952620eccb399c53ebb359cc7d9a8d3a9538cb34c5a1344bdbeb29fbcc381",
"sha256:3f473ae040ddaa52b52f97f6b4a493cfa9f5920c255a12dc56a7d34397a398a4",
"sha256:858588f1983ca497f1cf4ffde01d978a3ea02b01c8a26a8bbc5cd2e66d816917"
],
"version": "==1.0.15"
},
"ptyprocess": {
"hashes": [
"sha256:923f299cc5ad920c68f2bc0bc98b75b9f838b93b599941a6b63ddbc2476394c0",
"sha256:d7cc528d76e76342423ca640335bd3633420dc1366f258cb31d05e865ef5ca1f"
],
"markers": "os_name != 'nt'",
"version": "==0.6.0"
},
"pygments": {
"hashes": [
"sha256:78f3f434bcc5d6ee09020f92ba487f95ba50f1e3ef83ae96b9d5ffa1bab25c5d",
"sha256:dbae1046def0efb574852fab9e90209b23f556367b5a320c0bcb871c77c3e8cc"
],
"version": "==2.2.0"
},
"simplegeneric": {
"hashes": [
"sha256:dc972e06094b9af5b855b3df4a646395e43d1c9d0d39ed345b7393560d0b9173"
],
"version": "==0.8.1"
},
"six": {
"hashes": [
"sha256:70e8a77beed4562e7f14fe23a786b54f6296e34344c23bc42f07b15018ff98e9",
"sha256:832dc0e10feb1aa2c68dcc57dbb658f1c7e65b9b61af69048abc87a2db00a0eb"
],
"version": "==1.11.0"
},
"toml": {
"hashes": [
"sha256:8e86bd6ce8cc11b9620cb637466453d94f5d57ad86f17e98a98d1f73e3baab2d"
],
"version": "==0.9.4"
},
"traitlets": {
"hashes": [
"sha256:9c4bd2d267b7153df9152698efb1050a5d84982d3384a37b2c1f7723ba3e7835",
"sha256:c6cb5e6f57c5a9bdaa40fa71ce7b4af30298fbab9ece9815b5d995ab6217c7d9"
],
"version": "==4.3.2"
},
"wcwidth": {
"hashes": [
"sha256:3df37372226d6e63e1b1e1eda15c594bca98a22d33a23832a90998faa96bc65e",
"sha256:f4ebe71925af7b40a864553f761ed559b43544f8f71746c2d756c7fe788ade7c"
],
"version": "==0.1.7"
}
}
}

View file

@ -1,21 +1,18 @@
# Tidy Data
The purpose of this repository is to re-do the work described in the paper
[Tidy Data](tidy-data.pdf) by Hadley Wickham (member of the RStudio team) in
Python.
The purpose of this repository is to illustrate how the data cleaning process described
in the paper "[Tidy Data](tidy-data.pdf)" by Hadley Wickham, a member of the
[RStudio](https://rstudio.com/) team, can be done in
[Python](https://www.python.org/).
The paper was published in 2014 in the Journal of
[Statistical Software](https://www.jstatsoft.org/article/view/v059i10). The
author offers it for free download
[here](http://vita.had.co.nz/papers/tidy-data.html). Furthermore, the original
R code is available in a Github
[repository](https://github.com/hadley/tidy-data).
The paper was published in 2014 in the [Journal of Statistical Software](https://www.jstatsoft.org/article/view/v059i10).
The author offers it for free [here](http://vita.had.co.nz/papers/tidy-data.html).
Furthermore, the original [R](https://www.r-project.org/) code is available [here](https://github.com/hadley/tidy-data).
After installing this project, it is recommended to first read the paper to get
the big picture and then work through the six Jupyter notebooks (listed further
below).
After installing the dependencies for this project (cf., the [installation notes](https://github.com/webartifex/tidy-data#installation)
below), it is recommended to first read the paper to get the big picture and
then work through the six Jupyter notebooks listed below.
See installation notes at the bottom.
## Summary
@ -23,50 +20,51 @@ See installation notes at the bottom.
### Definition
**Tidy** data is defined as data that comes in a table form adhering to the
following requirements:
following requirements:
1. each variable is a column,
2. each observation a row, and
3. each type of observational unit forms a table.
1. Each variable forms a column.
2. Each observation forms a row.
3. Each type of observational unit forms a table.
This is equivalent to Codd's 3rd normal form (in the context of relational
databases). A dataset that does not satisfy these properties is called
**messy**.
This is equivalent to [Codd's 3rd normal form](https://en.wikipedia.org/wiki/Third_normal_form),
a concept from the theory on relational databases.
A dataset that does *not* satisfy these properties is called **messy**.
### Tidying messy Data
### Tidying Data
The five most common problems with messy data are as follows:
The five most common problems with messy data are:
- Column headers are values, not variable names
[[notebook](1_column_headers_are_values.ipynb)]
- Multiple variables are stored in one column
[[notebook](2_multiple_variables_stored_in_one_column.ipynb)]
- Variables are stored in both rows and columns
[[notebook](3_variables_are_stored_in_both_rows_and_columns.ipynb)]
- Multiple types of observational units are stored in the same table
[[notebook](4_multiple_types_in_one_table.ipynb)]
- A single observational unit is stored in multiple tables
[[notebook](5_one_type_in_multiple_tables.ipynb)]
- column headers are values, not variable names
(cf., [notebook 1](https://nbviewer.jupyter.org/github/webartifex/tidy-data/blob/master/1_column_headers_are_values.ipynb))
- multiple variables are stored in one column
(cf., [notebook 2](https://nbviewer.jupyter.org/github/webartifex/tidy-data/blob/master/2_multiple_variables_stored_in_one_column.ipynb))
- variables are stored in both rows and columns
(cf., [notebook 3](https://nbviewer.jupyter.org/github/webartifex/tidy-data/blob/master/3_variables_are_stored_in_both_rows_and_columns.ipynb))
- multiple types of observational units are stored in the same table
(cf., [notebook 4](https://nbviewer.jupyter.org/github/webartifex/tidy-data/blob/master/4_multiple_types_in_one_table.ipynb))
- a single observational unit is stored in multiple tables
(cf., [notebook 5](https://nbviewer.jupyter.org/github/webartifex/tidy-data/blob/master/5_one_type_in_multiple_tables.ipynb))
Further, a [case study](6_case_study.ipynb) shows the advantages of tidy data
(as standardized input/output to statistical functions).
## Download & Installation
### Case Study
Create a local copy of this repository with:
A case study (cf., [notebook 6](https://nbviewer.jupyter.org/github/webartifex/tidy-data/blob/master/6_case_study.ipynb))
shows the advantages of tidy data as a standardized input to statistical functions.
## Installation
Get a local copy of this repository with [git](https://git-scm.com/).
`git clone https://github.com/webartifex/tidy-data.git`
This project uses [pipenv](https://docs.pipenv.org/) to manage its
dependencies.
If you are not familiar with [git](https://git-scm.com/), simply download the latest
version of the files in a zip archive [here](https://github.com/webartifex/tidy-data/archive/master.zip).
To install all third-party Python packages in the most recent version into a
project-local virtual environment, run:
This project uses [poetry](https://python-poetry.org/docs/) to manage its dependencies.
Install all third-party packages into a [virtual environment](https://docs.python.org/3/library/venv.html).
`pipenv install`
`poetry install`
To install all packages with the same version as of the time of creating this
project (for exact reproducibility), run:
`pipenv install --ignore-pipfile`
Alternatively, use the [Anaconda Distribution](https://www.anaconda.com/products/individual)
that *should* also suffice to run the provided notebooks.

1558
poetry.lock generated Normal file

File diff suppressed because it is too large Load diff

37
pyproject.toml Normal file
View file

@ -0,0 +1,37 @@
[build-system]
build-backend = "poetry.masonry.api"
requires = ["poetry>=0.12"]
[tool.poetry]
name = "tidy-data"
version = "0.1.0"
authors = ["Alexander Hess <alexander@webartifex.biz>"]
description = "A Python implementation of Hadley Wickham's Tidy Data paper"
keywords = [
"data-cleaning",
"data-science",
"messy-data",
"python",
"tidy-data",
]
license = "MIT"
[tool.poetry.dependencies]
python = "^3.7"
# Data Science Tools
jupyterlab = "^2.2.6"
matplotlib = "^3.3.1"
numpy = "^1.19.1"
pandas = "^1.1.1"
seaborn = "^0.10.1"
sklearn = "^0.0"
# Interfaces to other tools
rpy2 = "==2.8.*" # R support
savreaderwriter = "^3.4.2" # IBM SPSS support
# Code Formatters
black = "^19.10b0"
nb_black = "^1.0.7"