Update the project for 2020
- replace pipenv with poetry
- update the README.md:
* streamline the text
* update links to notebooks with nbviewer
* update installation notes with poetry info
- streamline the notebooks:
* use backticks in MarkDown cells to make references to
columns in DataFrames clearer
* blacken all code cells
- add MIT license
- ignore .venv/ and .python-version
This commit is contained in:
parent
4cec950887
commit
a3a17236a2
13 changed files with 1975 additions and 1158 deletions
3
.gitignore
vendored
3
.gitignore
vendored
|
|
@ -1,2 +1,3 @@
|
||||||
.ipynb_checkpoints/
|
.ipynb_checkpoints/
|
||||||
|
.python-version
|
||||||
|
.venv/
|
||||||
|
|
|
||||||
|
|
@ -6,7 +6,7 @@
|
||||||
"source": [
|
"source": [
|
||||||
"# Column Headers are Values, not Variable Names\n",
|
"# Column Headers are Values, not Variable Names\n",
|
||||||
"\n",
|
"\n",
|
||||||
"This notebook shows two examples of how column headers display values. These type of messy datasets have practical use in two types of settings:\n",
|
"This notebook shows two examples of how column headers display values. These types of messy datasets have practical use in two kinds of settings:\n",
|
||||||
"\n",
|
"\n",
|
||||||
"1. Presentations\n",
|
"1. Presentations\n",
|
||||||
"2. Recordings of regularly spaced observations over time"
|
"2. Recordings of regularly spaced observations over time"
|
||||||
|
|
@ -23,24 +23,9 @@
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 1,
|
"execution_count": 1,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"2018-08-26 14:39:56 CEST\n",
|
|
||||||
"\n",
|
|
||||||
"CPython 3.6.5\n",
|
|
||||||
"IPython 6.5.0\n",
|
|
||||||
"\n",
|
|
||||||
"numpy 1.15.1\n",
|
|
||||||
"pandas 0.23.4\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"% load_ext watermark\n",
|
"%load_ext lab_black"
|
||||||
"% watermark -d -t -v -z -p numpy,pandas"
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|
@ -90,32 +75,35 @@
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"columns = ['q16', 'reltrad', 'income']\n",
|
"columns = [\"q16\", \"reltrad\", \"income\"]\n",
|
||||||
"encodings = {}\n",
|
"encodings = {}\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# For sake of simplicity all data cleaning operations\n",
|
"# For the sake of simplicity, all data cleaning operations\n",
|
||||||
"# are done within the for-loop for all columns.\n",
|
"# are done within the for-loop for all columns.\n",
|
||||||
"with spss.SavHeaderReader('data/pew.sav') as pew:\n",
|
"with spss.SavHeaderReader(\"data/pew.sav\") as pew:\n",
|
||||||
" for c in columns:\n",
|
" for column in columns:\n",
|
||||||
" encodings[c] = {\n",
|
" encodings[column] = {\n",
|
||||||
" int(k): (\n",
|
" int(key): (\n",
|
||||||
" re.sub(r'\\(.*\\)', '', (\n",
|
" re.sub(\n",
|
||||||
" v.decode('iso-8859-1')\n",
|
" r\"\\(.*\\)\",\n",
|
||||||
" .replace('\\x92', \"'\")\n",
|
" \"\",\n",
|
||||||
" .replace(' Churches', '')\n",
|
" (\n",
|
||||||
" .replace('Less than $10,000', '<$10k')\n",
|
" value.decode(\"iso-8859-1\")\n",
|
||||||
" .replace('10 to under $20,000', '$10-20k')\n",
|
" .replace(\"\\x92\", \"'\")\n",
|
||||||
" .replace('20 to under $30,000', '$20-30k')\n",
|
" .replace(\" Churches\", \"\")\n",
|
||||||
" .replace('30 to under $40,000', '$30-40k')\n",
|
" .replace(\"Less than $10,000\", \"<$10k\")\n",
|
||||||
" .replace('40 to under $50,000', '$40-50k')\n",
|
" .replace(\"10 to under $20,000\", \"$10-20k\")\n",
|
||||||
" .replace('50 to under $75,000', '$50-75k')\n",
|
" .replace(\"20 to under $30,000\", \"$20-30k\")\n",
|
||||||
" .replace('75 to under $100,000', '$75-100k')\n",
|
" .replace(\"30 to under $40,000\", \"$30-40k\")\n",
|
||||||
" .replace('100 to under $150,000', '$100-150k')\n",
|
" .replace(\"40 to under $50,000\", \"$40-50k\")\n",
|
||||||
" .replace('$150,000 or more', '>150k')\n",
|
" .replace(\"50 to under $75,000\", \"$50-75k\")\n",
|
||||||
|
" .replace(\"75 to under $100,000\", \"$75-100k\")\n",
|
||||||
|
" .replace(\"100 to under $150,000\", \"$100-150k\")\n",
|
||||||
|
" .replace(\"$150,000 or more\", \">150k\")\n",
|
||||||
" ),\n",
|
" ),\n",
|
||||||
" ).strip()\n",
|
" ).strip()\n",
|
||||||
" )\n",
|
" )\n",
|
||||||
" for (k, v) in pew.all().valueLabels[c.encode()].items()\n",
|
" for (key, value) in pew.all().valueLabels[column.encode()].items()\n",
|
||||||
" }"
|
" }"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
|
@ -132,25 +120,36 @@
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"with spss.SavReader('data/pew.sav', selectVars=[c.encode() for c in columns]) as pew:\n",
|
"with spss.SavReader(\n",
|
||||||
|
" \"data/pew.sav\", selectVars=[column.encode() for column in columns]\n",
|
||||||
|
") as pew:\n",
|
||||||
" pew = list(pew)\n",
|
" pew = list(pew)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# Use the above encodings to map the numeric data\n",
|
"# Use the above encodings to map the numeric data\n",
|
||||||
"# to the actual labels.\n",
|
"# to the actual labels.\n",
|
||||||
"pew = pd.DataFrame(pew, columns=columns, dtype=int)\n",
|
"pew = pd.DataFrame(pew, columns=columns, dtype=int)\n",
|
||||||
"for c in columns:\n",
|
"for column in columns:\n",
|
||||||
" pew[c] = pew[c].map(encodings[c])\n",
|
" pew[column] = pew[column].map(encodings[column])\n",
|
||||||
"\n",
|
"\n",
|
||||||
"for v in ('Atheist', 'Agnostic'):\n",
|
"for value in (\"Atheist\", \"Agnostic\"):\n",
|
||||||
" pew.loc[(pew['q16'] == v), 'reltrad'] = v\n",
|
" pew.loc[(pew[\"q16\"] == value), \"reltrad\"] = value\n",
|
||||||
"\n",
|
"\n",
|
||||||
"income_columns = ['<$10k', '$10-20k', '$20-30k', '$30-40k', '$40-50k', '$50-75k',\n",
|
"income_columns = [\n",
|
||||||
" '$75-100k', '$100-150k', '>150k', 'Don\\'t know/Refused']\n",
|
" \"<$10k\",\n",
|
||||||
"\n",
|
" \"$10-20k\",\n",
|
||||||
"pew = pew.groupby(['reltrad', 'income']).size().unstack('income')\n",
|
" \"$20-30k\",\n",
|
||||||
|
" \"$30-40k\",\n",
|
||||||
|
" \"$40-50k\",\n",
|
||||||
|
" \"$50-75k\",\n",
|
||||||
|
" \"$75-100k\",\n",
|
||||||
|
" \"$100-150k\",\n",
|
||||||
|
" \">150k\",\n",
|
||||||
|
" \"Don't know/Refused\",\n",
|
||||||
|
"]\n",
|
||||||
"\n",
|
"\n",
|
||||||
|
"pew = pew.groupby([\"reltrad\", \"income\"]).size().unstack(\"income\")\n",
|
||||||
"pew = pew[income_columns]\n",
|
"pew = pew[income_columns]\n",
|
||||||
"pew.index.name = 'religion'"
|
"pew.index.name = \"religion\""
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|
@ -426,9 +425,9 @@
|
||||||
"\n",
|
"\n",
|
||||||
"> This dataset has **three** variables, **religion**, **income** and **frequency**. To tidy it, we need to **melt**, or stack it. In other words, we need to turn columns into rows.\n",
|
"> This dataset has **three** variables, **religion**, **income** and **frequency**. To tidy it, we need to **melt**, or stack it. In other words, we need to turn columns into rows.\n",
|
||||||
"\n",
|
"\n",
|
||||||
"pandas provides a [pd.melt](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.melt.html) function to un-pivot the dataset.\n",
|
"pandas provides a [pd.melt()](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.melt.html) function to un-pivot the dataset.\n",
|
||||||
"\n",
|
"\n",
|
||||||
"**Notes:** *reset_index()* transforms the religion index column into a data column (*pd.melt()* needs that). Further, the resulting table is sorted implicitly by the *religion* column. To get to the same ordering as in the paper, the molten table is explicitly sorted."
|
"**Notes:** `.reset_index()` transforms the religion index column into a data column (`pd.melt()` needs that). Further, the resulting table is sorted implicitly by the `\"religion\"` column. To get to the same ordering as in the paper, the molten table is explicitly sorted."
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|
@ -437,7 +436,7 @@
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"molten_pew = pd.melt(pew.reset_index(), id_vars=['religion'], value_name='frequency')"
|
"molten_pew = pd.melt(pew.reset_index(), id_vars=[\"religion\"], value_name=\"frequency\")"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|
@ -448,8 +447,8 @@
|
||||||
"source": [
|
"source": [
|
||||||
"# Create an ordered column for the income labels.\n",
|
"# Create an ordered column for the income labels.\n",
|
||||||
"income_dtype = pd.api.types.CategoricalDtype(income_columns, ordered=True)\n",
|
"income_dtype = pd.api.types.CategoricalDtype(income_columns, ordered=True)\n",
|
||||||
"molten_pew['income'] = molten_pew['income'].astype(income_dtype)\n",
|
"molten_pew[\"income\"] = molten_pew[\"income\"].astype(income_dtype)\n",
|
||||||
"molten_pew = molten_pew.sort_values(['religion', 'income']).reset_index(drop=True)"
|
"molten_pew = molten_pew.sort_values([\"religion\", \"income\"]).reset_index(drop=True)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|
@ -616,37 +615,40 @@
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# Usage of \"1st\", \"2nd\", \"3rd\" should be forbidden by law :)\n",
|
"# Usage of \"1st\", \"2nd\", \"3rd\" should be forbidden by law :)\n",
|
||||||
"usecols = ['artist.inverted', 'track', 'time', 'date.entered'] + (\n",
|
"usecols = [\"artist.inverted\", \"track\", \"time\", \"date.entered\"] + (\n",
|
||||||
" [f'x{i}st.week' for i in range(1, 76, 10) if i != 11]\n",
|
" [f\"x{i}st.week\" for i in range(1, 76, 10) if i != 11]\n",
|
||||||
" + [f'x{i}nd.week' for i in range(2, 76, 10) if i != 12]\n",
|
" + [f\"x{i}nd.week\" for i in range(2, 76, 10) if i != 12]\n",
|
||||||
" + [f'x{i}rd.week' for i in range(3, 76, 10) if i != 13]\n",
|
" + [f\"x{i}rd.week\" for i in range(3, 76, 10) if i != 13]\n",
|
||||||
" + [f'x{i}th.week' for i in range(1, 76) if (i % 10) not in (1, 2, 3)]\n",
|
" + [f\"x{i}th.week\" for i in range(1, 76) if (i % 10) not in (1, 2, 3)]\n",
|
||||||
" + [f'x11th.week', f'x12th.week', f'x13th.week']\n",
|
" + [f\"x11th.week\", f\"x12th.week\", f\"x13th.week\"]\n",
|
||||||
|
")\n",
|
||||||
|
"billboard = pd.read_csv(\n",
|
||||||
|
" \"data/billboard.csv\",\n",
|
||||||
|
" encoding=\"iso-8859-1\",\n",
|
||||||
|
" parse_dates=[\"date.entered\"],\n",
|
||||||
|
" usecols=usecols,\n",
|
||||||
")\n",
|
")\n",
|
||||||
"\n",
|
"\n",
|
||||||
"billboard = pd.read_csv('data/billboard.csv', encoding='iso-8859-1',\n",
|
"billboard = billboard.assign(year=lambda x: x[\"date.entered\"].dt.year)\n",
|
||||||
" parse_dates=['date.entered'], usecols=usecols)\n",
|
|
||||||
"\n",
|
|
||||||
"billboard = billboard.assign(year=lambda x: x['date.entered'].dt.year)\n",
|
|
||||||
"\n",
|
"\n",
|
||||||
"# Rename the week columns.\n",
|
"# Rename the week columns.\n",
|
||||||
"week_columns = {\n",
|
"week_columns = {\n",
|
||||||
" c: ('wk' + re.sub(r'[^\\d]+', '', c))\n",
|
" column: (\"wk\" + re.sub(r\"[^\\d]+\", \"\", column))\n",
|
||||||
" for c in billboard.columns\n",
|
" for column in billboard.columns\n",
|
||||||
" if c.endswith('.week')\n",
|
" if column.endswith(\".week\")\n",
|
||||||
"}\n",
|
"}\n",
|
||||||
"billboard = billboard.rename(columns={'artist.inverted': 'artist', **week_columns})\n",
|
"billboard = billboard.rename(columns={\"artist.inverted\": \"artist\", **week_columns})\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# Ensure the columns' order is the same as in the paper.\n",
|
"# Ensure the columns' order is the same as in the paper.\n",
|
||||||
"columns = ['year', 'artist', 'track', 'time', 'date.entered'] + [\n",
|
"columns = [\"year\", \"artist\", \"track\", \"time\", \"date.entered\"] + [\n",
|
||||||
" f'wk{i}' for i in range(1, 76)\n",
|
" f\"wk{i}\" for i in range(1, 76)\n",
|
||||||
"]\n",
|
"]\n",
|
||||||
"billboard = billboard[columns]\n",
|
"billboard = billboard[columns]\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# Ensure the rows' order is similar as in the paper.\n",
|
"# Ensure the rows' order is similar as in the paper.\n",
|
||||||
"# For unknown reasons the exact ordering as in the paper cannot be reconstructed.\n",
|
"# For unknown reasons the exact ordering as in the paper cannot be reconstructed.\n",
|
||||||
"billboard = billboard[billboard['year'] == 2000]\n",
|
"billboard = billboard[billboard[\"year\"] == 2000]\n",
|
||||||
"billboard = billboard.sort_values(['artist', 'track'])"
|
"billboard = billboard.sort_values([\"artist\", \"track\"])"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|
@ -986,17 +988,17 @@
|
||||||
"14 2000 Aaliyah Try Again \n",
|
"14 2000 Aaliyah Try Again \n",
|
||||||
"200 2000 Adams, Yolanda Open My Heart \n",
|
"200 2000 Adams, Yolanda Open My Heart \n",
|
||||||
"\n",
|
"\n",
|
||||||
" time date.entered wk1 wk2 wk3 wk4 wk5 ... wk66 wk67 wk68 \\\n",
|
" time date.entered wk1 wk2 wk3 wk4 wk5 ... wk66 wk67 wk68 \\\n",
|
||||||
"246 4:22 2000-02-26 87 82.0 72.0 77.0 87.0 ... NaN NaN NaN \n",
|
"246 4:22 2000-02-26 87 82.0 72.0 77.0 87.0 ... NaN NaN NaN \n",
|
||||||
"287 3:15 2000-09-02 91 87.0 92.0 NaN NaN ... NaN NaN NaN \n",
|
"287 3:15 2000-09-02 91 87.0 92.0 NaN NaN ... NaN NaN NaN \n",
|
||||||
"24 3:53 2000-04-08 81 70.0 68.0 67.0 66.0 ... NaN NaN NaN \n",
|
"24 3:53 2000-04-08 81 70.0 68.0 67.0 66.0 ... NaN NaN NaN \n",
|
||||||
"193 4:24 2000-10-21 76 76.0 72.0 69.0 67.0 ... NaN NaN NaN \n",
|
"193 4:24 2000-10-21 76 76.0 72.0 69.0 67.0 ... NaN NaN NaN \n",
|
||||||
"69 3:35 2000-04-15 57 34.0 25.0 17.0 17.0 ... NaN NaN NaN \n",
|
"69 3:35 2000-04-15 57 34.0 25.0 17.0 17.0 ... NaN NaN NaN \n",
|
||||||
"22 3:24 2000-08-19 51 39.0 34.0 26.0 26.0 ... NaN NaN NaN \n",
|
"22 3:24 2000-08-19 51 39.0 34.0 26.0 26.0 ... NaN NaN NaN \n",
|
||||||
"304 3:44 2000-07-08 97 97.0 96.0 95.0 100.0 ... NaN NaN NaN \n",
|
"304 3:44 2000-07-08 97 97.0 96.0 95.0 100.0 ... NaN NaN NaN \n",
|
||||||
"135 4:15 2000-01-29 84 62.0 51.0 41.0 38.0 ... NaN NaN NaN \n",
|
"135 4:15 2000-01-29 84 62.0 51.0 41.0 38.0 ... NaN NaN NaN \n",
|
||||||
"14 4:03 2000-03-18 59 53.0 38.0 28.0 21.0 ... NaN NaN NaN \n",
|
"14 4:03 2000-03-18 59 53.0 38.0 28.0 21.0 ... NaN NaN NaN \n",
|
||||||
"200 5:30 2000-08-26 76 76.0 74.0 69.0 68.0 ... NaN NaN NaN \n",
|
"200 5:30 2000-08-26 76 76.0 74.0 69.0 68.0 ... NaN NaN NaN \n",
|
||||||
"\n",
|
"\n",
|
||||||
" wk69 wk70 wk71 wk72 wk73 wk74 wk75 \n",
|
" wk69 wk70 wk71 wk72 wk73 wk74 wk75 \n",
|
||||||
"246 NaN NaN NaN NaN NaN NaN NaN \n",
|
"246 NaN NaN NaN NaN NaN NaN NaN \n",
|
||||||
|
|
@ -1028,7 +1030,7 @@
|
||||||
"source": [
|
"source": [
|
||||||
"### \"Tidy\" Data\n",
|
"### \"Tidy\" Data\n",
|
||||||
"\n",
|
"\n",
|
||||||
"As before the *pd.melt* function is used to transform the data from \"wide\" to \"long\" form."
|
"As before the `pd.melt()` function is used to transform the data from \"wide\" to \"long\" form."
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|
@ -1039,9 +1041,9 @@
|
||||||
"source": [
|
"source": [
|
||||||
"molten_billboard = pd.melt(\n",
|
"molten_billboard = pd.melt(\n",
|
||||||
" billboard,\n",
|
" billboard,\n",
|
||||||
" id_vars=['year', 'artist', 'track', 'time', 'date.entered'],\n",
|
" id_vars=[\"year\", \"artist\", \"track\", \"time\", \"date.entered\"],\n",
|
||||||
" var_name='week',\n",
|
" var_name=\"week\",\n",
|
||||||
" value_name='rank',\n",
|
" value_name=\"rank\",\n",
|
||||||
")"
|
")"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
|
@ -1049,7 +1051,7 @@
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"In contrast to R, pandas keeps (unneccesary) rows for weeks where the song was already out of the charts. These are discarded. Also, a new column *date* indicating when exactly a particular song was at a certain rank in the charts is added."
|
"In contrast to R, pandas keeps (unnecessary) rows for weeks where the song was already out of the charts. These are discarded. Also, a new column `\"date\"` indicating when exactly a particular song was at a certain rank in the charts is added."
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|
@ -1059,24 +1061,23 @@
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# pandas keeps \"wide\" variables that had missing values as rows.\n",
|
"# pandas keeps \"wide\" variables that had missing values as rows.\n",
|
||||||
"molten_billboard = molten_billboard[molten_billboard['rank'].notnull()]\n",
|
"molten_billboard = molten_billboard[molten_billboard[\"rank\"].notnull()]\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# Cast as integer after missing values are removed.\n",
|
"# Cast as integer after missing values are removed.\n",
|
||||||
"molten_billboard['week'] = molten_billboard['week'].map(lambda x: int(x[2:]))\n",
|
"molten_billboard[\"week\"] = molten_billboard[\"week\"].map(lambda x: int(x[2:]))\n",
|
||||||
"molten_billboard['rank'] = molten_billboard['rank'].map(int)\n",
|
"molten_billboard[\"rank\"] = molten_billboard[\"rank\"].map(int)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# Calculate the actual week from the date of first entering the list.\n",
|
"# Calculate the actual week from the date of first entering the list.\n",
|
||||||
"molten_billboard = molten_billboard.assign(\n",
|
"molten_billboard = molten_billboard.assign(\n",
|
||||||
" date=lambda x: x['date.entered'] + (x['week'] - 1) * datetime.timedelta(weeks=1)\n",
|
" date=lambda x: x[\"date.entered\"] + (x[\"week\"] - 1) * datetime.timedelta(weeks=1)\n",
|
||||||
")\n",
|
")\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# Sort rows and columns as in the paper.\n",
|
"# Sort rows and columns as in the paper.\n",
|
||||||
"molten_billboard = molten_billboard[\n",
|
"molten_billboard = molten_billboard[\n",
|
||||||
" ['year', 'artist', 'time', 'track', 'date', 'week', 'rank']\n",
|
" [\"year\", \"artist\", \"time\", \"track\", \"date\", \"week\", \"rank\"]\n",
|
||||||
"]\n",
|
"]\n",
|
||||||
"molten_billboard = (\n",
|
"molten_billboard = molten_billboard.sort_values([\"artist\", \"track\", \"week\"])\n",
|
||||||
" molten_billboard.sort_values(['artist', 'track', 'week']).reset_index(drop=True)\n",
|
"molten_billboard = molten_billboard.reset_index(drop=True)"
|
||||||
")"
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|
@ -1336,7 +1337,7 @@
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"molten_billboard.to_csv('data/billboard_cleaned.csv', index=False)"
|
"molten_billboard.to_csv(\"data/billboard_cleaned.csv\", index=False)"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
|
|
@ -1356,9 +1357,9 @@
|
||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.6.5"
|
"version": "3.7.9"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
"nbformat_minor": 2
|
"nbformat_minor": 4
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -20,24 +20,9 @@
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 1,
|
"execution_count": 1,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"2018-08-26 11:50:39 CEST\n",
|
|
||||||
"\n",
|
|
||||||
"CPython 3.6.5\n",
|
|
||||||
"IPython 6.5.0\n",
|
|
||||||
"\n",
|
|
||||||
"numpy 1.15.1\n",
|
|
||||||
"pandas 0.23.4\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"% load_ext watermark\n",
|
"%load_ext lab_black"
|
||||||
"% watermark -d -t -v -z -p numpy,pandas"
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|
@ -71,15 +56,30 @@
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"columns = ['iso2', 'year',\n",
|
"columns = [\n",
|
||||||
" 'new_sp_m014', 'new_sp_m1524', 'new_sp_m2534', 'new_sp_m3544',\n",
|
" \"iso2\",\n",
|
||||||
" 'new_sp_m4554', 'new_sp_m5564', 'new_sp_m65', 'new_sp_mu',\n",
|
" \"year\",\n",
|
||||||
" 'new_sp_f014', 'new_sp_f1524', 'new_sp_f2534', 'new_sp_f3544',\n",
|
" \"new_sp_m014\",\n",
|
||||||
" 'new_sp_f4554', 'new_sp_f5564', 'new_sp_f65', 'new_sp_fu']\n",
|
" \"new_sp_m1524\",\n",
|
||||||
"tb = pd.read_csv('data/tb.csv', usecols=columns)\n",
|
" \"new_sp_m2534\",\n",
|
||||||
|
" \"new_sp_m3544\",\n",
|
||||||
|
" \"new_sp_m4554\",\n",
|
||||||
|
" \"new_sp_m5564\",\n",
|
||||||
|
" \"new_sp_m65\",\n",
|
||||||
|
" \"new_sp_mu\",\n",
|
||||||
|
" \"new_sp_f014\",\n",
|
||||||
|
" \"new_sp_f1524\",\n",
|
||||||
|
" \"new_sp_f2534\",\n",
|
||||||
|
" \"new_sp_f3544\",\n",
|
||||||
|
" \"new_sp_f4554\",\n",
|
||||||
|
" \"new_sp_f5564\",\n",
|
||||||
|
" \"new_sp_f65\",\n",
|
||||||
|
" \"new_sp_fu\",\n",
|
||||||
|
"]\n",
|
||||||
|
"tb = pd.read_csv(\"data/tb.csv\", usecols=columns)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"rename = {c: c[7:] for c in columns if c.startswith('new_sp_')}\n",
|
"rename = {column: column[7:] for column in columns if column.startswith(\"new_sp_\")}\n",
|
||||||
"rename = {'iso2': 'country', **rename}\n",
|
"rename = {\"iso2\": \"country\", **rename}\n",
|
||||||
"tb = tb.rename(columns=rename)"
|
"tb = tb.rename(columns=rename)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
|
@ -89,7 +89,7 @@
|
||||||
"source": [
|
"source": [
|
||||||
"### Messy Data\n",
|
"### Messy Data\n",
|
||||||
"\n",
|
"\n",
|
||||||
"The data are assumed to be provided as below. Except for the *country* and *year* columns, the remaining columns are actually joint realizations of two variables **sex** and **age**."
|
"The data are assumed to be provided as below. Except for the `\"country\"` and `\"year\"` columns, the remaining columns are actually joint realizations of two variables `\"sex\"` and `\"age\"`."
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|
@ -385,7 +385,7 @@
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"tb[(tb['year'] == 2000)].head(10)"
|
"tb[(tb[\"year\"] == 2000)].head(10)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|
@ -394,7 +394,7 @@
|
||||||
"source": [
|
"source": [
|
||||||
"### Molten Data\n",
|
"### Molten Data\n",
|
||||||
"\n",
|
"\n",
|
||||||
"As in the previous notebook the [*pd.melt*](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.melt.html) function can be used to un-pivot the columns. As before, pandas keeps rows for columns with missing data that are then discarded (then, without any more missing values, the column's data type is casted as integer). Furthermore, the resulting *molten* dataset is sorted as in the paper."
|
"As in the previous notebook, the [pd.melt()](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.melt.html) function can be used to un-pivot the columns. As before, pandas keeps rows for columns with missing data that are discarded. Then, without any more missing values, the column's data type is cast as `int`. Furthermore, the resulting *molten* dataset is sorted as in the paper."
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|
@ -403,10 +403,12 @@
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"molten_tb = pd.melt(tb, id_vars=['country', 'year'], var_name='column', value_name='cases')\n",
|
"molten_tb = pd.melt(\n",
|
||||||
"molten_tb = molten_tb[molten_tb['cases'].notnull()]\n",
|
" tb, id_vars=[\"country\", \"year\"], var_name=\"column\", value_name=\"cases\"\n",
|
||||||
"molten_tb['cases'] = molten_tb['cases'].astype(int)\n",
|
")\n",
|
||||||
"molten_tb = molten_tb.sort_values(['country', 'year', 'column']).reset_index(drop=True)"
|
"molten_tb = molten_tb[molten_tb[\"cases\"].notnull()]\n",
|
||||||
|
"molten_tb[\"cases\"] = molten_tb[\"cases\"].astype(int)\n",
|
||||||
|
"molten_tb = molten_tb.sort_values([\"country\", \"year\", \"column\"]).reset_index(drop=True)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|
@ -536,7 +538,7 @@
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"molten_tb[(molten_tb['year'] == 2000)].head(10)"
|
"molten_tb[(molten_tb[\"year\"] == 2000)].head(10)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|
@ -545,7 +547,7 @@
|
||||||
"source": [
|
"source": [
|
||||||
"### Tidy Data\n",
|
"### Tidy Data\n",
|
||||||
"\n",
|
"\n",
|
||||||
"Using the [*pd.Series.str.extract*](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.str.extract.html) method the two variables are isolated. The age labels are renamed as in the paper."
|
"Using the [pd.Series.str.extract()](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.str.extract.html) method the two variables are isolated. The age labels are renamed as in the paper."
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|
@ -554,14 +556,21 @@
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"tidy_tb = molten_tb[['country', 'year', 'cases']]\n",
|
"tidy_tb = molten_tb[[\"country\", \"year\", \"cases\"]]\n",
|
||||||
"tidy_tb[['sex', 'age']] = molten_tb['column'].str.extract(r'(f|m)(.*)')\n",
|
"tidy_tb[[\"sex\", \"age\"]] = molten_tb[\"column\"].str.extract(r\"(f|m)(.*)\")\n",
|
||||||
"tidy_tb['age'] = tidy_tb['age'].map({\n",
|
"tidy_tb[\"age\"] = tidy_tb[\"age\"].map(\n",
|
||||||
" '014': '0-14', '1524': '15-24', '2534': '25-34',\n",
|
" {\n",
|
||||||
" '3544': '35-44', '4554': '45-54', '5564': '55-64',\n",
|
" \"014\": \"0-14\",\n",
|
||||||
" '65': '65+', 'u': 'unknown'\n",
|
" \"1524\": \"15-24\",\n",
|
||||||
"})\n",
|
" \"2534\": \"25-34\",\n",
|
||||||
"tidy_tb = tidy_tb[['country', 'year', 'sex', 'age', 'cases']]"
|
" \"3544\": \"35-44\",\n",
|
||||||
|
" \"4554\": \"45-54\",\n",
|
||||||
|
" \"5564\": \"55-64\",\n",
|
||||||
|
" \"65\": \"65+\",\n",
|
||||||
|
" \"u\": \"unknown\",\n",
|
||||||
|
" }\n",
|
||||||
|
")\n",
|
||||||
|
"tidy_tb = tidy_tb[[\"country\", \"year\", \"sex\", \"age\", \"cases\"]]"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|
@ -702,7 +711,7 @@
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"tidy_tb[(tidy_tb['year'] == 2000)].head(10)"
|
"tidy_tb[(tidy_tb[\"year\"] == 2000)].head(10)"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
|
|
@ -722,9 +731,9 @@
|
||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.6.5"
|
"version": "3.7.9"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
"nbformat_minor": 2
|
"nbformat_minor": 4
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -18,24 +18,9 @@
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 1,
|
"execution_count": 1,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"2018-08-26 12:56:31 CEST\n",
|
|
||||||
"\n",
|
|
||||||
"CPython 3.6.5\n",
|
|
||||||
"IPython 6.5.0\n",
|
|
||||||
"\n",
|
|
||||||
"numpy 1.15.1\n",
|
|
||||||
"pandas 0.23.4\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"% load_ext watermark\n",
|
"%load_ext lab_black"
|
||||||
"% watermark -d -t -v -z -p numpy,pandas"
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|
@ -54,7 +39,7 @@
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"pd.set_option('display.max_columns', 40)"
|
"pd.set_option(\"display.max_columns\", 40)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|
@ -83,38 +68,46 @@
|
||||||
"source": [
|
"source": [
|
||||||
"# Extract the data as one column and\n",
|
"# Extract the data as one column and\n",
|
||||||
"# use string slicing to obtain groups of columns.\n",
|
"# use string slicing to obtain groups of columns.\n",
|
||||||
"weather = pd.read_csv('data/weather.txt', header=None, sep='^')\n",
|
"weather = pd.read_csv(\"data/weather.txt\", header=None, sep=\"^\")\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# First, remove the weird character separators,\n",
|
"# First, remove the weird character separators,\n",
|
||||||
"# then split the columns by whitespace, and\n",
|
"# then split the columns by whitespace, and\n",
|
||||||
"# finally name them appropriately.\n",
|
"# finally name them appropriately.\n",
|
||||||
"days = (\n",
|
"days = (\n",
|
||||||
" weather[0]\n",
|
" weather[0]\n",
|
||||||
" .map(lambda x: x[21:]).str.replace('OI', ' ')\n",
|
" .map(lambda x: x[21:])\n",
|
||||||
" .str.replace('OS', ' ').str.replace('SI', ' ').str.replace('I', ' ')\n",
|
" .str.replace(\"OI\", \" \")\n",
|
||||||
" .str.replace('S', ' ').str.replace('B', ' ').str.replace('D', ' ')\n",
|
" .str.replace(\"OS\", \" \")\n",
|
||||||
" .map(str.lstrip).str.split(r'\\s+', expand=True)\n",
|
" .str.replace(\"SI\", \" \")\n",
|
||||||
")[list(range(31))].rename(columns={i: f'd{i+1}' for i in range(31)})\n",
|
" .str.replace(\"I\", \" \")\n",
|
||||||
|
" .str.replace(\"S\", \" \")\n",
|
||||||
|
" .str.replace(\"B\", \" \")\n",
|
||||||
|
" .str.replace(\"D\", \" \")\n",
|
||||||
|
" .map(str.lstrip)\n",
|
||||||
|
" .str.split(r\"\\s+\", expand=True)\n",
|
||||||
|
")[list(range(31))].rename(columns={i: f\"d{i+1}\" for i in range(31)})\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# The non-temperature columns can be extracted as simple slices.\n",
|
"# The non-temperature columns can be extracted as simple slices.\n",
|
||||||
"weather = pd.DataFrame(data={\n",
|
"weather = pd.DataFrame(\n",
|
||||||
" 'id': weather[0].map(lambda x: x[:11]),\n",
|
" data={\n",
|
||||||
" 'year': weather[0].map(lambda x: x[11:15]).astype(int),\n",
|
" \"id\": weather[0].map(lambda x: x[:11]),\n",
|
||||||
" 'month': weather[0].map(lambda x: x[15:17]).astype(int),\n",
|
" \"year\": weather[0].map(lambda x: x[11:15]).astype(int),\n",
|
||||||
" 'element': weather[0].map(lambda x: x[17:21]).str.lower(),\n",
|
" \"month\": weather[0].map(lambda x: x[15:17]).astype(int),\n",
|
||||||
"})\n",
|
" \"element\": weather[0].map(lambda x: x[17:21]).str.lower(),\n",
|
||||||
|
" }\n",
|
||||||
|
")\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# The temperatures were stored as whole integers\n",
|
"# The temperatures were stored as whole integers\n",
|
||||||
"# with -9999 indicating missing values.\n",
|
"# with -9999 indicating missing values.\n",
|
||||||
"for i in range(1, 32):\n",
|
"for i in range(1, 32):\n",
|
||||||
" weather[f'd{i}'] = days[f'd{i}'].astype(float) / 10\n",
|
" weather[f\"d{i}\"] = days[f\"d{i}\"].astype(float) / 10\n",
|
||||||
"weather = weather.replace(-999.9, np.NaN)\n",
|
"weather = weather.replace(-999.9, np.NaN)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# Discard the non-temperature observations and\n",
|
"# Discard the non-temperature observations and\n",
|
||||||
"# sort the dataset as in the paper.\n",
|
"# sort the dataset as in the paper.\n",
|
||||||
"weather = (\n",
|
"weather = (\n",
|
||||||
" weather[weather['element'].isin(['tmax', 'tmin'])]\n",
|
" weather[weather[\"element\"].isin([\"tmax\", \"tmin\"])]\n",
|
||||||
" .sort_values(['id', 'year', 'month', 'element'])\n",
|
" .sort_values([\"id\", \"year\", \"month\", \"element\"])\n",
|
||||||
" .reset_index(drop=True)\n",
|
" .reset_index(drop=True)\n",
|
||||||
")"
|
")"
|
||||||
]
|
]
|
||||||
|
|
@ -128,8 +121,7 @@
|
||||||
"Below is a dataset assumed to have been provided like this as \"raw\", i.e., the data analyst did not do the above parsing work but some third party instead.\n",
|
"Below is a dataset assumed to have been provided like this as \"raw\", i.e., the data analyst did not do the above parsing work but some third party instead.\n",
|
||||||
"\n",
|
"\n",
|
||||||
"> The most complicated form of messy data occurs when variables are stored in both rows and columns. Table 11 shows daily weather data from the Global Historical Climatology Network for one weather station (MX17004) in Mexico for five months in 2010. It has variables in\n",
|
"> The most complicated form of messy data occurs when variables are stored in both rows and columns. Table 11 shows daily weather data from the Global Historical Climatology Network for one weather station (MX17004) in Mexico for five months in 2010. It has variables in\n",
|
||||||
"individual columns (*id*, *year*, *month*), spread across columns (day, d1–d31) and across rows (*tmin*, *tmax*) (minimum and maximum temperature). Months with less than 31 days have\n",
|
"individual columns (`\"id\"`, `\"year\"`, `\"month\"`), spread across columns (day, `\"d1\"`–`\"d31\"`) and across rows (`\"tmin\"` and `\"tmax\"` for the minimum and maximum temperatures). Months with less than 31 days have missing values for the last day(s) of the month. The `\"element\"` column is not a variable: it stores the *names* of variables."
|
||||||
"structural missing values for the last day(s) of the month. The *element* column is not a variable; it stores the names of variables."
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|
@ -624,7 +616,7 @@
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"weather[(weather['year'] == 2010)].head(10)"
|
"weather[(weather[\"year\"] == 2010)].head(10)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|
@ -638,7 +630,7 @@
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"> To tidy this dataset we first melt it with colvars *id*, *year*, *month* and the column that contains variable names, *element* [...]. For presentation, we have dropped the missing values, making them implicit rather than explicit. This is permissible because we know how many days are in each month and can easily reconstruct the explicit missing values."
|
"> To tidy this dataset we first melt it with colvars `\"id\"`, `\"year\"`, `\"month\"`, and the column that contains the actual variable names, `\"element\"` [...]. For presentation, we have dropped the missing values, making them implicit rather than explicit. This is permissible because we know how many days are in each month and can easily reconstruct the explicit missing values."
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|
@ -649,27 +641,25 @@
|
||||||
"source": [
|
"source": [
|
||||||
"# Melt the dataset and extract a date column.\n",
|
"# Melt the dataset and extract a date column.\n",
|
||||||
"molten_weather = (\n",
|
"molten_weather = (\n",
|
||||||
" pd.melt(weather, id_vars=['id', 'year', 'month', 'element'], var_name='day')\n",
|
" pd.melt(weather, id_vars=[\"id\", \"year\", \"month\", \"element\"], var_name=\"day\")\n",
|
||||||
" .assign(day=lambda x: x['day'].str.extract('(\\d+)').astype(int))\n",
|
" .assign(day=lambda x: x[\"day\"].str.extract(\"(\\d+)\").astype(int))\n",
|
||||||
" .assign(date=lambda x: pd.to_datetime(x[['year', 'month', 'day']], errors='coerce'))\n",
|
" .assign(date=lambda x: pd.to_datetime(x[[\"year\", \"month\", \"day\"]], errors=\"coerce\"))\n",
|
||||||
")[['id', 'date', 'element', 'value']]\n",
|
")\n",
|
||||||
|
"molten_weather = molten_weather[[\"id\", \"date\", \"element\", \"value\"]]\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# Make the missing values implicit.\n",
|
"# Make the missing values implicit.\n",
|
||||||
"molten_weather = molten_weather[molten_weather['value'].notnull()]\n",
|
"molten_weather = molten_weather[molten_weather[\"value\"].notnull()]\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# Sort the data as in the paper.\n",
|
"# Sort the data as in the paper.\n",
|
||||||
"molten_weather = (\n",
|
"molten_weather = molten_weather.sort_values([\"id\", \"date\", \"element\"])\n",
|
||||||
" molten_weather\n",
|
"molten_weather = molten_weather.reset_index(drop=True)"
|
||||||
" .sort_values(['id', 'date', 'element'])\n",
|
|
||||||
" .reset_index(drop=True)\n",
|
|
||||||
")"
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"> This dataset is mostly tidy, but we have two variables stored in rows: *tmin* and *tmax*, the type of observation."
|
"> This dataset is mostly tidy, but we have two variables stored in rows: `\"tmin\"` and `\"tmax\"`, the type of observation."
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|
@ -799,7 +789,7 @@
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"molten_weather[(molten_weather['date'].dt.year == 2010)].head(10)"
|
"molten_weather[(molten_weather[\"date\"].dt.year == 2010)].head(10)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|
@ -815,7 +805,7 @@
|
||||||
"source": [
|
"source": [
|
||||||
"> Fixing this requires the cast, or unstack, operation. This performs the inverse of melting by rotating the element variable back out into the columns\n",
|
"> Fixing this requires the cast, or unstack, operation. This performs the inverse of melting by rotating the element variable back out into the columns\n",
|
||||||
"\n",
|
"\n",
|
||||||
"Note that [pd.DataFrame.unstack](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.unstack.html) method uses a DataFrame's index as columns to unstack over."
|
"Below, [pd.DataFrame.unstack()](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.unstack.html) pivots the innermost level of a DataFrame's index (here, `\"element\"`) into the columns."
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|
@ -824,7 +814,7 @@
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"tidy_weather = molten_weather.set_index(['id', 'date', 'element']).unstack()\n",
|
"tidy_weather = molten_weather.set_index([\"id\", \"date\", \"element\"]).unstack()\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# Make the column headers look as in the paper.\n",
|
"# Make the column headers look as in the paper.\n",
|
||||||
"tidy_weather.columns = tidy_weather.columns.droplevel(0)\n",
|
"tidy_weather.columns = tidy_weather.columns.droplevel(0)\n",
|
||||||
|
|
@ -966,7 +956,7 @@
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"tidy_weather[(tidy_weather['date'].dt.year == 2010)].head(10)"
|
"tidy_weather[(tidy_weather[\"date\"].dt.year == 2010)].head(10)"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
|
|
@ -986,9 +976,9 @@
|
||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.6.5"
|
"version": "3.7.9"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
"nbformat_minor": 2
|
"nbformat_minor": 4
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -20,24 +20,9 @@
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 1,
|
"execution_count": 1,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"2018-08-26 15:32:47 CEST\n",
|
|
||||||
"\n",
|
|
||||||
"CPython 3.6.5\n",
|
|
||||||
"IPython 6.5.0\n",
|
|
||||||
"\n",
|
|
||||||
"numpy 1.15.1\n",
|
|
||||||
"pandas 0.23.4\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"% load_ext watermark\n",
|
"%load_ext lab_black"
|
||||||
"% watermark -d -t -v -z -p numpy,pandas"
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|
@ -71,7 +56,7 @@
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"billboard = pd.read_csv('data/billboard_cleaned.csv')"
|
"billboard = pd.read_csv(\"data/billboard_cleaned.csv\")"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|
@ -81,7 +66,7 @@
|
||||||
"### Messy Data\n",
|
"### Messy Data\n",
|
||||||
"\n",
|
"\n",
|
||||||
"> The Billboard dataset described in Table 8 actually contains observations on two types of\n",
|
"> The Billboard dataset described in Table 8 actually contains observations on two types of\n",
|
||||||
"observational units: the *song* and its *rank* in each week. This manifests itself through the duplication of facts about the song: *artist* and *time* are repeated for every song in each *week*."
|
"observational units: the **song** and its **rank** in each week. This manifests itself through the duplication of facts about the song: `\"artist\"` and `\"time\"` are repeated for every song in each `\"week\"`."
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|
@ -325,9 +310,9 @@
|
||||||
"source": [
|
"source": [
|
||||||
"### Tidy Data\n",
|
"### Tidy Data\n",
|
||||||
"\n",
|
"\n",
|
||||||
"> The billboard dataset needs to be broken down into two datasets: a **song** dataset which stores *artist*, *song name* and *time*, and a **ranking** dataset which gives the *rank* of the song in each *week*.\n",
|
"> The billboard dataset needs to be broken down into two datasets: a **song** dataset which stores `\"artist\"`, `\"song name\"` and `\"time\"`, and a **ranking** dataset which gives the `\"rank\"` of the song in each `\"week\"`.\n",
|
||||||
"\n",
|
"\n",
|
||||||
"Transforming data columns into index columns is enough in pandas to obtain unique tuples from several columns. So no real \"function\" is needed to tidy up the dataset."
|
"Transforming data columns into index columns is enough in pandas to obtain unique `tuple`s from several columns. So, no real \"function\" is needed to tidy up the dataset."
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|
@ -338,26 +323,25 @@
|
||||||
"source": [
|
"source": [
|
||||||
"# Get the unique combinations for the song DataFrame and\n",
|
"# Get the unique combinations for the song DataFrame and\n",
|
||||||
"# \"store\" them in the original dataset for reuse.\n",
|
"# \"store\" them in the original dataset for reuse.\n",
|
||||||
"billboard = billboard.set_index(['artist', 'track', 'time'])\n",
|
"billboard = billboard.set_index([\"artist\", \"track\", \"time\"])\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# Create the song DataFrame.\n",
|
"# Create the song DataFrame.\n",
|
||||||
"songs = pd.DataFrame.from_records(\n",
|
"songs = pd.DataFrame.from_records(\n",
|
||||||
" columns=['id', 'artist', 'track', 'time'],\n",
|
" columns=[\"id\", \"artist\", \"track\", \"time\"],\n",
|
||||||
" data=[ # Combine enumerate with tuple unpacking\n",
|
" data=[ # Combine enumerate with tuple unpacking\n",
|
||||||
" (a + 1, b, c, d) # to create the ID column.\n",
|
" (a + 1, b, c, d) # to create the ID column.\n",
|
||||||
" for (a, (b, c, d))\n",
|
" for (a, (b, c, d)) in enumerate(billboard.index.unique())\n",
|
||||||
" in enumerate(billboard.index.unique())\n",
|
|
||||||
" ],\n",
|
" ],\n",
|
||||||
")\n",
|
")\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# Take the date and rank columns from the original dataset\n",
|
"# Take the date and rank columns from the original dataset\n",
|
||||||
"# and use the implicit index alignment to assign the songs' IDs.\n",
|
"# and use the implicit index alignment to assign the songs' IDs.\n",
|
||||||
"ranking = billboard[['date', 'rank']].copy()\n",
|
"ranking = billboard[[\"date\", \"rank\"]].copy()\n",
|
||||||
"ranking['id'] = songs.set_index(['artist', 'track', 'time'])\n",
|
"ranking[\"id\"] = songs.set_index([\"artist\", \"track\", \"time\"])\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# Use the song ID as the index as in the paper.\n",
|
"# Use the song ID as the index as in the paper.\n",
|
||||||
"ranking = ranking.reset_index(drop=True).set_index('id')\n",
|
"ranking = ranking.reset_index(drop=True).set_index(\"id\")\n",
|
||||||
"songs = songs.set_index('id')"
|
"songs = songs.set_index(\"id\")"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|
@ -700,9 +684,9 @@
|
||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.6.5"
|
"version": "3.7.9"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
"nbformat_minor": 2
|
"nbformat_minor": 4
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -6,7 +6,7 @@
|
||||||
"source": [
|
"source": [
|
||||||
"# One Type in multiple Tables\n",
|
"# One Type in multiple Tables\n",
|
||||||
"\n",
|
"\n",
|
||||||
"The repository with the original R code does not provide code for this case but only refers to other projects that cannot be replicated any more (source website not available any more)."
|
"The repository with the original R code does not provide code for this case but only refers to other projects that cannot be replicated any more (because the source website is *not* available any more)."
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|
@ -47,9 +47,9 @@
|
||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.6.5"
|
"version": "3.7.9"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
"nbformat_minor": 2
|
"nbformat_minor": 4
|
||||||
}
|
}
|
||||||
|
|
|
||||||
File diff suppressed because one or more lines are too long
19
LICENSE.txt
Normal file
19
LICENSE.txt
Normal file
|
|
@ -0,0 +1,19 @@
|
||||||
|
Copyright (c) 2018-2020 Alexander Hess [alexander@webartifex.biz]
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
in the Software without restriction, including without limitation the rights
|
||||||
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
copies of the Software, and to permit persons to whom the Software is
|
||||||
|
furnished to do so, subject to the following conditions:
|
||||||
|
|
||||||
|
The above copyright notice and this permission notice shall be included in all
|
||||||
|
copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
|
SOFTWARE.
|
||||||
24
Pipfile
24
Pipfile
|
|
@ -1,24 +0,0 @@
|
||||||
[[source]]
|
|
||||||
url = "https://pypi.org/simple"
|
|
||||||
verify_ssl = true
|
|
||||||
name = "pypi"
|
|
||||||
|
|
||||||
[packages]
|
|
||||||
pandas = "*"
|
|
||||||
jupyter = "*"
|
|
||||||
watermark = "*"
|
|
||||||
savreaderwriter = "*"
|
|
||||||
"rpy2" = "==2.8.*"
|
|
||||||
matplotlib = "*"
|
|
||||||
seaborn = "*"
|
|
||||||
sklearn = "*"
|
|
||||||
|
|
||||||
[dev-packages]
|
|
||||||
black = "*"
|
|
||||||
blackcellmagic = "*"
|
|
||||||
|
|
||||||
[requires]
|
|
||||||
python_version = "3.6"
|
|
||||||
|
|
||||||
[pipenv]
|
|
||||||
allow_prereleases = true
|
|
||||||
758
Pipfile.lock
generated
758
Pipfile.lock
generated
|
|
@ -1,758 +0,0 @@
|
||||||
{
|
|
||||||
"_meta": {
|
|
||||||
"hash": {
|
|
||||||
"sha256": "9fc4c60d75aac99be98f4bd18fa6b1bf507d093c96a4c639901a1d0746a83ace"
|
|
||||||
},
|
|
||||||
"pipfile-spec": 6,
|
|
||||||
"requires": {
|
|
||||||
"python_version": "3.6"
|
|
||||||
},
|
|
||||||
"sources": [
|
|
||||||
{
|
|
||||||
"name": "pypi",
|
|
||||||
"url": "https://pypi.org/simple",
|
|
||||||
"verify_ssl": true
|
|
||||||
}
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"default": {
|
|
||||||
"attrs": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:4b90b09eeeb9b88c35bc642cbac057e45a5fd85367b985bd2809c62b7b939265",
|
|
||||||
"sha256:e0d0eb91441a3b53dab4d9b743eafc1ac44476296a2053b6ca3af0b139faf87b"
|
|
||||||
],
|
|
||||||
"version": "==18.1.0"
|
|
||||||
},
|
|
||||||
"backcall": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:38ecd85be2c1e78f77fd91700c76e14667dc21e2713b63876c0eb901196e01e4",
|
|
||||||
"sha256:bbbf4b1e5cd2bdb08f915895b51081c041bac22394fdfcfdfbe9f14b77c08bf2"
|
|
||||||
],
|
|
||||||
"version": "==0.1.0"
|
|
||||||
},
|
|
||||||
"bleach": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:0ee95f6167129859c5dce9b1ca291ebdb5d8cd7e382ca0e237dfd0dad63f63d8",
|
|
||||||
"sha256:24754b9a7d530bf30ce7cbc805bc6cce785660b4a10ff3a43633728438c105ab"
|
|
||||||
],
|
|
||||||
"version": "==2.1.4"
|
|
||||||
},
|
|
||||||
"cycler": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:1d8a5ae1ff6c5cf9b93e8811e581232ad8920aeec647c37316ceac982b08cb2d",
|
|
||||||
"sha256:cd7b2d1018258d7247a71425e9f26463dfb444d411c39569972f4ce586b0c9d8"
|
|
||||||
],
|
|
||||||
"version": "==0.10.0"
|
|
||||||
},
|
|
||||||
"decorator": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:2c51dff8ef3c447388fe5e4453d24a2bf128d3a4c32af3fabef1f01c6851ab82",
|
|
||||||
"sha256:c39efa13fbdeb4506c476c9b3babf6a718da943dab7811c206005a4a956c080c"
|
|
||||||
],
|
|
||||||
"version": "==4.3.0"
|
|
||||||
},
|
|
||||||
"entrypoints": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:10ad569bb245e7e2ba425285b9fa3e8178a0dc92fc53b1e1c553805e15a8825b",
|
|
||||||
"sha256:d2d587dde06f99545fb13a383d2cd336a8ff1f359c5839ce3a64c917d10c029f"
|
|
||||||
],
|
|
||||||
"markers": "python_version >= '2.7'",
|
|
||||||
"version": "==0.2.3"
|
|
||||||
},
|
|
||||||
"html5lib": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:20b159aa3badc9d5ee8f5c647e5efd02ed2a66ab8d354930bd9ff139fc1dc0a3",
|
|
||||||
"sha256:66cb0dcfdbbc4f9c3ba1a63fdb511ffdbd4f513b2b6d81b80cd26ce6b3fb3736"
|
|
||||||
],
|
|
||||||
"version": "==1.0.1"
|
|
||||||
},
|
|
||||||
"ipykernel": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:395f020610e33ffa0b0c9c0cd1a1d927d51ab9aa9f30a7ae36bb0c908a33e89c",
|
|
||||||
"sha256:935941dba29d856eee34b8b5261d971bd5012547239ed73ddfff099143748c37",
|
|
||||||
"sha256:c091449dd0fad7710ddd9c4a06e8b9e15277da306590bc07a3a1afa6b4453c8f"
|
|
||||||
],
|
|
||||||
"version": "==4.8.2"
|
|
||||||
},
|
|
||||||
"ipython": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:007dcd929c14631f83daff35df0147ea51d1af420da303fd078343878bd5fb62",
|
|
||||||
"sha256:b0f2ef9eada4a68ef63ee10b6dde4f35c840035c50fd24265f8052c98947d5a4"
|
|
||||||
],
|
|
||||||
"markers": "python_version >= '3.3'",
|
|
||||||
"version": "==6.5.0"
|
|
||||||
},
|
|
||||||
"ipython-genutils": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:72dd37233799e619666c9f639a9da83c34013a73e8bbc79a7a6348d93c61fab8",
|
|
||||||
"sha256:eb2e116e75ecef9d4d228fdc66af54269afa26ab4463042e33785b887c628ba8"
|
|
||||||
],
|
|
||||||
"version": "==0.2.0"
|
|
||||||
},
|
|
||||||
"ipywidgets": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:100f4ea495e1fa2c1dfeabb68641af2302e65e877003f910be4e29f3aa68a0b2",
|
|
||||||
"sha256:fd24a66d82f2ea49e281da7714a7c656340d3ec24dff376b17590fa59469b817"
|
|
||||||
],
|
|
||||||
"version": "==7.4.0"
|
|
||||||
},
|
|
||||||
"jedi": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:b409ed0f6913a701ed474a614a3bb46e6953639033e31f769ca7581da5bd1ec1",
|
|
||||||
"sha256:c254b135fb39ad76e78d4d8f92765ebc9bf92cbc76f49e97ade1d5f5121e1f6f"
|
|
||||||
],
|
|
||||||
"version": "==0.12.1"
|
|
||||||
},
|
|
||||||
"jinja2": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:74c935a1b8bb9a3947c50a54766a969d4846290e1e788ea44c1392163723c3bd",
|
|
||||||
"sha256:f84be1bb0040caca4cea721fcbbbbd61f9be9464ca236387158b0feea01914a4"
|
|
||||||
],
|
|
||||||
"version": "==2.10"
|
|
||||||
},
|
|
||||||
"jsonschema": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:1ae9fe07aec50f07fc546a4432c93e2f23deeaa23bb67aef42d75d8aa496849b",
|
|
||||||
"sha256:aef58a18d83e4c5ea117d7ae1ba4238a6a84654fee6d0f32fd335ded63a1626e"
|
|
||||||
],
|
|
||||||
"version": "==3.0.0a2"
|
|
||||||
},
|
|
||||||
"jupyter": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:3e1f86076bbb7c8c207829390305a2b1fe836d471ed54be66a3b8c41e7f46cc7",
|
|
||||||
"sha256:5b290f93b98ffbc21c0c7e749f054b3267782166d72fa5e3ed1ed4eaf34a2b78",
|
|
||||||
"sha256:d9dc4b3318f310e34c82951ea5d6683f67bed7def4b259fafbfe4f1beb1d8e5f"
|
|
||||||
],
|
|
||||||
"index": "pypi",
|
|
||||||
"version": "==1.0.0"
|
|
||||||
},
|
|
||||||
"jupyter-client": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:27befcf0446b01e29853014d6a902dd101ad7d7f94e2252b1adca17c3466b761",
|
|
||||||
"sha256:59e6d791e22a8002ad0e80b78c6fd6deecab4f9e1b1aa1a22f4213de271b29ea"
|
|
||||||
],
|
|
||||||
"version": "==5.2.3"
|
|
||||||
},
|
|
||||||
"jupyter-console": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:3f928b817fc82cda95e431eb4c2b5eb21be5c483c2b43f424761a966bb808094",
|
|
||||||
"sha256:545dedd3aaaa355148093c5609f0229aeb121b4852995c2accfa64fe3e0e55cd"
|
|
||||||
],
|
|
||||||
"version": "==5.2.0"
|
|
||||||
},
|
|
||||||
"jupyter-core": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:927d713ffa616ea11972534411544589976b2493fc7e09ad946e010aa7eb9970",
|
|
||||||
"sha256:ba70754aa680300306c699790128f6fbd8c306ee5927976cbe48adacf240c0b7"
|
|
||||||
],
|
|
||||||
"version": "==4.4.0"
|
|
||||||
},
|
|
||||||
"kiwisolver": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:0ee4ed8b3ae8f5f712b0aa9ebd2858b5b232f1b9a96b0943dceb34df2a223bc3",
|
|
||||||
"sha256:0f7f532f3c94e99545a29f4c3f05637f4d2713e7fd91b4dd8abfc18340b86cd5",
|
|
||||||
"sha256:1a078f5dd7e99317098f0e0d490257fd0349d79363e8c923d5bb76428f318421",
|
|
||||||
"sha256:1aa0b55a0eb1bd3fa82e704f44fb8f16e26702af1a073cc5030eea399e617b56",
|
|
||||||
"sha256:2874060b91e131ceeff00574b7c2140749c9355817a4ed498e82a4ffa308ecbc",
|
|
||||||
"sha256:379d97783ba8d2934d52221c833407f20ca287b36d949b4bba6c75274bcf6363",
|
|
||||||
"sha256:3b791ddf2aefc56382aadc26ea5b352e86a2921e4e85c31c1f770f527eb06ce4",
|
|
||||||
"sha256:4329008a167fac233e398e8a600d1b91539dc33c5a3eadee84c0d4b04d4494fa",
|
|
||||||
"sha256:45813e0873bbb679334a161b28cb9606d9665e70561fd6caa8863e279b5e464b",
|
|
||||||
"sha256:53a5b27e6b5717bdc0125338a822605084054c80f382051fb945d2c0e6899a20",
|
|
||||||
"sha256:574f24b9805cb1c72d02b9f7749aa0cc0b81aa82571be5201aa1453190390ae5",
|
|
||||||
"sha256:66f82819ff47fa67a11540da96966fb9245504b7f496034f534b81cacf333861",
|
|
||||||
"sha256:79e5fe3ccd5144ae80777e12973027bd2f4f5e3ae8eb286cabe787bed9780138",
|
|
||||||
"sha256:83410258eb886f3456714eea4d4304db3a1fc8624623fc3f38a487ab36c0f653",
|
|
||||||
"sha256:8b6a7b596ce1d2a6d93c3562f1178ebd3b7bb445b3b0dd33b09f9255e312a965",
|
|
||||||
"sha256:9576cb63897fbfa69df60f994082c3f4b8e6adb49cccb60efb2a80a208e6f996",
|
|
||||||
"sha256:95a25d9f3449046ecbe9065be8f8380c03c56081bc5d41fe0fb964aaa30b2195",
|
|
||||||
"sha256:a424f048bebc4476620e77f3e4d1f282920cef9bc376ba16d0b8fe97eec87cde",
|
|
||||||
"sha256:aaec1cfd94f4f3e9a25e144d5b0ed1eb8a9596ec36d7318a504d813412563a85",
|
|
||||||
"sha256:acb673eecbae089ea3be3dcf75bfe45fc8d4dcdc951e27d8691887963cf421c7",
|
|
||||||
"sha256:b15bc8d2c2848a4a7c04f76c9b3dc3561e95d4dabc6b4f24bfabe5fd81a0b14f",
|
|
||||||
"sha256:b1c240d565e977d80c0083404c01e4d59c5772c977fae2c483f100567f50847b",
|
|
||||||
"sha256:c595693de998461bcd49b8d20568c8870b3209b8ea323b2a7b0ea86d85864694",
|
|
||||||
"sha256:ce3be5d520b4d2c3e5eeb4cd2ef62b9b9ab8ac6b6fedbaa0e39cdb6f50644278",
|
|
||||||
"sha256:e0f910f84b35c36a3513b96d816e6442ae138862257ae18a0019d2fc67b041dc",
|
|
||||||
"sha256:ea36e19ac0a483eea239320aef0bd40702404ff8c7e42179a2d9d36c5afcb55c",
|
|
||||||
"sha256:efabbcd4f406b532206b8801058c8bab9e79645b9880329253ae3322b7b02cd5",
|
|
||||||
"sha256:f923406e6b32c86309261b8195e24e18b6a8801df0cfc7814ac44017bfcb3939"
|
|
||||||
],
|
|
||||||
"markers": "python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.1.*' and python_version != '3.3.*' and python_version != '3.2.*'",
|
|
||||||
"version": "==1.0.1"
|
|
||||||
},
|
|
||||||
"markupsafe": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:a6be69091dac236ea9c6bc7d012beab42010fa914c459791d627dad4910eb665"
|
|
||||||
],
|
|
||||||
"version": "==1.0"
|
|
||||||
},
|
|
||||||
"matplotlib": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:0ba8e3ec1b0feddc6b068fe70dc38dcf2917e301ad8d2b3f848c14ad463a4157",
|
|
||||||
"sha256:10a48e33e64dbd95f0776ba162f379c5cc55301c2d155506e79ce0c26b52f2ce",
|
|
||||||
"sha256:1376535fe731adbba55ab9e48896de226b7e89dbb55390c5fbd8f7161b7ae3be",
|
|
||||||
"sha256:16f0f8ba22df1e2c9f06c87088de45742322fde282a93b5c744c0f969cf7932e",
|
|
||||||
"sha256:1c6c999f2212858021329537f8e0f98f3f29086ec3683511dd1ecec84409f51d",
|
|
||||||
"sha256:2316dc177fc7b3d8848b49365498de0c385b4c9bba511edddd24c34fbe3d37a4",
|
|
||||||
"sha256:3398bfb533482bf21974cecf28224dd23784ad4e4848be582903f7a2436ec12e",
|
|
||||||
"sha256:3477cb1e1061b34210acc43d20050be8444478ff50b8adfac5fe2b45fc97df01",
|
|
||||||
"sha256:4259ea7cb2c238355ee13275eddd261d869cefbdeb18a65f35459589d6d17def",
|
|
||||||
"sha256:4addcf93234b6122f530f90f485fd3d00d158911fbc1ed24db3fa66cd49fe565",
|
|
||||||
"sha256:50c0e24bcbce9c54346f4a2f4e97b0ed111f0413ac3fe9954061ae1c8aa7021f",
|
|
||||||
"sha256:62ed7597d9e54db6e133420d779c642503c25eba390e1178d85dfb2ba0d05948",
|
|
||||||
"sha256:69f6d51e41a17f6a5f70c56bb10b8ded9f299609204495a7fa2782a3a755ffc5",
|
|
||||||
"sha256:6d232e49b74e3d2db22c63c25a9a0166d965e87e2b057f795487f1f244b61d9d",
|
|
||||||
"sha256:7355bf757ecacd5f0ac9dd9523c8e1a1103faadf8d33c22664178e17533f8ce5",
|
|
||||||
"sha256:886b1045c5105631f10c1cbc999f910e44d33af3e9c7efd68c2123efc06ab636",
|
|
||||||
"sha256:9e1f353edd7fc7e5e9101abd5bc0201946f77a1b59e0da49095086c03db856ed",
|
|
||||||
"sha256:b3a343dfcbe296dbe0f26c731beee72a792ff948407e6979524298ae7bc3234e",
|
|
||||||
"sha256:d93675af09ca497a25f4f8d62f3313cf0f21e45427a87487049fe84898b99909",
|
|
||||||
"sha256:e2409ef9d37804dfb566f39c962e6ed70f281ff516b8131b3e6b4e6442711ff1",
|
|
||||||
"sha256:f8b653b0f89938ba72e92ab080c2f3aa24c1b72e2f61add22880cd1b9a6e3cdd"
|
|
||||||
],
|
|
||||||
"index": "pypi",
|
|
||||||
"version": "==2.2.3"
|
|
||||||
},
|
|
||||||
"mistune": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:b4c512ce2fc99e5a62eb95a4aba4b73e5f90264115c40b70a21e1f7d4e0eac91",
|
|
||||||
"sha256:bc10c33bfdcaa4e749b779f62f60d6e12f8215c46a292d05e486b869ae306619"
|
|
||||||
],
|
|
||||||
"version": "==0.8.3"
|
|
||||||
},
|
|
||||||
"nbconvert": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:12b1a4671d4463ab73af6e4cbcc965b62254e05d182cd54995dda0d0ef9e2db9",
|
|
||||||
"sha256:260d390b989a647575b8ecae2cd06a9eaead10d396733d6e50185d5ebd08996e"
|
|
||||||
],
|
|
||||||
"version": "==5.3.1"
|
|
||||||
},
|
|
||||||
"nbformat": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:b9a0dbdbd45bb034f4f8893cafd6f652ea08c8c1674ba83f2dc55d3955743b0b",
|
|
||||||
"sha256:f7494ef0df60766b7cabe0a3651556345a963b74dbc16bc7c18479041170d402"
|
|
||||||
],
|
|
||||||
"version": "==4.4.0"
|
|
||||||
},
|
|
||||||
"notebook": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:66dd59e76e755584ae9450eb015c39f55d4bb1d8ec68f2c694d2b3cba7bf5c7e",
|
|
||||||
"sha256:e2c8e931cc19db4f8c63e6a396efbc13a228b2cb5b2919df011b946f28239a08"
|
|
||||||
],
|
|
||||||
"version": "==5.6.0"
|
|
||||||
},
|
|
||||||
"numpy": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:1c362ad12dd09a43b348bb28dd2295dd9cdf77f41f0f45965e04ba97f525b864",
|
|
||||||
"sha256:2156a06bd407918df4ac0122df6497a9c137432118f585e5b17d543e593d1587",
|
|
||||||
"sha256:24e4149c38489b51fc774b1e1faa9103e82f73344d7a00ba66f6845ab4769f3f",
|
|
||||||
"sha256:340ec1697d9bb3a9c464028af7a54245298502e91178bddb4c37626d36e197b7",
|
|
||||||
"sha256:35db8d419345caa4eeaa65cd63f34a15208acd87530a30f0bc25fc84f55c8c80",
|
|
||||||
"sha256:361370e9b7f5e44c41eee29f2bb5cb3b755abb4b038bce6d6cbe08db7ff9cb74",
|
|
||||||
"sha256:36e8dcd1813ca92ce7e4299120cee6c03adad33d89b54862c1b1a100443ac399",
|
|
||||||
"sha256:378378973546ecc1dfaf9e24c160d683dd04df871ecd2dcc86ce658ca20f92c0",
|
|
||||||
"sha256:419e6faee16097124ee627ed31572c7e80a1070efa25260b78097cca240e219a",
|
|
||||||
"sha256:4287104c24e6a09b9b418761a1e7b1bbde65105f110690ca46a23600a3c606b8",
|
|
||||||
"sha256:549f3e9778b148a47f4fb4682955ed88057eb627c9fe5467f33507c536deda9d",
|
|
||||||
"sha256:5e359e9c531075220785603e5966eef20ccae9b3b6b8a06fdfb66c084361ce92",
|
|
||||||
"sha256:5ee7f3dbbdba0da75dec7e94bd7a2b10fe57a83e1b38e678200a6ad8e7b14fdc",
|
|
||||||
"sha256:62d55e96ec7b117d3d5e618c15efcf769e70a6effaee5842857b64fb4883887a",
|
|
||||||
"sha256:719b6789acb2bc86ea9b33a701d7c43dc2fc56d95107fd3c5b0a8230164d4dfb",
|
|
||||||
"sha256:7a70f2b60d48828cba94a54a8776b61a9c2657a803d47f5785f8062e3a9c7c55",
|
|
||||||
"sha256:7b9e37f194f8bcdca8e9e6af92e2cbad79e360542effc2dd6b98d63955d8d8a3",
|
|
||||||
"sha256:83b8fc18261b70f45bece2d392537c93dc81eb6c539a16c9ac994c47fc79f09a",
|
|
||||||
"sha256:9473ad28375710ab18378e72b59422399b27e957e9339c413bf00793b4b12df0",
|
|
||||||
"sha256:95b085b253080e5d09f7826f5e27dce067bae813a132023a77b739614a29de6e",
|
|
||||||
"sha256:98b86c62c08c2e5dc98a9c856d4a95329d11b1c6058cb9b5191d5ea6891acd09",
|
|
||||||
"sha256:a3bd01d6d3ed3d7c06d7f9979ba5d68281f15383fafd53b81aa44b9191047cf8",
|
|
||||||
"sha256:c81a6afc1d2531a9ada50b58f8c36197f8418ef3d0611d4c1d7af93fdcda764f",
|
|
||||||
"sha256:ce75ed495a746e3e78cfa22a77096b3bff2eda995616cb7a542047f233091268",
|
|
||||||
"sha256:dae8618c0bcbfcf6cf91350f8abcdd84158323711566a8c5892b5c7f832af76f",
|
|
||||||
"sha256:df0b02c6705c5d1c25cc35c7b5d6b6f9b3b30833f9d178843397ae55ecc2eebb",
|
|
||||||
"sha256:e3660744cda0d94b90141cdd0db9308b958a372cfeee8d7188fdf5ad9108ea82",
|
|
||||||
"sha256:f2362d0ca3e16c37782c1054d7972b8ad2729169567e3f0f4e5dd3cdf85f188e"
|
|
||||||
],
|
|
||||||
"markers": "python_version != '3.2.*' and python_version != '3.0.*' and python_version != '3.1.*' and python_version >= '2.7' and python_version != '3.3.*'",
|
|
||||||
"version": "==1.15.1"
|
|
||||||
},
|
|
||||||
"pandas": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:11975fad9edbdb55f1a560d96f91830e83e29bed6ad5ebf506abda09818eaf60",
|
|
||||||
"sha256:12e13d127ca1b585dd6f6840d3fe3fa6e46c36a6afe2dbc5cb0b57032c902e31",
|
|
||||||
"sha256:1c87fcb201e1e06f66e23a61a5fea9eeebfe7204a66d99df24600e3f05168051",
|
|
||||||
"sha256:242e9900de758e137304ad4b5663c2eff0d798c2c3b891250bd0bd97144579da",
|
|
||||||
"sha256:26c903d0ae1542890cb9abadb4adcb18f356b14c2df46e4ff657ae640e3ac9e7",
|
|
||||||
"sha256:2e1e88f9d3e5f107b65b59cd29f141995597b035d17cc5537e58142038942e1a",
|
|
||||||
"sha256:31b7a48b344c14691a8e92765d4023f88902ba3e96e2e4d0364d3453cdfd50db",
|
|
||||||
"sha256:4fd07a932b4352f8a8973761ab4e84f965bf81cc750fb38e04f01088ab901cb8",
|
|
||||||
"sha256:5b24ca47acf69222e82530e89111dd9d14f9b970ab2cd3a1c2c78f0c4fbba4f4",
|
|
||||||
"sha256:647b3b916cc8f6aeba240c8171be3ab799c3c1b2ea179a3be0bd2712c4237553",
|
|
||||||
"sha256:66b060946046ca27c0e03e9bec9bba3e0b918bafff84c425ca2cc2e157ce121e",
|
|
||||||
"sha256:6efa9fa6e1434141df8872d0fa4226fc301b17aacf37429193f9d70b426ea28f",
|
|
||||||
"sha256:be4715c9d8367e51dbe6bc6d05e205b1ae234f0dc5465931014aa1c4af44c1ba",
|
|
||||||
"sha256:bea90da782d8e945fccfc958585210d23de374fa9294a9481ed2abcef637ebfc",
|
|
||||||
"sha256:d785fc08d6f4207437e900ffead930a61e634c5e4f980ba6d3dc03c9581748c7",
|
|
||||||
"sha256:de9559287c4fe8da56e8c3878d2374abc19d1ba2b807bfa7553e912a8e5ba87c",
|
|
||||||
"sha256:f4f98b190bb918ac0bc0e3dd2ab74ff3573da9f43106f6dba6385406912ec00f",
|
|
||||||
"sha256:f71f1a7e2d03758f6e957896ed696254e2bc83110ddbc6942018f1a232dd9dad",
|
|
||||||
"sha256:fb944c8f0b0ab5c1f7846c686bc4cdf8cde7224655c12edcd59d5212cd57bec0"
|
|
||||||
],
|
|
||||||
"index": "pypi",
|
|
||||||
"version": "==0.23.4"
|
|
||||||
},
|
|
||||||
"pandocfilters": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:b3dd70e169bb5449e6bc6ff96aea89c5eea8c5f6ab5e207fc2f521a2cf4a0da9"
|
|
||||||
],
|
|
||||||
"version": "==1.4.2"
|
|
||||||
},
|
|
||||||
"parso": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:35704a43a3c113cce4de228ddb39aab374b8004f4f2407d070b6a2ca784ce8a2",
|
|
||||||
"sha256:895c63e93b94ac1e1690f5fdd40b65f07c8171e3e53cbd7793b5b96c0e0a7f24"
|
|
||||||
],
|
|
||||||
"version": "==0.3.1"
|
|
||||||
},
|
|
||||||
"pexpect": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:2a8e88259839571d1251d278476f3eec5db26deb73a70be5ed5dc5435e418aba",
|
|
||||||
"sha256:3fbd41d4caf27fa4a377bfd16fef87271099463e6fa73e92a52f92dfee5d425b"
|
|
||||||
],
|
|
||||||
"markers": "sys_platform != 'win32'",
|
|
||||||
"version": "==4.6.0"
|
|
||||||
},
|
|
||||||
"pickleshare": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:84a9257227dfdd6fe1b4be1319096c20eb85ff1e82c7932f36efccfe1b09737b",
|
|
||||||
"sha256:c9a2541f25aeabc070f12f452e1f2a8eae2abd51e1cd19e8430402bdf4c1d8b5"
|
|
||||||
],
|
|
||||||
"version": "==0.7.4"
|
|
||||||
},
|
|
||||||
"prometheus-client": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:17bc24c09431644f7c65d7bce9f4237252308070b6395d6d8e87767afe867e24"
|
|
||||||
],
|
|
||||||
"version": "==0.3.1"
|
|
||||||
},
|
|
||||||
"prompt-toolkit": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:1df952620eccb399c53ebb359cc7d9a8d3a9538cb34c5a1344bdbeb29fbcc381",
|
|
||||||
"sha256:3f473ae040ddaa52b52f97f6b4a493cfa9f5920c255a12dc56a7d34397a398a4",
|
|
||||||
"sha256:858588f1983ca497f1cf4ffde01d978a3ea02b01c8a26a8bbc5cd2e66d816917"
|
|
||||||
],
|
|
||||||
"version": "==1.0.15"
|
|
||||||
},
|
|
||||||
"ptyprocess": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:923f299cc5ad920c68f2bc0bc98b75b9f838b93b599941a6b63ddbc2476394c0",
|
|
||||||
"sha256:d7cc528d76e76342423ca640335bd3633420dc1366f258cb31d05e865ef5ca1f"
|
|
||||||
],
|
|
||||||
"markers": "os_name != 'nt'",
|
|
||||||
"version": "==0.6.0"
|
|
||||||
},
|
|
||||||
"pygments": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:78f3f434bcc5d6ee09020f92ba487f95ba50f1e3ef83ae96b9d5ffa1bab25c5d",
|
|
||||||
"sha256:dbae1046def0efb574852fab9e90209b23f556367b5a320c0bcb871c77c3e8cc"
|
|
||||||
],
|
|
||||||
"version": "==2.2.0"
|
|
||||||
},
|
|
||||||
"pyparsing": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:0832bcf47acd283788593e7a0f542407bd9550a55a8a8435214a1960e04bcb04",
|
|
||||||
"sha256:fee43f17a9c4087e7ed1605bd6df994c6173c1e977d7ade7b651292fab2bd010"
|
|
||||||
],
|
|
||||||
"version": "==2.2.0"
|
|
||||||
},
|
|
||||||
"pyrsistent": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:4024f838472cba9ea1ccbc638e0bcafec2efda28594a9905177ec365f1a95fea"
|
|
||||||
],
|
|
||||||
"version": "==0.14.4"
|
|
||||||
},
|
|
||||||
"python-dateutil": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:1adb80e7a782c12e52ef9a8182bebeb73f1d7e24e374397af06fb4956c8dc5c0",
|
|
||||||
"sha256:e27001de32f627c22380a688bcc43ce83504a7bc5da472209b4c70f02829f0b8"
|
|
||||||
],
|
|
||||||
"version": "==2.7.3"
|
|
||||||
},
|
|
||||||
"pytz": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:a061aa0a9e06881eb8b3b2b43f05b9439d6583c206d0a6c340ff72a7b6669053",
|
|
||||||
"sha256:ffb9ef1de172603304d9d2819af6f5ece76f2e85ec10692a524dd876e72bf277"
|
|
||||||
],
|
|
||||||
"version": "==2018.5"
|
|
||||||
},
|
|
||||||
"pyzmq": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:25a0715c8f69cf72f67cfe5a68a3f3ed391c67c063d2257bec0fe7fc2c7f08f8",
|
|
||||||
"sha256:2bab63759632c6b9e0d5bf19cc63c3b01df267d660e0abcf230cf0afaa966349",
|
|
||||||
"sha256:30ab49d99b24bf0908ebe1cdfa421720bfab6f93174e4883075b7ff38cc555ba",
|
|
||||||
"sha256:32c7ca9fc547a91e3c26fc6080b6982e46e79819e706eb414dd78f635a65d946",
|
|
||||||
"sha256:41219ae72b3cc86d97557fe5b1ef5d1adc1057292ec597b50050874a970a39cf",
|
|
||||||
"sha256:4b8c48a9a13cea8f1f16622f9bd46127108af14cd26150461e3eab71e0de3e46",
|
|
||||||
"sha256:55724997b4a929c0d01b43c95051318e26ddbae23565018e138ae2dc60187e59",
|
|
||||||
"sha256:65f0a4afae59d4fc0aad54a917ab599162613a761b760ba167d66cc646ac3786",
|
|
||||||
"sha256:6f88591a8b246f5c285ee6ce5c1bf4f6bd8464b7f090b1333a446b6240a68d40",
|
|
||||||
"sha256:75022a4c60dcd8765bb9ca32f6de75a0ec83b0d96e0309dc479f4c7b21f26cb7",
|
|
||||||
"sha256:76ea493bfab18dcb090d825f3662b5612e2def73dffc196d51a5194b0294a81d",
|
|
||||||
"sha256:7b60c045b80709e4e3c085bab9b691e71761b44c2b42dbb047b8b498e7bc16b3",
|
|
||||||
"sha256:8e6af2f736734aef8ed6f278f9f552ec7f37b1a6b98e59b887484a840757f67d",
|
|
||||||
"sha256:9ac2298e486524331e26390eac14e4627effd3f8e001d4266ed9d8f1d2d31cce",
|
|
||||||
"sha256:9ba650f493a9bc1f24feca1d90fce0e5dd41088a252ac9840131dfbdbf3815ca",
|
|
||||||
"sha256:a02a4a385e394e46012dc83d2e8fd6523f039bb52997c1c34a2e0dd49ed839c1",
|
|
||||||
"sha256:a3ceee84114d9f5711fa0f4db9c652af0e4636c89eabc9b7f03a3882569dd1ed",
|
|
||||||
"sha256:a72b82ac1910f2cf61a49139f4974f994984475f771b0faa730839607eeedddf",
|
|
||||||
"sha256:ab136ac51027e7c484c53138a0fab4a8a51e80d05162eb7b1585583bcfdbad27",
|
|
||||||
"sha256:c095b224300bcac61e6c445e27f9046981b1ac20d891b2f1714da89d34c637c8",
|
|
||||||
"sha256:c5cc52d16c06dc2521340d69adda78a8e1031705924e103c0eb8fc8af861d810",
|
|
||||||
"sha256:d612e9833a89e8177f8c1dc68d7b4ff98d3186cd331acd616b01bbdab67d3a7b",
|
|
||||||
"sha256:e828376a23c66c6fe90dcea24b4b72cd774f555a6ee94081670872918df87a19",
|
|
||||||
"sha256:e9767c7ab2eb552796440168d5c6e23a99ecaade08dda16266d43ad461730192",
|
|
||||||
"sha256:ebf8b800d42d217e4710d1582b0c8bff20cdcb4faad7c7213e52644034300924"
|
|
||||||
],
|
|
||||||
"markers": "python_version != '3.2*' and python_version != '3.1*' and python_version != '3.0*' and python_version >= '2.7'",
|
|
||||||
"version": "==17.1.2"
|
|
||||||
},
|
|
||||||
"qtconsole": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:298431d376d71a02eb1a04fe6e72dd4beb82b83423d58b17d532e0af838e62fa",
|
|
||||||
"sha256:7870b19e6a6b0ab3acc09ee65463c0ca7568b3a01a6902d7c4e1ed2c4fc4e176"
|
|
||||||
],
|
|
||||||
"version": "==4.4.1"
|
|
||||||
},
|
|
||||||
"rpy2": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:004d13734a7b9a85cbc1e7a93ec87df741e28db1273ab5b0d9efaac04a9c5f98"
|
|
||||||
],
|
|
||||||
"index": "pypi",
|
|
||||||
"version": "==2.8.6"
|
|
||||||
},
|
|
||||||
"savreaderwriter": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:868fe96db95706eb17168f9ccb5d5827e3bf9e7f11bb6ab6b47970654d980e89"
|
|
||||||
],
|
|
||||||
"index": "pypi",
|
|
||||||
"version": "==3.4.2"
|
|
||||||
},
|
|
||||||
"scikit-learn": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:0a718b5ffbd5053fb3f9e1a2e20b7c4f256dd8035e246b907d3117d20bac0260",
|
|
||||||
"sha256:1725540b754a9967778e9385e1ee2c8db50d5ab70ed835c9f5e36002ffabc169",
|
|
||||||
"sha256:3e3ce307d7c5c5811658ba8686b24b571a8244eaafe707665ad601f400d5ce98",
|
|
||||||
"sha256:42ad71502237c9fe300ecf157f5a394df717789a2dde541dd7034b539c70bdcc",
|
|
||||||
"sha256:42cba716db197e0d1670e2fc13c4cc4a86d5c5358120ccfee6ec427b154e74ff",
|
|
||||||
"sha256:47b4090b7686642e41176becb7c42ef3cc665d7ee0db5e7ea5d307ec9779327e",
|
|
||||||
"sha256:51d99a08c8bf689cf60c9d8dca6e3d3e5f6d762def85ad735dcea11fb528a89b",
|
|
||||||
"sha256:5f7577fbb2399a4712e96cf0e786638168940a876c33735a1b5d5a86ba4b1370",
|
|
||||||
"sha256:66bfc2b6b15db1725d03ea657ec9184ff09dcbf1ecd834ef85f2edc2c9cbba97",
|
|
||||||
"sha256:69a34d389d9ca4687ad00af4e11d53686771f484c37366f68617ef656bab16ab",
|
|
||||||
"sha256:75297f3dd6685f01555f1bb75846995d45650af417280b69c81bf11b6987aed5",
|
|
||||||
"sha256:9ebb38ab1d0ee143982aed561811903ac6c1abb512ae2b9019b3b65bde63ffb9",
|
|
||||||
"sha256:a402c1484fe65df42d5dbc22a58e0695fe3afe2b0b229aee2a09c6d60ba8e5c2",
|
|
||||||
"sha256:aad6b9aac1617bd7efa0450643888bbd3410679a94bc8680d9863825686ef369",
|
|
||||||
"sha256:ad4db28d3dc16c01df75ed6efb72524537de3839a5d179fcf94094359fc72ec5",
|
|
||||||
"sha256:b276739a5f863ccacb61999a3067d0895ee291c95502929b2ae56ea1f882e888",
|
|
||||||
"sha256:b3dc88c4d2bcb26ffc5afe16d053ae28317d7d1de083651defcd5453a04f1563",
|
|
||||||
"sha256:b3e4681253e95da5aa5c231889a32b084fd997962bf8beda6f796bf422f734b2",
|
|
||||||
"sha256:c3d852d49d6c1710089d4513702099fa6f8e1aebfedf222319d80c47b0a195f8",
|
|
||||||
"sha256:c6612e7e43988b8b5e1957150449493a55f9c059de641083df7a964f86f2d1e7",
|
|
||||||
"sha256:c69e5c6051366a6ac9600d730276db939b1a205e42504ec0b8371f154b0058db",
|
|
||||||
"sha256:ce121baa8e85ec27c3065281657dcd78adaab7dcb046c7fe96ad4e5a9dcb6610",
|
|
||||||
"sha256:ed2a9a9bea6ec443b7effe5695c9c168b7bf9a67df6d880729760feda871b6a3",
|
|
||||||
"sha256:efd842d70b87e3ef3429c3149840b9189d4441ca951ab0cec62c94a964e219d9",
|
|
||||||
"sha256:f1428af5c381f6eef30ffbc7e047b7c713d4efa5d7bf5e57b62b3fc8d387044b",
|
|
||||||
"sha256:f6c7bf8cd4de1640b760b47f4d28deb26dbbf9acbe0194cdff54a898e190d872",
|
|
||||||
"sha256:f8329ac2160ad8bbbac6a507374685ceca3f24ca427fa9ee61a501280e1972d9",
|
|
||||||
"sha256:fefba2a43b92f8393366093b60efbe984a72a2b41cce16b4002005e4104ef938"
|
|
||||||
],
|
|
||||||
"version": "==0.19.2"
|
|
||||||
},
|
|
||||||
"scipy": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:0611ee97296265af4a21164a5323f8c1b4e8e15c582d3dfa7610825900136bb7",
|
|
||||||
"sha256:08237eda23fd8e4e54838258b124f1cd141379a5f281b0a234ca99b38918c07a",
|
|
||||||
"sha256:0e645dbfc03f279e1946cf07c9c754c2a1859cb4a41c5f70b25f6b3a586b6dbd",
|
|
||||||
"sha256:0e9bb7efe5f051ea7212555b290e784b82f21ffd0f655405ac4f87e288b730b3",
|
|
||||||
"sha256:108c16640849e5827e7d51023efb3bd79244098c3f21e4897a1007720cb7ce37",
|
|
||||||
"sha256:340ef70f5b0f4e2b4b43c8c8061165911bc6b2ad16f8de85d9774545e2c47463",
|
|
||||||
"sha256:3ad73dfc6f82e494195144bd3a129c7241e761179b7cb5c07b9a0ede99c686f3",
|
|
||||||
"sha256:3b243c77a822cd034dad53058d7c2abf80062aa6f4a32e9799c95d6391558631",
|
|
||||||
"sha256:404a00314e85eca9d46b80929571b938e97a143b4f2ddc2b2b3c91a4c4ead9c5",
|
|
||||||
"sha256:423b3ff76957d29d1cce1bc0d62ebaf9a3fdfaf62344e3fdec14619bb7b5ad3a",
|
|
||||||
"sha256:42d9149a2fff7affdd352d157fa5717033767857c11bd55aa4a519a44343dfef",
|
|
||||||
"sha256:625f25a6b7d795e8830cb70439453c9f163e6870e710ec99eba5722775b318f3",
|
|
||||||
"sha256:698c6409da58686f2df3d6f815491fd5b4c2de6817a45379517c92366eea208f",
|
|
||||||
"sha256:729f8f8363d32cebcb946de278324ab43d28096f36593be6281ca1ee86ce6559",
|
|
||||||
"sha256:8190770146a4c8ed5d330d5b5ad1c76251c63349d25c96b3094875b930c44692",
|
|
||||||
"sha256:878352408424dffaa695ffedf2f9f92844e116686923ed9aa8626fc30d32cfd1",
|
|
||||||
"sha256:8b984f0821577d889f3c7ca8445564175fb4ac7c7f9659b7c60bef95b2b70e76",
|
|
||||||
"sha256:8f841bbc21d3dad2111a94c490fb0a591b8612ffea86b8e5571746ae76a3deac",
|
|
||||||
"sha256:c22b27371b3866c92796e5d7907e914f0e58a36d3222c5d436ddd3f0e354227a",
|
|
||||||
"sha256:d0cdd5658b49a722783b8b4f61a6f1f9c75042d0e29a30ccb6cacc9b25f6d9e2",
|
|
||||||
"sha256:d40dc7f494b06dcee0d303e51a00451b2da6119acbeaccf8369f2d29e28917ac",
|
|
||||||
"sha256:d8491d4784aceb1f100ddb8e31239c54e4afab8d607928a9f7ef2469ec35ae01",
|
|
||||||
"sha256:dfc5080c38dde3f43d8fbb9c0539a7839683475226cf83e4b24363b227dfe552",
|
|
||||||
"sha256:e24e22c8d98d3c704bb3410bce9b69e122a8de487ad3dbfe9985d154e5c03a40",
|
|
||||||
"sha256:e7a01e53163818d56eabddcafdc2090e9daba178aad05516b20c6591c4811020",
|
|
||||||
"sha256:ee677635393414930541a096fc8e61634304bb0153e4e02b75685b11eba14cae",
|
|
||||||
"sha256:f0521af1b722265d824d6ad055acfe9bd3341765735c44b5a4d0069e189a0f40",
|
|
||||||
"sha256:f25c281f12c0da726c6ed00535ca5d1622ec755c30a3f8eafef26cf43fede694"
|
|
||||||
],
|
|
||||||
"markers": "python_version >= '2.7' and python_version != '3.0.*' and python_version != '3.1.*' and python_version != '3.3.*' and python_version != '3.2.*'",
|
|
||||||
"version": "==1.1.0"
|
|
||||||
},
|
|
||||||
"seaborn": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:42e627b24e849c2d3bbfd059e00005f6afbc4a76e4895baf44ae23fe8a4b09a5",
|
|
||||||
"sha256:76c83f794ca320fb6b23a7c6192d5e185a5fcf4758966a0c0a54baee46d41e2f"
|
|
||||||
],
|
|
||||||
"index": "pypi",
|
|
||||||
"version": "==0.9.0"
|
|
||||||
},
|
|
||||||
"send2trash": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:60001cc07d707fe247c94f74ca6ac0d3255aabcb930529690897ca2a39db28b2",
|
|
||||||
"sha256:f1691922577b6fa12821234aeb57599d887c4900b9ca537948d2dac34aea888b"
|
|
||||||
],
|
|
||||||
"version": "==1.5.0"
|
|
||||||
},
|
|
||||||
"simplegeneric": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:dc972e06094b9af5b855b3df4a646395e43d1c9d0d39ed345b7393560d0b9173"
|
|
||||||
],
|
|
||||||
"version": "==0.8.1"
|
|
||||||
},
|
|
||||||
"six": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:70e8a77beed4562e7f14fe23a786b54f6296e34344c23bc42f07b15018ff98e9",
|
|
||||||
"sha256:832dc0e10feb1aa2c68dcc57dbb658f1c7e65b9b61af69048abc87a2db00a0eb"
|
|
||||||
],
|
|
||||||
"version": "==1.11.0"
|
|
||||||
},
|
|
||||||
"sklearn": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:e23001573aa194b834122d2b9562459bf5ae494a2d59ca6b8aa22c85a44c0e31"
|
|
||||||
],
|
|
||||||
"index": "pypi",
|
|
||||||
"version": "==0.0"
|
|
||||||
},
|
|
||||||
"terminado": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:55abf9ade563b8f9be1f34e4233c7b7bde726059947a593322e8a553cc4c067a",
|
|
||||||
"sha256:65011551baff97f5414c67018e908110693143cfbaeb16831b743fe7cad8b927"
|
|
||||||
],
|
|
||||||
"version": "==0.8.1"
|
|
||||||
},
|
|
||||||
"testpath": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:039fa6a6c9fd3488f8336d23aebbfead5fa602c4a47d49d83845f55a595ec1b4",
|
|
||||||
"sha256:0d5337839c788da5900df70f8e01015aec141aa3fe7936cb0d0a2953f7ac7609"
|
|
||||||
],
|
|
||||||
"version": "==0.3.1"
|
|
||||||
},
|
|
||||||
"tornado": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:1c0816fc32b7d31b98781bd8ebc7a9726d7dce67407dc353a2e66e697e138448",
|
|
||||||
"sha256:4f66a2172cb947387193ca4c2c3e19131f1c70fa8be470ddbbd9317fd0801582",
|
|
||||||
"sha256:5327ba1a6c694e0149e7d9126426b3704b1d9d520852a3e4aa9fc8fe989e4046",
|
|
||||||
"sha256:6a7e8657618268bb007646b9eae7661d0b57f13efc94faa33cd2588eae5912c9",
|
|
||||||
"sha256:a9b14804783a1d77c0bd6c66f7a9b1196cbddfbdf8bceb64683c5ae60bd1ec6f",
|
|
||||||
"sha256:c58757e37c4a3172949c99099d4d5106e4d7b63aa0617f9bb24bfbff712c7866",
|
|
||||||
"sha256:d8984742ce86c0855cccecd5c6f54a9f7532c983947cff06f3a0e2115b47f85c"
|
|
||||||
],
|
|
||||||
"markers": "python_version != '3.3.*' and python_version >= '2.7' and python_version != '3.1.*' and python_version != '3.0.*' and python_version != '3.2.*'",
|
|
||||||
"version": "==5.1"
|
|
||||||
},
|
|
||||||
"traitlets": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:9c4bd2d267b7153df9152698efb1050a5d84982d3384a37b2c1f7723ba3e7835",
|
|
||||||
"sha256:c6cb5e6f57c5a9bdaa40fa71ce7b4af30298fbab9ece9815b5d995ab6217c7d9"
|
|
||||||
],
|
|
||||||
"version": "==4.3.2"
|
|
||||||
},
|
|
||||||
"watermark": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:1530bf12a729ab701458cb1d8365621688c2757b0b1ef1d426fe0f8bfec0b61e",
|
|
||||||
"sha256:7bdc31a0ab6e80968a3d79507ea993fbf8a422eb7a0f6277db9d1e54011e7342"
|
|
||||||
],
|
|
||||||
"index": "pypi",
|
|
||||||
"version": "==1.6.1"
|
|
||||||
},
|
|
||||||
"wcwidth": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:3df37372226d6e63e1b1e1eda15c594bca98a22d33a23832a90998faa96bc65e",
|
|
||||||
"sha256:f4ebe71925af7b40a864553f761ed559b43544f8f71746c2d756c7fe788ade7c"
|
|
||||||
],
|
|
||||||
"version": "==0.1.7"
|
|
||||||
},
|
|
||||||
"webencodings": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:a0af1213f3c2226497a97e2b3aa01a7e4bee4f403f95be16fc9acd2947514a78",
|
|
||||||
"sha256:b36a1c245f2d304965eb4e0a82848379241dc04b865afcc4aab16748587e1923"
|
|
||||||
],
|
|
||||||
"version": "==0.5.1"
|
|
||||||
},
|
|
||||||
"widgetsnbextension": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:7e8fc9688d4fb68c96537ce00604cf8d3bbf48bd348f2c4dfb91174c308b1e10",
|
|
||||||
"sha256:c9d6e426a1d79d132b57b93b368feba2c66eb7b0fd34bdb901716b4b88e94497"
|
|
||||||
],
|
|
||||||
"version": "==3.4.0"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"develop": {
|
|
||||||
"appdirs": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:9e5896d1372858f8dd3344faf4e5014d21849c756c8d5701f78f8a103b372d92",
|
|
||||||
"sha256:d8b24664561d0d34ddfaec54636d502d7cea6e29c3eaf68f3df6180863e2166e"
|
|
||||||
],
|
|
||||||
"version": "==1.4.3"
|
|
||||||
},
|
|
||||||
"attrs": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:4b90b09eeeb9b88c35bc642cbac057e45a5fd85367b985bd2809c62b7b939265",
|
|
||||||
"sha256:e0d0eb91441a3b53dab4d9b743eafc1ac44476296a2053b6ca3af0b139faf87b"
|
|
||||||
],
|
|
||||||
"version": "==18.1.0"
|
|
||||||
},
|
|
||||||
"backcall": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:38ecd85be2c1e78f77fd91700c76e14667dc21e2713b63876c0eb901196e01e4",
|
|
||||||
"sha256:bbbf4b1e5cd2bdb08f915895b51081c041bac22394fdfcfdfbe9f14b77c08bf2"
|
|
||||||
],
|
|
||||||
"version": "==0.1.0"
|
|
||||||
},
|
|
||||||
"black": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:22158b89c1a6b4eb333a1e65e791a3f8b998cf3b11ae094adb2570f31f769a44",
|
|
||||||
"sha256:4b475bbd528acce094c503a3d2dbc2d05a4075f6d0ef7d9e7514518e14cc5191"
|
|
||||||
],
|
|
||||||
"index": "pypi",
|
|
||||||
"version": "==18.6b4"
|
|
||||||
},
|
|
||||||
"blackcellmagic": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:6b3804c8851591804bcdc5635c8d55b6b2a50874df63ba40a20d258cf79049b0"
|
|
||||||
],
|
|
||||||
"index": "pypi",
|
|
||||||
"version": "==0.0.1"
|
|
||||||
},
|
|
||||||
"click": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:29f99fc6125fbc931b758dc053b3114e55c77a6e4c6c3a2674a2dc986016381d",
|
|
||||||
"sha256:f15516df478d5a56180fbf80e68f206010e6d160fc39fa508b65e035fd75130b"
|
|
||||||
],
|
|
||||||
"version": "==6.7"
|
|
||||||
},
|
|
||||||
"decorator": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:2c51dff8ef3c447388fe5e4453d24a2bf128d3a4c32af3fabef1f01c6851ab82",
|
|
||||||
"sha256:c39efa13fbdeb4506c476c9b3babf6a718da943dab7811c206005a4a956c080c"
|
|
||||||
],
|
|
||||||
"version": "==4.3.0"
|
|
||||||
},
|
|
||||||
"ipython": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:007dcd929c14631f83daff35df0147ea51d1af420da303fd078343878bd5fb62",
|
|
||||||
"sha256:b0f2ef9eada4a68ef63ee10b6dde4f35c840035c50fd24265f8052c98947d5a4"
|
|
||||||
],
|
|
||||||
"markers": "python_version >= '3.3'",
|
|
||||||
"version": "==6.5.0"
|
|
||||||
},
|
|
||||||
"ipython-genutils": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:72dd37233799e619666c9f639a9da83c34013a73e8bbc79a7a6348d93c61fab8",
|
|
||||||
"sha256:eb2e116e75ecef9d4d228fdc66af54269afa26ab4463042e33785b887c628ba8"
|
|
||||||
],
|
|
||||||
"version": "==0.2.0"
|
|
||||||
},
|
|
||||||
"jedi": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:b409ed0f6913a701ed474a614a3bb46e6953639033e31f769ca7581da5bd1ec1",
|
|
||||||
"sha256:c254b135fb39ad76e78d4d8f92765ebc9bf92cbc76f49e97ade1d5f5121e1f6f"
|
|
||||||
],
|
|
||||||
"version": "==0.12.1"
|
|
||||||
},
|
|
||||||
"parso": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:35704a43a3c113cce4de228ddb39aab374b8004f4f2407d070b6a2ca784ce8a2",
|
|
||||||
"sha256:895c63e93b94ac1e1690f5fdd40b65f07c8171e3e53cbd7793b5b96c0e0a7f24"
|
|
||||||
],
|
|
||||||
"version": "==0.3.1"
|
|
||||||
},
|
|
||||||
"pexpect": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:2a8e88259839571d1251d278476f3eec5db26deb73a70be5ed5dc5435e418aba",
|
|
||||||
"sha256:3fbd41d4caf27fa4a377bfd16fef87271099463e6fa73e92a52f92dfee5d425b"
|
|
||||||
],
|
|
||||||
"markers": "sys_platform != 'win32'",
|
|
||||||
"version": "==4.6.0"
|
|
||||||
},
|
|
||||||
"pickleshare": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:84a9257227dfdd6fe1b4be1319096c20eb85ff1e82c7932f36efccfe1b09737b",
|
|
||||||
"sha256:c9a2541f25aeabc070f12f452e1f2a8eae2abd51e1cd19e8430402bdf4c1d8b5"
|
|
||||||
],
|
|
||||||
"version": "==0.7.4"
|
|
||||||
},
|
|
||||||
"prompt-toolkit": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:1df952620eccb399c53ebb359cc7d9a8d3a9538cb34c5a1344bdbeb29fbcc381",
|
|
||||||
"sha256:3f473ae040ddaa52b52f97f6b4a493cfa9f5920c255a12dc56a7d34397a398a4",
|
|
||||||
"sha256:858588f1983ca497f1cf4ffde01d978a3ea02b01c8a26a8bbc5cd2e66d816917"
|
|
||||||
],
|
|
||||||
"version": "==1.0.15"
|
|
||||||
},
|
|
||||||
"ptyprocess": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:923f299cc5ad920c68f2bc0bc98b75b9f838b93b599941a6b63ddbc2476394c0",
|
|
||||||
"sha256:d7cc528d76e76342423ca640335bd3633420dc1366f258cb31d05e865ef5ca1f"
|
|
||||||
],
|
|
||||||
"markers": "os_name != 'nt'",
|
|
||||||
"version": "==0.6.0"
|
|
||||||
},
|
|
||||||
"pygments": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:78f3f434bcc5d6ee09020f92ba487f95ba50f1e3ef83ae96b9d5ffa1bab25c5d",
|
|
||||||
"sha256:dbae1046def0efb574852fab9e90209b23f556367b5a320c0bcb871c77c3e8cc"
|
|
||||||
],
|
|
||||||
"version": "==2.2.0"
|
|
||||||
},
|
|
||||||
"simplegeneric": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:dc972e06094b9af5b855b3df4a646395e43d1c9d0d39ed345b7393560d0b9173"
|
|
||||||
],
|
|
||||||
"version": "==0.8.1"
|
|
||||||
},
|
|
||||||
"six": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:70e8a77beed4562e7f14fe23a786b54f6296e34344c23bc42f07b15018ff98e9",
|
|
||||||
"sha256:832dc0e10feb1aa2c68dcc57dbb658f1c7e65b9b61af69048abc87a2db00a0eb"
|
|
||||||
],
|
|
||||||
"version": "==1.11.0"
|
|
||||||
},
|
|
||||||
"toml": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:8e86bd6ce8cc11b9620cb637466453d94f5d57ad86f17e98a98d1f73e3baab2d"
|
|
||||||
],
|
|
||||||
"version": "==0.9.4"
|
|
||||||
},
|
|
||||||
"traitlets": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:9c4bd2d267b7153df9152698efb1050a5d84982d3384a37b2c1f7723ba3e7835",
|
|
||||||
"sha256:c6cb5e6f57c5a9bdaa40fa71ce7b4af30298fbab9ece9815b5d995ab6217c7d9"
|
|
||||||
],
|
|
||||||
"version": "==4.3.2"
|
|
||||||
},
|
|
||||||
"wcwidth": {
|
|
||||||
"hashes": [
|
|
||||||
"sha256:3df37372226d6e63e1b1e1eda15c594bca98a22d33a23832a90998faa96bc65e",
|
|
||||||
"sha256:f4ebe71925af7b40a864553f761ed559b43544f8f71746c2d756c7fe788ade7c"
|
|
||||||
],
|
|
||||||
"version": "==0.1.7"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
90
README.md
90
README.md
|
|
@ -1,21 +1,18 @@
|
||||||
# Tidy Data
|
# Tidy Data
|
||||||
|
|
||||||
The purpose of this repository is to re-do the work described in the paper
|
The purpose of this repository is to illustrate how the data cleaning process described
|
||||||
[Tidy Data](tidy-data.pdf) by Hadley Wickham (member of the RStudio team) in
|
in the paper "[Tidy Data](tidy-data.pdf)" by Hadley Wickham, a member of the
|
||||||
Python.
|
[RStudio](https://rstudio.com/) team, can be done in
|
||||||
|
[Python](https://www.python.org/).
|
||||||
|
|
||||||
The paper was published in 2014 in the Journal of
|
The paper was published in 2014 in the [Journal of Statistical Software](https://www.jstatsoft.org/article/view/v059i10).
|
||||||
[Statistical Software](https://www.jstatsoft.org/article/view/v059i10). The
|
The author offers it for free [here](http://vita.had.co.nz/papers/tidy-data.html).
|
||||||
author offers it for free download
|
Furthermore, the original [R](https://www.r-project.org/) code is available [here](https://github.com/hadley/tidy-data).
|
||||||
[here](http://vita.had.co.nz/papers/tidy-data.html). Furthermore, the original
|
|
||||||
R code is available in a GitHub
|
|
||||||
[repository](https://github.com/hadley/tidy-data)
|
|
||||||
|
|
||||||
After installing this project, it is recommended to first read the paper to get
|
After installing the dependencies for this project (cf., the [installation notes](https://github.com/webartifex/tidy-data#installation)
|
||||||
the big picture and then work through the six Jupyter notebooks (listed further
|
below), it is recommended to first read the paper to get the big picture and
|
||||||
below).
|
then work through the six Jupyter notebooks listed below.
|
||||||
|
|
||||||
See installation notes at the bottom.
|
|
||||||
|
|
||||||
## Summary
|
## Summary
|
||||||
|
|
||||||
|
|
@ -23,50 +20,51 @@ See installation notes at the bottom.
|
||||||
### Definition
|
### Definition
|
||||||
|
|
||||||
**Tidy** data is defined as data that comes in a table form adhering to the
|
**Tidy** data is defined as data that comes in a table form adhering to the
|
||||||
following requirements:
|
following requirements:
|
||||||
|
1. each variable is a column,
|
||||||
|
2. each observation is a row, and
|
||||||
|
3. each type of observational unit forms a table.
|
||||||
|
|
||||||
1. Each variable forms a column.
|
This is equivalent to [Codd's 3rd normal form](https://en.wikipedia.org/wiki/Third_normal_form),
|
||||||
2. Each observation forms a row.
|
a concept from the theory of relational databases.
|
||||||
3. Each type of observational unit forms a table.
|
A dataset that does *not* satisfy these properties is called **messy**.
|
||||||
|
|
||||||
This is equivalent to Codd's 3rd normal form (in the context of relational
|
|
||||||
databases). A dataset that does not satisfy these properties is called
|
|
||||||
**messy**.
|
|
||||||
|
|
||||||
|
|
||||||
### Tidying messy Data
|
### Tidying Data
|
||||||
|
|
||||||
The five most common problems with messy data are as follows:
|
The five most common problems with messy data are:
|
||||||
|
|
||||||
- Column headers are values, not variable names
|
- column headers are values, not variable names
|
||||||
[[notebook](1_column_headers_are_values.ipynb)]
|
(cf., [notebook 1](https://nbviewer.jupyter.org/github/webartifex/tidy-data/blob/master/1_column_headers_are_values.ipynb))
|
||||||
- Multiple variables are stored in one column
|
- multiple variables are stored in one column
|
||||||
[[notebook](2_multiple_variables_stored_in_one_column.ipynb)]
|
(cf., [notebook 2](https://nbviewer.jupyter.org/github/webartifex/tidy-data/blob/master/2_multiple_variables_stored_in_one_column.ipynb))
|
||||||
- Variables are stored in both rows and columns
|
- variables are stored in both rows and columns
|
||||||
[[notebook](3_variables_are_stored_in_both_rows_and_columns.ipynb)]
|
(cf., [notebook 3](https://nbviewer.jupyter.org/github/webartifex/tidy-data/blob/master/3_variables_are_stored_in_both_rows_and_columns.ipynb))
|
||||||
- Multiple types of observational units are stored in the same table
|
- multiple types of observational units are stored in the same table
|
||||||
[[notebook](4_multiple_types_in_one_table.ipynb)]
|
(cf., [notebook 4](https://nbviewer.jupyter.org/github/webartifex/tidy-data/blob/master/4_multiple_types_in_one_table.ipynb))
|
||||||
- A single observational unit is stored in multiple tables
|
- a single observational unit is stored in multiple tables
|
||||||
[[notebook](5_one_type_in_multiple_tables.ipynb)]
|
(cf., [notebook 5](https://nbviewer.jupyter.org/github/webartifex/tidy-data/blob/master/5_one_type_in_multiple_tables.ipynb))
|
||||||
|
|
||||||
Further, a [case study](6_case_study.ipynb) shows the advantages of tidy data
|
|
||||||
(as standardized input/output to statistical functions).
|
|
||||||
|
|
||||||
## Download & Installation
|
### Case Study
|
||||||
|
|
||||||
Create a local copy of this repository with:
|
A case study (cf., [notebook 6](https://nbviewer.jupyter.org/github/webartifex/tidy-data/blob/master/6_case_study.ipynb))
|
||||||
|
shows the advantages of tidy data as a standardized input to statistical functions.
|
||||||
|
|
||||||
|
|
||||||
|
## Installation
|
||||||
|
|
||||||
|
Get a local copy of this repository with [git](https://git-scm.com/).
|
||||||
|
|
||||||
`git clone https://github.com/webartifex/tidy-data.git`
|
`git clone https://github.com/webartifex/tidy-data.git`
|
||||||
|
|
||||||
This project uses [pipenv](https://docs.pipenv.org/) to manage its
|
If you are not familiar with [git](https://git-scm.com/), simply download the latest
|
||||||
dependencies.
|
version of the files in a zip archive [here](https://github.com/webartifex/tidy-data/archive/master.zip).
|
||||||
|
|
||||||
To install all third-party Python packages in the most recent version into a
|
This project uses [poetry](https://python-poetry.org/docs/) to manage its dependencies.
|
||||||
project-local virtual environment, run:
|
Install all third-party packages into a [virtual environment](https://docs.python.org/3/library/venv.html).
|
||||||
|
|
||||||
`pipenv install`
|
`poetry install`
|
||||||
|
|
||||||
To install all packages with the same version as of the time of creating this
|
Alternatively, use the [Anaconda Distribution](https://www.anaconda.com/products/individual)
|
||||||
project (for exact reproducibility), run:
|
that *should* also suffice to run the provided notebooks.
|
||||||
|
|
||||||
`pipenv install --ignore-pipfile`
|
|
||||||
|
|
|
||||||
1558
poetry.lock
generated
Normal file
1558
poetry.lock
generated
Normal file
File diff suppressed because it is too large
Load diff
37
pyproject.toml
Normal file
37
pyproject.toml
Normal file
|
|
@ -0,0 +1,37 @@
|
||||||
|
[build-system]
|
||||||
|
build-backend = "poetry.masonry.api"
|
||||||
|
requires = ["poetry>=0.12"]
|
||||||
|
|
||||||
|
[tool.poetry]
|
||||||
|
name = "tidy-data"
|
||||||
|
version = "0.1.0"
|
||||||
|
|
||||||
|
authors = ["Alexander Hess <alexander@webartifex.biz>"]
|
||||||
|
description = "A Python implementation for Hadley Wickham's Tidy Data paper"
|
||||||
|
keywords = [
|
||||||
|
"data-cleaning",
|
||||||
|
"data-science",
|
||||||
|
"messy-data",
|
||||||
|
"python",
|
||||||
|
"tidy-data",
|
||||||
|
]
|
||||||
|
license = "MIT"
|
||||||
|
|
||||||
|
[tool.poetry.dependencies]
|
||||||
|
python = "^3.7"
|
||||||
|
|
||||||
|
# Data Science Tools
|
||||||
|
jupyterlab = "^2.2.6"
|
||||||
|
matplotlib = "^3.3.1"
|
||||||
|
numpy = "^1.19.1"
|
||||||
|
pandas = "^1.1.1"
|
||||||
|
seaborn = "^0.10.1"
|
||||||
|
sklearn = "^0.0"
|
||||||
|
|
||||||
|
# Interfaces to other tools
|
||||||
|
rpy2 = "==2.8.*" # R support
|
||||||
|
savreaderwriter = "^3.4.2" # IBM SPSS support
|
||||||
|
|
||||||
|
# Code Formatters
|
||||||
|
black = "^19.10b0"
|
||||||
|
nb_black = "^1.0.7"
|
||||||
Loading…
Add table
Add a link
Reference in a new issue