
Update the project for 2020

- replace pipenv with poetry
- update the README.md:
  * streamline the text
  * update links to notebooks with nbviewer
  * update installation notes with poetry info
- streamline the notebooks:
  * use backticks in Markdown cells to make references to
    columns in DataFrames clearer
  * blacken all code cells
- add MIT license
- ignore .venv/ and .python-version
Alexander Hess 2020-08-26 00:07:58 +02:00
commit a3a17236a2
Signed by: alexander
GPG key ID: 344EA5AB10D868E0
13 changed files with 1975 additions and 1158 deletions


@@ -6,7 +6,7 @@
"source": [
"# Column Headers are Values, not Variable Names\n",
"\n",
"This notebook shows two examples of how column headers display values. These type of messy datasets have practical use in two types of settings:\n",
"This notebook shows two examples of how column headers display values. These type of messy datasets have practical use in two kinds of settings:\n",
"\n",
"1. Presentations\n",
"2. Recordings of regularly spaced observations over time"
@@ -23,24 +23,9 @@
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2018-08-26 14:39:56 CEST\n",
"\n",
"CPython 3.6.5\n",
"IPython 6.5.0\n",
"\n",
"numpy 1.15.1\n",
"pandas 0.23.4\n"
]
}
],
"outputs": [],
"source": [
"% load_ext watermark\n",
"% watermark -d -t -v -z -p numpy,pandas"
"%load_ext lab_black"
]
},
{
@@ -90,32 +75,35 @@
"metadata": {},
"outputs": [],
"source": [
"columns = ['q16', 'reltrad', 'income']\n",
"columns = [\"q16\", \"reltrad\", \"income\"]\n",
"encodings = {}\n",
"\n",
"# For sake of simplicity all data cleaning operations\n",
"# For the sake of simplicity, all data cleaning operations\n",
"# are done within the for-loop for all columns.\n",
"with spss.SavHeaderReader('data/pew.sav') as pew:\n",
" for c in columns:\n",
" encodings[c] = {\n",
" int(k): (\n",
" re.sub(r'\\(.*\\)', '', (\n",
" v.decode('iso-8859-1')\n",
" .replace('\\x92', \"'\")\n",
" .replace(' Churches', '')\n",
" .replace('Less than $10,000', '<$10k')\n",
" .replace('10 to under $20,000', '$10-20k')\n",
" .replace('20 to under $30,000', '$20-30k')\n",
" .replace('30 to under $40,000', '$30-40k')\n",
" .replace('40 to under $50,000', '$40-50k')\n",
" .replace('50 to under $75,000', '$50-75k')\n",
" .replace('75 to under $100,000', '$75-100k')\n",
" .replace('100 to under $150,000', '$100-150k')\n",
" .replace('$150,000 or more', '>150k')\n",
"with spss.SavHeaderReader(\"data/pew.sav\") as pew:\n",
" for column in columns:\n",
" encodings[column] = {\n",
" int(key): (\n",
" re.sub(\n",
" r\"\\(.*\\)\",\n",
" \"\",\n",
" (\n",
" value.decode(\"iso-8859-1\")\n",
" .replace(\"\\x92\", \"'\")\n",
" .replace(\" Churches\", \"\")\n",
" .replace(\"Less than $10,000\", \"<$10k\")\n",
" .replace(\"10 to under $20,000\", \"$10-20k\")\n",
" .replace(\"20 to under $30,000\", \"$20-30k\")\n",
" .replace(\"30 to under $40,000\", \"$30-40k\")\n",
" .replace(\"40 to under $50,000\", \"$40-50k\")\n",
" .replace(\"50 to under $75,000\", \"$50-75k\")\n",
" .replace(\"75 to under $100,000\", \"$75-100k\")\n",
" .replace(\"100 to under $150,000\", \"$100-150k\")\n",
" .replace(\"$150,000 or more\", \">150k\")\n",
" ),\n",
" ).strip()\n",
" )\n",
" for (k, v) in pew.all().valueLabels[c.encode()].items()\n",
" for (key, value) in pew.all().valueLabels[column.encode()].items()\n",
" }"
]
},
@@ -132,25 +120,36 @@
"metadata": {},
"outputs": [],
"source": [
"with spss.SavReader('data/pew.sav', selectVars=[c.encode() for c in columns]) as pew:\n",
"with spss.SavReader(\n",
" \"data/pew.sav\", selectVars=[column.encode() for column in columns]\n",
") as pew:\n",
" pew = list(pew)\n",
"\n",
"# Use the above encodings to map the numeric data\n",
"# to the actual labels.\n",
"pew = pd.DataFrame(pew, columns=columns, dtype=int)\n",
"for c in columns:\n",
" pew[c] = pew[c].map(encodings[c])\n",
"for column in columns:\n",
" pew[column] = pew[column].map(encodings[column])\n",
"\n",
"for v in ('Atheist', 'Agnostic'):\n",
" pew.loc[(pew['q16'] == v), 'reltrad'] = v\n",
"for value in (\"Atheist\", \"Agnostic\"):\n",
" pew.loc[(pew[\"q16\"] == value), \"reltrad\"] = value\n",
"\n",
"income_columns = ['<$10k', '$10-20k', '$20-30k', '$30-40k', '$40-50k', '$50-75k',\n",
" '$75-100k', '$100-150k', '>150k', 'Don\\'t know/Refused']\n",
"\n",
"pew = pew.groupby(['reltrad', 'income']).size().unstack('income')\n",
"income_columns = [\n",
" \"<$10k\",\n",
" \"$10-20k\",\n",
" \"$20-30k\",\n",
" \"$30-40k\",\n",
" \"$40-50k\",\n",
" \"$50-75k\",\n",
" \"$75-100k\",\n",
" \"$100-150k\",\n",
" \">150k\",\n",
" \"Don't know/Refused\",\n",
"]\n",
"\n",
"pew = pew.groupby([\"reltrad\", \"income\"]).size().unstack(\"income\")\n",
"pew = pew[income_columns]\n",
"pew.index.name = 'religion'"
"pew.index.name = \"religion\""
]
},
{
@@ -426,9 +425,9 @@
"\n",
"> This dataset has **three** variables, **religion**, **income** and **frequency**. To tidy it, we need to **melt**, or stack it. In other words, we need to turn columns into rows.\n",
"\n",
"pandas provides a [pd.melt](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.melt.html) function to un-pivot the dataset.\n",
"pandas provides a [pd.melt()](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.melt.html) function to un-pivot the dataset.\n",
"\n",
"**Notes:** *reset_index()* transforms the religion index column into a data column (*pd.melt()* needs that). Further, the resulting table is sorted implicitly by the *religion* column. To get to the same ordering as in the paper, the molten table is explicitly sorted."
"**Notes:** `.reset_index()` transforms the religion index column into a data column (`pd.melt()` needs that). Further, the resulting table is sorted implicitly by the `\"religion\"` column. To get to the same ordering as in the paper, the molten table is explicitly sorted."
]
},
{
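
A minimal sketch of both notes, on a toy frame shaped like `pew` at this point (made-up counts; the ordered categorical mirrors what the next cells do with `income_columns`):

```python
import pandas as pd

pew_toy = pd.DataFrame(
    {"<$10k": [27, 12], "$10-20k": [34, 27]},
    index=pd.Index(["Agnostic", "Atheist"], name="religion"),
)

# .reset_index() turns the "religion" index into a regular column,
# which pd.melt() needs as an identifier variable.
molten = pd.melt(pew_toy.reset_index(), id_vars=["religion"], value_name="frequency")

# An ordered categorical makes .sort_values() respect the bracket
# order instead of sorting the labels alphabetically.
income_dtype = pd.api.types.CategoricalDtype(["<$10k", "$10-20k"], ordered=True)
molten["variable"] = molten["variable"].astype(income_dtype)
molten = molten.sort_values(["religion", "variable"]).reset_index(drop=True)
print(molten)
```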
@@ -437,7 +436,7 @@
"metadata": {},
"outputs": [],
"source": [
"molten_pew = pd.melt(pew.reset_index(), id_vars=['religion'], value_name='frequency')"
"molten_pew = pd.melt(pew.reset_index(), id_vars=[\"religion\"], value_name=\"frequency\")"
]
},
{
@@ -448,8 +447,8 @@
"source": [
"# Create a ordered column for the income labels.\n",
"income_dtype = pd.api.types.CategoricalDtype(income_columns, ordered=True)\n",
"molten_pew['income'] = molten_pew['income'].astype(income_dtype)\n",
"molten_pew = molten_pew.sort_values(['religion', 'income']).reset_index(drop=True)"
"molten_pew[\"income\"] = molten_pew[\"income\"].astype(income_dtype)\n",
"molten_pew = molten_pew.sort_values([\"religion\", \"income\"]).reset_index(drop=True)"
]
},
{
@@ -616,37 +615,40 @@
"outputs": [],
"source": [
"# Usage of \"1st\", \"2nd\", \"3rd\" should be forbidden by law :)\n",
"usecols = ['artist.inverted', 'track', 'time', 'date.entered'] + (\n",
" [f'x{i}st.week' for i in range(1, 76, 10) if i != 11]\n",
" + [f'x{i}nd.week' for i in range(2, 76, 10) if i != 12]\n",
" + [f'x{i}rd.week' for i in range(3, 76, 10) if i != 13]\n",
" + [f'x{i}th.week' for i in range(1, 76) if (i % 10) not in (1, 2, 3)]\n",
" + [f'x11th.week', f'x12th.week', f'x13th.week']\n",
"usecols = [\"artist.inverted\", \"track\", \"time\", \"date.entered\"] + (\n",
" [f\"x{i}st.week\" for i in range(1, 76, 10) if i != 11]\n",
" + [f\"x{i}nd.week\" for i in range(2, 76, 10) if i != 12]\n",
" + [f\"x{i}rd.week\" for i in range(3, 76, 10) if i != 13]\n",
" + [f\"x{i}th.week\" for i in range(1, 76) if (i % 10) not in (1, 2, 3)]\n",
" + [f\"x11th.week\", f\"x12th.week\", f\"x13th.week\"]\n",
")\n",
"billboard = pd.read_csv(\n",
" \"data/billboard.csv\",\n",
" encoding=\"iso-8859-1\",\n",
" parse_dates=[\"date.entered\"],\n",
" usecols=usecols,\n",
")\n",
"\n",
"billboard = pd.read_csv('data/billboard.csv', encoding='iso-8859-1',\n",
" parse_dates=['date.entered'], usecols=usecols)\n",
"\n",
"billboard = billboard.assign(year=lambda x: x['date.entered'].dt.year)\n",
"billboard = billboard.assign(year=lambda x: x[\"date.entered\"].dt.year)\n",
"\n",
"# Rename the week columns.\n",
"week_columns = {\n",
" c: ('wk' + re.sub(r'[^\\d]+', '', c))\n",
" for c in billboard.columns\n",
" if c.endswith('.week')\n",
" column: (\"wk\" + re.sub(r\"[^\\d]+\", \"\", column))\n",
" for column in billboard.columns\n",
" if column.endswith(\".week\")\n",
"}\n",
"billboard = billboard.rename(columns={'artist.inverted': 'artist', **week_columns})\n",
"billboard = billboard.rename(columns={\"artist.inverted\": \"artist\", **week_columns})\n",
"\n",
"# Ensure the columns' order is the same as in the paper.\n",
"columns = ['year', 'artist', 'track', 'time', 'date.entered'] + [\n",
" f'wk{i}' for i in range(1, 76)\n",
"columns = [\"year\", \"artist\", \"track\", \"time\", \"date.entered\"] + [\n",
" f\"wk{i}\" for i in range(1, 76)\n",
"]\n",
"billboard = billboard[columns]\n",
"\n",
"# Ensure the rows' order is similar as in the paper.\n",
"# For unknown reasons the exact ordering as in the paper cannot be reconstructed.\n",
"billboard = billboard[billboard['year'] == 2000]\n",
"billboard = billboard.sort_values(['artist', 'track'])"
"billboard = billboard[billboard[\"year\"] == 2000]\n",
"billboard = billboard.sort_values([\"artist\", \"track\"])"
]
},
{
@@ -986,17 +988,17 @@
"14 2000 Aaliyah Try Again \n",
"200 2000 Adams, Yolanda Open My Heart \n",
"\n",
" time date.entered wk1 wk2 wk3 wk4 wk5 ... wk66 wk67 wk68 \\\n",
"246 4:22 2000-02-26 87 82.0 72.0 77.0 87.0 ... NaN NaN NaN \n",
"287 3:15 2000-09-02 91 87.0 92.0 NaN NaN ... NaN NaN NaN \n",
"24 3:53 2000-04-08 81 70.0 68.0 67.0 66.0 ... NaN NaN NaN \n",
"193 4:24 2000-10-21 76 76.0 72.0 69.0 67.0 ... NaN NaN NaN \n",
"69 3:35 2000-04-15 57 34.0 25.0 17.0 17.0 ... NaN NaN NaN \n",
"22 3:24 2000-08-19 51 39.0 34.0 26.0 26.0 ... NaN NaN NaN \n",
"304 3:44 2000-07-08 97 97.0 96.0 95.0 100.0 ... NaN NaN NaN \n",
"135 4:15 2000-01-29 84 62.0 51.0 41.0 38.0 ... NaN NaN NaN \n",
"14 4:03 2000-03-18 59 53.0 38.0 28.0 21.0 ... NaN NaN NaN \n",
"200 5:30 2000-08-26 76 76.0 74.0 69.0 68.0 ... NaN NaN NaN \n",
" time date.entered wk1 wk2 wk3 wk4 wk5 ... wk66 wk67 wk68 \\\n",
"246 4:22 2000-02-26 87 82.0 72.0 77.0 87.0 ... NaN NaN NaN \n",
"287 3:15 2000-09-02 91 87.0 92.0 NaN NaN ... NaN NaN NaN \n",
"24 3:53 2000-04-08 81 70.0 68.0 67.0 66.0 ... NaN NaN NaN \n",
"193 4:24 2000-10-21 76 76.0 72.0 69.0 67.0 ... NaN NaN NaN \n",
"69 3:35 2000-04-15 57 34.0 25.0 17.0 17.0 ... NaN NaN NaN \n",
"22 3:24 2000-08-19 51 39.0 34.0 26.0 26.0 ... NaN NaN NaN \n",
"304 3:44 2000-07-08 97 97.0 96.0 95.0 100.0 ... NaN NaN NaN \n",
"135 4:15 2000-01-29 84 62.0 51.0 41.0 38.0 ... NaN NaN NaN \n",
"14 4:03 2000-03-18 59 53.0 38.0 28.0 21.0 ... NaN NaN NaN \n",
"200 5:30 2000-08-26 76 76.0 74.0 69.0 68.0 ... NaN NaN NaN \n",
"\n",
" wk69 wk70 wk71 wk72 wk73 wk74 wk75 \n",
"246 NaN NaN NaN NaN NaN NaN NaN \n",
@@ -1028,7 +1030,7 @@
"source": [
"### \"Tidy\" Data\n",
"\n",
"As before the *pd.melt* function is used to transform the data from \"wide\" to \"long\" form."
"As before the `pd.melt()` function is used to transform the data from \"wide\" to \"long\" form."
]
},
{
@@ -1039,9 +1041,9 @@
"source": [
"molten_billboard = pd.melt(\n",
" billboard,\n",
" id_vars=['year', 'artist', 'track', 'time', 'date.entered'],\n",
" var_name='week',\n",
" value_name='rank',\n",
" id_vars=[\"year\", \"artist\", \"track\", \"time\", \"date.entered\"],\n",
" var_name=\"week\",\n",
" value_name=\"rank\",\n",
")"
]
},
@@ -1049,7 +1051,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"In contrast to R, pandas keeps (unneccesary) rows for weeks where the song was already out of the charts. These are discarded. Also, a new column *date* indicating when exactly a particular song was at a certain rank in the charts is added."
"In contrast to R, pandas keeps (unneccesary) rows for weeks where the song was already out of the charts. These are discarded. Also, a new column`\"date\"` indicating when exactly a particular song was at a certain rank in the charts is added."
]
},
{
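
The week-to-date arithmetic in the next cell is plain `timedelta` math; a minimal sketch on a single toy value (the entry date is taken from the sample output above):

```python
import datetime

import pandas as pd

week = int("wk3"[2:])  # "wk3" -> 3, as the .map() below does column-wise

# A song that entered the charts on 2000-02-26 holds its week-3 rank
# two weeks after entering.
date_entered = pd.Timestamp("2000-02-26")
print(date_entered + (week - 1) * datetime.timedelta(weeks=1))  # 2000-03-11
```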
@@ -1059,24 +1061,23 @@
"outputs": [],
"source": [
"# pandas keeps \"wide\" variables that had missing values as rows.\n",
"molten_billboard = molten_billboard[molten_billboard['rank'].notnull()]\n",
"molten_billboard = molten_billboard[molten_billboard[\"rank\"].notnull()]\n",
"\n",
"# Cast as integer after missing values are removed.\n",
"molten_billboard['week'] = molten_billboard['week'].map(lambda x: int(x[2:]))\n",
"molten_billboard['rank'] = molten_billboard['rank'].map(int)\n",
"molten_billboard[\"week\"] = molten_billboard[\"week\"].map(lambda x: int(x[2:]))\n",
"molten_billboard[\"rank\"] = molten_billboard[\"rank\"].map(int)\n",
"\n",
"# Calculate the actual week from the date of first entering the list.\n",
"molten_billboard = molten_billboard.assign(\n",
" date=lambda x: x['date.entered'] + (x['week'] - 1) * datetime.timedelta(weeks=1)\n",
" date=lambda x: x[\"date.entered\"] + (x[\"week\"] - 1) * datetime.timedelta(weeks=1)\n",
")\n",
"\n",
"# Sort rows and columns as in the paper.\n",
"molten_billboard = molten_billboard[\n",
" ['year', 'artist', 'time', 'track', 'date', 'week', 'rank']\n",
" [\"year\", \"artist\", \"time\", \"track\", \"date\", \"week\", \"rank\"]\n",
"]\n",
"molten_billboard = (\n",
" molten_billboard.sort_values(['artist', 'track', 'week']).reset_index(drop=True)\n",
")"
"molten_billboard = molten_billboard.sort_values([\"artist\", \"track\", \"week\"])\n",
"molten_billboard = molten_billboard.reset_index(drop=True)"
]
},
{
@@ -1336,7 +1337,7 @@
"metadata": {},
"outputs": [],
"source": [
"molten_billboard.to_csv('data/billboard_cleaned.csv', index=False)"
"molten_billboard.to_csv(\"data/billboard_cleaned.csv\", index=False)"
]
}
],
@@ -1356,9 +1357,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
"version": "3.7.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
"nbformat_minor": 4
}