
Update the project for 2020

- replace pipenv with poetry
- update the README.md:
  * streamline the text
  * update links to notebooks with nbviewer
  * update installation notes with poetry info
- streamline the notebooks:
  * use backticks in Markdown cells to make references to
    columns in DataFrames clearer
  * blacken all code cells
- add MIT license
- ignore .venv/ and .python-version
Alexander Hess 2020-08-26 00:07:58 +02:00
commit a3a17236a2
Signed by: alexander
GPG key ID: 344EA5AB10D868E0
13 changed files with 1975 additions and 1158 deletions


@@ -6,7 +6,7 @@
"source": [
"# Column Headers are Values, not Variable Names\n",
"\n",
"This notebook shows two examples of how column headers display values. These type of messy datasets have practical use in two types of settings:\n",
"This notebook shows two examples of how column headers display values. These type of messy datasets have practical use in two kinds of settings:\n",
"\n",
"1. Presentations\n",
"2. Recordings of regularly spaced observations over time"
@@ -23,24 +23,9 @@
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2018-08-26 14:39:56 CEST\n",
"\n",
"CPython 3.6.5\n",
"IPython 6.5.0\n",
"\n",
"numpy 1.15.1\n",
"pandas 0.23.4\n"
]
}
],
"outputs": [],
"source": [
"% load_ext watermark\n",
"% watermark -d -t -v -z -p numpy,pandas"
"%load_ext lab_black"
]
},
{
@@ -90,32 +75,35 @@
"metadata": {},
"outputs": [],
"source": [
"columns = ['q16', 'reltrad', 'income']\n",
"columns = [\"q16\", \"reltrad\", \"income\"]\n",
"encodings = {}\n",
"\n",
"# For sake of simplicity all data cleaning operations\n",
"# For the sake of simplicity, all data cleaning operations\n",
"# are done within the for-loop for all columns.\n",
"with spss.SavHeaderReader('data/pew.sav') as pew:\n",
" for c in columns:\n",
" encodings[c] = {\n",
" int(k): (\n",
" re.sub(r'\\(.*\\)', '', (\n",
" v.decode('iso-8859-1')\n",
" .replace('\\x92', \"'\")\n",
" .replace(' Churches', '')\n",
" .replace('Less than $10,000', '<$10k')\n",
" .replace('10 to under $20,000', '$10-20k')\n",
" .replace('20 to under $30,000', '$20-30k')\n",
" .replace('30 to under $40,000', '$30-40k')\n",
" .replace('40 to under $50,000', '$40-50k')\n",
" .replace('50 to under $75,000', '$50-75k')\n",
" .replace('75 to under $100,000', '$75-100k')\n",
" .replace('100 to under $150,000', '$100-150k')\n",
" .replace('$150,000 or more', '>150k')\n",
"with spss.SavHeaderReader(\"data/pew.sav\") as pew:\n",
" for column in columns:\n",
" encodings[column] = {\n",
" int(key): (\n",
" re.sub(\n",
" r\"\\(.*\\)\",\n",
" \"\",\n",
" (\n",
" value.decode(\"iso-8859-1\")\n",
" .replace(\"\\x92\", \"'\")\n",
" .replace(\" Churches\", \"\")\n",
" .replace(\"Less than $10,000\", \"<$10k\")\n",
" .replace(\"10 to under $20,000\", \"$10-20k\")\n",
" .replace(\"20 to under $30,000\", \"$20-30k\")\n",
" .replace(\"30 to under $40,000\", \"$30-40k\")\n",
" .replace(\"40 to under $50,000\", \"$40-50k\")\n",
" .replace(\"50 to under $75,000\", \"$50-75k\")\n",
" .replace(\"75 to under $100,000\", \"$75-100k\")\n",
" .replace(\"100 to under $150,000\", \"$100-150k\")\n",
" .replace(\"$150,000 or more\", \">150k\")\n",
" ),\n",
" ).strip()\n",
" )\n",
" for (k, v) in pew.all().valueLabels[c.encode()].items()\n",
" for (key, value) in pew.all().valueLabels[column.encode()].items()\n",
" }"
]
},
@@ -132,25 +120,36 @@
"metadata": {},
"outputs": [],
"source": [
"with spss.SavReader('data/pew.sav', selectVars=[c.encode() for c in columns]) as pew:\n",
"with spss.SavReader(\n",
" \"data/pew.sav\", selectVars=[column.encode() for column in columns]\n",
") as pew:\n",
" pew = list(pew)\n",
"\n",
"# Use the above encodings to map the numeric data\n",
"# to the actual labels.\n",
"pew = pd.DataFrame(pew, columns=columns, dtype=int)\n",
"for c in columns:\n",
" pew[c] = pew[c].map(encodings[c])\n",
"for column in columns:\n",
" pew[column] = pew[column].map(encodings[column])\n",
"\n",
"for v in ('Atheist', 'Agnostic'):\n",
" pew.loc[(pew['q16'] == v), 'reltrad'] = v\n",
"for value in (\"Atheist\", \"Agnostic\"):\n",
" pew.loc[(pew[\"q16\"] == value), \"reltrad\"] = value\n",
"\n",
"income_columns = ['<$10k', '$10-20k', '$20-30k', '$30-40k', '$40-50k', '$50-75k',\n",
" '$75-100k', '$100-150k', '>150k', 'Don\\'t know/Refused']\n",
"\n",
"pew = pew.groupby(['reltrad', 'income']).size().unstack('income')\n",
"income_columns = [\n",
" \"<$10k\",\n",
" \"$10-20k\",\n",
" \"$20-30k\",\n",
" \"$30-40k\",\n",
" \"$40-50k\",\n",
" \"$50-75k\",\n",
" \"$75-100k\",\n",
" \"$100-150k\",\n",
" \">150k\",\n",
" \"Don't know/Refused\",\n",
"]\n",
"\n",
"pew = pew.groupby([\"reltrad\", \"income\"]).size().unstack(\"income\")\n",
"pew = pew[income_columns]\n",
"pew.index.name = 'religion'"
"pew.index.name = \"religion\""
]
},
{
@@ -426,9 +425,9 @@
"\n",
"> This dataset has **three** variables, **religion**, **income** and **frequency**. To tidy it, we need to **melt**, or stack it. In other words, we need to turn columns into rows.\n",
"\n",
"pandas provides a [pd.melt](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.melt.html) function to un-pivot the dataset.\n",
"pandas provides a [pd.melt()](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.melt.html) function to un-pivot the dataset.\n",
"\n",
"**Notes:** *reset_index()* transforms the religion index column into a data column (*pd.melt()* needs that). Further, the resulting table is sorted implicitly by the *religion* column. To get to the same ordering as in the paper, the molten table is explicitly sorted."
"**Notes:** `.reset_index()` transforms the religion index column into a data column (`pd.melt()` needs that). Further, the resulting table is sorted implicitly by the `\"religion\"` column. To get to the same ordering as in the paper, the molten table is explicitly sorted."
]
},
{
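
A minimal sketch of both notes, on a toy frame shaped like `pew` at this point (made-up counts; the ordered categorical mirrors what the next cells do with `income_columns`):

```python
import pandas as pd

pew_toy = pd.DataFrame(
    {"<$10k": [27, 12], "$10-20k": [34, 27]},
    index=pd.Index(["Agnostic", "Atheist"], name="religion"),
)

# .reset_index() turns the "religion" index into a regular column,
# which pd.melt() needs as an identifier variable.
molten = pd.melt(pew_toy.reset_index(), id_vars=["religion"], value_name="frequency")

# An ordered categorical makes .sort_values() respect the bracket
# order instead of sorting the labels alphabetically.
income_dtype = pd.api.types.CategoricalDtype(["<$10k", "$10-20k"], ordered=True)
molten["variable"] = molten["variable"].astype(income_dtype)
molten = molten.sort_values(["religion", "variable"]).reset_index(drop=True)
print(molten)
```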
@@ -437,7 +436,7 @@
"metadata": {},
"outputs": [],
"source": [
"molten_pew = pd.melt(pew.reset_index(), id_vars=['religion'], value_name='frequency')"
"molten_pew = pd.melt(pew.reset_index(), id_vars=[\"religion\"], value_name=\"frequency\")"
]
},
{
@@ -448,8 +447,8 @@
"source": [
"# Create a ordered column for the income labels.\n",
"income_dtype = pd.api.types.CategoricalDtype(income_columns, ordered=True)\n",
"molten_pew['income'] = molten_pew['income'].astype(income_dtype)\n",
"molten_pew = molten_pew.sort_values(['religion', 'income']).reset_index(drop=True)"
"molten_pew[\"income\"] = molten_pew[\"income\"].astype(income_dtype)\n",
"molten_pew = molten_pew.sort_values([\"religion\", \"income\"]).reset_index(drop=True)"
]
},
{
@@ -616,37 +615,40 @@
"outputs": [],
"source": [
"# Usage of \"1st\", \"2nd\", \"3rd\" should be forbidden by law :)\n",
"usecols = ['artist.inverted', 'track', 'time', 'date.entered'] + (\n",
" [f'x{i}st.week' for i in range(1, 76, 10) if i != 11]\n",
" + [f'x{i}nd.week' for i in range(2, 76, 10) if i != 12]\n",
" + [f'x{i}rd.week' for i in range(3, 76, 10) if i != 13]\n",
" + [f'x{i}th.week' for i in range(1, 76) if (i % 10) not in (1, 2, 3)]\n",
" + [f'x11th.week', f'x12th.week', f'x13th.week']\n",
"usecols = [\"artist.inverted\", \"track\", \"time\", \"date.entered\"] + (\n",
" [f\"x{i}st.week\" for i in range(1, 76, 10) if i != 11]\n",
" + [f\"x{i}nd.week\" for i in range(2, 76, 10) if i != 12]\n",
" + [f\"x{i}rd.week\" for i in range(3, 76, 10) if i != 13]\n",
" + [f\"x{i}th.week\" for i in range(1, 76) if (i % 10) not in (1, 2, 3)]\n",
" + [f\"x11th.week\", f\"x12th.week\", f\"x13th.week\"]\n",
")\n",
"billboard = pd.read_csv(\n",
" \"data/billboard.csv\",\n",
" encoding=\"iso-8859-1\",\n",
" parse_dates=[\"date.entered\"],\n",
" usecols=usecols,\n",
")\n",
"\n",
"billboard = pd.read_csv('data/billboard.csv', encoding='iso-8859-1',\n",
" parse_dates=['date.entered'], usecols=usecols)\n",
"\n",
"billboard = billboard.assign(year=lambda x: x['date.entered'].dt.year)\n",
"billboard = billboard.assign(year=lambda x: x[\"date.entered\"].dt.year)\n",
"\n",
"# Rename the week columns.\n",
"week_columns = {\n",
" c: ('wk' + re.sub(r'[^\\d]+', '', c))\n",
" for c in billboard.columns\n",
" if c.endswith('.week')\n",
" column: (\"wk\" + re.sub(r\"[^\\d]+\", \"\", column))\n",
" for column in billboard.columns\n",
" if column.endswith(\".week\")\n",
"}\n",
"billboard = billboard.rename(columns={'artist.inverted': 'artist', **week_columns})\n",
"billboard = billboard.rename(columns={\"artist.inverted\": \"artist\", **week_columns})\n",
"\n",
"# Ensure the columns' order is the same as in the paper.\n",
"columns = ['year', 'artist', 'track', 'time', 'date.entered'] + [\n",
" f'wk{i}' for i in range(1, 76)\n",
"columns = [\"year\", \"artist\", \"track\", \"time\", \"date.entered\"] + [\n",
" f\"wk{i}\" for i in range(1, 76)\n",
"]\n",
"billboard = billboard[columns]\n",
"\n",
"# Ensure the rows' order is similar as in the paper.\n",
"# For unknown reasons the exact ordering as in the paper cannot be reconstructed.\n",
"billboard = billboard[billboard['year'] == 2000]\n",
"billboard = billboard.sort_values(['artist', 'track'])"
"billboard = billboard[billboard[\"year\"] == 2000]\n",
"billboard = billboard.sort_values([\"artist\", \"track\"])"
]
},
{
@@ -986,17 +988,17 @@
"14 2000 Aaliyah Try Again \n",
"200 2000 Adams, Yolanda Open My Heart \n",
"\n",
" time date.entered wk1 wk2 wk3 wk4 wk5 ... wk66 wk67 wk68 \\\n",
"246 4:22 2000-02-26 87 82.0 72.0 77.0 87.0 ... NaN NaN NaN \n",
"287 3:15 2000-09-02 91 87.0 92.0 NaN NaN ... NaN NaN NaN \n",
"24 3:53 2000-04-08 81 70.0 68.0 67.0 66.0 ... NaN NaN NaN \n",
"193 4:24 2000-10-21 76 76.0 72.0 69.0 67.0 ... NaN NaN NaN \n",
"69 3:35 2000-04-15 57 34.0 25.0 17.0 17.0 ... NaN NaN NaN \n",
"22 3:24 2000-08-19 51 39.0 34.0 26.0 26.0 ... NaN NaN NaN \n",
"304 3:44 2000-07-08 97 97.0 96.0 95.0 100.0 ... NaN NaN NaN \n",
"135 4:15 2000-01-29 84 62.0 51.0 41.0 38.0 ... NaN NaN NaN \n",
"14 4:03 2000-03-18 59 53.0 38.0 28.0 21.0 ... NaN NaN NaN \n",
"200 5:30 2000-08-26 76 76.0 74.0 69.0 68.0 ... NaN NaN NaN \n",
" time date.entered wk1 wk2 wk3 wk4 wk5 ... wk66 wk67 wk68 \\\n",
"246 4:22 2000-02-26 87 82.0 72.0 77.0 87.0 ... NaN NaN NaN \n",
"287 3:15 2000-09-02 91 87.0 92.0 NaN NaN ... NaN NaN NaN \n",
"24 3:53 2000-04-08 81 70.0 68.0 67.0 66.0 ... NaN NaN NaN \n",
"193 4:24 2000-10-21 76 76.0 72.0 69.0 67.0 ... NaN NaN NaN \n",
"69 3:35 2000-04-15 57 34.0 25.0 17.0 17.0 ... NaN NaN NaN \n",
"22 3:24 2000-08-19 51 39.0 34.0 26.0 26.0 ... NaN NaN NaN \n",
"304 3:44 2000-07-08 97 97.0 96.0 95.0 100.0 ... NaN NaN NaN \n",
"135 4:15 2000-01-29 84 62.0 51.0 41.0 38.0 ... NaN NaN NaN \n",
"14 4:03 2000-03-18 59 53.0 38.0 28.0 21.0 ... NaN NaN NaN \n",
"200 5:30 2000-08-26 76 76.0 74.0 69.0 68.0 ... NaN NaN NaN \n",
"\n",
" wk69 wk70 wk71 wk72 wk73 wk74 wk75 \n",
"246 NaN NaN NaN NaN NaN NaN NaN \n",
@@ -1028,7 +1030,7 @@
"source": [
"### \"Tidy\" Data\n",
"\n",
"As before the *pd.melt* function is used to transform the data from \"wide\" to \"long\" form."
"As before the `pd.melt()` function is used to transform the data from \"wide\" to \"long\" form."
]
},
{
@@ -1039,9 +1041,9 @@
"source": [
"molten_billboard = pd.melt(\n",
" billboard,\n",
" id_vars=['year', 'artist', 'track', 'time', 'date.entered'],\n",
" var_name='week',\n",
" value_name='rank',\n",
" id_vars=[\"year\", \"artist\", \"track\", \"time\", \"date.entered\"],\n",
" var_name=\"week\",\n",
" value_name=\"rank\",\n",
")"
]
},
@@ -1049,7 +1051,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"In contrast to R, pandas keeps (unneccesary) rows for weeks where the song was already out of the charts. These are discarded. Also, a new column *date* indicating when exactly a particular song was at a certain rank in the charts is added."
"In contrast to R, pandas keeps (unneccesary) rows for weeks where the song was already out of the charts. These are discarded. Also, a new column`\"date\"` indicating when exactly a particular song was at a certain rank in the charts is added."
]
},
{
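
The week-to-date arithmetic in the next cell is plain `timedelta` math; a minimal sketch on a single toy value (the entry date is taken from the sample output above):

```python
import datetime

import pandas as pd

week = int("wk3"[2:])  # "wk3" -> 3, as the .map() below does column-wise

# A song that entered the charts on 2000-02-26 holds its week-3 rank
# two weeks after entering.
date_entered = pd.Timestamp("2000-02-26")
print(date_entered + (week - 1) * datetime.timedelta(weeks=1))  # 2000-03-11
```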
@@ -1059,24 +1061,23 @@
"outputs": [],
"source": [
"# pandas keeps \"wide\" variables that had missing values as rows.\n",
"molten_billboard = molten_billboard[molten_billboard['rank'].notnull()]\n",
"molten_billboard = molten_billboard[molten_billboard[\"rank\"].notnull()]\n",
"\n",
"# Cast as integer after missing values are removed.\n",
"molten_billboard['week'] = molten_billboard['week'].map(lambda x: int(x[2:]))\n",
"molten_billboard['rank'] = molten_billboard['rank'].map(int)\n",
"molten_billboard[\"week\"] = molten_billboard[\"week\"].map(lambda x: int(x[2:]))\n",
"molten_billboard[\"rank\"] = molten_billboard[\"rank\"].map(int)\n",
"\n",
"# Calculate the actual week from the date of first entering the list.\n",
"molten_billboard = molten_billboard.assign(\n",
" date=lambda x: x['date.entered'] + (x['week'] - 1) * datetime.timedelta(weeks=1)\n",
" date=lambda x: x[\"date.entered\"] + (x[\"week\"] - 1) * datetime.timedelta(weeks=1)\n",
")\n",
"\n",
"# Sort rows and columns as in the paper.\n",
"molten_billboard = molten_billboard[\n",
" ['year', 'artist', 'time', 'track', 'date', 'week', 'rank']\n",
" [\"year\", \"artist\", \"time\", \"track\", \"date\", \"week\", \"rank\"]\n",
"]\n",
"molten_billboard = (\n",
" molten_billboard.sort_values(['artist', 'track', 'week']).reset_index(drop=True)\n",
")"
"molten_billboard = molten_billboard.sort_values([\"artist\", \"track\", \"week\"])\n",
"molten_billboard = molten_billboard.reset_index(drop=True)"
]
},
{
@@ -1336,7 +1337,7 @@
"metadata": {},
"outputs": [],
"source": [
"molten_billboard.to_csv('data/billboard_cleaned.csv', index=False)"
"molten_billboard.to_csv(\"data/billboard_cleaned.csv\", index=False)"
]
}
],
@@ -1356,9 +1357,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
"version": "3.7.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
"nbformat_minor": 4
}