Update the project for 2020
- replace pipenv with poetry
- update the README.md:
* streamline the text
* update links to notebooks with nbviewer
* update installation notes with poetry info
- streamline the notebooks:
* use backticks in Markdown cells to make references to
columns in DataFrames clearer
* blacken all code cells (see the formatting sketch below)
- add MIT license
- ignore .venv/ and .python-version
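A rough before/after of what blackening a code cell means, using black's `format_str` API on an invented snippet (not a cell from this repository):

```python
# Sketch: black normalizes quotes and spacing; FileMode() selects the
# default style. Requires the black package.
import black

messy = "columns = ['q16','reltrad','income']\nencodings={ 'a':1 }\n"
print(black.format_str(messy, mode=black.FileMode()))
# columns = ["q16", "reltrad", "income"]
# encodings = {"a": 1}
```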
parent 4cec950887
commit a3a17236a2
13 changed files with 1975 additions and 1158 deletions
@@ -6,7 +6,7 @@
    "source": [
     "# Column Headers are Values, not Variable Names\n",
     "\n",
-    "This notebook shows two examples of how column headers display values. These type of messy datasets have practical use in two types of settings:\n",
+    "This notebook shows two examples of how column headers display values. These types of messy datasets have practical use in two kinds of settings:\n",
     "\n",
     "1. Presentations\n",
     "2. Recordings of regularly spaced observations over time"
@@ -23,24 +23,9 @@
    "cell_type": "code",
    "execution_count": 1,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "2018-08-26 14:39:56 CEST\n",
-      "\n",
-      "CPython 3.6.5\n",
-      "IPython 6.5.0\n",
-      "\n",
-      "numpy 1.15.1\n",
-      "pandas 0.23.4\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
-    "% load_ext watermark\n",
-    "% watermark -d -t -v -z -p numpy,pandas"
+    "%load_ext lab_black"
    ]
   },
   {
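With watermark gone, the version banner in the removed output above is no longer printed; lab_black instead hooks into the kernel and re-formats every executed cell with black. A plain-Python sketch that approximates the old watermark report, assuming numpy and pandas are importable:

```python
# Prints roughly what `%watermark -v -p numpy,pandas` used to show.
import sys

import numpy
import pandas

print(f"CPython {sys.version.split()[0]}")
print(f"numpy {numpy.__version__}")
print(f"pandas {pandas.__version__}")
```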
@@ -90,32 +75,35 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "columns = ['q16', 'reltrad', 'income']\n",
+    "columns = [\"q16\", \"reltrad\", \"income\"]\n",
     "encodings = {}\n",
     "\n",
-    "# For sake of simplicity all data cleaning operations\n",
+    "# For the sake of simplicity, all data cleaning operations\n",
     "# are done within the for-loop for all columns.\n",
-    "with spss.SavHeaderReader('data/pew.sav') as pew:\n",
-    "    for c in columns:\n",
-    "        encodings[c] = {\n",
-    "            int(k): (\n",
-    "                re.sub(r'\\(.*\\)', '', (\n",
-    "                    v.decode('iso-8859-1')\n",
-    "                    .replace('\\x92', \"'\")\n",
-    "                    .replace(' Churches', '')\n",
-    "                    .replace('Less than $10,000', '<$10k')\n",
-    "                    .replace('10 to under $20,000', '$10-20k')\n",
-    "                    .replace('20 to under $30,000', '$20-30k')\n",
-    "                    .replace('30 to under $40,000', '$30-40k')\n",
-    "                    .replace('40 to under $50,000', '$40-50k')\n",
-    "                    .replace('50 to under $75,000', '$50-75k')\n",
-    "                    .replace('75 to under $100,000', '$75-100k')\n",
-    "                    .replace('100 to under $150,000', '$100-150k')\n",
-    "                    .replace('$150,000 or more', '>150k')\n",
+    "with spss.SavHeaderReader(\"data/pew.sav\") as pew:\n",
+    "    for column in columns:\n",
+    "        encodings[column] = {\n",
+    "            int(key): (\n",
+    "                re.sub(\n",
+    "                    r\"\\(.*\\)\",\n",
+    "                    \"\",\n",
+    "                    (\n",
+    "                        value.decode(\"iso-8859-1\")\n",
+    "                        .replace(\"\\x92\", \"'\")\n",
+    "                        .replace(\" Churches\", \"\")\n",
+    "                        .replace(\"Less than $10,000\", \"<$10k\")\n",
+    "                        .replace(\"10 to under $20,000\", \"$10-20k\")\n",
+    "                        .replace(\"20 to under $30,000\", \"$20-30k\")\n",
+    "                        .replace(\"30 to under $40,000\", \"$30-40k\")\n",
+    "                        .replace(\"40 to under $50,000\", \"$40-50k\")\n",
+    "                        .replace(\"50 to under $75,000\", \"$50-75k\")\n",
+    "                        .replace(\"75 to under $100,000\", \"$75-100k\")\n",
+    "                        .replace(\"100 to under $150,000\", \"$100-150k\")\n",
+    "                        .replace(\"$150,000 or more\", \">150k\")\n",
     "                    ),\n",
     "                ).strip()\n",
     "            )\n",
-    "            for (k, v) in pew.all().valueLabels[c.encode()].items()\n",
+    "            for (key, value) in pew.all().valueLabels[column.encode()].items()\n",
     "        }"
    ]
   },
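The cell above needs `data/pew.sav` and the savReaderWriter package. The cleaning pattern itself (decode bytes, drop parenthesized suffixes with `re.sub`, strip whitespace, all inside a dict comprehension) can be shown on made-up value labels:

```python
import re

# Hypothetical stand-in for pew.all().valueLabels[column.encode()].items().
raw_labels = {1: b"Atheist (not practicing)", 2: b"Less than $10,000"}
encodings = {
    int(key): re.sub(r"\(.*\)", "", value.decode("iso-8859-1")).strip()
    for key, value in raw_labels.items()
}
print(encodings)  # {1: 'Atheist', 2: 'Less than $10,000'}
```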
@@ -132,25 +120,36 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "with spss.SavReader('data/pew.sav', selectVars=[c.encode() for c in columns]) as pew:\n",
+    "with spss.SavReader(\n",
+    "    \"data/pew.sav\", selectVars=[column.encode() for column in columns]\n",
+    ") as pew:\n",
     "    pew = list(pew)\n",
     "\n",
     "# Use the above encodings to map the numeric data\n",
     "# to the actual labels.\n",
     "pew = pd.DataFrame(pew, columns=columns, dtype=int)\n",
-    "for c in columns:\n",
-    "    pew[c] = pew[c].map(encodings[c])\n",
+    "for column in columns:\n",
+    "    pew[column] = pew[column].map(encodings[column])\n",
     "\n",
-    "for v in ('Atheist', 'Agnostic'):\n",
-    "    pew.loc[(pew['q16'] == v), 'reltrad'] = v\n",
+    "for value in (\"Atheist\", \"Agnostic\"):\n",
+    "    pew.loc[(pew[\"q16\"] == value), \"reltrad\"] = value\n",
     "\n",
-    "income_columns = ['<$10k', '$10-20k', '$20-30k', '$30-40k', '$40-50k', '$50-75k',\n",
-    "                  '$75-100k', '$100-150k', '>150k', 'Don\\'t know/Refused']\n",
-    "\n",
-    "pew = pew.groupby(['reltrad', 'income']).size().unstack('income')\n",
+    "income_columns = [\n",
+    "    \"<$10k\",\n",
+    "    \"$10-20k\",\n",
+    "    \"$20-30k\",\n",
+    "    \"$30-40k\",\n",
+    "    \"$40-50k\",\n",
+    "    \"$50-75k\",\n",
+    "    \"$75-100k\",\n",
+    "    \"$100-150k\",\n",
+    "    \">150k\",\n",
+    "    \"Don't know/Refused\",\n",
+    "]\n",
+    "\n",
+    "pew = pew.groupby([\"reltrad\", \"income\"]).size().unstack(\"income\")\n",
     "pew = pew[income_columns]\n",
-    "pew.index.name = 'religion'"
+    "pew.index.name = \"religion\""
    ]
   },
   {
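The core reshaping step above is `groupby(...).size().unstack(...)`, which turns the two label columns into a religion-by-income contingency table. A toy reproduction with invented rows:

```python
import pandas as pd

pew = pd.DataFrame(
    {
        "reltrad": ["Agnostic", "Agnostic", "Atheist"],
        "income": ["<$10k", "$10-20k", "<$10k"],
    }
)
# size() counts rows per (reltrad, income) pair; unstack() pivots the
# income level of the index into columns, leaving NaN for absent pairs.
print(pew.groupby(["reltrad", "income"]).size().unstack("income"))
```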
@@ -426,9 +425,9 @@
     "\n",
     "> This dataset has **three** variables, **religion**, **income** and **frequency**. To tidy it, we need to **melt**, or stack it. In other words, we need to turn columns into rows.\n",
     "\n",
-    "pandas provides a [pd.melt](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.melt.html) function to un-pivot the dataset.\n",
+    "pandas provides a [pd.melt()](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.melt.html) function to un-pivot the dataset.\n",
     "\n",
-    "**Notes:** *reset_index()* transforms the religion index column into a data column (*pd.melt()* needs that). Further, the resulting table is sorted implicitly by the *religion* column. To get to the same ordering as in the paper, the molten table is explicitly sorted."
+    "**Notes:** `.reset_index()` transforms the religion index column into a data column (`pd.melt()` needs that). Further, the resulting table is sorted implicitly by the `\"religion\"` column. To get to the same ordering as in the paper, the molten table is explicitly sorted."
    ]
   },
   {
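A toy illustration of the note above: `pd.melt()` only melts data columns, so the religion index has to become a regular column via `.reset_index()` first (values invented):

```python
import pandas as pd

wide = pd.DataFrame(
    {"<$10k": [27, 12], "$10-20k": [34, 27]},
    index=pd.Index(["Agnostic", "Atheist"], name="religion"),
)
wide.columns.name = "income"  # melt picks this up as the variable name
molten = pd.melt(wide.reset_index(), id_vars=["religion"], value_name="frequency")
print(molten)  # one (religion, income, frequency) row per former cell
```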
@@ -437,7 +436,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "molten_pew = pd.melt(pew.reset_index(), id_vars=['religion'], value_name='frequency')"
+    "molten_pew = pd.melt(pew.reset_index(), id_vars=[\"religion\"], value_name=\"frequency\")"
    ]
   },
   {
@@ -448,8 +447,8 @@
    "source": [
     "# Create a ordered column for the income labels.\n",
     "income_dtype = pd.api.types.CategoricalDtype(income_columns, ordered=True)\n",
-    "molten_pew['income'] = molten_pew['income'].astype(income_dtype)\n",
-    "molten_pew = molten_pew.sort_values(['religion', 'income']).reset_index(drop=True)"
+    "molten_pew[\"income\"] = molten_pew[\"income\"].astype(income_dtype)\n",
+    "molten_pew = molten_pew.sort_values([\"religion\", \"income\"]).reset_index(drop=True)"
    ]
   },
   {
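The ordered categorical above is what makes the final sort follow the income brackets rather than plain string order (which would put "<$10k" last). A minimal sketch with a shortened label list:

```python
import pandas as pd

income_columns = ["<$10k", "$10-20k", "$20-30k"]
income_dtype = pd.api.types.CategoricalDtype(income_columns, ordered=True)
# sort_values() on an ordered categorical respects the declared order.
s = pd.Series(["$20-30k", "<$10k", "$10-20k"]).astype(income_dtype)
print(s.sort_values().tolist())  # ['<$10k', '$10-20k', '$20-30k']
```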
@@ -616,37 +615,40 @@
    "outputs": [],
    "source": [
     "# Usage of \"1st\", \"2nd\", \"3rd\" should be forbidden by law :)\n",
-    "usecols = ['artist.inverted', 'track', 'time', 'date.entered'] + (\n",
-    "    [f'x{i}st.week' for i in range(1, 76, 10) if i != 11]\n",
-    "    + [f'x{i}nd.week' for i in range(2, 76, 10) if i != 12]\n",
-    "    + [f'x{i}rd.week' for i in range(3, 76, 10) if i != 13]\n",
-    "    + [f'x{i}th.week' for i in range(1, 76) if (i % 10) not in (1, 2, 3)]\n",
-    "    + [f'x11th.week', f'x12th.week', f'x13th.week']\n",
+    "usecols = [\"artist.inverted\", \"track\", \"time\", \"date.entered\"] + (\n",
+    "    [f\"x{i}st.week\" for i in range(1, 76, 10) if i != 11]\n",
+    "    + [f\"x{i}nd.week\" for i in range(2, 76, 10) if i != 12]\n",
+    "    + [f\"x{i}rd.week\" for i in range(3, 76, 10) if i != 13]\n",
+    "    + [f\"x{i}th.week\" for i in range(1, 76) if (i % 10) not in (1, 2, 3)]\n",
+    "    + [f\"x11th.week\", f\"x12th.week\", f\"x13th.week\"]\n",
     ")\n",
+    "billboard = pd.read_csv(\n",
+    "    \"data/billboard.csv\",\n",
+    "    encoding=\"iso-8859-1\",\n",
+    "    parse_dates=[\"date.entered\"],\n",
+    "    usecols=usecols,\n",
+    ")\n",
     "\n",
-    "billboard = pd.read_csv('data/billboard.csv', encoding='iso-8859-1',\n",
-    "                        parse_dates=['date.entered'], usecols=usecols)\n",
-    "\n",
-    "billboard = billboard.assign(year=lambda x: x['date.entered'].dt.year)\n",
+    "billboard = billboard.assign(year=lambda x: x[\"date.entered\"].dt.year)\n",
     "\n",
     "# Rename the week columns.\n",
     "week_columns = {\n",
-    "    c: ('wk' + re.sub(r'[^\\d]+', '', c))\n",
-    "    for c in billboard.columns\n",
-    "    if c.endswith('.week')\n",
+    "    column: (\"wk\" + re.sub(r\"[^\\d]+\", \"\", column))\n",
+    "    for column in billboard.columns\n",
+    "    if column.endswith(\".week\")\n",
     "}\n",
-    "billboard = billboard.rename(columns={'artist.inverted': 'artist', **week_columns})\n",
+    "billboard = billboard.rename(columns={\"artist.inverted\": \"artist\", **week_columns})\n",
     "\n",
     "# Ensure the columns' order is the same as in the paper.\n",
-    "columns = ['year', 'artist', 'track', 'time', 'date.entered'] + [\n",
-    "    f'wk{i}' for i in range(1, 76)\n",
+    "columns = [\"year\", \"artist\", \"track\", \"time\", \"date.entered\"] + [\n",
+    "    f\"wk{i}\" for i in range(1, 76)\n",
     "]\n",
     "billboard = billboard[columns]\n",
     "\n",
     "# Ensure the rows' order is similar as in the paper.\n",
     "# For unknown reasons the exact ordering as in the paper cannot be reconstructed.\n",
-    "billboard = billboard[billboard['year'] == 2000]\n",
-    "billboard = billboard.sort_values(['artist', 'track'])"
+    "billboard = billboard[billboard[\"year\"] == 2000]\n",
+    "billboard = billboard.sort_values([\"artist\", \"track\"])"
    ]
   },
   {
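The four comprehensions above encode the English ordinal suffixes baked into the raw column names ("x1st.week", "x2nd.week", ..., with "x11th.week" through "x13th.week" as exceptions). The same rule as a hypothetical helper (`week_column` is not part of the notebook):

```python
def week_column(i: int) -> str:
    # English ordinals: 1st/2nd/3rd, except 11th, 12th, 13th.
    if i % 10 == 1 and i != 11:
        return f"x{i}st.week"
    if i % 10 == 2 and i != 12:
        return f"x{i}nd.week"
    if i % 10 == 3 and i != 13:
        return f"x{i}rd.week"
    return f"x{i}th.week"

print([week_column(i) for i in (1, 2, 3, 11, 13, 21, 75)])
# ['x1st.week', 'x2nd.week', 'x3rd.week', 'x11th.week', 'x13th.week',
#  'x21st.week', 'x75th.week']
```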
@@ -986,17 +988,17 @@
     "14 2000 Aaliyah Try Again \n",
     "200 2000 Adams, Yolanda Open My Heart \n",
     "\n",
-    " time date.entered wk1 wk2 wk3 wk4 wk5 ... wk66 wk67 wk68 \\\n",
-    "246 4:22 2000-02-26 87 82.0 72.0 77.0 87.0 ... NaN NaN NaN \n",
-    "287 3:15 2000-09-02 91 87.0 92.0 NaN NaN ... NaN NaN NaN \n",
-    "24 3:53 2000-04-08 81 70.0 68.0 67.0 66.0 ... NaN NaN NaN \n",
-    "193 4:24 2000-10-21 76 76.0 72.0 69.0 67.0 ... NaN NaN NaN \n",
-    "69 3:35 2000-04-15 57 34.0 25.0 17.0 17.0 ... NaN NaN NaN \n",
-    "22 3:24 2000-08-19 51 39.0 34.0 26.0 26.0 ... NaN NaN NaN \n",
-    "304 3:44 2000-07-08 97 97.0 96.0 95.0 100.0 ... NaN NaN NaN \n",
-    "135 4:15 2000-01-29 84 62.0 51.0 41.0 38.0 ... NaN NaN NaN \n",
-    "14 4:03 2000-03-18 59 53.0 38.0 28.0 21.0 ... NaN NaN NaN \n",
-    "200 5:30 2000-08-26 76 76.0 74.0 69.0 68.0 ... NaN NaN NaN \n",
+    " time date.entered wk1 wk2 wk3 wk4 wk5 ... wk66 wk67 wk68 \\\n",
+    "246 4:22 2000-02-26 87 82.0 72.0 77.0 87.0 ... NaN NaN NaN \n",
+    "287 3:15 2000-09-02 91 87.0 92.0 NaN NaN ... NaN NaN NaN \n",
+    "24 3:53 2000-04-08 81 70.0 68.0 67.0 66.0 ... NaN NaN NaN \n",
+    "193 4:24 2000-10-21 76 76.0 72.0 69.0 67.0 ... NaN NaN NaN \n",
+    "69 3:35 2000-04-15 57 34.0 25.0 17.0 17.0 ... NaN NaN NaN \n",
+    "22 3:24 2000-08-19 51 39.0 34.0 26.0 26.0 ... NaN NaN NaN \n",
+    "304 3:44 2000-07-08 97 97.0 96.0 95.0 100.0 ... NaN NaN NaN \n",
+    "135 4:15 2000-01-29 84 62.0 51.0 41.0 38.0 ... NaN NaN NaN \n",
+    "14 4:03 2000-03-18 59 53.0 38.0 28.0 21.0 ... NaN NaN NaN \n",
+    "200 5:30 2000-08-26 76 76.0 74.0 69.0 68.0 ... NaN NaN NaN \n",
     "\n",
     " wk69 wk70 wk71 wk72 wk73 wk74 wk75 \n",
     "246 NaN NaN NaN NaN NaN NaN NaN \n",
@@ -1028,7 +1030,7 @@
    "source": [
     "### \"Tidy\" Data\n",
     "\n",
-    "As before the *pd.melt* function is used to transform the data from \"wide\" to \"long\" form."
+    "As before the `pd.melt()` function is used to transform the data from \"wide\" to \"long\" form."
    ]
   },
   {
@@ -1039,9 +1041,9 @@
    "source": [
     "molten_billboard = pd.melt(\n",
     "    billboard,\n",
-    "    id_vars=['year', 'artist', 'track', 'time', 'date.entered'],\n",
-    "    var_name='week',\n",
-    "    value_name='rank',\n",
+    "    id_vars=[\"year\", \"artist\", \"track\", \"time\", \"date.entered\"],\n",
+    "    var_name=\"week\",\n",
+    "    value_name=\"rank\",\n",
     ")"
    ]
   },
@@ -1049,7 +1051,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "In contrast to R, pandas keeps (unneccesary) rows for weeks where the song was already out of the charts. These are discarded. Also, a new column *date* indicating when exactly a particular song was at a certain rank in the charts is added."
+    "In contrast to R, pandas keeps (unnecessary) rows for weeks when the song was already out of the charts. These are discarded. Also, a new column `\"date\"` indicating exactly when a particular song held a certain rank in the charts is added."
    ]
   },
   {
@@ -1059,24 +1061,23 @@
    "outputs": [],
    "source": [
     "# pandas keeps \"wide\" variables that had missing values as rows.\n",
-    "molten_billboard = molten_billboard[molten_billboard['rank'].notnull()]\n",
+    "molten_billboard = molten_billboard[molten_billboard[\"rank\"].notnull()]\n",
     "\n",
     "# Cast as integer after missing values are removed.\n",
-    "molten_billboard['week'] = molten_billboard['week'].map(lambda x: int(x[2:]))\n",
-    "molten_billboard['rank'] = molten_billboard['rank'].map(int)\n",
+    "molten_billboard[\"week\"] = molten_billboard[\"week\"].map(lambda x: int(x[2:]))\n",
+    "molten_billboard[\"rank\"] = molten_billboard[\"rank\"].map(int)\n",
     "\n",
     "# Calculate the actual week from the date of first entering the list.\n",
     "molten_billboard = molten_billboard.assign(\n",
-    "    date=lambda x: x['date.entered'] + (x['week'] - 1) * datetime.timedelta(weeks=1)\n",
+    "    date=lambda x: x[\"date.entered\"] + (x[\"week\"] - 1) * datetime.timedelta(weeks=1)\n",
     ")\n",
     "\n",
     "# Sort rows and columns as in the paper.\n",
     "molten_billboard = molten_billboard[\n",
-    "    ['year', 'artist', 'time', 'track', 'date', 'week', 'rank']\n",
+    "    [\"year\", \"artist\", \"time\", \"track\", \"date\", \"week\", \"rank\"]\n",
     "]\n",
-    "molten_billboard = (\n",
-    "    molten_billboard.sort_values(['artist', 'track', 'week']).reset_index(drop=True)\n",
-    ")"
+    "molten_billboard = molten_billboard.sort_values([\"artist\", \"track\", \"week\"])\n",
+    "molten_billboard = molten_billboard.reset_index(drop=True)"
    ]
   },
   {
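The date arithmetic above places chart week n at (n - 1) weeks after the song's entry date. One toy row, checked by hand:

```python
import datetime

import pandas as pd

row = pd.DataFrame({"date.entered": pd.to_datetime(["2000-02-26"]), "week": [3]})
# Week 3 of the chart run falls 2 weeks after the entry date.
row = row.assign(
    date=lambda x: x["date.entered"] + (x["week"] - 1) * datetime.timedelta(weeks=1)
)
print(row["date"].iloc[0])  # 2000-03-11, two weeks after entering
```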
@@ -1336,7 +1337,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "molten_billboard.to_csv('data/billboard_cleaned.csv', index=False)"
+    "molten_billboard.to_csv(\"data/billboard_cleaned.csv\", index=False)"
    ]
   }
  ],
@@ -1356,9 +1357,9 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.6.5"
+   "version": "3.7.9"
   }
  },
  "nbformat": 4,
- "nbformat_minor": 2
+ "nbformat_minor": 4
 }