From 9257890614b470abfbb640859d379e882b805f58 Mon Sep 17 00:00:00 2001 From: Alexander Hess Date: Mon, 19 Oct 2020 20:12:06 +0200 Subject: [PATCH 01/10] Pin the dependencies - upgrade transient dependencies: + lxml + nbclient + urllib3 --- poetry.lock | 85 ++++++++++++++++++++++++++++------------------------- 1 file changed, 45 insertions(+), 40 deletions(-) diff --git a/poetry.lock b/poetry.lock index 9f17fc7..8e454f7 100644 --- a/poetry.lock +++ b/poetry.lock @@ -475,7 +475,7 @@ test = ["pytest", "requests"] [[package]] name = "lxml" -version = "4.5.2" +version = "4.6.1" description = "Powerful and Pythonic XML processing library combining libxml2/libxslt with the ElementTree API." category = "dev" optional = false @@ -505,8 +505,8 @@ python-versions = "*" [[package]] name = "nbclient" -version = "0.5.0" -description = "A client library for executing notebooks. Formally nbconvert's ExecutePreprocessor." +version = "0.5.1" +description = "A client library for executing notebooks. Formerly nbconvert's ExecutePreprocessor." category = "main" optional = false python-versions = ">=3.6" @@ -920,7 +920,7 @@ test = ["pytest"] [[package]] name = "urllib3" -version = "1.25.10" +version = "1.25.11" description = "HTTP library with thread-safe connection pooling, file post, and more." category = "main" optional = false @@ -928,7 +928,7 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, <4" [package.extras] brotli = ["brotlipy (>=0.6.0)"] -secure = ["certifi", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "pyOpenSSL (>=0.14)", "ipaddress"] +secure = ["pyOpenSSL (>=0.14)", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "certifi", "ipaddress"] socks = ["PySocks (>=1.5.6,<1.5.7 || >1.5.7,<2.0)"] [[package]] @@ -1169,37 +1169,42 @@ jupyterlab-server = [ {file = "jupyterlab_server-1.2.0.tar.gz", hash = "sha256:5431d9dde96659364b7cc877693d5d21e7b80cea7ae3959ecc2b87518e5f5d8c"}, ] lxml = [ - {file = "lxml-4.5.2-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:74f48ec98430e06c1fa8949b49ebdd8d27ceb9df8d3d1c92e1fdc2773f003f20"}, - {file = "lxml-4.5.2-cp27-cp27m-manylinux1_i686.whl", hash = "sha256:e70d4e467e243455492f5de463b72151cc400710ac03a0678206a5f27e79ddef"}, - {file = "lxml-4.5.2-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:7ad7906e098ccd30d8f7068030a0b16668ab8aa5cda6fcd5146d8d20cbaa71b5"}, - {file = "lxml-4.5.2-cp27-cp27m-win32.whl", hash = "sha256:92282c83547a9add85ad658143c76a64a8d339028926d7dc1998ca029c88ea6a"}, - {file = "lxml-4.5.2-cp27-cp27m-win_amd64.whl", hash = "sha256:05a444b207901a68a6526948c7cc8f9fe6d6f24c70781488e32fd74ff5996e3f"}, - {file = "lxml-4.5.2-cp27-cp27mu-manylinux1_i686.whl", hash = "sha256:94150231f1e90c9595ccc80d7d2006c61f90a5995db82bccbca7944fd457f0f6"}, - {file = "lxml-4.5.2-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:bea760a63ce9bba566c23f726d72b3c0250e2fa2569909e2d83cda1534c79443"}, - {file = "lxml-4.5.2-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:c3f511a3c58676147c277eff0224c061dd5a6a8e1373572ac817ac6324f1b1e0"}, - {file = "lxml-4.5.2-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:59daa84aef650b11bccd18f99f64bfe44b9f14a08a28259959d33676554065a1"}, - {file = "lxml-4.5.2-cp35-cp35m-manylinux2014_aarch64.whl", hash = "sha256:c9d317efde4bafbc1561509bfa8a23c5cab66c44d49ab5b63ff690f5159b2304"}, - {file = "lxml-4.5.2-cp35-cp35m-win32.whl", hash = "sha256:9dc9006dcc47e00a8a6a029eb035c8f696ad38e40a27d073a003d7d1443f5d88"}, - {file = "lxml-4.5.2-cp35-cp35m-win_amd64.whl", hash = "sha256:08fc93257dcfe9542c0a6883a25ba4971d78297f63d7a5a26ffa34861ca78730"}, - {file = "lxml-4.5.2-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:121b665b04083a1e85ff1f5243d4a93aa1aaba281bc12ea334d5a187278ceaf1"}, - {file = "lxml-4.5.2-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:5591c4164755778e29e69b86e425880f852464a21c7bb53c7ea453bbe2633bbe"}, - {file = "lxml-4.5.2-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:cc411ad324a4486b142c41d9b2b6a722c534096963688d879ea6fa8a35028258"}, - {file = "lxml-4.5.2-cp36-cp36m-manylinux2014_aarch64.whl", hash = "sha256:1fa21263c3aba2b76fd7c45713d4428dbcc7644d73dcf0650e9d344e433741b3"}, - {file = "lxml-4.5.2-cp36-cp36m-win32.whl", hash = "sha256:786aad2aa20de3dbff21aab86b2fb6a7be68064cbbc0219bde414d3a30aa47ae"}, - {file = "lxml-4.5.2-cp36-cp36m-win_amd64.whl", hash = "sha256:e1cacf4796b20865789083252186ce9dc6cc59eca0c2e79cca332bdff24ac481"}, - {file = "lxml-4.5.2-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:80a38b188d20c0524fe8959c8ce770a8fdf0e617c6912d23fc97c68301bb9aba"}, - {file = "lxml-4.5.2-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:ecc930ae559ea8a43377e8b60ca6f8d61ac532fc57efb915d899de4a67928efd"}, - {file = "lxml-4.5.2-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:a76979f728dd845655026ab991df25d26379a1a8fc1e9e68e25c7eda43004bed"}, - {file = "lxml-4.5.2-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:cfd7c5dd3c35c19cec59c63df9571c67c6d6e5c92e0fe63517920e97f61106d1"}, - {file = "lxml-4.5.2-cp37-cp37m-win32.whl", hash = "sha256:5a9c8d11aa2c8f8b6043d845927a51eb9102eb558e3f936df494e96393f5fd3e"}, - {file = "lxml-4.5.2-cp37-cp37m-win_amd64.whl", hash = "sha256:4b4a111bcf4b9c948e020fd207f915c24a6de3f1adc7682a2d92660eb4e84f1a"}, - {file = "lxml-4.5.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:5dd20538a60c4cc9a077d3b715bb42307239fcd25ef1ca7286775f95e9e9a46d"}, - {file = "lxml-4.5.2-cp38-cp38-manylinux1_i686.whl", hash = "sha256:2b30aa2bcff8e958cd85d907d5109820b01ac511eae5b460803430a7404e34d7"}, - {file = "lxml-4.5.2-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:aa8eba3db3d8761db161003e2d0586608092e217151d7458206e243be5a43843"}, - {file = "lxml-4.5.2-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:8f0ec6b9b3832e0bd1d57af41f9238ea7709bbd7271f639024f2fc9d3bb01293"}, - {file = "lxml-4.5.2-cp38-cp38-win32.whl", hash = "sha256:107781b213cf7201ec3806555657ccda67b1fccc4261fb889ef7fc56976db81f"}, - {file = "lxml-4.5.2-cp38-cp38-win_amd64.whl", hash = "sha256:f161af26f596131b63b236372e4ce40f3167c1b5b5d459b29d2514bd8c9dc9ee"}, - {file = "lxml-4.5.2.tar.gz", hash = "sha256:cdc13a1682b2a6241080745b1953719e7fe0850b40a5c71ca574f090a1391df6"}, + {file = "lxml-4.6.1-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:4b7572145054330c8e324a72d808c8c8fbe12be33368db28c39a255ad5f7fb51"}, + {file = "lxml-4.6.1-cp27-cp27m-manylinux1_i686.whl", hash = "sha256:302160eb6e9764168e01d8c9ec6becddeb87776e81d3fcb0d97954dd51d48e0a"}, + {file = "lxml-4.6.1-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:d4ad7fd3269281cb471ad6c7bafca372e69789540d16e3755dd717e9e5c9d82f"}, + {file = "lxml-4.6.1-cp27-cp27m-win32.whl", hash = "sha256:189ad47203e846a7a4951c17694d845b6ade7917c47c64b29b86526eefc3adf5"}, + {file = "lxml-4.6.1-cp27-cp27m-win_amd64.whl", hash = "sha256:56eff8c6fb7bc4bcca395fdff494c52712b7a57486e4fbde34c31bb9da4c6cc4"}, + {file = "lxml-4.6.1-cp27-cp27mu-manylinux1_i686.whl", hash = "sha256:23c83112b4dada0b75789d73f949dbb4e8f29a0a3511647024a398ebd023347b"}, + {file = "lxml-4.6.1-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:0e89f5d422988c65e6936e4ec0fe54d6f73f3128c80eb7ecc3b87f595523607b"}, + {file = "lxml-4.6.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:2358809cc64394617f2719147a58ae26dac9e21bae772b45cfb80baa26bfca5d"}, + {file = "lxml-4.6.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:be1ebf9cc25ab5399501c9046a7dcdaa9e911802ed0e12b7d620cd4bbf0518b3"}, + {file = "lxml-4.6.1-cp35-cp35m-manylinux2014_aarch64.whl", hash = "sha256:4fff34721b628cce9eb4538cf9a73d02e0f3da4f35a515773cce6f5fe413b360"}, + {file = "lxml-4.6.1-cp35-cp35m-win32.whl", hash = "sha256:475325e037fdf068e0c2140b818518cf6bc4aa72435c407a798b2db9f8e90810"}, + {file = "lxml-4.6.1-cp35-cp35m-win_amd64.whl", hash = "sha256:f98b6f256be6cec8dd308a8563976ddaff0bdc18b730720f6f4bee927ffe926f"}, + {file = "lxml-4.6.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:be7c65e34d1b50ab7093b90427cbc488260e4b3a38ef2435d65b62e9fa3d798a"}, + {file = "lxml-4.6.1-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:d18331ea905a41ae71596502bd4c9a2998902328bbabd29e3d0f5f8569fabad1"}, + {file = "lxml-4.6.1-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:3d9b2b72eb0dbbdb0e276403873ecfae870599c83ba22cadff2db58541e72856"}, + {file = "lxml-4.6.1-cp36-cp36m-manylinux2014_aarch64.whl", hash = "sha256:d20d32cbb31d731def4b1502294ca2ee99f9249b63bc80e03e67e8f8e126dea8"}, + {file = "lxml-4.6.1-cp36-cp36m-win32.whl", hash = "sha256:d182eada8ea0de61a45a526aa0ae4bcd222f9673424e65315c35820291ff299c"}, + {file = "lxml-4.6.1-cp36-cp36m-win_amd64.whl", hash = "sha256:c0dac835c1a22621ffa5e5f999d57359c790c52bbd1c687fe514ae6924f65ef5"}, + {file = "lxml-4.6.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:d84d741c6e35c9f3e7406cb7c4c2e08474c2a6441d59322a00dcae65aac6315d"}, + {file = "lxml-4.6.1-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:8862d1c2c020cb7a03b421a9a7b4fe046a208db30994fc8ff68c627a7915987f"}, + {file = "lxml-4.6.1-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:3a7a380bfecc551cfd67d6e8ad9faa91289173bdf12e9cfafbd2bdec0d7b1ec1"}, + {file = "lxml-4.6.1-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:2d6571c48328be4304aee031d2d5046cbc8aed5740c654575613c5a4f5a11311"}, + {file = "lxml-4.6.1-cp37-cp37m-win32.whl", hash = "sha256:803a80d72d1f693aa448566be46ffd70882d1ad8fc689a2e22afe63035eb998a"}, + {file = "lxml-4.6.1-cp37-cp37m-win_amd64.whl", hash = "sha256:24e811118aab6abe3ce23ff0d7d38932329c513f9cef849d3ee88b0f848f2aa9"}, + {file = "lxml-4.6.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:2e311a10f3e85250910a615fe194839a04a0f6bc4e8e5bb5cac221344e3a7891"}, + {file = "lxml-4.6.1-cp38-cp38-manylinux1_i686.whl", hash = "sha256:a71400b90b3599eb7bf241f947932e18a066907bf84617d80817998cee81e4bf"}, + {file = "lxml-4.6.1-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:211b3bcf5da70c2d4b84d09232534ad1d78320762e2c59dedc73bf01cb1fc45b"}, + {file = "lxml-4.6.1-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:e65c221b2115a91035b55a593b6eb94aa1206fa3ab374f47c6dc10d364583ff9"}, + {file = "lxml-4.6.1-cp38-cp38-win32.whl", hash = "sha256:d6f8c23f65a4bfe4300b85f1f40f6c32569822d08901db3b6454ab785d9117cc"}, + {file = "lxml-4.6.1-cp38-cp38-win_amd64.whl", hash = "sha256:573b2f5496c7e9f4985de70b9bbb4719ffd293d5565513e04ac20e42e6e5583f"}, + {file = "lxml-4.6.1-cp39-cp39-manylinux1_i686.whl", hash = "sha256:1d87936cb5801c557f3e981c9c193861264c01209cb3ad0964a16310ca1b3301"}, + {file = "lxml-4.6.1-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:2d5896ddf5389560257bbe89317ca7bcb4e54a02b53a3e572e1ce4226512b51b"}, + {file = "lxml-4.6.1-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:9b06690224258db5cd39a84e993882a6874676f5de582da57f3df3a82ead9174"}, + {file = "lxml-4.6.1-cp39-cp39-win32.whl", hash = "sha256:bb252f802f91f59767dcc559744e91efa9df532240a502befd874b54571417bd"}, + {file = "lxml-4.6.1-cp39-cp39-win_amd64.whl", hash = "sha256:7ecaef52fd9b9535ae5f01a1dd2651f6608e4ec9dc136fc4dfe7ebe3c3ddb230"}, + {file = "lxml-4.6.1.tar.gz", hash = "sha256:c152b2e93b639d1f36ec5a8ca24cde4a8eefb2b6b83668fcd8e83a67badcb367"}, ] markupsafe = [ {file = "MarkupSafe-1.1.1-cp27-cp27m-macosx_10_6_intel.whl", hash = "sha256:09027a7803a62ca78792ad89403b1b7a73a01c8cb65909cd876f7fcebd79b161"}, @@ -1241,8 +1246,8 @@ mistune = [ {file = "mistune-0.8.4.tar.gz", hash = "sha256:59a3429db53c50b5c6bcc8a07f8848cb00d7dc8bdb431a4ab41920d201d4756e"}, ] nbclient = [ - {file = "nbclient-0.5.0-py3-none-any.whl", hash = "sha256:8a6e27ff581cee50895f44c41936ce02369674e85e2ad58643d8d4a6c36771b0"}, - {file = "nbclient-0.5.0.tar.gz", hash = "sha256:8ad52d27ba144fca1402db014857e53c5a864a2f407be66ca9d74c3a56d6591d"}, + {file = "nbclient-0.5.1-py3-none-any.whl", hash = "sha256:4d6b116187c795c99b9dba13d46e764d596574b14c296d60670c8dfe454db364"}, + {file = "nbclient-0.5.1.tar.gz", hash = "sha256:01e2d726d16eaf2cde6db74a87e2451453547e8832d142f73f72fddcd4fe0250"}, ] nbconvert = [ {file = "nbconvert-6.0.7-py3-none-any.whl", hash = "sha256:39e9f977920b203baea0be67eea59f7b37a761caa542abe80f5897ce3cf6311d"}, @@ -1467,8 +1472,8 @@ traitlets = [ {file = "traitlets-5.0.5.tar.gz", hash = "sha256:178f4ce988f69189f7e523337a3e11d91c786ded9360174a3d9ca83e79bc5396"}, ] urllib3 = [ - {file = "urllib3-1.25.10-py2.py3-none-any.whl", hash = "sha256:e7983572181f5e1522d9c98453462384ee92a0be7fac5f1413a1e35c56cc0461"}, - {file = "urllib3-1.25.10.tar.gz", hash = "sha256:91056c15fa70756691db97756772bb1eb9678fa585d9184f24534b100dc60f4a"}, + {file = "urllib3-1.25.11-py2.py3-none-any.whl", hash = "sha256:f5321fbe4bf3fefa0efd0bfe7fb14e90909eb62a48ccda331726b4319897dd5e"}, + {file = "urllib3-1.25.11.tar.gz", hash = "sha256:8d7eaa5a82a1cac232164990f04874c594c9453ec55eef02eab885aa02fc17a2"}, ] virtualenv = [ {file = "virtualenv-20.0.35-py2.py3-none-any.whl", hash = "sha256:0ebc633426d7468664067309842c81edab11ae97fcaf27e8ad7f5748c89b431b"}, From 5d8af25aa63cefe2357c9cc417d6c4123f0765e3 Mon Sep 17 00:00:00 2001 From: Alexander Hess Date: Wed, 21 Oct 2020 16:58:20 +0200 Subject: [PATCH 02/10] Add initial version of chapter 08, part 1 --- 00_intro/00_content.ipynb | 2 +- 08_mfr/00_content.ipynb | 1377 +++++++++++++++++++++++++++++++++++++ CONTENTS.md | 7 + README.md | 1 + 4 files changed, 1386 insertions(+), 1 deletion(-) create mode 100644 08_mfr/00_content.ipynb diff --git a/00_intro/00_content.ipynb b/00_intro/00_content.ipynb index a5d9e15..b1c99a6 100644 --- a/00_intro/00_content.ipynb +++ b/00_intro/00_content.ipynb @@ -771,7 +771,7 @@ " - *Chapter 5*: [Numbers & Bits ](https://nbviewer.jupyter.org/github/webartifex/intro-to-python/blob/develop/05_numbers/00_content.ipynb)\n", " - *Chapter 6*: [Text & Bytes ](https://nbviewer.jupyter.org/github/webartifex/intro-to-python/blob/develop/06_text/00_content.ipynb)\n", " - *Chapter 7*: [Sequential Data ](https://nbviewer.jupyter.org/github/webartifex/intro-to-python/blob/develop/07_sequences/00_content.ipynb)\n", - " - *Chapter 8*: Map, Filter, & Reduce\n", + " - *Chapter 8*: [Map, Filter, & Reduce ](https://nbviewer.jupyter.org/github/webartifex/intro-to-python/blob/develop/08_mfr/00_content.ipynb)\n", " - *Chapter 9*: Mappings & Sets\n", " - *Chapter 10*: Arrays & Dataframes\n", "- How can we create custom data types?\n", diff --git a/08_mfr/00_content.ipynb b/08_mfr/00_content.ipynb new file mode 100644 index 0000000..4cf1aa7 --- /dev/null +++ b/08_mfr/00_content.ipynb @@ -0,0 +1,1377 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "**Note**: Click on \"*Kernel*\" > \"*Restart Kernel and Clear All Outputs*\" in [JupyterLab](https://jupyterlab.readthedocs.io/en/stable/) *before* reading this notebook to reset its output. If you cannot run this file on your machine, you may want to open it [in the cloud ](https://mybinder.org/v2/gh/webartifex/intro-to-python/develop?urlpath=lab/tree/08_mfr/00_content.ipynb)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "# Chapter 8: Map, Filter, & Reduce" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "In this chapter, we continue the study of sequential data by looking at memory efficient ways to process the elements in a sequence. That is an important topic for the data science practitioner who must be able to work with data that does *not* fit into a single computer's memory.\n", + "\n", + "As shown in [Chapter 4 ](https://nbviewer.jupyter.org/github/webartifex/intro-to-python/blob/develop/04_iteration/02_content.ipynb#Containers-vs.-Iterables), both the `list` objects `[0, 1, 2, 3, 4]` and `[1, 3, 5, 7, 9]` on the one side and the `range` objects `range(5)` and `range(1, 10, 2)` on the other side allow us to loop over the same numbers. However, the latter two only create *one* `int` object in every iteration while the former two create *all* `int` objects before the loop even starts. In this aspect, we consider `range` objects to be \"rules\" in memory that know how to calculate the numbers *without* calculating them.\n", + "\n", + "In [Chapter 7 ](https://nbviewer.jupyter.org/github/webartifex/intro-to-python/blob/develop/07_sequences/01_content.ipynb#The-list-Type), we see how the built-in [list() ](https://docs.python.org/3/library/functions.html#func-list) constructor **materializes** the `range(1, 13)` object into the `list` object `[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]`. In other words, we make `range(1, 13)` calculate *all* numbers at once and store them in a `list` object for further processing.\n", + "\n", + "In many cases, however, it is not necessary to do that, and, in this chapter, we look at other types of \"rules\" in memory and how we can compose different \"rules\" together to implement bigger computations.\n", + "\n", + "Next, we take a step back and continue with a simple example involving the familiar `numbers` list. Then, we iteratively exchange `list` objects with \"rule\"-like objects *without* changing the overall computation at all. As computations involving sequential data are commonly classified into three categories **map**, **filter**, or **reduce**, we do so too for our `numbers` example." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "numbers = [7, 11, 8, 5, 3, 12, 2, 6, 9, 10, 1, 4]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "## Mapping" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "**Mapping** refers to the idea of applying a transformation to every element in a sequence.\n", + "\n", + "For example, let's square each element in `numbers` and add `1` to the squares. In essence, we apply the transformation $y := x^2 + 1$ as expressed with the `transform()` function below." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "def transform(element):\n", + " \"\"\"Map elements to their squares plus 1.\"\"\"\n", + " return (element ** 2) + 1" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "With the syntax we know so far, we revert to a `for`-loop that iteratively appends the transformed elements to an initially empty `transformed_numbers` list." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [], + "source": [ + "transformed_numbers = []\n", + "\n", + "for old in numbers:\n", + " new = transform(old)\n", + " transformed_numbers.append(new)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[50, 122, 65, 26, 10, 145, 5, 37, 82, 101, 2, 17]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "transformed_numbers" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "As this kind of data processing is so common, Python provides the [map() ](https://docs.python.org/3/library/functions.html#map) built-in. In its simplest usage form, it takes two arguments: A transformation `function` that takes exactly *one* positional argument and an `iterable` that provides the objects to be mapped.\n", + "\n", + "We call [map() ](https://docs.python.org/3/library/functions.html#map) with a reference to the `transform()` function and the `numbers` list as the arguments and store the result in the variable `transformer` to inspect it." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "transformer = map(transform, numbers)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "We might expect to get back a materialized sequence (i.e., all elements exist in memory), and a `list` object would feel the most natural because of the type of the `numbers` argument. However, `transformer` is an object of type `map`." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "transformer" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "map" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "type(transformer)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "Like `range` objects, `map` objects generate a series of objects \"on the fly\" (i.e., one by one), and we use the built-in [next() ](https://docs.python.org/3/library/functions.html#next) function to obtain the next object in line. So, we should think of a `map` object as a \"rule\" stored in memory that only knows how to calculate the next object of possibly *infinitely* many." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "50" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "next(transformer)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "122" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "next(transformer)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "65" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "next(transformer)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "It is essential to understand that by creating a `map` object with the [map() ](https://docs.python.org/3/library/functions.html#map) built-in, *nothing* happens in memory except the creation of the `map` object. In particular, no second `list` object derived from `numbers` is created. Also, we may view `range` objects as a special case of `map` objects: They are constrained to generating `int` objects only, and the `iterable` argument is replaced with `start`, `stop`, and `step` arguments.\n", + "\n", + "If we are sure that a `map` object generates a *finite* number of elements, we may materialize them into a `list` object with the built-in [list() ](https://docs.python.org/3/library/functions.html#func-list) constructor. Below, we \"pull out\" the remaining `int` objects from `transformer`, which itself is derived from a *finite* `list` object." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[26, 10, 145, 5, 37, 82, 101, 2, 17]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "list(transformer)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "In summary, instead of creating an empty list first and appending it in a `for`-loop as above, we write the following one-liner and obtain an equal `transformed_numbers` list." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "transformed_numbers = list(map(transform, numbers))" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[50, 122, 65, 26, 10, 145, 5, 37, 82, 101, 2, 17]" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "transformed_numbers" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "## Filtering" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "**Filtering** refers to the idea of creating a subset of a sequence with a **boolean filter** `function` that indicates if an element should be kept (i.e., `True`) or not (i.e., `False`).\n", + "\n", + "In the example, let's only keep the even elements in `numbers`. The `is_even()` function implements that as a filter." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "def is_even(element):\n", + " \"\"\"Filter out odd numbers.\"\"\"\n", + " if element % 2 == 0:\n", + " return True\n", + " return False" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "As `element % 2 == 0` is already a boolean expression, we could shorten `is_even()` like so." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "outputs": [], + "source": [ + "def is_even(element):\n", + " \"\"\"Filter out odd numbers.\"\"\"\n", + " return element % 2 == 0" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "As before, we first use a `for`-loop that appends the elements to be kept iteratively to an initially empty `even_numbers` list." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [], + "source": [ + "even_numbers = []\n", + "\n", + "for number in transformed_numbers:\n", + " if is_even(number):\n", + " even_numbers.append(number)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[50, 122, 26, 10, 82, 2]" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "even_numbers" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "Analogously to the `map` object above, we use the [filter() ](https://docs.python.org/3/library/functions.html#filter) built-in to create an object of type `filter` and assign it to `evens`." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "evens = filter(is_even, transformed_numbers)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "evens" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "filter" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "type(evens)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "`evens` works like `transformer` above: With the built-in [next() ](https://docs.python.org/3/library/functions.html#next) function we obtain the even numbers one by one. So, the \"next\" element in line is simply the next even `int` object the `filter` object encounters." + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[50, 122, 65, 26, 10, 145, 5, 37, 82, 101, 2, 17]" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "transformed_numbers" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "50" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "next(evens)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "122" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "next(evens)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "26" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "next(evens)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "As above, we could create a materialized `list` object with the [list() ](https://docs.python.org/3/library/functions.html#func-list) constructor." + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[50, 122, 26, 10, 82, 2]" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "list(filter(is_even, transformed_numbers))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "We may also chain `map` and `filter` objects derived from the original `numbers` list. As the entire cell is *one* big expression consisting of nested function calls, we read it from the inside out." + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[50, 122, 26, 10, 82, 2]" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "list(\n", + " filter(\n", + " is_even,\n", + " map(transform, numbers),\n", + " )\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "Using the [map() ](https://docs.python.org/3/library/functions.html#map) and [filter() ](https://docs.python.org/3/library/functions.html#filter) built-ins, we can quickly switch the order: Filter first and then transform the remaining elements. This variant equals the \"*A simple Filter*\" example in [Chapter 4 ](https://nbviewer.jupyter.org/github/webartifex/intro-to-python/blob/develop/04_iteration/03_content.ipynb#Example:-A-simple-Filter). On the contrary, code with `for`-loops and `if` statements is more tedious to adapt. Additionally, `map` and `filter` objects loop \"at the C level\" and are a lot faster because of that. Because of that, experienced Pythonistas tend to *not* use explicit `for`-loops so often." + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[65, 145, 5, 37, 101, 17]" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "list(\n", + " map(\n", + " transform,\n", + " filter(is_even, numbers),\n", + " )\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "## Reducing" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "Lastly, **reducing** sequential data means to summarize the elements into a single statistic.\n", + "\n", + "A simple example is the built-in [sum() ](https://docs.python.org/3/library/functions.html#sum) function." + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "370" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sum(\n", + " map(\n", + " transform,\n", + " filter(is_even, numbers),\n", + " )\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "Other straightforward examples are the built-in [min() ](https://docs.python.org/3/library/functions.html#min) or [max() ](https://docs.python.org/3/library/functions.html#max) functions." + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "5" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "min(map(transform, filter(is_even, numbers)))" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "145" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "max(map(transform, filter(is_even, numbers)))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "[sum() ](https://docs.python.org/3/library/functions.html#sum), [min() ](https://docs.python.org/3/library/functions.html#min), and [max() ](https://docs.python.org/3/library/functions.html#max) can be regarded as special cases.\n", + "\n", + "The generic way of reducing a sequence is to apply a function of *two* arguments on a rolling horizon: Its first argument is the reduction of the elements processed so far, and the second the next element to be reduced.\n", + "\n", + "For illustration, let's replicate [sum() ](https://docs.python.org/3/library/functions.html#sum) as such a function, called `sum_alt()`. Its implementation only adds two numbers." + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "def sum_alt(sum_so_far, next_number):\n", + " \"\"\"Reduce a sequence by addition.\"\"\"\n", + " return sum_so_far + next_number" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "Further, we create a *new* `map` object derived from `numbers` ..." + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [], + "source": [ + "evens_transformed = map(transform, filter(is_even, numbers))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "... and loop over all *but* the first element it generates. The latter is captured separately as the initial `result` with the [next() ](https://docs.python.org/3/library/functions.html#next) function. We know from above that `evens_transformed` generates *six* elements. That is why we see *five* growing `result` values resembling a [cumulative sum](http://mathworld.wolfram.com/CumulativeSum.html). The first `210` is the sum of the first two elements generated by `evens_transformed`, `65` and `145`.\n", + "\n", + "So, we also learn that `map` objects, and analogously `filter` objects, are *iterable* as we may loop over them." + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "210 215 252 353 370 " + ] + } + ], + "source": [ + "result = next(evens_transformed)\n", + "\n", + "for number in evens_transformed:\n", + " result = sum_alt(result, number)\n", + " print(result, end=\" \") # line added for didactical purposes" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "The final `result` is the same `370` as above." + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "370" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "result" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "The [reduce() ](https://docs.python.org/3/library/functools.html#functools.reduce) function in the [functools ](https://docs.python.org/3/library/functools.html) module in the [standard library ](https://docs.python.org/3/library/index.html) provides more convenience (and speed) replacing the `for`-loop. It takes two arguments, `function` and `iterable`, in the same way as the [map() ](https://docs.python.org/3/library/functions.html#map) and [filter() ](https://docs.python.org/3/library/functions.html#filter) built-ins.\n", + "\n", + "[reduce() ](https://docs.python.org/3/library/functools.html#functools.reduce) is **[eager ](https://en.wikipedia.org/wiki/Eager_evaluation)** meaning that all computations implied by the contained `map` and `filter` \"rules\" are executed immediately, and the code cell evaluates to `370`. On the contrary, [map() ](https://docs.python.org/3/library/functions.html#map) and [filter() ](https://docs.python.org/3/library/functions.html#filter) create **[lazy ](https://en.wikipedia.org/wiki/Lazy_evaluation)** `map` and `filter` objects, and we have to use the [next() ](https://docs.python.org/3/library/functions.html#next) function to obtain the elements, one by one." + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "from functools import reduce" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "370" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "reduce(\n", + " sum_alt,\n", + " map(\n", + " transform,\n", + " filter(is_even, numbers),\n", + " )\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "## Lambda Expressions" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "[map() ](https://docs.python.org/3/library/functions.html#map), [filter() ](https://docs.python.org/3/library/functions.html#filter), and [reduce() ](https://docs.python.org/3/library/functools.html#functools.reduce) take a `function` object as their first argument, and we defined `transform()`, `is_even()`, and `sum_alt()` to be used precisely for that.\n", + "\n", + "Often, such functions are used *only once* in a program. However, the primary purpose of functions is to *reuse* them. In such cases, it makes more sense to define them \"anonymously\" right at the position where the first argument goes.\n", + "\n", + "As mentioned in [Chapter 2 ](https://nbviewer.jupyter.org/github/webartifex/intro-to-python/blob/develop/02_functions/00_content.ipynb#Anonymous-Functions), we use `lambda` expressions to create `function` objects *without* a name referencing them.\n", + "\n", + "So, the above `sum_alt()` function could be rewritten as a `lambda` expression like so ..." + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(sum_so_far, next_number)>" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "lambda sum_so_far, next_number: sum_so_far + next_number" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "... or even shorter." + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(x, y)>" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "lambda x, y: x + y" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "With the new concepts in this section, we can rewrite the entire example in just a few lines of code *without* any `for`, `if`, and `def` statements. The resulting code is concise, easy to read, quick to modify, and even faster in execution. Most importantly, it is optimized to handle big amounts of data as *no* temporary `list` objects are materialized in memory." + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "370" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "numbers = [7, 11, 8, 5, 3, 12, 2, 6, 9, 10, 1, 4]\n", + "evens = filter(lambda x: x % 2 == 0, numbers)\n", + "transformed = map(lambda x: (x ** 2) + 1, evens)\n", + "sum(transformed)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "If `numbers` comes as a sorted sequence of whole numbers, we may use the [range() ](https://docs.python.org/3/library/functions.html#func-range) built-in and get away *without* any materialized `list` object in memory at all!" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "370" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "numbers = range(1, 13)\n", + "evens = filter(lambda x: x % 2 == 0, numbers)\n", + "transformed = map(lambda x: (x ** 2) + 1, evens)\n", + "sum(transformed)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "To additionally save the temporary variables, `numbers`, `evens`, and `transformed`, we could write the entire computation as *one* expression." + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "370" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sum(\n", + " map(\n", + " lambda x: (x ** 2) + 1,\n", + " filter(\n", + " lambda x: x % 2 == 0,\n", + " range(1, 13),\n", + " )\n", + " )\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "PythonTutor visualizes the differences in the number of computational steps and memory usage:\n", + "- [Version 1 ](http://pythontutor.com/visualize.html#code=def%20is_even%28element%29%3A%0A%20%20%20%20if%20element%20%25%202%20%3D%3D%200%3A%0A%20%20%20%20%20%20%20%20return%20True%0A%20%20%20%20return%20False%0A%0Adef%20transform%28element%29%3A%0A%20%20%20%20return%20%28element%20**%202%29%20%2B%201%0A%0Anumbers%20%3D%20list%28range%281,%2013%29%29%0A%0Aevens%20%3D%20%5B%5D%0Afor%20number%20in%20numbers%3A%0A%20%20%20%20if%20is_even%28number%29%3A%0A%20%20%20%20%20%20%20%20evens.append%28number%29%0A%0Atransformed%20%3D%20%5B%5D%0Afor%20number%20in%20evens%3A%0A%20%20%20%20transformed.append%28transform%28number%29%29%0A%0Aresult%20%3D%20sum%28transformed%29&cumulative=false&curInstr=0&heapPrimitives=nevernest&mode=display&origin=opt-frontend.js&py=3&rawInputLstJSON=%5B%5D&textReferences=false): With `for`-loops, `if` statements, and named functions -> **116** steps and **3** `list` objects\n", + "- [Version 2 ](http://pythontutor.com/visualize.html#code=numbers%20%3D%20range%281,%2013%29%0Aevens%20%3D%20filter%28lambda%20x%3A%20x%20%25%202%20%3D%3D%200,%20numbers%29%0Atransformed%20%3D%20map%28lambda%20x%3A%20%28x%20**%202%29%20%2B%201,%20evens%29%0Aresult%20%3D%20sum%28transformed%29&cumulative=false&curInstr=0&heapPrimitives=nevernest&mode=display&origin=opt-frontend.js&py=3&rawInputLstJSON=%5B%5D&textReferences=false): With named `map` and `filter` objects -> **58** steps and **no** `list` object\n", + "- [Version 3 ](http://pythontutor.com/visualize.html#code=result%20%3D%20sum%28map%28lambda%20x%3A%20%28x%20**%202%29%20%2B%201,%20filter%28lambda%20x%3A%20x%20%25%202%20%3D%3D%200,%20range%281,%2013%29%29%29%29&cumulative=false&curInstr=0&heapPrimitives=nevernest&mode=display&origin=opt-frontend.js&py=3&rawInputLstJSON=%5B%5D&textReferences=false): Everything in *one* expression -> **55** steps and **no** `list` object\n", + "\n", + "Versions 2 and 3 are the same, except for the three additional steps required to create the temporary variables. The *major* downside of Version 1 is that, in the worst case, it may need *three times* the memory as compared to the other two versions!\n", + "\n", + "An experienced Pythonista would probably go with Version 2 in a production system to keep the code readable and maintainable.\n", + "\n", + "The map-filter-reduce paradigm has caught attention in recent years as it enables **[parallel computing ](https://en.wikipedia.org/wiki/Parallel_computing)**, and this gets important when dealing with big amounts of data. The workings in the memory as shown in this section provide an idea why." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.6" + }, + "livereveal": { + "auto_select": "code", + "auto_select_fragment": true, + "scroll": true, + "theme": "serif" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": false, + "sideBar": true, + "skip_h1_title": true, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": { + "height": "calc(100% - 180px)", + "left": "10px", + "top": "150px", + "width": "384px" + }, + "toc_section_display": false, + "toc_window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/CONTENTS.md b/CONTENTS.md index bbc1dee..476808d 100644 --- a/CONTENTS.md +++ b/CONTENTS.md @@ -189,3 +189,10 @@ If this is not possible, (`namedtuple` Type) - [summary ](https://nbviewer.jupyter.org/github/webartifex/intro-to-python/blob/develop/07_sequences/06_summary.ipynb) - [review questions ](https://nbviewer.jupyter.org/github/webartifex/intro-to-python/blob/develop/07_sequences/07_review.ipynb) + - *Chapter 8*: Map, Filter, & Reduce + - [content ](https://nbviewer.jupyter.org/github/webartifex/intro-to-python/blob/develop/08_mfr/00_content.ipynb) + [](https://mybinder.org/v2/gh/webartifex/intro-to-python/develop?urlpath=lab/tree/08_mfr/00_content.ipynb) + (Mapping; + Filtering; + Reducing; + `lambda` Expression) \ No newline at end of file diff --git a/README.md b/README.md index e9b2add..23b5521 100644 --- a/README.md +++ b/README.md @@ -20,6 +20,7 @@ For a more *detailed version* with **clickable links** - *Chapter 5*: Numbers & Bits - *Chapter 6*: Text & Bytes - *Chapter 7*: Sequential Data + - *Chapter 8*: Map, Filter, & Reduce #### Videos From e63f5ae25a058635873544e2a0c1ace956dccc44 Mon Sep 17 00:00:00 2001 From: Alexander Hess Date: Wed, 21 Oct 2020 17:47:42 +0200 Subject: [PATCH 03/10] Add initial version of chapter 08, part 2 --- 08_mfr/01_content.ipynb | 2380 +++++++++++++++++++++++++++++++++++++++ CONTENTS.md | 8 +- 2 files changed, 2387 insertions(+), 1 deletion(-) create mode 100644 08_mfr/01_content.ipynb diff --git a/08_mfr/01_content.ipynb b/08_mfr/01_content.ipynb new file mode 100644 index 0000000..2cf8720 --- /dev/null +++ b/08_mfr/01_content.ipynb @@ -0,0 +1,2380 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "**Note**: Click on \"*Kernel*\" > \"*Restart Kernel and Clear All Outputs*\" in [JupyterLab](https://jupyterlab.readthedocs.io/en/stable/) *before* reading this notebook to reset its output. If you cannot run this file on your machine, you may want to open it [in the cloud ](https://mybinder.org/v2/gh/webartifex/intro-to-python/develop?urlpath=lab/tree/08_mfr/01_content.ipynb)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "# Chapter 8: Map, Filter, & Reduce (continued)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "After introducing the Map-Filter-Reduce paradigm in the [first part ](https://nbviewer.jupyter.org/github/webartifex/intro-to-python/blob/develop/08_mfr/00_content.ipynb) of this chapter, we first see how `list` comprehensions can replace the [map() ](https://docs.python.org/3/library/functions.html#map) and [filter() ](https://docs.python.org/3/library/functions.html#filter) built-ins in many cases. Then, we learn how `generator` expressions are like `list` comprehensions *without* using the memory. We end this part with a short discussion of the built-in [all() ](https://docs.python.org/3/library/functions.html#all) and [any() ](https://docs.python.org/3/library/functions.html#any) functions." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "## `list` Comprehensions" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "Consider again the original \"*A simple Filter*\" example from [Chapter 4 ](https://nbviewer.jupyter.org/github/webartifex/intro-to-python/blob/develop/04_iteration/03_content.ipynb#Example:-A-simple-Filter), re-written such that both the mapping and the filtering are done in *one* `for`-loop instead of the *two* above." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "numbers = [7, 11, 8, 5, 3, 12, 2, 6, 9, 10, 1, 4]" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [], + "source": [ + "evens_transformed = []\n", + "\n", + "for number in numbers:\n", + " if number % 2 == 0:\n", + " evens_transformed.append((number ** 2) + 1)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "370" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sum(evens_transformed)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "**`list` comprehensions** are *expressions* to derive *new* `list` objects out of *existing* ones (cf., [reference ](https://docs.python.org/3/reference/expressions.html#displays-for-lists-sets-and-dictionaries)). Practically, this means that we place the `for` and `if` inside brackets `[` and `]`.\n", + "\n", + "So, the example can be written in a single *expression* like below replacing the compound `for` *statement* above." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[65, 145, 5, 37, 101, 17]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "[(n ** 2) + 1 for n in numbers if n % 2 == 0]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "A list comprehension may always be used in a place where otherwise a `list` object would work.\n", + "\n", + "For example, let's rewrite the \"*A simple Filter*\" example from [Chapter 4 ](https://nbviewer.jupyter.org/github/webartifex/intro-to-python/blob/develop/04_iteration/03_content.ipynb#Example:-A-simple-Filter) in just one line. As a caveat, the code below *materializes* all elements in memory *before* summing them up, and may, therefore, cause a `MemoryError` when executed with a bigger `numbers` list. We see with [PythonTutor ](http://pythontutor.com/visualize.html#code=numbers%20%3D%20range%281,%2013%29%0Aresult%20%3D%20sum%28%5B%28n%20**%202%29%20%2B%201%20for%20n%20in%20numbers%20if%20n%20%25%202%20%3D%3D%200%5D%29&cumulative=false&curInstr=0&heapPrimitives=nevernest&mode=display&origin=opt-frontend.js&py=3&rawInputLstJSON=%5B%5D&textReferences=false) how a `list` object exists in memory at step 17 and then \"gets lost\" right after. As the next section shows, this downside may be mitigated." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "370" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sum([(n ** 2) + 1 for n in numbers if n % 2 == 0])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "### Example: Nested Lists" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "List comprehensions may come with several `for`'s and `if`'s.\n", + "\n", + "The cell below creates a `list` object that contains three other `list` objects with a series of numbers in them. The first and last numbers in each inner `list` object are offset by `1`." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "outputs": [], + "source": [ + "nested_numbers = [\n", + " [1, 2, 3, 4, 5, 6, 7],\n", + " [2, 3, 4, 5, 6, 7, 8],\n", + " [3, 4, 5, 6, 7, 8, 9],\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[[1, 2, 3, 4, 5, 6, 7], [2, 3, 4, 5, 6, 7, 8], [3, 4, 5, 6, 7, 8, 9]]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "nested_numbers" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "To do something meaningful with the numbers, we have to remove the inner layer of `list` objects and **flatten** (i.e., \"un-nest\") the data.\n", + "\n", + "Without list comprehensions, we achieve that with two nested `for`-loops." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "outputs": [], + "source": [ + "flat_numbers = []\n", + "\n", + "for inner_numbers in nested_numbers:\n", + " for number in inner_numbers:\n", + " flat_numbers.append(number)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 3, 4, 5, 6, 7, 8, 9]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "flat_numbers" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "That translates into a list comprehension like below. The order of the `for`'s may be confusing at first but is the *same* as writing out the nested `for`-loops as above." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 3, 4, 5, 6, 7, 8, 9]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "[number for inner_numbers in nested_numbers for number in inner_numbers]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "Now, we may use the `list` object resulting from the list comprehension in any way we want.\n", + "\n", + "For example, to add up the flattened numbers with [sum() ](https://docs.python.org/3/library/functions.html#sum). The same caveat holds as before: The `list` object passed into [sum() ](https://docs.python.org/3/library/functions.html#sum) is *materialized* with *all* its elements *before* the sum is calculated!" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "105" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sum([number for inner_numbers in nested_numbers for number in inner_numbers])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "In this particular example, however, we can exploit the fact that any sum of numbers can be expressed as the sum of sums of mutually exclusive and collectively exhaustive subsets of these numbers and get away with just *one* `for` in the list comprehension." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "105" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sum([sum(inner_numbers) for inner_numbers in nested_numbers])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "### Example: Working with Cartesian Products" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "A popular use case of nested list comprehensions is applying a transformation to each $n$-tuple in the [Cartesian product ](https://en.wikipedia.org/wiki/Cartesian_product) created from elements of $n$ iterables. In the generic illustration below, a transformation $f(x, y)$ is applied to each $2$-tuple in the Cartesian product $X \\times Y$ where $x$ is an element in $X = \\{x_1, x_2, x_3\\}$ and $y$ is an element in $Y = \\{y_1, y_2, y_3\\}$." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "|$Y$ \\ $X$| $x_1$ | $x_2$ | $x_3$ |\n", + "|---------|--------------|--------------|--------------|\n", + "| $y_1$ |f($x_1$,$y_1$)|f($x_2$,$y_1$)|f($x_3$,$y_1$)|\n", + "| $y_2$ |f($x_1$,$y_2$)|f($x_2$,$y_2$)|f($x_3$,$y_2$)|\n", + "| $y_3$ |f($x_1$,$y_3$)|f($x_2$,$y_3$)|f($x_3$,$y_3$)|" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "For example, let's add each quotient obtained by taking the numerator $x$ from `[10, 20, 30]` and the denominator $y$ from `[40, 50, 60]` to `1`, and then find the overall product. This transformation can be described mathematically as the function $z = f(x, y)$ below." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "source": [ + "$$z = f(x, y) = \\prod{ (1 + \\frac{x}{y} )}$$" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "Further, the table below visualizes the calculations: The result is the product of *nine* entries." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "source": [ + "|`y` \\ `x`|**10**|**20**|**30**|\n", + "|------|------|------|------|\n", + "|**40**| 1.25 | 1.50 | 1.75 |\n", + "|**50**| 1.20 | 1.40 | 1.60 |\n", + "|**60**| 1.17 | 1.33 | 1.50 |" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "To express that in Python, we start by creating two `list` objects, `first` and `second`." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "first = [10, 20, 30]\n", + "second = [40, 50, 60]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "For a Cartesian product, we must loop over *all* possible $2$-tuples where one element is drawn from `first` and the other from `second`. We achieve that with two nested `for`-loops, in which we calculate each `quotient` and append it to an initially empty `cartesian_product` list." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[1.25, 1.2, 1.1666666666666667, 1.5, 1.4, 1.3333333333333333, 1.75, 1.6, 1.5]" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cartesian_product = []\n", + "\n", + "for numerator in first:\n", + " for denominator in second:\n", + " quotient = 1 + (numerator / denominator)\n", + " cartesian_product.append(quotient)\n", + "\n", + "cartesian_product" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "Next, we convert the two explicit `for`-loops into one list comprehensions and use `x` and `y` as shorter variable names." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[1.25, 1.2, 1.1666666666666667, 1.5, 1.4, 1.3333333333333333, 1.75, 1.6, 1.5]" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "[1 + (x / y) for x in first for y in second]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "The order of the `for`'s is *important*: The list comprehension above divides numbers from `first` by numbers from `second`, whereas the list comprehension below does the opposite." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[5.0, 3.0, 2.333333333333333, 6.0, 3.5, 2.666666666666667, 7.0, 4.0, 3.0]" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "[1 + (x / y) for x in second for y in first]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "To find the overall product, we *unpack* the list comprehension into the `product()` function from the \"*Function Definitions & Calls*\" sub-section in [Chapter 7 ](https://nbviewer.jupyter.org/github/webartifex/intro-to-python/blob/develop/07_sequences/03_content.ipynb#Function-Definitions-&-Calls)." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "def product(*args):\n", + " \"\"\"Multiply all arguments.\"\"\"\n", + " result = args[0]\n", + "\n", + " for arg in args[1:]:\n", + " result *= arg\n", + "\n", + " return result" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "20.58" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "product(*[1 + (x / y) for x in first for y in second])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "Alternatively, we use a `lambda` expression with the [reduce() ](https://docs.python.org/3/library/functools.html#functools.reduce) function from the [functools ](https://docs.python.org/3/library/functools.html) module." + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "from functools import reduce" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "20.58" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "reduce(lambda x, y: x * y, [1 + (x / y) for x in first for y in second])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "While this example is stylized, Cartesian products are hidden in many applications, and it shows how the various language features introduced in this chapter can be seamlessly combined to process sequential data." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "## `generator` Expressions" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "Because of the high memory consumption, Pythonistas avoid materialized `list` objects, and, thus, also `list` comprehensions, whenever possible. Instead, they prefer to work with **[`generator` expressions ](https://docs.python.org/3/reference/expressions.html#generator-expressions)**. Syntactically, they work like list comprehensions except that parentheses, `(` and `)`, replace brackets, `[` and `]`.\n", + "\n", + "Let's go back to the original \"*A simple Filter*\" example from [Chapter 4 ](https://nbviewer.jupyter.org/github/webartifex/intro-to-python/blob/develop/04_iteration/03_content.ipynb#Example:-A-simple-Filter) one more time, apply the transformation $y := x^2 + 1$ to all even `numbers`, and sum them up." + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "numbers = [7, 11, 8, 5, 3, 12, 2, 6, 9, 10, 1, 4]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "To filter and transform `numbers`, we wrote a list comprehension above ..." + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[65, 145, 5, 37, 101, 17]" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "[(n ** 2) + 1 for n in numbers if n % 2 == 0]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "... that now becomes a `generator` expression." + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + " at 0x7fcf703d84a0>" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "((n ** 2) + 1 for n in numbers if n % 2 == 0)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "A `generator` expression evaluates to yet another \"rule\"-like object in memory that knows how to generate the individual objects in a series one by one. Whereas a `list` comprehension materializes its elements in memory *when* it is evaluated, the opposite holds true for `generator` expressions: *No* object is created in memory except the \"rule\" itself. Because of this behavior, we describe `generator` expressions as *lazy* and `list` comprehensions as *eager*.\n", + "\n", + "To materialize the elements specified by a `generator` expression, we use the [list() ](https://docs.python.org/3/library/functions.html#func-list) constructor as seen above." + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[65, 145, 5, 37, 101, 17]" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "list(((n ** 2) + 1 for n in numbers if n % 2 == 0))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "Whenever a `generator` expression is the only argument in a function call, we may merge the double parentheses, `((` and `))`, into just `(` and `)`." + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[65, 145, 5, 37, 101, 17]" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "list((n ** 2) + 1 for n in numbers if n % 2 == 0)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "A common use case is to reduce the elements into a single object instead, for example, by adding them up with [sum() ](https://docs.python.org/3/library/functions.html#sum) as in the original \"*A simple Filter*\" example. [PythonTutor ](http://pythontutor.com/visualize.html#code=numbers%20%3D%20range%281,%2013%29%0Asum_with_list%20%3D%20sum%28%5B%28n%20**%202%29%20%2B%201%20for%20n%20in%20numbers%20if%20n%20%25%202%20%3D%3D%200%5D%29%0Asum_with_gen%20%3D%20sum%28%28n%20**%202%29%20%2B%201%20for%20n%20in%20numbers%20if%20n%20%25%202%20%3D%3D%200%29&cumulative=false&curInstr=0&heapPrimitives=nevernest&mode=display&origin=opt-frontend.js&py=3&rawInputLstJSON=%5B%5D&textReferences=false) shows how the code cell below does *not* create a temporary `list` object in memory whereas a `list` comprehension would do so (cf., step 17)." + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "370" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sum((n ** 2) + 1 for n in numbers if n % 2 == 0)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "Let's assign the object to which the `generator` expression below evaluates to to a variable `gen` and inspect it." + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "gen = ((n ** 2) + 1 for n in numbers if n % 2 == 0)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + " at 0x7fcf703d8c10>" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "gen" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "Unsurprisingly, `generator` expressions create objects of type `generator`." + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "generator" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "type(gen)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "`generator` objects work just like the `map` and `filter` objects in the [first part ](https://nbviewer.jupyter.org/github/webartifex/intro-to-python/blob/develop/08_mfr/00_content.ipynb) of this chapter. So, with the [next() ](https://docs.python.org/3/library/functions.html#next) function, we can generate elements one by one." + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "65" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "next(gen)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "145" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "next(gen)" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "5" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "next(gen)" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "37" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "next(gen)" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "101" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "next(gen)" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "17" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "next(gen)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "Once a `generator` object runs out of elements, it raises a `StopIteration` exception, and we say that the `generator` object is **exhausted**. To loop over its elements again, we must create a *new* one." + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "outputs": [ + { + "ename": "StopIteration", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mStopIteration\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mnext\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mgen\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;31mStopIteration\u001b[0m: " + ] + } + ], + "source": [ + "next(gen)" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "outputs": [ + { + "ename": "StopIteration", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mStopIteration\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mnext\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mgen\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;31mStopIteration\u001b[0m: " + ] + } + ], + "source": [ + "next(gen)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "Calling the [next() ](https://docs.python.org/3/library/functions.html#next) function repeatedly with the *same* `generator` object as the argument is essentially what a `for`-loop automates for us. So, `generator` objects are *iterable*. We look into this in detail further below in the \"*The `for` Statement (revisited)*\" section." + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "65 145 5 37 101 17 " + ] + } + ], + "source": [ + "for number in ((n ** 2) + 1 for n in numbers if n % 2 == 0):\n", + " print(number, end=\" \")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "### Example: Nested Lists (continued)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "If we are only interested in a *reduction* of `nested_numbers` into a single statistic, as the overall sum in the \"*Nested Lists*\" example above, we should replace `list` objects or `list` comprehensions with `generator` expressions wherever possible. The result is the *same*, but no intermediate `list` objects are materialized! That makes our code scale to large amounts of data.\n", + "\n", + "Let's adapt the example `nested_numbers` from above." + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[[1, 2, 3, 4, 5, 6, 7], [2, 3, 4, 5, 6, 7, 8], [3, 4, 5, 6, 7, 8, 9]]" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "nested_numbers" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "Compared to the implementation with the list comprehension above, we simply remove the brackets, `[` and `]`: The argument to [sum() ](https://docs.python.org/3/library/functions.html#sum) becomes a generator expression." + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "105" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sum(number for inner_numbers in nested_numbers for number in inner_numbers)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "That also holds for the alternative formulation as a sum of sums." + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "105" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sum(sum(inner_numbers) for inner_numbers in nested_numbers)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "Because `nested_numbers` has an internal structure (i.e., the inner `list` objects are series of consecutive `int` objects), we can even provide an effectively **memoryless** implementation by expressing it as a `generator` expression derived from `range` objects. [PythonTutor ](http://pythontutor.com/visualize.html#code=nested_numbers%20%3D%20%28%28range%28x,%20y%20%2B%201%29%29%20for%20x,%20y%20in%20zip%28range%281,%204%29,%20range%287,%2010%29%29%29%0Aresult%20%3D%20sum%28number%20for%20inner_numbers%20in%20nested_numbers%20for%20number%20in%20inner_numbers%29&cumulative=false&curInstr=0&heapPrimitives=nevernest&mode=display&origin=opt-frontend.js&py=3&rawInputLstJSON=%5B%5D&textReferences=false) confirms that no `list` objects materialize at any point in time." + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "outputs": [], + "source": [ + "nested_numbers = ((range(x, y + 1)) for x, y in zip(range(1, 4), range(7, 10)))" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + " at 0x7fcf70347970>" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "nested_numbers" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "105" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sum(number for inner_numbers in nested_numbers for number in inner_numbers)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "We must be careful when assigning a `generator` object to a variable: If we use `nested_numbers` again, for example, in the alternative formulation below, [sum() ](https://docs.python.org/3/library/functions.html#sum) returns `0` because `nested_numbers` is exhausted after executing the previous code cell. [PythonTutor ](http://pythontutor.com/visualize.html#code=nested_numbers%20%3D%20%28%28range%28x,%20y%20%2B%201%29%29%20for%20x,%20y%20in%20zip%28range%281,%204%29,%20range%287,%2010%29%29%29%0Aresult%20%3D%20sum%28number%20for%20inner_numbers%20in%20nested_numbers%20for%20number%20in%20inner_numbers%29%0Ano_result%20%3D%20sum%28sum%28inner_numbers%29%20for%20inner_numbers%20in%20nested_numbers%29&cumulative=false&curInstr=0&heapPrimitives=nevernest&mode=display&origin=opt-frontend.js&py=3&rawInputLstJSON=%5B%5D&textReferences=false) also shows that." + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sum(sum(inner_numbers) for inner_numbers in nested_numbers)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "### Example: Working with Cartesian Products (continued)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "Let's also rewrite the \"*Working with Cartesian Products*\" example from above with generator expressions.\n", + "\n", + "As a first optimization, we replace the materialized `list` objects, `first` and `second`, with memoryless `range` objects." + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "first = range(10, 31, 10) # \"==\" [10, 20, 30]\n", + "second = range(40, 61, 10) # \"==\" [40, 50, 60]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "Now, the first of the two alternative solutions may be more appealing to many readers. In general, many practitioners seem to dislike `lambda` expressions.\n", + "\n", + "In the first solution, we *unpack* the elements produced by `(1 + (x / y) for x in first for y in second)` into the `product()` function from the \"*Function Definitions & Calls*\" sub-section in [Chapter 7 ](https://nbviewer.jupyter.org/github/webartifex/intro-to-python/blob/develop/07_sequences/03_content.ipynb#Function-Definitions-&-Calls). However, inside `product()`, the elements are *packed* into `args`, a *materialized* `tuple` object! So, all the memory efficiency gained by using a generator expression is lost! [PythonTutor ](http://pythontutor.com/visualize.html#code=def%20product%28*args%29%3A%0A%20%20%20%20result%20%3D%20args%5B0%5D%0A%20%20%20%20for%20arg%20in%20args%5B1%3A%5D%3A%0A%20%20%20%20%20%20%20%20result%20*%3D%20arg%0A%20%20%20%20return%20result%0A%0Afirst%20%3D%20range%2810,%2031,%2010%29%0Asecond%20%3D%20range%2840,%2061,%2010%29%0A%0Aresult%20%3D%20product%28*%281%20%2B%20%28x%20/%20y%29%20for%20x%20in%20first%20for%20y%20in%20second%29%29&cumulative=false&curInstr=0&heapPrimitives=nevernest&mode=display&origin=opt-frontend.js&py=3&rawInputLstJSON=%5B%5D&textReferences=false) shows how a `tuple` object exists in steps 38-58." + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "20.58" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "product(*(1 + (x / y) for x in first for y in second)) # not memory efficient!" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "On the contrary, the second solution with the [reduce() ](https://docs.python.org/3/library/functools.html#functools.reduce) function from the [functools ](https://docs.python.org/3/library/functools.html) module and the `lambda` expression works *without* the elements materialized at the same time, and [PythonTutor ](http://pythontutor.com/visualize.html#code=from%20functools%20import%20reduce%0A%0Afirst%20%3D%20range%2810,%2031,%2010%29%0Asecond%20%3D%20range%2840,%2061,%2010%29%0A%0Aresult%20%3D%20reduce%28%0A%20%20%20%20lambda%20x,%20y%3A%20x%20*%20y,%0A%20%20%20%20%281%20%2B%20%28x%20/%20y%29%20for%20x%20in%20first%20for%20y%20in%20second%29%0A%29&cumulative=false&curInstr=0&heapPrimitives=nevernest&mode=display&origin=opt-frontend.js&py=3&rawInputLstJSON=%5B%5D&textReferences=false) confirms that. So, only the second alternative is truly memory-efficient!" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "20.58" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "reduce(lambda x, y: x * y, (1 + (x / y) for x in first for y in second))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "In summary, we learn from this example that unpacking `generator` objects *may* be a *bad* idea." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "### Example: Averaging all even Numbers in a List (revisited)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "With the new concepts in this chapter, let's rewrite the book's introductory \"*Averaging all even Numbers in a List*\" example from [Chapter 1 ](https://nbviewer.jupyter.org/github/webartifex/intro-to-python/blob/develop/01_elements/00_content.ipynb#Example:-Averaging-all-even-Numbers-in-a-List) such that it efficiently handles a large sequence of numbers. We continue from its latest implementation, the `average_evens()` function in the \"*Keyword-only Arguments*\" section in [Chapter 2 ](https://nbviewer.jupyter.org/github/webartifex/intro-to-python/blob/develop/02_functions/00_content.ipynb#Keyword-only-Arguments).\n", + "\n", + "We assume that `average_evens()` is called with a *finite* and *iterable* object that generates a **stream** of numeric objects that can be cast as `int` objects. After all, the idea of even and odd numbers makes sense only in the context of whole numbers.\n", + "\n", + "The `generator` expression `(round(n) for n in numbers)` implements the type casting, and, when it is evaluated during a function call, *nothing* happens except that a `generator` object is assigned to `integers`. Then, with the [reduce() ](https://docs.python.org/3/library/functools.html#functools.reduce) function from the [functools ](https://docs.python.org/3/library/functools.html) module, we *simultaneously* add up *and* count the even numbers with the `generator` object to which the inner `generator` expression `((n, 1) for n in integers if n % 2 == 0)` evaluates to. That `generator` object takes the `integers` generator as its source and produces `tuple` objects consisting of the next *even* number in line and `1`. Two such `tuple` objects are then iteratively passed to the `function` object to which the `lambda` expression evaluates to. `x` represents the total and the count of the even numbers processed so far, while `y`'s first element, `y[0]`, is the next *even* number to be added to the running total. The result of calling [reduce() ](https://docs.python.org/3/library/functools.html#functools.reduce) is again a `tuple` object, namely the final `total` and `count`. Lastly, we calculate the simple average and scale it.\n", + "\n", + "In summary, this implementation of `average_evens()` does *not* keep materialized `list` objects internally like its predecessors in [Chapter 2 ](https://nbviewer.jupyter.org/github/webartifex/intro-to-python/blob/develop/02_functions/00_content.ipynb) but processes the elements of the `numbers` argument on a one-by-one basis." + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "def average_evens(numbers, *, scalar=1):\n", + " \"\"\"Calculate the average of all even numbers.\n", + "\n", + " Args:\n", + " numbers (iterable): a finite stream of the numbers to be averaged;\n", + " if non-whole numbers are provided, they are rounded\n", + " scalar (float, optional): multiplies the average; defaults to 1\n", + "\n", + " Returns:\n", + " float: (scaled) average\n", + " \"\"\"\n", + " integers = (round(n) for n in numbers)\n", + " total, count = reduce(\n", + " lambda x, y: (x[0] + y[0], x[1] + y[1]),\n", + " ((n, 1) for n in integers if n % 2 == 0)\n", + " )\n", + " return scalar * total / count" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "7.0" + ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "average_evens([7, 11, 8, 5, 3, 12, 2, 6, 9, 10, 1, 4])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "We may provide an optional `scalar` argument as before." + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "14.0" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "average_evens([7, 11, 8, 5, 3, 12, 2, 6, 9, 10, 1, 4], scalar=2)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "An argument with `float` objects works just as well." + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "7.0" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "average_evens([7., 11., 8., 5., 3., 12., 2., 6., 9., 10., 1., 4.])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "To show that `average_evens()` can process a large stream of data, we simulate `10_000_000` randomly drawn integers between `0` and `100` with the [randint() ](https://docs.python.org/3/library/random.html#random.randint) function from the [random ](https://docs.python.org/3/library/random.html) module. We use a `generator` expression derived from a `range` object as the `numbers` argument. So, at *no* point in time is there a materialized `list` object in memory. The result approaching `50` confirms that [randint() ](https://docs.python.org/3/library/random.html#random.randint) must be based on a uniform distribution." + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "import random" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "outputs": [], + "source": [ + "random.seed(42)" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "49.994081434519636" + ] + }, + "execution_count": 55, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "average_evens(random.randint(0, 100) for _ in range(10_000_000))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "To show that `average_evens()` filters out odd numbers, we simulate another stream of `10_000_000` randomly drawn odd integers between `1` and `99`. As no function in the [random ](https://docs.python.org/3/library/random.html) module does that \"out of the box,\" we must be creative: Doubling a number drawn from `random.randint(0, 49)` results in an even number between `0` and `98`, and adding `1` makes it odd. Then, `average_evens()` raises a `TypeError`, essentially because `(int(n) for n in numbers)` does not generate any element." + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "outputs": [ + { + "ename": "TypeError", + "evalue": "reduce() of empty sequence with no initial value", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0maverage_evens\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m2\u001b[0m \u001b[0;34m*\u001b[0m \u001b[0mrandom\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrandint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m49\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;36m1\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0m_\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m10_000_000\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m\u001b[0m in \u001b[0;36maverage_evens\u001b[0;34m(numbers, scalar)\u001b[0m\n\u001b[1;32m 11\u001b[0m \"\"\"\n\u001b[1;32m 12\u001b[0m \u001b[0mintegers\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mround\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mn\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mn\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mnumbers\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 13\u001b[0;31m total, count = reduce(\n\u001b[0m\u001b[1;32m 14\u001b[0m \u001b[0;32mlambda\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mn\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mn\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mintegers\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mn\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0;36m2\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mTypeError\u001b[0m: reduce() of empty sequence with no initial value" + ] + } + ], + "source": [ + "average_evens(2 * random.randint(0, 49) + 1 for _ in range(10_000_000))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "## `tuple` Comprehensions" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "There is no syntax to derive *new* `tuple` objects out of existing ones. However, we can mimic such a construct by combining the built-in [tuple() ](https://docs.python.org/3/library/functions.html#func-tuple) constructor with a `generator` expression.\n", + "\n", + "So, to convert the `list` comprehension `[(n ** 2) + 1 for n in numbers if n % 2 == 0]` from above into a \"`tuple` comprehension,\" we write the following." + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(65, 145, 5, 37, 101, 17)" + ] + }, + "execution_count": 57, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tuple((n ** 2) + 1 for n in numbers if n % 2 == 0)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "## Boolean Reducers" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "Besides [min() ](https://docs.python.org/3/library/functions.html#min), [max() ](https://docs.python.org/3/library/functions.html#max), and [sum() ](https://docs.python.org/3/library/functions.html#sum), Python provides two boolean reduce functions: [all() ](https://docs.python.org/3/library/functions.html#all) and [any() ](https://docs.python.org/3/library/functions.html#any).\n", + "\n", + "Let's look at straightforward examples involving `numbers` again." + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "numbers = [7, 11, 8, 5, 3, 12, 2, 6, 9, 10, 1, 4]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "[all() ](https://docs.python.org/3/library/functions.html#all) takes an `iterable` argument and returns `True` if *all* elements are *truthy*.\n", + "\n", + "For example, let's check if the square of each element in `numbers` is below `100` or `150`, respectively. We express the computation with a `generator` expression passed as the only argument to [all() ](https://docs.python.org/3/library/functions.html#all)." + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "False" + ] + }, + "execution_count": 59, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "all(x ** 2 < 100 for x in numbers)" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 60, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "all(x ** 2 < 150 for x in numbers)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "[all() ](https://docs.python.org/3/library/functions.html#all) can be viewed as syntactic sugar replacing a `for`-loop: Internally, [all() ](https://docs.python.org/3/library/functions.html#all) implements the *short-circuiting* strategy explained in [Chapter 3 ](https://nbviewer.jupyter.org/github/webartifex/intro-to-python/blob/develop/03_conditionals/00_content.ipynb#Short-Circuiting), and we mimic that by testing for the *opposite* condition in the `if` statement and stopping the `for`-loop early with the `break` statement. In the worst case, if `threshold` were, for example, `150`, we would loop over *all* elements in the `iterable` argument, which must be *finite* for the code to work. So, [all() ](https://docs.python.org/3/library/functions.html#all) is a *linear search* in disguise." + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "False" + ] + }, + "execution_count": 61, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "threshold = 100\n", + "\n", + "for number in numbers:\n", + " if number ** 2 >= threshold: # the opposite of what we are checking for\n", + " all_below_threshold = False\n", + " break\n", + "else:\n", + " all_below_threshold = True\n", + "\n", + "all_below_threshold" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "The documentation of [all() ](https://docs.python.org/3/library/functions.html#all) shows in another way what it does with code: By placing a `return` statement in a `for`-loop's body inside a function, iteration is stopped prematurely once an element does *not* meet the condition. That is the familiar *early exit* pattern at work." + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "outputs": [], + "source": [ + "def all_alt(iterable):\n", + " \"\"\"Alternative implementation of the built-in all() function.\"\"\"\n", + " for element in iterable:\n", + " if not element: # the opposite of what we are checking for\n", + " return False\n", + " return True" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "False" + ] + }, + "execution_count": 63, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "all_alt(x ** 2 < 100 for x in numbers)" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 64, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "all_alt(x ** 2 < 150 for x in numbers)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "Similarly, [any() ](https://docs.python.org/3/library/functions.html#any) checks if *at least* one element in the `iterable` argument is *truthy*.\n", + "\n", + "To continue the example, let's check if the square of *any* element in `numbers` is above `100` or `150`, respectively." + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 65, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "any(x ** 2 > 100 for x in numbers)" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "False" + ] + }, + "execution_count": 66, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "any(x ** 2 > 150 for x in numbers)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "The code cell below shows how [any() ](https://docs.python.org/3/library/functions.html#any) works internally: It also follows the *short-circuiting* strategy. Here, we do *not* need to check for the opposite condition." + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 67, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "threshold = 100\n", + "\n", + "for number in numbers:\n", + " if number ** 2 > threshold:\n", + " any_above_threshold = True\n", + " break\n", + "else:\n", + " any_above_threshold = False\n", + "\n", + "any_above_threshold" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "The alternative formulation in the documentation of [any() ](https://docs.python.org/3/library/functions.html#any) is straightforward and also uses the early exit pattern." + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "outputs": [], + "source": [ + "def any_alt(iterable):\n", + " \"\"\"Alternative implementation of the built-in any() function.\"\"\"\n", + " for element in iterable:\n", + " if element:\n", + " return True\n", + " return False" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 69, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "any_alt(x ** 2 > 100 for x in numbers)" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "False" + ] + }, + "execution_count": 70, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "any_alt(x ** 2 > 150 for x in numbers)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.6" + }, + "livereveal": { + "auto_select": "code", + "auto_select_fragment": true, + "scroll": true, + "theme": "serif" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": false, + "sideBar": true, + "skip_h1_title": true, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": { + "height": "calc(100% - 180px)", + "left": "10px", + "top": "150px", + "width": "384px" + }, + "toc_section_display": false, + "toc_window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/CONTENTS.md b/CONTENTS.md index 476808d..ac3e1db 100644 --- a/CONTENTS.md +++ b/CONTENTS.md @@ -195,4 +195,10 @@ If this is not possible, (Mapping; Filtering; Reducing; - `lambda` Expression) \ No newline at end of file + `lambda` Expression) + - [content ](https://nbviewer.jupyter.org/github/webartifex/intro-to-python/blob/develop/08_mfr/01_content.ipynb) + [](https://mybinder.org/v2/gh/webartifex/intro-to-python/develop?urlpath=lab/tree/08_mfr/01_content.ipynb) + (`list` Comprehension; + `generator` Expression; + Streams of Data; + Boolean Reducers) \ No newline at end of file From f40c74f89e869826ea8131980a179a1bb89cd5e6 Mon Sep 17 00:00:00 2001 From: Alexander Hess Date: Wed, 21 Oct 2020 18:46:56 +0200 Subject: [PATCH 04/10] Explain what files are used and where they are --- CONTENTS.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/CONTENTS.md b/CONTENTS.md index ac3e1db..6291fb2 100644 --- a/CONTENTS.md +++ b/CONTENTS.md @@ -2,6 +2,21 @@ The materials are designed to resemble an *interactive* book. +The files come + primarily in the [Jupyter Notebook](https://jupyter-notebook.readthedocs.io/en/stable/) + format (i.e., \*.ipynb) + but also as [modules and packages ](https://docs.python.org/3/tutorial/modules.html) + (i.e., \*.py). +Together with some other static files (e.g., images), + they are stored in one folder per chapter in this repository. +They are to be opened + from within the [JupyterLab](https://jupyterlab.readthedocs.io/en/stable/) application, + even though other ways are certainly possible as well. +Both the files and the folders + are appropriately named with prefixes + indicating the order in which they should be read + and starting with "00_". + It is recommended to follow the [installation instructions](https://github.com/webartifex/intro-to-python#installation) in the [README.md](README.md) file From f4f2cf476e74334e04dcaef54b4079f03952a4e1 Mon Sep 17 00:00:00 2001 From: Alexander Hess Date: Wed, 21 Oct 2020 18:48:59 +0200 Subject: [PATCH 05/10] Add initial version of chapter 08's exercises, part 1 --- 08_mfr/02_exercises.ipynb | 760 ++++++++++++++++++++++++++++++++++++++ 08_mfr/stream.py | 51 +++ CONTENTS.md | 5 +- 3 files changed, 815 insertions(+), 1 deletion(-) create mode 100644 08_mfr/02_exercises.ipynb create mode 100644 08_mfr/stream.py diff --git a/08_mfr/02_exercises.ipynb b/08_mfr/02_exercises.ipynb new file mode 100644 index 0000000..0911aaf --- /dev/null +++ b/08_mfr/02_exercises.ipynb @@ -0,0 +1,760 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Note**: Click on \"*Kernel*\" > \"*Restart Kernel and Run All*\" in [JupyterLab](https://jupyterlab.readthedocs.io/en/stable/) *after* finishing the exercises to ensure that your solution runs top to bottom *without* any errors. If you cannot run this file on your machine, you may want to open it [in the cloud ](https://mybinder.org/v2/gh/webartifex/intro-to-python/develop?urlpath=lab/tree/08_mfr/02_exercises.ipynb)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Chapter 8: Map, Filter, & Reduce (Coding Exercises)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The exercises below assume that you have read the [first ](https://nbviewer.jupyter.org/github/webartifex/intro-to-python/blob/develop/08_mfr/00_content.ipynb) and [second ](https://nbviewer.jupyter.org/github/webartifex/intro-to-python/blob/develop/08_mfr/01_content.ipynb) part of Chapter 8.\n", + "\n", + "The `...`'s in the code cells indicate where you need to fill in code snippets. The number of `...`'s within a code cell give you a rough idea of how many lines of code are needed to solve the task. You should not need to create any additional code cells for your final solution. However, you may want to use temporary code cells to try out some ideas." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Removing Outliers in Streaming Data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's say we are given a `list` object with random integers like `sample` below, and we want to calculate some basic statistics on them." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sample = [\n", + " 45, 46, 40, 49, 36, 53, 49, 42, 25, 40, 39, 36, 38, 40, 40, 52, 36, 52, 40, 41,\n", + " 35, 29, 48, 43, 42, 30, 29, 33, 55, 33, 38, 50, 39, 56, 52, 28, 37, 56, 45, 37,\n", + " 41, 41, 37, 30, 51, 32, 23, 40, 53, 40, 45, 39, 99, 42, 34, 42, 34, 39, 39, 53,\n", + " 43, 37, 46, 36, 45, 42, 32, 38, 57, 34, 36, 44, 47, 51, 46, 39, 28, 40, 35, 46,\n", + " 41, 51, 41, 23, 46, 40, 40, 51, 50, 32, 47, 36, 38, 29, 32, 53, 34, 43, 39, 41,\n", + " 40, 34, 44, 40, 41, 43, 47, 57, 50, 42, 38, 25, 45, 41, 58, 37, 45, 55, 44, 53,\n", + " 82, 31, 45, 33, 32, 39, 46, 48, 42, 47, 40, 45, 51, 35, 31, 46, 40, 44, 61, 57,\n", + " 40, 36, 35, 55, 40, 56, 36, 35, 86, 36, 51, 40, 54, 50, 49, 36, 41, 37, 48, 41,\n", + " 42, 44, 40, 43, 51, 47, 46, 50, 40, 23, 40, 39, 28, 38, 42, 46, 46, 42, 46, 31,\n", + " 32, 40, 48, 27, 40, 40, 30, 32, 25, 31, 30, 43, 44, 29, 45, 41, 63, 32, 33, 58,\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "len(sample)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Q1**: `list` objects are **sequences**. What *four* behaviors do they always come with?" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " < your answer >" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Q2**: Write a function `mean()` that calculates the simple arithmetic mean of a given `sequence` with numbers!\n", + "\n", + "Hints: You can solve this task with [built-in functions ](https://docs.python.org/3/library/functions.html) only. A `for`-loop is *not* needed." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def mean(sequence):\n", + " ..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sample_mean = mean(sample)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sample_mean" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Q3**: Write a function `std()` that calculates the [standard deviation ](https://en.wikipedia.org/wiki/Standard_deviation) of a `sequence` of numbers! Integrate your `mean()` version from before and the [sqrt() ](https://docs.python.org/3/library/math.html#math.sqrt) function from the [math ](https://docs.python.org/3/library/math.html) module in the [standard library ](https://docs.python.org/3/library/index.html) provided to you below. Make sure `std()` calls `mean()` only *once* internally! Repeated calls to `mean()` would be a waste of computational resources.\n", + "\n", + "Hints: Parts of the code are probably too long to fit within the suggested 79 characters per line. So, use *temporary variables* inside your function. Instead of a `for`-loop, you may want to use a `list` comprehension or, even better, a memoryless `generator` expression." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from math import sqrt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def std(sequence):\n", + " ..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sample_std = std(sample)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sample_std" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Q4**: Complete `standardize()` below that takes a `sequence` of numbers and returns a `list` object with the **[z-scores ](https://en.wikipedia.org/wiki/Standard_score)** of these numbers! A z-score is calculated by subtracting the mean and dividing by the standard deviation. Re-use `mean()` and `std()` from before. Again, ensure that `standardize()` calls `mean()` and `std()` only *once*! Further, round all z-scores with the built-in [round() ](https://docs.python.org/3/library/functions.html#round) function and pass on the keyword-only argument `digits` to it.\n", + "\n", + "Hint: You may want to use a `list` comprehension instead of a `for`-loop." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def standardize(sequence, *, digits=3):\n", + " ..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "z_scores = standardize(sample)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The [pprint() ](https://docs.python.org/3/library/pprint.html#pprint.pprint) function from the [pprint ](https://docs.python.org/3/library/pprint.html) module in the [standard library ](https://docs.python.org/3/library/index.html) allows us to \"pretty print\" long `list` objects compactly." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pprint import pprint" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pprint(z_scores, compact=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We know that `standardize()` works correctly if the resulting z-scores' mean and standard deviation approach `0` and `1` for a long enough `sequence`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "mean(z_scores), std(z_scores)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Even though `standardize()` calls `mean()` and `std()` only once each, `mean()` is called *twice*! That is so because `std()` internally also re-uses `mean()`!\n", + "\n", + "**Q5.1**: Rewrite `std()` to take an optional keyword-only argument `seq_mean`, defaulting to `None`. If provided, `seq_mean` is used instead of the result of calling `mean()`. Otherwise, the latter is called.\n", + "\n", + "Hint: You must check if `seq_mean` is still the default value." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def std(sequence, *, seq_mean=None):\n", + " ..." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`std()` continues to work as before." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sample_std = std(sample)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sample_std" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Q5.2**: Now, rewrite `standardize()` to pass on the return value of `mean()` to `std()`! In summary, `standardize()` calculates the z-scores for the numbers in the `sequence` with as few computational steps as possible." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def standardize(sequence, *, digits=3):\n", + " ..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "z_scores = standardize(sample)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "mean(z_scores), std(z_scores)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Q6**: With both `sample` and `z_scores` being materialized `list` objects, we can loop over pairs consisting of a number from `sample` and its corresponding z-score. Write a `for`-loop that prints out all the \"outliers,\" as which we define numbers with an absolute z-score above `1.96`. There are *four* of them in the `sample`.\n", + "\n", + "Hint: Use the [abs() ](https://docs.python.org/3/library/functions.html#abs) and [zip() ](https://docs.python.org/3/library/functions.html#zip) built-ins." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "..." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We provide a `stream` module with a `data` object that models an *infinite* **stream** of data (cf., the [*stream.py* ](https://github.com/webartifex/intro-to-python/blob/develop/08_mfr/stream.py) file in the repository)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from stream import data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`data` is of type `generator` and has *no* length." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "type(data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "len(data)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "So, the only thing we can do with it is to pass it to the built-in [next() ](https://docs.python.org/3/library/functions.html#next) function and go over the numbers it streams one by one." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "next(data)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Q7**: What happens if you call `mean()` with `data` as the argument? What is the problem?\n", + "\n", + "Hints: If you try it out, you may have to press the \"Stop\" button in the toolbar at the top. Your computer should *not* crash, but you will *have to* restart this Jupyter notebook with \"Kernel\" > \"Restart\" and import `data` again." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " < your answer >" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "mean(data)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Q8**: Write a function `take_sample()` that takes an `iterable` as its argument, like `data`, and creates a *materialized* `list` object out of its first `n` elements, defaulting to `1_000`!\n", + "\n", + "Hints: [next() ](https://docs.python.org/3/library/functions.html#next) and the [range() ](https://docs.python.org/3/library/functions.html#func-range) built-in may be helpful. You may want to use a `list` comprehension instead of a `for`-loop and write a one-liner. Audacious students may want to look at [isclice() ](https://docs.python.org/3/library/itertools.html#itertools.islice) in the [itertools ](https://docs.python.org/3/library/itertools.html) module in the [standard library ](https://docs.python.org/3/library/index.html)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def take_sample(iterable, *, n=1_000):\n", + " ..." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We take a `new_sample` from the stream of `data`, and its statistics are similar to the initial `sample`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "new_sample = take_sample(data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "len(new_sample)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "mean(new_sample)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "std(new_sample)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Q9**: Convert `standardize()` into a *new* function `standardized()` that implements the *same* logic but works on a possibly *infinite* stream of data, provided as an `iterable`, instead of a *finite* `sequence`.\n", + "\n", + "To calculate a z-score, we need the stream's overall mean and standard deviation, and that is *impossible* to calculate if we do not know how long the stream is, and, in particular, if it is *infinite*. So, `standardized()` first takes a sample from the `iterable` internally, and uses the sample's mean and standard deviation to calculate the z-scores.\n", + "\n", + "Hint: `standardized()` *must* return a `generator` object. So, use a `generator` expression as the return value; unless you know about the `yield` statement already (cf., [reference ](https://docs.python.org/3/reference/simple_stmts.html#the-yield-statement))." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def standardized(iterable, *, digits=3):\n", + " ..." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`standardized()` works almost like `standardize()` except that we use it with [next() ](https://docs.python.org/3/library/functions.html#next) to obtain the z-scores one by one." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "z_scores = standardized(data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "z_scores" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "type(z_scores)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "next(z_scores)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Q10.1**: `standardized()` allows us to go over an *infinite* stream of z-scores. What we want to do instead is to loop over the stream's raw numbers and skip the outliers. In the remainder of this exercise, you look at the parts that make up the `skip_outliers()` function below to achieve precisely that.\n", + "\n", + "The first steps in `skip_outliers()` are the same as in `standardized()`: We take a `sample` from the stream of `data` and calculate its statistics." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sample = ...\n", + "seq_mean = ...\n", + "seq_std = ..." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Q10.2**: Just as in `standardized()`, write a `generator` expression that produces z-scores one by one! However, instead of just generating a z-score, the resulting `generator` object should produce `tuple` objects consisting of a \"raw\" number from `data` and its z-score.\n", + "\n", + "Hint: Look at the revisited \"*Averaging Even Numbers*\" example in [Chapter 7 ](https://nbviewer.jupyter.org/github/webartifex/intro-to-python/blob/develop/07_sequences/01_content.ipynb#Example:-Averaging-all-even-Numbers-in-a-List-%28revisited%29) for some inspiration, which also contains a `generator` expression producing `tuple` objects." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "standardizer = (... for ... in data)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`standardizer` should produce `tuple` objects." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "next(standardizer)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Q10.3**: Write another `generator` expression that loops over `standardizer`. It contains an `if`-clause that keeps only numbers with an absolute z-score below the `threshold_z`. If you fancy, use `tuple` unpacking." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "threshold_z = 1.96" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "no_outliers = (... for ... in standardizer if ...)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`no_outliers` should produce `int` objects." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "next(no_outliers)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Q10.4**: Lastly, put everything together in the `skip_outliers()` function! Make sure you refer to `iterable` inside the function and not the global `data`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def skip_outliers(iterable, *, threshold_z=1.96):\n", + " sample = ...\n", + " seq_mean = ...\n", + " seq_std = ...\n", + " standardizer = ...\n", + " no_outliers = ...\n", + " return no_outliers" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, we can create a `generator` object and loop over the `data` in the stream with outliers skipped. Instead of the default `1.96`, we use a `threshold_z` of only `0.05`: That filters out all numbers except `42`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "skipper = skip_outliers(data, threshold_z=0.05)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "skipper" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "type(skipper)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "next(skipper)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Q11**: You implemented the functions `mean()`, `std()`, `standardize()`, `standardized()`, and `skip_outliers()`. Which of them are **eager**, and which are **lazy**? How do these two concepts relate to **finite** and **infinite** data?" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " < your answer >" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.6" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": false, + "sideBar": true, + "skip_h1_title": true, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": {}, + "toc_section_display": false, + "toc_window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/08_mfr/stream.py b/08_mfr/stream.py new file mode 100644 index 0000000..6aa6cc7 --- /dev/null +++ b/08_mfr/stream.py @@ -0,0 +1,51 @@ +"""Simulation of random streams of data. + +This module defines: +- a generator object `data` modeling an infinite stream of integers +- a function `make_finite_stream()` that creates finite streams of data + +The probability distribution underlying the integers is Gaussian-like with a +mean of 42 and a standard deviation of 8. The left tail of the distribution is +cut off meaning that the streams only produce non-negative numbers. Further, +one in a hundred random numbers has an increased chance to be an outlier. +""" + +import itertools as _itertools +import random as _random + + +_random.seed(87) + + +def _infinite_stream(): + """Internal generator function to simulate an infinite stream of data.""" + while True: + number = max(0, int(_random.gauss(42, 8))) + if _random.randint(1, 100) == 1: + number *= 2 + yield number + + +def make_finite_stream(min_=5, max_=15): + """Simulate a finite stream of data. + + The returned stream is finite, but the number of elements to be produced + by it is still random. This default behavior may be turned off by passing + in `min_` and `max_` arguments with `min_ == max_`. + + Args: + min_ (optional, int): minimum numbers in the stream; defaults to 5 + max_ (optional, int): maximum numbers in the stream; defaults to 15 + + Returns: + finite_stream (generator) + + Raises: + ValueError: if max_ < min_ + """ + stream = _infinite_stream() + n = _random.randint(min_, max_) + yield from _itertools.islice(stream, n) + + +data = _infinite_stream() diff --git a/CONTENTS.md b/CONTENTS.md index 6291fb2..c377dbd 100644 --- a/CONTENTS.md +++ b/CONTENTS.md @@ -216,4 +216,7 @@ If this is not possible, (`list` Comprehension; `generator` Expression; Streams of Data; - Boolean Reducers) \ No newline at end of file + Boolean Reducers) + - [exercises ](https://nbviewer.jupyter.org/github/webartifex/intro-to-python/blob/develop/08_mfr/02_exercises.ipynb) + [](https://mybinder.org/v2/gh/webartifex/intro-to-python/develop?urlpath=lab/tree/08_mfr/02_exercises.ipynb) + (Removing Outliers in Streaming Data) \ No newline at end of file From 7ca4754140741034c538728eb36e72caa8dbf91e Mon Sep 17 00:00:00 2001 From: Alexander Hess Date: Wed, 21 Oct 2020 19:03:56 +0200 Subject: [PATCH 06/10] Add initial version of chapter 08's exercises, part 2 --- 08_mfr/03_exercises.ipynb | 455 ++++++++++++++++++++++++++++++++++++++ CONTENTS.md | 5 +- 2 files changed, 459 insertions(+), 1 deletion(-) create mode 100644 08_mfr/03_exercises.ipynb diff --git a/08_mfr/03_exercises.ipynb b/08_mfr/03_exercises.ipynb new file mode 100644 index 0000000..b93761a --- /dev/null +++ b/08_mfr/03_exercises.ipynb @@ -0,0 +1,455 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Note**: Click on \"*Kernel*\" > \"*Restart Kernel and Run All*\" in [JupyterLab](https://jupyterlab.readthedocs.io/en/stable/) *after* finishing the exercises to ensure that your solution runs top to bottom *without* any errors. If you cannot run this file on your machine, you may want to open it [in the cloud ](https://mybinder.org/v2/gh/webartifex/intro-to-python/develop?urlpath=lab/tree/08_mfr/03_exercises.ipynb)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Chapter 8: Map, Filter, & Reduce (Coding Exercises)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The exercises below assume that you have read the [first ](https://nbviewer.jupyter.org/github/webartifex/intro-to-python/blob/develop/08_mfr/00_content.ipynb) and [second ](https://nbviewer.jupyter.org/github/webartifex/intro-to-python/blob/develop/08_mfr/01_content.ipynb) part of Chapter 8.\n", + "\n", + "The `...`'s in the code cells indicate where you need to fill in code snippets. The number of `...`'s within a code cell give you a rough idea of how many lines of code are needed to solve the task. You should not need to create any additional code cells for your final solution. However, you may want to use temporary code cells to try out some ideas." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Packing & Unpacking with Functions (continued)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Q1**: Copy your solution to **Q10** from the \"*Packing & Unpacking with Functions*\" exercise in [Chapter 7 ](https://nbviewer.jupyter.org/github/webartifex/intro-to-python/blob/develop/07_sequences/04_exercises.ipynb) into the code cell below!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import collections.abc as abc" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def product(*args, ...):\n", + " \"\"\"Multiply all arguments.\"\"\"\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + "\n", + " ...\n", + " ...\n", + "\n", + " return ..." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Q2**: Verify that all test cases below work (i.e., the `assert` statements must *not* raise an `AssertionError`)!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "assert product(42) == 42" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "assert product(2, 5, 10) == 100" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "assert product(2, 5, 10, start=2) == 200" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "one_hundred = [2, 5, 10]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "assert product(one_hundred) == 100" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "assert product(*one_hundred) == 100" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Q3**: Verify that `product()` raises a `TypeError` when called without any arguments!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "product()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This implementation of `product()` is convenient to use, in particular, because we can pass it any *collection* object with or without *unpacking* it.\n", + "\n", + "However, `product()` suffers from one last flaw: We cannot pass it a **stream** of data, as modeled, for example, with a `generator` object that produces elements on a one-by-one basis.\n", + "\n", + "**Q4**: Click through the following code cells and observe what they do!\n", + "\n", + "The [*stream.py* ](https://github.com/webartifex/intro-to-python/blob/develop/08_mfr/stream.py) module in the book's repository provides a `make_finite_stream()` function. It is a *factory* function creating objects of type `generator` that we use to model *streaming* data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from stream import make_finite_stream" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data = make_finite_stream()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "type(data)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`generator` objects are good for only *one* thing: Giving us the \"next\" element in a series of possibly *infinitely* many objects. While the `data` object is finite (i.e., execute the next code cell until you see a `StopIteration` exception), ..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "next(data)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "... it has *no* concept of a \"length:\" The built-in [len() ](https://docs.python.org/3/library/functions.html#len) function raises a `TypeError`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "len(data)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can use the built-in [list() ](https://docs.python.org/3/library/functions.html#func-list) constructor to *materialize* all elements. However, in a real-world scenario, these may *not* fit into our machine's memory! If you get an empty `list` object below, you have to create a *new* `data` object above again." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "list(data)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To be more realistic, `make_finite_stream()` creates `generator` objects producing a varying number of elements." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "list(make_finite_stream())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "list(make_finite_stream())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "list(make_finite_stream())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's see what happens if we pass a `generator` object, as created by `make_finite_stream()`, instead of a materialized *collection*, like `one_hundred`, to `product()`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "product(make_finite_stream())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Q5**: What line causes the `TypeError`? What line is really the problem in `product()`? Hint: These may be different lines. Describe what happens on each line in the function's body until the exception is raised!" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " < your answer >" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Q6**: Adapt `product()` one last time to make it work with `generator` objects, or more generallz *iterators*, as well!\n", + "\n", + "Hints: This task is as easy as replacing `Collection` with something else. Which of the three behaviors of *collections* do `generator` objects also exhibit? You may want to look at the documentations on the built-in [max() ](https://docs.python.org/3/library/functions.html#max), [min() ](https://docs.python.org/3/library/functions.html#min), and [sum() ](https://docs.python.org/3/library/functions.html#sum) functions: What kind of argument do they take?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def product(*args, ...):\n", + " \"\"\"Multiply all arguments.\"\"\"\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + "\n", + " ...\n", + " ...\n", + "\n", + " return ..." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The final version of `product()` behaves like built-ins in edge cases (i.e., `sum()` also raises a `TypeError` when called without arguments), ..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "product()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "... works with the arguments passed either separately as *positional* arguments, *packed* together into a single *collection* argument, or *unpacked*, ..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "product(42)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "product(2, 5, 10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "product([2, 5, 10])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "product(*[2, 5, 10])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "... and can handle *streaming* data with *indefinite* \"length.\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "product(make_finite_stream())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In real-world projects, the data science practitioner must decide if it is worthwhile to make a function usable in various different forms as we do in this exercise. This may be over-engineered.\n", + "\n", + "Yet, two lessons are important to take away:\n", + "- It is a good idea to *mimic* the behavior of *built-ins* when accepting arguments, and\n", + "- make functions capable of working with *streaming* data." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.6" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": false, + "sideBar": true, + "skip_h1_title": true, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": {}, + "toc_section_display": false, + "toc_window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/CONTENTS.md b/CONTENTS.md index c377dbd..971c890 100644 --- a/CONTENTS.md +++ b/CONTENTS.md @@ -219,4 +219,7 @@ If this is not possible, Boolean Reducers) - [exercises ](https://nbviewer.jupyter.org/github/webartifex/intro-to-python/blob/develop/08_mfr/02_exercises.ipynb) [](https://mybinder.org/v2/gh/webartifex/intro-to-python/develop?urlpath=lab/tree/08_mfr/02_exercises.ipynb) - (Removing Outliers in Streaming Data) \ No newline at end of file + (Removing Outliers in Streaming Data) + - [exercises ](https://nbviewer.jupyter.org/github/webartifex/intro-to-python/blob/develop/08_mfr/03_exercises.ipynb) + [](https://mybinder.org/v2/gh/webartifex/intro-to-python/develop?urlpath=lab/tree/08_mfr/03_exercises.ipynb) + (Packing & Unpacking with Functions, continued) \ No newline at end of file From dc03b7ad64e8437bef1006b4979f9073cebfed2c Mon Sep 17 00:00:00 2001 From: Alexander Hess Date: Wed, 21 Oct 2020 19:17:43 +0200 Subject: [PATCH 07/10] Add initial version of chapter 08, part 3 --- 08_mfr/04_content.ipynb | 1672 +++++++++++++++++++++++++++++++++++++++ CONTENTS.md | 6 +- 2 files changed, 1677 insertions(+), 1 deletion(-) create mode 100644 08_mfr/04_content.ipynb diff --git a/08_mfr/04_content.ipynb b/08_mfr/04_content.ipynb new file mode 100644 index 0000000..ff55376 --- /dev/null +++ b/08_mfr/04_content.ipynb @@ -0,0 +1,1672 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "**Note**: Click on \"*Kernel*\" > \"*Restart Kernel and Clear All Outputs*\" in [JupyterLab](https://jupyterlab.readthedocs.io/en/stable/) *before* reading this notebook to reset its output. If you cannot run this file on your machine, you may want to open it [in the cloud ](https://mybinder.org/v2/gh/webartifex/intro-to-python/develop?urlpath=lab/tree/08_mfr/04_content.ipynb)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "# Chapter 8: Map, Filter, & Reduce" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "Similarly to how we classify different *concrete* data types like `list` or `str` by how they behave *abstractly* in a given context in [Chapter 7 ](https://nbviewer.jupyter.org/github/webartifex/intro-to-python/blob/develop/07_sequences/00_content.ipynb#Collections-vs.-Sequences), we also do so for the data types we have introduced in this chapter." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "## Iterators vs. Iterables" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "Here, the `map`, `filter`, and `generator` types all behave like \"rules\" in memory that govern how objects are produced \"on the fly.\" Their main commonality is their support for the built-in [next() ](https://docs.python.org/3/library/functions.html#next) function. In computer science terminology, such data types are called **[iterators ](https://en.wikipedia.org/wiki/Iterator)**, and the [collections.abc ](https://docs.python.org/3/library/collections.abc.html) module formalizes them with the `Iterator` ABC in Python.\n", + "\n", + "So, one example of an iterator is `evens_transformed` below, an object of type `generator`." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "numbers = [7, 11, 8, 5, 3, 12, 2, 6, 9, 10, 1, 4]" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "outputs": [], + "source": [ + "evens_transformed = ((x ** 2) + 1 for x in numbers if x % 2 == 0)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "Let's first confirm that `evens_transformed` is indeed an `Iterator`, \"abstractly speaking.\"" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "outputs": [], + "source": [ + "import collections.abc as abc" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "isinstance(evens_transformed, abc.Iterator)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "In Python, iterators are *always* also iterables. The reverse is *not* true! To be precise, iterators are *specializations* of iterables. That is what the \"Inherits from\" column means in the [collections.abc ](https://docs.python.org/3/library/collections.abc.html) module's documentation." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "isinstance(evens_transformed, abc.Iterable)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "Furthermore, we sharpen our definition of an *iterable* from [Chapter 7 ](https://nbviewer.jupyter.org/github/webartifex/intro-to-python/blob/develop/07_sequences/00_content.ipynb#Collections-vs.-Sequences): Just as we define an *iterator* to be any object that supports the [next() ](https://docs.python.org/3/library/functions.html#next) function, we define an *iterable* to be any object that supports the built-in [iter() ](https://docs.python.org/3/library/functions.html#iter) function.\n", + "\n", + "The confused reader may now be wondering how the two concepts relate to each other.\n", + "\n", + "In short, the [iter() ](https://docs.python.org/3/library/functions.html#iter) function is the general way to create an *iterator* object out of a given *iterable* object. Then, the *iterator* object manages the iteration over the *iterable* object. In real-world code, we hardly ever see [iter() ](https://docs.python.org/3/library/functions.html#iter) as Python calls it for us in the background. \n", + "\n", + "For illustration, let's do that ourselves and create *two* iterators out of the iterable `numbers` and see what we can do with them." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [], + "source": [ + "iterator1 = iter(numbers)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [], + "source": [ + "iterator2 = iter(numbers)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "`iterator1` and `iterator2` are of type `list_iterator`." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "list_iterator" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "type(iterator1)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "*Iterators* are useful for only *one* thing: Get the next object from the associated *iterable*.\n", + "\n", + "By calling [next() ](https://docs.python.org/3/library/functions.html#next) three times with `iterator1` as the argument, we obtain the first three elements of `numbers`." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(7, 11, 8)" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "next(iterator1), next(iterator1), next(iterator1)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "`iterator1` and `iterator2` keep their *states* separate. So, we could loop over the same *iterable* several times in parallel." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(5, 7)" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "next(iterator1), next(iterator2)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "We can also play a \"trick\" and exchange some elements in `numbers`. `iterator1` and `iterator2` do *not* see these changes and present us with the new elements. So, *iterators* not only have state on their own but also keep this separate from the underlying *iterable*." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "outputs": [], + "source": [ + "numbers[1], numbers[4] = 99, 99" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(99, 99)" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "next(iterator1), next(iterator2)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "Let's re-assign the elements in `numbers` so that they are in order. After that, the numbers returned from [next() ](https://docs.python.org/3/library/functions.html#next) also tell us how often [next() ](https://docs.python.org/3/library/functions.html#next) was called with `iterator1` or `iterator2`. We conclude that `list_iterator` objects work by simply keeping track of the *last* index obtained from the underlying *iterable*." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "outputs": [], + "source": [ + "numbers[:] = list(range(1, 13))" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "numbers" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(6, 3)" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "next(iterator1), next(iterator2)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "With the concepts introduced in this section, we can now understand the first sentence in the documentation on the [zip() ](https://docs.python.org/3/library/functions.html#zip) built-in better: \"Make an *iterator* that aggregates elements from each of the *iterables*.\"\n", + "\n", + "Because *iterators* are always also *iterables*, we may pass `iterator1` and `iterator2` as arguments to [zip() ](https://docs.python.org/3/library/functions.html#zip).\n", + "\n", + "The returned `zipper` object is of type `zip` and, \"abstractly speaking,\" an `Iterator` as well." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "zipper = zip(iterator1, iterator2)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "zipper" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "zip" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "type(zipper)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "isinstance(zipper, abc.Iterator)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "So far, we have always used [zip() ](https://docs.python.org/3/library/functions.html#zip) in a `for`-loop. That was our earlier definition of an *iterable*. Our revised definition in this section states that an *iterable* is an object that supports the [iter() ](https://docs.python.org/3/library/functions.html#iter) function. So, let's see what happens if we pass `zipper` to [iter() ](https://docs.python.org/3/library/functions.html#iter)." + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "outputs": [], + "source": [ + "zipper_iterator = iter(zipper)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "zipper_iterator" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "`zipper_iterator` references the *same* object as `zipper`! That is true for *iterators* in general: Any *iterator* created from an existing *iterator* with [iter() ](https://docs.python.org/3/library/functions.html#iter) is the *iterator* itself." + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "zipper is zipper_iterator" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "The Python core developers made that design decision so that *iterators* may also be looped over.\n", + "\n", + "The `for`-loop below prints out only *six* more `tuple` objects derived from the now ordered `numbers` because the `iterator1` object hidden inside `zipper` already returned its first *six* elements. So, the respective first elements of the `tuple` objects printed range from `7` to `12`. Similarly, as `iterator2` already returned its first *three* elements from `numbers`, we see the respective second elements in the range from `4` to `9`." + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "7 and 4 8 and 5 9 and 6 10 and 7 11 and 8 12 and 9 " + ] + } + ], + "source": [ + "for x, y in zipper:\n", + " print(x, \"and\", y, end=\" \")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "`zipper` is now *exhausted*. So, the `for`-loop below does *not* make any iteration at all." + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "outputs": [], + "source": [ + "for x, y in zipper:\n", + " raise RuntimeError(\"We won't see this error\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "We verify that `iterator1` is exhausted by passing it to [next() ](https://docs.python.org/3/library/functions.html#next) again, which raises a `StopIteration` exception." + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [ + { + "ename": "StopIteration", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mStopIteration\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mnext\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0miterator1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;31mStopIteration\u001b[0m: " + ] + } + ], + "source": [ + "next(iterator1)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "On the contrary, `iterator2` is *not* yet exhausted." + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "10" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "next(iterator2)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "Understanding *iterators* and *iterables* is helpful for any data science practitioner that deals with large amounts of data. Even without that, these two terms occur everywhere in Python-related texts and documentation. So, a beginner should regularly review this section until it becomes second nature." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "## The `for` Statement (revisited)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "In [Chapter 4 ](https://nbviewer.jupyter.org/github/webartifex/intro-to-python/blob/develop/04_iteration/02_content.ipynb#The-for-Statement), we argue that the `for` statement is syntactic sugar, replacing the `while` statement in many common scenarios. In particular, a `for`-loop saves us two tasks: Managing an index variable *and* obtaining the individual elements by indexing. In this sub-section, we look at a more realistic picture, using the new terminology as well.\n", + "\n", + "Let's print out the elements of a `list` object as the *iterable* being looped over." + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "iterable = [0, 1, 2, 3, 4]" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0 1 2 3 4 " + ] + } + ], + "source": [ + "for element in iterable:\n", + " print(element, end=\" \")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "Our previous and equivalent formulation with a `while` statement is like so." + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0 1 2 3 4 " + ] + } + ], + "source": [ + "index = 0\n", + "\n", + "while index < len(iterable):\n", + " element = iterable[index]\n", + " print(element, end=\" \")\n", + " index += 1\n", + "\n", + "del index" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "What actually happens behind the scenes in the Python interpreter is shown below.\n", + "\n", + "First, Python calls [iter() ](https://docs.python.org/3/library/functions.html#iter) with the `iterable` to be looped over as the argument. The returned `iterator` contains the entire logic of how the `iterable` is looped over. In particular, the `iterator` may or may not pick the `iterable`'s elements in a predictable order. That is up to the \"rule\" it models.\n", + "\n", + "Second, Python enters an *indefinite* `while`-loop. It tries to obtain the next element with [next() ](https://docs.python.org/3/library/functions.html#next). If that succeeds, the `for`-loop's code block is executed. Below, that code is placed within the `else`-clause that runs only if *no* exception is raised in the `try`-clause. Then, Python jumps into the next iteration and tries to obtain the next element from the `iterator`, and so on. Once the `iterator` is exhausted, it raises a `StopIteration` exception, and Python stops the `while`-loop with the `break` statement." + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0 1 2 3 4 " + ] + } + ], + "source": [ + "iterator = iter(iterable)\n", + "\n", + "while True:\n", + " try:\n", + " element = next(iterator)\n", + " except StopIteration:\n", + " break\n", + " else:\n", + " print(element, end=\" \")\n", + "\n", + "del iterator" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "## sorted() vs. reversed()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "Now that we know the concept of an *iterator*, let's compare some of the built-ins introduced in [Chapter 7 ](https://nbviewer.jupyter.org/github/webartifex/intro-to-python/blob/develop/07_sequences/00_content.ipynb) in detail and make sure we understand what is going on in memory. This section also serves as a summary of all the concepts in this chapter.\n", + "\n", + "We use two simple examples, `numbers` and `memoryless`: `numbers` creates *thirteen* objects in memory and `memoryless` only *one* (cf., [PythonTutor ](http://pythontutor.com/visualize.html#code=numbers%20%3D%20%5B7,%2011,%208,%205,%203,%2012,%202,%206,%209,%2010,%201,%204%5D%0Amemoryless%20%3D%20range%281,%2013%29&cumulative=false&curInstr=2&heapPrimitives=nevernest&mode=display&origin=opt-frontend.js&py=3&rawInputLstJSON=%5B%5D&textReferences=false))." + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [], + "source": [ + "numbers = [7, 11, 8, 5, 3, 12, 2, 6, 9, 10, 1, 4]" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [], + "source": [ + "memoryless = range(1, 13)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "The [sorted() ](https://docs.python.org/3/library/functions.html#sorted) function takes a *finite* `iterable` argument and *materializes* its elements into a *new* `list` object that is returned.\n", + "\n", + "The argument may already be materialized, as is the case with `numbers`, but may also be an *iterable* without any objects in it, such as `memoryless`. In both cases, we end up with materialized `list` objects with the elements sorted in *forward* order (cf., [PythonTutor ](http://pythontutor.com/visualize.html#code=numbers%20%3D%20%5B7,%2011,%208,%205,%203,%2012,%202,%206,%209,%2010,%201,%204%5D%0Amemoryless%20%3D%20range%281,%2013%29%0Aresult1%20%3D%20sorted%28numbers%29%0Aresult2%20%3D%20sorted%28memoryless%29&cumulative=false&curInstr=4&heapPrimitives=nevernest&mode=display&origin=opt-frontend.js&py=3&rawInputLstJSON=%5B%5D&textReferences=false))." + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sorted(numbers)" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sorted(memoryless)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "By adding a keyword-only argument `reverse=True`, the materialized `list` objects are sorted in *reverse* order (cf., [PythonTutor ](http://pythontutor.com/visualize.html#code=numbers%20%3D%20%5B7,%2011,%208,%205,%203,%2012,%202,%206,%209,%2010,%201,%204%5D%0Amemoryless%20%3D%20range%281,%2013%29%0Aresult1%20%3D%20sorted%28numbers,%20reverse%3DTrue%29%0Aresult2%20%3D%20sorted%28memoryless,%20reverse%3DTrue%29&cumulative=false&curInstr=4&heapPrimitives=nevernest&mode=display&origin=opt-frontend.js&py=3&rawInputLstJSON=%5B%5D&textReferences=false))." + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1]" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sorted(numbers, reverse=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1]" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sorted(memoryless, reverse=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "The order in `numbers` remains *unchanged*, and `memoryless` is still *not* materialized." + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[7, 11, 8, 5, 3, 12, 2, 6, 9, 10, 1, 4]" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "numbers" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "range(1, 13)" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "memoryless" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "The [reversed() ](https://docs.python.org/3/library/functions.html#reversed) built-in takes a `sequence` argument and returns an *iterator*. The argument must be *finite* and *reversible* (i.e., *iterable* in *reverse* order) as otherwise [reversed() ](https://docs.python.org/3/library/functions.html#reversed) could neither determine the last element that becomes the first nor loop in a *predictable* backward fashion. [PythonTutor ](http://pythontutor.com/visualize.html#code=numbers%20%3D%20%5B7,%2011,%208,%205,%203,%2012,%202,%206,%209,%2010,%201,%204%5D%0Amemoryless%20%3D%20range%281,%2013%29%0Aiterator1%20%3D%20reversed%28numbers%29%0Aiterator2%20%3D%20reversed%28memoryless%29&cumulative=false&curInstr=4&heapPrimitives=nevernest&mode=display&origin=opt-frontend.js&py=3&rawInputLstJSON=%5B%5D&textReferences=false) confirms that [reversed() ](https://docs.python.org/3/library/functions.html#reversed) does *not* materialize any elements but only returns an *iterator*.\n", + "\n", + "**Side Note**: Even though `range` objects, like `memoryless` here, do *not* \"contain\" references to other objects, they count as *sequence* types, and as such, they are also *container* types. The `in` operator works with `range` objects because we can always cast the object to be checked as an `int` and check if that lies within the `range` object's `start` and `stop` values, taking a potential `step` value into account (cf., this [blog post](https://treyhunner.com/2018/02/python-range-is-not-an-iterator/) for more details on the [range() ](https://docs.python.org/3/library/functions.html#func-range) built-in)." + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "reversed(numbers)" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "reversed(memoryless)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "To materialize the elements, we can pass the returned *iterators* to, for example, the [list() ](https://docs.python.org/3/library/functions.html#func-list) or [tuple() ](https://docs.python.org/3/library/functions.html#func-tuple) constructors. That creates *new* `list` and `tuple` objects (cf., [PythonTutor ](http://pythontutor.com/visualize.html#code=numbers%20%3D%20%5B7,%2011,%208,%205,%203,%2012,%202,%206,%209,%2010,%201,%204%5D%0Amemoryless%20%3D%20range%281,%2013%29%0Aresult1%20%3D%20list%28reversed%28numbers%29%29%0Aresult2%20%3D%20tuple%28reversed%28memoryless%29%29&cumulative=false&curInstr=4&heapPrimitives=nevernest&mode=display&origin=opt-frontend.js&py=3&rawInputLstJSON=%5B%5D&textReferences=false)).\n", + "\n", + "To reiterate some more new terminology from this chapter, we describe [reversed() ](https://docs.python.org/3/library/functions.html#reversed) as *lazy* whereas [list() ](https://docs.python.org/3/library/functions.html#func-list) and [tuple() ](https://docs.python.org/3/library/functions.html#func-tuple) are *eager*. The former has no significant side effect in memory, while the latter may require a lot of memory." + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[4, 1, 10, 9, 6, 2, 12, 3, 5, 8, 11, 7]" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "list(reversed(numbers))" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tuple(reversed(memoryless))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "Of course, we can also loop over the returned *iterators* instead.\n", + "\n", + "That works because *iterators* are always *iterable*; in particular, as the previous \"*The for Statement (revisited)*\" sub-section explains, the `for`-loops below call `iter(reversed(numbers))` and `iter(reversed(memoryless))` behind the scenes. However, the *iterators* returned by [iter() ](https://docs.python.org/3/library/functions.html#iter) are the *same* as the `reversed(numbers)` and `reversed(memoryless)` iterators passed in! In summary, the `for`-loops below involve many subtleties that together make Python the expressive language it is." + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "4 1 10 9 6 2 12 3 5 8 11 7 " + ] + } + ], + "source": [ + "for number in reversed(numbers):\n", + " print(number, end=\" \")" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "12 11 10 9 8 7 6 5 4 3 2 1 " + ] + } + ], + "source": [ + "for element in reversed(memoryless):\n", + " print(element, end=\" \")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "As with [sorted() ](https://docs.python.org/3/library/functions.html#sorted), the [reversed() ](https://docs.python.org/3/library/functions.html#reversed) built-in does *not* mutate its argument." + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[7, 11, 8, 5, 3, 12, 2, 6, 9, 10, 1, 4]" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "numbers" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "range(1, 13)" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "memoryless" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "To point out the potentially obvious, we compare the results of *sorting* `numbers` in *reverse* order with *reversing* it: These are *different* concepts!" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[7, 11, 8, 5, 3, 12, 2, 6, 9, 10, 1, 4]" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "numbers" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1]" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sorted(numbers, reverse=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[4, 1, 10, 9, 6, 2, 12, 3, 5, 8, 11, 7]" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "list(reversed(numbers))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "Whereas both [sorted() ](https://docs.python.org/3/library/functions.html#sorted) and [reversed() ](https://docs.python.org/3/library/functions.html#reversed) do *not* mutate their arguments, the *mutable* `list` type comes with two methods, [sort() ](https://docs.python.org/3/library/stdtypes.html#list.sort) and `reverse()`, that implement the same logic but mutate an object, like `numbers` below, *in place*. To indicate that all changes occur *in place*, the [sort() ](https://docs.python.org/3/library/stdtypes.html#list.sort) and `reverse()` methods always return `None`, which is not shown in JupyterLab." + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[7, 11, 8, 5, 3, 12, 2, 6, 9, 10, 1, 4]" + ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "numbers" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "The `reverse()` method on the `list` type is *eager*, as opposed to the *lazy* [reversed() ](https://docs.python.org/3/library/functions.html#reversed) built-in. That means the mutations caused by the `reverse()` method are written into memory right away." + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "outputs": [], + "source": [ + "numbers.reverse()" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[4, 1, 10, 9, 6, 2, 12, 3, 5, 8, 11, 7]" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "numbers" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "*Sorting* `numbers` in place occurs eagerly." + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "outputs": [], + "source": [ + "numbers.sort(reverse=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1]" + ] + }, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "numbers" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "outputs": [], + "source": [ + "numbers.sort()" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]" + ] + }, + "execution_count": 56, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "numbers" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.6" + }, + "livereveal": { + "auto_select": "code", + "auto_select_fragment": true, + "scroll": true, + "theme": "serif" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": false, + "sideBar": true, + "skip_h1_title": true, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": { + "height": "calc(100% - 180px)", + "left": "10px", + "top": "150px", + "width": "384px" + }, + "toc_section_display": false, + "toc_window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/CONTENTS.md b/CONTENTS.md index 971c890..1309419 100644 --- a/CONTENTS.md +++ b/CONTENTS.md @@ -222,4 +222,8 @@ If this is not possible, (Removing Outliers in Streaming Data) - [exercises ](https://nbviewer.jupyter.org/github/webartifex/intro-to-python/blob/develop/08_mfr/03_exercises.ipynb) [](https://mybinder.org/v2/gh/webartifex/intro-to-python/develop?urlpath=lab/tree/08_mfr/03_exercises.ipynb) - (Packing & Unpacking with Functions, continued) \ No newline at end of file + (Packing & Unpacking with Functions, continued) + - [content ](https://nbviewer.jupyter.org/github/webartifex/intro-to-python/blob/develop/08_mfr/04_content.ipynb) + [](https://mybinder.org/v2/gh/webartifex/intro-to-python/develop?urlpath=lab/tree/08_mfr/04_content.ipynb) + (Iterators vs. Iterables; + Example: `sorted()` vs. `reversed()`) \ No newline at end of file From 4d5b23ff03234d181688afe66c11b1dbb82c5871 Mon Sep 17 00:00:00 2001 From: Alexander Hess Date: Wed, 21 Oct 2020 19:19:20 +0200 Subject: [PATCH 08/10] Add initial version of chapter 08's summary --- 08_mfr/05_summary.ipynb | 75 +++++++++++++++++++++++++++++++++++++++++ CONTENTS.md | 3 +- 2 files changed, 77 insertions(+), 1 deletion(-) create mode 100644 08_mfr/05_summary.ipynb diff --git a/08_mfr/05_summary.ipynb b/08_mfr/05_summary.ipynb new file mode 100644 index 0000000..361fbd0 --- /dev/null +++ b/08_mfr/05_summary.ipynb @@ -0,0 +1,75 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "slide" + } + }, + "source": [ + "# Chapter 8: Map, Filter, & Reduce (TL;DR)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "slideshow": { + "slide_type": "skip" + } + }, + "source": [ + "The operations we do with sequential data commonly follow the **map-filter-reduce paradigm**: We apply the same transformation to all elements, filter some of them out, and calculate summary statistics from the remaining ones.\n", + "\n", + "An essential idea in this chapter is that, in many situations, we need *not* have all the data **materialized** in memory. Instead, **iterators** allow us to process sequential data on a one-by-one basis.\n", + "\n", + "Examples for iterators are the `map`, `filter`, and `generator` types." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.6" + }, + "livereveal": { + "auto_select": "code", + "auto_select_fragment": true, + "scroll": true, + "theme": "serif" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": false, + "sideBar": true, + "skip_h1_title": true, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": { + "height": "calc(100% - 180px)", + "left": "10px", + "top": "150px", + "width": "384px" + }, + "toc_section_display": false, + "toc_window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/CONTENTS.md b/CONTENTS.md index 1309419..332a42f 100644 --- a/CONTENTS.md +++ b/CONTENTS.md @@ -226,4 +226,5 @@ If this is not possible, - [content ](https://nbviewer.jupyter.org/github/webartifex/intro-to-python/blob/develop/08_mfr/04_content.ipynb) [](https://mybinder.org/v2/gh/webartifex/intro-to-python/develop?urlpath=lab/tree/08_mfr/04_content.ipynb) (Iterators vs. Iterables; - Example: `sorted()` vs. `reversed()`) \ No newline at end of file + Example: `sorted()` vs. `reversed()`) + - [summary ](https://nbviewer.jupyter.org/github/webartifex/intro-to-python/blob/develop/08_mfr/05_summary.ipynb) \ No newline at end of file From b1a55631147e8f4f6aa055265e4ba88d4bb40a51 Mon Sep 17 00:00:00 2001 From: Alexander Hess Date: Wed, 21 Oct 2020 19:22:08 +0200 Subject: [PATCH 09/10] Add initial version of chapter 08's review --- 08_mfr/06_review.ipynb | 214 +++++++++++++++++++++++++++++++++++++++++ CONTENTS.md | 3 +- 2 files changed, 216 insertions(+), 1 deletion(-) create mode 100644 08_mfr/06_review.ipynb diff --git a/08_mfr/06_review.ipynb b/08_mfr/06_review.ipynb new file mode 100644 index 0000000..2f3d29f --- /dev/null +++ b/08_mfr/06_review.ipynb @@ -0,0 +1,214 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Chapter 8: Map, Filter, & Reduce (Review Questions)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The questions below assume that you have read the [first ](https://nbviewer.jupyter.org/github/webartifex/intro-to-python/blob/develop/08_mfr/00_content.ipynb), [second ](https://nbviewer.jupyter.org/github/webartifex/intro-to-python/blob/develop/08_mfr/01_content.ipynb), and [third ](https://nbviewer.jupyter.org/github/webartifex/intro-to-python/blob/develop/08_mfr/04_content.ipynb) part in Chapter 8.\n", + "\n", + "Be concise in your answers! Most questions can be answered in *one* sentence." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Essay Questions " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Q1**: With the [map() ](https://docs.python.org/3/library/functions.html#map) and [filter() ](https://docs.python.org/3/library/functions.html#filter) built-ins and the [reduce() ](https://docs.python.org/3/library/functools.html#functools.reduce) function from the [functools ](https://docs.python.org/3/library/functools.html) module in the [standard library ](https://docs.python.org/3/library/index.html), we can replace many tedious `for`-loops and `if` statements. What are some advantages of doing so?" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " < your answer >" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Q2**: Looking at the `lambda` expression inside [reduce() ](https://docs.python.org/3/library/functools.html#functools.reduce) below, what [built-in function ](https://docs.python.org/3/library/functions.html) is mimicked here?" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```python\n", + "from functools import reduce\n", + "\n", + "numbers = [7, 11, 8, 5, 3, 12, 2, 6, 9, 10, 1, 4]\n", + "\n", + "reduce(lambda x, y: x if x > y else y, numbers)\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " < your answer >" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Q3**: What is the primary use case of **`list` comprehensions**? Why do we describe them as **eager**?" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " < your answer >" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Q4**: **`generator` expressions** may replace `list` objects and list comprehensions in many scenarios. When evaluated, they create a **lazy** `generator` object that does *not* **materialize** its elements right away. What do we mean by that? What does it mean for a `generator` object to be **exhausted**?" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " < your answer >" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Q5**: What does it mean for the **boolean reducers**, the built-in [all() ](https://docs.python.org/3/library/functions.html#all) and [any() ](https://docs.python.org/3/library/functions.html#any) functions, to follow the **short-circuiting** strategy?" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " < your answer >" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Q6**: What is an **iterator**? How does it relate to an **iterable**?" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " < your answer >" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## True / False Questions" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Motivate your answer with *one short* sentence!" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Q7**: `lambda` expressions are useful in the context of the **map-filter-reduce** paradigm, where we often do *not* re-use a `function` object more than once." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " < your answer >" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Q8**: Using **`generator` expressions** in place of **`list` comprehensions** wherever possible is a good practice as it makes our programs use memory more efficiently." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " < your answer >" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Q9**: Just as **`list` comprehensions** create `list` objects, **`tuple` comprehensions** create `tuple` objects." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " < your answer >" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.6" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": false, + "sideBar": true, + "skip_h1_title": true, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": {}, + "toc_section_display": false, + "toc_window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/CONTENTS.md b/CONTENTS.md index 332a42f..bfa7f76 100644 --- a/CONTENTS.md +++ b/CONTENTS.md @@ -227,4 +227,5 @@ If this is not possible, [](https://mybinder.org/v2/gh/webartifex/intro-to-python/develop?urlpath=lab/tree/08_mfr/04_content.ipynb) (Iterators vs. Iterables; Example: `sorted()` vs. `reversed()`) - - [summary ](https://nbviewer.jupyter.org/github/webartifex/intro-to-python/blob/develop/08_mfr/05_summary.ipynb) \ No newline at end of file + - [summary ](https://nbviewer.jupyter.org/github/webartifex/intro-to-python/blob/develop/08_mfr/05_summary.ipynb) + - [review questions ](https://nbviewer.jupyter.org/github/webartifex/intro-to-python/blob/develop/08_mfr/06_review.ipynb) From 814069e7fbfc31eaca0985f2e584a53e6642f3bb Mon Sep 17 00:00:00 2001 From: Alexander Hess Date: Wed, 21 Oct 2020 19:24:51 +0200 Subject: [PATCH 10/10] Streamline slides --- 08_mfr/00_content.ipynb | 6 +++++- 08_mfr/01_content.ipynb | 8 ++++++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/08_mfr/00_content.ipynb b/08_mfr/00_content.ipynb index 4cf1aa7..1884515 100644 --- a/08_mfr/00_content.ipynb +++ b/08_mfr/00_content.ipynb @@ -329,7 +329,11 @@ { "cell_type": "code", "execution_count": 11, - "metadata": {}, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, "outputs": [ { "data": { diff --git a/08_mfr/01_content.ipynb b/08_mfr/01_content.ipynb index 2cf8720..688d953 100644 --- a/08_mfr/01_content.ipynb +++ b/08_mfr/01_content.ipynb @@ -700,7 +700,11 @@ { "cell_type": "code", "execution_count": 19, - "metadata": {}, + "metadata": { + "slideshow": { + "slide_type": "fragment" + } + }, "outputs": [], "source": [ "from functools import reduce" @@ -711,7 +715,7 @@ "execution_count": 20, "metadata": { "slideshow": { - "slide_type": "fragment" + "slide_type": "-" } }, "outputs": [