This is an automated email from the ASF dual-hosted git repository. fmcquillan pushed a commit to branch asf-site in repository https://gitbox.apache.org/repos/asf/madlib-site.git
The following commit(s) were added to refs/heads/asf-site by this push: new 3a7f9ed add ipython notebook for window functions 3a7f9ed is described below commit 3a7f9ed2e8dbaa6d0b0a406593f00b0598e1bbf0 Author: Frank McQuillan <fmcquil...@pivotal.io> AuthorDate: Fri Apr 24 12:45:54 2020 -0700 add ipython notebook for window functions --- .../Time-series/Window-functions-v1.ipynb | 1910 ++++++++++++++++++++ 1 file changed, 1910 insertions(+) diff --git a/community-artifacts/Time-series/Window-functions-v1.ipynb b/community-artifacts/Time-series/Window-functions-v1.ipynb new file mode 100644 index 0000000..9c30a40 --- /dev/null +++ b/community-artifacts/Time-series/Window-functions-v1.ipynb @@ -0,0 +1,1910 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Time series example - window functions\n", + "\n", + "Some example queries on time series data using aggregates and window functions. Thanks to Divya Bhargov from VMware for this example notebook.\n", + "\n", + "Data from https://data.cityofchicago.org/Transportation/Potholes-Patched/wqdh-9gek/data which is loaded from CSV format.\n", + "\n", + "## Table of contents \n", + "\n", + "<a href=\"#setup\">1. Connect to database</a>\n", + "\n", + "<a href=\"#load_dataset\">2. Load data</a>\n", + "\n", + "<a href=\"#window\">3. Window functions</a>\n", + "\n", + "<a href=\"#mapping\">4. Mapping for gap filling</a>" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "<a id=\"setup\"></a>\n", + "## 1. Connect to database" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The sql extension is already loaded. To reload it, use:\n", + " %reload_ext sql\n", + "1 rows affected.\n" + ] + }, + { + "data": { + "text/html": [ + "<table>\n", + " <tr>\n", + " <th>version</th>\n", + " </tr>\n", + " <tr>\n", + " <td>PostgreSQL 8.3.23 (Greenplum Database 5.18.0 build commit:6aec9959d367d46c6b4391eb9ffc82c735d20102) on x86_64-pc-linux-gnu, compiled by GCC gcc (GCC) 6.2.0, 64-bit compiled on Apr 3 2019 14:45:51</td>\n", + " </tr>\n", + "</table>" + ], + "text/plain": [ + "[(u'PostgreSQL 8.3.23 (Greenplum Database 5.18.0 build commit:6aec9959d367d46c6b4391eb9ffc82c735d20102) on x86_64-pc-linux-gnu, compiled by GCC gcc (GCC) 6.2.0, 64-bit compiled on Apr 3 2019 14:45:51',)]" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%load_ext sql\n", + "\n", + "# Greenplum Database 5.x on GCP (PM demo machine) - via tunnel\n", + "%sql postgresql://gpadmin@localhost:8000/madlib\n", + " \n", + "# PostgreSQL local\n", + "#%sql postgresql://fmcquillan@localhost:5432/madlib\n", + " \n", + "%sql SELECT version();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "<a id=\"load_dataset\"></a>\n", + "## 2. Load data\n", + "Load from CSV. You will need to change the path to the location of the CSV file." + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Done.\n", + "Done.\n", + "65544 rows affected.\n" + ] + }, + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%sql\n", + "DROP TABLE IF EXISTS chicago_potholes_patched;\n", + "CREATE TABLE chicago_potholes_patched (\n", + " id serial NOT NULL,\n", + " address TEXT,\n", + " request_date TIMESTAMP,\n", + " completion_date TIMESTAMP,\n", + " number_of_potholes_filled_on_block INTEGER,\n", + " latitude FLOAT,\n", + " longitude FLOAT,\n", + " location TEXT\n", + ");\n", + "\n", + "/* COPY chicago_potholes_patched(address, request_date, completion_date, number_of_potholes_filled_on_block, latitude, longitude, location) \n", + "FROM '/Users/fmcquillan/Documents/Product/MADlib/Demos/Data/potholes_patched.csv' DELIMITER ',' CSV HEADER;*/\n", + "\n", + "COPY chicago_potholes_patched(address, request_date, completion_date, number_of_potholes_filled_on_block, latitude, longitude, location) \n", + "FROM '/home/gpadmin/data/potholes_patched.csv' DELIMITER ',' CSV HEADER;" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "10 rows affected.\n" + ] + }, + { + "data": { + "text/html": [ + "<table>\n", + " <tr>\n", + " <th>id</th>\n", + " <th>address</th>\n", + " <th>request_date</th>\n", + " <th>completion_date</th>\n", + " <th>number_of_potholes_filled_on_block</th>\n", + " <th>latitude</th>\n", + " <th>longitude</th>\n", + " <th>location</th>\n", + " </tr>\n", + " <tr>\n", + " <td>64544</td>\n", + " <td>4649 W FULLERTON AVE</td>\n", + " <td>2020-04-14 11:14:50</td>\n", + " <td>2020-04-21 15:25:34</td>\n", + " <td>4</td>\n", + " <td>41.924278082</td>\n", + " <td>-87.743872241</td>\n", + " <td>POINT (-87.743872241019 41.924278081659)</td>\n", + " </tr>\n", + " <tr>\n", + " <td>64878</td>\n", + " <td>2310 N KNOX AVE</td>\n", + " <td>2020-04-18 12:14:13</td>\n", + " <td>2020-04-21 15:24:44</td>\n", + " <td>12</td>\n", + " <td>41.922840944</td>\n", + " <td>-87.742654834</td>\n", + " <td>POINT (-87.742654834295 41.922840943632)</td>\n", + " </tr>\n", + " <tr>\n", + " <td>64459</td>\n", + " <td>2258 N KNOX AVE</td>\n", + " <td>2020-04-18 12:06:17</td>\n", + " <td>2020-04-21 15:23:55</td>\n", + " <td>15</td>\n", + " <td>41.922467136</td>\n", + " <td>-87.742610186</td>\n", + " <td>POINT (-87.742610186145 41.922467135904)</td>\n", + " </tr>\n", + " <tr>\n", + " <td>64807</td>\n", + " <td>3958 W FULLERTON AVE</td>\n", + " <td>2020-04-16 12:37:36</td>\n", + " <td>2020-04-21 15:22:08</td>\n", + " <td>2</td>\n", + " <td>41.924817886</td>\n", + " <td>-87.726093348</td>\n", + " <td>POINT (-87.726093348147 41.924817886186)</td>\n", + " </tr>\n", + " <tr>\n", + " <td>64812</td>\n", + " <td>2711 N PULASKI RD</td>\n", + " <td>2020-04-16 13:03:30</td>\n", + " <td>2020-04-21 15:20:16</td>\n", + " <td>2</td>\n", + " <td>41.930408431</td>\n", + " <td>-87.726595137</td>\n", + " <td>POINT (-87.726595137074 41.930408431011)</td>\n", + " </tr>\n", + " <tr>\n", + " <td>64844</td>\n", + " <td>4444 W DEMING PL</td>\n", + " <td>2020-04-14 11:07:56</td>\n", + " <td>2020-04-21 15:18:38</td>\n", + " <td>1</td>\n", + " <td>41.92754944</td>\n", + " <td>-87.738454216</td>\n", + " <td>POINT (-87.738454216115 41.927549439867)</td>\n", + " </tr>\n", + " <tr>\n", + " <td>64848</td>\n", + " <td>2536 N CENTRAL PARK AVE</td>\n", + " <td>2020-04-20 13:19:30</td>\n", + " <td>2020-04-21 15:16:36</td>\n", + " <td>19</td>\n", + " <td>41.927537543</td>\n", + " <td>-87.717339518</td>\n", + " <td>POINT (-87.717339517786 41.92753754297)</td>\n", + " </tr>\n", + " <tr>\n", + " <td>64545</td>\n", + " <td>5607 S KENWOOD AVE</td>\n", + " <td>2020-04-21 15:08:30</td>\n", + " <td>2020-04-21 15:12:51</td>\n", + " <td>4</td>\n", + " <td>41.793048858</td>\n", + " <td>-87.592978731</td>\n", + " <td>POINT (-87.592978730873 41.793048857693)</td>\n", + " </tr>\n", + " <tr>\n", + " <td>64765</td>\n", + " <td>8701 W FOSTER AVE</td>\n", + " <td>2020-04-16 18:46:19</td>\n", + " <td>2020-04-21 15:11:44</td>\n", + " <td>1</td>\n", + " <td>41.973190749</td>\n", + " <td>-87.844147025</td>\n", + " <td>POINT (-87.844147024715 41.97319074886)</td>\n", + " </tr>\n", + " <tr>\n", + " <td>64892</td>\n", + " <td>11521 S LA SALLE ST</td>\n", + " <td>2020-04-19 05:14:37</td>\n", + " <td>2020-04-21 14:57:33</td>\n", + " <td>15</td>\n", + " <td>41.684608279</td>\n", + " <td>-87.626219142</td>\n", + " <td>POINT (-87.626219142292 41.684608279405)</td>\n", + " </tr>\n", + "</table>" + ], + "text/plain": [ + "[(64544, u'4649 W FULLERTON AVE', datetime.datetime(2020, 4, 14, 11, 14, 50), datetime.datetime(2020, 4, 21, 15, 25, 34), 4, 41.924278082, -87.743872241, u'POINT (-87.743872241019 41.924278081659)'),\n", + " (64878, u'2310 N KNOX AVE', datetime.datetime(2020, 4, 18, 12, 14, 13), datetime.datetime(2020, 4, 21, 15, 24, 44), 12, 41.922840944, -87.742654834, u'POINT (-87.742654834295 41.922840943632)'),\n", + " (64459, u'2258 N KNOX AVE', datetime.datetime(2020, 4, 18, 12, 6, 17), datetime.datetime(2020, 4, 21, 15, 23, 55), 15, 41.922467136, -87.742610186, u'POINT (-87.742610186145 41.922467135904)'),\n", + " (64807, u'3958 W FULLERTON AVE', datetime.datetime(2020, 4, 16, 12, 37, 36), datetime.datetime(2020, 4, 21, 15, 22, 8), 2, 41.924817886, -87.726093348, u'POINT (-87.726093348147 41.924817886186)'),\n", + " (64812, u'2711 N PULASKI RD', datetime.datetime(2020, 4, 16, 13, 3, 30), datetime.datetime(2020, 4, 21, 15, 20, 16), 2, 41.930408431, -87.726595137, u'POINT (-87.726595137074 41.930408431011)'),\n", + " (64844, u'4444 W DEMING PL', datetime.datetime(2020, 4, 14, 11, 7, 56), datetime.datetime(2020, 4, 21, 15, 18, 38), 1, 41.92754944, -87.738454216, u'POINT (-87.738454216115 41.927549439867)'),\n", + " (64848, u'2536 N CENTRAL PARK AVE', datetime.datetime(2020, 4, 20, 13, 19, 30), datetime.datetime(2020, 4, 21, 15, 16, 36), 19, 41.927537543, -87.717339518, u'POINT (-87.717339517786 41.92753754297)'),\n", + " (64545, u'5607 S KENWOOD AVE', datetime.datetime(2020, 4, 21, 15, 8, 30), datetime.datetime(2020, 4, 21, 15, 12, 51), 4, 41.793048858, -87.592978731, u'POINT (-87.592978730873 41.793048857693)'),\n", + " (64765, u'8701 W FOSTER AVE', datetime.datetime(2020, 4, 16, 18, 46, 19), datetime.datetime(2020, 4, 21, 15, 11, 44), 1, 41.973190749, -87.844147025, u'POINT (-87.844147024715 41.97319074886)'),\n", + " (64892, u'11521 S LA SALLE ST', datetime.datetime(2020, 4, 19, 5, 14, 37), datetime.datetime(2020, 4, 21, 14, 57, 33), 15, 41.684608279, -87.626219142, u'POINT (-87.626219142292 41.684608279405)')]" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%sql\n", + "SELECT * FROM chicago_potholes_patched ORDER BY completion_date DESC LIMIT 10;" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "10 rows affected.\n" + ] + }, + { + "data": { + "text/html": [ + "<table>\n", + " <tr>\n", + " <th>id</th>\n", + " <th>address</th>\n", + " <th>request_date</th>\n", + " <th>completion_date</th>\n", + " <th>number_of_potholes_filled_on_block</th>\n", + " <th>latitude</th>\n", + " <th>longitude</th>\n", + " <th>location</th>\n", + " </tr>\n", + " <tr>\n", + " <td>2</td>\n", + " <td>5505 W WINDSOR AVE</td>\n", + " <td>2019-04-24 14:06:04</td>\n", + " <td>2019-04-26 14:05:16</td>\n", + " <td>5</td>\n", + " <td>41.96306727</td>\n", + " <td>-87.765233061</td>\n", + " <td>POINT (-87.765233060697 41.963067269624)</td>\n", + " </tr>\n", + " <tr>\n", + " <td>4</td>\n", + " <td>400 E WALDRON DR</td>\n", + " <td>2019-07-23 10:00:36</td>\n", + " <td>2019-07-23 16:16:48</td>\n", + " <td>10</td>\n", + " <td>None</td>\n", + " <td>None</td>\n", + " <td>None</td>\n", + " </tr>\n", + " <tr>\n", + " <td>6</td>\n", + " <td>2101 S CANAL ST</td>\n", + " <td>2019-07-23 09:57:55</td>\n", + " <td>2019-07-23 09:58:51</td>\n", + " <td>12</td>\n", + " <td>41.854048592</td>\n", + " <td>-87.638463921</td>\n", + " <td>POINT (-87.638463920776 41.854048592448)</td>\n", + " </tr>\n", + " <tr>\n", + " <td>8</td>\n", + " <td>3651 W AINSLIE ST</td>\n", + " <td>2019-03-15 11:43:36</td>\n", + " <td>2019-04-29 14:37:02</td>\n", + " <td>10</td>\n", + " <td>41.970029</td>\n", + " <td>-87.7203</td>\n", + " <td>POINT (-87.7203 41.970029)</td>\n", + " </tr>\n", + " <tr>\n", + " <td>10</td>\n", + " <td>10346 S PULASKI RD</td>\n", + " <td>2019-02-28 16:42:32</td>\n", + " <td>2019-03-05 11:14:02</td>\n", + " <td>1</td>\n", + " <td>None</td>\n", + " <td>None</td>\n", + " <td>None</td>\n", + " </tr>\n", + " <tr>\n", + " <td>12</td>\n", + " <td>521 E 91ST PL</td>\n", + " <td>2019-06-13 10:18:24</td>\n", + " <td>2019-07-23 13:49:35</td>\n", + " <td>12</td>\n", + " <td>41.728137874</td>\n", + " <td>-87.610980602</td>\n", + " <td>POINT (-87.610980602161 41.7281378741)</td>\n", + " </tr>\n", + " <tr>\n", + " <td>14</td>\n", + " <td>801 E SOLIDARITY DR</td>\n", + " <td>2019-07-23 10:09:20</td>\n", + " <td>2019-07-23 10:10:10</td>\n", + " <td>20</td>\n", + " <td>41.866144383</td>\n", + " <td>-87.609999814</td>\n", + " <td>POINT (-87.609999814415 41.866144383387)</td>\n", + " </tr>\n", + " <tr>\n", + " <td>16</td>\n", + " <td>2500 W 36TH ST</td>\n", + " <td>2019-06-09 18:46:30</td>\n", + " <td>2019-07-23 10:46:44</td>\n", + " <td>48</td>\n", + " <td>41.828545615</td>\n", + " <td>-87.687820304</td>\n", + " <td>POINT (-87.687820304125 41.828545615092)</td>\n", + " </tr>\n", + " <tr>\n", + " <td>18</td>\n", + " <td>300 W CERMAK RD</td>\n", + " <td>2019-07-21 18:26:09</td>\n", + " <td>2019-07-23 15:06:49</td>\n", + " <td>18</td>\n", + " <td>41.85290813</td>\n", + " <td>-87.634936965</td>\n", + " <td>POINT (-87.634936964669 41.852908130256)</td>\n", + " </tr>\n", + " <tr>\n", + " <td>20</td>\n", + " <td>328 N DESPLAINES ST</td>\n", + " <td>2019-07-22 10:05:41</td>\n", + " <td>2019-07-23 09:54:46</td>\n", + " <td>10</td>\n", + " <td>41.887721193</td>\n", + " <td>-87.644343544</td>\n", + " <td>POINT (-87.644343543685 41.88772119348)</td>\n", + " </tr>\n", + "</table>" + ], + "text/plain": [ + "[(2, u'5505 W WINDSOR AVE', datetime.datetime(2019, 4, 24, 14, 6, 4), datetime.datetime(2019, 4, 26, 14, 5, 16), 5, 41.96306727, -87.765233061, u'POINT (-87.765233060697 41.963067269624)'),\n", + " (4, u'400 E WALDRON DR', datetime.datetime(2019, 7, 23, 10, 0, 36), datetime.datetime(2019, 7, 23, 16, 16, 48), 10, None, None, None),\n", + " (6, u'2101 S CANAL ST', datetime.datetime(2019, 7, 23, 9, 57, 55), datetime.datetime(2019, 7, 23, 9, 58, 51), 12, 41.854048592, -87.638463921, u'POINT (-87.638463920776 41.854048592448)'),\n", + " (8, u'3651 W AINSLIE ST', datetime.datetime(2019, 3, 15, 11, 43, 36), datetime.datetime(2019, 4, 29, 14, 37, 2), 10, 41.970029, -87.7203, u'POINT (-87.7203 41.970029)'),\n", + " (10, u'10346 S PULASKI RD', datetime.datetime(2019, 2, 28, 16, 42, 32), datetime.datetime(2019, 3, 5, 11, 14, 2), 1, None, None, None),\n", + " (12, u'521 E 91ST PL', datetime.datetime(2019, 6, 13, 10, 18, 24), datetime.datetime(2019, 7, 23, 13, 49, 35), 12, 41.728137874, -87.610980602, u'POINT (-87.610980602161 41.7281378741)'),\n", + " (14, u'801 E SOLIDARITY DR', datetime.datetime(2019, 7, 23, 10, 9, 20), datetime.datetime(2019, 7, 23, 10, 10, 10), 20, 41.866144383, -87.609999814, u'POINT (-87.609999814415 41.866144383387)'),\n", + " (16, u'2500 W 36TH ST', datetime.datetime(2019, 6, 9, 18, 46, 30), datetime.datetime(2019, 7, 23, 10, 46, 44), 48, 41.828545615, -87.687820304, u'POINT (-87.687820304125 41.828545615092)'),\n", + " (18, u'300 W CERMAK RD', datetime.datetime(2019, 7, 21, 18, 26, 9), datetime.datetime(2019, 7, 23, 15, 6, 49), 18, 41.85290813, -87.634936965, u'POINT (-87.634936964669 41.852908130256)'),\n", + " (20, u'328 N DESPLAINES ST', datetime.datetime(2019, 7, 22, 10, 5, 41), datetime.datetime(2019, 7, 23, 9, 54, 46), 10, 41.887721193, -87.644343544, u'POINT (-87.644343543685 41.88772119348)')]" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%sql\n", + "select * from chicago_potholes_patched limit 10;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Transform to year - month format" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Done.\n", + "Done.\n", + "22 rows affected.\n", + "22 rows affected.\n" + ] + }, + { + "data": { + "text/html": [ + "<table>\n", + " <tr>\n", + " <th>completion_year</th>\n", + " <th>completion_month</th>\n", + " <th>total_potholes_patched</th>\n", + " </tr>\n", + " <tr>\n", + " <td>2018</td>\n", + " <td>7</td>\n", + " <td>25221</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2018</td>\n", + " <td>8</td>\n", + " <td>21251</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2018</td>\n", + " <td>9</td>\n", + " <td>13865</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2018</td>\n", + " <td>10</td>\n", + " <td>15853</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2018</td>\n", + " <td>11</td>\n", + " <td>13078</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2018</td>\n", + " <td>12</td>\n", + " <td>12287</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019</td>\n", + " <td>1</td>\n", + " <td>32819</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019</td>\n", + " <td>2</td>\n", + " <td>78509</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019</td>\n", + " <td>3</td>\n", + " <td>135763</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019</td>\n", + " <td>4</td>\n", + " <td>106023</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019</td>\n", + " <td>5</td>\n", + " <td>55697</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019</td>\n", + " <td>6</td>\n", + " <td>25742</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019</td>\n", + " <td>7</td>\n", + " <td>16474</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019</td>\n", + " <td>8</td>\n", + " <td>12880</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019</td>\n", + " <td>9</td>\n", + " <td>19201</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019</td>\n", + " <td>10</td>\n", + " <td>18079</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019</td>\n", + " <td>11</td>\n", + " <td>19133</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019</td>\n", + " <td>12</td>\n", + " <td>21691</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2020</td>\n", + " <td>1</td>\n", + " <td>58844</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2020</td>\n", + " <td>2</td>\n", + " <td>49768</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2020</td>\n", + " <td>3</td>\n", + " <td>67589</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2020</td>\n", + " <td>4</td>\n", + " <td>39150</td>\n", + " </tr>\n", + "</table>" + ], + "text/plain": [ + "[(2018, 7, 25221),\n", + " (2018, 8, 21251),\n", + " (2018, 9, 13865),\n", + " (2018, 10, 15853),\n", + " (2018, 11, 13078),\n", + " (2018, 12, 12287),\n", + " (2019, 1, 32819),\n", + " (2019, 2, 78509),\n", + " (2019, 3, 135763),\n", + " (2019, 4, 106023),\n", + " (2019, 5, 55697),\n", + " (2019, 6, 25742),\n", + " (2019, 7, 16474),\n", + " (2019, 8, 12880),\n", + " (2019, 9, 19201),\n", + " (2019, 10, 18079),\n", + " (2019, 11, 19133),\n", + " (2019, 12, 21691),\n", + " (2020, 1, 58844),\n", + " (2020, 2, 49768),\n", + " (2020, 3, 67589),\n", + " (2020, 4, 39150)]" + ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%sql\n", + "\n", + "DROP TABLE IF EXISTS temp_total_potholes_patched;\n", + "\n", + "CREATE TABLE temp_total_potholes_patched(\n", + " completion_year INTEGER,\n", + " completion_month INTEGER,\n", + " total_potholes_patched INTEGER\n", + ");\n", + "\n", + "INSERT INTO temp_total_potholes_patched \n", + " SELECT date_part('year', completion_date) AS completion_year,\n", + " date_part('month', completion_date) AS completion_month,\n", + " sum(number_of_potholes_filled_on_block) AS total_potholes_patched\n", + " FROM chicago_potholes_patched\n", + " GROUP BY completion_year, completion_month;\n", + "\n", + "SELECT * FROM temp_total_potholes_patched order by completion_year, completion_month;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "<a id=\"window\"></a>\n", + "## 3. Window functions" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Potholes patched every month, compared to average potholes patched for the year" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "22 rows affected.\n" + ] + }, + { + "data": { + "text/html": [ + "<table>\n", + " <tr>\n", + " <th>completion_year</th>\n", + " <th>completion_month</th>\n", + " <th>total_potholes_patched</th>\n", + " <th>average_for_year</th>\n", + " </tr>\n", + " <tr>\n", + " <td>2018</td>\n", + " <td>7</td>\n", + " <td>25221</td>\n", + " <td>16925.8333333333</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2018</td>\n", + " <td>8</td>\n", + " <td>21251</td>\n", + " <td>16925.8333333333</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2018</td>\n", + " <td>9</td>\n", + " <td>13865</td>\n", + " <td>16925.8333333333</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2018</td>\n", + " <td>10</td>\n", + " <td>15853</td>\n", + " <td>16925.8333333333</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2018</td>\n", + " <td>11</td>\n", + " <td>13078</td>\n", + " <td>16925.8333333333</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2018</td>\n", + " <td>12</td>\n", + " <td>12287</td>\n", + " <td>16925.8333333333</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019</td>\n", + " <td>1</td>\n", + " <td>32819</td>\n", + " <td>45167.5833333333</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019</td>\n", + " <td>2</td>\n", + " <td>78509</td>\n", + " <td>45167.5833333333</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019</td>\n", + " <td>3</td>\n", + " <td>135763</td>\n", + " <td>45167.5833333333</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019</td>\n", + " <td>4</td>\n", + " <td>106023</td>\n", + " <td>45167.5833333333</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019</td>\n", + " <td>5</td>\n", + " <td>55697</td>\n", + " <td>45167.5833333333</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019</td>\n", + " <td>6</td>\n", + " <td>25742</td>\n", + " <td>45167.5833333333</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019</td>\n", + " <td>7</td>\n", + " <td>16474</td>\n", + " <td>45167.5833333333</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019</td>\n", + " <td>8</td>\n", + " <td>12880</td>\n", + " <td>45167.5833333333</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019</td>\n", + " <td>9</td>\n", + " <td>19201</td>\n", + " <td>45167.5833333333</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019</td>\n", + " <td>10</td>\n", + " <td>18079</td>\n", + " <td>45167.5833333333</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019</td>\n", + " <td>11</td>\n", + " <td>19133</td>\n", + " <td>45167.5833333333</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019</td>\n", + " <td>12</td>\n", + " <td>21691</td>\n", + " <td>45167.5833333333</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2020</td>\n", + " <td>1</td>\n", + " <td>58844</td>\n", + " <td>53837.75</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2020</td>\n", + " <td>2</td>\n", + " <td>49768</td>\n", + " <td>53837.75</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2020</td>\n", + " <td>3</td>\n", + " <td>67589</td>\n", + " <td>53837.75</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2020</td>\n", + " <td>4</td>\n", + " <td>39150</td>\n", + " <td>53837.75</td>\n", + " </tr>\n", + "</table>" + ], + "text/plain": [ + "[(2018, 7, 25221, Decimal('16925.8333333333')),\n", + " (2018, 8, 21251, Decimal('16925.8333333333')),\n", + " (2018, 9, 13865, Decimal('16925.8333333333')),\n", + " (2018, 10, 15853, Decimal('16925.8333333333')),\n", + " (2018, 11, 13078, Decimal('16925.8333333333')),\n", + " (2018, 12, 12287, Decimal('16925.8333333333')),\n", + " (2019, 1, 32819, Decimal('45167.5833333333')),\n", + " (2019, 2, 78509, Decimal('45167.5833333333')),\n", + " (2019, 3, 135763, Decimal('45167.5833333333')),\n", + " (2019, 4, 106023, Decimal('45167.5833333333')),\n", + " (2019, 5, 55697, Decimal('45167.5833333333')),\n", + " (2019, 6, 25742, Decimal('45167.5833333333')),\n", + " (2019, 7, 16474, Decimal('45167.5833333333')),\n", + " (2019, 8, 12880, Decimal('45167.5833333333')),\n", + " (2019, 9, 19201, Decimal('45167.5833333333')),\n", + " (2019, 10, 18079, Decimal('45167.5833333333')),\n", + " (2019, 11, 19133, Decimal('45167.5833333333')),\n", + " (2019, 12, 21691, Decimal('45167.5833333333')),\n", + " (2020, 1, 58844, Decimal('53837.75')),\n", + " (2020, 2, 49768, Decimal('53837.75')),\n", + " (2020, 3, 67589, Decimal('53837.75')),\n", + " (2020, 4, 39150, Decimal('53837.75'))]" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%sql\n", + "SELECT completion_year, completion_month, total_potholes_patched,\n", + "avg(total_potholes_patched) OVER (PARTITION BY completion_year) AS average_for_year \n", + "FROM temp_total_potholes_patched\n", + "ORDER BY completion_year, completion_month;" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAABJoAAAEyCAYAAACyHrqiAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi41LCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvSM8oowAAIABJREFUeJzs3Xd81dX9x/HXySIkEEaAMAIk7BlGwhZZMlQURRwoswoOwFqtSmtbbWtba63WAuJkqiiiKFoVAUEQCBCmbAIkEGQEQhKSkH1+f+TKL8yErG/G+/l45MG955zv+b7vhccjl8893/M11lpEREREREREREQKy83pACIiIiIiIiIiUj6o0CQiIiIiIiIiIkVChSYRERERERERESkSKjSJiIiIiIiIiEiRUKFJRERERERERESKhApNIiIiIiIiIiJSJFRoEhERERERERGRIqFC [...] + "text/plain": [ + "<Figure size 1440x360 with 1 Axes>" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "\n", + "df = _.DataFrame()\n", + "\n", + "df['date'] = df.completion_year.map(\"{:04}\".format) + df.completion_month.map(\"{:02}\".format)\n", + "\n", + "plt.figure(figsize=(20,5))\n", + "\n", + "plt.plot(df.date,df.total_potholes_patched, linestyle='solid');\n", + "plt.plot(df.date,df.average_for_year, linestyle='dashed');\n", + "plt.legend();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Potholes patched every month, compared to previous month" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "22 rows affected.\n" + ] + }, + { + "data": { + "text/html": [ + "<table>\n", + " <tr>\n", + " <th>completion_year</th>\n", + " <th>completion_month</th>\n", + " <th>total_potholes_patched</th>\n", + " <th>potholes_patched_last_month</th>\n", + " </tr>\n", + " <tr>\n", + " <td>2018</td>\n", + " <td>7</td>\n", + " <td>25221</td>\n", + " <td>None</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2018</td>\n", + " <td>8</td>\n", + " <td>21251</td>\n", + " <td>25221</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2018</td>\n", + " <td>9</td>\n", + " <td>13865</td>\n", + " <td>21251</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2018</td>\n", + " <td>10</td>\n", + " <td>15853</td>\n", + " <td>13865</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2018</td>\n", + " <td>11</td>\n", + " <td>13078</td>\n", + " <td>15853</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2018</td>\n", + " <td>12</td>\n", + " <td>12287</td>\n", + " <td>13078</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019</td>\n", + " <td>1</td>\n", + " <td>32819</td>\n", + " <td>12287</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019</td>\n", + " <td>2</td>\n", + " <td>78509</td>\n", + " <td>32819</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019</td>\n", + " <td>3</td>\n", + " <td>135763</td>\n", + " <td>78509</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019</td>\n", + " <td>4</td>\n", + " <td>106023</td>\n", + " <td>135763</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019</td>\n", + " <td>5</td>\n", + " <td>55697</td>\n", + " <td>106023</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019</td>\n", + " <td>6</td>\n", + " <td>25742</td>\n", + " <td>55697</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019</td>\n", + " <td>7</td>\n", + " <td>16474</td>\n", + " <td>25742</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019</td>\n", + " <td>8</td>\n", + " <td>12880</td>\n", + " <td>16474</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019</td>\n", + " <td>9</td>\n", + " <td>19201</td>\n", + " <td>12880</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019</td>\n", + " <td>10</td>\n", + " <td>18079</td>\n", + " <td>19201</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019</td>\n", + " <td>11</td>\n", + " <td>19133</td>\n", + " <td>18079</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019</td>\n", + " <td>12</td>\n", + " <td>21691</td>\n", + " <td>19133</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2020</td>\n", + " <td>1</td>\n", + " <td>58844</td>\n", + " <td>21691</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2020</td>\n", + " <td>2</td>\n", + " <td>49768</td>\n", + " <td>58844</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2020</td>\n", + " <td>3</td>\n", + " <td>67589</td>\n", + " <td>49768</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2020</td>\n", + " <td>4</td>\n", + " <td>39150</td>\n", + " <td>67589</td>\n", + " </tr>\n", + "</table>" + ], + "text/plain": [ + "[(2018, 7, 25221, None),\n", + " (2018, 8, 21251, 25221),\n", + " (2018, 9, 13865, 21251),\n", + " (2018, 10, 15853, 13865),\n", + " (2018, 11, 13078, 15853),\n", + " (2018, 12, 12287, 13078),\n", + " (2019, 1, 32819, 12287),\n", + " (2019, 2, 78509, 32819),\n", + " (2019, 3, 135763, 78509),\n", + " (2019, 4, 106023, 135763),\n", + " (2019, 5, 55697, 106023),\n", + " (2019, 6, 25742, 55697),\n", + " (2019, 7, 16474, 25742),\n", + " (2019, 8, 12880, 16474),\n", + " (2019, 9, 19201, 12880),\n", + " (2019, 10, 18079, 19201),\n", + " (2019, 11, 19133, 18079),\n", + " (2019, 12, 21691, 19133),\n", + " (2020, 1, 58844, 21691),\n", + " (2020, 2, 49768, 58844),\n", + " (2020, 3, 67589, 49768),\n", + " (2020, 4, 39150, 67589)]" + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%sql\n", + "SELECT completion_year, completion_month, total_potholes_patched,\n", + "lag(total_potholes_patched, 1) OVER (ORDER BY completion_year, completion_month) AS potholes_patched_last_month\n", + "FROM temp_total_potholes_patched\n", + "ORDER BY completion_year, completion_month;" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAABJoAAAEyCAYAAACyHrqiAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi41LCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvSM8oowAAIABJREFUeJzs3Xd0VVX+/vH3TiOEXkIvCb2EBEgITboIKooiqEhVBEXUmXF0ZEZnmLHMT0dGLGBXmooKDogVEUVaEEKVXhMIUkIJJKQn+/dHrvnSk1ySnNzwvNbKyr377LPPc29YK5dP9tnbWGsRERERERERERG5Wl5OBxARERERERERkdJBhSYRERERERERESkUKjSJiIiIiIiIiEihUKFJREREREREREQKhQpNIiIiIiIiIiJSKFRoEhERERERERGRQqFCk4iIiIiIiIiIFAoVmkRE [...] + "text/plain": [ + "<Figure size 1440x360 with 1 Axes>" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "\n", + "df = _.DataFrame()\n", + "\n", + "df['date'] = df.completion_year.map(\"{:04}\".format) + df.completion_month.map(\"{:02}\".format)\n", + "\n", + "plt.figure(figsize=(20,5))\n", + "\n", + "plt.plot(df.date,df.total_potholes_patched, linestyle='solid');\n", + "plt.plot(df.date,df.potholes_patched_last_month, linestyle='dashed');\n", + "plt.legend();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Accumulation - Total potholes patched to date" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "22 rows affected.\n" + ] + }, + { + "data": { + "text/html": [ + "<table>\n", + " <tr>\n", + " <th>completion_year</th>\n", + " <th>completion_month</th>\n", + " <th>total_potholes_patched</th>\n", + " <th>total_potholes_patched_to_date</th>\n", + " </tr>\n", + " <tr>\n", + " <td>2018</td>\n", + " <td>7</td>\n", + " <td>25221</td>\n", + " <td>25221</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2018</td>\n", + " <td>8</td>\n", + " <td>21251</td>\n", + " <td>46472</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2018</td>\n", + " <td>9</td>\n", + " <td>13865</td>\n", + " <td>60337</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2018</td>\n", + " <td>10</td>\n", + " <td>15853</td>\n", + " <td>76190</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2018</td>\n", + " <td>11</td>\n", + " <td>13078</td>\n", + " <td>89268</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2018</td>\n", + " <td>12</td>\n", + " <td>12287</td>\n", + " <td>101555</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019</td>\n", + " <td>1</td>\n", + " <td>32819</td>\n", + " <td>134374</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019</td>\n", + " <td>2</td>\n", + " <td>78509</td>\n", + " <td>212883</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019</td>\n", + " <td>3</td>\n", + " <td>135763</td>\n", + " <td>348646</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019</td>\n", + " <td>4</td>\n", + " <td>106023</td>\n", + " <td>454669</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019</td>\n", + " <td>5</td>\n", + " <td>55697</td>\n", + " <td>510366</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019</td>\n", + " <td>6</td>\n", + " <td>25742</td>\n", + " <td>536108</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019</td>\n", + " <td>7</td>\n", + " <td>16474</td>\n", + " <td>552582</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019</td>\n", + " <td>8</td>\n", + " <td>12880</td>\n", + " <td>565462</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019</td>\n", + " <td>9</td>\n", + " <td>19201</td>\n", + " <td>584663</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019</td>\n", + " <td>10</td>\n", + " <td>18079</td>\n", + " <td>602742</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019</td>\n", + " <td>11</td>\n", + " <td>19133</td>\n", + " <td>621875</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019</td>\n", + " <td>12</td>\n", + " <td>21691</td>\n", + " <td>643566</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2020</td>\n", + " <td>1</td>\n", + " <td>58844</td>\n", + " <td>702410</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2020</td>\n", + " <td>2</td>\n", + " <td>49768</td>\n", + " <td>752178</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2020</td>\n", + " <td>3</td>\n", + " <td>67589</td>\n", + " <td>819767</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2020</td>\n", + " <td>4</td>\n", + " <td>39150</td>\n", + " <td>858917</td>\n", + " </tr>\n", + "</table>" + ], + "text/plain": [ + "[(2018, 7, 25221, 25221L),\n", + " (2018, 8, 21251, 46472L),\n", + " (2018, 9, 13865, 60337L),\n", + " (2018, 10, 15853, 76190L),\n", + " (2018, 11, 13078, 89268L),\n", + " (2018, 12, 12287, 101555L),\n", + " (2019, 1, 32819, 134374L),\n", + " (2019, 2, 78509, 212883L),\n", + " (2019, 3, 135763, 348646L),\n", + " (2019, 4, 106023, 454669L),\n", + " (2019, 5, 55697, 510366L),\n", + " (2019, 6, 25742, 536108L),\n", + " (2019, 7, 16474, 552582L),\n", + " (2019, 8, 12880, 565462L),\n", + " (2019, 9, 19201, 584663L),\n", + " (2019, 10, 18079, 602742L),\n", + " (2019, 11, 19133, 621875L),\n", + " (2019, 12, 21691, 643566L),\n", + " (2020, 1, 58844, 702410L),\n", + " (2020, 2, 49768, 752178L),\n", + " (2020, 3, 67589, 819767L),\n", + " (2020, 4, 39150, 858917L)]" + ] + }, + "execution_count": 55, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%sql\n", + "select completion_year, completion_month, total_potholes_patched,\n", + "sum(total_potholes_patched) over (order by completion_year, completion_month) as total_potholes_patched_to_date\n", + "From temp_total_potholes_patched\n", + "order by completion_year, completion_month;" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAABKgAAAE/CAYAAABihvt5AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi41LCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvSM8oowAAIABJREFUeJzt3Xu4XFV9//H3lwQSIJBACCgJmIBQiIoBI1BFq2DlpgQtKKiAoCI/RFTqBa1KqMVCbcVSUYuAAlUBhVZUELkYFVouAQJyN0CUIEJMSAAFScL398deB4bDucw5ZLJnzrxfzzPPmVn79pl9JpmZ71lr7chMJEmSJEmSpLqsUXcASZIkSZIkdTcLVJIkSZIkSaqVBSpJkiRJkiTVygKVJEmSJEmSamWBSpIkSZIkSbWyQCVJkiRJkqRaWaCSJEn9ioixEZERMWU1HGubiFgx [...] + "text/plain": [ + "<Figure size 1440x360 with 1 Axes>" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "\n", + "df = _.DataFrame()\n", + "\n", + "df['date'] = df.completion_year.map(\"{:04}\".format) + df.completion_month.map(\"{:02}\".format)\n", + "\n", + "\n", + "plt.figure(figsize=(20,5))\n", + "\n", + "p1 = plt.bar(df.date, df.total_potholes_patched)\n", + "p2 = plt.bar(df.date, df.total_potholes_patched_to_date,\n", + " bottom=df.total_potholes_patched)\n", + "\n", + "\n", + "plt.ylabel('Total potholes patched');\n", + "plt.title('Total potholes patched by month and year');\n", + "plt.legend((p1[0], p2[0]), ('Monthly', 'YTD'));\n", + "\n", + "plt.show();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Rank - Top 3 months of 2019" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "3 rows affected.\n" + ] + }, + { + "data": { + "text/html": [ + "<table>\n", + " <tr>\n", + " <th>completion_year</th>\n", + " <th>completion_month</th>\n", + " <th>total_potholes_patched</th>\n", + " <th>rank</th>\n", + " </tr>\n", + " <tr>\n", + " <td>2019</td>\n", + " <td>3</td>\n", + " <td>135763</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019</td>\n", + " <td>4</td>\n", + " <td>106023</td>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019</td>\n", + " <td>2</td>\n", + " <td>78509</td>\n", + " <td>3</td>\n", + " </tr>\n", + "</table>" + ], + "text/plain": [ + "[(2019, 3, 135763, 1L), (2019, 4, 106023, 2L), (2019, 2, 78509, 3L)]" + ] + }, + "execution_count": 57, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%sql\n", + "SELECT * FROM (\n", + " SELECT completion_year,\n", + " completion_month,\n", + " total_potholes_patched,\n", + " rank() OVER (PARTITION BY completion_year \n", + " ORDER BY total_potholes_patched desc) AS rank\n", + " FROM temp_total_potholes_patched\n", + " ORDER BY completion_year, completion_month\n", + ") AS ranked_records\n", + "WHERE rank <= 3\n", + "AND completion_year = 2019;" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAnoAAAE/CAYAAADR125OAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi41LCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvSM8oowAAIABJREFUeJzt3XuUJlV57/HvTxBREbkNKAw4GDEEb0cYkXiLCoHxshxMvOBRGQ0ROWDUo4miiZKoJJpkBcMJmoNABOMRETVwIgQRvHsGGUBFQGXkIoNcRoab4m3gOX/Ubnlpu3tepvvtnqn+ftZ6V1ftvav2U2/XzDxTu3ZVqgpJkiT1zwPmOgBJkiSNhomeJElST5noSZIk9ZSJniRJUk+Z6EmSJPWUiZ4kSVJPmehJPZFk8ySVZOEs9LV7krXrue2SJCtnOqbZkuTGJM+YoX2dmuSv [...] + "text/plain": [ + "<Figure size 720x360 with 1 Axes>" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "\n", + "df = _.DataFrame()\n", + "\n", + "df['date'] = df.completion_year.map(\"{:04}\".format) + df.completion_month.map(\"{:02}\".format)\n", + "\n", + "\n", + "plt.figure(figsize=(10,5))\n", + "\n", + "p1 = plt.bar(df.date, df.total_potholes_patched)\n", + "\n", + "\n", + "plt.ylabel('Total potholes patched');\n", + "plt.title('Total potholes patched by month and year');\n", + "\n", + "plt.show();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "<a id=\"mapping\"></a>\n", + "## 4. Mapping" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "10 rows affected.\n" + ] + }, + { + "data": { + "text/html": [ + "<table>\n", + " <tr>\n", + " <th>completion_date</th>\n", + " <th>interval_bound</th>\n", + " </tr>\n", + " <tr>\n", + " <td>2019-04-26 14:05:16</td>\n", + " <td>2019-04-26 14:05:00</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019-07-23 16:16:48</td>\n", + " <td>2019-07-23 16:15:00</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019-07-23 09:58:51</td>\n", + " <td>2019-07-23 09:55:00</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019-04-29 14:37:02</td>\n", + " <td>2019-04-29 14:35:00</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019-03-05 11:14:02</td>\n", + " <td>2019-03-05 11:10:00</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019-07-23 13:49:35</td>\n", + " <td>2019-07-23 13:45:00</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019-07-23 10:10:10</td>\n", + " <td>2019-07-23 10:10:00</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019-07-23 10:46:44</td>\n", + " <td>2019-07-23 10:45:00</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019-07-23 15:06:49</td>\n", + " <td>2019-07-23 15:05:00</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019-07-23 09:54:46</td>\n", + " <td>2019-07-23 09:50:00</td>\n", + " </tr>\n", + "</table>" + ], + "text/plain": [ + "[(datetime.datetime(2019, 4, 26, 14, 5, 16), datetime.datetime(2019, 4, 26, 14, 5)),\n", + " (datetime.datetime(2019, 7, 23, 16, 16, 48), datetime.datetime(2019, 7, 23, 16, 15)),\n", + " (datetime.datetime(2019, 7, 23, 9, 58, 51), datetime.datetime(2019, 7, 23, 9, 55)),\n", + " (datetime.datetime(2019, 4, 29, 14, 37, 2), datetime.datetime(2019, 4, 29, 14, 35)),\n", + " (datetime.datetime(2019, 3, 5, 11, 14, 2), datetime.datetime(2019, 3, 5, 11, 10)),\n", + " (datetime.datetime(2019, 7, 23, 13, 49, 35), datetime.datetime(2019, 7, 23, 13, 45)),\n", + " (datetime.datetime(2019, 7, 23, 10, 10, 10), datetime.datetime(2019, 7, 23, 10, 10)),\n", + " (datetime.datetime(2019, 7, 23, 10, 46, 44), datetime.datetime(2019, 7, 23, 10, 45)),\n", + " (datetime.datetime(2019, 7, 23, 15, 6, 49), datetime.datetime(2019, 7, 23, 15, 5)),\n", + " (datetime.datetime(2019, 7, 23, 9, 54, 46), datetime.datetime(2019, 7, 23, 9, 50))]" + ] + }, + "execution_count": 59, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%sql\n", + "SELECT completion_date, interval_bound(completion_date, '5 minutes')\n", + "FROM chicago_potholes_patched LIMIT 10;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Gap filling\n", + "Notice gap in days" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "10 rows affected.\n" + ] + }, + { + "data": { + "text/html": [ + "<table>\n", + " <tr>\n", + " <th>date</th>\n", + " <th>sum</th>\n", + " </tr>\n", + " <tr>\n", + " <td>2019-06-03 00:00:00</td>\n", + " <td>1605</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019-06-04 00:00:00</td>\n", + " <td>2206</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019-06-05 00:00:00</td>\n", + " <td>1814</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019-06-06 00:00:00</td>\n", + " <td>1365</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019-06-07 00:00:00</td>\n", + " <td>1973</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019-06-10 00:00:00</td>\n", + " <td>514</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019-06-11 00:00:00</td>\n", + " <td>1408</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019-06-12 00:00:00</td>\n", + " <td>1714</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019-06-13 00:00:00</td>\n", + " <td>1550</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019-06-14 00:00:00</td>\n", + " <td>1415</td>\n", + " </tr>\n", + "</table>" + ], + "text/plain": [ + "[(datetime.datetime(2019, 6, 3, 0, 0), 1605L),\n", + " (datetime.datetime(2019, 6, 4, 0, 0), 2206L),\n", + " (datetime.datetime(2019, 6, 5, 0, 0), 1814L),\n", + " (datetime.datetime(2019, 6, 6, 0, 0), 1365L),\n", + " (datetime.datetime(2019, 6, 7, 0, 0), 1973L),\n", + " (datetime.datetime(2019, 6, 10, 0, 0), 514L),\n", + " (datetime.datetime(2019, 6, 11, 0, 0), 1408L),\n", + " (datetime.datetime(2019, 6, 12, 0, 0), 1714L),\n", + " (datetime.datetime(2019, 6, 13, 0, 0), 1550L),\n", + " (datetime.datetime(2019, 6, 14, 0, 0), 1415L)]" + ] + }, + "execution_count": 60, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%sql\n", + "\n", + "SELECT interval_bound(completion_date, '1 day') as date, sum(number_of_potholes_filled_on_block)\n", + "FROM chicago_potholes_patched \n", + "WHERE completion_date >= '2019-06-01'\n", + "AND completion_date <= '2019-06-15'\n", + "GROUP BY date ORDER BY date;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Generate dates you want (with no gaps)" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "15 rows affected.\n" + ] + }, + { + "data": { + "text/html": [ + "<table>\n", + " <tr>\n", + " <th>date</th>\n", + " </tr>\n", + " <tr>\n", + " <td>2019-06-01 00:00:00+00:00</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019-06-02 00:00:00+00:00</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019-06-03 00:00:00+00:00</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019-06-04 00:00:00+00:00</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019-06-05 00:00:00+00:00</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019-06-06 00:00:00+00:00</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019-06-07 00:00:00+00:00</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019-06-08 00:00:00+00:00</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019-06-09 00:00:00+00:00</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019-06-10 00:00:00+00:00</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019-06-11 00:00:00+00:00</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019-06-12 00:00:00+00:00</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019-06-13 00:00:00+00:00</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019-06-14 00:00:00+00:00</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019-06-15 00:00:00+00:00</td>\n", + " </tr>\n", + "</table>" + ], + "text/plain": [ + "[(datetime.datetime(2019, 6, 1, 0, 0, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None)),),\n", + " (datetime.datetime(2019, 6, 2, 0, 0, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None)),),\n", + " (datetime.datetime(2019, 6, 3, 0, 0, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None)),),\n", + " (datetime.datetime(2019, 6, 4, 0, 0, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None)),),\n", + " (datetime.datetime(2019, 6, 5, 0, 0, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None)),),\n", + " (datetime.datetime(2019, 6, 6, 0, 0, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None)),),\n", + " (datetime.datetime(2019, 6, 7, 0, 0, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None)),),\n", + " (datetime.datetime(2019, 6, 8, 0, 0, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None)),),\n", + " (datetime.datetime(2019, 6, 9, 0, 0, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None)),),\n", + " (datetime.datetime(2019, 6, 10, 0, 0, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None)),),\n", + " (datetime.datetime(2019, 6, 11, 0, 0, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None)),),\n", + " (datetime.datetime(2019, 6, 12, 0, 0, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None)),),\n", + " (datetime.datetime(2019, 6, 13, 0, 0, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None)),),\n", + " (datetime.datetime(2019, 6, 14, 0, 0, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None)),),\n", + " (datetime.datetime(2019, 6, 15, 0, 0, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None)),)]" + ] + }, + "execution_count": 61, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%sql\n", + "\n", + "SELECT date FROM generate_series('2019-06-01'::date,'2019-06-15','1 day') date\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Gap filling query" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "15 rows affected.\n" + ] + }, + { + "data": { + "text/html": [ + "<table>\n", + " <tr>\n", + " <th>date</th>\n", + " <th>sum</th>\n", + " </tr>\n", + " <tr>\n", + " <td>2019-06-01 00:00:00+00:00</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019-06-02 00:00:00+00:00</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019-06-03 00:00:00+00:00</td>\n", + " <td>1605</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019-06-04 00:00:00+00:00</td>\n", + " <td>2206</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019-06-05 00:00:00+00:00</td>\n", + " <td>1814</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019-06-06 00:00:00+00:00</td>\n", + " <td>1365</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019-06-07 00:00:00+00:00</td>\n", + " <td>1973</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019-06-08 00:00:00+00:00</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019-06-09 00:00:00+00:00</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019-06-10 00:00:00+00:00</td>\n", + " <td>514</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019-06-11 00:00:00+00:00</td>\n", + " <td>1408</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019-06-12 00:00:00+00:00</td>\n", + " <td>1714</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019-06-13 00:00:00+00:00</td>\n", + " <td>1550</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019-06-14 00:00:00+00:00</td>\n", + " <td>1415</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2019-06-15 00:00:00+00:00</td>\n", + " <td>0</td>\n", + " </tr>\n", + "</table>" + ], + "text/plain": [ + "[(datetime.datetime(2019, 6, 1, 0, 0, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None)), 0L),\n", + " (datetime.datetime(2019, 6, 2, 0, 0, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None)), 0L),\n", + " (datetime.datetime(2019, 6, 3, 0, 0, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None)), 1605L),\n", + " (datetime.datetime(2019, 6, 4, 0, 0, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None)), 2206L),\n", + " (datetime.datetime(2019, 6, 5, 0, 0, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None)), 1814L),\n", + " (datetime.datetime(2019, 6, 6, 0, 0, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None)), 1365L),\n", + " (datetime.datetime(2019, 6, 7, 0, 0, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None)), 1973L),\n", + " (datetime.datetime(2019, 6, 8, 0, 0, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None)), 0L),\n", + " (datetime.datetime(2019, 6, 9, 0, 0, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None)), 0L),\n", + " (datetime.datetime(2019, 6, 10, 0, 0, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None)), 514L),\n", + " (datetime.datetime(2019, 6, 11, 0, 0, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None)), 1408L),\n", + " (datetime.datetime(2019, 6, 12, 0, 0, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None)), 1714L),\n", + " (datetime.datetime(2019, 6, 13, 0, 0, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None)), 1550L),\n", + " (datetime.datetime(2019, 6, 14, 0, 0, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None)), 1415L),\n", + " (datetime.datetime(2019, 6, 15, 0, 0, tzinfo=psycopg2.tz.FixedOffsetTimezone(offset=0, name=None)), 0L)]" + ] + }, + "execution_count": 62, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%sql\n", + "WITH\n", + " bounded AS (\n", + " SELECT interval_bound(completion_date, '1 day') AS date,number_of_potholes_filled_on_block\n", + " FROM chicago_potholes_patched\n", + " WHERE completion_date >= '2019-06-01' AND completion_date <= '2019-06-15'\n", + " ),\n", + " dense AS (\n", + " SELECT slice AS date\n", + " FROM generate_series('2019-06-01'::date,'2019-06-15','1 day') s(slice)\n", + " )\n", + "SELECT date, sum(coalesce(number_of_potholes_filled_on_block,0))\n", + "FROM bounded NATURAL RIGHT JOIN dense\n", + "GROUP BY date ORDER BY date;" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.16" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}