Repository: incubator-madlib-site Updated Branches: refs/heads/asf-site 12422c78a -> de140c36b
update weakly connected cpts workbook with helper functions Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib-site/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib-site/commit/de140c36 Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib-site/tree/de140c36 Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib-site/diff/de140c36 Branch: refs/heads/asf-site Commit: de140c36b2cd6ad091637d3f5444edecbc0f739d Parents: 12422c7 Author: Frank McQuillan <fmcquil...@pivotal.io> Authored: Fri Jul 28 11:54:29 2017 -0700 Committer: Frank McQuillan <fmcquil...@pivotal.io> Committed: Fri Jul 28 11:54:29 2017 -0700 ---------------------------------------------------------------------- .../Weakly-connected-cpts-v1.ipynb | 584 ------------ .../Weakly-connected-cpts-v2.ipynb | 908 +++++++++++++++++++ 2 files changed, 908 insertions(+), 584 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-madlib-site/blob/de140c36/community-artifacts/Weakly-connected-cpts-v1.ipynb ---------------------------------------------------------------------- diff --git a/community-artifacts/Weakly-connected-cpts-v1.ipynb b/community-artifacts/Weakly-connected-cpts-v1.ipynb deleted file mode 100644 index 813692b..0000000 --- a/community-artifacts/Weakly-connected-cpts-v1.ipynb +++ /dev/null @@ -1,584 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Weakly connected components\n", - "\n", - "Weakly connected components was added in MADlib 1.12." - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The sql extension is already loaded. To reload it, use:\n", - " %reload_ext sql\n" - ] - } - ], - "source": [ - "%load_ext sql" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "u'Connected: gpdbchina@madlib'" - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Greenplum 4.3.10.0\n", - "%sql postgresql://gpdbchina@10.194.10.68:61000/madlib\n", - " \n", - "# PostgreSQL local\n", - "#%sql postgresql://fmcquillan@localhost:5432/madlib\n", - "\n", - "# Greenplum 4.2.3.0\n", - "#%sql postgresql://gpdbchina@10.194.10.68:55000/madlib" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1 rows affected.\n" - ] - }, - { - "data": { - "text/html": [ - "<table>\n", - " <tr>\n", - " <th>version</th>\n", - " </tr>\n", - " <tr>\n", - " <td>MADlib version: 1.12-dev, git revision: rc/v1.9alpha-rc1-188-ge47dc8a, cmake configuration time: Wed Jul 5 18:23:50 UTC 2017, build type: Release, build system: Linux-2.6.18-238.27.1.el5.hotfix.bz516490, C compiler: gcc 4.4.0, C++ compiler: g++ 4.4.0</td>\n", - " </tr>\n", - "</table>" - ], - "text/plain": [ - "[(u'MADlib version: 1.12-dev, git revision: rc/v1.9alpha-rc1-188-ge47dc8a, cmake configuration time: Wed Jul 5 18:23:50 UTC 2017, build type: Release, build system: Linux-2.6.18-238.27.1.el5.hotfix.bz516490, C compiler: gcc 4.4.0, C++ compiler: g++ 4.4.0',)]" - ] - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "%sql select madlib.version();\n", - "#%sql select version();" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 1. Create vertex and edge tables" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Done.\n", - "Done.\n", - "Done.\n", - "14 rows affected.\n", - "18 rows affected.\n", - "18 rows affected.\n" - ] - }, - { - "data": { - "text/html": [ - "<table>\n", - " <tr>\n", - " <th>src</th>\n", - " <th>dest</th>\n", - " <th>user_id</th>\n", - " </tr>\n", - " <tr>\n", - " <td>0</td>\n", - " <td>1</td>\n", - " <td>1</td>\n", - " </tr>\n", - " <tr>\n", - " <td>0</td>\n", - " <td>2</td>\n", - " <td>1</td>\n", - " </tr>\n", - " <tr>\n", - " <td>1</td>\n", - " <td>2</td>\n", - " <td>1</td>\n", - " </tr>\n", - " <tr>\n", - " <td>1</td>\n", - " <td>3</td>\n", - " <td>1</td>\n", - " </tr>\n", - " <tr>\n", - " <td>2</td>\n", - " <td>3</td>\n", - " <td>1</td>\n", - " </tr>\n", - " <tr>\n", - " <td>2</td>\n", - " <td>5</td>\n", - " <td>1</td>\n", - " </tr>\n", - " <tr>\n", - " <td>2</td>\n", - " <td>6</td>\n", - " <td>1</td>\n", - " </tr>\n", - " <tr>\n", - " <td>3</td>\n", - " <td>0</td>\n", - " <td>1</td>\n", - " </tr>\n", - " <tr>\n", - " <td>5</td>\n", - " <td>6</td>\n", - " <td>1</td>\n", - " </tr>\n", - " <tr>\n", - " <td>6</td>\n", - " <td>3</td>\n", - " <td>1</td>\n", - " </tr>\n", - " <tr>\n", - " <td>10</td>\n", - " <td>11</td>\n", - " <td>2</td>\n", - " </tr>\n", - " <tr>\n", - " <td>10</td>\n", - " <td>12</td>\n", - " <td>2</td>\n", - " </tr>\n", - " <tr>\n", - " <td>11</td>\n", - " <td>12</td>\n", - " <td>2</td>\n", - " </tr>\n", - " <tr>\n", - " <td>11</td>\n", - " <td>13</td>\n", - " <td>2</td>\n", - " </tr>\n", - " <tr>\n", - " <td>12</td>\n", - " <td>13</td>\n", - " <td>2</td>\n", - " </tr>\n", - " <tr>\n", - " <td>13</td>\n", - " <td>10</td>\n", - " <td>2</td>\n", - " </tr>\n", - " <tr>\n", - " <td>15</td>\n", - " <td>14</td>\n", - " <td>2</td>\n", - " </tr>\n", - " <tr>\n", - " <td>15</td>\n", - " <td>16</td>\n", - " <td>2</td>\n", - " </tr>\n", - "</table>" - ], - "text/plain": [ - "[(0, 1, 1),\n", - " (0, 2, 1),\n", - " (1, 2, 1),\n", - " (1, 3, 1),\n", - " (2, 3, 1),\n", - " (2, 5, 1),\n", - " (2, 6, 1),\n", - " (3, 0, 1),\n", - " (5, 6, 1),\n", - " (6, 3, 1),\n", - " (10, 11, 2),\n", - " (10, 12, 2),\n", - " (11, 12, 2),\n", - " (11, 13, 2),\n", - " (12, 13, 2),\n", - " (13, 10, 2),\n", - " (15, 14, 2),\n", - " (15, 16, 2)]" - ] - }, - "execution_count": 31, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "%%sql \n", - "DROP TABLE IF EXISTS vertex, edge;\n", - "\n", - "CREATE TABLE vertex(\n", - " id INTEGER\n", - ");\n", - "\n", - "CREATE TABLE edge(\n", - " src INTEGER,\n", - " dest INTEGER,\n", - " user_id INTEGER\n", - ");\n", - "\n", - "INSERT INTO vertex VALUES\n", - "(0),\n", - "(1),\n", - "(2),\n", - "(3),\n", - "(4),\n", - "(5),\n", - "(6),\n", - "(10),\n", - "(11),\n", - "(12),\n", - "(13),\n", - "(14),\n", - "(15),\n", - "(16);\n", - "\n", - "INSERT INTO edge VALUES\n", - "(0, 1, 1),\n", - "(0, 2, 1),\n", - "(1, 2, 1),\n", - "(1, 3, 1),\n", - "(2, 3, 1),\n", - "(2, 5, 1),\n", - "(2, 6, 1),\n", - "(3, 0, 1),\n", - "(5, 6, 1),\n", - "(6, 3, 1),\n", - "(10, 11, 2),\n", - "(10, 12, 2),\n", - "(11, 12, 2),\n", - "(11, 13, 2),\n", - "(12, 13, 2),\n", - "(13, 10, 2),\n", - "(15, 16, 2),\n", - "(15, 14, 2);\n", - "\n", - "SELECT * FROM edge ORDER BY src, dest;" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 2. Find all weakly connected components" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Done.\n", - "1 rows affected.\n", - "14 rows affected.\n" - ] - }, - { - "data": { - "text/html": [ - "<table>\n", - " <tr>\n", - " <th>id</th>\n", - " <th>component_id</th>\n", - " </tr>\n", - " <tr>\n", - " <td>0</td>\n", - " <td>0</td>\n", - " </tr>\n", - " <tr>\n", - " <td>1</td>\n", - " <td>0</td>\n", - " </tr>\n", - " <tr>\n", - " <td>2</td>\n", - " <td>0</td>\n", - " </tr>\n", - " <tr>\n", - " <td>3</td>\n", - " <td>0</td>\n", - " </tr>\n", - " <tr>\n", - " <td>5</td>\n", - " <td>0</td>\n", - " </tr>\n", - " <tr>\n", - " <td>6</td>\n", - " <td>0</td>\n", - " </tr>\n", - " <tr>\n", - " <td>4</td>\n", - " <td>4</td>\n", - " </tr>\n", - " <tr>\n", - " <td>10</td>\n", - " <td>10</td>\n", - " </tr>\n", - " <tr>\n", - " <td>11</td>\n", - " <td>10</td>\n", - " </tr>\n", - " <tr>\n", - " <td>12</td>\n", - " <td>10</td>\n", - " </tr>\n", - " <tr>\n", - " <td>13</td>\n", - " <td>10</td>\n", - " </tr>\n", - " <tr>\n", - " <td>14</td>\n", - " <td>14</td>\n", - " </tr>\n", - " <tr>\n", - " <td>15</td>\n", - " <td>14</td>\n", - " </tr>\n", - " <tr>\n", - " <td>16</td>\n", - " <td>14</td>\n", - " </tr>\n", - "</table>" - ], - "text/plain": [ - "[(0, 0),\n", - " (1, 0),\n", - " (2, 0),\n", - " (3, 0),\n", - " (5, 0),\n", - " (6, 0),\n", - " (4, 4),\n", - " (10, 10),\n", - " (11, 10),\n", - " (12, 10),\n", - " (13, 10),\n", - " (14, 14),\n", - " (15, 14),\n", - " (16, 14)]" - ] - }, - "execution_count": 32, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "%%sql\n", - "DROP TABLE IF EXISTS wcc_out;\n", - "\n", - "SELECT madlib.weakly_connected_components(\n", - " 'vertex', -- Vertex table\n", - " 'id', -- Vertix id column\n", - " 'edge', -- Edge table\n", - " 'src=src, dest=dest', -- Comma delimted string of edge arguments\n", - " 'wcc_out'); -- Output table of weakly connected components\n", - "\n", - "SELECT * FROM wcc_out ORDER BY component_id, id;" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 3. Weakly connected components with grouping" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Done.\n", - "1 rows affected.\n", - "13 rows affected.\n" - ] - }, - { - "data": { - "text/html": [ - "<table>\n", - " <tr>\n", - " <th>id</th>\n", - " <th>component_id</th>\n", - " <th>user_id</th>\n", - " </tr>\n", - " <tr>\n", - " <td>0</td>\n", - " <td>0</td>\n", - " <td>1</td>\n", - " </tr>\n", - " <tr>\n", - " <td>1</td>\n", - " <td>0</td>\n", - " <td>1</td>\n", - " </tr>\n", - " <tr>\n", - " <td>2</td>\n", - " <td>0</td>\n", - " <td>1</td>\n", - " </tr>\n", - " <tr>\n", - " <td>3</td>\n", - " <td>0</td>\n", - " <td>1</td>\n", - " </tr>\n", - " <tr>\n", - " <td>5</td>\n", - " <td>0</td>\n", - " <td>1</td>\n", - " </tr>\n", - " <tr>\n", - " <td>6</td>\n", - " <td>0</td>\n", - " <td>1</td>\n", - " </tr>\n", - " <tr>\n", - " <td>10</td>\n", - " <td>10</td>\n", - " <td>2</td>\n", - " </tr>\n", - " <tr>\n", - " <td>11</td>\n", - " <td>10</td>\n", - " <td>2</td>\n", - " </tr>\n", - " <tr>\n", - " <td>12</td>\n", - " <td>10</td>\n", - " <td>2</td>\n", - " </tr>\n", - " <tr>\n", - " <td>13</td>\n", - " <td>10</td>\n", - " <td>2</td>\n", - " </tr>\n", - " <tr>\n", - " <td>14</td>\n", - " <td>14</td>\n", - " <td>2</td>\n", - " </tr>\n", - " <tr>\n", - " <td>15</td>\n", - " <td>14</td>\n", - " <td>2</td>\n", - " </tr>\n", - " <tr>\n", - " <td>16</td>\n", - " <td>14</td>\n", - " <td>2</td>\n", - " </tr>\n", - "</table>" - ], - "text/plain": [ - "[(0, 0, 1),\n", - " (1, 0, 1),\n", - " (2, 0, 1),\n", - " (3, 0, 1),\n", - " (5, 0, 1),\n", - " (6, 0, 1),\n", - " (10, 10, 2),\n", - " (11, 10, 2),\n", - " (12, 10, 2),\n", - " (13, 10, 2),\n", - " (14, 14, 2),\n", - " (15, 14, 2),\n", - " (16, 14, 2)]" - ] - }, - "execution_count": 33, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "%%sql\n", - "DROP TABLE IF EXISTS wcc_out, wcc_out_summary;\n", - "\n", - "SELECT madlib.weakly_connected_components(\n", - " 'vertex', -- Vertex table\n", - " 'id', -- Vertix id column\n", - " 'edge', -- Edge table\n", - " 'src=src, dest=dest', -- Comma delimted string of edge arguments\n", - " 'wcc_out', -- Output table of weakly connected components\n", - " 'user_id'); -- Grouping column name\n", - "\n", - "SELECT * FROM wcc_out ORDER BY component_id, id, user_id;" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Note that vertex '4' is not identified as a separate component in the above result. This is because disconnected nodes cannot be assigned to a particular group with the current graph representation in MADlib." - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 2", - "language": "python", - "name": "python2" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 2 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.12" - } - }, - "nbformat": 4, - "nbformat_minor": 1 -} http://git-wip-us.apache.org/repos/asf/incubator-madlib-site/blob/de140c36/community-artifacts/Weakly-connected-cpts-v2.ipynb ---------------------------------------------------------------------- diff --git a/community-artifacts/Weakly-connected-cpts-v2.ipynb b/community-artifacts/Weakly-connected-cpts-v2.ipynb new file mode 100644 index 0000000..3258e44 --- /dev/null +++ b/community-artifacts/Weakly-connected-cpts-v2.ipynb @@ -0,0 +1,908 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Weakly connected components\n", + "\n", + "Weakly connected components was added in MADlib 1.12. This notebook also includes helper functions based on the WCC output." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/config.py:13: ShimWarning: The `IPython.config` package has been deprecated. You should import from traitlets.config instead.\n", + " \"You should import from traitlets.config instead.\", ShimWarning)\n", + "/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/utils/traitlets.py:5: UserWarning: IPython.utils.traitlets has moved to a top-level traitlets package.\n", + " warn(\"IPython.utils.traitlets has moved to a top-level traitlets package.\")\n" + ] + } + ], + "source": [ + "%load_ext sql" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "u'Connected: gpdbchina@madlib'" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Greenplum 4.3.10.0\n", + "%sql postgresql://gpdbchina@10.194.10.68:61000/madlib\n", + " \n", + "# PostgreSQL local\n", + "#%sql postgresql://fmcquillan@localhost:5432/madlib\n", + "\n", + "# Greenplum 4.2.3.0\n", + "#%sql postgresql://gpdbchina@10.194.10.68:55000/madlib" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1 rows affected.\n" + ] + }, + { + "data": { + "text/html": [ + "<table>\n", + " <tr>\n", + " <th>version</th>\n", + " </tr>\n", + " <tr>\n", + " <td>MADlib version: 1.12-dev, git revision: rc/v1.9alpha-rc1-195-g85e89ef, cmake configuration time: Wed Jul 26 23:21:17 UTC 2017, build type: Release, build system: Linux-2.6.18-238.27.1.el5.hotfix.bz516490, C compiler: gcc 4.4.0, C++ compiler: g++ 4.4.0</td>\n", + " </tr>\n", + "</table>" + ], + "text/plain": [ + "[(u'MADlib version: 1.12-dev, git revision: rc/v1.9alpha-rc1-195-g85e89ef, cmake configuration time: Wed Jul 26 23:21:17 UTC 2017, build type: Release, build system: Linux-2.6.18-238.27.1.el5.hotfix.bz516490, C compiler: gcc 4.4.0, C++ compiler: g++ 4.4.0',)]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%sql select madlib.version();\n", + "#%sql select version();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 1. Create vertex and edge tables" + ] + }, + { + "cell_type": "code", + "execution_count": 103, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Done.\n", + "Done.\n", + "Done.\n", + "14 rows affected.\n", + "18 rows affected.\n", + "18 rows affected.\n" + ] + }, + { + "data": { + "text/html": [ + "<table>\n", + " <tr>\n", + " <th>src</th>\n", + " <th>dest</th>\n", + " <th>user_id</th>\n", + " </tr>\n", + " <tr>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <td>0</td>\n", + " <td>2</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1</td>\n", + " <td>2</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1</td>\n", + " <td>3</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2</td>\n", + " <td>3</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2</td>\n", + " <td>5</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2</td>\n", + " <td>6</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <td>3</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <td>5</td>\n", + " <td>6</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <td>6</td>\n", + " <td>3</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <td>10</td>\n", + " <td>11</td>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <td>10</td>\n", + " <td>12</td>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <td>11</td>\n", + " <td>12</td>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <td>11</td>\n", + " <td>13</td>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <td>12</td>\n", + " <td>13</td>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <td>13</td>\n", + " <td>10</td>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <td>15</td>\n", + " <td>14</td>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <td>15</td>\n", + " <td>16</td>\n", + " <td>2</td>\n", + " </tr>\n", + "</table>" + ], + "text/plain": [ + "[(0, 1, 1),\n", + " (0, 2, 1),\n", + " (1, 2, 1),\n", + " (1, 3, 1),\n", + " (2, 3, 1),\n", + " (2, 5, 1),\n", + " (2, 6, 1),\n", + " (3, 0, 1),\n", + " (5, 6, 1),\n", + " (6, 3, 1),\n", + " (10, 11, 2),\n", + " (10, 12, 2),\n", + " (11, 12, 2),\n", + " (11, 13, 2),\n", + " (12, 13, 2),\n", + " (13, 10, 2),\n", + " (15, 14, 2),\n", + " (15, 16, 2)]" + ] + }, + "execution_count": 103, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%sql \n", + "DROP TABLE IF EXISTS vertex, edge;\n", + "\n", + "CREATE TABLE vertex(\n", + " id INTEGER\n", + ");\n", + "\n", + "CREATE TABLE edge(\n", + " src INTEGER,\n", + " dest INTEGER,\n", + " user_id INTEGER\n", + ");\n", + "\n", + "INSERT INTO vertex VALUES\n", + "(0),\n", + "(1),\n", + "(2),\n", + "(3),\n", + "(4),\n", + "(5),\n", + "(6),\n", + "(10),\n", + "(11),\n", + "(12),\n", + "(13),\n", + "(14),\n", + "(15),\n", + "(16);\n", + "\n", + "INSERT INTO edge VALUES\n", + "(0, 1, 1),\n", + "(0, 2, 1),\n", + "(1, 2, 1),\n", + "(1, 3, 1),\n", + "(2, 3, 1),\n", + "(2, 5, 1),\n", + "(2, 6, 1),\n", + "(3, 0, 1),\n", + "(5, 6, 1),\n", + "(6, 3, 1),\n", + "(10, 11, 2),\n", + "(10, 12, 2),\n", + "(11, 12, 2),\n", + "(11, 13, 2),\n", + "(12, 13, 2),\n", + "(13, 10, 2),\n", + "(15, 16, 2),\n", + "(15, 14, 2);\n", + "\n", + "SELECT * FROM edge ORDER BY src, dest;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 2. Find all weakly connected components" + ] + }, + { + "cell_type": "code", + "execution_count": 104, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Done.\n", + "1 rows affected.\n", + "14 rows affected.\n" + ] + }, + { + "data": { + "text/html": [ + "<table>\n", + " <tr>\n", + " <th>id</th>\n", + " <th>component_id</th>\n", + " </tr>\n", + " <tr>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>3</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>5</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>6</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <td>4</td>\n", + " <td>4</td>\n", + " </tr>\n", + " <tr>\n", + " <td>10</td>\n", + " <td>10</td>\n", + " </tr>\n", + " <tr>\n", + " <td>11</td>\n", + " <td>10</td>\n", + " </tr>\n", + " <tr>\n", + " <td>12</td>\n", + " <td>10</td>\n", + " </tr>\n", + " <tr>\n", + " <td>13</td>\n", + " <td>10</td>\n", + " </tr>\n", + " <tr>\n", + " <td>14</td>\n", + " <td>14</td>\n", + " </tr>\n", + " <tr>\n", + " <td>15</td>\n", + " <td>14</td>\n", + " </tr>\n", + " <tr>\n", + " <td>16</td>\n", + " <td>14</td>\n", + " </tr>\n", + "</table>" + ], + "text/plain": [ + "[(0, 0),\n", + " (1, 0),\n", + " (2, 0),\n", + " (3, 0),\n", + " (5, 0),\n", + " (6, 0),\n", + " (4, 4),\n", + " (10, 10),\n", + " (11, 10),\n", + " (12, 10),\n", + " (13, 10),\n", + " (14, 14),\n", + " (15, 14),\n", + " (16, 14)]" + ] + }, + "execution_count": 104, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%sql\n", + "DROP TABLE IF EXISTS wcc_out, wcc_out_summary;\n", + "\n", + "SELECT madlib.weakly_connected_components(\n", + " 'vertex', -- Vertex table\n", + " 'id', -- Vertix id column\n", + " 'edge', -- Edge table\n", + " 'src=src, dest=dest', -- Comma delimted string of edge arguments\n", + " 'wcc_out'); -- Output table of weakly connected components\n", + "\n", + "SELECT * FROM wcc_out ORDER BY component_id, id;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 3. Weakly connected components with grouping" + ] + }, + { + "cell_type": "code", + "execution_count": 105, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Done.\n", + "1 rows affected.\n", + "13 rows affected.\n" + ] + }, + { + "data": { + "text/html": [ + "<table>\n", + " <tr>\n", + " <th>id</th>\n", + " <th>component_id</th>\n", + " <th>user_id</th>\n", + " </tr>\n", + " <tr>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <td>3</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <td>5</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <td>6</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <td>10</td>\n", + " <td>10</td>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <td>11</td>\n", + " <td>10</td>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <td>12</td>\n", + " <td>10</td>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <td>13</td>\n", + " <td>10</td>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <td>14</td>\n", + " <td>14</td>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <td>15</td>\n", + " <td>14</td>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <td>16</td>\n", + " <td>14</td>\n", + " <td>2</td>\n", + " </tr>\n", + "</table>" + ], + "text/plain": [ + "[(0, 0, 1),\n", + " (1, 0, 1),\n", + " (2, 0, 1),\n", + " (3, 0, 1),\n", + " (5, 0, 1),\n", + " (6, 0, 1),\n", + " (10, 10, 2),\n", + " (11, 10, 2),\n", + " (12, 10, 2),\n", + " (13, 10, 2),\n", + " (14, 14, 2),\n", + " (15, 14, 2),\n", + " (16, 14, 2)]" + ] + }, + "execution_count": 105, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%sql\n", + "DROP TABLE IF EXISTS wcc_out, wcc_out_summary;\n", + "\n", + "SELECT madlib.weakly_connected_components(\n", + " 'vertex', -- Vertex table\n", + " 'id', -- Vertix id column\n", + " 'edge', -- Edge table\n", + " 'src=src, dest=dest', -- Comma delimted string of edge arguments\n", + " 'wcc_out', -- Output table of weakly connected components\n", + " 'user_id'); -- Grouping column name\n", + "\n", + "SELECT * FROM wcc_out ORDER BY component_id, id, user_id;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that vertex '4' is not identified as a separate component in the above result. This is because disconnected nodes cannot be assigned to a particular group with the current graph representation in MADlib." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 4. Largest connected component" + ] + }, + { + "cell_type": "code", + "execution_count": 106, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Done.\n", + "1 rows affected.\n", + "2 rows affected.\n" + ] + }, + { + "data": { + "text/html": [ + "<table>\n", + " <tr>\n", + " <th>user_id</th>\n", + " <th>component_id</th>\n", + " <th>num_vertices</th>\n", + " </tr>\n", + " <tr>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>6</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2</td>\n", + " <td>10</td>\n", + " <td>4</td>\n", + " </tr>\n", + "</table>" + ], + "text/plain": [ + "[(1, 0, 6L), (2, 10, 4L)]" + ] + }, + "execution_count": 106, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%sql \n", + "DROP TABLE IF EXISTS largest_cpt_table;\n", + "\n", + "SELECT madlib.graph_wcc_largest_cpt(\n", + " 'wcc_out', -- WCC output table\n", + " 'largest_cpt_table'); -- output table containing largest component ID\n", + "\n", + "SELECT * FROM largest_cpt_table ORDER BY component_id;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 5. Histogram of number of vertices per connected component" + ] + }, + { + "cell_type": "code", + "execution_count": 107, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Done.\n", + "1 rows affected.\n", + "3 rows affected.\n" + ] + }, + { + "data": { + "text/html": [ + "<table>\n", + " <tr>\n", + " <th>user_id</th>\n", + " <th>component_id</th>\n", + " <th>num_vertices</th>\n", + " </tr>\n", + " <tr>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>6</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2</td>\n", + " <td>10</td>\n", + " <td>4</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2</td>\n", + " <td>14</td>\n", + " <td>3</td>\n", + " </tr>\n", + "</table>" + ], + "text/plain": [ + "[(1, 0, 6L), (2, 10, 4L), (2, 14, 3L)]" + ] + }, + "execution_count": 107, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%sql\n", + "DROP TABLE IF EXISTS histogram_table; \n", + "\n", + "SELECT madlib.graph_wcc_histogram(\n", + " 'wcc_out', -- WCC's output table\n", + " 'histogram_table'); -- output table containing the histogram of vertices\n", + "\n", + "SELECT * FROM histogram_table ORDER BY component_id;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 6. Check if two vertices belong to the same component" + ] + }, + { + "cell_type": "code", + "execution_count": 108, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Done.\n", + "1 rows affected.\n", + "1 rows affected.\n" + ] + }, + { + "data": { + "text/html": [ + "<table>\n", + " <tr>\n", + " <th>user_id</th>\n", + " <th>component_id</th>\n", + " </tr>\n", + " <tr>\n", + " <td>2</td>\n", + " <td>14</td>\n", + " </tr>\n", + "</table>" + ], + "text/plain": [ + "[(2, 14)]" + ] + }, + "execution_count": 108, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%sql\n", + "DROP TABLE IF EXISTS vc_table;\n", + "\n", + "SELECT madlib.graph_wcc_vertex_check(\n", + " 'wcc_out', -- WCC output table\n", + " '14,15', -- Pair of vertex IDs\n", + " 'vc_table'); -- output table containing components that contain the two vertices\n", + "\n", + "SELECT * FROM vc_table ORDER BY component_id;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 7. Find all reachable vertices from a source vertex" + ] + }, + { + "cell_type": "code", + "execution_count": 109, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Done.\n", + "1 rows affected.\n", + "5 rows affected.\n" + ] + }, + { + "data": { + "text/html": [ + "<table>\n", + " <tr>\n", + " <th>user_id</th>\n", + " <th>component_id</th>\n", + " <th>dest</th>\n", + " </tr>\n", + " <tr>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>3</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>5</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>6</td>\n", + " </tr>\n", + "</table>" + ], + "text/plain": [ + "[(1, 0, 1), (1, 0, 2), (1, 0, 3), (1, 0, 5), (1, 0, 6)]" + ] + }, + "execution_count": 109, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%sql\n", + "DROP TABLE IF EXISTS reach_table;\n", + "\n", + "SELECT madlib.graph_wcc_reachable_vertices(\n", + " 'wcc_out', -- WCC output table\n", + " '0', -- source vertex\n", + " 'reach_table'); -- output table containing all vertices reachable from source vertex\n", + "\n", + "SELECT * FROM reach_table ORDER BY component_id, dest;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 8. Count of connected components" + ] + }, + { + "cell_type": "code", + "execution_count": 110, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Done.\n", + "1 rows affected.\n", + "2 rows affected.\n" + ] + }, + { + "data": { + "text/html": [ + "<table>\n", + " <tr>\n", + " <th>user_id</th>\n", + " <th>num_components</th>\n", + " </tr>\n", + " <tr>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2</td>\n", + " <td>2</td>\n", + " </tr>\n", + "</table>" + ], + "text/plain": [ + "[(1, 1L), (2, 2L)]" + ] + }, + "execution_count": 110, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%sql\n", + "DROP TABLE IF EXISTS count_table;\n", + "\n", + "SELECT madlib.graph_wcc_num_cpts(\n", + " 'wcc_out', -- WCC output table\n", + " 'count_table'); -- output table containing number of components per group\n", + "\n", + "SELECT * FROM count_table;" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.12" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +}