Repository: incubator-systemml Updated Branches: refs/heads/master 3e048be8c -> 48ca2010a
[SYSTEMML-618] SystemML-NN: Simplifying the execution of the examples. Simplifying the execution of the examples by introducing a soft link inside the `examples` folder pointing to the actual `nn` folder, and moving the notebooks inside the `examples` folder. Now, all example scripts can be run independently of the enclosing `examples` folder, and users should invoke these scripts from the directory in which they are located. Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/48ca2010 Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/48ca2010 Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/48ca2010 Branch: refs/heads/master Commit: 48ca2010a6dc671ba060af283dc405f7b3588563 Parents: 3e048be Author: Mike Dusenberry <[email protected]> Authored: Mon Jun 27 16:47:09 2016 -0700 Committer: Mike Dusenberry <[email protected]> Committed: Mon Jun 27 16:47:09 2016 -0700 ---------------------------------------------------------------------- .../SystemML-NN/Example - MNIST LeNet.ipynb | 231 ------------------- .../Example - MNIST Softmax Classifier.ipynb | 201 ---------------- scripts/staging/SystemML-NN/README.md | 1 + .../examples/Example - MNIST LeNet.ipynb | 231 +++++++++++++++++++ .../Example - MNIST Softmax Classifier.ipynb | 201 ++++++++++++++++ scripts/staging/SystemML-NN/examples/README.md | 67 ++++++ .../SystemML-NN/examples/mnist_lenet.dml | 6 +- .../SystemML-NN/examples/mnist_softmax.dml | 6 +- scripts/staging/SystemML-NN/examples/nn | 1 + 9 files changed, 507 insertions(+), 438 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/48ca2010/scripts/staging/SystemML-NN/Example - MNIST LeNet.ipynb ---------------------------------------------------------------------- diff --git a/scripts/staging/SystemML-NN/Example - MNIST LeNet.ipynb 
b/scripts/staging/SystemML-NN/Example - MNIST LeNet.ipynb deleted file mode 100644 index 1926f3b..0000000 --- a/scripts/staging/SystemML-NN/Example - MNIST LeNet.ipynb +++ /dev/null @@ -1,231 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Quick Setup" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "# Add SystemML PySpark API file.\n", - "sc.addPyFile(\"https://raw.githubusercontent.com/apache/incubator-systemml/3d5f9b11741f6d6ecc6af7cbaa1069cde32be838/src/main/java/org/apache/sysml/api/python/SystemML.py\")\n", - "\n", - "# Create a SystemML MLContext object\n", - "from SystemML import MLContext\n", - "ml = MLContext(sc)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Download Data - MNIST" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The MNIST dataset contains labeled images of handwritten digits, where each example is a 28x28 pixel image of grayscale values in the range [0,255] stretched out as 784 pixels, and each label is one of 10 possible digits in [0,9]. Here, we download 60,000 training examples, and 10,000 test examples, where the format is \"label, pixel_1, pixel_2, ..., pixel_n\"." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "%%sh\n", - "mkdir -p examples/data/mnist/\n", - "cd examples/data/mnist/\n", - "curl -O http://pjreddie.com/media/files/mnist_train.csv\n", - "curl -O http://pjreddie.com/media/files/mnist_test.csv" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## SystemML \"LeNet\" Neural Network" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 1. 
Train" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "script = \"\"\"\n", - "source(\"examples/mnist_lenet.dml\") as mnist_lenet\n", - "\n", - "# Read training data\n", - "data = read($data, format=\"csv\")\n", - "n = nrow(data)\n", - "C = $C\n", - "Hin = $Hin\n", - "Win = $Win\n", - "\n", - "# Extract images and labels\n", - "images = data[,2:ncol(data)]\n", - "labels = data[,1]\n", - "\n", - "# Scale images to [-1,1], and one-hot encode the labels\n", - "images = (images / 255.0) * 2 - 1\n", - "labels = table(seq(1, n), labels+1, n, 10)\n", - "\n", - "# Split into training (55,000 examples) and validation (5,000 examples)\n", - "X = images[5001:nrow(images),]\n", - "X_val = images[1:5000,]\n", - "y = labels[5001:nrow(images),]\n", - "y_val = labels[1:5000,]\n", - "\n", - "# Train\n", - "[W1, b1, W2, b2, W3, b3, W4, b4] = mnist_lenet::train(X, y, X_val, y_val, C, Hin, Win)\n", - "\n", - "# Write model out\n", - "write(W1, $W1out)\n", - "write(b1, $b1out)\n", - "write(W2, $W2out)\n", - "write(b2, $b2out)\n", - "write(W3, $W3out)\n", - "write(b3, $b3out)\n", - "write(W4, $W4out)\n", - "write(b4, $b4out)\n", - "\n", - "print(\"\")\n", - "print(\"\")\n", - "\"\"\"\n", - "ml.reset()\n", - "out = ml.executeScript(script, {\"data\": \"examples/data/mnist/mnist_train.csv\",\n", - " \"C\": 1, \"Hin\": 28, \"Win\": 28},\n", - " outputs=[\"W1\", \"b1\", \"W2\", \"b2\", \"W3\", \"b3\", \"W4\", \"b4\"])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 2. 
Extract model from SystemML back into PySpark" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "# Extract variables\n", - "W1 = out.getDF(sqlContext, \"W1\").sort(\"ID\").drop(\"ID\")\n", - "b1 = out.getDF(sqlContext, \"b1\").sort(\"ID\").drop(\"ID\")\n", - "W2 = out.getDF(sqlContext, \"W2\").sort(\"ID\").drop(\"ID\")\n", - "b2 = out.getDF(sqlContext, \"b2\").sort(\"ID\").drop(\"ID\")\n", - "W3 = out.getDF(sqlContext, \"W3\").sort(\"ID\").drop(\"ID\")\n", - "b3 = out.getDF(sqlContext, \"b3\").sort(\"ID\").drop(\"ID\")\n", - "W4 = out.getDF(sqlContext, \"W4\").sort(\"ID\").drop(\"ID\")\n", - "b4 = out.getDF(sqlContext, \"b4\").sort(\"ID\").drop(\"ID\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 3. Compute Test Accuracy" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "script = \"\"\"\n", - "source(\"examples/mnist_lenet.dml\") as mnist_lenet\n", - "\n", - "# Read test data\n", - "data = read($data, format=\"csv\")\n", - "n = nrow(data)\n", - "C = $C\n", - "Hin = $Hin\n", - "Win = $Win\n", - "\n", - "# Extract images and labels\n", - "X_test = data[,2:ncol(data)]\n", - "y_test = data[,1]\n", - "\n", - "# Scale images to [-1,1], and one-hot encode the labels\n", - "X_test = (X_test / 255.0) * 2 - 1\n", - "y_test = table(seq(1, n), y_test+1, n, 10)\n", - "\n", - "# Read model coefficients\n", - "W1 = read($W1)\n", - "b1 = read($b1)\n", - "W2 = read($W2)\n", - "b2 = read($b2)\n", - "W3 = read($W3)\n", - "b3 = read($b3)\n", - "W4 = read($W4)\n", - "b4 = read($b4)\n", - "\n", - "# Eval on test set\n", - "[loss, accuracy] = mnist_lenet::eval(X_test, y_test, C, Hin, Win, W1, b1, W2, b2, W3, b3, W4, b4)\n", - "\n", - "print(\"Test ;Accuracy: \" + accuracy)\n", - "\n", - "print(\"\")\n", - "print(\"\")\n", - "\"\"\"\n", - "ml.reset()\n", - 
"ml.executeScript(script, {\"data\": \"examples/data/mnist/mnist_train.csv\",\n", - " \"C\": 1, \"Hin\": 28, \"Win\": 28,\n", - " \"W1\": W1, \"b1\": b1,\n", - " \"W2\": W2, \"b2\": b2,\n", - " \"W3\": W3, \"b3\": b3,\n", - " \"W4\": W4, \"b4\": b4})" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 2", - "language": "python", - "name": "python2" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 2 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.11" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/48ca2010/scripts/staging/SystemML-NN/Example - MNIST Softmax Classifier.ipynb ---------------------------------------------------------------------- diff --git a/scripts/staging/SystemML-NN/Example - MNIST Softmax Classifier.ipynb b/scripts/staging/SystemML-NN/Example - MNIST Softmax Classifier.ipynb deleted file mode 100644 index d7da0d2..0000000 --- a/scripts/staging/SystemML-NN/Example - MNIST Softmax Classifier.ipynb +++ /dev/null @@ -1,201 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Quick Setup" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "# Add SystemML PySpark API file.\n", - "sc.addPyFile(\"https://raw.githubusercontent.com/apache/incubator-systemml/3d5f9b11741f6d6ecc6af7cbaa1069cde32be838/src/main/java/org/apache/sysml/api/python/SystemML.py\")\n", - "\n", - "# Create a SystemML MLContext object\n", - "from SystemML import MLContext\n", - "ml = MLContext(sc)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Download Data - MNIST" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The MNIST dataset contains labeled images of handwritten 
digits, where each example is a 28x28 pixel image of grayscale values in the range [0,255] stretched out as 784 pixels, and each label is one of 10 possible digits in [0,9]. Here, we download 60,000 training examples, and 10,000 test examples, where the format is \"label, pixel_1, pixel_2, ..., pixel_n\"." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "%%sh\n", - "mkdir -p examples/data/mnist/\n", - "cd examples/data/mnist/\n", - "curl -O http://pjreddie.com/media/files/mnist_train.csv\n", - "curl -O http://pjreddie.com/media/files/mnist_test.csv" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## SystemML Softmax Model" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 1. Train" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "script = \"\"\"\n", - "source(\"examples/mnist_softmax.dml\") as mnist_softmax\n", - "\n", - "# Read training data\n", - "data = read($data, format=\"csv\")\n", - "n = nrow(data)\n", - "\n", - "# Extract images and labels\n", - "images = data[,2:ncol(data)]\n", - "labels = data[,1]\n", - "\n", - "# Scale images to [0,1], and one-hot encode the labels\n", - "images = images / 255.0\n", - "labels = table(seq(1, n), labels+1, n, 10)\n", - "\n", - "# Split into training (55,000 examples) and validation (5,000 examples)\n", - "X = images[5001:nrow(images),]\n", - "X_val = images[1:5000,]\n", - "y = labels[5001:nrow(images),]\n", - "y_val = labels[1:5000,]\n", - "\n", - "# Train\n", - "[W, b] = mnist_softmax::train(X, y, X_val, y_val)\n", - "\n", - "# Write model out (we will extract these back into PySpark)\n", - "write(W, $Wout)\n", - "write(b, $bout)\n", - "\n", - "print(\"\")\n", - "print(\"\")\n", - "\"\"\"\n", - "ml.reset()\n", - "out = ml.executeScript(script, {\"data\": 
\"examples/data/mnist/mnist_train.csv\"},\n", - " outputs=[\"W\", \"b\"])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 2. Extract model from SystemML back into PySpark" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "W = out.getDF(sqlContext, \"W\").sort(\"ID\").drop(\"ID\")\n", - "b = out.getDF(sqlContext, \"b\").sort(\"ID\").drop(\"ID\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 3. Compute Test Accuracy" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "script = \"\"\"\n", - "source(\"examples/mnist_softmax.dml\") as mnist_softmax\n", - "\n", - "# Read test data\n", - "data = read($data, format=\"csv\")\n", - "n = nrow(data)\n", - "\n", - "# Extract images and labels\n", - "X_test = data[,2:ncol(data)]\n", - "y_test = data[,1]\n", - "\n", - "# Scale images to [0,1], and one-hot encode the labels\n", - "X_test = X_test / 255.0\n", - "y_test = table(seq(1, n), y_test+1, n, 10)\n", - "\n", - "# Read model coefficients\n", - "W = read($W)\n", - "b = read($b)\n", - "\n", - "# Eval on test set\n", - "[loss, accuracy] = mnist_softmax::eval(X_test, y_test, W, b)\n", - "\n", - "print(\"Test Accuracy: \" + accuracy)\n", - "\n", - "print(\"\")\n", - "print(\"\")\n", - "\"\"\"\n", - "ml.reset()\n", - "out = ml.executeScript(script, {\"data\": \"examples/data/mnist/mnist_test.csv\",\n", - " \"W\": W, \"b\": b})" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 2", - "language": "python", - "name": "python2" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 2 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.11" - } - }, - "nbformat": 4, - "nbformat_minor": 
0 -} http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/48ca2010/scripts/staging/SystemML-NN/README.md ---------------------------------------------------------------------- diff --git a/scripts/staging/SystemML-NN/README.md b/scripts/staging/SystemML-NN/README.md index 965da67..ca5067e 100644 --- a/scripts/staging/SystemML-NN/README.md +++ b/scripts/staging/SystemML-NN/README.md @@ -22,6 +22,7 @@ limitations under the License. ### A deep learning library for [Apache SystemML](https://github.com/apache/incubator-systemml). ## Examples: +#### Please see the `examples` folder for more detailed examples, or view the following two quick examples. ### Neural net for regression with vanilla SGD: ```python # Imports http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/48ca2010/scripts/staging/SystemML-NN/examples/Example - MNIST LeNet.ipynb ---------------------------------------------------------------------- diff --git a/scripts/staging/SystemML-NN/examples/Example - MNIST LeNet.ipynb b/scripts/staging/SystemML-NN/examples/Example - MNIST LeNet.ipynb new file mode 100644 index 0000000..fb1f2b3 --- /dev/null +++ b/scripts/staging/SystemML-NN/examples/Example - MNIST LeNet.ipynb @@ -0,0 +1,231 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Quick Setup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# Add SystemML PySpark API file.\n", + "sc.addPyFile(\"https://raw.githubusercontent.com/apache/incubator-systemml/3d5f9b11741f6d6ecc6af7cbaa1069cde32be838/src/main/java/org/apache/sysml/api/python/SystemML.py\")\n", + "\n", + "# Create a SystemML MLContext object\n", + "from SystemML import MLContext\n", + "ml = MLContext(sc)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Download Data - MNIST" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The MNIST dataset 
contains labeled images of handwritten digits, where each example is a 28x28 pixel image of grayscale values in the range [0,255] stretched out as 784 pixels, and each label is one of 10 possible digits in [0,9]. Here, we download 60,000 training examples, and 10,000 test examples, where the format is \"label, pixel_1, pixel_2, ..., pixel_n\"." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "%%sh\n", + "mkdir -p data/mnist/\n", + "cd data/mnist/\n", + "curl -O http://pjreddie.com/media/files/mnist_train.csv\n", + "curl -O http://pjreddie.com/media/files/mnist_test.csv" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## SystemML \"LeNet\" Neural Network" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1. Train" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "script = \"\"\"\n", + "source(\"mnist_lenet.dml\") as mnist_lenet\n", + "\n", + "# Read training data\n", + "data = read($data, format=\"csv\")\n", + "n = nrow(data)\n", + "C = $C\n", + "Hin = $Hin\n", + "Win = $Win\n", + "\n", + "# Extract images and labels\n", + "images = data[,2:ncol(data)]\n", + "labels = data[,1]\n", + "\n", + "# Scale images to [-1,1], and one-hot encode the labels\n", + "images = (images / 255.0) * 2 - 1\n", + "labels = table(seq(1, n), labels+1, n, 10)\n", + "\n", + "# Split into training (55,000 examples) and validation (5,000 examples)\n", + "X = images[5001:nrow(images),]\n", + "X_val = images[1:5000,]\n", + "y = labels[5001:nrow(images),]\n", + "y_val = labels[1:5000,]\n", + "\n", + "# Train\n", + "[W1, b1, W2, b2, W3, b3, W4, b4] = mnist_lenet::train(X, y, X_val, y_val, C, Hin, Win)\n", + "\n", + "# Write model out\n", + "write(W1, $W1out)\n", + "write(b1, $b1out)\n", + "write(W2, $W2out)\n", + "write(b2, $b2out)\n", + "write(W3, $W3out)\n", + 
"write(b3, $b3out)\n", + "write(W4, $W4out)\n", + "write(b4, $b4out)\n", + "\n", + "print(\"\")\n", + "print(\"\")\n", + "\"\"\"\n", + "ml.reset()\n", + "out = ml.executeScript(script, {\"data\": \"data/mnist/mnist_train.csv\",\n", + " \"C\": 1, \"Hin\": 28, \"Win\": 28},\n", + " outputs=[\"W1\", \"b1\", \"W2\", \"b2\", \"W3\", \"b3\", \"W4\", \"b4\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2. Extract model from SystemML back into PySpark" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# Extract variables\n", + "W1 = out.getDF(sqlContext, \"W1\").sort(\"ID\").drop(\"ID\")\n", + "b1 = out.getDF(sqlContext, \"b1\").sort(\"ID\").drop(\"ID\")\n", + "W2 = out.getDF(sqlContext, \"W2\").sort(\"ID\").drop(\"ID\")\n", + "b2 = out.getDF(sqlContext, \"b2\").sort(\"ID\").drop(\"ID\")\n", + "W3 = out.getDF(sqlContext, \"W3\").sort(\"ID\").drop(\"ID\")\n", + "b3 = out.getDF(sqlContext, \"b3\").sort(\"ID\").drop(\"ID\")\n", + "W4 = out.getDF(sqlContext, \"W4\").sort(\"ID\").drop(\"ID\")\n", + "b4 = out.getDF(sqlContext, \"b4\").sort(\"ID\").drop(\"ID\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3. 
Compute Test Accuracy" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "script = \"\"\"\n", + "source(\"mnist_lenet.dml\") as mnist_lenet\n", + "\n", + "# Read test data\n", + "data = read($data, format=\"csv\")\n", + "n = nrow(data)\n", + "C = $C\n", + "Hin = $Hin\n", + "Win = $Win\n", + "\n", + "# Extract images and labels\n", + "X_test = data[,2:ncol(data)]\n", + "y_test = data[,1]\n", + "\n", + "# Scale images to [-1,1], and one-hot encode the labels\n", + "X_test = (X_test / 255.0) * 2 - 1\n", + "y_test = table(seq(1, n), y_test+1, n, 10)\n", + "\n", + "# Read model coefficients\n", + "W1 = read($W1)\n", + "b1 = read($b1)\n", + "W2 = read($W2)\n", + "b2 = read($b2)\n", + "W3 = read($W3)\n", + "b3 = read($b3)\n", + "W4 = read($W4)\n", + "b4 = read($b4)\n", + "\n", + "# Eval on test set\n", + "[loss, accuracy] = mnist_lenet::eval(X_test, y_test, C, Hin, Win, W1, b1, W2, b2, W3, b3, W4, b4)\n", + "\n", + "print(\"Test Accuracy: \" + accuracy)\n", + "\n", + "print(\"\")\n", + "print(\"\")\n", + "\"\"\"\n", + "ml.reset()\n", + "ml.executeScript(script, {\"data\": \"data/mnist/mnist_test.csv\",\n", + " \"C\": 1, \"Hin\": 28, \"Win\": 28,\n", + " \"W1\": W1, \"b1\": b1,\n", + " \"W2\": W2, \"b2\": b2,\n", + " \"W3\": W3, \"b3\": b3,\n", + " \"W4\": W4, \"b4\": b4})" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.11" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/48ca2010/scripts/staging/SystemML-NN/examples/Example - MNIST Softmax Classifier.ipynb 
---------------------------------------------------------------------- diff --git a/scripts/staging/SystemML-NN/examples/Example - MNIST Softmax Classifier.ipynb b/scripts/staging/SystemML-NN/examples/Example - MNIST Softmax Classifier.ipynb new file mode 100644 index 0000000..454e31b --- /dev/null +++ b/scripts/staging/SystemML-NN/examples/Example - MNIST Softmax Classifier.ipynb @@ -0,0 +1,201 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Quick Setup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# Add SystemML PySpark API file.\n", + "sc.addPyFile(\"https://raw.githubusercontent.com/apache/incubator-systemml/3d5f9b11741f6d6ecc6af7cbaa1069cde32be838/src/main/java/org/apache/sysml/api/python/SystemML.py\")\n", + "\n", + "# Create a SystemML MLContext object\n", + "from SystemML import MLContext\n", + "ml = MLContext(sc)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Download Data - MNIST" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The MNIST dataset contains labeled images of handwritten digits, where each example is a 28x28 pixel image of grayscale values in the range [0,255] stretched out as 784 pixels, and each label is one of 10 possible digits in [0,9]. Here, we download 60,000 training examples, and 10,000 test examples, where the format is \"label, pixel_1, pixel_2, ..., pixel_n\"." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "%%sh\n", + "mkdir -p data/mnist/\n", + "cd data/mnist/\n", + "curl -O http://pjreddie.com/media/files/mnist_train.csv\n", + "curl -O http://pjreddie.com/media/files/mnist_test.csv" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## SystemML Softmax Model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1. Train" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "script = \"\"\"\n", + "source(\"mnist_softmax.dml\") as mnist_softmax\n", + "\n", + "# Read training data\n", + "data = read($data, format=\"csv\")\n", + "n = nrow(data)\n", + "\n", + "# Extract images and labels\n", + "images = data[,2:ncol(data)]\n", + "labels = data[,1]\n", + "\n", + "# Scale images to [0,1], and one-hot encode the labels\n", + "images = images / 255.0\n", + "labels = table(seq(1, n), labels+1, n, 10)\n", + "\n", + "# Split into training (55,000 examples) and validation (5,000 examples)\n", + "X = images[5001:nrow(images),]\n", + "X_val = images[1:5000,]\n", + "y = labels[5001:nrow(images),]\n", + "y_val = labels[1:5000,]\n", + "\n", + "# Train\n", + "[W, b] = mnist_softmax::train(X, y, X_val, y_val)\n", + "\n", + "# Write model out (we will extract these back into PySpark)\n", + "write(W, $Wout)\n", + "write(b, $bout)\n", + "\n", + "print(\"\")\n", + "print(\"\")\n", + "\"\"\"\n", + "ml.reset()\n", + "out = ml.executeScript(script, {\"data\": \"data/mnist/mnist_train.csv\"},\n", + " outputs=[\"W\", \"b\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2. 
Extract model from SystemML back into PySpark" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "W = out.getDF(sqlContext, \"W\").sort(\"ID\").drop(\"ID\")\n", + "b = out.getDF(sqlContext, \"b\").sort(\"ID\").drop(\"ID\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3. Compute Test Accuracy" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "script = \"\"\"\n", + "source(\"mnist_softmax.dml\") as mnist_softmax\n", + "\n", + "# Read test data\n", + "data = read($data, format=\"csv\")\n", + "n = nrow(data)\n", + "\n", + "# Extract images and labels\n", + "X_test = data[,2:ncol(data)]\n", + "y_test = data[,1]\n", + "\n", + "# Scale images to [0,1], and one-hot encode the labels\n", + "X_test = X_test / 255.0\n", + "y_test = table(seq(1, n), y_test+1, n, 10)\n", + "\n", + "# Read model coefficients\n", + "W = read($W)\n", + "b = read($b)\n", + "\n", + "# Eval on test set\n", + "[loss, accuracy] = mnist_softmax::eval(X_test, y_test, W, b)\n", + "\n", + "print(\"Test Accuracy: \" + accuracy)\n", + "\n", + "print(\"\")\n", + "print(\"\")\n", + "\"\"\"\n", + "ml.reset()\n", + "out = ml.executeScript(script, {\"data\": \"data/mnist/mnist_test.csv\",\n", + " \"W\": W, \"b\": b})" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.11" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/48ca2010/scripts/staging/SystemML-NN/examples/README.md 
---------------------------------------------------------------------- diff --git a/scripts/staging/SystemML-NN/examples/README.md b/scripts/staging/SystemML-NN/examples/README.md new file mode 100644 index 0000000..eee5e9b --- /dev/null +++ b/scripts/staging/SystemML-NN/examples/README.md @@ -0,0 +1,67 @@ +<!-- +{% comment %} +Licensed to the Apache Software Foundation (ASF) under one or more +contributor license agreements. See the NOTICE file distributed with +this work for additional information regarding copyright ownership. +The ASF licenses this file to you under the Apache License, Version 2.0 +(the "License"); you may not use this file except in compliance with +the License. You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +{% endcomment %} +--> + +# SystemML-NN Examples + +#### This folder contains scripts and PySpark Jupyter notebooks serving as examples of using the *SystemML-NN* (`nn`) deep learning library. + +--- + +## Code +* To run the examples, please first download and unzip the project via GitHub using the "Clone or download" button on the [homepage of the project](https://github.com/dusenberrymw/systemml-nn), *or* via the following commands: + + ``` + curl -LO https://github.com/dusenberrymw/systemml-nn/archive/master.zip + unzip master.zip + ``` + +* Then, move into the `examples` folder via: + ``` + cd systemml-nn-master/examples/ + ``` + +## Data +* The following examples use the classic [MNIST](http://yann.lecun.com/exdb/mnist/) dataset, which contains labeled 28x28 pixel images of handwritten digits in the range of 0-9. There are 60,000 training images, and 10,000 testing images. 
Of the 60,000 training images, 5,000 will be used as validation images. +* The data will be automatically downloaded as a step in either of the example notebooks. (*If* you wish to download it separately, please run `get_mnist_data.sh`). + +## Execution +* These examples contain scripts written in SystemML's R-like language (`*.dml`), as well as PySpark Jupyter notebooks (`*.ipynb`). The scripts contain the math for the algorithms, enclosed in functions, and the notebooks serve as full, end-to-end examples of reading in data, training models using the functions within the scripts, and evaluating final performance. +* To run the notebook examples, please start up Jupyter in the following manner from this directory (or for more information, please see [this great blog post](http://spark.tc/0-to-life-changing-application-with-apache-systemml/)): + + ``` + PYSPARK_DRIVER_PYTHON=jupyter PYSPARK_DRIVER_PYTHON_OPTS="notebook" $SPARK_HOME/bin/pyspark --master local[*] --driver-memory 3G --driver-class-path $SYSTEMML_HOME/SystemML.jar + ``` + + Note that all printed output, such as training statistics, from the SystemML scripts will be sent to the terminal in which Jupyter was started (for now...). + +* To run the scripts directly using `spark-submit`, please see the comments located at the bottom of the scripts. + +## Examples +### MNIST Softmax Classifier + +* This example trains a softmax classifier, which is essentially a multi-class logistic regression model, on the MNIST data. The model will be trained on the *training* images, validated on the *validation* images, and tested for final performance metrics on the *test* images. +* Notebook: `Example - MNIST Softmax Classifier.ipynb`. +* Script: `mnist_softmax.dml` + +### MNIST "LeNet" Neural Net + +* This example trains a neural network on the MNIST data using a ["LeNet" architecture](http://yann.lecun.com/exdb/publis/pdf/lecun-98.pdf). 
The model will be trained on the *training* images, validated on the *validation* images, and tested for final performance metrics on the *test* images. +* Notebook: `Example - MNIST LeNet.ipynb`. +* Script: `mnist_lenet.dml` + http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/48ca2010/scripts/staging/SystemML-NN/examples/mnist_lenet.dml ---------------------------------------------------------------------- diff --git a/scripts/staging/SystemML-NN/examples/mnist_lenet.dml b/scripts/staging/SystemML-NN/examples/mnist_lenet.dml index bd6361f..4474358 100644 --- a/scripts/staging/SystemML-NN/examples/mnist_lenet.dml +++ b/scripts/staging/SystemML-NN/examples/mnist_lenet.dml @@ -302,9 +302,9 @@ generate_dummy_data = function() # ``` # $SPARK_HOME/bin/spark-submit --master local[*] --driver-memory 10G # --conf spark.driver.maxResultSize=0 --conf spark.akka.frameSize=128 -# $SYSTEMML_HOME/target/SystemML.jar -f examples/mnist_lenet.dml -# -nvargs train=examples/data/mnist/mnist_train.csv -# test=examples/data/mnist/mnist_test.csv C=1 Hin=28 Win=28 out_dir=examples/model/mnist_lenet +# $SYSTEMML_HOME/target/SystemML.jar -f mnist_lenet.dml +# -nvargs train=data/mnist/mnist_train.csv test=data/mnist/mnist_test.csv +# C=1 Hin=28 Win=28 out_dir=model/mnist_lenet # ``` # http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/48ca2010/scripts/staging/SystemML-NN/examples/mnist_softmax.dml ---------------------------------------------------------------------- diff --git a/scripts/staging/SystemML-NN/examples/mnist_softmax.dml b/scripts/staging/SystemML-NN/examples/mnist_softmax.dml index 8ba24cb..f3c00e8 100644 --- a/scripts/staging/SystemML-NN/examples/mnist_softmax.dml +++ b/scripts/staging/SystemML-NN/examples/mnist_softmax.dml @@ -176,9 +176,9 @@ generate_dummy_data = function() # ``` # $SPARK_HOME/bin/spark-submit --master local[*] --driver-memory 5G # --conf spark.driver.maxResultSize=0 --conf spark.akka.frameSize=128 -# 
$SYSTEMML_HOME/target/SystemML.jar -f examples/mnist_softmax.dml -# -nvargs train=examples/data/mnist/mnist_train.csv -# test=examples/data/mnist/mnist_test.csv out_dir=examples/model/mnist_softmax +# $SYSTEMML_HOME/target/SystemML.jar -f mnist_softmax.dml +# -nvargs train=data/mnist/mnist_train.csv test=data/mnist/mnist_test.csv +# out_dir=model/mnist_softmax # ``` # http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/48ca2010/scripts/staging/SystemML-NN/examples/nn ---------------------------------------------------------------------- diff --git a/scripts/staging/SystemML-NN/examples/nn b/scripts/staging/SystemML-NN/examples/nn new file mode 120000 index 0000000..cfe2905 --- /dev/null +++ b/scripts/staging/SystemML-NN/examples/nn @@ -0,0 +1 @@ +../nn \ No newline at end of file
