http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/3dc8f21c/projects/breast_cancer/MachineLearning.ipynb ---------------------------------------------------------------------- diff --git a/projects/breast_cancer/MachineLearning.ipynb b/projects/breast_cancer/MachineLearning.ipynb new file mode 100644 index 0000000..9a11450 --- /dev/null +++ b/projects/breast_cancer/MachineLearning.ipynb @@ -0,0 +1,561 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Predicting Breast Cancer Proliferation Scores with Apache Spark and Apache SystemML\n", + "\n", + "## Machine Learning\n", + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Setup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "%matplotlib inline\n", + "\n", + "import os\n", + "\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "from pyspark.sql.functions import col, max\n", + "import systemml # pip3 install systemml\n", + "from systemml import MLContext, dml\n", + "\n", + "plt.rcParams['figure.figsize'] = (10, 6)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "ml = MLContext(sc)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Read in train & val data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Settings\n", + "size=64\n", + "grayscale = True\n", + "c = 1 if grayscale else 3\n", + "p = 0.01" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "tr_sample_filename = os.path.join(\"data\", \"train_{}_sample_{}{}.parquet\".format(p, size, \"_grayscale\" if grayscale else \"\"))\n", + "val_sample_filename = os.path.join(\"data\", \"val_{}_sample_{}{}.parquet\".format(p, size, \"_grayscale\" if grayscale else \"\"))\n", + "train_df = sqlContext.read.load(tr_sample_filename)\n", + "val_df = sqlContext.read.load(val_sample_filename)\n", + "train_df, val_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "tc = train_df.count()\n", + "vc = val_df.count()\n", + "tc, vc, tc + vc" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "train_df.select(max(col(\"__INDEX\"))).show()\n", + "train_df.groupBy(\"tumor_score\").count().show()\n", + "val_df.groupBy(\"tumor_score\").count().show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Extract X and Y matrices" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# Note: Must use the row index column, or X may not\n", + "# necessarily correspond correctly to Y\n", + "X_df = train_df.select(\"__INDEX\", \"sample\")\n", + "X_val_df = val_df.select(\"__INDEX\", \"sample\")\n", + "y_df = train_df.select(\"__INDEX\", \"tumor_score\")\n", + "y_val_df = val_df.select(\"__INDEX\", \"tumor_score\")\n", + "X_df, X_val_df, y_df, y_val_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Convert to SystemML Matrices\n", + "Note: This allows for reuse of the matrices 
on multiple\n", + "subsequent script invocations with only a single\n", + "conversion. Additionally, since the underlying RDDs\n", + "backing the SystemML matrices are maintained, any\n", + "caching will also be maintained." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "script = \"\"\"\n", + "# Scale images to [-1,1]\n", + "X = X / 255\n", + "X_val = X_val / 255\n", + "X = X * 2 - 1\n", + "X_val = X_val * 2 - 1\n", + "\n", + "# One-hot encode the labels\n", + "num_tumor_classes = 3\n", + "n = nrow(y)\n", + "n_val = nrow(y_val)\n", + "Y = table(seq(1, n), y, n, num_tumor_classes)\n", + "Y_val = table(seq(1, n_val), y_val, n_val, num_tumor_classes)\n", + "\"\"\"\n", + "outputs = (\"X\", \"X_val\", \"Y\", \"Y_val\")\n", + "script = dml(script).input(X=X_df, X_val=X_val_df, y=y_df, y_val=y_val_df).output(*outputs)\n", + "X, X_val, Y, Y_val = ml.execute(script).get(*outputs)\n", + "X, X_val, Y, Y_val" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Trigger Caching (Optional)\n", + "Note: This will take a while and is not necessary, but doing it\n", + "once will speed up the training below. Otherwise, the cost of\n", + "caching will be spread across the first full loop through the\n", + "data during training." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# script = \"\"\"\n", + "# # Trigger conversions and caching\n", + "# # Note: This may take a while, but will enable faster iteration later\n", + "# print(sum(X))\n", + "# print(sum(Y))\n", + "# print(sum(X_val))\n", + "# print(sum(Y_val))\n", + "# \"\"\"\n", + "# script = dml(script).input(X=X, X_val=X_val, Y=Y, Y_val=Y_val)\n", + "# ml.execute(script)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Save Matrices (Optional)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# script = \"\"\"\n", + "# write(X, \"data/X_\"+p+\"_sample_binary\", format=\"binary\")\n", + "# write(Y, \"data/Y_\"+p+\"_sample_binary\", format=\"binary\")\n", + "# write(X_val, \"data/X_val_\"+p+\"_sample_binary\", format=\"binary\")\n", + "# write(Y_val, \"data/Y_val_\"+p+\"_sample_binary\", format=\"binary\")\n", + "# \"\"\"\n", + "# script = dml(script).input(X=X, X_val=X_val, Y=Y, Y_val=Y_val, p=p)\n", + "# ml.execute(script)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Softmax Classifier" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Sanity Check: Overfit Small Portion" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "script = \"\"\"\n", + "source(\"softmax_clf.dml\") as clf\n", + "\n", + "# Hyperparameters & Settings\n", + "lr = 1e-2 # learning rate\n", + "mu = 0.9 # momentum\n", + "decay = 0.999 # learning rate decay constant\n", + "batch_size = 50\n", + "epochs = 500\n", + "log_interval = 1\n", + "n = 200 # sample size for overfitting sanity check\n", + "\n", + "# Train\n", + "[W, b] = clf::train(X[1:n,], Y[1:n,], X[1:n,], Y[1:n,], lr, mu, decay, batch_size, epochs, log_interval)\n", + "\"\"\"\n", + "outputs = (\"W\", \"b\")\n", + "script = dml(script).input(X=X, Y=Y, X_val=X_val, 
Y_val=Y_val).output(*outputs)\n", + "W, b = ml.execute(script).get(*outputs)\n", + "W, b" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Train" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "script = \"\"\"\n", + "source(\"softmax_clf.dml\") as clf\n", + "\n", + "# Hyperparameters & Settings\n", + "lr = 5e-7 # learning rate\n", + "mu = 0.5 # momentum\n", + "decay = 0.999 # learning rate decay constant\n", + "batch_size = 50\n", + "epochs = 1\n", + "log_interval = 10\n", + "\n", + "# Train\n", + "[W, b] = clf::train(X, Y, X_val, Y_val, lr, mu, decay, batch_size, epochs, log_interval)\n", + "\"\"\"\n", + "outputs = (\"W\", \"b\")\n", + "script = dml(script).input(X=X, Y=Y, X_val=X_val, Y_val=Y_val).output(*outputs)\n", + "W, b = ml.execute(script).get(*outputs)\n", + "W, b" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Eval" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "script = \"\"\"\n", + "source(\"softmax_clf.dml\") as clf\n", + "\n", + "# Eval\n", + "probs = clf::predict(X, W, b)\n", + "[loss, accuracy] = clf::eval(probs, Y)\n", + "probs_val = clf::predict(X_val, W, b)\n", + "[loss_val, accuracy_val] = clf::eval(probs_val, Y_val)\n", + "\"\"\"\n", + "outputs = (\"loss\", \"accuracy\", \"loss_val\", \"accuracy_val\")\n", + "script = dml(script).input(X=X, Y=Y, X_val=X_val, Y_val=Y_val, W=W, b=b).output(*outputs)\n", + "loss, acc, loss_val, acc_val = ml.execute(script).get(*outputs)\n", + "loss, acc, loss_val, acc_val" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + "# LeNet-like ConvNet" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Sanity Check: Overfit Small Portion" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "script = \"\"\"\n", + "source(\"convnet.dml\") as clf\n", + "\n", + "# Hyperparameters & Settings\n", + "lr = 1e-2 # learning rate\n", + "mu = 0.9 # momentum\n", + "decay = 0.999 # learning rate decay constant\n", + "lambda = 0 #5e-04\n", + "batch_size = 50\n", + "epochs = 300\n", + "log_interval = 1\n", + "dir = \"models/lenet-cnn/sanity/\"\n", + "n = 200 # sample size for overfitting sanity check\n", + "\n", + "# Train\n", + "[Wc1, bc1, Wc2, bc2, Wc3, bc3, Wa1, ba1, Wa2, ba2] = clf::train(X[1:n,], Y[1:n,], X[1:n,], Y[1:n,], C, Hin, Win, lr, mu, decay, lambda, batch_size, epochs, log_interval, dir)\n", + "\"\"\"\n", + "outputs = (\"Wc1\", \"bc1\", \"Wc2\", \"bc2\", \"Wc3\", \"bc3\", \"Wa1\", \"ba1\", \"Wa2\", \"ba2\")\n", + "script = (dml(script).input(X=X, X_val=X_val, Y=Y, Y_val=Y_val,\n", + " C=c, Hin=size, Win=size)\n", + " .output(*outputs))\n", + "Wc1, bc1, Wc2, bc2, Wc3, bc3, Wa1, ba1, Wa2, ba2 = ml.execute(script).get(*outputs)\n", + "Wc1, bc1, Wc2, bc2, Wc3, bc3, Wa1, ba1, Wa2, ba2" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Hyperparameter Search" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "script = \"\"\"\n", + "source(\"convnet.dml\") as clf\n", + "\n", + "dir = \"models/lenet-cnn/hyperparam-search/\"\n", + "\n", + "# TODO: Fix `parfor` so that 
it can be efficiently used for hyperparameter tuning\n", + "j = 1\n", + "while(j < 2) {\n", + "#parfor(j in 1:10000, par=6) {\n", + " # Hyperparameter Sampling & Settings\n", + " lr = 10 ^ as.scalar(rand(rows=1, cols=1, min=-7, max=-1)) # learning rate\n", + " mu = as.scalar(rand(rows=1, cols=1, min=0.5, max=0.9)) # momentum\n", + " decay = as.scalar(rand(rows=1, cols=1, min=0.9, max=1)) # learning rate decay constant\n", + " lambda = 10 ^ as.scalar(rand(rows=1, cols=1, min=-7, max=-1)) # regularization constant\n", + " batch_size = 50\n", + " epochs = 1\n", + " log_interval = 10\n", + " trial_dir = dir + \"j/\"\n", + "\n", + " # Train\n", + " [Wc1, bc1, Wc2, bc2, Wc3, bc3, Wa1, ba1, Wa2, ba2] = clf::train(X, Y, X_val, Y_val, C, Hin, Win, lr, mu, decay, lambda, batch_size, epochs, log_interval, trial_dir)\n", + "\n", + " # Eval\n", + " #probs = clf::predict(X, C, Hin, Win, Wc1, bc1, Wc2, bc2, Wc3, bc3, Wa1, ba1, Wa2, ba2)\n", + " #[loss, accuracy] = clf::eval(probs, Y)\n", + " probs_val = clf::predict(X_val, C, Hin, Win, Wc1, bc1, Wc2, bc2, Wc3, bc3, Wa1, ba1, Wa2, ba2)\n", + " [loss_val, accuracy_val] = clf::eval(probs_val, Y_val)\n", + "\n", + " # Save hyperparams\n", + " str = \"lr: \" + lr + \", mu: \" + mu + \", decay: \" + decay + \", lambda: \" + lambda + \", batch_size: \" + batch_size\n", + " name = dir + accuracy_val + \",\" + j #+\",\"+accuracy+\",\"+j\n", + " write(str, name)\n", + " j = j + 1\n", + "}\n", + "\"\"\"\n", + "script = (dml(script).input(X=X, X_val=X_val, Y=Y, Y_val=Y_val, C=c, Hin=size, Win=size))\n", + "ml.execute(script)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Train" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "script = \"\"\"\n", + "source(\"convnet.dml\") as clf\n", + "\n", + "# Hyperparameters & Settings\n", + "lr = 0.00205 # learning rate\n", + "mu = 0.632 # momentum\n", + "decay = 0.99 # learning rate decay constant\n", + "lambda = 0.00385\n", + "batch_size = 50\n", + "epochs = 1\n", + "log_interval = 10\n", + "dir = \"models/lenet-cnn/train/\"\n", + "\n", + "# Train\n", + "[Wc1, bc1, Wc2, bc2, Wc3, bc3, Wa1, ba1, Wa2, ba2] = clf::train(X, Y, X_val, Y_val, C, Hin, Win, lr, mu, decay, lambda, batch_size, epochs, log_interval, dir)\n", + "\"\"\"\n", + "outputs = (\"Wc1\", \"bc1\", \"Wc2\", \"bc2\", \"Wc3\", \"bc3\", \"Wa1\", \"ba1\", \"Wa2\", \"ba2\")\n", + "script = (dml(script).input(X=X, X_val=X_val, Y=Y, Y_val=Y_val,\n", + " C=c, Hin=size, Win=size)\n", + " .output(*outputs))\n", + "Wc1, bc1, Wc2, bc2, Wc3, bc3, Wa1, ba1, Wa2, ba2 = ml.execute(script).get(*outputs)\n", + "Wc1, bc1, Wc2, bc2, Wc3, bc3, Wa1, ba1, Wa2, ba2" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Eval" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "script = \"\"\"\n", + "source(\"convnet.dml\") as clf\n", + "\n", + "# Eval\n", + "probs = clf::predict(X, C, Hin, Win, Wc1, bc1, Wc2, bc2, Wc3, bc3, Wa1, ba1, Wa2, ba2)\n", + "[loss, accuracy] = clf::eval(probs, Y)\n", + "probs_val = clf::predict(X_val, C, Hin, Win, Wc1, bc1, Wc2, bc2, Wc3, bc3, Wa1, ba1, Wa2, ba2)\n", + "[loss_val, accuracy_val] = clf::eval(probs_val, Y_val)\n", + "\"\"\"\n", + "outputs = (\"loss\", \"accuracy\", \"loss_val\", \"accuracy_val\")\n", + "script = (dml(script).input(X=X, X_val=X_val, Y=Y, Y_val=Y_val,\n", + " C=c, Hin=size, Win=size,\n", + " Wc1=Wc1, 
bc1=bc1,\n", + " Wc2=Wc2, bc2=bc2,\n", + " Wc3=Wc3, bc3=bc3,\n", + " Wa1=Wa1, ba1=ba1,\n", + " Wa2=Wa2, ba2=ba2)\n", + " .output(*outputs))\n", + "loss, acc, loss_val, acc_val = ml.execute(script).get(*outputs)\n", + "loss, acc, loss_val, acc_val" + ] + } + ], + "metadata": { + "anaconda-cloud": {}, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.0" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +}
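As a quick illustration of how the MLContext pattern in the notebook above could be extended, the following hypothetical cell sketches a per-class confusion matrix for the softmax classifier on the validation set. It is a minimal sketch and not part of this commit: it assumes the `ml`, `X_val`, `Y_val`, `W`, and `b` objects produced by the cells above, and it reuses only the `clf::predict` function from the Eval cell and DML built-ins (`table`, `rowIndexMax`) already used elsewhere in the notebook.

```
# Hypothetical follow-up cell (sketch): confusion matrix on the validation set.
script = """
source("softmax_clf.dml") as clf

# Predict, then cross-tabulate predicted class vs. actual class.
probs_val = clf::predict(X_val, W, b)
num_tumor_classes = 3
confusion = table(rowIndexMax(probs_val), rowIndexMax(Y_val),
                  num_tumor_classes, num_tumor_classes)
"""
script = dml(script).input(X_val=X_val, Y_val=Y_val, W=W, b=b).output("confusion")
confusion = ml.execute(script).get("confusion")
confusion  # rows: predicted score, columns: actual score
```

The structure mirrors the Eval cell: build a DML script string, bind the SystemML matrices as inputs, register the output, and execute with `ml.execute`.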
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/3dc8f21c/projects/breast_cancer/Preprocessing.ipynb ---------------------------------------------------------------------- diff --git a/projects/breast_cancer/Preprocessing.ipynb b/projects/breast_cancer/Preprocessing.ipynb new file mode 100644 index 0000000..a54e862 --- /dev/null +++ b/projects/breast_cancer/Preprocessing.ipynb @@ -0,0 +1,904 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Predicting Breast Cancer Proliferation Scores with Apache Spark and Apache SystemML\n", + "## Preprocessing\n", + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Setup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "%matplotlib inline\n", + "\n", + "import math\n", + "\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import openslide\n", + "from openslide.deepzoom import DeepZoomGenerator\n", + "import pandas as pd\n", + "from pyspark.mllib.linalg import Vectors\n", + "from scipy.ndimage.morphology import binary_fill_holes\n", + "from skimage.color import rgb2gray\n", + "from skimage.feature import canny\n", + "from skimage.morphology import binary_closing, binary_dilation, disk\n", + "\n", + "plt.rcParams['figure.figsize'] = (10, 6)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Open Whole-Slide Image" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "def open_slide(slide_num, folder, training):\n", + " \"\"\"\n", + " Open a whole-slide image, given an image number.\n", + " \n", + " Args:\n", + " slide_num: Slide image number as an integer.\n", + " folder: Directory in which the slides folder is stored, as a string.\n", + " This should contain either a `training_image_data` folder with\n", + " images in the format `TUPAC-TR-###.svs`, or a `testing_image_data`\n", + " folder with images in the format `TUPAC-TE-###.svs`.\n", + " training: Boolean for training or testing datasets.\n", + " \n", + " Returns:\n", + " An OpenSlide object representing a whole-slide image.\n", + " \"\"\"\n", + " if training:\n", + " filename = os.path.join(folder, \"training_image_data\", \"TUPAC-TR-{}.svs\".format(str(slide_num).zfill(3)))\n", + " else:\n", + " # Testing images\n", + " filename = os.path.join(folder, \"testing_image_data\", \"TUPAC-TE-{}.svs\".format(str(slide_num).zfill(3)))\n", + " slide = openslide.open_slide(filename)\n", + " return slide" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Create Tile Generator" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def create_tile_generator(slide, tile_size=1024, overlap=0):\n", + " \"\"\"\n", + " Create a tile generator for the given slide.\n", + " \n", + " This generator is able to extract tiles from the overall\n", + " whole-slide image.\n", + " \n", + " Args:\n", + " slide: An OpenSlide object representing a whole-slide image.\n", + " tile_size: The width and height of a square tile to be generated.\n", + " overlap: Number of pixels by which to overlap the tiles.\n", + " \n", + " Returns:\n", + " A DeepZoomGenerator object representing the tile generator. 
Each\n", + " extracted tile is an Image with shape (tile_size, tile_size, channels).\n", + " Note: This generator is not a true \"Python generator function\", but\n", + " rather is an object that is capable of extracting individual tiles.\n", + " \"\"\"\n", + " generator = DeepZoomGenerator(slide, tile_size=tile_size, overlap=overlap, limit_bounds=True)\n", + " return generator" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Determine 20x Magnification Zoom Level" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "def get_20x_zoom_level(slide, generator):\n", + " \"\"\"\n", + " Return the zoom level that corresponds to a 20x magnification.\n", + " \n", + " The generator can extract tiles from multiple zoom levels, downsampling\n", + " by a factor of 2 per level from highest to lowest resolution.\n", + " \n", + " Args:\n", + " slide: An OpenSlide object representing a whole-slide image.\n", + " generator: A DeepZoomGenerator object representing a tile generator.\n", + " Note: This generator is not a true \"Python generator function\", but\n", + " rather is an object that is capable of extracting individual tiles.\n", + " \n", + " Returns:\n", + " Zoom level corresponding to a 20x magnification, or as close as possible.\n", + " \"\"\"\n", + " highest_zoom_level = generator.level_count - 1 # 0-based indexing\n", + " try:\n", + " mag = int(slide.properties[openslide.PROPERTY_NAME_OBJECTIVE_POWER])\n", + " # `mag / 20` gives the downsampling factor between the slide's\n", + " # magnification and the desired 20x magnification.\n", + " # `(mag / 20) / 2` gives the zoom level offset from the highest\n", + " # resolution level, based on a 2x downsampling factor in the\n", + " # generator.\n", + " offset = math.floor((mag / 20) / 2)\n", + " level = highest_zoom_level - offset\n", + " except ValueError:\n", + " # In case the slide magnification level is unknown, just\n", + " # use the highest resolution.\n", + " level = highest_zoom_level\n", + " return level" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Generate Tile Indices For Whole-Slide Image." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def process_slide(slide_num, folder, training, tile_size=1024, overlap=0):\n", + " \"\"\"\n", + " Generate all possible tile indices for a whole-slide image.\n", + " \n", + " Given a slide number, tile size, and overlap, generate\n", + " all possible (slide_num, tile_size, overlap, zoom_level, col, row)\n", + " indices.\n", + " \n", + " Args:\n", + " slide_num: Slide image number as an integer.\n", + " folder: Directory in which the slides folder is stored, as a string.\n", + " This should contain either a `training_image_data` folder with\n", + " images in the format `TUPAC-TR-###.svs`, or a `testing_image_data`\n", + " folder with images in the format `TUPAC-TE-###.svs`.\n", + " training: Boolean for training or testing datasets.\n", + " tile_size: The width and height of a square tile to be generated.\n", + " overlap: Number of pixels by which to overlap the tiles.\n", + " \n", + " Returns:\n", + " A list of (slide_num, tile_size, overlap, zoom_level, col, row)\n", + " integer index tuples representing possible tiles to extract.\n", + " \"\"\"\n", + " # Open slide.\n", + " slide = open_slide(slide_num, folder, training)\n", + " # Create tile generator.\n", + " generator = create_tile_generator(slide, tile_size, overlap)\n", + " # Get 20x zoom level.\n", + " zoom_level = get_20x_zoom_level(slide, generator)\n", + " # Generate all possible (zoom_level, col, row) tile index tuples.\n", + " cols, rows = generator.level_tiles[zoom_level]\n", + " tile_indices = [(slide_num, tile_size, overlap, zoom_level, col, row)\n", + " for col in range(cols) for row in range(rows)]\n", + " return tile_indices" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Generate Tile From Tile Index" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def process_tile_index(tile_index, folder, training):\n", + " \"\"\"\n", + " Generate a tile from a tile index.\n", + " \n", + " Given a (slide_num, tile_size, overlap, zoom_level, col, row) tile\n", + " index, generate a (slide_num, tile) tuple.\n", + " \n", + " Args:\n", + " tile_index: A (slide_num, tile_size, overlap, zoom_level, col, row)\n", + " integer index tuple representing a tile to extract.\n", + " folder: Directory in which the slides folder is stored, as a string.\n", + " This should contain either a `training_image_data` folder with\n", + " images in the format `TUPAC-TR-###.svs`, or a `testing_image_data`\n", + " folder with images in the format `TUPAC-TE-###.svs`.\n", + " training: Boolean for training or testing datasets.\n", + " \n", + " Returns:\n", + " A (slide_num, tile) tuple, where slide_num is an integer, and tile\n", + " is a 3D NumPy array of shape (tile_size, tile_size, channels) in\n", + " RGB format.\n", + " \"\"\"\n", + " slide_num, tile_size, overlap, zoom_level, col, row = tile_index\n", + " # Open slide.\n", + " slide = open_slide(slide_num, folder, training)\n", + " # Create tile generator.\n", + " generator = create_tile_generator(slide, tile_size, overlap)\n", + " # Generate tile\n", + " tile = np.array(generator.get_tile(zoom_level, (col, row)))\n", + " return (slide_num, tile)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Filter Tile For Dimensions & Tissue Threshold" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + 
"collapsed": false + }, + "outputs": [], + "source": [ + "def keep_tile(tile_tuple, tile_size=1024, tissue_threshold=0.9):\n", + " \"\"\"\n", + " Determine if a tile should be kept.\n", + " \n", + " This filters out tiles based on size and a tissue percentage\n", + " threshold, using a custom algorithm. If a tile has height &\n", + " width equal to (tile_size, tile_size), and contains greater\n", + " than or equal to the given percentage, then it will be kept;\n", + " otherwise it will be filtered out.\n", + " \n", + " Args:\n", + " tile_tuple: A (slide_num, tile) tuple, where slide_num is an\n", + " integer, and tile is a 3D NumPy array of shape \n", + " (tile_size, tile_size, channels) in RGB format.\n", + " tile_size: The width and height of a square tile to be generated.\n", + " tissue_threshold: Tissue percentage threshold.\n", + " \n", + " Returns:\n", + " A Boolean indicating whether or not a tile should be kept\n", + " for future usage.\n", + " \"\"\"\n", + " slide_num, tile = tile_tuple\n", + " if tile.shape[0:2] == (tile_size, tile_size):\n", + " # Convert 3D RGB image to 2D grayscale image, from\n", + " # 0 (dense tissue) to 1 (plain background).\n", + " tile = rgb2gray(tile)\n", + " # 8-bit depth complement, from 1 (dense tissue)\n", + " # to 0 (plain background).\n", + " tile = 1 - tile\n", + " # Canny edge detection with hysteresis thresholding.\n", + " # This returns a binary map of edges, with 1 equal to\n", + " # an edge. The idea is that tissue would be full of\n", + " # edges, while background would not.\n", + " tile = canny(tile)\n", + " # Binary closing, which is a dilation followed by\n", + " # an erosion. This removes small dark spots, which\n", + " # helps remove noise in the background.\n", + " tile = binary_closing(tile, disk(10))\n", + " # Binary dilation, which enlarges bright areas,\n", + " # and shrinks dark areas. 
This helps fill in holes\n", + " # within regions of tissue.\n", + " tile = binary_dilation(tile, disk(10))\n", + " # Fill remaining holes within regions of tissue.\n", + " tile = binary_fill_holes(tile)\n", + " # Calculate percentage of tissue coverage.\n", + " percentage = tile.mean()\n", + " return percentage >= tissue_threshold\n", + " else:\n", + " return False" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Generate Flattened Samples From Tile" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def process_tile(tile_tuple, sample_size=256, grayscale=False):\n", + " \"\"\"\n", + " Process a tile into a group of smaller samples.\n", + " \n", + " Cut up a tile into smaller blocks of sample_size x sample_size pixels,\n", + " change the shape of each sample from (H, W, channels) to \n", + " (channels, H, W), then flatten each into a vector of length\n", + " channels*H*W.\n", + " \n", + " Args:\n", + " tile_tuple: A (slide_num, tile) tuple, where slide_num is an\n", + " integer, and tile is a 3D NumPy array of shape \n", + " (tile_size, tile_size, channels).\n", + " sample_size: The new width and height of the square samples to be\n", + " generated.\n", + " grayscale: Whether or not to generate grayscale samples, rather\n", + " than RGB.\n", + " \n", + " Returns:\n", + " A list of (slide_num, sample) tuples representing cut up tiles,\n", + " where each sample has been transposed from\n", + " (sample_size_x, sample_size_y, channels) to (channels, sample_size_x, sample_size_y),\n", + " and flattened to a vector of length (channels*sample_size_x*sample_size_y).\n", + " \"\"\"\n", + " slide_num, tile = tile_tuple\n", + " if grayscale:\n", + " tile = rgb2gray(tile)[:, :, np.newaxis] # Grayscale\n", + " # Save disk space and future IO time by converting from [0,1] to [0,255],\n", + " # at the expense of some minor loss of information.\n", + " tile = np.round(tile * 255).astype(\"uint8\")\n", + " x, y, ch = tile.shape\n", + " # 1. Reshape into a 5D array of (num_x, sample_size_x, num_y, sample_size_y, ch), where\n", + " # num_x and num_y are the number of chopped tiles on the x and y axes, respectively.\n", + " # 2. Swap sample_size_x and num_y axes to create (num_x, num_y, sample_size_x, sample_size_y, ch).\n", + " # 3. Combine num_x and num_y into single axis, returning\n", + " # (num_samples, sample_size_x, sample_size_y, ch).\n", + " # 4. Swap axes from (num_samples, sample_size_x, sample_size_y, ch) to\n", + " # (num_samples, ch, sample_size_x, sample_size_y).\n", + " # 5. 
Flatten samples into (num_samples, ch*sample_size_x*sample_size_y).\n", + " samples = (tile.reshape((x // sample_size, sample_size, y // sample_size, sample_size, ch))\n", + " .swapaxes(1,2)\n", + " .reshape((-1, sample_size, sample_size, ch))\n", + " .transpose(0,3,1,2))\n", + " samples = samples.reshape(samples.shape[0], -1)\n", + " samples = [(slide_num, sample) for sample in list(samples)]\n", + " return samples" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Visualize Tile" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def visualize_tile(tile):\n", + " \"\"\"\n", + " Plot a tissue tile.\n", + " \n", + " Args:\n", + " tile: A 3D NumPy array of shape (tile_size, tile_size, channels).\n", + " \n", + " Returns:\n", + " None\n", + " \"\"\"\n", + " plt.imshow(tile)\n", + " plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Visualize Sample" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def visualize_sample(sample, size=256):\n", + " \"\"\"\n", + " Plot a tissue sample.\n", + " \n", + " Args:\n", + " sample: A square sample flattened to a vector of size\n", + " (channels*size_x*size_y).\n", + " size: The width and height of the square samples.\n", + " \n", + " Returns:\n", + " None\n", + " \"\"\"\n", + " # Change type, reshape, transpose to (size_x, size_y, channels).\n", + " length = sample.shape[0]\n", + " channels = int(length / (size * size))\n", + " if channels > 1:\n", + " sample = sample.astype('uint8').reshape((channels, size, size)).transpose(1,2,0)\n", + " plt.imshow(sample)\n", + " else:\n", + " vmax = 255 if sample.max() > 1 else 1\n", + " sample = sample.reshape((size, size))\n", + " plt.imshow(sample, cmap=\"gray\", vmin=0, vmax=vmax)\n", + " plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Get Ground Truth Labels" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "def create_ground_truth_maps(folder):\n", + " \"\"\"\n", + " Create lookup maps for ground truth labels.\n", + " \n", + " Args:\n", + " folder: Directory in which the slides folder is stored, as a string.\n", + " This should contain a `training_ground_truth.csv` file.\n", + " \"\"\"\n", + " filename = os.path.join(folder, \"training_ground_truth.csv\")\n", + " labels = pd.read_csv(filename, names=[\"tumor_score\",\"molecular_score\"], header=None)\n", + " labels[\"slide_num\"] = range(1, 501)\n", + "\n", + " # Create slide_num -> tumor_score, and slide_num -> molecular_score dictionaries\n", + " tumor_score_dict = {int(s): int(l) for s,l in zip(labels.slide_num, labels.tumor_score)}\n", + " molecular_score_dict = {int(s): float(l) for s,l in zip(labels.slide_num, labels.molecular_score)}\n", + " return tumor_score_dict, molecular_score_dict" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Process All Slides Into A Saved Spark DataFrame" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def preprocess(slide_nums, folder=\"data\", training=True, tile_size=1024, overlap=0,\n", + " tissue_threshold=0.9, sample_size=256, grayscale=False, num_partitions=20000):\n", + " \"\"\"\n", + " Preprocess a set of whole-slide images.\n", + " \n", 
+ " Preprocess a set of whole-slide images as follows:\n", + " 1. Tile the slides into tiles of size (tile_size, tile_size, 3).\n", + " 2. Filter the tiles to remove unnecessary tissue.\n", + " 3. Cut the remaining tiles into samples of size (sample_size, sample_size, ch),\n", + " where `ch` is 1 if `grayscale` is true, or 3 otherwise.\n", + " \n", + " Args:\n", + " slide_nums: List of whole-slide numbers to process.\n", + " folder: Directory in which the slides folder is stored, as a string.\n", + " This should contain either a `training_image_data` folder with\n", + " images in the format `TUPAC-TR-###.svs`, or a `testing_image_data`\n", + " folder with images in the format `TUPAC-TE-###.svs`.\n", + " training: Boolean for training or testing datasets.\n", + " tile_size: The width and height of a square tile to be generated.\n", + " overlap: Number of pixels by which to overlap the tiles.\n", + " tissue_threshold: Tissue percentage threshold for filtering.\n", + " sample_size: The new width and height of the square samples to be\n", + " generated.\n", + " grayscale: Whether or not to generate grayscale samples, rather\n", + " than RGB.\n", + " num_partitions: Number of partitions to use during processing.\n", + " \n", + " Returns:\n", + " A DataFrame in which each row contains the slide number, tumor score,\n", + " molecular score, and the sample stretched out into a Vector.\n", + " \"\"\"\n", + " slides = sc.parallelize(slide_nums)\n", + " # Force even partitioning by collecting and parallelizing -- for memory issues\n", + " ## HACK Note: This was a PySpark bug with a fix in the master branch now.\n", + " tile_indices = slides.flatMap(lambda slide: process_slide(slide, folder, training, tile_size, overlap)).collect()\n", + " tile_indices = sc.parallelize(tile_indices, num_partitions)\n", + " ## END HACK -- update later\n", + " tiles = tile_indices.map(lambda tile_index: process_tile_index(tile_index, folder, training))\n", + " filtered_tiles = tiles.filter(lambda tile: keep_tile(tile, tile_size, tissue_threshold))\n", + " samples = filtered_tiles.flatMap(lambda tile: process_tile(tile, sample_size, grayscale))\n", + " if training:\n", + " tumor_score_dict, molecular_score_dict = create_ground_truth_maps(folder)\n", + " samples_with_labels = (samples.map(lambda tup: \n", + " (tup[0], tumor_score_dict[tup[0]], molecular_score_dict[tup[0]],\n", + " Vectors.dense(tup[1]))))\n", + " df = samples_with_labels.toDF([\"slide_num\", \"tumor_score\", \"molecular_score\", \"sample\"])\n", + " df = df.select(df.slide_num.astype(\"int\"), df.tumor_score.astype(\"int\"), df.molecular_score, df[\"sample\"])\n", + " else: # testing data -- no labels\n", + " df = samples.toDF([\"slide_num\", \"sample\"])\n", + " df = df.select(df.slide_num.astype(\"int\"), df[\"sample\"])\n", + " df = df.repartition(num_partitions) # Even out the partitions\n", + " return df\n", + "\n", + "def save(df, training=True, sample_size=256, grayscale=False, mode=\"error\"):\n", + " \"\"\"\n", + " Save a preprocessed DataFrame of samples in Parquet format.\n", + " \n", + " The filename will be formatted as follows:\n", + " `samples_{labels|testing}_SAMPLE-SIZE[_grayscale].parquet`\n", + " \n", + " Args:\n", + " df: A DataFrame in which each row contains the slide number, tumor score,\n", + " molecular score, and the sample stretched out into a Vector.\n", + " training: Boolean for training or testing datasets.\n", + " sample_size: The width and height of the square samples.\n", + " grayscale: Whether or not to the samples are in 
grayscale format, rather\n", + " than RGB.\n", + " mode: Specifies the behavior of `df.write.mode` when the data already exists.\n", + " Options include:\n", + " * `append`: Append contents of this :class:`DataFrame` to existing data.\n", + " * `overwrite`: Overwrite existing data.\n", + " * `error`: Throw an exception if data already exists.\n", + " * `ignore`: Silently ignore this operation if data already exists.\n", + " \"\"\"\n", + " filename = \"samples_{}_{}{}.parquet\".format(\"labels\" if training else \"testing\",\n", + " sample_size,\n", + " \"_grayscale\" if grayscale else \"\")\n", + " filepath = os.path.join(\"data\", filename)\n", + " df.write.mode(mode).save(filepath, format=\"parquet\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Execute Preprocessing & Save" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# TODO: Filtering tiles and then cutting into samples could result\n", + "# in samples with less tissue than desired, despite that being the\n", + "# procedure of the paper. Look into simply selecting tiles of the\n", + "# desired size to begin with." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Get list of image numbers, minus the broken ones\n", + "broken = {2, 45, 91, 112, 242, 256, 280, 313, 329, 467}\n", + "slide_nums = sorted(set(range(1,501)) - broken)\n", + "\n", + "# Settings\n", + "training = True\n", + "sample_size=64\n", + "grayscale = True\n", + "num_partitions = 20000\n", + "folder = \"/home/MDM/breast_cancer/data\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# Process all slides\n", + "df = preprocess(slide_nums, sample_size=sample_size, grayscale=grayscale,\n", + " training=training, num_partitions=num_partitions, folder=folder)\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# Save DataFrame of samples\n", + "save(df, sample_size=sample_size, grayscale=grayscale, training=training)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Split Into Separate Train & Validation DataFrames Based On Slide Number" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### TODO: Wrap this in a function with appropriate default arguments" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "filename = \"samples_{}_{}{}.parquet\".format(\"labels\" if training else \"testing\",\n", + " sample_size,\n", + " \"_grayscale\" if grayscale else \"\")\n", + "filepath = os.path.join(\"data\", filename)\n", + "df = sqlContext.read.load(filepath)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "labels = pd.read_csv(\"data/training_ground_truth.csv\", names=[\"tumor_score\",\"molecular_score\"], header=None)\n", + "labels[\"slide_num\"] = range(1, 501)\n", + "\n", + "# Create slide_num -> tumor_score and slide_num -> molecular_score dictionaries\n", + "tumor_score_dict = {int(s): int(l) for s,l 
in zip(labels.slide_num, labels.tumor_score)}\n", + "molecular_score_dict = {int(s): float(l) for s,l in zip(labels.slide_num, labels.molecular_score)}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "print(labels[\"tumor_score\"].value_counts() / labels.tumor_score.count())\n", + "print(labels[labels.slide_num > 400][\"tumor_score\"].value_counts() / labels[labels.slide_num > 400].tumor_score.count())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "train = (df.where(df.slide_num <= 400)\n", + " .rdd\n", + " .zipWithIndex()\n", + " .map(lambda r: (r[1] + 1, *r[0]))\n", + " .toDF(['__INDEX', 'slide_num', 'tumor_score', 'molecular_score', 'sample']))\n", + "train = train.select(train[\"__INDEX\"].astype(\"int\"), train.slide_num.astype(\"int\"), train.tumor_score.astype(\"int\"),\n", + " train.molecular_score, train[\"sample\"])\n", + "\n", + "val = (df.where(df.slide_num > 400)\n", + " .rdd\n", + " .zipWithIndex()\n", + " .map(lambda r: (r[1] + 1, *r[0]))\n", + " .toDF(['__INDEX', 'slide_num', 'tumor_score', 'molecular_score', 'sample']))\n", + "val = val.select(val[\"__INDEX\"].astype(\"int\"), val.slide_num.astype(\"int\"), val.tumor_score.astype(\"int\"),\n", + " val.molecular_score, val[\"sample\"])\n", + "\n", + "train, val" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Write\n", + "# TODO: Wrap this in a function with appropriate default arguments\n", + "mode = \"error\"\n", + "tr_filename = os.path.join(\"data\", \"train_{}{}.parquet\".format(sample_size, \"_grayscale\" if grayscale else \"\"))\n", + "val_filename = os.path.join(\"data\", \"val_{}{}.parquet\".format(sample_size, \"_grayscale\" if grayscale else \"\"))\n", + "train.write.mode(mode).save(tr_filename, format=\"parquet\")\n", + "val.write.mode(mode).save(val_filename, format=\"parquet\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Sample Data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### TODO: Wrap this in a function with appropriate default arguments" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "tr_filename = os.path.join(\"data\", \"train_{}{}.parquet\".format(sample_size, \"_grayscale\" if grayscale else \"\"))\n", + "val_filename = os.path.join(\"data\", \"val_{}{}.parquet\".format(sample_size, \"_grayscale\" if grayscale else \"\"))\n", + "train = sqlContext.read.load(tr_filename)\n", + "val = sqlContext.read.load(val_filename)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# Take a stratified sample\n", + "p=0.01\n", + "train_sample = train.drop(\"__INDEX\").sampleBy(\"tumor_score\", fractions={1: p, 2: p, 3: p}, seed=42)\n", + "val_sample = val.drop(\"__INDEX\").sampleBy(\"tumor_score\", fractions={1: p, 2: p, 3: p}, seed=42)\n", + "train_sample, val_sample" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# TODO: turn this into a function\n", + "# Repartition to get ~128MB partitions\n", + "\n", + "# 
TODO: Update logic to use the following to automatically\n", + "# select the number of partitions:\n", + "# ex_mb = SIZE*SIZE*CHANNELS * 8 / 1024 / 1024 # size of one example in MB\n", + "# ideal_part_size_mb = 128 # 128 MB partitions sizes are empirically ideal\n", + "# ideal_exs_per_part = round(ideal_part_size_mb / ex_mb)\n", + "# tr_parts = round(tc / ideal_exs_per_part)\n", + "# val_parts = round(vc / ideal_exs_per_part)\n", + "\n", + "if grayscale:\n", + " train_sample = train_sample.repartition(150) #300) #3000)\n", + " val_sample = val_sample.repartition(40) #80) #800)\n", + "else: # 3x\n", + " train_sample = train_sample.repartition(450) #900) #9000)\n", + " val_sample = val_sample.repartition(120) #240) #2400)\n", + "\n", + "# Reassign row indices\n", + "train_sample = (\n", + " train_sample.rdd\n", + " .zipWithIndex()\n", + " .map(lambda r: (r[1] + 1, *r[0]))\n", + " .toDF(['__INDEX', 'slide_num', 'tumor_score', 'molecular_score', 'sample']))\n", + "train_sample = train_sample.select(train_sample[\"__INDEX\"].astype(\"int\"), train_sample.slide_num.astype(\"int\"), \n", + " train_sample.tumor_score.astype(\"int\"), train_sample.molecular_score, train_sample[\"sample\"])\n", + "\n", + "val_sample = (\n", + " val_sample.rdd\n", + " .zipWithIndex()\n", + " .map(lambda r: (r[1] + 1, *r[0]))\n", + " .toDF(['__INDEX', 'slide_num', 'tumor_score', 'molecular_score', 'sample']))\n", + "val_sample = val_sample.select(val_sample[\"__INDEX\"].astype(\"int\"), val_sample.slide_num.astype(\"int\"), \n", + " val_sample.tumor_score.astype(\"int\"), val_sample.molecular_score, val_sample[\"sample\"])\n", + "\n", + "train_sample, val_sample\n", + "\n", + "# Write\n", + "# TODO: Wrap this in a function with appropriate default arguments\n", + "mode = \"error\"\n", + "tr_sample_filename = os.path.join(\"data\", \"train_{}_sample_{}{}.parquet\".format(p, sample_size, \"_grayscale\" if grayscale else \"\"))\n", + "val_sample_filename = os.path.join(\"data\", \"val_{}_sample_{}{}.parquet\".format(p, sample_size, \"_grayscale\" if grayscale else \"\"))\n", + "train_sample.write.mode(mode).save(tr_sample_filename, format=\"parquet\")\n", + "val_sample.write.mode(mode).save(val_sample_filename, format=\"parquet\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.0" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/3dc8f21c/projects/breast_cancer/README.md ---------------------------------------------------------------------- diff --git a/projects/breast_cancer/README.md b/projects/breast_cancer/README.md new file mode 100644 index 0000000..fc1ec0d --- /dev/null +++ b/projects/breast_cancer/README.md @@ -0,0 +1,137 @@ +<!-- +{% comment %} +Licensed to the Apache Software Foundation (ASF) under one or more +contributor license agreements. See the NOTICE file distributed with +this work for additional information regarding copyright ownership. +The ASF licenses this file to you under the Apache License, Version 2.0 +(the "License"); you may not use this file except in compliance with +the License. 
You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +{% endcomment %} +--> + +# Predicting Breast Cancer Proliferation Scores with Apache Spark and Apache SystemML + +Note: This project is still a **work in progress**. + +## Overview +The [Tumor Proliferation Assessment Challenge 2016 (TUPAC16)](http://tupac.tue-image.nl/) is a "Grand Challenge" that was created for the [2016 Medical Image Computing and Computer Assisted Intervention (MICCAI 2016)](http://miccai2016.org/en/) conference. In this challenge, the goal is to develop state-of-the-art algorithms for automatic prediction of tumor proliferation scores from whole-slide histopathology images of breast tumors. + +## Background +Breast cancer is the leading cause of cancerous death in women in less-developed countries, and is the second leading cause of cancerous death in developed countries, accounting for 29% of all cancers in women within the U.S. [1]. Survival rates increase as early detection increases, giving incentive for pathologists and the medical world at large to develop improved methods for even earlier detection [2]. There are many forms of breast cancer including Ductal Carcinoma in Situ (DCIS), Invasive Ductal Carcinoma (IDC), Tubular Carcinoma of the Breast, Medullary Carcinoma of the Breast, Invasive Lobular Carcinoma, Inflammatory Breast Cancer and several others [3]. Within all of these forms of breast cancer, the rate at which breast cancer cells grow (proliferation) is a strong indicator of a patient's prognosis. Although there are many means of determining the presence of breast cancer, tumor proliferation speed has been proven to help pathologists determine the treatment for the patient. The most common technique for determining the proliferation speed is through mitotic count (mitotic index) estimates, in which a pathologist counts the dividing cell nuclei in hematoxylin and eosin (H&E) stained slide preparations to determine the number of mitotic bodies. Given this, the pathologist produces a proliferation score of either 1, 2, or 3, ranging from better to worse prognosis [4]. Unfortunately, this approach is known to have reproducibility problems due to the variability in counting, as well as the difficulty in distinguishing between different grades. + +References: +[1] http://emedicine.medscape.com/article/1947145-overview#a3 +[2] http://emedicine.medscape.com/article/1947145-overview#a7 +[3] http://emedicine.medscape.com/article/1954658-overview +[4] http://emedicine.medscape.com/article/1947145-workup#c12 + +## Goal & Approach +In an effort to automate the process of classification, this project aims to develop a large-scale deep learning approach for predicting tumor scores directly from the pixels of whole-slide histopathology images. Our proposed approach is based on a recent research paper from Stanford [1]. Starting with 500 extremely high-resolution tumor slide images with accompanying score labels, we aim to make use of Apache Spark in a preprocessing step to cut and filter the images into smaller square samples, generating 4.7 million samples for a total of ~7TB of data [2]. 
We then utilize Apache SystemML on top of Spark to develop and train a custom, large-scale, deep convolutional neural network on these samples, making use of the familiar linear algebra syntax and automatically-distributed execution of SystemML [3]. Our model takes as input the pixel values of the individual samples, and is trained to predict the correct tumor score classification for each one. In addition to distributed l inear algebra, we aim to exploit task-parallelism via parallel for-loops for hyperparameter optimization, as well as hardware acceleration for faster training via a GPU-backed runtime. Ultimately, we aim to develop a model that is sufficiently stronger than existing approaches for the task of breast cancer tumor proliferation score classification. + +References: +[1] https://web.stanford.edu/group/rubinlab/pubs/2243353.pdf +[2] See [`Preprocessing.ipynb`](Preprocessing.ipynb). +[3] See [`MachineLearning.ipynb`](MachineLearning.ipynb), [`softmax_clf.dml`](softmax_clf.dml), and [`convnet.dml`](convnet.dml). + + + +--- + +## Setup (*All nodes* unless other specified): +* System Packages: + * `sudo yum update` + * `sudo yum install gcc ruby` +* Python 3: + * `sudo yum install epel-release` + * `sudo yum install -y https://centos7.iuscommunity.org/ius-release.rpm` + * `sudo yum install -y python35u python35u-libs python35u-devel python35u-pip` + * `ln -s /usr/bin/python3.5 ~/.local/bin/python3` + * `ln -s /usr/bin/pip3.5 ~/.local/bin/pip3` + * Prepend `~/.local/bin` to the `PATH`. +* OpenSlide: + * `sudo yum install openslide` +* Python packages: + * `pip3 install -U matplotlib numpy pandas scipy jupyter ipython scikit-learn scikit-image flask openslide-python` +* SystemML (only driver): + * `git clone https://github.com/apache/incubator-systemml.git` + * `cd incubator-systemml` + * `mvn clean package` + * `pip3 install -e src/main/python` +* Create a `data` folder with the following contents (same location on *all* nodes): + * `training_image_data` folder with the training slides. + * `testing_image_data` folder with the testing slides. + * `training_ground_truth.csv` file containing the tumor & molecular scores for each slide. +* Create a project folder (i.e. `breast_cancer`) with the following contents (only driver): + * All notebooks (`*.ipynb`). + * All DML scripts (`*.dml`). + * SystemML-NN installed as an `nn` folder containing the contents of `$SYSTEMML_HOME/scripts/staging/SystemML-NN/nn` (either copy & paste, or use a softlink). + * The `data` folder (or a softlink pointing to it). + * Layout: + + ``` + - MachineLearning.ipynb + - Preprocessing.ipynb + - ... + - data/ + - training_ground_truth.csv + - training_image_data + - TUPAC-TR-001.svs + - TUPAC-TR-002.svs + - ... + - testing_image_data + - TUPAC-TE-001.svs + - TUPAC-TE-002.svs + - ... + ``` + +* Adjust the Spark settings in `$SPARK_HOME/conf/spark-defaults.conf` using the following examples, depending on the job being executed: + * All jobs: + ``` + # Use most of the driver memory. + spark.driver.memory 70g + # Remove the max result size constraint. + spark.driver.maxResultSize 0 + # Increase the message size. + spark.akka.frameSize 128 + # Extend the network timeout threshold. + spark.network.timeout 1000s + # Setup some extra Java options for performance. + spark.driver.extraJavaOptions -server -Xmn12G + spark.executor.extraJavaOptions -server -Xmn12G + # Setup local directories on separate disks for intermediate read/write performance, if running + # on Spark Standalone clusters. 
+ spark.local.dirs /disk2/local,/disk3/local,/disk4/local,/disk5/local,/disk6/local,/disk7/local,/disk8/local,/disk9/local,/disk10/local,/disk11/local,/disk12/local + ``` + + * Preprocessing: + ``` + # Save 1/2 executor memory for Python processes + spark.executor.memory 50g + ``` + + * Machine Learning: + ``` + # Use all executor memory for JVM + spark.executor.memory 100g + ``` + +* Start Jupyter + PySpark with the following command (could also use Yarn in client mode with `--master yarn --deploy-mode`): + ``` + PYSPARK_PYTHON=python3 PYSPARK_DRIVER_PYTHON=jupyter PYSPARK_DRIVER_PYTHON_OPTS="notebook" pyspark --master spark://MASTER_URL:7077 --driver-class-path $SYSTEMML_HOME/target/SystemML.jar --jars $SYSTEMML_HOME/target/SystemML.jar + ``` + +## Create a Histopath slide "lab" to view the slides (just driver): + - `git clone https://github.com/openslide/openslide-python.git` + - Host locally: + - `python3 path/to/openslide-python/examples/deepzoom/deepzoom_multiserver.py path/to/data/` + - Host on server: + - `python3 path/to/openslide-python/examples/deepzoom/deepzoom_multiserver.py -l HOSTING_URL_HERE path/to/data/` + - Open local browser to `HOSTING_URL_HERE:5000`. http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/3dc8f21c/projects/breast_cancer/convnet.dml ---------------------------------------------------------------------- diff --git a/projects/breast_cancer/convnet.dml b/projects/breast_cancer/convnet.dml new file mode 100644 index 0000000..5f115a2 --- /dev/null +++ b/projects/breast_cancer/convnet.dml @@ -0,0 +1,470 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements.  See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership.  The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License.  You may obtain a copy of the License at +# +#   http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied.  See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +#------------------------------------------------------------- + +/* + * Breast Cancer LeNet-like ConvNet Model + */ +# Imports +source("nn/layers/affine.dml") as affine +source("nn/layers/conv_builtin.dml") as conv +source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss +source("nn/layers/dropout.dml") as dropout +source("nn/layers/l2_reg.dml") as l2_reg +source("nn/layers/max_pool_builtin.dml") as max_pool +source("nn/layers/relu.dml") as relu +source("nn/layers/softmax.dml") as softmax +#source("nn/optim/adam.dml") as adam +source("nn/optim/sgd_nesterov.dml") as sgd_nesterov + +train = function(matrix[double] X, matrix[double] Y, + matrix[double] X_val, matrix[double] Y_val, + int C, int Hin, int Win, + double lr, double mu, double decay, double lambda, + int batch_size, int epochs, int log_interval, + string checkpoint_dir) + return (matrix[double] Wc1, matrix[double] bc1, + matrix[double] Wc2, matrix[double] bc2, + matrix[double] Wc3, matrix[double] bc3, + matrix[double] Wa1, matrix[double] ba1, + matrix[double] Wa2, matrix[double] ba2) { + /* + * Trains a convolutional net using a "LeNet"-like architecture. + * + * The input matrix, X, has N examples, each represented as a 3D + * volume unrolled into a single vector. The targets, Y, have K + * classes, and are one-hot encoded. + * + * Inputs: + * - X: Input data matrix, of shape (N, C*Hin*Win). + * - Y: Target matrix, of shape (N, K). + * - X_val: Input validation data matrix, of shape (N, C*Hin*Win). + * - Y_val: Target validation matrix, of shape (N, K). + * - C: Number of input channels (dimensionality of input depth). + * - Hin: Input height. + * - Win: Input width. + * - lr: Learning rate. + * - mu: Momentum value. + * Typical values are in the range of [0.5, 0.99], usually + * started at the lower end and annealed towards the higher end. + * - decay: Learning rate decay rate. + * - lambda: Regularization strength. + * - batch_size: Size of mini-batches to train on. + * - epochs: Total number of full training loops over the full data set. + * - log_interval: Interval, in iterations, between log outputs. + * - checkpoint_dir: Directory to store model checkpoints. + * + * Outputs: + * - Wc1: 1st layer weights (parameters) matrix, of shape (F1, C*Hf*Wf). + * - bc1: 1st layer biases vector, of shape (F1, 1). + * - Wc2: 2nd layer weights (parameters) matrix, of shape (F2, F1*Hf*Wf). + * - bc2: 2nd layer biases vector, of shape (F2, 1). + * - Wc3: 3rd layer weights (parameters) matrix, of shape (F2*(Hin/4)*(Win/4), N3). + * - bc3: 3rd layer biases vector, of shape (1, N3). + * - Wa2: 4th layer weights (parameters) matrix, of shape (N3, K). + * - ba2: 4th layer biases vector, of shape (1, K). 
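+    *  - Wa1: 1st affine (fully-connected) layer weights matrix, of shape (F3*(Hin/2^3)*(Win/2^3), N1).
+    *  - ba1: 1st affine (fully-connected) layer biases vector, of shape (1, N1).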
+ */ + N = nrow(X) + K = ncol(Y) + + # Create network: + # conv1 -> relu1 -> pool1 -> conv2 -> relu2 -> pool2 -> conv3 -> relu3 -> pool3 + # -> affine1 -> relu1 -> dropout1 -> affine2 -> softmax + Hf = 3 # filter height + Wf = 3 # filter width + stride = 1 + pad = 1 # For same dimensions, (Hf - stride) / 2 + F1 = 32 # num conv filters in conv1 + F2 = 32 # num conv filters in conv2 + F3 = 32 # num conv filters in conv3 + N1 = 512 # num nodes in affine1 + # Note: affine2 has K nodes, which is equal to the number of target dimensions (num classes) + [Wc1, bc1] = conv::init(F1, C, Hf, Wf) # inputs: (N, C*Hin*Win) + [Wc2, bc2] = conv::init(F2, F1, Hf, Wf) # inputs: (N, F1*(Hin/2)*(Win/2)) + [Wc3, bc3] = conv::init(F3, F2, Hf, Wf) # inputs: (N, F2*(Hin/2^2)*(Win/2^2)) + [Wa1, ba1] = affine::init(F3*(Hin/2^3)*(Win/2^3), N1) # inputs: (N, F3*(Hin/2^3)*(Win/2^3)) + [Wa2, ba2] = affine::init(N1, K) # inputs: (N, N1) + Wa2 = Wa2 / sqrt(2) # different initialization, since being fed into softmax, instead of relu + + # TODO: Compare optimizers once training is faster. + # Initialize SGD w/ Nesterov momentum optimizer + vWc1 = sgd_nesterov::init(Wc1); vbc1 = sgd_nesterov::init(bc1) + vWc2 = sgd_nesterov::init(Wc2); vbc2 = sgd_nesterov::init(bc2) + vWc3 = sgd_nesterov::init(Wc3); vbc3 = sgd_nesterov::init(bc3) + vWa1 = sgd_nesterov::init(Wa1); vba1 = sgd_nesterov::init(ba1) + vWa2 = sgd_nesterov::init(Wa2); vba2 = sgd_nesterov::init(ba2) + #[mWc1, vWc1] = adam::init(Wc1) # optimizer 1st & 2nd moment state for Wc1 + #[mbc1, vbc1] = adam::init(bc1) # optimizer 1st & 2nd moment state for bc1 + #[mWc2, vWc2] = adam::init(Wc2) # optimizer 1st & 2nd moment state for Wc2 + #[mbc2, vbc2] = adam::init(bc2) # optimizer 1st & 2nd moment state for bc2 + #[mWc3, vWc3] = adam::init(Wc3) # optimizer 1st & 2nd moment state for Wc3 + #[mbc3, vbc3] = adam::init(bc3) # optimizer 1st & 2nd moment state for bc3 + #[mWa1, vWa1] = adam::init(Wa1) # optimizer 1st & 2nd moment state for Wa1 + #[mba1, vba1] = adam::init(ba1) # optimizer 1st & 2nd moment state for ba1 + #[mWa2, vWa2] = adam::init(Wa2) # optimizer 1st & 2nd moment state for Wa2 + #[mba2, vba2] = adam::init(ba2) # optimizer 1st & 2nd moment state for ba2 + #beta1 = 0.9 + #beta2 = 0.999 + #eps = 1e-8 + + # TODO: Enable starting val metrics once fast, distributed predictions are available. 
+ # Starting validation loss & accuracy + #probs_val = predict(X_val, C, Hin, Win, Wc1, bc1, Wc2, bc2, Wc3, bc3, Wa1, ba1, Wa2, ba2) + #loss_val = cross_entropy_loss::forward(probs_val, Y_val) + #accuracy_val = mean(rowIndexMax(probs_val) == rowIndexMax(Y_val)) + ## Output results + #print("Start: Val Loss: " + loss_val + ", Val Accuracy: " + accuracy_val) + + # Optimize + print("Starting optimization") + iters = ceil(N / batch_size) + for (e in 1:epochs) { + for(i in 1:iters) { + # Get next batch + beg = ((i-1) * batch_size) %% N + 1 + end = min(N, beg + batch_size - 1) + X_batch = X[beg:end,] + y_batch = Y[beg:end,] + + # Compute forward pass + ## conv layer 1: conv1 -> relu1 -> pool1 + [outc1, Houtc1, Woutc1] = conv::forward(X_batch, Wc1, bc1, C, Hin, Win, Hf, Wf, stride, stride, pad, pad) + outc1r = relu::forward(outc1) + [outc1p, Houtc1p, Woutc1p] = max_pool::forward(outc1r, F1, Houtc1, Woutc1, Hf=2, Wf=2, strideh=2, stridew=2) + ## conv layer 2: conv2 -> relu2 -> pool2 + [outc2, Houtc2, Woutc2] = conv::forward(outc1p, Wc2, bc2, F1, Houtc1p, Woutc1p, Hf, Wf, stride, stride, pad, pad) + outc2r = relu::forward(outc2) + [outc2p, Houtc2p, Woutc2p] = max_pool::forward(outc2r, F2, Houtc2, Woutc2, Hf=2, Wf=2, strideh=2, stridew=2) + ## conv layer 3: conv3 -> relu3 -> pool3 + [outc3, Houtc3, Woutc3] = conv::forward(outc2p, Wc3, bc3, F2, Houtc2p, Woutc2p, Hf, Wf, stride, stride, pad, pad) + outc3r = relu::forward(outc3) + [outc3p, Houtc3p, Woutc3p] = max_pool::forward(outc3r, F3, Houtc3, Woutc3, Hf=2, Wf=2, strideh=2, stridew=2) + ## affine layer 1: affine1 -> relu1 -> dropout1 + outa1 = affine::forward(outc3p, Wa1, ba1) + outa1r = relu::forward(outa1) + [outa1d, maskad1] = dropout::forward(outa1r, 0.5, -1) + ## affine layer 2: affine2 -> softmax + outa2 = affine::forward(outa1d, Wa2, ba2) + probs = softmax::forward(outa2) + + # Compute data backward pass + ## loss: + dprobs = cross_entropy_loss::backward(probs, y_batch) + ## affine layer 2: affine2 -> softmax + douta2 = softmax::backward(dprobs, outa2) + [douta1d, dWa2, dba2] = affine::backward(douta2, outa1d, Wa2, ba2) + ## affine layer 1: affine1 -> relu1 -> dropout + douta1r = dropout::backward(douta1d, outa1r, 0.5, maskad1) + douta1 = relu::backward(douta1r, outa1) + [doutc3p, dWa1, dba1] = affine::backward(douta1, outc3p, Wa1, ba1) + ## conv layer 3: conv3 -> relu3 -> pool3 + doutc3r = max_pool::backward(doutc3p, Houtc3p, Woutc3p, outc3r, F3, Houtc3, Woutc3, Hf=2, Wf=2, strideh=2, stridew=2) + doutc3 = relu::backward(doutc3r, outc3) + [doutc2p, dWc3, dbc3] = conv::backward(doutc3, Houtc3, Woutc3, outc2p, Wc3, bc3, F2, Houtc2p, Woutc2p, Hf, Wf, stride, stride, pad, pad) + ## conv layer 2: conv2 -> relu2 -> pool2 + doutc2r = max_pool::backward(doutc2p, Houtc2p, Woutc2p, outc2r, F2, Houtc2, Woutc2, Hf=2, Wf=2, strideh=2, stridew=2) + doutc2 = relu::backward(doutc2r, outc2) + [doutc1p, dWc2, dbc2] = conv::backward(doutc2, Houtc2, Woutc2, outc1p, Wc2, bc2, F1, Houtc1p, Woutc1p, Hf, Wf, stride, stride, pad, pad) + ## conv layer 1: conv1 -> relu1 -> pool1 + doutc1r = max_pool::backward(doutc1p, Houtc1p, Woutc1p, outc1r, F1, Houtc1, Woutc1, Hf=2, Wf=2, strideh=2, stridew=2) + doutc1 = relu::backward(doutc1r, outc1) + [dX_batch, dWc1, dbc1] = conv::backward(doutc1, Houtc1, Woutc1, X_batch, Wc1, bc1, C, Hin, Win, Hf, Wf, stride, stride, pad, pad) + + # Compute regularization backward pass + dWc1_reg = l2_reg::backward(Wc1, lambda) + dWc2_reg = l2_reg::backward(Wc2, lambda) + dWc3_reg = 
l2_reg::backward(Wc3, lambda) + dWa1_reg = l2_reg::backward(Wa1, lambda) + dWa2_reg = l2_reg::backward(Wa2, lambda) + dWc1 = dWc1 + dWc1_reg + dWc2 = dWc2 + dWc2_reg + dWc3 = dWc3 + dWc3_reg + dWa1 = dWa1 + dWa1_reg + dWa2 = dWa2 + dWa2_reg + + # Optimize with SGD w/ Nesterov momentum + [Wc1, vWc1] = sgd_nesterov::update(Wc1, dWc1, lr, mu, vWc1) + [bc1, vbc1] = sgd_nesterov::update(bc1, dbc1, lr, mu, vbc1) + [Wc2, vWc2] = sgd_nesterov::update(Wc2, dWc2, lr, mu, vWc2) + [bc2, vbc2] = sgd_nesterov::update(bc2, dbc2, lr, mu, vbc2) + [Wc3, vWc3] = sgd_nesterov::update(Wc3, dWc3, lr, mu, vWc3) + [bc3, vbc3] = sgd_nesterov::update(bc3, dbc3, lr, mu, vbc3) + [Wa1, vWa1] = sgd_nesterov::update(Wa1, dWa1, lr, mu, vWa1) + [ba1, vba1] = sgd_nesterov::update(ba1, dba1, lr, mu, vba1) + [Wa2, vWa2] = sgd_nesterov::update(Wa2, dWa2, lr, mu, vWa2) + [ba2, vba2] = sgd_nesterov::update(ba2, dba2, lr, mu, vba2) + #t = e*i - 1 + #[Wc1, mWc1, vWc1] = adam::update(Wc1, dWc1, lr, beta1, beta2, eps, t, mWc1, vWc1) + #[bc1, mbc1, vbc1] = adam::update(bc1, dbc1, lr, beta1, beta2, eps, t, mbc1, vbc1) + #[Wc2, mWc2, vWc2] = adam::update(Wc2, dWc2, lr, beta1, beta2, eps, t, mWc2, vWc2) + #[bc2, mbc2, vbc2] = adam::update(bc2, dbc2, lr, beta1, beta2, eps, t, mbc2, vbc2) + #[Wc3, mWc3, vWc3] = adam::update(Wc3, dWc3, lr, beta1, beta2, eps, t, mWc3, vWc3) + #[bc3, mbc3, vbc3] = adam::update(bc3, dbc3, lr, beta1, beta2, eps, t, mbc3, vbc3) + #[Wa1, mWa1, vWa1] = adam::update(Wa1, dWa1, lr, beta1, beta2, eps, t, mWa1, vWa1) + #[ba1, mba1, vba1] = adam::update(ba1, dba1, lr, beta1, beta2, eps, t, mba1, vba1) + #[Wa2, mWa2, vWa2] = adam::update(Wa2, dWa2, lr, beta1, beta2, eps, t, mWa2, vWa2) + #[ba2, mba2, vba2] = adam::update(ba2, dba2, lr, beta1, beta2, eps, t, mba2, vba2) + + # Compute loss & accuracy for training & validation data every `log_interval` iterations. + if (i %% log_interval == 0) { + # Compute training loss & accuracy + loss_data = cross_entropy_loss::forward(probs, y_batch) + loss_reg_Wc1 = l2_reg::forward(Wc1, lambda) + loss_reg_Wc2 = l2_reg::forward(Wc2, lambda) + loss_reg_Wc3 = l2_reg::forward(Wc3, lambda) + loss_reg_Wa1 = l2_reg::forward(Wa1, lambda) + loss_reg_Wa2 = l2_reg::forward(Wa2, lambda) + loss = loss_data + loss_reg_Wc1 + loss_reg_Wc2 + loss_reg_Wc3 + loss_reg_Wa1 + loss_reg_Wa2 + accuracy = mean(rowIndexMax(probs) == rowIndexMax(y_batch)) + + # TODO: Consider enabling val metrics here once fast, distributed predictions are available. 
+ ## Compute validation loss & accuracy + #probs_val = predict(X_val, C, Hin, Win, Wc1, bc1, Wc2, bc2, Wc3, bc3, Wa1, ba1, Wa2, ba2) + #loss_val = cross_entropy_loss::forward(probs_val, Y_val) + #accuracy_val = mean(rowIndexMax(probs_val) == rowIndexMax(Y_val)) + + ## Output results + #print("Epoch: " + e + ", Iter: " + i + ", Train Loss: " + loss + ", Train Accuracy: " + # + accuracy + ", Val Loss: " + loss_val + ", Val Accuracy: " + accuracy_val + # + ", lr: " + lr + ", mu " + mu) + # Output results + print("Epoch: " + e + "/" + epochs + ", Iter: " + i + "/" + iters + + ", Train Loss: " + loss + ", Train Accuracy: " + accuracy) + } + } + + # Compute validation loss & accuracy for validation data every epoch + probs_val = predict(X_val, C, Hin, Win, Wc1, bc1, Wc2, bc2, Wc3, bc3, Wa1, ba1, Wa2, ba2) + loss_val = cross_entropy_loss::forward(probs_val, Y_val) + accuracy_val = mean(rowIndexMax(probs_val) == rowIndexMax(Y_val)) + + # Output results + print("Epoch: " + e + "/" + epochs + ", Val Loss: " + loss_val + + ", Val Accuracy: " + accuracy_val + ", lr: " + lr + ", mu " + mu) + + # Checkpoint model + dir = checkpoint_dir + e + "/" + dummy = checkpoint(dir, Wc1, bc1, Wc2, bc2, Wc3, bc3, Wa1, ba1, Wa2, ba2) + str = "lr: " + lr + ", mu: " + mu + ", decay: " + decay + ", lambda: " + lambda + + ", batch_size: " + batch_size + name = dir + accuracy_val + write(str, name) + + # Anneal momentum towards 0.999 + mu = mu + (0.999 - mu)/(1+epochs-e) + # Decay learning rate + lr = lr * decay + } +} + +checkpoint = function(string dir, + matrix[double] Wc1, matrix[double] bc1, + matrix[double] Wc2, matrix[double] bc2, + matrix[double] Wc3, matrix[double] bc3, + matrix[double] Wa1, matrix[double] ba1, + matrix[double] Wa2, matrix[double] ba2) { + /* + * Save the model parameters. + * + * Inputs: + * - dir: Directory in which to save model parameters. + * - Wc1: 1st conv layer weights (parameters) matrix, of shape (F1, C*Hf*Wf). + * - bc1: 1st conv layer biases vector, of shape (F1, 1). + * - Wc2: 2nd conv layer weights (parameters) matrix, of shape (F2, F1*Hf*Wf). + * - bc2: 2nd conv layer biases vector, of shape (F2, 1). + * - Wc3: 3rd conv layer weights (parameters) matrix, of shape (F3, F2*Hf*Wf). + * - bc3: 3rd conv layer biases vector, of shape (F3, 1). + * - Wa1: 1st affine layer weights (parameters) matrix, of shape (F3*(Hin/2^3)*(Win/2^3), N1). + * - ba1: 1st affine layer biases vector, of shape (1, N1). + * - Wa2: 2nd affine layer weights (parameters) matrix, of shape (N1, K). + * - ba2: 2nd affine layer biases vector, of shape (1, K). + */ + write(Wc1, dir + "Wc1", format="binary") + write(bc1, dir + "bc1", format="binary") + write(Wc2, dir + "Wc2", format="binary") + write(bc2, dir + "bc2", format="binary") + write(Wc3, dir + "Wc3", format="binary") + write(bc3, dir + "bc3", format="binary") + write(Wa1, dir + "Wa1", format="binary") + write(ba1, dir + "ba1", format="binary") + write(Wa2, dir + "Wa2", format="binary") + write(ba2, dir + "ba2", format="binary") +} + +predict = function(matrix[double] X, int C, int Hin, int Win, + matrix[double] Wc1, matrix[double] bc1, + matrix[double] Wc2, matrix[double] bc2, + matrix[double] Wc3, matrix[double] bc3, + matrix[double] Wa1, matrix[double] ba1, + matrix[double] Wa2, matrix[double] ba2) + return (matrix[double] probs) { + /* + * Computes the class probability predictions of a convolutional + * net using the "LeNet" architecture.
+ * + * The input matrix, X, has N examples, each represented as a 3D + * volume unrolled into a single vector. + * + * Inputs: + * - X: Input data matrix, of shape (N, C*Hin*Win). + * - C: Number of input channels (dimensionality of input depth). + * - Hin: Input height. + * - Win: Input width. + * - Wc1: 1st conv layer weights (parameters) matrix, of shape (F1, C*Hf*Wf). + * - bc1: 1st conv layer biases vector, of shape (F1, 1). + * - Wc2: 2nd conv layer weights (parameters) matrix, of shape (F2, F1*Hf*Wf). + * - bc2: 2nd conv layer biases vector, of shape (F2, 1). + * - Wc3: 3rd conv layer weights (parameters) matrix, of shape (F3, F2*Hf*Wf). + * - bc3: 3rd conv layer biases vector, of shape (F3, 1). + * - Wa1: 1st affine layer weights (parameters) matrix, of shape (F3*(Hin/2^3)*(Win/2^3), N1). + * - ba1: 1st affine layer biases vector, of shape (1, N1). + * - Wa2: 2nd affine layer weights (parameters) matrix, of shape (N1, K). + * - ba2: 2nd affine layer biases vector, of shape (1, K). + * + * Outputs: + * - probs: Class probabilities, of shape (N, K). + */ + N = nrow(X) + + # Network: + # conv1 -> relu1 -> pool1 -> conv2 -> relu2 -> pool2 -> conv3 -> relu3 -> pool3 -> affine1 -> relu1 -> affine2 -> softmax + Hf = 3 # filter height + Wf = 3 # filter width + stride = 1 + pad = 1 # For same dimensions, (Hf - stride) / 2 + + F1 = nrow(Wc1) # num conv filters in conv1 + F2 = nrow(Wc2) # num conv filters in conv2 + F3 = nrow(Wc3) # num conv filters in conv3 + N1 = ncol(Wa1) # num nodes in affine1 + K = ncol(Wa2) # num nodes in affine2, equal to number of target dimensions (num classes) + + # TODO: Implement fast, distributed conv & max pooling operators so that predictions + # can be computed in a full-batch, distributed manner. Alternatively, improve `parfor` + # so that it can be efficiently used for parallel predictions. + ## Compute forward pass + ### conv layer 1: conv1 -> relu1 -> pool1 + #[outc1, Houtc1, Woutc1] = conv::forward(X, Wc1, bc1, C, Hin, Win, Hf, Wf, stride, stride, pad, pad) + #outc1r = relu::forward(outc1) + #[outc1p, Houtc1p, Woutc1p] = max_pool::forward(outc1r, F1, Houtc1, Woutc1, Hf=2, Wf=2, strideh=2, stridew=2) + ### conv layer 2: conv2 -> relu2 -> pool2 + #[outc2, Houtc2, Woutc2] = conv::forward(outc1p, Wc2, bc2, F1, Houtc1p, Woutc1p, Hf, Wf, stride, stride, pad, pad) + #outc2r = relu::forward(outc2) + #[outc2p, Houtc2p, Woutc2p] = max_pool::forward(outc2r, F2, Houtc2, Woutc2, Hf=2, Wf=2, strideh=2, stridew=2) + ### conv layer 3: conv3 -> relu3 -> pool3 + #[outc3, Houtc3, Woutc3] = conv::forward(outc2p, Wc3, bc3, F2, Houtc2p, Woutc2p, Hf, Wf, stride, stride, pad, pad) + #outc3r = relu::forward(outc3) + #[outc3p, Houtc3p, Woutc3p] = max_pool::forward(outc3r, F3, Houtc3, Woutc3, Hf=2, Wf=2, strideh=2, stridew=2) + ### affine layer 1: affine1 -> relu1 -> dropout + #outa1 = affine::forward(outc3p, Wa1, ba1) + #outa1r = relu::forward(outa1) + ##[outa1d, maskad1] = dropout::forward(outa1r, 0.5, -1) + ### affine layer 2: affine2 -> softmax + #outa2 = affine::forward(outa1r, Wa2, ba2) + #probs = softmax::forward(outa2) + + # Compute predictions over mini-batches + probs = matrix(0, rows=N, cols=K) + batch_size = 50 + iters = ceil(N / batch_size) + for(i in 1:iters) { + # TODO: `parfor` should work here, possibly as an alternative to distributed predictions.
+ #parfor(i in 1:iters, check=0, mode=REMOTE_SPARK, resultmerge=REMOTE_SPARK) { + # Get next batch + beg = ((i-1) * batch_size) %% N + 1 + end = min(N, beg + batch_size - 1) + X_batch = X[beg:end,] + + # Compute forward pass + ## conv layer 1: conv1 -> relu1 -> pool1 + [outc1, Houtc1, Woutc1] = conv::forward(X_batch, Wc1, bc1, C, Hin, Win, Hf, Wf, stride, stride, pad, pad) + outc1r = relu::forward(outc1) + [outc1p, Houtc1p, Woutc1p] = max_pool::forward(outc1r, F1, Houtc1, Woutc1, Hf=2, Wf=2, strideh=2, stridew=2) + ## conv layer 2: conv2 -> relu2 -> pool2 + [outc2, Houtc2, Woutc2] = conv::forward(outc1p, Wc2, bc2, F1, Houtc1p, Woutc1p, Hf, Wf, stride, stride, pad, pad) + outc2r = relu::forward(outc2) + [outc2p, Houtc2p, Woutc2p] = max_pool::forward(outc2r, F2, Houtc2, Woutc2, Hf=2, Wf=2, strideh=2, stridew=2) + ## conv layer 3: conv3 -> relu3 -> pool3 + [outc3, Houtc3, Woutc3] = conv::forward(outc2p, Wc3, bc3, F2, Houtc2p, Woutc2p, Hf, Wf, stride, stride, pad, pad) + outc3r = relu::forward(outc3) + [outc3p, Houtc3p, Woutc3p] = max_pool::forward(outc3r, F3, Houtc3, Woutc3, Hf=2, Wf=2, strideh=2, stridew=2) + ## affine layer 1: affine1 -> relu1 -> dropout + outa1 = affine::forward(outc3p, Wa1, ba1) + outa1r = relu::forward(outa1) + #[outa1d, maskad1] = dropout::forward(outa1r, 0.5, -1) + ## affine layer 2: affine2 -> softmax + outa2 = affine::forward(outa1r, Wa2, ba2) + probs_batch = softmax::forward(outa2) + + # Store predictions + probs[beg:end,] = probs_batch + } +} + +eval = function(matrix[double] probs, matrix[double] Y) + return (double loss, double accuracy) { + /* + * Evaluates a convolutional net using the "LeNet" architecture. + * + * The probs matrix contains the class probability predictions + * of K classes over N examples. The targets, Y, have K classes, + * and are one-hot encoded. + * + * Inputs: + * - probs: Class probabilities, of shape (N, K). + * - Y: Target matrix, of shape (N, K). + * + * Outputs: + * - loss: Scalar loss, of shape (1). + * - accuracy: Scalar accuracy, of shape (1). + */ + # Compute loss & accuracy + loss = cross_entropy_loss::forward(probs, Y) + correct_pred = rowIndexMax(probs) == rowIndexMax(Y) + accuracy = mean(correct_pred) +} + +generate_dummy_data = function() + return (matrix[double] X, matrix[double] Y, int C, int Hin, int Win) { + /* + * Generate a dummy dataset similar in shape to the breast cancer dataset. + * + * Outputs: + * - X: Input data matrix, of shape (N, D). + * - Y: Target matrix, of shape (N, K). + * - C: Number of input channels (dimensionality of input depth). + * - Hin: Input height. + * - Win: Input width.
+ */ + # Generate dummy input data + N = 1024 # num examples + C = 1 # num input channels + Hin = 64 # input height + Win = 64 # input width + K = 3 # num target classes + X = rand(rows=N, cols=C*Hin*Win, pdf="normal") + classes = round(rand(rows=N, cols=1, min=1, max=K, pdf="uniform")) + Y = table(seq(1, N), classes) # one-hot encoding +} + http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/3dc8f21c/projects/breast_cancer/data/testing_image_data/.keep ---------------------------------------------------------------------- diff --git a/projects/breast_cancer/data/testing_image_data/.keep b/projects/breast_cancer/data/testing_image_data/.keep new file mode 100644 index 0000000..e69de29 http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/3dc8f21c/projects/breast_cancer/data/training_image_data/.keep ---------------------------------------------------------------------- diff --git a/projects/breast_cancer/data/training_image_data/.keep b/projects/breast_cancer/data/training_image_data/.keep new file mode 100644 index 0000000..e69de29 http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/3dc8f21c/projects/breast_cancer/hyperparam_tuning.dml ---------------------------------------------------------------------- diff --git a/projects/breast_cancer/hyperparam_tuning.dml b/projects/breast_cancer/hyperparam_tuning.dml new file mode 100644 index 0000000..464c659 --- /dev/null +++ b/projects/breast_cancer/hyperparam_tuning.dml @@ -0,0 +1,84 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +#------------------------------------------------------------- + +/* + * Hyperparameter Tuning Script For LeNet-like CNN Model + */ +# Imports +source("cnn.dml") as clf + +# Read data +# X = read("data/X_0.01_sample_binary") +# Y = read("data/Y_0.01_sample_binary") +# X_val = read("data/X_val_0.01_sample_binary") +# Y_val = read("data/Y_val_0.01_sample_binary") +X = read("data/X_really_small_sample_binary") +Y = read("data/Y_really_small_sample_binary") +X_val = read("data/X_val_really_small_sample_binary") +Y_val = read("data/Y_val_really_small_sample_binary") + +# Smaller sample for now +# X = X[1:300000,] +# Y = Y[1:300000,] +# X = X[1:18818,] +# Y = Y[1:18818,] +# X_val = X_val[1:18818,] +# Y_val = Y_val[1:18818,] + +# data shape +C = 1 +Hin = 64 +Win = 64 + +# Output directory +# dir = "models/systemml/lenet-1-sample/" +dir = "models/systemml/lenet-really-small-sample-hyperparam-2/" + +# TODO: Get this working with `parfor` +#j = 1 +#while(TRUE) { # Ideally this would be a parfor loop with a bunch of tries +parfor(j in 1:10000) { + # Hyperparameter Sampling & Settings + lr = 10 ^ as.scalar(rand(rows=1, cols=1, min=-7, max=-1)) # learning rate + mu = as.scalar(rand(rows=1, cols=1, min=0.5, max=0.9)) # momentum + decay = as.scalar(rand(rows=1, cols=1, min=0.9, max=1)) # learning rate decay constant + lambda = 10 ^ as.scalar(rand(rows=1, cols=1, min=-7, max=-1)) # regularization constant + batch_size = 50 + epochs = 1 + log_interval = 10 + + # Train + [Wc1, bc1, Wc2, bc2, Wc3, bc3, Wa1, ba1, Wa2, ba2] = clf::train(X, Y, X_val, Y_val, C, Hin, Win, lr, mu, decay, lambda, batch_size, epochs, log_interval, dir) + + # Eval + #probs = clf::predict(X, C, Hin, Win, Wc1, bc1, Wc2, bc2, Wc3, bc3, Wa1, ba1, Wa2, ba2) + #[loss, accuracy] = clf::eval(probs, Y) + probs_val = clf::predict(X_val, C, Hin, Win, Wc1, bc1, Wc2, bc2, Wc3, bc3, Wa1, ba1, Wa2, ba2) + [loss_val, accuracy_val] = clf::eval(probs_val, Y_val) + + # Save hyperparams with accuracy + str = "lr: " + lr + ", mu: " + mu + ", decay: " + decay + ", lambda: " + lambda + + ", batch_size: " + batch_size + name = dir + accuracy_val + "," + j #+","+accuracy+","+j + write(str, name) + #j = j + 1 +} + http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/3dc8f21c/projects/breast_cancer/nn ---------------------------------------------------------------------- diff --git a/projects/breast_cancer/nn b/projects/breast_cancer/nn new file mode 120000 index 0000000..9c0c967 --- /dev/null +++ b/projects/breast_cancer/nn @@ -0,0 +1 @@ +../../scripts/staging/SystemML-NN/nn \ No newline at end of file
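For reference, a minimal sketch of how the `train` function in `convnet.dml` above could be driven from the PySpark notebook through the SystemML `MLContext` API, analogous to the DML invocations elsewhere in the notebook. It assumes the notebook is run from the `projects/breast_cancer` directory and that `X`, `Y`, `X_val`, and `Y_val` are the SystemML matrices created earlier for the grayscale 64x64 sample (`C=1`, `Hin=64`, `Win=64`). The hyperparameter values and checkpoint directory below are illustrative assumptions, not tuned settings.

```python
# Hedged sketch: train the LeNet-like CNN defined in convnet.dml via MLContext.
# Hyperparameters and the checkpoint directory are assumptions for illustration.
script = """
source("convnet.dml") as clf

# Hyperparameters & Settings (illustrative values)
lr = 1e-2        # learning rate
mu = 0.9         # momentum
decay = 0.999    # learning rate decay constant
lambda = 1e-5    # L2 regularization strength
batch_size = 50
epochs = 1
log_interval = 10
dir = "models/lenet-cnn/sanity/"  # checkpoint directory (assumed path)

# Train
[Wc1, bc1, Wc2, bc2, Wc3, bc3, Wa1, ba1, Wa2, ba2] = clf::train(X, Y, X_val, Y_val, C, Hin, Win, lr, mu, decay, lambda, batch_size, epochs, log_interval, dir)
"""
outputs = ("Wc1", "bc1", "Wc2", "bc2", "Wc3", "bc3", "Wa1", "ba1", "Wa2", "ba2")
script = (dml(script).input(X=X, Y=Y, X_val=X_val, Y_val=Y_val, C=1, Hin=64, Win=64)
                     .output(*outputs))
Wc1, bc1, Wc2, bc2, Wc3, bc3, Wa1, ba1, Wa2, ba2 = ml.execute(script).get(*outputs)
```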
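Similarly, a hedged sketch of scoring the validation set with `predict()` and `eval()` from `convnet.dml`; it assumes the weight and bias matrices returned by the `train()` call in the previous sketch are still available on the Python side.

```python
# Hedged sketch: validation-set predictions and metrics via convnet.dml's
# predict() and eval(). Assumes Wc1...ba2 were returned by the train() call above.
script = """
source("convnet.dml") as clf
probs_val = clf::predict(X_val, C, Hin, Win, Wc1, bc1, Wc2, bc2, Wc3, bc3, Wa1, ba1, Wa2, ba2)
[loss_val, accuracy_val] = clf::eval(probs_val, Y_val)
"""
script = (dml(script).input(X_val=X_val, Y_val=Y_val, C=1, Hin=64, Win=64,
                            Wc1=Wc1, bc1=bc1, Wc2=Wc2, bc2=bc2, Wc3=Wc3, bc3=bc3,
                            Wa1=Wa1, ba1=ba1, Wa2=Wa2, ba2=ba2)
                     .output("loss_val", "accuracy_val"))
loss_val, accuracy_val = ml.execute(script).get("loss_val", "accuracy_val")
print("Val Loss:", loss_val, "Val Accuracy:", accuracy_val)
```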
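Since `checkpoint()` writes each parameter matrix in SystemML binary format under a per-epoch directory, a saved model could later be reloaded with DML `read()` calls along these lines; the epoch directory path here is a hypothetical placeholder matching the assumed checkpoint directory above.

```python
# Hedged sketch: reload a checkpoint written by checkpoint() in convnet.dml.
# "models/lenet-cnn/sanity/1/" is a hypothetical example path (epoch 1).
script = """
Wc1 = read(dir + "Wc1"); bc1 = read(dir + "bc1")
Wc2 = read(dir + "Wc2"); bc2 = read(dir + "bc2")
Wc3 = read(dir + "Wc3"); bc3 = read(dir + "bc3")
Wa1 = read(dir + "Wa1"); ba1 = read(dir + "ba1")
Wa2 = read(dir + "Wa2"); ba2 = read(dir + "ba2")
"""
outputs = ("Wc1", "bc1", "Wc2", "bc2", "Wc3", "bc3", "Wa1", "ba1", "Wa2", "ba2")
script = dml(script).input(dir="models/lenet-cnn/sanity/1/").output(*outputs)
Wc1, bc1, Wc2, bc2, Wc3, bc3, Wa1, ba1, Wa2, ba2 = ml.execute(script).get(*outputs)
```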
