Repository: systemml Updated Branches: refs/heads/master 754548190 -> cddd2a4f6
[Minor] added cross validation example Project: http://git-wip-us.apache.org/repos/asf/systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/cddd2a4f Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/cddd2a4f Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/cddd2a4f Branch: refs/heads/master Commit: cddd2a4f60e22e8b621712135e5ed263b25343c0 Parents: 7545481 Author: Berthold Reinwald <[email protected]> Authored: Mon Sep 11 15:30:29 2017 -0700 Committer: Berthold Reinwald <[email protected]> Committed: Mon Sep 11 15:46:52 2017 -0700 ---------------------------------------------------------------------- ...DML Tips and Tricks (aka Fun With DML).ipynb | 189 ++++++++++++++++++- 1 file changed, 186 insertions(+), 3 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/systemml/blob/cddd2a4f/samples/jupyter-notebooks/DML Tips and Tricks (aka Fun With DML).ipynb ---------------------------------------------------------------------- diff --git a/samples/jupyter-notebooks/DML Tips and Tricks (aka Fun With DML).ipynb b/samples/jupyter-notebooks/DML Tips and Tricks (aka Fun With DML).ipynb index 23d975a..c0391ce 100644 --- a/samples/jupyter-notebooks/DML Tips and Tricks (aka Fun With DML).ipynb +++ b/samples/jupyter-notebooks/DML Tips and Tricks (aka Fun With DML).ipynb @@ -4,7 +4,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "1. [Value-based join of two Matrices](#JoinMatrices)\n", + "1. 
[Cross Validation](#CrossValidation)\n", + "* [Value-based join of two Matrices](#JoinMatrices)\n", "* [Filter Matrix to include only Frequent Column Values](#FilterMatrix)\n", "* [Construct (sparse) Matrix from (rowIndex, colIndex, values) triplets](#Construct_sparse_Matrix)\n", "* [Find and remove duplicates in columns or rows](#Find_and_remove_duplicates)\n", @@ -16,12 +17,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": { "collapsed": false, "scrolled": true }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2017-08-18 21:33:18 UTC\n" + ] + } + ], "source": [ "from systemml import MLContext, dml, jvm_stdout\n", "ml = MLContext(sc)\n", @@ -33,6 +42,180 @@ "cell_type": "markdown", "metadata": {}, "source": [ + "## Cross Validation<a id=\"CrossValidation\" />" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Perform kFold cross validation by running in parallel fold creation, training algorithm, test algorithm, and evaluation." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Test data Xyi2\n", + "10.000 11.000 12.000 4.000\n", + "16.000 17.000 18.000 6.000\n", + "\n", + "Train data Xyni2\n", + "1.000 2.000 3.000 1.000\n", + "4.000 5.000 6.000 2.000\n", + "7.000 8.000 9.000 3.000\n", + "13.000 14.000 15.000 5.000\n", + "\n", + "w2\n", + "95.000\n", + "106.000\n", + "117.000\n", + "\n", + "stats2\n", + "8938.000\n", + "\n", + "\n", + "Test data Xyi3\n", + "1.000 2.000 3.000 1.000\n", + "7.000 8.000 9.000 3.000\n", + "\n", + "Train data Xyni3\n", + "4.000 5.000 6.000 2.000\n", + "10.000 11.000 12.000 4.000\n", + "13.000 14.000 15.000 5.000\n", + "16.000 17.000 18.000 6.000\n", + "\n", + "w3\n", + "209.000\n", + "226.000\n", + "243.000\n", + "\n", + "stats3\n", + "6844.000\n", + "\n", + "\n", + "Test data Xyi1\n", + "4.000 5.000 6.000 2.000\n", + "13.000 14.000 15.000 5.000\n", + "\n", + "Train data Xyni1\n", + "1.000 2.000 3.000 1.000\n", + "7.000 8.000 9.000 3.000\n", + "10.000 11.000 12.000 4.000\n", + "16.000 17.000 18.000 6.000\n", + "\n", + "w1\n", + "158.000\n", + "172.000\n", + "186.000\n", + "\n", + "stats1\n", + "9853.000\n", + "\n", + "\n", + "SV selection vector:\n", + "3.000\n", + "1.000\n", + "3.000\n", + "2.000\n", + "1.000\n", + "2.000\n", + "\n", + "SystemML Statistics:\n", + "Total execution time:\t\t0.024 sec.\n", + "Number of executed Spark inst:\t0.\n", + "\n", + "\n" + ] + } + ], + "source": [ + "prog = \"\"\"\n", + "holdOut = 1/3\n", + "kFolds = 1/holdOut\n", + "\n", + "nRows = 6; nCols = 3; \n", + "\n", + "X = matrix(seq(1, nRows * nCols), rows = nRows, cols = nCols) # X data\n", + "y = matrix(seq(1, nRows), rows = nRows, cols = 1) # y label data\n", + "Xy = cbind (X,y) # Xy Data for CV\n", + "\n", + "sv = rand (rows = nRows, cols = 1, min = 0.0, max = 1.0, pdf = \"uniform\") # sv selection vector for fold creation \n", + "sv = 
(order(target=sv, by=1, index.return=TRUE)) %% kFolds + 1 # with numbers between 1 .. kFolds \n", + "\n", + "stats = matrix(0, rows=kFolds, cols=1) # stats per kFolds model on test data\n", + "\n", + "parfor (i in 1:kFolds)\n", + "{\n", + " # Skip empty training data or test data. \n", + " if ( sum (sv == i) > 0 & sum (sv == i) < nrow(X) ) \n", + " {\n", + " Xyi = removeEmpty(target = Xy, margin = \"rows\", select = (sv == i)) # Xyi fold, i.e. 1/k of rows (test data)\n", + " Xyni = removeEmpty(target = Xy, margin = \"rows\", select = (sv != i)) # Xyni data, i.e. (k-1)/k of rows (train data)\n", + "\n", + " # Skip extreme label imbalance: count the labels (last column of Xy) present in the training data; zero-count rows are dropped so nrow() only counts labels that occur\n", + " distinctLabels = removeEmpty(target = aggregate( target = Xyni[,ncol(Xy)], groups = Xyni[,ncol(Xy)], fn = \"count\"), margin = \"rows\")\n", + " if ( nrow(distinctLabels) > 1)\n", + " {\n", + " wi = trainAlg (Xyni[ ,1:ncol(Xy)-1], Xyni[ ,ncol(Xy)]) # wi Model for i-th training data\n", + " pi = testAlg (Xyi [ ,1:ncol(Xy)-1], wi) # pi Prediction for i-th test data\n", + " ei = evalPrediction (pi, Xyi[ ,ncol(Xy)]) # stats[i,] evaluation of prediction of i-th fold\n", + " stats[i,] = ei\n", + " \n", + " print ( \"Test data Xyi\" + i + \"\\n\" + toString(Xyi) \n", + " + \"\\nTrain data Xyni\" + i + \"\\n\" + toString(Xyni) \n", + " + \"\\nw\" + i + \"\\n\" + toString(wi) \n", + " + \"\\nstats\" + i + \"\\n\" + toString(stats[i,]) \n", + " + \"\\n\")\n", + " }\n", + " else\n", + " {\n", + " print (\"Training data for fold \" + i + \" has only \" + nrow(distinctLabels) + \" distinct labels. Needs to be > 1.\")\n", + " } \n", + " } \n", + " else \n", + " {\n", + " print (\"Training data or test data for fold \" + i + \" is empty. 
Fold not validated.\")\n", + " }\n", + "\n", + "}\n", + "\n", + "print (\"SV selection vector:\\n\" + toString(sv))\n", + "\n", + "trainAlg = function (matrix[double] X, matrix[double] y) # called once per fold with the training features and labels\n", + " return (matrix[double] w)\n", + "{\n", + " w = t(X) %*% y # model w: transpose of X multiplied by y\n", + "}\n", + "\n", + "testAlg = function (matrix[double] X, matrix[double] w) # called once per fold with the test features and the fold model\n", + " return (matrix[double] p)\n", + "{\n", + " p = X %*% w # prediction p: X multiplied by the model w\n", + "}\n", + "\n", + "evalPrediction = function (matrix[double] p, matrix[double] y) # called once per fold with the predictions and the test labels\n", + " return (matrix[double] e)\n", + "{\n", + " e = as.matrix(sum (p - y)) # sum of the signed differences p - y, wrapped as a matrix; NOTE(review): opposite-sign errors cancel in this sum\n", + "}\n", + "\"\"\"\n", + "\n", + "with jvm_stdout(True):\n", + " ml.execute(dml(prog))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ "## Value-based join of two Matrices<a id=\"JoinMatrices\"/>" ] },
