Repository: incubator-systemml Updated Branches: refs/heads/master 1be911cc5 -> 3877e3563
[SYSTEMML-209] Added documentation for MLPipeline scala wrappers for MultiLogReg, SVM and Naive Bayes. Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/3877e356 Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/3877e356 Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/3877e356 Branch: refs/heads/master Commit: 3877e3563007ebf383d8c18681ab3d379a93f698 Parents: 1be911c Author: Niketan Pansare <[email protected]> Authored: Mon Aug 29 13:47:16 2016 -0700 Committer: Niketan Pansare <[email protected]> Committed: Mon Aug 29 13:48:42 2016 -0700 ---------------------------------------------------------------------- docs/algorithms-classification.md | 108 +++++++++++++++++++++++++++++++++ docs/beginners-guide-python.md | 7 +-- 2 files changed, 111 insertions(+), 4 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/3877e356/docs/algorithms-classification.md ---------------------------------------------------------------------- diff --git a/docs/algorithms-classification.md b/docs/algorithms-classification.md index 03c78d6..340267c 100644 --- a/docs/algorithms-classification.md +++ b/docs/algorithms-classification.md @@ -138,6 +138,14 @@ y_test = logistic.fit(X_train, y_train).predict(X_test) y_test = logistic.fit(df_train).transform(df_test) {% endhighlight %} </div> +<div data-lang="Scala" markdown="1"> +{% highlight scala %} +import org.apache.sysml.api.ml.LogisticRegression +val lr = new LogisticRegression("logReg", sc).setIcpt(0).setMaxOuterIter(100).setMaxInnerIter(0).setRegParam(0.000001).setTol(0.000001) +val model = lr.fit(X_train_df) +val prediction = model.transform(X_test_df) +{% endhighlight %} +</div> <div data-lang="Hadoop" markdown="1"> hadoop jar SystemML.jar -f MultiLogReg.dml -nvargs X=<file> @@ -277,6 +285,38 @@ prediction = model.transform(test) prediction.show() {% endhighlight %} </div> +<div data-lang="Scala" markdown="1"> +{% highlight scala %} +import org.apache.spark.ml.feature.{HashingTF, Tokenizer} +import org.apache.sysml.api.ml.LogisticRegression +import org.apache.spark.ml.Pipeline +val training = sqlContext.createDataFrame(Seq( + ("a b c d e spark", 1.0), + ("b d", 2.0), + ("spark f g h", 1.0), + ("hadoop mapreduce", 2.0), + ("b spark who", 1.0), + ("g d a y", 2.0), + ("spark fly", 1.0), + ("was mapreduce", 2.0), + ("e spark program", 1.0), + ("a e c l", 2.0), + ("spark compile", 1.0), + ("hadoop software", 2.0))).toDF("text", "label") +val tokenizer = new Tokenizer().setInputCol("text").setOutputCol("words") +val hashingTF = new HashingTF().setNumFeatures(20).setInputCol(tokenizer.getOutputCol).setOutputCol("features") +val lr = new LogisticRegression("logReg", sc) +val pipeline = new Pipeline().setStages(Array(tokenizer, hashingTF, lr)) +val model = pipeline.fit(training) +val test = sqlContext.createDataFrame(Seq( + ("spark i j k", 1.0), + ("l m n", 2.0), + ("mapreduce spark", 1.0), + ("apache hadoop", 2.0))).toDF("text", "trueLabel") +val prediction = model.transform(test) +prediction.show() +{% endhighlight %} +</div> <div data-lang="Hadoop" markdown="1"> hadoop jar SystemML.jar -f MultiLogReg.dml -nvargs X=/user/ml/X.mtx @@ -467,6 +507,13 @@ y_test = svm.fit(X_train, y_train) y_test = svm.fit(df_train) {% endhighlight %} </div> +<div data-lang="Scala" markdown="1"> +{% highlight scala %} +import org.apache.sysml.api.ml.SVM +val svm = new SVM("svm", sc, isMultiClass=false).setIcpt(0).setMaxIter(100).setRegParam(0.000001).setTol(0.000001) +val model = svm.fit(X_train_df) +{% endhighlight %} +</div> <div data-lang="Hadoop" markdown="1"> hadoop jar SystemML.jar -f l2-svm.dml -nvargs X=<file> @@ -510,6 +557,11 @@ y_test = svm.predict(X_test) y_test = svm.transform(df_test) {% endhighlight %} </div> +<div data-lang="Scala" markdown="1"> +{% highlight scala %} +val prediction = model.transform(X_test_df) +{% endhighlight %} +</div> <div data-lang="Hadoop" markdown="1"> hadoop jar SystemML.jar -f l2-svm-predict.dml -nvargs X=<file> @@ -723,6 +775,13 @@ y_test = svm.fit(X_train, y_train) y_test = svm.fit(df_train) {% endhighlight %} </div> +<div data-lang="Scala" markdown="1"> +{% highlight scala %} +import org.apache.sysml.api.ml.SVM +val svm = new SVM("svm", sc, isMultiClass=true).setIcpt(0).setMaxIter(100).setRegParam(0.000001).setTol(0.000001) +val model = svm.fit(X_train_df) +{% endhighlight %} +</div> <div data-lang="Hadoop" markdown="1"> hadoop jar SystemML.jar -f m-svm.dml -nvargs X=<file> @@ -766,6 +825,11 @@ y_test = svm.predict(X_test) y_test = svm.transform(df_test) {% endhighlight %} </div> +<div data-lang="Scala" markdown="1"> +{% highlight scala %} +val prediction = model.transform(X_test_df) +{% endhighlight %} +</div> <div data-lang="Hadoop" markdown="1"> hadoop jar SystemML.jar -f m-svm-predict.dml -nvargs X=<file> @@ -900,6 +964,38 @@ prediction = model.transform(test) prediction.show() {% endhighlight %} </div> +<div data-lang="Scala" markdown="1"> +{% highlight scala %} +import org.apache.spark.ml.feature.{HashingTF, Tokenizer} +import org.apache.sysml.api.ml.SVM +import org.apache.spark.ml.Pipeline +val training = sqlContext.createDataFrame(Seq( + ("a b c d e spark", 1.0), + ("b d", 2.0), + ("spark f g h", 1.0), + ("hadoop mapreduce", 2.0), + ("b spark who", 1.0), + ("g d a y", 2.0), + ("spark fly", 1.0), + ("was mapreduce", 2.0), + ("e spark program", 1.0), + ("a e c l", 2.0), + ("spark compile", 1.0), + ("hadoop software", 2.0))).toDF("text", "label") +val tokenizer = new Tokenizer().setInputCol("text").setOutputCol("words") +val hashingTF = new HashingTF().setNumFeatures(20).setInputCol(tokenizer.getOutputCol).setOutputCol("features") +val svm = new SVM("svm", sc, isMultiClass=true) +val pipeline = new Pipeline().setStages(Array(tokenizer, hashingTF, svm)) +val model = pipeline.fit(training) +val test = sqlContext.createDataFrame(Seq( + ("spark i j k", 1.0), + ("l m n", 2.0), + ("mapreduce spark", 1.0), + ("apache hadoop", 2.0))).toDF("text", "trueLabel") +val prediction = model.transform(test) +prediction.show() +{% endhighlight %} +</div> <div data-lang="Hadoop" markdown="1"> hadoop jar SystemML.jar -f m-svm.dml -nvargs X=/user/ml/X.mtx @@ -1034,6 +1130,13 @@ y_test = nb.fit(X_train, y_train) y_test = nb.fit(df_train) {% endhighlight %} </div> +<div data-lang="Scala" markdown="1"> +{% highlight scala %} +import org.apache.sysml.api.ml.NaiveBayes +val nb = new NaiveBayes("naiveBayes", sc, isMultiClass=true).setLaplace(1.0) +val model = nb.fit(X_train_df) +{% endhighlight %} +</div> <div data-lang="Hadoop" markdown="1"> hadoop jar SystemML.jar -f naive-bayes.dml -nvargs X=<file> @@ -1073,6 +1176,11 @@ y_test = nb.predict(X_test) y_test = nb.transform(df_test) {% endhighlight %} </div> +<div data-lang="Scala" markdown="1"> +{% highlight scala %} +val prediction = model.transform(X_test_df) +{% endhighlight %} +</div> <div data-lang="Hadoop" markdown="1"> hadoop jar SystemML.jar -f naive-bayes-predict.dml -nvargs X=<file> http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/3877e356/docs/beginners-guide-python.md ---------------------------------------------------------------------- diff --git a/docs/beginners-guide-python.md b/docs/beginners-guide-python.md index 790ed43..b565656 100644 --- a/docs/beginners-guide-python.md +++ b/docs/beginners-guide-python.md @@ -309,11 +309,11 @@ prediction.show() ## Invoking DML/PyDML scripts using MLContext -TODO: This is work in progress. +The below example demonstrates how to invoke the algorithm [scripts/algorithms/MultiLogReg.dml](https://github.com/apache/incubator-systemml/blob/master/scripts/algorithms/MultiLogReg.dml) +using Python [MLContext API](https://apache.github.io/incubator-systemml/spark-mlcontext-programming-guide). ```python from sklearn import datasets, neighbors -from SystemML.mllearn import LogisticRegression from pyspark.sql import DataFrame, SQLContext import SystemML as sml import pandas as pd @@ -328,7 +328,6 @@ X_df = sqlCtx.createDataFrame(pd.DataFrame(X_digits[:.9 * n_samples])) y_df = sqlCtx.createDataFrame(pd.DataFrame(y_digits[:.9 * n_samples])) ml = sml.MLContext(sc) script = os.path.join(os.environ['SYSTEMML_HOME'], 'scripts', 'algorithms', 'MultiLogReg.dml') -script = sml.dml(script).input(X=X_df, Y_vec=y_df).out("B_out") -# .input($X=' ', $Y=' ', $B=' ') +script = sml.dml(script).input(X=X_df, Y_vec=y_df).input(**{"$X": ' ', "$Y": ' ', "$B": ' '}).out("B_out") beta = ml.execute(script).getNumPyArray('B_out') ```
