[2/2] incubator-systemml git commit: [SYSTEMML-1632] Support loading and saving models via mllearn
[SYSTEMML-1632] Support loading and saving models via mllearn - Also, updated documentation and fixed bugs. Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/c067a585 Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/c067a585 Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/c067a585 Branch: refs/heads/gh-pages Commit: c067a5858f0e9360513c4c6be9e3a1940eb3b87a Parents: b31ebbf Author: Niketan Pansare Authored: Thu May 25 22:32:02 2017 -0700 Committer: Niketan Pansare Committed: Thu May 25 22:32:02 2017 -0700 -- algorithms-classification.md | 52 +-- algorithms-regression.md | 8 +- beginners-guide-caffe2dml.md | 264 +++ beginners-guide-python.md | 33 +- native-backend.md | 4 +- python-reference.md | 929 +- 6 files changed, 455 insertions(+), 835 deletions(-) -- http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/c067a585/algorithms-classification.md -- diff --git a/algorithms-classification.md b/algorithms-classification.md index ed56c34..04c5eb8 100644 --- a/algorithms-classification.md +++ b/algorithms-classification.md @@ -131,7 +131,7 @@ Eqs. (1) and (2). {% highlight python %} from systemml.mllearn import LogisticRegression # C = 1/reg -logistic = LogisticRegression(sqlCtx, fit_intercept=True, max_iter=100, max_inner_iter=0, tol=0.01, C=1.0) +logistic = LogisticRegression(spark, fit_intercept=True, max_iter=100, max_inner_iter=0, tol=0.01, C=1.0) # X_train, y_train and X_test can be NumPy matrices or Pandas DataFrame or SciPy Sparse Matrix y_test = logistic.fit(X_train, y_train).predict(X_test) # df_train is DataFrame that contains two columns: "features" (of type Vector) and "label". df_test is a DataFrame that contains the column "features" @@ -229,6 +229,8 @@ if no maximum limit provided `mm`, or `csv`; see read/write functions in SystemML Language Reference for details. 
+Please see [mllearn documentation](https://apache.github.io/incubator-systemml/python-reference#mllearn-api) for +more details on the Python API. ### Examples @@ -255,9 +257,7 @@ print('LogisticRegression score: %f' % logistic.fit(X_train, y_train).score(X_te from pyspark.ml import Pipeline from systemml.mllearn import LogisticRegression from pyspark.ml.feature import HashingTF, Tokenizer -from pyspark.sql import SQLContext -sqlCtx = SQLContext(sc) -training = sqlCtx.createDataFrame([ +training = spark.createDataFrame([ (0L, "a b c d e spark", 1.0), (1L, "b d", 2.0), (2L, "spark f g h", 1.0), @@ -273,10 +273,10 @@ training = sqlCtx.createDataFrame([ ], ["id", "text", "label"]) tokenizer = Tokenizer(inputCol="text", outputCol="words") hashingTF = HashingTF(inputCol="words", outputCol="features", numFeatures=20) -lr = LogisticRegression(sqlCtx) +lr = LogisticRegression(spark) pipeline = Pipeline(stages=[tokenizer, hashingTF, lr]) model = pipeline.fit(training) -test = sqlCtx.createDataFrame([ +test = spark.createDataFrame([ (12L, "spark i j k"), (13L, "l m n"), (14L, "mapreduce spark"), @@ -290,7 +290,7 @@ prediction.show() import org.apache.spark.ml.feature.{HashingTF, Tokenizer} import org.apache.sysml.api.ml.LogisticRegression import org.apache.spark.ml.Pipeline -val training = sqlContext.createDataFrame(Seq( +val training = spark.createDataFrame(Seq( ("a b c d e spark", 1.0), ("b d", 2.0), ("spark f g h", 1.0), @@ -308,7 +308,7 @@ val hashingTF = new HashingTF().setNumFeatures(20).setInputCol(tokenizer.getOutp val lr = new LogisticRegression("logReg", sc) val pipeline = new Pipeline().setStages(Array(tokenizer, hashingTF, lr)) val model = pipeline.fit(training) -val test = sqlContext.createDataFrame(Seq( +val test = spark.createDataFrame(Seq( ("spark i j k", 1.0), ("l m n", 2.0), ("mapreduce spark", 1.0), @@ -500,7 +500,7 @@ support vector machine (`y` with domain size `2`). 
{% highlight python %} from systemml.mllearn import SVM # C = 1/reg -svm = SVM(sqlCtx, fit_intercept=True, max_iter=100, tol=0.01, C=1.0, is_multi_class=False) +svm = SVM(spark, fit_intercept=True, max_iter=100, tol=0.01, C=1.0, is_multi_class=False) # X_train, y_train and X_test can be NumPy matrices or Pandas DataFrame or SciPy Sparse Matrix y_test = svm.fit(X_train, y_train) # df_train is DataFrame that contains two columns: "features" (of type Vector) and "label". df_test is a DataFrame that contains the column "features" @@ -637,6 +637,8 @@ held-out test set. Note that this is an optional argument. **confusion**: Location (on HDFS) to store the confusion
[2/2] incubator-systemml git commit: [SYSTEMML-1632] Support loading and saving models via mllearn
[SYSTEMML-1632] Support loading and saving models via mllearn - Also, updated documentation and fixed bugs. Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/d69f3441 Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/d69f3441 Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/d69f3441 Branch: refs/heads/master Commit: d69f3441c8243ddd13dd3da6aab9c2d5701c6e50 Parents: d36a0c1 Author: Niketan Pansare Authored: Thu May 25 22:32:02 2017 -0700 Committer: Niketan Pansare Committed: Thu May 25 22:32:02 2017 -0700 -- docs/algorithms-classification.md | 52 +- docs/algorithms-regression.md | 8 +- docs/beginners-guide-caffe2dml.md | 264 -- docs/beginners-guide-python.md | 33 +- docs/native-backend.md | 4 +- docs/python-reference.md | 929 +-- pom.xml | 3 + .../caffe2dml/models/mnist_lenet/lenet.proto | 195 .../models/mnist_lenet/lenet_solver.proto | 19 + src/main/python/systemml/converters.py | 96 +- src/main/python/systemml/mllearn/estimators.py | 179 ++-- .../org/apache/sysml/api/dl/Caffe2DML.scala | 64 +- .../org/apache/sysml/api/dl/CaffeLayer.scala | 108 ++- .../org/apache/sysml/api/dl/CaffeNetwork.scala | 55 +- .../scala/org/apache/sysml/api/dl/Utils.scala | 145 +++ .../sysml/api/ml/BaseSystemMLClassifier.scala | 43 +- .../sysml/api/ml/BaseSystemMLRegressor.scala | 8 +- .../apache/sysml/api/ml/LinearRegression.scala | 33 +- .../sysml/api/ml/LogisticRegression.scala | 28 +- .../org/apache/sysml/api/ml/NaiveBayes.scala | 28 +- .../scala/org/apache/sysml/api/ml/SVM.scala | 27 +- .../scala/org/apache/sysml/api/ml/Utils.scala | 25 + 22 files changed, 1365 insertions(+), 981 deletions(-) -- http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/d69f3441/docs/algorithms-classification.md -- diff --git a/docs/algorithms-classification.md b/docs/algorithms-classification.md index ed56c34..04c5eb8 100644 --- a/docs/algorithms-classification.md +++ 
b/docs/algorithms-classification.md @@ -131,7 +131,7 @@ Eqs. (1) and (2). {% highlight python %} from systemml.mllearn import LogisticRegression # C = 1/reg -logistic = LogisticRegression(sqlCtx, fit_intercept=True, max_iter=100, max_inner_iter=0, tol=0.01, C=1.0) +logistic = LogisticRegression(spark, fit_intercept=True, max_iter=100, max_inner_iter=0, tol=0.01, C=1.0) # X_train, y_train and X_test can be NumPy matrices or Pandas DataFrame or SciPy Sparse Matrix y_test = logistic.fit(X_train, y_train).predict(X_test) # df_train is DataFrame that contains two columns: "features" (of type Vector) and "label". df_test is a DataFrame that contains the column "features" @@ -229,6 +229,8 @@ if no maximum limit provided `mm`, or `csv`; see read/write functions in SystemML Language Reference for details. +Please see [mllearn documentation](https://apache.github.io/incubator-systemml/python-reference#mllearn-api) for +more details on the Python API. ### Examples @@ -255,9 +257,7 @@ print('LogisticRegression score: %f' % logistic.fit(X_train, y_train).score(X_te from pyspark.ml import Pipeline from systemml.mllearn import LogisticRegression from pyspark.ml.feature import HashingTF, Tokenizer -from pyspark.sql import SQLContext -sqlCtx = SQLContext(sc) -training = sqlCtx.createDataFrame([ +training = spark.createDataFrame([ (0L, "a b c d e spark", 1.0), (1L, "b d", 2.0), (2L, "spark f g h", 1.0), @@ -273,10 +273,10 @@ training = sqlCtx.createDataFrame([ ], ["id", "text", "label"]) tokenizer = Tokenizer(inputCol="text", outputCol="words") hashingTF = HashingTF(inputCol="words", outputCol="features", numFeatures=20) -lr = LogisticRegression(sqlCtx) +lr = LogisticRegression(spark) pipeline = Pipeline(stages=[tokenizer, hashingTF, lr]) model = pipeline.fit(training) -test = sqlCtx.createDataFrame([ +test = spark.createDataFrame([ (12L, "spark i j k"), (13L, "l m n"), (14L, "mapreduce spark"), @@ -290,7 +290,7 @@ prediction.show() import org.apache.spark.ml.feature.{HashingTF, 
Tokenizer} import org.apache.sysml.api.ml.LogisticRegression import org.apache.spark.ml.Pipeline -val training = sqlContext.createDataFrame(Seq( +val training = spark.createDataFrame(Seq( ("a b c d e spark", 1.0), ("b d", 2.0), ("spark f g h", 1.0), @@ -308,7 +308,7 @@ val hashingTF = new HashingTF().setNumFeatures(20).setInputCol(tokenizer.getOutp val lr = new