Repository: incubator-systemml
Updated Branches:
  refs/heads/master 4049ce407 -> 578e595fd


[SYSTEMML-1224] Migrate Vector and LabeledPoint classes from mllib to ml

Migrate:
mllib.linalg.DenseVector -> ml.linalg.DenseVector
mllib.linalg.Vector -> ml.linalg.Vector
mllib.linalg.Vectors -> ml.linalg.Vectors
mllib.linalg.VectorUDT -> ml.linalg.VectorUDT
mllib.regression.LabeledPoint -> ml.feature.LabeledPoint

Closes #369.
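
For readers new to the ml package, the short Scala sketch below is illustrative only (it is not part of this commit); it uses the standard Spark 2.x ml.linalg/ml.feature API that the migrated code now imports, and runs as-is in spark-shell or any Scala REPL with Spark on the classpath:

  // Same factory methods as the old mllib classes; only the packages change.
  import org.apache.spark.ml.linalg.{Vector, Vectors}
  import org.apache.spark.ml.feature.LabeledPoint

  val dense: Vector = Vectors.dense(1.2, 3.4, 2.2)                     // was mllib.linalg.Vectors
  val sparse: Vector = Vectors.sparse(3, Array(0, 2), Array(1.2, 2.2)) // size, indices, values
  val lp = LabeledPoint(1.0, dense)                                    // was mllib.regression.LabeledPoint
  println(s"${lp.label} -> ${lp.features}")

One caveat the diff reflects: ml.linalg.Vectors does not carry over mllib's Vectors.parse, which is presumably why stringDataFrameToVectorDataFrame in RDDConverterUtilsExt, which relied on it, is removed below rather than migrated.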


Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/578e595f
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/578e595f
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/578e595f

Branch: refs/heads/master
Commit: 578e595fdc506fb8a0c0b18c312fe420a406276d
Parents: 4049ce4
Author: Deron Eriksson <[email protected]>
Authored: Fri Feb 3 19:31:44 2017 -0800
Committer: Deron Eriksson <[email protected]>
Committed: Fri Feb 3 19:31:44 2017 -0800

----------------------------------------------------------------------
 .../java/org/apache/sysml/api/MLOutput.java     |  2 +-
 .../api/mlcontext/MLContextConversionUtil.java  |  2 +-
 .../sysml/api/mlcontext/MLContextUtil.java      |  2 +-
 .../spark/utils/FrameRDDConverterUtils.java     |  4 +-
 .../spark/utils/RDDConverterUtils.java          | 10 +--
 .../spark/utils/RDDConverterUtilsExt.java       | 69 +-------------------
 .../sysml/api/ml/LogisticRegression.scala       |  4 +-
 .../DataFrameVectorFrameConversionTest.java     |  4 +-
 .../mlcontext/DataFrameVectorScriptTest.java    |  4 +-
 .../integration/mlcontext/MLContextTest.java    |  6 +-
 .../sysml/api/ml/LogisticRegressionSuite.scala  |  6 +-
 11 files changed, 25 insertions(+), 88 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/578e595f/src/main/java/org/apache/sysml/api/MLOutput.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/api/MLOutput.java b/src/main/java/org/apache/sysml/api/MLOutput.java
index 6acca68..08a9a00 100644
--- a/src/main/java/org/apache/sysml/api/MLOutput.java
+++ b/src/main/java/org/apache/sysml/api/MLOutput.java
@@ -108,7 +108,7 @@ public class MLOutput {
         * Obtain the DataFrame
         * @param sqlContext the SQLContext
         * @param varName the variable name
-        * @param outputVector if true, returns DataFrame with two column: ID and org.apache.spark.mllib.linalg.Vector
+        * @param outputVector if true, returns DataFrame with two column: ID and org.apache.spark.ml.linalg.Vector
         * @return the DataFrame
         * @throws DMLRuntimeException if DMLRuntimeException occurs
         */

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/578e595f/src/main/java/org/apache/sysml/api/mlcontext/MLContextConversionUtil.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/api/mlcontext/MLContextConversionUtil.java b/src/main/java/org/apache/sysml/api/mlcontext/MLContextConversionUtil.java
index ca853ef..cca9d2c 100644
--- a/src/main/java/org/apache/sysml/api/mlcontext/MLContextConversionUtil.java
+++ b/src/main/java/org/apache/sysml/api/mlcontext/MLContextConversionUtil.java
@@ -33,7 +33,7 @@ import org.apache.spark.SparkContext;
 import org.apache.spark.api.java.JavaPairRDD;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.mllib.linalg.VectorUDT;
+import org.apache.spark.ml.linalg.VectorUDT;
 import org.apache.spark.rdd.RDD;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Row;

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/578e595f/src/main/java/org/apache/sysml/api/mlcontext/MLContextUtil.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/api/mlcontext/MLContextUtil.java b/src/main/java/org/apache/sysml/api/mlcontext/MLContextUtil.java
index 75e9c1e..9553acb 100644
--- a/src/main/java/org/apache/sysml/api/mlcontext/MLContextUtil.java
+++ b/src/main/java/org/apache/sysml/api/mlcontext/MLContextUtil.java
@@ -39,7 +39,7 @@ import org.apache.spark.SparkContext;
 import org.apache.spark.api.java.JavaPairRDD;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.mllib.linalg.VectorUDT;
+import org.apache.spark.ml.linalg.VectorUDT;
 import org.apache.spark.rdd.RDD;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Row;

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/578e595f/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/FrameRDDConverterUtils.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/FrameRDDConverterUtils.java b/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/FrameRDDConverterUtils.java
index 3196f09..ae3b686 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/FrameRDDConverterUtils.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/FrameRDDConverterUtils.java
@@ -37,8 +37,8 @@ import org.apache.spark.api.java.function.FlatMapFunction;
 import org.apache.spark.api.java.function.Function;
 import org.apache.spark.api.java.function.PairFlatMapFunction;
 import org.apache.spark.api.java.function.PairFunction;
-import org.apache.spark.mllib.linalg.Vector;
-import org.apache.spark.mllib.linalg.VectorUDT;
+import org.apache.spark.ml.linalg.Vector;
+import org.apache.spark.ml.linalg.VectorUDT;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Row;
 import org.apache.spark.sql.RowFactory;

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/578e595f/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDConverterUtils.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDConverterUtils.java b/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDConverterUtils.java
index 356d16f..b5a4b58 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDConverterUtils.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDConverterUtils.java
@@ -35,11 +35,11 @@ import org.apache.spark.api.java.function.FlatMapFunction;
 import org.apache.spark.api.java.function.Function;
 import org.apache.spark.api.java.function.PairFlatMapFunction;
 import org.apache.spark.api.java.function.PairFunction;
-import org.apache.spark.mllib.linalg.DenseVector;
-import org.apache.spark.mllib.linalg.Vector;
-import org.apache.spark.mllib.linalg.VectorUDT;
-import org.apache.spark.mllib.linalg.Vectors;
-import org.apache.spark.mllib.regression.LabeledPoint;
+import org.apache.spark.ml.linalg.DenseVector;
+import org.apache.spark.ml.linalg.Vector;
+import org.apache.spark.ml.linalg.VectorUDT;
+import org.apache.spark.ml.linalg.Vectors;
+import org.apache.spark.ml.feature.LabeledPoint;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Row;
 import org.apache.spark.sql.RowFactory;

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/578e595f/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDConverterUtilsExt.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDConverterUtilsExt.java b/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDConverterUtilsExt.java
index f18c0a9..e0d347f 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDConverterUtilsExt.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDConverterUtilsExt.java
@@ -35,9 +35,9 @@ import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.api.java.function.Function;
 import org.apache.spark.api.java.function.PairFlatMapFunction;
-import org.apache.spark.mllib.linalg.Vector;
-import org.apache.spark.mllib.linalg.VectorUDT;
-import org.apache.spark.mllib.linalg.Vectors;
+import org.apache.spark.ml.linalg.Vector;
+import org.apache.spark.ml.linalg.VectorUDT;
+import org.apache.spark.ml.linalg.Vectors;
 import org.apache.spark.mllib.linalg.distributed.CoordinateMatrix;
 import org.apache.spark.mllib.linalg.distributed.MatrixEntry;
 import org.apache.spark.sql.Dataset;
@@ -128,69 +128,6 @@ public class RDDConverterUtilsExt
               return coordinateMatrixToBinaryBlock(new JavaSparkContext(sc), input, mcIn, true);
        }
 
-       public static Dataset<Row> stringDataFrameToVectorDataFrame(SQLContext sqlContext, Dataset<Row> inputDF)
-                       throws DMLRuntimeException {
-
-               StructField[] oldSchema = inputDF.schema().fields();
-               //create the new schema
-               StructField[] newSchema = new StructField[oldSchema.length];
-               for(int i = 0; i < oldSchema.length; i++) {
-                       String colName = oldSchema[i].name();
-                       newSchema[i] = DataTypes.createStructField(colName, new VectorUDT(), true);
-               }
-
-               //converter
-               class StringToVector implements Function<Tuple2<Row, Long>, Row> {
-                       private static final long serialVersionUID = -4733816995375745659L;
-                       @Override
-                       public Row call(Tuple2<Row, Long> arg0) throws Exception {
-                               Row oldRow = arg0._1;
-                               int oldNumCols = oldRow.length();
-                               if (oldNumCols > 1) {
-                                       throw new DMLRuntimeException("The row must have at most one column");
-                               }
-
-                               // parse the various strings. i.e
-                               // ((1.2,4.3, 3.4))  or (1.2, 3.4, 2.2) or (1.2 3.4)
-                               // [[1.2,34.3, 1.2, 1.2]] or [1.2, 3.4] or [1.3 1.2]
-                               Object [] fields = new Object[oldNumCols];
-                               ArrayList<Object> fieldsArr = new ArrayList<Object>();
-                               for (int i = 0; i < oldRow.length(); i++) {
-                                       Object ci=oldRow.get(i);
-                                       if (ci instanceof String) {
-                                               String cis = (String)ci;
-                                               StringBuffer sb = new StringBuffer(cis.trim());
-                                               for (int nid=0; i < 2; i++) { //remove two level nesting
-                                                       if ((sb.charAt(0) == '(' && sb.charAt(sb.length() - 1) == ')') ||
-                                                                       (sb.charAt(0) == '[' && sb.charAt(sb.length() - 1) == ']')
-                                                                       ) {
-                                                               sb.deleteCharAt(0);
-                                                               sb.setLength(sb.length() - 1);
-                                                       }
-                                               }
-                                               //have the replace code
-                                               String ncis = "[" + sb.toString().replaceAll(" *, *", ",") + "]";
-                                               Vector v = Vectors.parse(ncis);
-                                               fieldsArr.add(v);
-                                       } else {
-                                               throw new DMLRuntimeException("Only String is supported");
-                                       }
-                               }
-                               Row row = RowFactory.create(fieldsArr.toArray());
-                               return row;
-                       }
-               }
-
-               //output DF
-               JavaRDD<Row> newRows = inputDF.rdd().toJavaRDD().zipWithIndex().map(new StringToVector());
-               // DataFrame outDF = sqlContext.createDataFrame(newRows, new StructType(newSchema)); //TODO investigate why it doesn't work
-               Dataset<Row> outDF = sqlContext.createDataFrame(newRows.rdd(),
-                               DataTypes.createStructType(newSchema));
-
-               return outDF;
-       }
-
-
        public static Dataset<Row> projectColumns(Dataset<Row> df, ArrayList<String> columns) throws DMLRuntimeException {
                ArrayList<String> columnToSelect = new ArrayList<String>();
                for(int i = 1; i < columns.size(); i++) {

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/578e595f/src/main/scala/org/apache/sysml/api/ml/LogisticRegression.scala
----------------------------------------------------------------------
diff --git a/src/main/scala/org/apache/sysml/api/ml/LogisticRegression.scala b/src/main/scala/org/apache/sysml/api/ml/LogisticRegression.scala
index 18eadec..c0e3f35 100644
--- a/src/main/scala/org/apache/sysml/api/ml/LogisticRegression.scala
+++ b/src/main/scala/org/apache/sysml/api/ml/LogisticRegression.scala
@@ -112,8 +112,8 @@ class LogisticRegressionModel(override val uid: String)(
 object LogisticRegressionExample {
   import org.apache.spark.{ SparkConf, SparkContext }
   import org.apache.spark.sql.types._
-  import org.apache.spark.mllib.linalg.Vectors
-  import org.apache.spark.mllib.regression.LabeledPoint
+  import org.apache.spark.ml.linalg.Vectors
+  import org.apache.spark.ml.feature.LabeledPoint
 
   def main(args: Array[String]) = {
     val sparkConf: SparkConf = new SparkConf();

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/578e595f/src/test/java/org/apache/sysml/test/integration/functions/mlcontext/DataFrameVectorFrameConversionTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/integration/functions/mlcontext/DataFrameVectorFrameConversionTest.java b/src/test/java/org/apache/sysml/test/integration/functions/mlcontext/DataFrameVectorFrameConversionTest.java
index b152b58..c6d2251 100644
--- a/src/test/java/org/apache/sysml/test/integration/functions/mlcontext/DataFrameVectorFrameConversionTest.java
+++ b/src/test/java/org/apache/sysml/test/integration/functions/mlcontext/DataFrameVectorFrameConversionTest.java
@@ -25,8 +25,8 @@ import java.util.List;
 import org.apache.spark.api.java.JavaPairRDD;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.mllib.linalg.DenseVector;
-import org.apache.spark.mllib.linalg.VectorUDT;
+import org.apache.spark.ml.linalg.DenseVector;
+import org.apache.spark.ml.linalg.VectorUDT;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Row;
 import org.apache.spark.sql.RowFactory;

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/578e595f/src/test/java/org/apache/sysml/test/integration/functions/mlcontext/DataFrameVectorScriptTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/integration/functions/mlcontext/DataFrameVectorScriptTest.java b/src/test/java/org/apache/sysml/test/integration/functions/mlcontext/DataFrameVectorScriptTest.java
index 6ab0fd0..14ed4b7 100644
--- a/src/test/java/org/apache/sysml/test/integration/functions/mlcontext/DataFrameVectorScriptTest.java
+++ b/src/test/java/org/apache/sysml/test/integration/functions/mlcontext/DataFrameVectorScriptTest.java
@@ -27,8 +27,8 @@ import java.util.List;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.mllib.linalg.DenseVector;
-import org.apache.spark.mllib.linalg.VectorUDT;
+import org.apache.spark.ml.linalg.DenseVector;
+import org.apache.spark.ml.linalg.VectorUDT;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Row;
 import org.apache.spark.sql.RowFactory;

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/578e595f/src/test/java/org/apache/sysml/test/integration/mlcontext/MLContextTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/integration/mlcontext/MLContextTest.java b/src/test/java/org/apache/sysml/test/integration/mlcontext/MLContextTest.java
index 89241c5..2241ad1 100644
--- a/src/test/java/org/apache/sysml/test/integration/mlcontext/MLContextTest.java
+++ b/src/test/java/org/apache/sysml/test/integration/mlcontext/MLContextTest.java
@@ -46,9 +46,9 @@ import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.api.java.function.Function;
-import org.apache.spark.mllib.linalg.Vector;
-import org.apache.spark.mllib.linalg.VectorUDT;
-import org.apache.spark.mllib.linalg.Vectors;
+import org.apache.spark.ml.linalg.Vector;
+import org.apache.spark.ml.linalg.VectorUDT;
+import org.apache.spark.ml.linalg.Vectors;
 import org.apache.spark.rdd.RDD;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Row;

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/578e595f/src/test/scala/org/apache/sysml/api/ml/LogisticRegressionSuite.scala
----------------------------------------------------------------------
diff --git a/src/test/scala/org/apache/sysml/api/ml/LogisticRegressionSuite.scala b/src/test/scala/org/apache/sysml/api/ml/LogisticRegressionSuite.scala
index 068db91..555d0a2 100644
--- a/src/test/scala/org/apache/sysml/api/ml/LogisticRegressionSuite.scala
+++ b/src/test/scala/org/apache/sysml/api/ml/LogisticRegressionSuite.scala
@@ -21,13 +21,13 @@ package org.apache.sysml.api.ml
 
 import org.scalatest.FunSuite
 import org.scalatest.Matchers
-import org.apache.spark.mllib.linalg.Vectors
-import org.apache.spark.mllib.regression.LabeledPoint
+import org.apache.spark.ml.linalg.Vectors
+import org.apache.spark.ml.feature.LabeledPoint
 import org.apache.spark.ml.Pipeline
 import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
 import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
 import org.apache.spark.ml.tuning.{ParamGridBuilder, CrossValidator}
-import org.apache.spark.mllib.linalg.Vector
+import org.apache.spark.ml.linalg.Vector
 import scala.reflect.runtime.universe._
 
 case class LabeledDocument[T:TypeTag](id: Long, text: String, label: Double)
