Repository: incubator-systemml
Updated Branches:
  refs/heads/master 4049ce407 -> 578e595fd
[SYSTEMML-1224] Migrate Vector and LabeledPoint classes from mllib to ml

Migrate:
mllib.linalg.DenseVector to ml.linalg.DenseVector.
mllib.linalg.Vector to ml.linalg.Vector.
mllib.linalg.Vectors to ml.linalg.Vectors.
mllib.linalg.VectorUDT to ml.linalg.VectorUDT.
mllib.regression.LabeledPoint to ml.feature.LabeledPoint.

Closes #369.


Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/578e595f
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/578e595f
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/578e595f

Branch: refs/heads/master
Commit: 578e595fdc506fb8a0c0b18c312fe420a406276d
Parents: 4049ce4
Author: Deron Eriksson <[email protected]>
Authored: Fri Feb 3 19:31:44 2017 -0800
Committer: Deron Eriksson <[email protected]>
Committed: Fri Feb 3 19:31:44 2017 -0800

----------------------------------------------------------------------
 .../java/org/apache/sysml/api/MLOutput.java     |  2 +-
 .../api/mlcontext/MLContextConversionUtil.java  |  2 +-
 .../sysml/api/mlcontext/MLContextUtil.java      |  2 +-
 .../spark/utils/FrameRDDConverterUtils.java     |  4 +-
 .../spark/utils/RDDConverterUtils.java          | 10 +--
 .../spark/utils/RDDConverterUtilsExt.java       | 69 +-------------------
 .../sysml/api/ml/LogisticRegression.scala       |  4 +-
 .../DataFrameVectorFrameConversionTest.java     |  4 +-
 .../mlcontext/DataFrameVectorScriptTest.java    |  4 +-
 .../integration/mlcontext/MLContextTest.java    |  6 +-
 .../sysml/api/ml/LogisticRegressionSuite.scala  |  6 +-
 11 files changed, 25 insertions(+), 88 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/578e595f/src/main/java/org/apache/sysml/api/MLOutput.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/api/MLOutput.java b/src/main/java/org/apache/sysml/api/MLOutput.java
index 6acca68..08a9a00 100644
--- a/src/main/java/org/apache/sysml/api/MLOutput.java
+++ b/src/main/java/org/apache/sysml/api/MLOutput.java
@@ -108,7 +108,7 @@ public class MLOutput {
 	 * Obtain the DataFrame
 	 * @param sqlContext the SQLContext
 	 * @param varName the variable name
-	 * @param outputVector if true, returns DataFrame with two column: ID and org.apache.spark.mllib.linalg.Vector
+	 * @param outputVector if true, returns DataFrame with two column: ID and org.apache.spark.ml.linalg.Vector
 	 * @return the DataFrame
 	 * @throws DMLRuntimeException if DMLRuntimeException occurs
 	 */

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/578e595f/src/main/java/org/apache/sysml/api/mlcontext/MLContextConversionUtil.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/api/mlcontext/MLContextConversionUtil.java b/src/main/java/org/apache/sysml/api/mlcontext/MLContextConversionUtil.java
index ca853ef..cca9d2c 100644
--- a/src/main/java/org/apache/sysml/api/mlcontext/MLContextConversionUtil.java
+++ b/src/main/java/org/apache/sysml/api/mlcontext/MLContextConversionUtil.java
@@ -33,7 +33,7 @@ import org.apache.spark.SparkContext;
 import org.apache.spark.api.java.JavaPairRDD;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.mllib.linalg.VectorUDT;
+import org.apache.spark.ml.linalg.VectorUDT;
 import org.apache.spark.rdd.RDD;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Row;
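
The remaining hunks are the same mechanical import swap. As a point of reference only (not part of this patch; Spark 2.x is assumed, and the class and column names below are made up for illustration), an ID-plus-vector DataFrame of the kind these converters produce can be built directly against the new org.apache.spark.ml.linalg types:

import java.util.Arrays;
import java.util.List;

import org.apache.spark.ml.linalg.VectorUDT;
import org.apache.spark.ml.linalg.Vectors;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

public class MlVectorDataFrameSketch {
	public static void main(String[] args) {
		SparkSession spark = SparkSession.builder()
			.appName("MlVectorDataFrameSketch").master("local").getOrCreate();

		// ID column plus one vector column; new VectorUDT() now resolves to
		// org.apache.spark.ml.linalg.VectorUDT instead of the mllib variant
		StructType schema = new StructType(new StructField[] {
			DataTypes.createStructField("ID", DataTypes.DoubleType, false),
			DataTypes.createStructField("C1", new VectorUDT(), true) });

		List<Row> rows = Arrays.asList(
			RowFactory.create(1.0, Vectors.dense(1.2, 3.4)),
			RowFactory.create(2.0, Vectors.dense(5.6, 7.8)));

		// the resulting DataFrame carries ml.linalg.Vector values in column C1
		Dataset<Row> df = spark.createDataFrame(rows, schema);
		df.show();
		spark.stop();
	}
}

Aside from the column type, this is unchanged from the mllib-based code, which is why the hunks below touch only import statements.
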
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/578e595f/src/main/java/org/apache/sysml/api/mlcontext/MLContextUtil.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/api/mlcontext/MLContextUtil.java b/src/main/java/org/apache/sysml/api/mlcontext/MLContextUtil.java
index 75e9c1e..9553acb 100644
--- a/src/main/java/org/apache/sysml/api/mlcontext/MLContextUtil.java
+++ b/src/main/java/org/apache/sysml/api/mlcontext/MLContextUtil.java
@@ -39,7 +39,7 @@ import org.apache.spark.SparkContext;
 import org.apache.spark.api.java.JavaPairRDD;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.mllib.linalg.VectorUDT;
+import org.apache.spark.ml.linalg.VectorUDT;
 import org.apache.spark.rdd.RDD;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Row;

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/578e595f/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/FrameRDDConverterUtils.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/FrameRDDConverterUtils.java b/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/FrameRDDConverterUtils.java
index 3196f09..ae3b686 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/FrameRDDConverterUtils.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/FrameRDDConverterUtils.java
@@ -37,8 +37,8 @@ import org.apache.spark.api.java.function.FlatMapFunction;
 import org.apache.spark.api.java.function.Function;
 import org.apache.spark.api.java.function.PairFlatMapFunction;
 import org.apache.spark.api.java.function.PairFunction;
-import org.apache.spark.mllib.linalg.Vector;
-import org.apache.spark.mllib.linalg.VectorUDT;
+import org.apache.spark.ml.linalg.Vector;
+import org.apache.spark.ml.linalg.VectorUDT;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Row;
 import org.apache.spark.sql.RowFactory;

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/578e595f/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDConverterUtils.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDConverterUtils.java b/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDConverterUtils.java
index 356d16f..b5a4b58 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDConverterUtils.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDConverterUtils.java
@@ -35,11 +35,11 @@ import org.apache.spark.api.java.function.FlatMapFunction;
 import org.apache.spark.api.java.function.Function;
 import org.apache.spark.api.java.function.PairFlatMapFunction;
 import org.apache.spark.api.java.function.PairFunction;
-import org.apache.spark.mllib.linalg.DenseVector;
-import org.apache.spark.mllib.linalg.Vector;
-import org.apache.spark.mllib.linalg.VectorUDT;
-import org.apache.spark.mllib.linalg.Vectors;
-import org.apache.spark.mllib.regression.LabeledPoint;
+import org.apache.spark.ml.linalg.DenseVector;
+import org.apache.spark.ml.linalg.Vector;
+import org.apache.spark.ml.linalg.VectorUDT;
+import org.apache.spark.ml.linalg.Vectors;
+import org.apache.spark.ml.feature.LabeledPoint;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Row;
 import org.apache.spark.sql.RowFactory;

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/578e595f/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDConverterUtilsExt.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDConverterUtilsExt.java b/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDConverterUtilsExt.java
index f18c0a9..e0d347f 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDConverterUtilsExt.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/spark/utils/RDDConverterUtilsExt.java
@@ -35,9 +35,9 @@ import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.api.java.function.Function;
 import org.apache.spark.api.java.function.PairFlatMapFunction;
-import org.apache.spark.mllib.linalg.Vector;
-import org.apache.spark.mllib.linalg.VectorUDT;
-import org.apache.spark.mllib.linalg.Vectors;
+import org.apache.spark.ml.linalg.Vector;
+import org.apache.spark.ml.linalg.VectorUDT;
+import org.apache.spark.ml.linalg.Vectors;
 import org.apache.spark.mllib.linalg.distributed.CoordinateMatrix;
 import org.apache.spark.mllib.linalg.distributed.MatrixEntry;
 import org.apache.spark.sql.Dataset;
@@ -128,69 +128,6 @@ public class RDDConverterUtilsExt
 		return coordinateMatrixToBinaryBlock(new JavaSparkContext(sc), input, mcIn, true);
 	}
 
-	public static Dataset<Row> stringDataFrameToVectorDataFrame(SQLContext sqlContext, Dataset<Row> inputDF)
-			throws DMLRuntimeException {
-
-		StructField[] oldSchema = inputDF.schema().fields();
-		//create the new schema
-		StructField[] newSchema = new StructField[oldSchema.length];
-		for(int i = 0; i < oldSchema.length; i++) {
-			String colName = oldSchema[i].name();
-			newSchema[i] = DataTypes.createStructField(colName, new VectorUDT(), true);
-		}
-
-		//converter
-		class StringToVector implements Function<Tuple2<Row, Long>, Row> {
-			private static final long serialVersionUID = -4733816995375745659L;
-			@Override
-			public Row call(Tuple2<Row, Long> arg0) throws Exception {
-				Row oldRow = arg0._1;
-				int oldNumCols = oldRow.length();
-				if (oldNumCols > 1) {
-					throw new DMLRuntimeException("The row must have at most one column");
-				}
-
-				// parse the various strings. i.e
-				// ((1.2,4.3, 3.4)) or (1.2, 3.4, 2.2) or (1.2 3.4)
-				// [[1.2,34.3, 1.2, 1.2]] or [1.2, 3.4] or [1.3 1.2]
-				Object [] fields = new Object[oldNumCols];
-				ArrayList<Object> fieldsArr = new ArrayList<Object>();
-				for (int i = 0; i < oldRow.length(); i++) {
-					Object ci=oldRow.get(i);
-					if (ci instanceof String) {
-						String cis = (String)ci;
-						StringBuffer sb = new StringBuffer(cis.trim());
-						for (int nid=0; i < 2; i++) { //remove two level nesting
-							if ((sb.charAt(0) == '(' && sb.charAt(sb.length() - 1) == ')') ||
-								(sb.charAt(0) == '[' && sb.charAt(sb.length() - 1) == ']')
-								) {
-								sb.deleteCharAt(0);
-								sb.setLength(sb.length() - 1);
-							}
-						}
-						//have the replace code
-						String ncis = "[" + sb.toString().replaceAll(" *, *", ",") + "]";
-						Vector v = Vectors.parse(ncis);
-						fieldsArr.add(v);
-					} else {
-						throw new DMLRuntimeException("Only String is supported");
-					}
-				}
-				Row row = RowFactory.create(fieldsArr.toArray());
-				return row;
-			}
-		}
-
-		//output DF
-		JavaRDD<Row> newRows = inputDF.rdd().toJavaRDD().zipWithIndex().map(new StringToVector());
-		// DataFrame outDF = sqlContext.createDataFrame(newRows, new StructType(newSchema)); //TODO investigate why it doesn't work
-		Dataset<Row> outDF = sqlContext.createDataFrame(newRows.rdd(),
-				DataTypes.createStructType(newSchema));
-
-		return outDF;
-	}
-
 	public static Dataset<Row> projectColumns(Dataset<Row> df, ArrayList<String> columns) throws DMLRuntimeException {
 		ArrayList<String> columnToSelect = new ArrayList<String>();
 		for(int i = 1; i < columns.size(); i++) {

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/578e595f/src/main/scala/org/apache/sysml/api/ml/LogisticRegression.scala
----------------------------------------------------------------------
diff --git a/src/main/scala/org/apache/sysml/api/ml/LogisticRegression.scala b/src/main/scala/org/apache/sysml/api/ml/LogisticRegression.scala
index 18eadec..c0e3f35 100644
--- a/src/main/scala/org/apache/sysml/api/ml/LogisticRegression.scala
+++ b/src/main/scala/org/apache/sysml/api/ml/LogisticRegression.scala
@@ -112,8 +112,8 @@ class LogisticRegressionModel(override val uid: String)(
 object LogisticRegressionExample {
   import org.apache.spark.{ SparkConf, SparkContext }
   import org.apache.spark.sql.types._
-  import org.apache.spark.mllib.linalg.Vectors
-  import org.apache.spark.mllib.regression.LabeledPoint
+  import org.apache.spark.ml.linalg.Vectors
+  import org.apache.spark.ml.feature.LabeledPoint
 
   def main(args: Array[String]) = {
     val sparkConf: SparkConf = new SparkConf();

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/578e595f/src/test/java/org/apache/sysml/test/integration/functions/mlcontext/DataFrameVectorFrameConversionTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/integration/functions/mlcontext/DataFrameVectorFrameConversionTest.java b/src/test/java/org/apache/sysml/test/integration/functions/mlcontext/DataFrameVectorFrameConversionTest.java
index b152b58..c6d2251 100644
--- a/src/test/java/org/apache/sysml/test/integration/functions/mlcontext/DataFrameVectorFrameConversionTest.java
+++ b/src/test/java/org/apache/sysml/test/integration/functions/mlcontext/DataFrameVectorFrameConversionTest.java
@@ -25,8 +25,8 @@ import java.util.List;
 import org.apache.spark.api.java.JavaPairRDD;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.mllib.linalg.DenseVector;
-import org.apache.spark.mllib.linalg.VectorUDT;
+import org.apache.spark.ml.linalg.DenseVector;
+import org.apache.spark.ml.linalg.VectorUDT;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Row;
 import org.apache.spark.sql.RowFactory;

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/578e595f/src/test/java/org/apache/sysml/test/integration/functions/mlcontext/DataFrameVectorScriptTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/integration/functions/mlcontext/DataFrameVectorScriptTest.java b/src/test/java/org/apache/sysml/test/integration/functions/mlcontext/DataFrameVectorScriptTest.java
index 6ab0fd0..14ed4b7 100644
--- a/src/test/java/org/apache/sysml/test/integration/functions/mlcontext/DataFrameVectorScriptTest.java
+++ b/src/test/java/org/apache/sysml/test/integration/functions/mlcontext/DataFrameVectorScriptTest.java
@@ -27,8 +27,8 @@ import java.util.List;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.mllib.linalg.DenseVector;
-import org.apache.spark.mllib.linalg.VectorUDT;
+import org.apache.spark.ml.linalg.DenseVector;
+import org.apache.spark.ml.linalg.VectorUDT;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Row;
 import org.apache.spark.sql.RowFactory;

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/578e595f/src/test/java/org/apache/sysml/test/integration/mlcontext/MLContextTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/integration/mlcontext/MLContextTest.java b/src/test/java/org/apache/sysml/test/integration/mlcontext/MLContextTest.java
index 89241c5..2241ad1 100644
--- a/src/test/java/org/apache/sysml/test/integration/mlcontext/MLContextTest.java
+++ b/src/test/java/org/apache/sysml/test/integration/mlcontext/MLContextTest.java
@@ -46,9 +46,9 @@ import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.api.java.function.Function;
-import org.apache.spark.mllib.linalg.Vector;
-import org.apache.spark.mllib.linalg.VectorUDT;
-import org.apache.spark.mllib.linalg.Vectors;
+import org.apache.spark.ml.linalg.Vector;
+import org.apache.spark.ml.linalg.VectorUDT;
+import org.apache.spark.ml.linalg.Vectors;
 import org.apache.spark.rdd.RDD;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Row;

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/578e595f/src/test/scala/org/apache/sysml/api/ml/LogisticRegressionSuite.scala
----------------------------------------------------------------------
diff --git a/src/test/scala/org/apache/sysml/api/ml/LogisticRegressionSuite.scala b/src/test/scala/org/apache/sysml/api/ml/LogisticRegressionSuite.scala
index 068db91..555d0a2 100644
--- a/src/test/scala/org/apache/sysml/api/ml/LogisticRegressionSuite.scala
+++ b/src/test/scala/org/apache/sysml/api/ml/LogisticRegressionSuite.scala
@@ -21,13 +21,13 @@ package org.apache.sysml.api.ml
 
 import org.scalatest.FunSuite
 import org.scalatest.Matchers
-import org.apache.spark.mllib.linalg.Vectors
-import org.apache.spark.mllib.regression.LabeledPoint
+import org.apache.spark.ml.linalg.Vectors
+import org.apache.spark.ml.feature.LabeledPoint
 import org.apache.spark.ml.Pipeline
 import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
 import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
 import org.apache.spark.ml.tuning.{ParamGridBuilder, CrossValidator}
-import org.apache.spark.mllib.linalg.Vector
+import org.apache.spark.ml.linalg.Vector
 import scala.reflect.runtime.universe._
 
 case class LabeledDocument[T:TypeTag](id: Long, text: String, label: Double)
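
For the LabeledPoint half of the migration, a similarly hypothetical sketch (again assuming Spark 2.x; the class name is made up for illustration). The constructor and accessors are unchanged; only the package moves from mllib.regression to ml.feature, and the wrapped vector type becomes org.apache.spark.ml.linalg.Vector:

import org.apache.spark.ml.feature.LabeledPoint;
import org.apache.spark.ml.linalg.Vector;
import org.apache.spark.ml.linalg.Vectors;

public class MlLabeledPointSketch {
	public static void main(String[] args) {
		// dense and sparse feature vectors from the new ml.linalg package
		Vector dense = Vectors.dense(1.0, 0.0, 3.0);
		Vector sparse = Vectors.sparse(3, new int[] { 0, 2 }, new double[] { 1.0, 3.0 });

		// LabeledPoint moved from mllib.regression to ml.feature; it is still
		// a (label, features) pair with the same accessors
		LabeledPoint pos = new LabeledPoint(1.0, dense);
		LabeledPoint neg = new LabeledPoint(0.0, sparse);

		System.out.println(pos.label() + " -> " + pos.features());
		System.out.println(neg.label() + " -> " + neg.features());
	}
}

Note that the ml and mllib vector classes are distinct types, which is presumably why RDDConverterUtilsExt.stringDataFrameToVectorDataFrame was removed above rather than migrated: it relied on mllib's Vectors.parse, and ml.linalg.Vectors does not offer a parse method.
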
