spark git commit: [SPARK-16240][ML] ML persistence backward compatibility for LDA - 2.0 backport

2016-09-22 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/branch-2.0 22216d6bd -> 54d4eee51


[SPARK-16240][ML] ML persistence backward compatibility for LDA - 2.0 backport

## What changes were proposed in this pull request?

Allow Spark 2.x to load instances of LDA, LocalLDAModel, and 
DistributedLDAModel saved from Spark 1.6.
Backport of https://github.com/apache/spark/pull/15034 for branch-2.0

## How was this patch tested?

I tested this manually, saving the 3 types from 1.6 and loading them into 
master (2.x).  In the future, we can add generic tests for testing backwards 
compatibility across all ML models in SPARK-15573.

Author: Gayathri Murali 
Author: Joseph K. Bradley 

Closes #15205 from jkbradley/lda-backward-2.0.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/54d4eee5
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/54d4eee5
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/54d4eee5

Branch: refs/heads/branch-2.0
Commit: 54d4eee51eca364d9334141f62e0478343345d06
Parents: 22216d6
Author: Gayathri Murali 
Authored: Thu Sep 22 22:44:20 2016 -0700
Committer: Joseph K. Bradley 
Committed: Thu Sep 22 22:44:20 2016 -0700

--
 .../org/apache/spark/ml/clustering/LDA.scala | 86 
 project/MimaExcludes.scala                   |  3 +
 2 files changed, 72 insertions(+), 17 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/54d4eee5/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala 
b/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala
index 034f2c3..8e23325 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala
@@ -18,6 +18,9 @@
 package org.apache.spark.ml.clustering
 
 import org.apache.hadoop.fs.Path
+import org.json4s.DefaultFormats
+import org.json4s.JsonAST.JObject
+import org.json4s.jackson.JsonMethods._
 
 import org.apache.spark.annotation.{DeveloperApi, Experimental, Since}
 import org.apache.spark.internal.Logging
@@ -26,19 +29,21 @@ import org.apache.spark.ml.linalg.{Matrix, Vector, Vectors, 
VectorUDT}
 import org.apache.spark.ml.param._
 import org.apache.spark.ml.param.shared.{HasCheckpointInterval, 
HasFeaturesCol, HasMaxIter, HasSeed}
 import org.apache.spark.ml.util._
+import org.apache.spark.ml.util.DefaultParamsReader.Metadata
 import org.apache.spark.mllib.clustering.{DistributedLDAModel => 
OldDistributedLDAModel,
   EMLDAOptimizer => OldEMLDAOptimizer, LDA => OldLDA, LDAModel => OldLDAModel,
   LDAOptimizer => OldLDAOptimizer, LocalLDAModel => OldLocalLDAModel,
   OnlineLDAOptimizer => OldOnlineLDAOptimizer}
 import org.apache.spark.mllib.impl.PeriodicCheckpointer
-import org.apache.spark.mllib.linalg.{Matrices => OldMatrices, Vector => 
OldVector,
-  Vectors => OldVectors}
+import org.apache.spark.mllib.linalg.{Vector => OldVector, Vectors => 
OldVectors}
 import org.apache.spark.mllib.linalg.MatrixImplicits._
 import org.apache.spark.mllib.linalg.VectorImplicits._
+import org.apache.spark.mllib.util.MLUtils
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
 import org.apache.spark.sql.functions.{col, monotonically_increasing_id, udf}
 import org.apache.spark.sql.types.StructType
+import org.apache.spark.util.VersionUtils
 
 
 private[clustering] trait LDAParams extends Params with HasFeaturesCol with 
HasMaxIter
@@ -80,6 +85,7 @@ private[clustering] trait LDAParams extends Params with 
HasFeaturesCol with HasM
    * - Values should be >= 0
    * - default = uniformly (1.0 / k), following the implementation from
    *   [[https://github.com/Blei-Lab/onlineldavb]].
+   *
    * @group param
    */
   @Since("1.6.0")
@@ -121,6 +127,7 @@ private[clustering] trait LDAParams extends Params with 
HasFeaturesCol with HasM
    * - Value should be >= 0
    * - default = (1.0 / k), following the implementation from
    *   [[https://github.com/Blei-Lab/onlineldavb]].
+   *
    * @group param
    */
   @Since("1.6.0")
@@ -354,6 +361,39 @@ private[clustering] trait LDAParams extends Params with 
HasFeaturesCol with HasM
   }
 }
 
+private object LDAParams {
+
+  /**
+   * Equivalent to [[DefaultParamsReader.getAndSetParams()]], but handles 
[[LDA]] and [[LDAModel]]
+   * formats saved with Spark 1.6, which differ from the formats in Spark 2.0+.
+   *
+   * @param model [[LDA]] or [[LDAModel]] instance.  This instance will be 
modified with
+   * [[Param]] values extracted from metadata.
+   * 

spark git commit: [SPARK-16240][ML] ML persistence backward compatibility for LDA

2016-09-22 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/master 0d6348750 -> f4f6bd8c9


[SPARK-16240][ML] ML persistence backward compatibility for LDA

## What changes were proposed in this pull request?

Allow Spark 2.x to load instances of LDA, LocalLDAModel, and 
DistributedLDAModel saved from Spark 1.6.

## How was this patch tested?

I tested this manually, saving the 3 types from 1.6 and loading them into 
master (2.x).  In the future, we can add generic tests for testing backwards 
compatibility across all ML models in SPARK-15573.

Author: Joseph K. Bradley 

Closes #15034 from jkbradley/lda-backwards.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f4f6bd8c
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f4f6bd8c
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f4f6bd8c

Branch: refs/heads/master
Commit: f4f6bd8c9884e3919509907307fda774f56b5ecc
Parents: 0d63487
Author: Gayathri Murali 
Authored: Thu Sep 22 16:34:42 2016 -0700
Committer: Joseph K. Bradley 
Committed: Thu Sep 22 16:34:42 2016 -0700

--
 .../org/apache/spark/ml/clustering/LDA.scala | 86 
 project/MimaExcludes.scala                   |  4 +-
 2 files changed, 72 insertions(+), 18 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/f4f6bd8c/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala 
b/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala
index b5a764b..7773802 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala
@@ -18,6 +18,9 @@
 package org.apache.spark.ml.clustering
 
 import org.apache.hadoop.fs.Path
+import org.json4s.DefaultFormats
+import org.json4s.JsonAST.JObject
+import org.json4s.jackson.JsonMethods._
 
 import org.apache.spark.annotation.{DeveloperApi, Experimental, Since}
 import org.apache.spark.internal.Logging
@@ -26,19 +29,21 @@ import org.apache.spark.ml.linalg.{Matrix, Vector, Vectors, 
VectorUDT}
 import org.apache.spark.ml.param._
 import org.apache.spark.ml.param.shared.{HasCheckpointInterval, 
HasFeaturesCol, HasMaxIter, HasSeed}
 import org.apache.spark.ml.util._
+import org.apache.spark.ml.util.DefaultParamsReader.Metadata
 import org.apache.spark.mllib.clustering.{DistributedLDAModel => 
OldDistributedLDAModel,
   EMLDAOptimizer => OldEMLDAOptimizer, LDA => OldLDA, LDAModel => OldLDAModel,
   LDAOptimizer => OldLDAOptimizer, LocalLDAModel => OldLocalLDAModel,
   OnlineLDAOptimizer => OldOnlineLDAOptimizer}
 import org.apache.spark.mllib.impl.PeriodicCheckpointer
-import org.apache.spark.mllib.linalg.{Matrices => OldMatrices, Vector => 
OldVector,
-  Vectors => OldVectors}
+import org.apache.spark.mllib.linalg.{Vector => OldVector, Vectors => 
OldVectors}
 import org.apache.spark.mllib.linalg.MatrixImplicits._
 import org.apache.spark.mllib.linalg.VectorImplicits._
+import org.apache.spark.mllib.util.MLUtils
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
 import org.apache.spark.sql.functions.{col, monotonically_increasing_id, udf}
 import org.apache.spark.sql.types.StructType
+import org.apache.spark.util.VersionUtils
 
 
 private[clustering] trait LDAParams extends Params with HasFeaturesCol with 
HasMaxIter
@@ -80,6 +85,7 @@ private[clustering] trait LDAParams extends Params with 
HasFeaturesCol with HasM
    * - Values should be >= 0
    * - default = uniformly (1.0 / k), following the implementation from
    *   [[https://github.com/Blei-Lab/onlineldavb]].
+   *
    * @group param
    */
   @Since("1.6.0")
@@ -121,6 +127,7 @@ private[clustering] trait LDAParams extends Params with 
HasFeaturesCol with HasM
    * - Value should be >= 0
    * - default = (1.0 / k), following the implementation from
    *   [[https://github.com/Blei-Lab/onlineldavb]].
+   *
    * @group param
    */
   @Since("1.6.0")
@@ -354,6 +361,39 @@ private[clustering] trait LDAParams extends Params with 
HasFeaturesCol with HasM
   }
 }
 
+private object LDAParams {
+
+  /**
+   * Equivalent to [[DefaultParamsReader.getAndSetParams()]], but handles 
[[LDA]] and [[LDAModel]]
+   * formats saved with Spark 1.6, which differ from the formats in Spark 2.0+.
+   *
+   * @param model [[LDA]] or [[LDAModel]] instance.  This instance will be 
modified with
+   * [[Param]] values extracted from metadata.
+   * @param metadata Loaded model metadata
+   */
+  def getAndSetParams(model: LDAParams, metadata: Metadata): Unit = {
+