spark git commit: [SPARK-9316] [SPARKR] Add support for filtering using `[` (synonym for filter / select)

2015-08-25 Thread shivaram
Repository: spark
Updated Branches:
  refs/heads/branch-1.5 21a10a86d -> 5220db9e3


[SPARK-9316] [SPARKR] Add support for filtering using `[` (synonym for filter / select)

Add support for
```
   df[df$name == "Smith", c(1,2)]
   df[df$age %in% c(19, 30), 1:2]
```
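
For orientation, here is a rough Scala sketch of what the new R `[` method composes on the underlying DataFrame: a row filter followed by a column projection. The SparkContext setup, the `people.json` path, and the column names below are illustrative assumptions, not part of this patch.

```
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext

object SubsetSketch {
  def main(args: Array[String]): Unit = {
    // Local context and sample file are assumptions made for the sketch.
    val sc = new SparkContext(new SparkConf().setAppName("subset-sketch").setMaster("local[*]"))
    val sqlContext = new SQLContext(sc)
    val df = sqlContext.read.json("people.json").select("name", "age")

    // R: df[df$age %in% c(19, 30), 1:2]  ~  filter on a Column, then select columns
    val subset = df.filter(df("age").isin(19, 30)).select("name", "age")
    subset.show()

    sc.stop()
  }
}
```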

shivaram

Author: felixcheung 

Closes #8394 from felixcheung/rsubset.

(cherry picked from commit 75d4773aa50e24972c533e8b48697fde586429eb)
Signed-off-by: Shivaram Venkataraman 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5220db9e
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5220db9e
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5220db9e

Branch: refs/heads/branch-1.5
Commit: 5220db9e352b5d5eae59cead9478ca0a9f73f16b
Parents: 21a10a8
Author: felixcheung 
Authored: Tue Aug 25 23:48:16 2015 -0700
Committer: Shivaram Venkataraman 
Committed: Tue Aug 25 23:48:27 2015 -0700

--
 R/pkg/R/DataFrame.R  | 22 +-
 R/pkg/inst/tests/test_sparkSQL.R | 27 +++
 2 files changed, 48 insertions(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/5220db9e/R/pkg/R/DataFrame.R
--
diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index 10f3c4e..1d870ec 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -954,9 +954,11 @@ setMethod("$<-", signature(x = "DataFrame"),
 x
   })
 
+setClassUnion("numericOrcharacter", c("numeric", "character"))
+
 #' @rdname select
 #' @name [[
-setMethod("[[", signature(x = "DataFrame"),
+setMethod("[[", signature(x = "DataFrame", i = "numericOrcharacter"),
   function(x, i) {
 if (is.numeric(i)) {
   cols <- columns(x)
@@ -979,6 +981,20 @@ setMethod("[", signature(x = "DataFrame", i = "missing"),
 select(x, j)
   })
 
+#' @rdname select
+#' @name [
+setMethod("[", signature(x = "DataFrame", i = "Column"),
+  function(x, i, j, ...) {
+# It could handle i as "character" but it seems confusing and not required
+# https://stat.ethz.ch/R-manual/R-devel/library/base/html/Extract.data.frame.html
+filtered <- filter(x, i)
+if (!missing(j)) {
+  filtered[, j]
+} else {
+  filtered
+}
+  })
+
 #' Select
 #'
 #' Selects a set of columns with names or Column expressions.
@@ -997,8 +1013,12 @@ setMethod("[", signature(x = "DataFrame", i = "missing"),
 #'   # Columns can also be selected using `[[` and `[`
 #'   df[[2]] == df[["age"]]
 #'   df[,2] == df[,"age"]
+#'   df[,c("name", "age")]
 #'   # Similar to R data frames columns can also be selected using `$`
 #'   df$age
+#'   # It can also be subset on rows and Columns
+#'   df[df$name == "Smith", c(1,2)]
+#'   df[df$age %in% c(19, 30), 1:2]
 #' }
 setMethod("select", signature(x = "DataFrame", col = "character"),
   function(x, col, ...) {

http://git-wip-us.apache.org/repos/asf/spark/blob/5220db9e/R/pkg/inst/tests/test_sparkSQL.R
--
diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R
index 556b8c5..ee48a3d 100644
--- a/R/pkg/inst/tests/test_sparkSQL.R
+++ b/R/pkg/inst/tests/test_sparkSQL.R
@@ -587,6 +587,33 @@ test_that("select with column", {
   expect_equal(collect(select(df3, "x"))[[1, 1]], "x")
 })
 
+test_that("subsetting", {
+  # jsonFile returns columns in random order
+  df <- select(jsonFile(sqlContext, jsonPath), "name", "age")
+  filtered <- df[df$age > 20,]
+  expect_equal(count(filtered), 1)
+  expect_equal(columns(filtered), c("name", "age"))
+  expect_equal(collect(filtered)$name, "Andy")
+
+  df2 <- df[df$age == 19, 1]
+  expect_is(df2, "DataFrame")
+  expect_equal(count(df2), 1)
+  expect_equal(columns(df2), c("name"))
+  expect_equal(collect(df2)$name, "Justin")
+
+  df3 <- df[df$age > 20, 2]
+  expect_equal(count(df3), 1)
+  expect_equal(columns(df3), c("age"))
+
+  df4 <- df[df$age %in% c(19, 30), 1:2]
+  expect_equal(count(df4), 2)
+  expect_equal(columns(df4), c("name", "age"))
+  
+  df5 <- df[df$age %in% c(19), c(1,2)]
+  expect_equal(count(df5), 1)
+  expect_equal(columns(df5), c("name", "age"))
+})
+
 test_that("selectExpr() on a DataFrame", {
   df <- jsonFile(sqlContext, jsonPath)
   selected <- selectExpr(df, "age * 2")





spark git commit: [SPARK-9316] [SPARKR] Add support for filtering using `[` (synonym for filter / select)

2015-08-25 Thread shivaram
Repository: spark
Updated Branches:
  refs/heads/master 321d77596 -> 75d4773aa


[SPARK-9316] [SPARKR] Add support for filtering using `[` (synonym for filter / select)

Add support for
```
   df[df$name == "Smith", c(1,2)]
   df[df$age %in% c(19, 30), 1:2]
```

shivaram

Author: felixcheung 

Closes #8394 from felixcheung/rsubset.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/75d4773a
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/75d4773a
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/75d4773a

Branch: refs/heads/master
Commit: 75d4773aa50e24972c533e8b48697fde586429eb
Parents: 321d775
Author: felixcheung 
Authored: Tue Aug 25 23:48:16 2015 -0700
Committer: Shivaram Venkataraman 
Committed: Tue Aug 25 23:48:16 2015 -0700

--
 R/pkg/R/DataFrame.R  | 22 +-
 R/pkg/inst/tests/test_sparkSQL.R | 27 +++
 2 files changed, 48 insertions(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/75d4773a/R/pkg/R/DataFrame.R
--
diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index ae1d912..a5162de 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -985,9 +985,11 @@ setMethod("$<-", signature(x = "DataFrame"),
 x
   })
 
+setClassUnion("numericOrcharacter", c("numeric", "character"))
+
 #' @rdname select
 #' @name [[
-setMethod("[[", signature(x = "DataFrame"),
+setMethod("[[", signature(x = "DataFrame", i = "numericOrcharacter"),
   function(x, i) {
 if (is.numeric(i)) {
   cols <- columns(x)
@@ -1010,6 +1012,20 @@ setMethod("[", signature(x = "DataFrame", i = "missing"),
 select(x, j)
   })
 
+#' @rdname select
+#' @name [
+setMethod("[", signature(x = "DataFrame", i = "Column"),
+  function(x, i, j, ...) {
+# It could handle i as "character" but it seems confusing and not required
+# https://stat.ethz.ch/R-manual/R-devel/library/base/html/Extract.data.frame.html
+filtered <- filter(x, i)
+if (!missing(j)) {
+  filtered[, j]
+} else {
+  filtered
+}
+  })
+
 #' Select
 #'
 #' Selects a set of columns with names or Column expressions.
@@ -1028,8 +1044,12 @@ setMethod("[", signature(x = "DataFrame", i = "missing"),
 #'   # Columns can also be selected using `[[` and `[`
 #'   df[[2]] == df[["age"]]
 #'   df[,2] == df[,"age"]
+#'   df[,c("name", "age")]
 #'   # Similar to R data frames columns can also be selected using `$`
 #'   df$age
+#'   # It can also be subset on rows and Columns
+#'   df[df$name == "Smith", c(1,2)]
+#'   df[df$age %in% c(19, 30), 1:2]
 #' }
 setMethod("select", signature(x = "DataFrame", col = "character"),
   function(x, col, ...) {

http://git-wip-us.apache.org/repos/asf/spark/blob/75d4773a/R/pkg/inst/tests/test_sparkSQL.R
--
diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R
index 556b8c5..ee48a3d 100644
--- a/R/pkg/inst/tests/test_sparkSQL.R
+++ b/R/pkg/inst/tests/test_sparkSQL.R
@@ -587,6 +587,33 @@ test_that("select with column", {
   expect_equal(collect(select(df3, "x"))[[1, 1]], "x")
 })
 
+test_that("subsetting", {
+  # jsonFile returns columns in random order
+  df <- select(jsonFile(sqlContext, jsonPath), "name", "age")
+  filtered <- df[df$age > 20,]
+  expect_equal(count(filtered), 1)
+  expect_equal(columns(filtered), c("name", "age"))
+  expect_equal(collect(filtered)$name, "Andy")
+
+  df2 <- df[df$age == 19, 1]
+  expect_is(df2, "DataFrame")
+  expect_equal(count(df2), 1)
+  expect_equal(columns(df2), c("name"))
+  expect_equal(collect(df2)$name, "Justin")
+
+  df3 <- df[df$age > 20, 2]
+  expect_equal(count(df3), 1)
+  expect_equal(columns(df3), c("age"))
+
+  df4 <- df[df$age %in% c(19, 30), 1:2]
+  expect_equal(count(df4), 2)
+  expect_equal(columns(df4), c("name", "age"))
+  
+  df5 <- df[df$age %in% c(19), c(1,2)]
+  expect_equal(count(df5), 1)
+  expect_equal(columns(df5), c("name", "age"))
+})
+
 test_that("selectExpr() on a DataFrame", {
   df <- jsonFile(sqlContext, jsonPath)
   selected <- selectExpr(df, "age * 2")





spark git commit: [SPARK-10236] [MLLIB] update since versions in mllib.feature

2015-08-25 Thread dbtsai
Repository: spark
Updated Branches:
  refs/heads/master 4657fa1f3 -> 321d77596


[SPARK-10236] [MLLIB] update since versions in mllib.feature

Same as #8421 but for `mllib.feature`.

cc dbtsai

Author: Xiangrui Meng 

Closes #8449 from mengxr/SPARK-10236.feature and squashes the following commits:

0e8d658 [Xiangrui Meng] remove unnecessary comment
ad70b03 [Xiangrui Meng] update since versions in mllib.feature
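
For readers skimming the diff below, the change is mechanical: `@Since` tags move onto primary constructors and their public parameters so the constructor-level API carries an explicit version. A minimal sketch of the pattern, assuming it is compiled inside the Spark source tree (these annotations are internal to `org.apache.spark`); the class name and versions are made up:

```
package org.apache.spark.mllib.feature

import org.apache.spark.annotation.{Experimental, Since}

@Since("1.3.0")
@Experimental
class ExampleSelector @Since("1.3.0") (        // the constructor itself is versioned
    @Since("1.3.0") val numTopFeatures: Int)   // and so is each public constructor val
  extends Serializable {

  /** Individual public members keep their own tags. */
  @Since("1.4.0")
  def describe(): String = s"selects top $numTopFeatures features"
}
```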


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/321d7759
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/321d7759
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/321d7759

Branch: refs/heads/master
Commit: 321d7759691bed9867b1f0470f12eab2faa50aff
Parents: 4657fa1
Author: Xiangrui Meng 
Authored: Tue Aug 25 23:45:41 2015 -0700
Committer: DB Tsai 
Committed: Tue Aug 25 23:45:41 2015 -0700

--
 .../mllib/clustering/PowerIterationClustering.scala |  2 --
 .../org/apache/spark/mllib/feature/ChiSqSelector.scala  |  4 ++--
 .../apache/spark/mllib/feature/ElementwiseProduct.scala |  3 ++-
 .../main/scala/org/apache/spark/mllib/feature/IDF.scala |  6 --
 .../org/apache/spark/mllib/feature/Normalizer.scala |  2 +-
 .../main/scala/org/apache/spark/mllib/feature/PCA.scala |  7 +--
 .../org/apache/spark/mllib/feature/StandardScaler.scala | 12 ++--
 .../scala/org/apache/spark/mllib/feature/Word2Vec.scala |  1 +
 8 files changed, 21 insertions(+), 16 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/321d7759/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala
index da234bd..6c76e26 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala
@@ -71,8 +71,6 @@ object PowerIterationClusteringModel extends Loader[PowerIterationClusteringMode
 private[clustering]
 val thisClassName = "org.apache.spark.mllib.clustering.PowerIterationClusteringModel"
 
-/**
- */
 @Since("1.4.0")
 def save(sc: SparkContext, model: PowerIterationClusteringModel, path: String): Unit = {
   val sqlContext = new SQLContext(sc)

http://git-wip-us.apache.org/repos/asf/spark/blob/321d7759/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
index fdd974d..4743cfd 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
@@ -33,7 +33,7 @@ import org.apache.spark.rdd.RDD
  */
 @Since("1.3.0")
 @Experimental
-class ChiSqSelectorModel (
+class ChiSqSelectorModel @Since("1.3.0") (
   @Since("1.3.0") val selectedFeatures: Array[Int]) extends VectorTransformer {
 
   require(isSorted(selectedFeatures), "Array has to be sorted asc")
@@ -112,7 +112,7 @@ class ChiSqSelectorModel (
  */
 @Since("1.3.0")
 @Experimental
-class ChiSqSelector (
+class ChiSqSelector @Since("1.3.0") (
   @Since("1.3.0") val numTopFeatures: Int) extends Serializable {
 
   /**

http://git-wip-us.apache.org/repos/asf/spark/blob/321d7759/mllib/src/main/scala/org/apache/spark/mllib/feature/ElementwiseProduct.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/ElementwiseProduct.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/ElementwiseProduct.scala
index 33e2d17..d0a6cf6 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/ElementwiseProduct.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/ElementwiseProduct.scala
@@ -29,7 +29,8 @@ import org.apache.spark.mllib.linalg._
  */
 @Since("1.4.0")
 @Experimental
-class ElementwiseProduct(val scalingVec: Vector) extends VectorTransformer {
+class ElementwiseProduct @Since("1.4.0") (
+@Since("1.4.0") val scalingVec: Vector) extends VectorTransformer {
 
   /**
* Does the hadamard product transformation.

http://git-wip-us.apache.org/repos/asf/spark/blob/321d7759/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala
index d5353

spark git commit: [SPARK-10236] [MLLIB] update since versions in mllib.feature

2015-08-25 Thread dbtsai
Repository: spark
Updated Branches:
  refs/heads/branch-1.5 08d390f45 -> 21a10a86d


[SPARK-10236] [MLLIB] update since versions in mllib.feature

Same as #8421 but for `mllib.feature`.

cc dbtsai

Author: Xiangrui Meng 

Closes #8449 from mengxr/SPARK-10236.feature and squashes the following commits:

0e8d658 [Xiangrui Meng] remove unnecessary comment
ad70b03 [Xiangrui Meng] update since versions in mllib.feature

(cherry picked from commit 321d7759691bed9867b1f0470f12eab2faa50aff)
Signed-off-by: DB Tsai 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/21a10a86
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/21a10a86
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/21a10a86

Branch: refs/heads/branch-1.5
Commit: 21a10a86d20ec1a6fea42286b4d2aae9ce7e848d
Parents: 08d390f
Author: Xiangrui Meng 
Authored: Tue Aug 25 23:45:41 2015 -0700
Committer: DB Tsai 
Committed: Tue Aug 25 23:45:53 2015 -0700

--
 .../mllib/clustering/PowerIterationClustering.scala |  2 --
 .../org/apache/spark/mllib/feature/ChiSqSelector.scala  |  4 ++--
 .../apache/spark/mllib/feature/ElementwiseProduct.scala |  3 ++-
 .../main/scala/org/apache/spark/mllib/feature/IDF.scala |  6 --
 .../org/apache/spark/mllib/feature/Normalizer.scala |  2 +-
 .../main/scala/org/apache/spark/mllib/feature/PCA.scala |  7 +--
 .../org/apache/spark/mllib/feature/StandardScaler.scala | 12 ++--
 .../scala/org/apache/spark/mllib/feature/Word2Vec.scala |  1 +
 8 files changed, 21 insertions(+), 16 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/21a10a86/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala
index da234bd..6c76e26 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala
@@ -71,8 +71,6 @@ object PowerIterationClusteringModel extends Loader[PowerIterationClusteringMode
 private[clustering]
 val thisClassName = "org.apache.spark.mllib.clustering.PowerIterationClusteringModel"
 
-/**
- */
 @Since("1.4.0")
 def save(sc: SparkContext, model: PowerIterationClusteringModel, path: String): Unit = {
   val sqlContext = new SQLContext(sc)

http://git-wip-us.apache.org/repos/asf/spark/blob/21a10a86/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
index fdd974d..4743cfd 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
@@ -33,7 +33,7 @@ import org.apache.spark.rdd.RDD
  */
 @Since("1.3.0")
 @Experimental
-class ChiSqSelectorModel (
+class ChiSqSelectorModel @Since("1.3.0") (
   @Since("1.3.0") val selectedFeatures: Array[Int]) extends VectorTransformer {
 
   require(isSorted(selectedFeatures), "Array has to be sorted asc")
@@ -112,7 +112,7 @@ class ChiSqSelectorModel (
  */
 @Since("1.3.0")
 @Experimental
-class ChiSqSelector (
+class ChiSqSelector @Since("1.3.0") (
   @Since("1.3.0") val numTopFeatures: Int) extends Serializable {
 
   /**

http://git-wip-us.apache.org/repos/asf/spark/blob/21a10a86/mllib/src/main/scala/org/apache/spark/mllib/feature/ElementwiseProduct.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/ElementwiseProduct.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/ElementwiseProduct.scala
index 33e2d17..d0a6cf6 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/ElementwiseProduct.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/ElementwiseProduct.scala
@@ -29,7 +29,8 @@ import org.apache.spark.mllib.linalg._
  */
 @Since("1.4.0")
 @Experimental
-class ElementwiseProduct(val scalingVec: Vector) extends VectorTransformer {
+class ElementwiseProduct @Since("1.4.0") (
+@Since("1.4.0") val scalingVec: Vector) extends VectorTransformer {
 
   /**
* Does the hadamard product transformation.

http://git-wip-us.apache.org/repos/asf/spark/blob/21a10a86/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala
--
diff --git a/mllib/src/main/scala/org/apache/spar

spark git commit: [SPARK-10235] [MLLIB] update since versions in mllib.regression

2015-08-25 Thread dbtsai
Repository: spark
Updated Branches:
  refs/heads/master fb7e12fe2 -> 4657fa1f3


[SPARK-10235] [MLLIB] update since versions in mllib.regression

Same as #8421 but for `mllib.regression`.

cc freeman-lab dbtsai

Author: Xiangrui Meng 

Closes #8426 from mengxr/SPARK-10235 and squashes the following commits:

6cd28e4 [Xiangrui Meng] update since versions in mllib.regression


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4657fa1f
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4657fa1f
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4657fa1f

Branch: refs/heads/master
Commit: 4657fa1f37d41dd4c7240a960342b68c7c591f48
Parents: fb7e12f
Author: Xiangrui Meng 
Authored: Tue Aug 25 22:49:33 2015 -0700
Committer: DB Tsai 
Committed: Tue Aug 25 22:49:33 2015 -0700

--
 .../regression/GeneralizedLinearAlgorithm.scala |  6 --
 .../spark/mllib/regression/IsotonicRegression.scala | 16 +---
 .../spark/mllib/regression/LabeledPoint.scala   |  5 +++--
 .../org/apache/spark/mllib/regression/Lasso.scala   |  9 ++---
 .../spark/mllib/regression/LinearRegression.scala   |  9 ++---
 .../spark/mllib/regression/RidgeRegression.scala| 12 +++-
 .../mllib/regression/StreamingLinearAlgorithm.scala |  8 +++-
 .../StreamingLinearRegressionWithSGD.scala  | 11 +--
 8 files changed, 47 insertions(+), 29 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/4657fa1f/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala
 
b/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala
index 509f6a2..7e3b4d5 100644
--- 
a/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala
@@ -38,7 +38,9 @@ import org.apache.spark.storage.StorageLevel
  */
 @Since("0.8.0")
 @DeveloperApi
-abstract class GeneralizedLinearModel(val weights: Vector, val intercept: 
Double)
+abstract class GeneralizedLinearModel @Since("1.0.0") (
+@Since("1.0.0") val weights: Vector,
+@Since("0.8.0") val intercept: Double)
   extends Serializable {
 
   /**
@@ -107,7 +109,7 @@ abstract class GeneralizedLinearAlgorithm[M <: 
GeneralizedLinearModel]
* The optimizer to solve the problem.
*
*/
-  @Since("1.0.0")
+  @Since("0.8.0")
   def optimizer: Optimizer
 
   /** Whether to add intercept (default: false). */

http://git-wip-us.apache.org/repos/asf/spark/blob/4657fa1f/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala
 
b/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala
index 31ca7c2..877d31b 100644
--- 
a/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala
@@ -50,10 +50,10 @@ import org.apache.spark.sql.SQLContext
  */
 @Since("1.3.0")
 @Experimental
-class IsotonicRegressionModel (
-val boundaries: Array[Double],
-val predictions: Array[Double],
-val isotonic: Boolean) extends Serializable with Saveable {
+class IsotonicRegressionModel @Since("1.3.0") (
+@Since("1.3.0") val boundaries: Array[Double],
+@Since("1.3.0") val predictions: Array[Double],
+@Since("1.3.0") val isotonic: Boolean) extends Serializable with Saveable {
 
   private val predictionOrd = if (isotonic) Ordering[Double] else 
Ordering[Double].reverse
 
@@ -63,7 +63,6 @@ class IsotonicRegressionModel (
 
   /**
* A Java-friendly constructor that takes two Iterable parameters and one 
Boolean parameter.
-   *
*/
   @Since("1.4.0")
   def this(boundaries: java.lang.Iterable[Double],
@@ -214,8 +213,6 @@ object IsotonicRegressionModel extends 
Loader[IsotonicRegressionModel] {
 }
   }
 
-  /**
-   */
   @Since("1.4.0")
   override def load(sc: SparkContext, path: String): IsotonicRegressionModel = 
{
 implicit val formats = DefaultFormats
@@ -256,6 +253,7 @@ object IsotonicRegressionModel extends 
Loader[IsotonicRegressionModel] {
  * @see [[http://en.wikipedia.org/wiki/Isotonic_regression Isotonic regression 
(Wikipedia)]]
  */
 @Experimental
+@Since("1.3.0")
 class IsotonicRegression private (private var isotonic: Boolean) extends 
Serializable {
 
   /**
@@ -263,6 +261,7 @@ class IsotonicRegression private (private var isotonic: 
Boolean) extends Seriali
*
*

spark git commit: [SPARK-10235] [MLLIB] update since versions in mllib.regression

2015-08-25 Thread dbtsai
Repository: spark
Updated Branches:
  refs/heads/branch-1.5 6d8ebc801 -> 08d390f45


[SPARK-10235] [MLLIB] update since versions in mllib.regression

Same as #8421 but for `mllib.regression`.

cc freeman-lab dbtsai

Author: Xiangrui Meng 

Closes #8426 from mengxr/SPARK-10235 and squashes the following commits:

6cd28e4 [Xiangrui Meng] update since versions in mllib.regression

(cherry picked from commit 4657fa1f37d41dd4c7240a960342b68c7c591f48)
Signed-off-by: DB Tsai 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/08d390f4
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/08d390f4
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/08d390f4

Branch: refs/heads/branch-1.5
Commit: 08d390f457f80ffdc2dfce61ea579d9026047f12
Parents: 6d8ebc8
Author: Xiangrui Meng 
Authored: Tue Aug 25 22:49:33 2015 -0700
Committer: DB Tsai 
Committed: Tue Aug 25 22:49:46 2015 -0700

--
 .../regression/GeneralizedLinearAlgorithm.scala |  6 --
 .../spark/mllib/regression/IsotonicRegression.scala | 16 +---
 .../spark/mllib/regression/LabeledPoint.scala   |  5 +++--
 .../org/apache/spark/mllib/regression/Lasso.scala   |  9 ++---
 .../spark/mllib/regression/LinearRegression.scala   |  9 ++---
 .../spark/mllib/regression/RidgeRegression.scala| 12 +++-
 .../mllib/regression/StreamingLinearAlgorithm.scala |  8 +++-
 .../StreamingLinearRegressionWithSGD.scala  | 11 +--
 8 files changed, 47 insertions(+), 29 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/08d390f4/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala
 
b/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala
index 509f6a2..7e3b4d5 100644
--- 
a/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala
@@ -38,7 +38,9 @@ import org.apache.spark.storage.StorageLevel
  */
 @Since("0.8.0")
 @DeveloperApi
-abstract class GeneralizedLinearModel(val weights: Vector, val intercept: 
Double)
+abstract class GeneralizedLinearModel @Since("1.0.0") (
+@Since("1.0.0") val weights: Vector,
+@Since("0.8.0") val intercept: Double)
   extends Serializable {
 
   /**
@@ -107,7 +109,7 @@ abstract class GeneralizedLinearAlgorithm[M <: 
GeneralizedLinearModel]
* The optimizer to solve the problem.
*
*/
-  @Since("1.0.0")
+  @Since("0.8.0")
   def optimizer: Optimizer
 
   /** Whether to add intercept (default: false). */

http://git-wip-us.apache.org/repos/asf/spark/blob/08d390f4/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala
 
b/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala
index 31ca7c2..877d31b 100644
--- 
a/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala
@@ -50,10 +50,10 @@ import org.apache.spark.sql.SQLContext
  */
 @Since("1.3.0")
 @Experimental
-class IsotonicRegressionModel (
-val boundaries: Array[Double],
-val predictions: Array[Double],
-val isotonic: Boolean) extends Serializable with Saveable {
+class IsotonicRegressionModel @Since("1.3.0") (
+@Since("1.3.0") val boundaries: Array[Double],
+@Since("1.3.0") val predictions: Array[Double],
+@Since("1.3.0") val isotonic: Boolean) extends Serializable with Saveable {
 
   private val predictionOrd = if (isotonic) Ordering[Double] else 
Ordering[Double].reverse
 
@@ -63,7 +63,6 @@ class IsotonicRegressionModel (
 
   /**
* A Java-friendly constructor that takes two Iterable parameters and one 
Boolean parameter.
-   *
*/
   @Since("1.4.0")
   def this(boundaries: java.lang.Iterable[Double],
@@ -214,8 +213,6 @@ object IsotonicRegressionModel extends 
Loader[IsotonicRegressionModel] {
 }
   }
 
-  /**
-   */
   @Since("1.4.0")
   override def load(sc: SparkContext, path: String): IsotonicRegressionModel = 
{
 implicit val formats = DefaultFormats
@@ -256,6 +253,7 @@ object IsotonicRegressionModel extends 
Loader[IsotonicRegressionModel] {
  * @see [[http://en.wikipedia.org/wiki/Isotonic_regression Isotonic regression 
(Wikipedia)]]
  */
 @Experimental
+@Since("1.3.0")
 class IsotonicRegression private (private var isotonic: Boolean) extends 
Serializable {
 
   /**
@@ -263,6 +

spark git commit: [SPARK-10243] [MLLIB] update since versions in mllib.tree

2015-08-25 Thread meng
Repository: spark
Updated Branches:
  refs/heads/branch-1.5 be0c9915c -> 6d8ebc801


[SPARK-10243] [MLLIB] update since versions in mllib.tree

Same as #8421 but for `mllib.tree`.

cc jkbradley

Author: Xiangrui Meng 

Closes #8442 from mengxr/SPARK-10236.
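
One placement specific to this module is annotating `Enumeration` members (see the `Algo` hunk below): both the exported type alias and the values get tags, since both are public API. A hedged sketch of that pattern, with a made-up object name, assuming compilation inside the Spark source tree:

```
package org.apache.spark.mllib.tree.configuration

import org.apache.spark.annotation.{Experimental, Since}

@Since("1.0.0")
@Experimental
object ExampleAlgo extends Enumeration {
  // Both the type alias and the values are part of the public surface,
  // so both carry a version tag.
  @Since("1.0.0")
  type ExampleAlgo = Value
  @Since("1.0.0")
  val Classification, Regression = Value
}
```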

(cherry picked from commit fb7e12fe2e14af8de4c206ca8096b2e8113bfddc)
Signed-off-by: Xiangrui Meng 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6d8ebc80
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6d8ebc80
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6d8ebc80

Branch: refs/heads/branch-1.5
Commit: 6d8ebc801799714d297c83be6935b37e26dc2df7
Parents: be0c991
Author: Xiangrui Meng 
Authored: Tue Aug 25 22:35:49 2015 -0700
Committer: Xiangrui Meng 
Committed: Tue Aug 25 22:35:56 2015 -0700

--
 .../apache/spark/mllib/tree/DecisionTree.scala  |  3 +-
 .../spark/mllib/tree/GradientBoostedTrees.scala |  2 +-
 .../spark/mllib/tree/configuration/Algo.scala   |  2 ++
 .../tree/configuration/BoostingStrategy.scala   | 12 
 .../mllib/tree/configuration/FeatureType.scala  |  2 ++
 .../tree/configuration/QuantileStrategy.scala   |  2 ++
 .../mllib/tree/configuration/Strategy.scala | 29 ++--
 .../mllib/tree/model/DecisionTreeModel.scala|  5 +++-
 .../apache/spark/mllib/tree/model/Node.scala| 18 ++--
 .../apache/spark/mllib/tree/model/Predict.scala |  6 ++--
 .../apache/spark/mllib/tree/model/Split.scala   |  8 +++---
 .../mllib/tree/model/treeEnsembleModels.scala   | 12 
 12 files changed, 57 insertions(+), 44 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/6d8ebc80/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala
index 9728410..4a77d4a 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala
@@ -46,7 +46,8 @@ import org.apache.spark.util.random.XORShiftRandom
  */
 @Since("1.0.0")
 @Experimental
-class DecisionTree (private val strategy: Strategy) extends Serializable with 
Logging {
+class DecisionTree @Since("1.0.0") (private val strategy: Strategy)
+  extends Serializable with Logging {
 
   strategy.assertValid()
 

http://git-wip-us.apache.org/repos/asf/spark/blob/6d8ebc80/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala
index e750408..95ed48c 100644
--- 
a/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala
@@ -51,7 +51,7 @@ import org.apache.spark.storage.StorageLevel
  */
 @Since("1.2.0")
 @Experimental
-class GradientBoostedTrees(private val boostingStrategy: BoostingStrategy)
+class GradientBoostedTrees @Since("1.2.0") (private val boostingStrategy: 
BoostingStrategy)
   extends Serializable with Logging {
 
   /**

http://git-wip-us.apache.org/repos/asf/spark/blob/6d8ebc80/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Algo.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Algo.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Algo.scala
index 8301ad1..853c731 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Algo.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Algo.scala
@@ -26,7 +26,9 @@ import org.apache.spark.annotation.{Experimental, Since}
 @Since("1.0.0")
 @Experimental
 object Algo extends Enumeration {
+  @Since("1.0.0")
   type Algo = Value
+  @Since("1.0.0")
   val Classification, Regression = Value
 
   private[mllib] def fromString(name: String): Algo = name match {

http://git-wip-us.apache.org/repos/asf/spark/blob/6d8ebc80/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala
 
b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala
index 7c56998..b5c72fb 100644
--- 
a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/mllib/tree/configurati

spark git commit: [SPARK-10243] [MLLIB] update since versions in mllib.tree

2015-08-25 Thread meng
Repository: spark
Updated Branches:
  refs/heads/master d703372f8 -> fb7e12fe2


[SPARK-10243] [MLLIB] update since versions in mllib.tree

Same as #8421 but for `mllib.tree`.

cc jkbradley

Author: Xiangrui Meng 

Closes #8442 from mengxr/SPARK-10236.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/fb7e12fe
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/fb7e12fe
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/fb7e12fe

Branch: refs/heads/master
Commit: fb7e12fe2e14af8de4c206ca8096b2e8113bfddc
Parents: d703372
Author: Xiangrui Meng 
Authored: Tue Aug 25 22:35:49 2015 -0700
Committer: Xiangrui Meng 
Committed: Tue Aug 25 22:35:49 2015 -0700

--
 .../apache/spark/mllib/tree/DecisionTree.scala  |  3 +-
 .../spark/mllib/tree/GradientBoostedTrees.scala |  2 +-
 .../spark/mllib/tree/configuration/Algo.scala   |  2 ++
 .../tree/configuration/BoostingStrategy.scala   | 12 
 .../mllib/tree/configuration/FeatureType.scala  |  2 ++
 .../tree/configuration/QuantileStrategy.scala   |  2 ++
 .../mllib/tree/configuration/Strategy.scala | 29 ++--
 .../mllib/tree/model/DecisionTreeModel.scala|  5 +++-
 .../apache/spark/mllib/tree/model/Node.scala| 18 ++--
 .../apache/spark/mllib/tree/model/Predict.scala |  6 ++--
 .../apache/spark/mllib/tree/model/Split.scala   |  8 +++---
 .../mllib/tree/model/treeEnsembleModels.scala   | 12 
 12 files changed, 57 insertions(+), 44 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/fb7e12fe/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala
index 9728410..4a77d4a 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala
@@ -46,7 +46,8 @@ import org.apache.spark.util.random.XORShiftRandom
  */
 @Since("1.0.0")
 @Experimental
-class DecisionTree (private val strategy: Strategy) extends Serializable with 
Logging {
+class DecisionTree @Since("1.0.0") (private val strategy: Strategy)
+  extends Serializable with Logging {
 
   strategy.assertValid()
 

http://git-wip-us.apache.org/repos/asf/spark/blob/fb7e12fe/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala
index e750408..95ed48c 100644
--- 
a/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala
@@ -51,7 +51,7 @@ import org.apache.spark.storage.StorageLevel
  */
 @Since("1.2.0")
 @Experimental
-class GradientBoostedTrees(private val boostingStrategy: BoostingStrategy)
+class GradientBoostedTrees @Since("1.2.0") (private val boostingStrategy: 
BoostingStrategy)
   extends Serializable with Logging {
 
   /**

http://git-wip-us.apache.org/repos/asf/spark/blob/fb7e12fe/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Algo.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Algo.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Algo.scala
index 8301ad1..853c731 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Algo.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Algo.scala
@@ -26,7 +26,9 @@ import org.apache.spark.annotation.{Experimental, Since}
 @Since("1.0.0")
 @Experimental
 object Algo extends Enumeration {
+  @Since("1.0.0")
   type Algo = Value
+  @Since("1.0.0")
   val Classification, Regression = Value
 
   private[mllib] def fromString(name: String): Algo = name match {

http://git-wip-us.apache.org/repos/asf/spark/blob/fb7e12fe/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala
 
b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala
index 7c56998..b5c72fb 100644
--- 
a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala
@@ -41,14 +41,14 @@ import org.apache.spark.mllib.tree.loss.{LogLoss, 
SquaredErro

spark git commit: [SPARK-10234] [MLLIB] update since version in mllib.clustering

2015-08-25 Thread meng
Repository: spark
Updated Branches:
  refs/heads/branch-1.5 b7766699a -> be0c9915c


[SPARK-10234] [MLLIB] update since version in mllib.clustering

Same as #8421 but for `mllib.clustering`.

cc feynmanliang yu-iskw

Author: Xiangrui Meng 

Closes #8435 from mengxr/SPARK-10234.

(cherry picked from commit d703372f86d6a59383ba8569fcd9d379849cffbf)
Signed-off-by: Xiangrui Meng 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/be0c9915
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/be0c9915
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/be0c9915

Branch: refs/heads/branch-1.5
Commit: be0c9915c0084a187933f338e51e606dc68e93af
Parents: b776669
Author: Xiangrui Meng 
Authored: Tue Aug 25 22:33:48 2015 -0700
Committer: Xiangrui Meng 
Committed: Tue Aug 25 22:33:55 2015 -0700

--
 .../mllib/clustering/GaussianMixture.scala  |  1 +
 .../mllib/clustering/GaussianMixtureModel.scala |  8 +++---
 .../apache/spark/mllib/clustering/KMeans.scala  |  1 +
 .../spark/mllib/clustering/KMeansModel.scala|  4 +--
 .../spark/mllib/clustering/LDAModel.scala   | 28 +++-
 .../clustering/PowerIterationClustering.scala   | 10 ---
 .../mllib/clustering/StreamingKMeans.scala  | 15 ++-
 7 files changed, 44 insertions(+), 23 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/be0c9915/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala
index daa947e..f82bd82 100644
--- 
a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala
@@ -53,6 +53,7 @@ import org.apache.spark.util.Utils
  * @param maxIterations The maximum number of iterations to perform
  */
 @Experimental
+@Since("1.3.0")
 class GaussianMixture private (
 private var k: Int,
 private var convergenceTol: Double,

http://git-wip-us.apache.org/repos/asf/spark/blob/be0c9915/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
 
b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
index 1a10a8b..7f6163e 100644
--- 
a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
@@ -46,9 +46,9 @@ import org.apache.spark.sql.{SQLContext, Row}
  */
 @Since("1.3.0")
 @Experimental
-class GaussianMixtureModel(
-  val weights: Array[Double],
-  val gaussians: Array[MultivariateGaussian]) extends Serializable with 
Saveable {
+class GaussianMixtureModel @Since("1.3.0") (
+  @Since("1.3.0") val weights: Array[Double],
+  @Since("1.3.0") val gaussians: Array[MultivariateGaussian]) extends 
Serializable with Saveable {
 
   require(weights.length == gaussians.length, "Length of weight and Gaussian 
arrays must match")
 
@@ -178,7 +178,7 @@ object GaussianMixtureModel extends 
Loader[GaussianMixtureModel] {
   (weight, new MultivariateGaussian(mu, sigma))
   }.unzip
 
-  return new GaussianMixtureModel(weights.toArray, gaussians.toArray)
+  new GaussianMixtureModel(weights.toArray, gaussians.toArray)
 }
   }
 

http://git-wip-us.apache.org/repos/asf/spark/blob/be0c9915/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala
index 3e9545a..46920ff 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala
@@ -37,6 +37,7 @@ import org.apache.spark.util.random.XORShiftRandom
  * This is an iterative algorithm that will make multiple passes over the 
data, so any RDDs given
  * to it should be cached by the user.
  */
+@Since("0.8.0")
 class KMeans private (
 private var k: Int,
 private var maxIterations: Int,

http://git-wip-us.apache.org/repos/asf/spark/blob/be0c9915/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMean

spark git commit: [SPARK-10234] [MLLIB] update since version in mllib.clustering

2015-08-25 Thread meng
Repository: spark
Updated Branches:
  refs/heads/master c3a54843c -> d703372f8


[SPARK-10234] [MLLIB] update since version in mllib.clustering

Same as #8421 but for `mllib.clustering`.

cc feynmanliang yu-iskw

Author: Xiangrui Meng 

Closes #8435 from mengxr/SPARK-10234.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d703372f
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d703372f
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d703372f

Branch: refs/heads/master
Commit: d703372f86d6a59383ba8569fcd9d379849cffbf
Parents: c3a5484
Author: Xiangrui Meng 
Authored: Tue Aug 25 22:33:48 2015 -0700
Committer: Xiangrui Meng 
Committed: Tue Aug 25 22:33:48 2015 -0700

--
 .../mllib/clustering/GaussianMixture.scala  |  1 +
 .../mllib/clustering/GaussianMixtureModel.scala |  8 +++---
 .../apache/spark/mllib/clustering/KMeans.scala  |  1 +
 .../spark/mllib/clustering/KMeansModel.scala|  4 +--
 .../spark/mllib/clustering/LDAModel.scala   | 28 +++-
 .../clustering/PowerIterationClustering.scala   | 10 ---
 .../mllib/clustering/StreamingKMeans.scala  | 15 ++-
 7 files changed, 44 insertions(+), 23 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/d703372f/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala
index daa947e..f82bd82 100644
--- 
a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala
@@ -53,6 +53,7 @@ import org.apache.spark.util.Utils
  * @param maxIterations The maximum number of iterations to perform
  */
 @Experimental
+@Since("1.3.0")
 class GaussianMixture private (
 private var k: Int,
 private var convergenceTol: Double,

http://git-wip-us.apache.org/repos/asf/spark/blob/d703372f/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
 
b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
index 1a10a8b..7f6163e 100644
--- 
a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
@@ -46,9 +46,9 @@ import org.apache.spark.sql.{SQLContext, Row}
  */
 @Since("1.3.0")
 @Experimental
-class GaussianMixtureModel(
-  val weights: Array[Double],
-  val gaussians: Array[MultivariateGaussian]) extends Serializable with 
Saveable {
+class GaussianMixtureModel @Since("1.3.0") (
+  @Since("1.3.0") val weights: Array[Double],
+  @Since("1.3.0") val gaussians: Array[MultivariateGaussian]) extends 
Serializable with Saveable {
 
   require(weights.length == gaussians.length, "Length of weight and Gaussian 
arrays must match")
 
@@ -178,7 +178,7 @@ object GaussianMixtureModel extends 
Loader[GaussianMixtureModel] {
   (weight, new MultivariateGaussian(mu, sigma))
   }.unzip
 
-  return new GaussianMixtureModel(weights.toArray, gaussians.toArray)
+  new GaussianMixtureModel(weights.toArray, gaussians.toArray)
 }
   }
 

http://git-wip-us.apache.org/repos/asf/spark/blob/d703372f/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala
index 3e9545a..46920ff 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala
@@ -37,6 +37,7 @@ import org.apache.spark.util.random.XORShiftRandom
  * This is an iterative algorithm that will make multiple passes over the 
data, so any RDDs given
  * to it should be cached by the user.
  */
+@Since("0.8.0")
 class KMeans private (
 private var k: Int,
 private var maxIterations: Int,

http://git-wip-us.apache.org/repos/asf/spark/blob/d703372f/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala
index e425ecd..a741584 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMea

spark git commit: [SPARK-10240] [SPARK-10242] [MLLIB] update since versions in mllib.random and mllib.stat

2015-08-25 Thread meng
Repository: spark
Updated Branches:
  refs/heads/branch-1.5 46750b912 -> b7766699a


[SPARK-10240] [SPARK-10242] [MLLIB] update since versions in mllib.random and mllib.stat

The same as #8241 but for `mllib.stat` and `mllib.random`.

cc feynmanliang

Author: Xiangrui Meng 

Closes #8439 from mengxr/SPARK-10242.
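
In `mllib.random` the tags also land on a trait and its members, not only on constructors. A small sketch of that placement, with made-up names, again assuming it sits inside the Spark source tree:

```
package org.apache.spark.mllib.random

import org.apache.spark.annotation.{DeveloperApi, Since}

@DeveloperApi
@Since("1.1.0")
trait SimpleGenerator[T] extends Serializable {
  /** Abstract members of a public trait are versioned individually. */
  @Since("1.1.0")
  def nextValue(): T
}

@DeveloperApi
@Since("1.1.0")
class ConstantGenerator @Since("1.1.0") (
    @Since("1.1.0") val value: Double) extends SimpleGenerator[Double] {

  // Concrete overrides carry tags as well, mirroring the trait.
  @Since("1.1.0")
  override def nextValue(): Double = value
}
```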

(cherry picked from commit c3a54843c0c8a14059da4e6716c1ad45c69bbe6c)
Signed-off-by: Xiangrui Meng 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b7766699
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b7766699
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b7766699

Branch: refs/heads/branch-1.5
Commit: b7766699aef65586b0c3af96fb625efaa218d2b2
Parents: 46750b9
Author: Xiangrui Meng 
Authored: Tue Aug 25 22:31:23 2015 -0700
Committer: Xiangrui Meng 
Committed: Tue Aug 25 22:31:33 2015 -0700

--
 .../mllib/random/RandomDataGenerator.scala  | 43 ++--
 .../apache/spark/mllib/random/RandomRDDs.scala  | 69 +---
 .../distribution/MultivariateGaussian.scala |  6 +-
 .../spark/mllib/stat/test/TestResult.scala  | 24 ---
 4 files changed, 117 insertions(+), 25 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/b7766699/mllib/src/main/scala/org/apache/spark/mllib/random/RandomDataGenerator.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/random/RandomDataGenerator.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/random/RandomDataGenerator.scala
index 9349eca..a2d85a6 100644
--- 
a/mllib/src/main/scala/org/apache/spark/mllib/random/RandomDataGenerator.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/mllib/random/RandomDataGenerator.scala
@@ -20,7 +20,7 @@ package org.apache.spark.mllib.random
 import org.apache.commons.math3.distribution.{ExponentialDistribution,
   GammaDistribution, LogNormalDistribution, PoissonDistribution}
 
-import org.apache.spark.annotation.DeveloperApi
+import org.apache.spark.annotation.{Since, DeveloperApi}
 import org.apache.spark.util.random.{XORShiftRandom, Pseudorandom}
 
 /**
@@ -28,17 +28,20 @@ import org.apache.spark.util.random.{XORShiftRandom, 
Pseudorandom}
  * Trait for random data generators that generate i.i.d. data.
  */
 @DeveloperApi
+@Since("1.1.0")
 trait RandomDataGenerator[T] extends Pseudorandom with Serializable {
 
   /**
* Returns an i.i.d. sample as a generic type from an underlying 
distribution.
*/
+  @Since("1.1.0")
   def nextValue(): T
 
   /**
* Returns a copy of the RandomDataGenerator with a new instance of the rng 
object used in the
* class when applicable for non-locking concurrent usage.
*/
+  @Since("1.1.0")
   def copy(): RandomDataGenerator[T]
 }
 
@@ -47,17 +50,21 @@ trait RandomDataGenerator[T] extends Pseudorandom with 
Serializable {
  * Generates i.i.d. samples from U[0.0, 1.0]
  */
 @DeveloperApi
+@Since("1.1.0")
 class UniformGenerator extends RandomDataGenerator[Double] {
 
   // XORShiftRandom for better performance. Thread safety isn't necessary here.
   private val random = new XORShiftRandom()
 
+  @Since("1.1.0")
   override def nextValue(): Double = {
 random.nextDouble()
   }
 
+  @Since("1.1.0")
   override def setSeed(seed: Long): Unit = random.setSeed(seed)
 
+  @Since("1.1.0")
   override def copy(): UniformGenerator = new UniformGenerator()
 }
 
@@ -66,17 +73,21 @@ class UniformGenerator extends RandomDataGenerator[Double] {
  * Generates i.i.d. samples from the standard normal distribution.
  */
 @DeveloperApi
+@Since("1.1.0")
 class StandardNormalGenerator extends RandomDataGenerator[Double] {
 
   // XORShiftRandom for better performance. Thread safety isn't necessary here.
   private val random = new XORShiftRandom()
 
+  @Since("1.1.0")
   override def nextValue(): Double = {
   random.nextGaussian()
   }
 
+  @Since("1.1.0")
   override def setSeed(seed: Long): Unit = random.setSeed(seed)
 
+  @Since("1.1.0")
   override def copy(): StandardNormalGenerator = new StandardNormalGenerator()
 }
 
@@ -87,16 +98,21 @@ class StandardNormalGenerator extends 
RandomDataGenerator[Double] {
  * @param mean mean for the Poisson distribution.
  */
 @DeveloperApi
-class PoissonGenerator(val mean: Double) extends RandomDataGenerator[Double] {
+@Since("1.1.0")
+class PoissonGenerator @Since("1.1.0") (
+@Since("1.1.0") val mean: Double) extends RandomDataGenerator[Double] {
 
   private val rng = new PoissonDistribution(mean)
 
+  @Since("1.1.0")
   override def nextValue(): Double = rng.sample()
 
+  @Since("1.1.0")
   override def setSeed(seed: Long) {
 rng.reseedRandomGenerator(seed)
   }
 
+  @Since("1.1.0")
   override def copy(): PoissonGenerator = new PoissonGenerator(mean)
 }
 
@@ -107,16 +123,21 @@ class PoissonGenerat

spark git commit: [SPARK-10240] [SPARK-10242] [MLLIB] update since versions in mllib.random and mllib.stat

2015-08-25 Thread meng
Repository: spark
Updated Branches:
  refs/heads/master ab431f8a9 -> c3a54843c


[SPARK-10240] [SPARK-10242] [MLLIB] update since versions in mllib.random and mllib.stat

The same as #8241 but for `mllib.stat` and `mllib.random`.

cc feynmanliang

Author: Xiangrui Meng 

Closes #8439 from mengxr/SPARK-10242.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c3a54843
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c3a54843
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c3a54843

Branch: refs/heads/master
Commit: c3a54843c0c8a14059da4e6716c1ad45c69bbe6c
Parents: ab431f8
Author: Xiangrui Meng 
Authored: Tue Aug 25 22:31:23 2015 -0700
Committer: Xiangrui Meng 
Committed: Tue Aug 25 22:31:23 2015 -0700

--
 .../mllib/random/RandomDataGenerator.scala  | 43 ++--
 .../apache/spark/mllib/random/RandomRDDs.scala  | 69 +---
 .../distribution/MultivariateGaussian.scala |  6 +-
 .../spark/mllib/stat/test/TestResult.scala  | 24 ---
 4 files changed, 117 insertions(+), 25 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/c3a54843/mllib/src/main/scala/org/apache/spark/mllib/random/RandomDataGenerator.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/random/RandomDataGenerator.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/random/RandomDataGenerator.scala
index 9349eca..a2d85a6 100644
--- 
a/mllib/src/main/scala/org/apache/spark/mllib/random/RandomDataGenerator.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/mllib/random/RandomDataGenerator.scala
@@ -20,7 +20,7 @@ package org.apache.spark.mllib.random
 import org.apache.commons.math3.distribution.{ExponentialDistribution,
   GammaDistribution, LogNormalDistribution, PoissonDistribution}
 
-import org.apache.spark.annotation.DeveloperApi
+import org.apache.spark.annotation.{Since, DeveloperApi}
 import org.apache.spark.util.random.{XORShiftRandom, Pseudorandom}
 
 /**
@@ -28,17 +28,20 @@ import org.apache.spark.util.random.{XORShiftRandom, 
Pseudorandom}
  * Trait for random data generators that generate i.i.d. data.
  */
 @DeveloperApi
+@Since("1.1.0")
 trait RandomDataGenerator[T] extends Pseudorandom with Serializable {
 
   /**
* Returns an i.i.d. sample as a generic type from an underlying 
distribution.
*/
+  @Since("1.1.0")
   def nextValue(): T
 
   /**
* Returns a copy of the RandomDataGenerator with a new instance of the rng 
object used in the
* class when applicable for non-locking concurrent usage.
*/
+  @Since("1.1.0")
   def copy(): RandomDataGenerator[T]
 }
 
@@ -47,17 +50,21 @@ trait RandomDataGenerator[T] extends Pseudorandom with 
Serializable {
  * Generates i.i.d. samples from U[0.0, 1.0]
  */
 @DeveloperApi
+@Since("1.1.0")
 class UniformGenerator extends RandomDataGenerator[Double] {
 
   // XORShiftRandom for better performance. Thread safety isn't necessary here.
   private val random = new XORShiftRandom()
 
+  @Since("1.1.0")
   override def nextValue(): Double = {
 random.nextDouble()
   }
 
+  @Since("1.1.0")
   override def setSeed(seed: Long): Unit = random.setSeed(seed)
 
+  @Since("1.1.0")
   override def copy(): UniformGenerator = new UniformGenerator()
 }
 
@@ -66,17 +73,21 @@ class UniformGenerator extends RandomDataGenerator[Double] {
  * Generates i.i.d. samples from the standard normal distribution.
  */
 @DeveloperApi
+@Since("1.1.0")
 class StandardNormalGenerator extends RandomDataGenerator[Double] {
 
   // XORShiftRandom for better performance. Thread safety isn't necessary here.
   private val random = new XORShiftRandom()
 
+  @Since("1.1.0")
   override def nextValue(): Double = {
   random.nextGaussian()
   }
 
+  @Since("1.1.0")
   override def setSeed(seed: Long): Unit = random.setSeed(seed)
 
+  @Since("1.1.0")
   override def copy(): StandardNormalGenerator = new StandardNormalGenerator()
 }
 
@@ -87,16 +98,21 @@ class StandardNormalGenerator extends 
RandomDataGenerator[Double] {
  * @param mean mean for the Poisson distribution.
  */
 @DeveloperApi
-class PoissonGenerator(val mean: Double) extends RandomDataGenerator[Double] {
+@Since("1.1.0")
+class PoissonGenerator @Since("1.1.0") (
+@Since("1.1.0") val mean: Double) extends RandomDataGenerator[Double] {
 
   private val rng = new PoissonDistribution(mean)
 
+  @Since("1.1.0")
   override def nextValue(): Double = rng.sample()
 
+  @Since("1.1.0")
   override def setSeed(seed: Long) {
 rng.reseedRandomGenerator(seed)
   }
 
+  @Since("1.1.0")
   override def copy(): PoissonGenerator = new PoissonGenerator(mean)
 }
 
@@ -107,16 +123,21 @@ class PoissonGenerator(val mean: Double) extends 
RandomDataGenerator[Double] {
  * @param mean mean for the exponential distrib

spark git commit: [SPARK-10238] [MLLIB] update since versions in mllib.linalg

2015-08-25 Thread dbtsai
Repository: spark
Updated Branches:
  refs/heads/branch-1.5 af98e51f2 -> 46750b912


[SPARK-10238] [MLLIB] update since versions in mllib.linalg

Same as #8421 but for `mllib.linalg`.

cc dbtsai

Author: Xiangrui Meng 

Closes #8440 from mengxr/SPARK-10238 and squashes the following commits:

b38437e [Xiangrui Meng] update since versions in mllib.linalg

(cherry picked from commit ab431f8a970b85fba34ccb506c0f8815e55c63bf)
Signed-off-by: DB Tsai 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/46750b91
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/46750b91
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/46750b91

Branch: refs/heads/branch-1.5
Commit: 46750b912781433b6ce0845ac22805cde975361e
Parents: af98e51
Author: Xiangrui Meng 
Authored: Tue Aug 25 20:07:56 2015 -0700
Committer: DB Tsai 
Committed: Tue Aug 25 20:08:09 2015 -0700

--
 .../apache/spark/mllib/linalg/Matrices.scala| 44 +---
 .../linalg/SingularValueDecomposition.scala |  1 +
 .../org/apache/spark/mllib/linalg/Vectors.scala | 25 ---
 .../mllib/linalg/distributed/BlockMatrix.scala  | 10 +++--
 .../linalg/distributed/CoordinateMatrix.scala   |  4 +-
 .../linalg/distributed/DistributedMatrix.scala  |  2 +
 .../linalg/distributed/IndexedRowMatrix.scala   |  4 +-
 .../mllib/linalg/distributed/RowMatrix.scala|  5 ++-
 8 files changed, 64 insertions(+), 31 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/46750b91/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala
index 28b5b46..c02ba42 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala
@@ -32,18 +32,23 @@ import org.apache.spark.sql.types._
  * Trait for a local matrix.
  */
 @SQLUserDefinedType(udt = classOf[MatrixUDT])
+@Since("1.0.0")
 sealed trait Matrix extends Serializable {
 
   /** Number of rows. */
+  @Since("1.0.0")
   def numRows: Int
 
   /** Number of columns. */
+  @Since("1.0.0")
   def numCols: Int
 
   /** Flag that keeps track whether the matrix is transposed or not. False by 
default. */
+  @Since("1.3.0")
   val isTransposed: Boolean = false
 
   /** Converts to a dense array in column major. */
+  @Since("1.0.0")
   def toArray: Array[Double] = {
 val newArray = new Array[Double](numRows * numCols)
 foreachActive { (i, j, v) =>
@@ -56,6 +61,7 @@ sealed trait Matrix extends Serializable {
   private[mllib] def toBreeze: BM[Double]
 
   /** Gets the (i, j)-th element. */
+  @Since("1.3.0")
   def apply(i: Int, j: Int): Double
 
   /** Return the index for the (i, j)-th element in the backing array. */
@@ -65,12 +71,15 @@ sealed trait Matrix extends Serializable {
   private[mllib] def update(i: Int, j: Int, v: Double): Unit
 
   /** Get a deep copy of the matrix. */
+  @Since("1.2.0")
   def copy: Matrix
 
   /** Transpose the Matrix. Returns a new `Matrix` instance sharing the same 
underlying data. */
+  @Since("1.3.0")
   def transpose: Matrix
 
   /** Convenience method for `Matrix`-`DenseMatrix` multiplication. */
+  @Since("1.2.0")
   def multiply(y: DenseMatrix): DenseMatrix = {
 val C: DenseMatrix = DenseMatrix.zeros(numRows, y.numCols)
 BLAS.gemm(1.0, this, y, 0.0, C)
@@ -78,11 +87,13 @@ sealed trait Matrix extends Serializable {
   }
 
   /** Convenience method for `Matrix`-`DenseVector` multiplication. For binary 
compatibility. */
+  @Since("1.2.0")
   def multiply(y: DenseVector): DenseVector = {
 multiply(y.asInstanceOf[Vector])
   }
 
   /** Convenience method for `Matrix`-`Vector` multiplication. */
+  @Since("1.4.0")
   def multiply(y: Vector): DenseVector = {
 val output = new DenseVector(new Array[Double](numRows))
 BLAS.gemv(1.0, this, y, 0.0, output)
@@ -93,6 +104,7 @@ sealed trait Matrix extends Serializable {
   override def toString: String = toBreeze.toString()
 
   /** A human readable representation of the matrix with maximum lines and 
width */
+  @Since("1.4.0")
   def toString(maxLines: Int, maxLineWidth: Int): String = 
toBreeze.toString(maxLines, maxLineWidth)
 
   /** Map the values of this matrix using a function. Generates a new matrix. 
Performs the
@@ -118,11 +130,13 @@ sealed trait Matrix extends Serializable {
   /**
* Find the number of non-zero active values.
*/
+  @Since("1.5.0")
   def numNonzeros: Int
 
   /**
* Find the number of values stored explicitly. These values can be zero as 
well.
*/
+  @Since("1.5.0")
   def numActives: Int
 }
 
@@ -230,11 +244,11 @@ private[spark] class MatrixUD

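For context on the Matrix members being annotated above, a small usage sketch of the public API; the concrete values are illustrative.

```
import org.apache.spark.mllib.linalg.{DenseVector, Matrices}

// A 2x3 matrix given in column-major order, as Matrices.dense expects.
val m = Matrices.dense(2, 3, Array(1.0, 2.0, 3.0, 4.0, 5.0, 6.0))
val v = new DenseVector(Array(1.0, 0.0, 1.0))

val rows = m.numRows           // 2
val cols = m.numCols           // 3
val mv = m.multiply(v)         // Matrix-DenseVector multiply, returns a DenseVector of length 2
val mt = m.transpose           // new Matrix sharing the same underlying data
```
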
spark git commit: [SPARK-10238] [MLLIB] update since versions in mllib.linalg

2015-08-25 Thread dbtsai
Repository: spark
Updated Branches:
  refs/heads/master 8668ead2e -> ab431f8a9


[SPARK-10238] [MLLIB] update since versions in mllib.linalg

Same as #8421 but for `mllib.linalg`.

cc dbtsai

Author: Xiangrui Meng 

Closes #8440 from mengxr/SPARK-10238 and squashes the following commits:

b38437e [Xiangrui Meng] update since versions in mllib.linalg


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ab431f8a
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ab431f8a
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ab431f8a

Branch: refs/heads/master
Commit: ab431f8a970b85fba34ccb506c0f8815e55c63bf
Parents: 8668ead
Author: Xiangrui Meng 
Authored: Tue Aug 25 20:07:56 2015 -0700
Committer: DB Tsai 
Committed: Tue Aug 25 20:07:56 2015 -0700

--
 .../apache/spark/mllib/linalg/Matrices.scala| 44 +---
 .../linalg/SingularValueDecomposition.scala |  1 +
 .../org/apache/spark/mllib/linalg/Vectors.scala | 25 ---
 .../mllib/linalg/distributed/BlockMatrix.scala  | 10 +++--
 .../linalg/distributed/CoordinateMatrix.scala   |  4 +-
 .../linalg/distributed/DistributedMatrix.scala  |  2 +
 .../linalg/distributed/IndexedRowMatrix.scala   |  4 +-
 .../mllib/linalg/distributed/RowMatrix.scala|  5 ++-
 8 files changed, 64 insertions(+), 31 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/ab431f8a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala
index 28b5b46..c02ba42 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala
@@ -32,18 +32,23 @@ import org.apache.spark.sql.types._
  * Trait for a local matrix.
  */
 @SQLUserDefinedType(udt = classOf[MatrixUDT])
+@Since("1.0.0")
 sealed trait Matrix extends Serializable {
 
   /** Number of rows. */
+  @Since("1.0.0")
   def numRows: Int
 
   /** Number of columns. */
+  @Since("1.0.0")
   def numCols: Int
 
   /** Flag that keeps track whether the matrix is transposed or not. False by 
default. */
+  @Since("1.3.0")
   val isTransposed: Boolean = false
 
   /** Converts to a dense array in column major. */
+  @Since("1.0.0")
   def toArray: Array[Double] = {
 val newArray = new Array[Double](numRows * numCols)
 foreachActive { (i, j, v) =>
@@ -56,6 +61,7 @@ sealed trait Matrix extends Serializable {
   private[mllib] def toBreeze: BM[Double]
 
   /** Gets the (i, j)-th element. */
+  @Since("1.3.0")
   def apply(i: Int, j: Int): Double
 
   /** Return the index for the (i, j)-th element in the backing array. */
@@ -65,12 +71,15 @@ sealed trait Matrix extends Serializable {
   private[mllib] def update(i: Int, j: Int, v: Double): Unit
 
   /** Get a deep copy of the matrix. */
+  @Since("1.2.0")
   def copy: Matrix
 
   /** Transpose the Matrix. Returns a new `Matrix` instance sharing the same 
underlying data. */
+  @Since("1.3.0")
   def transpose: Matrix
 
   /** Convenience method for `Matrix`-`DenseMatrix` multiplication. */
+  @Since("1.2.0")
   def multiply(y: DenseMatrix): DenseMatrix = {
 val C: DenseMatrix = DenseMatrix.zeros(numRows, y.numCols)
 BLAS.gemm(1.0, this, y, 0.0, C)
@@ -78,11 +87,13 @@ sealed trait Matrix extends Serializable {
   }
 
   /** Convenience method for `Matrix`-`DenseVector` multiplication. For binary 
compatibility. */
+  @Since("1.2.0")
   def multiply(y: DenseVector): DenseVector = {
 multiply(y.asInstanceOf[Vector])
   }
 
   /** Convenience method for `Matrix`-`Vector` multiplication. */
+  @Since("1.4.0")
   def multiply(y: Vector): DenseVector = {
 val output = new DenseVector(new Array[Double](numRows))
 BLAS.gemv(1.0, this, y, 0.0, output)
@@ -93,6 +104,7 @@ sealed trait Matrix extends Serializable {
   override def toString: String = toBreeze.toString()
 
   /** A human readable representation of the matrix with maximum lines and 
width */
+  @Since("1.4.0")
   def toString(maxLines: Int, maxLineWidth: Int): String = 
toBreeze.toString(maxLines, maxLineWidth)
 
   /** Map the values of this matrix using a function. Generates a new matrix. 
Performs the
@@ -118,11 +130,13 @@ sealed trait Matrix extends Serializable {
   /**
* Find the number of non-zero active values.
*/
+  @Since("1.5.0")
   def numNonzeros: Int
 
   /**
* Find the number of values stored explicitly. These values can be zero as 
well.
*/
+  @Since("1.5.0")
   def numActives: Int
 }
 
@@ -230,11 +244,11 @@ private[spark] class MatrixUDT extends 
UserDefinedType[Matrix] {
  */
 @Since("1.0.0")
 @SQLUserDefinedType(udt = classOf[MatrixUD

spark git commit: [SPARK-10233] [MLLIB] update since version in mllib.evaluation

2015-08-25 Thread meng
Repository: spark
Updated Branches:
  refs/heads/branch-1.5 5cf266fde -> af98e51f2


[SPARK-10233] [MLLIB] update since version in mllib.evaluation

Same as #8421 but for `mllib.evaluation`.

cc avulanov

Author: Xiangrui Meng 

Closes #8423 from mengxr/SPARK-10233.

(cherry picked from commit 8668ead2e7097b9591069599fbfccf67c53db659)
Signed-off-by: Xiangrui Meng 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/af98e51f
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/af98e51f
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/af98e51f

Branch: refs/heads/branch-1.5
Commit: af98e51f273d95e0fc19da1eca32a5f87a8c5576
Parents: 5cf266f
Author: Xiangrui Meng 
Authored: Tue Aug 25 18:17:54 2015 -0700
Committer: Xiangrui Meng 
Committed: Tue Aug 25 18:18:27 2015 -0700

--
 .../mllib/evaluation/BinaryClassificationMetrics.scala  |  8 
 .../spark/mllib/evaluation/MulticlassMetrics.scala  | 11 ++-
 .../spark/mllib/evaluation/MultilabelMetrics.scala  | 12 +++-
 .../spark/mllib/evaluation/RegressionMetrics.scala  |  3 ++-
 4 files changed, 27 insertions(+), 7 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/af98e51f/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala
 
b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala
index 76ae847..508fe53 100644
--- 
a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala
@@ -42,11 +42,11 @@ import org.apache.spark.sql.DataFrame
  *be smaller as a result, meaning there may be an extra sample 
at
  *partition boundaries.
  */
-@Since("1.3.0")
+@Since("1.0.0")
 @Experimental
-class BinaryClassificationMetrics(
-val scoreAndLabels: RDD[(Double, Double)],
-val numBins: Int) extends Logging {
+class BinaryClassificationMetrics @Since("1.3.0") (
+@Since("1.3.0") val scoreAndLabels: RDD[(Double, Double)],
+@Since("1.3.0") val numBins: Int) extends Logging {
 
   require(numBins >= 0, "numBins must be nonnegative")
 

http://git-wip-us.apache.org/repos/asf/spark/blob/af98e51f/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala
 
b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala
index 02e89d9..00e8376 100644
--- 
a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala
@@ -33,7 +33,7 @@ import org.apache.spark.sql.DataFrame
  */
 @Since("1.1.0")
 @Experimental
-class MulticlassMetrics(predictionAndLabels: RDD[(Double, Double)]) {
+class MulticlassMetrics @Since("1.1.0") (predictionAndLabels: RDD[(Double, 
Double)]) {
 
   /**
* An auxiliary constructor taking a DataFrame.
@@ -140,6 +140,7 @@ class MulticlassMetrics(predictionAndLabels: RDD[(Double, 
Double)]) {
   /**
* Returns precision
*/
+  @Since("1.1.0")
   lazy val precision: Double = tpByClass.values.sum.toDouble / labelCount
 
   /**
@@ -148,23 +149,27 @@ class MulticlassMetrics(predictionAndLabels: RDD[(Double, 
Double)]) {
* because sum of all false positives is equal to sum
* of all false negatives)
*/
+  @Since("1.1.0")
   lazy val recall: Double = precision
 
   /**
* Returns f-measure
* (equals to precision and recall because precision equals recall)
*/
+  @Since("1.1.0")
   lazy val fMeasure: Double = precision
 
   /**
* Returns weighted true positive rate
* (equals to precision, recall and f-measure)
*/
+  @Since("1.1.0")
   lazy val weightedTruePositiveRate: Double = weightedRecall
 
   /**
* Returns weighted false positive rate
*/
+  @Since("1.1.0")
   lazy val weightedFalsePositiveRate: Double = labelCountByClass.map { case 
(category, count) =>
 falsePositiveRate(category) * count.toDouble / labelCount
   }.sum
@@ -173,6 +178,7 @@ class MulticlassMetrics(predictionAndLabels: RDD[(Double, 
Double)]) {
* Returns weighted averaged recall
* (equals to precision, recall and f-measure)
*/
+  @Since("1.1.0")
   lazy val weightedRecall: Double = labelCountByClass.map { case (category, 
count) =>
 recall(category) * count.toDouble / labelCount
   }.sum
@@ -180,6 +186,7 @@ class MulticlassMetrics(predictionAndLabels: RDD[(Double, 
Double)

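A brief sketch of how MulticlassMetrics and the lazy values annotated above are typically consumed; the (prediction, label) pairs are made up for illustration.

```
import org.apache.spark.SparkContext
import org.apache.spark.mllib.evaluation.MulticlassMetrics

// Hypothetical (prediction, label) pairs; in practice these come from a trained classifier.
def multiclassSummary(sc: SparkContext): Unit = {
  val predictionAndLabels = sc.parallelize(Seq(
    (0.0, 0.0), (1.0, 1.0), (2.0, 1.0), (0.0, 0.0), (1.0, 2.0)))
  val metrics = new MulticlassMetrics(predictionAndLabels)
  println(s"precision = ${metrics.precision}")          // equals recall and fMeasure, as documented
  println(s"weighted recall = ${metrics.weightedRecall}")
  println(s"weighted FPR = ${metrics.weightedFalsePositiveRate}")
}
```
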
spark git commit: [SPARK-10233] [MLLIB] update since version in mllib.evaluation

2015-08-25 Thread meng
Repository: spark
Updated Branches:
  refs/heads/master 125205cdb -> 8668ead2e


[SPARK-10233] [MLLIB] update since version in mllib.evaluation

Same as #8421 but for `mllib.evaluation`.

cc avulanov

Author: Xiangrui Meng 

Closes #8423 from mengxr/SPARK-10233.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8668ead2
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8668ead2
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8668ead2

Branch: refs/heads/master
Commit: 8668ead2e7097b9591069599fbfccf67c53db659
Parents: 125205c
Author: Xiangrui Meng 
Authored: Tue Aug 25 18:17:54 2015 -0700
Committer: Xiangrui Meng 
Committed: Tue Aug 25 18:17:54 2015 -0700

--
 .../mllib/evaluation/BinaryClassificationMetrics.scala  |  8 
 .../spark/mllib/evaluation/MulticlassMetrics.scala  | 11 ++-
 .../spark/mllib/evaluation/MultilabelMetrics.scala  | 12 +++-
 .../spark/mllib/evaluation/RegressionMetrics.scala  |  3 ++-
 4 files changed, 27 insertions(+), 7 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/8668ead2/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala
 
b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala
index 76ae847..508fe53 100644
--- 
a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala
@@ -42,11 +42,11 @@ import org.apache.spark.sql.DataFrame
  *be smaller as a result, meaning there may be an extra sample 
at
  *partition boundaries.
  */
-@Since("1.3.0")
+@Since("1.0.0")
 @Experimental
-class BinaryClassificationMetrics(
-val scoreAndLabels: RDD[(Double, Double)],
-val numBins: Int) extends Logging {
+class BinaryClassificationMetrics @Since("1.3.0") (
+@Since("1.3.0") val scoreAndLabels: RDD[(Double, Double)],
+@Since("1.3.0") val numBins: Int) extends Logging {
 
   require(numBins >= 0, "numBins must be nonnegative")
 

http://git-wip-us.apache.org/repos/asf/spark/blob/8668ead2/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala
 
b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala
index 02e89d9..00e8376 100644
--- 
a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala
@@ -33,7 +33,7 @@ import org.apache.spark.sql.DataFrame
  */
 @Since("1.1.0")
 @Experimental
-class MulticlassMetrics(predictionAndLabels: RDD[(Double, Double)]) {
+class MulticlassMetrics @Since("1.1.0") (predictionAndLabels: RDD[(Double, 
Double)]) {
 
   /**
* An auxiliary constructor taking a DataFrame.
@@ -140,6 +140,7 @@ class MulticlassMetrics(predictionAndLabels: RDD[(Double, 
Double)]) {
   /**
* Returns precision
*/
+  @Since("1.1.0")
   lazy val precision: Double = tpByClass.values.sum.toDouble / labelCount
 
   /**
@@ -148,23 +149,27 @@ class MulticlassMetrics(predictionAndLabels: RDD[(Double, 
Double)]) {
* because sum of all false positives is equal to sum
* of all false negatives)
*/
+  @Since("1.1.0")
   lazy val recall: Double = precision
 
   /**
* Returns f-measure
* (equals to precision and recall because precision equals recall)
*/
+  @Since("1.1.0")
   lazy val fMeasure: Double = precision
 
   /**
* Returns weighted true positive rate
* (equals to precision, recall and f-measure)
*/
+  @Since("1.1.0")
   lazy val weightedTruePositiveRate: Double = weightedRecall
 
   /**
* Returns weighted false positive rate
*/
+  @Since("1.1.0")
   lazy val weightedFalsePositiveRate: Double = labelCountByClass.map { case 
(category, count) =>
 falsePositiveRate(category) * count.toDouble / labelCount
   }.sum
@@ -173,6 +178,7 @@ class MulticlassMetrics(predictionAndLabels: RDD[(Double, 
Double)]) {
* Returns weighted averaged recall
* (equals to precision, recall and f-measure)
*/
+  @Since("1.1.0")
   lazy val weightedRecall: Double = labelCountByClass.map { case (category, 
count) =>
 recall(category) * count.toDouble / labelCount
   }.sum
@@ -180,6 +186,7 @@ class MulticlassMetrics(predictionAndLabels: RDD[(Double, 
Double)]) {
   /**
* Returns weighted averaged precision
*/
+  @Since("1.1.0")
   lazy val weightedPrecisio

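Similarly, a minimal sketch for BinaryClassificationMetrics with the (scoreAndLabels, numBins) constructor shown above; the scores and the bin count are illustrative.

```
import org.apache.spark.SparkContext
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics

// Hypothetical (score, label) pairs; numBins > 0 down-samples the ROC/PR curves, as the
// class documentation above notes.
def binarySummary(sc: SparkContext): Unit = {
  val scoreAndLabels = sc.parallelize(Seq((0.9, 1.0), (0.2, 0.0), (0.7, 1.0), (0.4, 0.0)))
  val metrics = new BinaryClassificationMetrics(scoreAndLabels, numBins = 100)
  println(s"area under ROC = ${metrics.areaUnderROC()}")
  println(s"area under PR  = ${metrics.areaUnderPR()}")
}
```
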
Git Push Summary

2015-08-25 Thread rxin
Repository: spark
Updated Tags:  refs/tags/v1.4.1-rc4 [deleted] dbaa5c294




Git Push Summary

2015-08-25 Thread rxin
Repository: spark
Updated Tags:  refs/tags/v1.4.1-rc2 [deleted] 07b95c7ad




Git Push Summary

2015-08-25 Thread rxin
Repository: spark
Updated Tags:  refs/tags/v1.4.1-rc3 [deleted] 3e8ae3894




Git Push Summary

2015-08-25 Thread rxin
Repository: spark
Updated Tags:  refs/tags/v1.4.1-rc1 [deleted] 60e08e507




spark git commit: [SPARK-9888] [MLLIB] User guide for new LDA features

2015-08-25 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/branch-1.5 4c03cb4da -> 5cf266fde


[SPARK-9888] [MLLIB] User guide for new LDA features

 * Adds two new sections to LDA's user guide; one for each optimizer/model
 * Documents new features added to LDA (e.g. topXXXperXXX, asymmetric priors, hyperparameter optimization)
 * Cleans up a TODO and sets a default parameter in LDA code

jkbradley hhbyyh

Author: Feynman Liang 

Closes #8254 from feynmanliang/SPARK-9888.

(cherry picked from commit 125205cdb35530cdb4a8fff3e1ee49cf4a299583)
Signed-off-by: Joseph K. Bradley 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5cf266fd
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5cf266fd
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5cf266fd

Branch: refs/heads/branch-1.5
Commit: 5cf266fdeb6632622642e5d9bc056a76680b1970
Parents: 4c03cb4
Author: Feynman Liang 
Authored: Tue Aug 25 17:39:20 2015 -0700
Committer: Joseph K. Bradley 
Committed: Tue Aug 25 17:39:40 2015 -0700

--
 docs/mllib-clustering.md| 135 ---
 .../spark/mllib/clustering/LDAModel.scala   |   1 -
 .../spark/mllib/clustering/LDASuite.scala   |   1 +
 3 files changed, 117 insertions(+), 20 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/5cf266fd/docs/mllib-clustering.md
--
diff --git a/docs/mllib-clustering.md b/docs/mllib-clustering.md
index fd9ab25..3fb35d3 100644
--- a/docs/mllib-clustering.md
+++ b/docs/mllib-clustering.md
@@ -438,28 +438,125 @@ sameModel = PowerIterationClusteringModel.load(sc, 
"myModelPath")
 is a topic model which infers topics from a collection of text documents.
 LDA can be thought of as a clustering algorithm as follows:
 
-* Topics correspond to cluster centers, and documents correspond to examples 
(rows) in a dataset.
-* Topics and documents both exist in a feature space, where feature vectors 
are vectors of word counts.
-* Rather than estimating a clustering using a traditional distance, LDA uses a 
function based
- on a statistical model of how text documents are generated.
-
-LDA takes in a collection of documents as vectors of word counts.
-It supports different inference algorithms via `setOptimizer` function. 
EMLDAOptimizer learns clustering using 
[expectation-maximization](http://en.wikipedia.org/wiki/Expectation%E2%80%93maximization_algorithm)
-on the likelihood function and yields comprehensive results, while 
OnlineLDAOptimizer uses iterative mini-batch sampling for [online variational 
inference](https://www.cs.princeton.edu/~blei/papers/HoffmanBleiBach2010b.pdf) 
and is generally memory friendly. After fitting on the documents, LDA provides:
-
-* Topics: Inferred topics, each of which is a probability distribution over 
terms (words).
-* Topic distributions for documents: For each non empty document in the 
training set, LDA gives a probability distribution over topics. (EM only). Note 
that for empty documents, we don't create the topic distributions. (EM only)
+* Topics correspond to cluster centers, and documents correspond to
+examples (rows) in a dataset.
+* Topics and documents both exist in a feature space, where feature
+vectors are vectors of word counts (bag of words).
+* Rather than estimating a clustering using a traditional distance, LDA
+uses a function based on a statistical model of how text documents are
+generated.
+
+LDA supports different inference algorithms via `setOptimizer` function.
+`EMLDAOptimizer` learns clustering using
+[expectation-maximization](http://en.wikipedia.org/wiki/Expectation%E2%80%93maximization_algorithm)
+on the likelihood function and yields comprehensive results, while
+`OnlineLDAOptimizer` uses iterative mini-batch sampling for [online
+variational
+inference](https://www.cs.princeton.edu/~blei/papers/HoffmanBleiBach2010b.pdf)
+and is generally memory friendly.
 
-LDA takes the following parameters:
+LDA takes in a collection of documents as vectors of word counts and the
+following parameters (set using the builder pattern):
 
 * `k`: Number of topics (i.e., cluster centers)
-* `maxIterations`: Limit on the number of iterations of EM used for learning
-* `docConcentration`: Hyperparameter for prior over documents' distributions 
over topics. Currently must be > 1, where larger values encourage smoother 
inferred distributions.
-* `topicConcentration`: Hyperparameter for prior over topics' distributions 
over terms (words). Currently must be > 1, where larger values encourage 
smoother inferred distributions.
-* `checkpointInterval`: If using checkpointing (set in the Spark 
configuration), this parameter specifies the frequency with which checkpoints 
will be created.  If `maxIterations` is large, using checkpoi

spark git commit: [SPARK-9888] [MLLIB] User guide for new LDA features

2015-08-25 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/master 7467b52ed -> 125205cdb


[SPARK-9888] [MLLIB] User guide for new LDA features

 * Adds two new sections to LDA's user guide; one for each optimizer/model
 * Documents new features added to LDA (e.g. topXXXperXXX, asymmetric priors, hyperparameter optimization)
 * Cleans up a TODO and sets a default parameter in LDA code

jkbradley hhbyyh

Author: Feynman Liang 

Closes #8254 from feynmanliang/SPARK-9888.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/125205cd
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/125205cd
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/125205cd

Branch: refs/heads/master
Commit: 125205cdb35530cdb4a8fff3e1ee49cf4a299583
Parents: 7467b52
Author: Feynman Liang 
Authored: Tue Aug 25 17:39:20 2015 -0700
Committer: Joseph K. Bradley 
Committed: Tue Aug 25 17:39:20 2015 -0700

--
 docs/mllib-clustering.md| 135 ---
 .../spark/mllib/clustering/LDAModel.scala   |   1 -
 .../spark/mllib/clustering/LDASuite.scala   |   1 +
 3 files changed, 117 insertions(+), 20 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/125205cd/docs/mllib-clustering.md
--
diff --git a/docs/mllib-clustering.md b/docs/mllib-clustering.md
index fd9ab25..3fb35d3 100644
--- a/docs/mllib-clustering.md
+++ b/docs/mllib-clustering.md
@@ -438,28 +438,125 @@ sameModel = PowerIterationClusteringModel.load(sc, 
"myModelPath")
 is a topic model which infers topics from a collection of text documents.
 LDA can be thought of as a clustering algorithm as follows:
 
-* Topics correspond to cluster centers, and documents correspond to examples 
(rows) in a dataset.
-* Topics and documents both exist in a feature space, where feature vectors 
are vectors of word counts.
-* Rather than estimating a clustering using a traditional distance, LDA uses a 
function based
- on a statistical model of how text documents are generated.
-
-LDA takes in a collection of documents as vectors of word counts.
-It supports different inference algorithms via `setOptimizer` function. 
EMLDAOptimizer learns clustering using 
[expectation-maximization](http://en.wikipedia.org/wiki/Expectation%E2%80%93maximization_algorithm)
-on the likelihood function and yields comprehensive results, while 
OnlineLDAOptimizer uses iterative mini-batch sampling for [online variational 
inference](https://www.cs.princeton.edu/~blei/papers/HoffmanBleiBach2010b.pdf) 
and is generally memory friendly. After fitting on the documents, LDA provides:
-
-* Topics: Inferred topics, each of which is a probability distribution over 
terms (words).
-* Topic distributions for documents: For each non empty document in the 
training set, LDA gives a probability distribution over topics. (EM only). Note 
that for empty documents, we don't create the topic distributions. (EM only)
+* Topics correspond to cluster centers, and documents correspond to
+examples (rows) in a dataset.
+* Topics and documents both exist in a feature space, where feature
+vectors are vectors of word counts (bag of words).
+* Rather than estimating a clustering using a traditional distance, LDA
+uses a function based on a statistical model of how text documents are
+generated.
+
+LDA supports different inference algorithms via `setOptimizer` function.
+`EMLDAOptimizer` learns clustering using
+[expectation-maximization](http://en.wikipedia.org/wiki/Expectation%E2%80%93maximization_algorithm)
+on the likelihood function and yields comprehensive results, while
+`OnlineLDAOptimizer` uses iterative mini-batch sampling for [online
+variational
+inference](https://www.cs.princeton.edu/~blei/papers/HoffmanBleiBach2010b.pdf)
+and is generally memory friendly.
 
-LDA takes the following parameters:
+LDA takes in a collection of documents as vectors of word counts and the
+following parameters (set using the builder pattern):
 
 * `k`: Number of topics (i.e., cluster centers)
-* `maxIterations`: Limit on the number of iterations of EM used for learning
-* `docConcentration`: Hyperparameter for prior over documents' distributions 
over topics. Currently must be > 1, where larger values encourage smoother 
inferred distributions.
-* `topicConcentration`: Hyperparameter for prior over topics' distributions 
over terms (words). Currently must be > 1, where larger values encourage 
smoother inferred distributions.
-* `checkpointInterval`: If using checkpointing (set in the Spark 
configuration), this parameter specifies the frequency with which checkpoints 
will be created.  If `maxIterations` is large, using checkpointing can help 
reduce shuffle file sizes on disk and help with failure recovery.
-
-*Note*: LDA is a new featur

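To complement the updated guide text, a minimal Scala sketch of the builder-pattern parameters it documents; the tiny corpus, vocabulary size, and parameter values are illustrative.

```
import org.apache.spark.SparkContext
import org.apache.spark.mllib.clustering.{LDA, OnlineLDAOptimizer}
import org.apache.spark.mllib.linalg.Vectors

// A tiny bag-of-words corpus of (docId, termCounts); a real corpus comes from tokenized text.
def runLda(sc: SparkContext): Unit = {
  val corpus = sc.parallelize(Seq(
    (0L, Vectors.dense(1.0, 0.0, 3.0)),
    (1L, Vectors.dense(0.0, 2.0, 1.0))))

  val lda = new LDA()
    .setK(2)                                  // number of topics
    .setMaxIterations(20)
    .setOptimizer(new OnlineLDAOptimizer())   // or "em" for the EM optimizer
    .setCheckpointInterval(10)

  val model = lda.run(corpus)
  val topics = model.describeTopics(maxTermsPerTopic = 3)
}
```
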
[2/2] spark git commit: Preparing development version 1.5.1-SNAPSHOT

2015-08-25 Thread pwendell
Preparing development version 1.5.1-SNAPSHOT


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4c03cb4d
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4c03cb4d
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4c03cb4d

Branch: refs/heads/branch-1.5
Commit: 4c03cb4da846bf3ea4cd99f593d74c4a817a7d2d
Parents: 7277713
Author: Patrick Wendell 
Authored: Tue Aug 25 15:56:44 2015 -0700
Committer: Patrick Wendell 
Committed: Tue Aug 25 15:56:44 2015 -0700

--
 assembly/pom.xml| 2 +-
 bagel/pom.xml   | 2 +-
 core/pom.xml| 2 +-
 examples/pom.xml| 2 +-
 external/flume-assembly/pom.xml | 2 +-
 external/flume-sink/pom.xml | 2 +-
 external/flume/pom.xml  | 2 +-
 external/kafka-assembly/pom.xml | 2 +-
 external/kafka/pom.xml  | 2 +-
 external/mqtt-assembly/pom.xml  | 2 +-
 external/mqtt/pom.xml   | 2 +-
 external/twitter/pom.xml| 2 +-
 external/zeromq/pom.xml | 2 +-
 extras/java8-tests/pom.xml  | 2 +-
 extras/kinesis-asl-assembly/pom.xml | 2 +-
 extras/kinesis-asl/pom.xml  | 2 +-
 extras/spark-ganglia-lgpl/pom.xml   | 2 +-
 graphx/pom.xml  | 2 +-
 launcher/pom.xml| 2 +-
 mllib/pom.xml   | 2 +-
 network/common/pom.xml  | 2 +-
 network/shuffle/pom.xml | 2 +-
 network/yarn/pom.xml| 2 +-
 pom.xml | 2 +-
 repl/pom.xml| 2 +-
 sql/catalyst/pom.xml| 2 +-
 sql/core/pom.xml| 2 +-
 sql/hive-thriftserver/pom.xml   | 2 +-
 sql/hive/pom.xml| 2 +-
 streaming/pom.xml   | 2 +-
 tools/pom.xml   | 2 +-
 unsafe/pom.xml  | 2 +-
 yarn/pom.xml| 2 +-
 33 files changed, 33 insertions(+), 33 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/4c03cb4d/assembly/pom.xml
--
diff --git a/assembly/pom.xml b/assembly/pom.xml
index 3ef7d6f..7b41ebb 100644
--- a/assembly/pom.xml
+++ b/assembly/pom.xml
@@ -21,7 +21,7 @@
   
 org.apache.spark
 spark-parent_2.10
-1.5.0
+1.5.1-SNAPSHOT
 ../pom.xml
   
 

http://git-wip-us.apache.org/repos/asf/spark/blob/4c03cb4d/bagel/pom.xml
--
diff --git a/bagel/pom.xml b/bagel/pom.xml
index 684e07b..16bf17c 100644
--- a/bagel/pom.xml
+++ b/bagel/pom.xml
@@ -21,7 +21,7 @@
   
 org.apache.spark
 spark-parent_2.10
-1.5.0
+1.5.1-SNAPSHOT
 ../pom.xml
   
 

http://git-wip-us.apache.org/repos/asf/spark/blob/4c03cb4d/core/pom.xml
--
diff --git a/core/pom.xml b/core/pom.xml
index 6b082ad..beb547f 100644
--- a/core/pom.xml
+++ b/core/pom.xml
@@ -21,7 +21,7 @@
   
 org.apache.spark
 spark-parent_2.10
-1.5.0
+1.5.1-SNAPSHOT
 ../pom.xml
   
 

http://git-wip-us.apache.org/repos/asf/spark/blob/4c03cb4d/examples/pom.xml
--
diff --git a/examples/pom.xml b/examples/pom.xml
index 9ef1eda..3926b79 100644
--- a/examples/pom.xml
+++ b/examples/pom.xml
@@ -21,7 +21,7 @@
   
 org.apache.spark
 spark-parent_2.10
-1.5.0
+1.5.1-SNAPSHOT
 ../pom.xml
   
 

http://git-wip-us.apache.org/repos/asf/spark/blob/4c03cb4d/external/flume-assembly/pom.xml
--
diff --git a/external/flume-assembly/pom.xml b/external/flume-assembly/pom.xml
index aa7021d..5eda12d 100644
--- a/external/flume-assembly/pom.xml
+++ b/external/flume-assembly/pom.xml
@@ -21,7 +21,7 @@
   
 org.apache.spark
 spark-parent_2.10
-1.5.0
+1.5.1-SNAPSHOT
 ../../pom.xml
   
 

http://git-wip-us.apache.org/repos/asf/spark/blob/4c03cb4d/external/flume-sink/pom.xml
--
diff --git a/external/flume-sink/pom.xml b/external/flume-sink/pom.xml
index 7d72f78..33f2cd7 100644
--- a/external/flume-sink/pom.xml
+++ b/external/flume-sink/pom.xml
@@ -21,7 +21,7 @@
   
 org.apache.spark
 spark-parent_2.10
-1.5.0
+1.5.1-SNAPSHOT
 ../../pom.xml
   
 

http://git-wip-us.apache.org/repos/asf/spark/blob/4c03cb4d/external/flume/pom.xml
--
diff --git a/external/flume/pom.xml b/external/flume/pom.xml
index 38683e3..670c783 100644
--- a/external/flume/pom.xml
+++ b/external/flume/pom.xml
@@ -21,7 +21,7 @@
   
 org.apache.spark
 s

[1/2] spark git commit: Preparing Spark release v1.5.0-rc2

2015-08-25 Thread pwendell
Repository: spark
Updated Branches:
  refs/heads/branch-1.5 ab7d46d1d -> 4c03cb4da


Preparing Spark release v1.5.0-rc2


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/72777135
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/72777135
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/72777135

Branch: refs/heads/branch-1.5
Commit: 727771352855dbb780008c449a877f5aaa5fc27a
Parents: ab7d46d
Author: Patrick Wendell 
Authored: Tue Aug 25 15:56:37 2015 -0700
Committer: Patrick Wendell 
Committed: Tue Aug 25 15:56:37 2015 -0700

--
 assembly/pom.xml| 2 +-
 bagel/pom.xml   | 2 +-
 core/pom.xml| 2 +-
 examples/pom.xml| 2 +-
 external/flume-assembly/pom.xml | 2 +-
 external/flume-sink/pom.xml | 2 +-
 external/flume/pom.xml  | 2 +-
 external/kafka-assembly/pom.xml | 2 +-
 external/kafka/pom.xml  | 2 +-
 external/mqtt-assembly/pom.xml  | 2 +-
 external/mqtt/pom.xml   | 2 +-
 external/twitter/pom.xml| 2 +-
 external/zeromq/pom.xml | 2 +-
 extras/java8-tests/pom.xml  | 2 +-
 extras/kinesis-asl-assembly/pom.xml | 2 +-
 extras/kinesis-asl/pom.xml  | 2 +-
 extras/spark-ganglia-lgpl/pom.xml   | 2 +-
 graphx/pom.xml  | 2 +-
 launcher/pom.xml| 2 +-
 mllib/pom.xml   | 2 +-
 network/common/pom.xml  | 2 +-
 network/shuffle/pom.xml | 2 +-
 network/yarn/pom.xml| 2 +-
 pom.xml | 2 +-
 repl/pom.xml| 2 +-
 sql/catalyst/pom.xml| 2 +-
 sql/core/pom.xml| 2 +-
 sql/hive-thriftserver/pom.xml   | 2 +-
 sql/hive/pom.xml| 2 +-
 streaming/pom.xml   | 2 +-
 tools/pom.xml   | 2 +-
 unsafe/pom.xml  | 2 +-
 yarn/pom.xml| 2 +-
 33 files changed, 33 insertions(+), 33 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/72777135/assembly/pom.xml
--
diff --git a/assembly/pom.xml b/assembly/pom.xml
index e9c6d26..3ef7d6f 100644
--- a/assembly/pom.xml
+++ b/assembly/pom.xml
@@ -21,7 +21,7 @@
   
 org.apache.spark
 spark-parent_2.10
-1.5.0-SNAPSHOT
+1.5.0
 ../pom.xml
   
 

http://git-wip-us.apache.org/repos/asf/spark/blob/72777135/bagel/pom.xml
--
diff --git a/bagel/pom.xml b/bagel/pom.xml
index ed5c37e..684e07b 100644
--- a/bagel/pom.xml
+++ b/bagel/pom.xml
@@ -21,7 +21,7 @@
   
 org.apache.spark
 spark-parent_2.10
-1.5.0-SNAPSHOT
+1.5.0
 ../pom.xml
   
 

http://git-wip-us.apache.org/repos/asf/spark/blob/72777135/core/pom.xml
--
diff --git a/core/pom.xml b/core/pom.xml
index 4f79d71..6b082ad 100644
--- a/core/pom.xml
+++ b/core/pom.xml
@@ -21,7 +21,7 @@
   
 org.apache.spark
 spark-parent_2.10
-1.5.0-SNAPSHOT
+1.5.0
 ../pom.xml
   
 

http://git-wip-us.apache.org/repos/asf/spark/blob/72777135/examples/pom.xml
--
diff --git a/examples/pom.xml b/examples/pom.xml
index e6884b0..9ef1eda 100644
--- a/examples/pom.xml
+++ b/examples/pom.xml
@@ -21,7 +21,7 @@
   
 org.apache.spark
 spark-parent_2.10
-1.5.0-SNAPSHOT
+1.5.0
 ../pom.xml
   
 

http://git-wip-us.apache.org/repos/asf/spark/blob/72777135/external/flume-assembly/pom.xml
--
diff --git a/external/flume-assembly/pom.xml b/external/flume-assembly/pom.xml
index 561ed4b..aa7021d 100644
--- a/external/flume-assembly/pom.xml
+++ b/external/flume-assembly/pom.xml
@@ -21,7 +21,7 @@
   
 org.apache.spark
 spark-parent_2.10
-1.5.0-SNAPSHOT
+1.5.0
 ../../pom.xml
   
 

http://git-wip-us.apache.org/repos/asf/spark/blob/72777135/external/flume-sink/pom.xml
--
diff --git a/external/flume-sink/pom.xml b/external/flume-sink/pom.xml
index 0664cfb..7d72f78 100644
--- a/external/flume-sink/pom.xml
+++ b/external/flume-sink/pom.xml
@@ -21,7 +21,7 @@
   
 org.apache.spark
 spark-parent_2.10
-1.5.0-SNAPSHOT
+1.5.0
 ../../pom.xml
   
 

http://git-wip-us.apache.org/repos/asf/spark/blob/72777135/external/flume/pom.xml
--
diff --git a/external/flume/pom.xml b/external/flume/pom.xml
index 14f7daa..38683e3 100644
--- a/external/flume/pom.xml
+++ 

Git Push Summary

2015-08-25 Thread pwendell
Repository: spark
Updated Tags:  refs/tags/v1.5.0-rc2 [created] 727771352




spark git commit: [SPARK-10215] [SQL] Fix precision of division (follow the rule in Hive)

2015-08-25 Thread yhuai
Repository: spark
Updated Branches:
  refs/heads/branch-1.5 8925896b1 -> ab7d46d1d


[SPARK-10215] [SQL] Fix precision of division (follow the rule in Hive)

Follow the rule in Hive for decimal division; see
https://github.com/apache/hive/blob/ac755ebe26361a4647d53db2a28500f71697b276/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPDivide.java#L113

cc chenghao-intel

Author: Davies Liu 

Closes #8415 from davies/decimal_div2.

(cherry picked from commit 7467b52ed07f174d93dfc4cb544dc4b69a2c2826)
Signed-off-by: Yin Huai 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ab7d46d1
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ab7d46d1
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ab7d46d1

Branch: refs/heads/branch-1.5
Commit: ab7d46d1d6e7e6705a3348a0cab2d05fe62951cf
Parents: 8925896
Author: Davies Liu 
Authored: Tue Aug 25 15:19:41 2015 -0700
Committer: Yin Huai 
Committed: Tue Aug 25 15:20:42 2015 -0700

--
 .../catalyst/analysis/HiveTypeCoercion.scala| 10 ++--
 .../sql/catalyst/analysis/AnalysisSuite.scala   |  9 ---
 .../analysis/DecimalPrecisionSuite.scala|  8 +++
 .../org/apache/spark/sql/SQLQuerySuite.scala| 25 ++--
 4 files changed, 39 insertions(+), 13 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/ab7d46d1/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
--
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
index a1aa2a2..87c11ab 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
@@ -396,8 +396,14 @@ object HiveTypeCoercion {
 resultType)
 
 case Divide(e1 @ DecimalType.Expression(p1, s1), e2 @ 
DecimalType.Expression(p2, s2)) =>
-  val resultType = DecimalType.bounded(p1 - s1 + s2 + max(6, s1 + p2 + 
1),
-max(6, s1 + p2 + 1))
+  var intDig = min(DecimalType.MAX_SCALE, p1 - s1 + s2)
+  var decDig = min(DecimalType.MAX_SCALE, max(6, s1 + p2 + 1))
+  val diff = (intDig + decDig) - DecimalType.MAX_SCALE
+  if (diff > 0) {
+decDig -= diff / 2 + 1
+intDig = DecimalType.MAX_SCALE - decDig
+  }
+  val resultType = DecimalType.bounded(intDig + decDig, decDig)
   val widerType = widerDecimalType(p1, s1, p2, s2)
   CheckOverflow(Divide(promotePrecision(e1, widerType), 
promotePrecision(e2, widerType)),
 resultType)

http://git-wip-us.apache.org/repos/asf/spark/blob/ab7d46d1/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala
--
diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala
 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala
index 1e0cc81..820b336 100644
--- 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala
+++ 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala
@@ -17,15 +17,14 @@
 
 package org.apache.spark.sql.catalyst.analysis
 
+import org.apache.spark.sql.catalyst.dsl.expressions._
+import org.apache.spark.sql.catalyst.dsl.plans._
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.types._
-import org.apache.spark.sql.catalyst.SimpleCatalystConf
-import org.apache.spark.sql.catalyst.dsl.expressions._
-import org.apache.spark.sql.catalyst.dsl.plans._
 
 class AnalysisSuite extends AnalysisTest {
-  import TestRelations._
+  import org.apache.spark.sql.catalyst.analysis.TestRelations._
 
   test("union project *") {
 val plan = (1 to 100)
@@ -96,7 +95,7 @@ class AnalysisSuite extends AnalysisTest {
 assert(pl(1).dataType == DoubleType)
 assert(pl(2).dataType == DoubleType)
 // StringType will be promoted into Decimal(38, 18)
-assert(pl(3).dataType == DecimalType(38, 29))
+assert(pl(3).dataType == DecimalType(38, 22))
 assert(pl(4).dataType == DoubleType)
   }
 

http://git-wip-us.apache.org/repos/asf/spark/blob/ab7d46d1/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecisionSuite.scala
--
diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecisionSuite.scala
 

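Written out as a standalone sketch, the result-type rule in the HiveTypeCoercion hunk above behaves as follows; the helper name is made up and 38 is assumed to mirror DecimalType.MAX_SCALE.

```
// The result-type rule from the HiveTypeCoercion hunk above, written out as a standalone
// helper for illustration; maxScale mirrors DecimalType.MAX_SCALE (38 in this release).
def divisionResultType(p1: Int, s1: Int, p2: Int, s2: Int, maxScale: Int = 38): (Int, Int) = {
  var intDig = math.min(maxScale, p1 - s1 + s2)
  var decDig = math.min(maxScale, math.max(6, s1 + p2 + 1))
  val diff = (intDig + decDig) - maxScale
  if (diff > 0) {                  // shrink both parts so the total fits in maxScale digits
    decDig -= diff / 2 + 1
    intDig = maxScale - decDig
  }
  (intDig + decDig, decDig)        // (precision, scale) of the division result
}

// Dividing two Decimal(38, 18) values yields Decimal(38, 18) under this rule.
val divType = divisionResultType(38, 18, 38, 18)   // (38, 18)
```
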
spark git commit: [SPARK-10215] [SQL] Fix precision of division (follow the rule in Hive)

2015-08-25 Thread yhuai
Repository: spark
Updated Branches:
  refs/heads/master ec89bd840 -> 7467b52ed


[SPARK-10215] [SQL] Fix precision of division (follow the rule in Hive)

Follow the rule in Hive for decimal division; see
https://github.com/apache/hive/blob/ac755ebe26361a4647d53db2a28500f71697b276/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPDivide.java#L113

cc chenghao-intel

Author: Davies Liu 

Closes #8415 from davies/decimal_div2.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7467b52e
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7467b52e
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7467b52e

Branch: refs/heads/master
Commit: 7467b52ed07f174d93dfc4cb544dc4b69a2c2826
Parents: ec89bd8
Author: Davies Liu 
Authored: Tue Aug 25 15:19:41 2015 -0700
Committer: Yin Huai 
Committed: Tue Aug 25 15:20:24 2015 -0700

--
 .../catalyst/analysis/HiveTypeCoercion.scala| 10 ++--
 .../sql/catalyst/analysis/AnalysisSuite.scala   |  9 ---
 .../analysis/DecimalPrecisionSuite.scala|  8 +++
 .../org/apache/spark/sql/SQLQuerySuite.scala| 25 ++--
 4 files changed, 39 insertions(+), 13 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/7467b52e/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
--
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
index a1aa2a2..87c11ab 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
@@ -396,8 +396,14 @@ object HiveTypeCoercion {
 resultType)
 
 case Divide(e1 @ DecimalType.Expression(p1, s1), e2 @ 
DecimalType.Expression(p2, s2)) =>
-  val resultType = DecimalType.bounded(p1 - s1 + s2 + max(6, s1 + p2 + 
1),
-max(6, s1 + p2 + 1))
+  var intDig = min(DecimalType.MAX_SCALE, p1 - s1 + s2)
+  var decDig = min(DecimalType.MAX_SCALE, max(6, s1 + p2 + 1))
+  val diff = (intDig + decDig) - DecimalType.MAX_SCALE
+  if (diff > 0) {
+decDig -= diff / 2 + 1
+intDig = DecimalType.MAX_SCALE - decDig
+  }
+  val resultType = DecimalType.bounded(intDig + decDig, decDig)
   val widerType = widerDecimalType(p1, s1, p2, s2)
   CheckOverflow(Divide(promotePrecision(e1, widerType), 
promotePrecision(e2, widerType)),
 resultType)

http://git-wip-us.apache.org/repos/asf/spark/blob/7467b52e/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala
--
diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala
 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala
index 1e0cc81..820b336 100644
--- 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala
+++ 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala
@@ -17,15 +17,14 @@
 
 package org.apache.spark.sql.catalyst.analysis
 
+import org.apache.spark.sql.catalyst.dsl.expressions._
+import org.apache.spark.sql.catalyst.dsl.plans._
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.types._
-import org.apache.spark.sql.catalyst.SimpleCatalystConf
-import org.apache.spark.sql.catalyst.dsl.expressions._
-import org.apache.spark.sql.catalyst.dsl.plans._
 
 class AnalysisSuite extends AnalysisTest {
-  import TestRelations._
+  import org.apache.spark.sql.catalyst.analysis.TestRelations._
 
   test("union project *") {
 val plan = (1 to 100)
@@ -96,7 +95,7 @@ class AnalysisSuite extends AnalysisTest {
 assert(pl(1).dataType == DoubleType)
 assert(pl(2).dataType == DoubleType)
 // StringType will be promoted into Decimal(38, 18)
-assert(pl(3).dataType == DecimalType(38, 29))
+assert(pl(3).dataType == DecimalType(38, 22))
 assert(pl(4).dataType == DoubleType)
   }
 

http://git-wip-us.apache.org/repos/asf/spark/blob/7467b52e/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecisionSuite.scala
--
diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecisionSuite.scala
 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecisionSuite.scala
index 

spark git commit: [SPARK-10245] [SQL] Fix decimal literals with precision < scale

2015-08-25 Thread yhuai
Repository: spark
Updated Branches:
  refs/heads/branch-1.5 6f05b7aeb -> 8925896b1


[SPARK-10245] [SQL] Fix decimal literals with precision < scale

In BigDecimal or java.math.BigDecimal, the precision can be smaller than the
scale; for example, BigDecimal("0.001") has precision = 1 and scale = 3. But
DecimalType requires that the precision be at least the scale, so we should use
the maximum of precision and scale when inferring the schema from a decimal
literal.

Author: Davies Liu 

Closes #8428 from davies/smaller_decimal.

(cherry picked from commit ec89bd840a6862751999d612f586a962cae63f6d)
Signed-off-by: Yin Huai 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8925896b
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8925896b
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8925896b

Branch: refs/heads/branch-1.5
Commit: 8925896b1eb0a13d723d38fb263d3bec0a01ec10
Parents: 6f05b7a
Author: Davies Liu 
Authored: Tue Aug 25 14:55:34 2015 -0700
Committer: Yin Huai 
Committed: Tue Aug 25 14:55:45 2015 -0700

--
 .../apache/spark/sql/catalyst/expressions/literals.scala  |  7 ---
 .../sql/catalyst/expressions/LiteralExpressionSuite.scala |  8 +---
 .../test/scala/org/apache/spark/sql/SQLQuerySuite.scala   | 10 ++
 3 files changed, 19 insertions(+), 6 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/8925896b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala
--
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala
index 34bad23..8c0c5d5 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala
@@ -36,9 +36,10 @@ object Literal {
 case s: Short => Literal(s, ShortType)
 case s: String => Literal(UTF8String.fromString(s), StringType)
 case b: Boolean => Literal(b, BooleanType)
-case d: BigDecimal => Literal(Decimal(d), DecimalType(d.precision, 
d.scale))
-case d: java.math.BigDecimal => Literal(Decimal(d), 
DecimalType(d.precision(), d.scale()))
-case d: Decimal => Literal(d, DecimalType(d.precision, d.scale))
+case d: BigDecimal => Literal(Decimal(d), 
DecimalType(Math.max(d.precision, d.scale), d.scale))
+case d: java.math.BigDecimal =>
+  Literal(Decimal(d), DecimalType(Math.max(d.precision, d.scale), 
d.scale()))
+case d: Decimal => Literal(d, DecimalType(Math.max(d.precision, d.scale), 
d.scale))
 case t: Timestamp => Literal(DateTimeUtils.fromJavaTimestamp(t), 
TimestampType)
 case d: Date => Literal(DateTimeUtils.fromJavaDate(d), DateType)
 case a: Array[Byte] => Literal(a, BinaryType)

http://git-wip-us.apache.org/repos/asf/spark/blob/8925896b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala
--
diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala
 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala
index f6404d2..015eb18 100644
--- 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala
+++ 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala
@@ -83,12 +83,14 @@ class LiteralExpressionSuite extends SparkFunSuite with 
ExpressionEvalHelper {
   }
 
   test("decimal") {
-List(0.0, 1.2, 1., 5).foreach { d =>
+List(-0.0001, 0.0, 0.001, 1.2, 1., 5).foreach { d =>
   checkEvaluation(Literal(Decimal(d)), Decimal(d))
   checkEvaluation(Literal(Decimal(d.toInt)), Decimal(d.toInt))
   checkEvaluation(Literal(Decimal(d.toLong)), Decimal(d.toLong))
-  checkEvaluation(Literal(Decimal((d * 1000L).toLong, 10, 1)),
-Decimal((d * 1000L).toLong, 10, 1))
+  checkEvaluation(Literal(Decimal((d * 1000L).toLong, 10, 3)),
+Decimal((d * 1000L).toLong, 10, 3))
+  checkEvaluation(Literal(BigDecimal(d.toString)), Decimal(d))
+  checkEvaluation(Literal(new java.math.BigDecimal(d.toString)), 
Decimal(d))
 }
   }
 

http://git-wip-us.apache.org/repos/asf/spark/blob/8925896b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
--
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
index dcb4e83..aa07665 100644
--- 

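A tiny worked example of the case this commit fixes; the literal value is illustrative.

```
// BigDecimal allows precision < scale, while DecimalType does not, so the literal
// conversion above takes max(precision, scale) as the precision.
val d = BigDecimal("0.001")
val p = d.precision                     // 1
val s = d.scale                         // 3
val inferred = math.max(p, s)           // 3, giving DecimalType(3, 3) for this literal
```
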
spark git commit: [SPARK-10245] [SQL] Fix decimal literals with precision < scale

2015-08-25 Thread yhuai
Repository: spark
Updated Branches:
  refs/heads/master 00ae4be97 -> ec89bd840


[SPARK-10245] [SQL] Fix decimal literals with precision < scale

In BigDecimal or java.math.BigDecimal, the precision can be smaller than the
scale; for example, BigDecimal("0.001") has precision = 1 and scale = 3. But
DecimalType requires that the precision be at least the scale, so we should use
the maximum of precision and scale when inferring the schema from a decimal
literal.

Author: Davies Liu 

Closes #8428 from davies/smaller_decimal.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ec89bd84
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ec89bd84
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ec89bd84

Branch: refs/heads/master
Commit: ec89bd840a6862751999d612f586a962cae63f6d
Parents: 00ae4be
Author: Davies Liu 
Authored: Tue Aug 25 14:55:34 2015 -0700
Committer: Yin Huai 
Committed: Tue Aug 25 14:55:34 2015 -0700

--
 .../apache/spark/sql/catalyst/expressions/literals.scala  |  7 ---
 .../sql/catalyst/expressions/LiteralExpressionSuite.scala |  8 +---
 .../test/scala/org/apache/spark/sql/SQLQuerySuite.scala   | 10 ++
 3 files changed, 19 insertions(+), 6 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/ec89bd84/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala
--
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala
index 34bad23..8c0c5d5 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala
@@ -36,9 +36,10 @@ object Literal {
 case s: Short => Literal(s, ShortType)
 case s: String => Literal(UTF8String.fromString(s), StringType)
 case b: Boolean => Literal(b, BooleanType)
-case d: BigDecimal => Literal(Decimal(d), DecimalType(d.precision, 
d.scale))
-case d: java.math.BigDecimal => Literal(Decimal(d), 
DecimalType(d.precision(), d.scale()))
-case d: Decimal => Literal(d, DecimalType(d.precision, d.scale))
+case d: BigDecimal => Literal(Decimal(d), 
DecimalType(Math.max(d.precision, d.scale), d.scale))
+case d: java.math.BigDecimal =>
+  Literal(Decimal(d), DecimalType(Math.max(d.precision, d.scale), 
d.scale()))
+case d: Decimal => Literal(d, DecimalType(Math.max(d.precision, d.scale), 
d.scale))
 case t: Timestamp => Literal(DateTimeUtils.fromJavaTimestamp(t), 
TimestampType)
 case d: Date => Literal(DateTimeUtils.fromJavaDate(d), DateType)
 case a: Array[Byte] => Literal(a, BinaryType)

http://git-wip-us.apache.org/repos/asf/spark/blob/ec89bd84/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala
--
diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala
 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala
index f6404d2..015eb18 100644
--- 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala
+++ 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala
@@ -83,12 +83,14 @@ class LiteralExpressionSuite extends SparkFunSuite with 
ExpressionEvalHelper {
   }
 
   test("decimal") {
-List(0.0, 1.2, 1., 5).foreach { d =>
+List(-0.0001, 0.0, 0.001, 1.2, 1., 5).foreach { d =>
   checkEvaluation(Literal(Decimal(d)), Decimal(d))
   checkEvaluation(Literal(Decimal(d.toInt)), Decimal(d.toInt))
   checkEvaluation(Literal(Decimal(d.toLong)), Decimal(d.toLong))
-  checkEvaluation(Literal(Decimal((d * 1000L).toLong, 10, 1)),
-Decimal((d * 1000L).toLong, 10, 1))
+  checkEvaluation(Literal(Decimal((d * 1000L).toLong, 10, 3)),
+Decimal((d * 1000L).toLong, 10, 3))
+  checkEvaluation(Literal(BigDecimal(d.toString)), Decimal(d))
+  checkEvaluation(Literal(new java.math.BigDecimal(d.toString)), 
Decimal(d))
 }
   }
 

http://git-wip-us.apache.org/repos/asf/spark/blob/ec89bd84/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
--
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
index dcb4e83..aa07665 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
+++ b/sql/core/src/test/scala/org/ap

spark git commit: [SPARK-10239] [SPARK-10244] [MLLIB] update since versions in mllib.pmml and mllib.util

2015-08-25 Thread dbtsai
Repository: spark
Updated Branches:
  refs/heads/branch-1.5 055387c08 -> 6f05b7aeb


[SPARK-10239] [SPARK-10244] [MLLIB] update since versions in mllib.pmml and 
mllib.util

Same as #8421 but for `mllib.pmml` and `mllib.util`.

cc dbtsai

Author: Xiangrui Meng 

Closes #8430 from mengxr/SPARK-10239 and squashes the following commits:

a189acf [Xiangrui Meng] update since versions in mllib.pmml and mllib.util

(cherry picked from commit 00ae4be97f7b205432db2967ba6d506286ef2ca6)
Signed-off-by: DB Tsai 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6f05b7ae
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6f05b7ae
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6f05b7ae

Branch: refs/heads/branch-1.5
Commit: 6f05b7aebd66a00e2556a29b35084e81ac526406
Parents: 055387c
Author: Xiangrui Meng 
Authored: Tue Aug 25 14:11:38 2015 -0700
Committer: DB Tsai 
Committed: Tue Aug 25 14:11:50 2015 -0700

--
 .../org/apache/spark/mllib/pmml/PMMLExportable.scala  |  7 ++-
 .../org/apache/spark/mllib/util/DataValidators.scala  |  7 +--
 .../org/apache/spark/mllib/util/KMeansDataGenerator.scala |  5 -
 .../org/apache/spark/mllib/util/LinearDataGenerator.scala | 10 --
 .../mllib/util/LogisticRegressionDataGenerator.scala  |  5 -
 .../org/apache/spark/mllib/util/MFDataGenerator.scala |  4 +++-
 .../main/scala/org/apache/spark/mllib/util/MLUtils.scala  |  2 ++
 .../org/apache/spark/mllib/util/SVMDataGenerator.scala|  6 --
 .../scala/org/apache/spark/mllib/util/modelSaveLoad.scala |  6 +-
 9 files changed, 41 insertions(+), 11 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/6f05b7ae/mllib/src/main/scala/org/apache/spark/mllib/pmml/PMMLExportable.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/pmml/PMMLExportable.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/pmml/PMMLExportable.scala
index 5e882d4..274ac7c 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/pmml/PMMLExportable.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/pmml/PMMLExportable.scala
@@ -23,7 +23,7 @@ import javax.xml.transform.stream.StreamResult
 import org.jpmml.model.JAXBUtil
 
 import org.apache.spark.SparkContext
-import org.apache.spark.annotation.{DeveloperApi, Experimental}
+import org.apache.spark.annotation.{DeveloperApi, Experimental, Since}
 import org.apache.spark.mllib.pmml.export.PMMLModelExportFactory
 
 /**
@@ -33,6 +33,7 @@ import 
org.apache.spark.mllib.pmml.export.PMMLModelExportFactory
  * developed by the Data Mining Group (www.dmg.org).
  */
 @DeveloperApi
+@Since("1.4.0")
 trait PMMLExportable {
 
   /**
@@ -48,6 +49,7 @@ trait PMMLExportable {
* Export the model to a local file in PMML format
*/
   @Experimental
+  @Since("1.4.0")
   def toPMML(localPath: String): Unit = {
 toPMML(new StreamResult(new File(localPath)))
   }
@@ -57,6 +59,7 @@ trait PMMLExportable {
* Export the model to a directory on a distributed file system in PMML 
format
*/
   @Experimental
+  @Since("1.4.0")
   def toPMML(sc: SparkContext, path: String): Unit = {
 val pmml = toPMML()
 sc.parallelize(Array(pmml), 1).saveAsTextFile(path)
@@ -67,6 +70,7 @@ trait PMMLExportable {
* Export the model to the OutputStream in PMML format
*/
   @Experimental
+  @Since("1.4.0")
   def toPMML(outputStream: OutputStream): Unit = {
 toPMML(new StreamResult(outputStream))
   }
@@ -76,6 +80,7 @@ trait PMMLExportable {
* Export the model to a String in PMML format
*/
   @Experimental
+  @Since("1.4.0")
   def toPMML(): String = {
 val writer = new StringWriter
 toPMML(new StreamResult(writer))

http://git-wip-us.apache.org/repos/asf/spark/blob/6f05b7ae/mllib/src/main/scala/org/apache/spark/mllib/util/DataValidators.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/util/DataValidators.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/util/DataValidators.scala
index be335a1..dffe6e7 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/util/DataValidators.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/util/DataValidators.scala
@@ -17,16 +17,17 @@
 
 package org.apache.spark.mllib.util
 
-import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.Logging
-import org.apache.spark.rdd.RDD
+import org.apache.spark.annotation.{DeveloperApi, Since}
 import org.apache.spark.mllib.regression.LabeledPoint
+import org.apache.spark.rdd.RDD
 
 /**
  * :: DeveloperApi ::
  * A collection of methods used to validate data before applying ML algorithms.
  */
 @DeveloperApi
+@Since("0.8.0")
 object DataValidators extends Logging {
 
   /**
@

spark git commit: [SPARK-10239] [SPARK-10244] [MLLIB] update since versions in mllib.pmml and mllib.util

2015-08-25 Thread dbtsai
Repository: spark
Updated Branches:
  refs/heads/master 920590787 -> 00ae4be97


[SPARK-10239] [SPARK-10244] [MLLIB] update since versions in mllib.pmml and 
mllib.util

Same as #8421 but for `mllib.pmml` and `mllib.util`.

cc dbtsai

Author: Xiangrui Meng 

Closes #8430 from mengxr/SPARK-10239 and squashes the following commits:

a189acf [Xiangrui Meng] update since versions in mllib.pmml and mllib.util


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/00ae4be9
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/00ae4be9
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/00ae4be9

Branch: refs/heads/master
Commit: 00ae4be97f7b205432db2967ba6d506286ef2ca6
Parents: 9205907
Author: Xiangrui Meng 
Authored: Tue Aug 25 14:11:38 2015 -0700
Committer: DB Tsai 
Committed: Tue Aug 25 14:11:38 2015 -0700

--
 .../org/apache/spark/mllib/pmml/PMMLExportable.scala  |  7 ++-
 .../org/apache/spark/mllib/util/DataValidators.scala  |  7 +--
 .../org/apache/spark/mllib/util/KMeansDataGenerator.scala |  5 -
 .../org/apache/spark/mllib/util/LinearDataGenerator.scala | 10 --
 .../mllib/util/LogisticRegressionDataGenerator.scala  |  5 -
 .../org/apache/spark/mllib/util/MFDataGenerator.scala |  4 +++-
 .../main/scala/org/apache/spark/mllib/util/MLUtils.scala  |  2 ++
 .../org/apache/spark/mllib/util/SVMDataGenerator.scala|  6 --
 .../scala/org/apache/spark/mllib/util/modelSaveLoad.scala |  6 +-
 9 files changed, 41 insertions(+), 11 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/00ae4be9/mllib/src/main/scala/org/apache/spark/mllib/pmml/PMMLExportable.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/pmml/PMMLExportable.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/pmml/PMMLExportable.scala
index 5e882d4..274ac7c 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/pmml/PMMLExportable.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/pmml/PMMLExportable.scala
@@ -23,7 +23,7 @@ import javax.xml.transform.stream.StreamResult
 import org.jpmml.model.JAXBUtil
 
 import org.apache.spark.SparkContext
-import org.apache.spark.annotation.{DeveloperApi, Experimental}
+import org.apache.spark.annotation.{DeveloperApi, Experimental, Since}
 import org.apache.spark.mllib.pmml.export.PMMLModelExportFactory
 
 /**
@@ -33,6 +33,7 @@ import 
org.apache.spark.mllib.pmml.export.PMMLModelExportFactory
  * developed by the Data Mining Group (www.dmg.org).
  */
 @DeveloperApi
+@Since("1.4.0")
 trait PMMLExportable {
 
   /**
@@ -48,6 +49,7 @@ trait PMMLExportable {
* Export the model to a local file in PMML format
*/
   @Experimental
+  @Since("1.4.0")
   def toPMML(localPath: String): Unit = {
 toPMML(new StreamResult(new File(localPath)))
   }
@@ -57,6 +59,7 @@ trait PMMLExportable {
* Export the model to a directory on a distributed file system in PMML 
format
*/
   @Experimental
+  @Since("1.4.0")
   def toPMML(sc: SparkContext, path: String): Unit = {
 val pmml = toPMML()
 sc.parallelize(Array(pmml), 1).saveAsTextFile(path)
@@ -67,6 +70,7 @@ trait PMMLExportable {
* Export the model to the OutputStream in PMML format
*/
   @Experimental
+  @Since("1.4.0")
   def toPMML(outputStream: OutputStream): Unit = {
 toPMML(new StreamResult(outputStream))
   }
@@ -76,6 +80,7 @@ trait PMMLExportable {
* Export the model to a String in PMML format
*/
   @Experimental
+  @Since("1.4.0")
   def toPMML(): String = {
 val writer = new StringWriter
 toPMML(new StreamResult(writer))

http://git-wip-us.apache.org/repos/asf/spark/blob/00ae4be9/mllib/src/main/scala/org/apache/spark/mllib/util/DataValidators.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/util/DataValidators.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/util/DataValidators.scala
index be335a1..dffe6e7 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/util/DataValidators.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/util/DataValidators.scala
@@ -17,16 +17,17 @@
 
 package org.apache.spark.mllib.util
 
-import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.Logging
-import org.apache.spark.rdd.RDD
+import org.apache.spark.annotation.{DeveloperApi, Since}
 import org.apache.spark.mllib.regression.LabeledPoint
+import org.apache.spark.rdd.RDD
 
 /**
  * :: DeveloperApi ::
  * A collection of methods used to validate data before applying ML algorithms.
  */
 @DeveloperApi
+@Since("0.8.0")
 object DataValidators extends Logging {
 
   /**
@@ -34,6 +35,7 @@ object DataValidators extends Logging {
*
* @return True if labels are all ze

spark git commit: [SPARK-9797] [MLLIB] [DOC] StreamingLinearRegressionWithSGD.setConvergenceTol default value

2015-08-25 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/branch-1.5 186326df2 -> 055387c08


[SPARK-9797] [MLLIB] [DOC] StreamingLinearRegressionWithSGD.setConvergenceTol 
default value

Adds default convergence tolerance (0.001, set in 
`GradientDescent.convergenceTol`) to `setConvergenceTol`'s scaladoc
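
A minimal usage sketch (the stream, weights, and values below are illustrative
assumptions, not part of this patch):

```
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.StreamingLinearRegressionWithSGD

// Leaving the tolerance unset keeps the documented default of 0.001;
// calling the setter, as below, only overrides that default.
val model = new StreamingLinearRegressionWithSGD()
  .setInitialWeights(Vectors.zeros(2))
  .setStepSize(0.1)
  .setNumIterations(50)
  .setConvergenceTol(1e-4)

// model.trainOn(trainingStream)  // `trainingStream` is an assumed DStream[LabeledPoint]
```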

Author: Feynman Liang 

Closes #8424 from feynmanliang/SPARK-9797.

(cherry picked from commit 9205907876cf65695e56c2a94bedd83df3675c03)
Signed-off-by: Joseph K. Bradley 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/055387c0
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/055387c0
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/055387c0

Branch: refs/heads/branch-1.5
Commit: 055387c087989c8790b6761429b68416ecee3a33
Parents: 186326d
Author: Feynman Liang 
Authored: Tue Aug 25 13:23:15 2015 -0700
Committer: Joseph K. Bradley 
Committed: Tue Aug 25 13:23:25 2015 -0700

--
 .../spark/mllib/regression/StreamingLinearRegressionWithSGD.scala  | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/055387c0/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearRegressionWithSGD.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearRegressionWithSGD.scala
 
b/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearRegressionWithSGD.scala
index 537a052..26654e4 100644
--- 
a/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearRegressionWithSGD.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearRegressionWithSGD.scala
@@ -93,7 +93,7 @@ class StreamingLinearRegressionWithSGD private[mllib] (
   }
 
   /**
-   * Set the convergence tolerance.
+   * Set the convergence tolerance. Default: 0.001.
*/
   def setConvergenceTol(tolerance: Double): this.type = {
 this.algorithm.optimizer.setConvergenceTol(tolerance)





spark git commit: [SPARK-9797] [MLLIB] [DOC] StreamingLinearRegressionWithSGD.setConvergenceTol default value

2015-08-25 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/master c619c7552 -> 920590787


[SPARK-9797] [MLLIB] [DOC] StreamingLinearRegressionWithSGD.setConvergenceTol 
default value

Adds default convergence tolerance (0.001, set in 
`GradientDescent.convergenceTol`) to `setConvergenceTol`'s scaladoc

Author: Feynman Liang 

Closes #8424 from feynmanliang/SPARK-9797.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/92059078
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/92059078
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/92059078

Branch: refs/heads/master
Commit: 9205907876cf65695e56c2a94bedd83df3675c03
Parents: c619c75
Author: Feynman Liang 
Authored: Tue Aug 25 13:23:15 2015 -0700
Committer: Joseph K. Bradley 
Committed: Tue Aug 25 13:23:15 2015 -0700

--
 .../spark/mllib/regression/StreamingLinearRegressionWithSGD.scala  | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/92059078/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearRegressionWithSGD.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearRegressionWithSGD.scala
 
b/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearRegressionWithSGD.scala
index 537a052..26654e4 100644
--- 
a/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearRegressionWithSGD.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearRegressionWithSGD.scala
@@ -93,7 +93,7 @@ class StreamingLinearRegressionWithSGD private[mllib] (
   }
 
   /**
-   * Set the convergence tolerance.
+   * Set the convergence tolerance. Default: 0.001.
*/
   def setConvergenceTol(tolerance: Double): this.type = {
 this.algorithm.optimizer.setConvergenceTol(tolerance)





spark git commit: [SPARK-10237] [MLLIB] update since versions in mllib.fpm

2015-08-25 Thread meng
Repository: spark
Updated Branches:
  refs/heads/master c0e9ff158 -> c619c7552


[SPARK-10237] [MLLIB] update since versions in mllib.fpm

Same as #8421 but for `mllib.fpm`.

cc feynmanliang

Author: Xiangrui Meng 

Closes #8429 from mengxr/SPARK-10237.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c619c755
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c619c755
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c619c755

Branch: refs/heads/master
Commit: c619c7552f22d28cfa321ce671fc9ca854dd655f
Parents: c0e9ff1
Author: Xiangrui Meng 
Authored: Tue Aug 25 13:22:38 2015 -0700
Committer: Xiangrui Meng 
Committed: Tue Aug 25 13:22:38 2015 -0700

--
 .../spark/mllib/fpm/AssociationRules.scala  |  7 --
 .../org/apache/spark/mllib/fpm/FPGrowth.scala   |  9 ++--
 .../org/apache/spark/mllib/fpm/PrefixSpan.scala | 23 +---
 3 files changed, 32 insertions(+), 7 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/c619c755/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala
index ba3b447..95c688c 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala
@@ -82,12 +82,15 @@ class AssociationRules private[fpm] (
 }.filter(_.confidence >= minConfidence)
   }
 
+  /** Java-friendly version of [[run]]. */
+  @Since("1.5.0")
   def run[Item](freqItemsets: JavaRDD[FreqItemset[Item]]): JavaRDD[Rule[Item]] 
= {
 val tag = fakeClassTag[Item]
 run(freqItemsets.rdd)(tag)
   }
 }
 
+@Since("1.5.0")
 object AssociationRules {
 
   /**
@@ -104,8 +107,8 @@ object AssociationRules {
   @Since("1.5.0")
   @Experimental
   class Rule[Item] private[fpm] (
-  val antecedent: Array[Item],
-  val consequent: Array[Item],
+  @Since("1.5.0") val antecedent: Array[Item],
+  @Since("1.5.0") val consequent: Array[Item],
   freqUnion: Double,
   freqAntecedent: Double) extends Serializable {
 

http://git-wip-us.apache.org/repos/asf/spark/blob/c619c755/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala
index e37f806..aea5c4f 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala
@@ -42,7 +42,8 @@ import org.apache.spark.storage.StorageLevel
  */
 @Since("1.3.0")
 @Experimental
-class FPGrowthModel[Item: ClassTag](val freqItemsets: RDD[FreqItemset[Item]]) 
extends Serializable {
+class FPGrowthModel[Item: ClassTag] @Since("1.3.0") (
+@Since("1.3.0") val freqItemsets: RDD[FreqItemset[Item]]) extends 
Serializable {
   /**
* Generates association rules for the [[Item]]s in [[freqItemsets]].
* @param confidence minimal confidence of the rules produced
@@ -126,6 +127,8 @@ class FPGrowth private (
 new FPGrowthModel(freqItemsets)
   }
 
+  /** Java-friendly version of [[run]]. */
+  @Since("1.3.0")
   def run[Item, Basket <: JavaIterable[Item]](data: JavaRDD[Basket]): 
FPGrowthModel[Item] = {
 implicit val tag = fakeClassTag[Item]
 run(data.rdd.map(_.asScala.toArray))
@@ -226,7 +229,9 @@ object FPGrowth {
*
*/
   @Since("1.3.0")
-  class FreqItemset[Item](val items: Array[Item], val freq: Long) extends 
Serializable {
+  class FreqItemset[Item] @Since("1.3.0") (
+  @Since("1.3.0") val items: Array[Item],
+  @Since("1.3.0") val freq: Long) extends Serializable {
 
 /**
  * Returns items in a Java List.

http://git-wip-us.apache.org/repos/asf/spark/blob/c619c755/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala
index dc4ae1d..97916da 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala
@@ -25,7 +25,7 @@ import scala.collection.JavaConverters._
 import scala.reflect.ClassTag
 
 import org.apache.spark.Logging
-import org.apache.spark.annotation.Experimental
+import org.apache.spark.annotation.{Experimental, Since}
 import org.apache.spark.api.java.JavaRDD
 import org.apache.spark.api.java.JavaSparkContext.fakeClassTag
 import org.apac

spark git commit: [SPARK-10237] [MLLIB] update since versions in mllib.fpm

2015-08-25 Thread meng
Repository: spark
Updated Branches:
  refs/heads/branch-1.5 95e44b4df -> 186326df2


[SPARK-10237] [MLLIB] update since versions in mllib.fpm

Same as #8421 but for `mllib.fpm`.

cc feynmanliang

Author: Xiangrui Meng 

Closes #8429 from mengxr/SPARK-10237.

(cherry picked from commit c619c7552f22d28cfa321ce671fc9ca854dd655f)
Signed-off-by: Xiangrui Meng 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/186326df
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/186326df
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/186326df

Branch: refs/heads/branch-1.5
Commit: 186326df21daf8d8271a522f2569eb5cd7be1442
Parents: 95e44b4
Author: Xiangrui Meng 
Authored: Tue Aug 25 13:22:38 2015 -0700
Committer: Xiangrui Meng 
Committed: Tue Aug 25 13:22:45 2015 -0700

--
 .../spark/mllib/fpm/AssociationRules.scala  |  7 --
 .../org/apache/spark/mllib/fpm/FPGrowth.scala   |  9 ++--
 .../org/apache/spark/mllib/fpm/PrefixSpan.scala | 23 +---
 3 files changed, 32 insertions(+), 7 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/186326df/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala
index ba3b447..95c688c 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala
@@ -82,12 +82,15 @@ class AssociationRules private[fpm] (
 }.filter(_.confidence >= minConfidence)
   }
 
+  /** Java-friendly version of [[run]]. */
+  @Since("1.5.0")
   def run[Item](freqItemsets: JavaRDD[FreqItemset[Item]]): JavaRDD[Rule[Item]] 
= {
 val tag = fakeClassTag[Item]
 run(freqItemsets.rdd)(tag)
   }
 }
 
+@Since("1.5.0")
 object AssociationRules {
 
   /**
@@ -104,8 +107,8 @@ object AssociationRules {
   @Since("1.5.0")
   @Experimental
   class Rule[Item] private[fpm] (
-  val antecedent: Array[Item],
-  val consequent: Array[Item],
+  @Since("1.5.0") val antecedent: Array[Item],
+  @Since("1.5.0") val consequent: Array[Item],
   freqUnion: Double,
   freqAntecedent: Double) extends Serializable {
 

http://git-wip-us.apache.org/repos/asf/spark/blob/186326df/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala
index e37f806..aea5c4f 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala
@@ -42,7 +42,8 @@ import org.apache.spark.storage.StorageLevel
  */
 @Since("1.3.0")
 @Experimental
-class FPGrowthModel[Item: ClassTag](val freqItemsets: RDD[FreqItemset[Item]]) 
extends Serializable {
+class FPGrowthModel[Item: ClassTag] @Since("1.3.0") (
+@Since("1.3.0") val freqItemsets: RDD[FreqItemset[Item]]) extends 
Serializable {
   /**
* Generates association rules for the [[Item]]s in [[freqItemsets]].
* @param confidence minimal confidence of the rules produced
@@ -126,6 +127,8 @@ class FPGrowth private (
 new FPGrowthModel(freqItemsets)
   }
 
+  /** Java-friendly version of [[run]]. */
+  @Since("1.3.0")
   def run[Item, Basket <: JavaIterable[Item]](data: JavaRDD[Basket]): 
FPGrowthModel[Item] = {
 implicit val tag = fakeClassTag[Item]
 run(data.rdd.map(_.asScala.toArray))
@@ -226,7 +229,9 @@ object FPGrowth {
*
*/
   @Since("1.3.0")
-  class FreqItemset[Item](val items: Array[Item], val freq: Long) extends 
Serializable {
+  class FreqItemset[Item] @Since("1.3.0") (
+  @Since("1.3.0") val items: Array[Item],
+  @Since("1.3.0") val freq: Long) extends Serializable {
 
 /**
  * Returns items in a Java List.

http://git-wip-us.apache.org/repos/asf/spark/blob/186326df/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala
index dc4ae1d..97916da 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala
@@ -25,7 +25,7 @@ import scala.collection.JavaConverters._
 import scala.reflect.ClassTag
 
 import org.apache.spark.Logging
-import org.apache.spark.annotation.Experimental
+import org.apache.spark.annotation.{Experimental, Since}
 import org.ap

spark git commit: [SPARK-9800] Adds docs for GradientDescent$.runMiniBatchSGD alias

2015-08-25 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/master 71a138cd0 -> c0e9ff158


[SPARK-9800] Adds docs for GradientDescent$.runMiniBatchSGD alias

* Adds doc for the alias of runMiniBatchSGD documenting the default value for
convergenceTol (see the sketch after this list)
* Cleans up a note in code
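
A hedged sketch of calling the 8-argument alias; the gradient/updater choices,
the tiny data set, and the local SparkContext are illustrative assumptions:

```
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.optimization.{GradientDescent, LeastSquaresGradient, SimpleUpdater}

val sc = new SparkContext(new SparkConf().setAppName("sgd-sketch").setMaster("local[2]"))
val data = sc.parallelize(Seq(          // tiny illustrative (label, features) pairs
  (1.0, Vectors.dense(1.0, 2.0)),
  (0.0, Vectors.dense(2.0, 1.0))))

// The alias omits convergenceTol, so the run uses the documented default of 0.001.
val (weights, lossHistory) = GradientDescent.runMiniBatchSGD(
  data,
  new LeastSquaresGradient(),
  new SimpleUpdater(),
  0.1,                       // stepSize
  100,                       // numIterations
  0.0,                       // regParam
  1.0,                       // miniBatchFraction
  Vectors.dense(0.0, 0.0))   // initialWeights
```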

Author: Feynman Liang 

Closes #8425 from feynmanliang/SPARK-9800.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c0e9ff15
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c0e9ff15
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c0e9ff15

Branch: refs/heads/master
Commit: c0e9ff1588b4d9313cc6ec6e00e5c7663eb67910
Parents: 71a138c
Author: Feynman Liang 
Authored: Tue Aug 25 13:21:05 2015 -0700
Committer: Joseph K. Bradley 
Committed: Tue Aug 25 13:21:05 2015 -0700

--
 .../org/apache/spark/mllib/optimization/GradientDescent.scala   | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/c0e9ff15/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala
 
b/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala
index 8f0d1e4..3b663b5 100644
--- 
a/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala
@@ -235,7 +235,7 @@ object GradientDescent extends Logging {
 
   if (miniBatchSize > 0) {
 /**
- * NOTE(Xinghao): lossSum is computed using the weights from the 
previous iteration
+ * lossSum is computed using the weights from the previous iteration
  * and regVal is the regularization value computed in the previous 
iteration as well.
  */
 stochasticLossHistory.append(lossSum / miniBatchSize + regVal)
@@ -264,6 +264,9 @@ object GradientDescent extends Logging {
 
   }
 
+  /**
+   * Alias of [[runMiniBatchSGD]] with convergenceTol set to default value of 
0.001.
+   */
   def runMiniBatchSGD(
   data: RDD[(Double, Vector)],
   gradient: Gradient,





spark git commit: [SPARK-9800] Adds docs for GradientDescent$.runMiniBatchSGD alias

2015-08-25 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/branch-1.5 5a32ed75c -> 95e44b4df


[SPARK-9800] Adds docs for GradientDescent$.runMiniBatchSGD alias

* Adds doc for the alias of runMiniBatchSGD documenting the default value for
convergenceTol
* Cleans up a note in code

Author: Feynman Liang 

Closes #8425 from feynmanliang/SPARK-9800.

(cherry picked from commit c0e9ff1588b4d9313cc6ec6e00e5c7663eb67910)
Signed-off-by: Joseph K. Bradley 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/95e44b4d
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/95e44b4d
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/95e44b4d

Branch: refs/heads/branch-1.5
Commit: 95e44b4df81b09803be2fde8c4e2566be0c8fdbc
Parents: 5a32ed7
Author: Feynman Liang 
Authored: Tue Aug 25 13:21:05 2015 -0700
Committer: Joseph K. Bradley 
Committed: Tue Aug 25 13:21:16 2015 -0700

--
 .../org/apache/spark/mllib/optimization/GradientDescent.scala   | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/95e44b4d/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala
 
b/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala
index 8f0d1e4..3b663b5 100644
--- 
a/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala
@@ -235,7 +235,7 @@ object GradientDescent extends Logging {
 
   if (miniBatchSize > 0) {
 /**
- * NOTE(Xinghao): lossSum is computed using the weights from the 
previous iteration
+ * lossSum is computed using the weights from the previous iteration
  * and regVal is the regularization value computed in the previous 
iteration as well.
  */
 stochasticLossHistory.append(lossSum / miniBatchSize + regVal)
@@ -264,6 +264,9 @@ object GradientDescent extends Logging {
 
   }
 
+  /**
+   * Alias of [[runMiniBatchSGD]] with convergenceTol set to default value of 
0.001.
+   */
   def runMiniBatchSGD(
   data: RDD[(Double, Vector)],
   gradient: Gradient,





spark git commit: [SPARK-10048] [SPARKR] Support arbitrary nested Java array in serde.

2015-08-25 Thread shivaram
Repository: spark
Updated Branches:
  refs/heads/master 16a2be1a8 -> 71a138cd0


[SPARK-10048] [SPARKR] Support arbitrary nested Java array in serde.

This PR:
1. supports transferring arbitrary nested arrays from JVM to the R side in SerDe;
2. based on 1, the collect() implementation is improved. It can now collect data
   of complex types from a DataFrame.

Author: Sun Rui 

Closes #8276 from sun-rui/SPARK-10048.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/71a138cd
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/71a138cd
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/71a138cd

Branch: refs/heads/master
Commit: 71a138cd0e0a14e8426f97877e3b52a562bbd02c
Parents: 16a2be1
Author: Sun Rui 
Authored: Tue Aug 25 13:14:10 2015 -0700
Committer: Shivaram Venkataraman 
Committed: Tue Aug 25 13:14:10 2015 -0700

--
 R/pkg/R/DataFrame.R | 55 ++---
 R/pkg/R/deserialize.R   | 72 +++-
 R/pkg/R/serialize.R | 10 +--
 R/pkg/inst/tests/test_Serde.R   | 77 ++
 R/pkg/inst/worker/worker.R  |  4 +-
 .../apache/spark/api/r/RBackendHandler.scala|  7 ++
 .../scala/org/apache/spark/api/r/SerDe.scala| 86 
 .../org/apache/spark/sql/api/r/SQLUtils.scala   | 32 +---
 8 files changed, 216 insertions(+), 127 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/71a138cd/R/pkg/R/DataFrame.R
--
diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index 10f3c4e..ae1d912 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -652,18 +652,49 @@ setMethod("dim",
 setMethod("collect",
   signature(x = "DataFrame"),
   function(x, stringsAsFactors = FALSE) {
-# listCols is a list of raw vectors, one per column
-listCols <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", 
"dfToCols", x@sdf)
-cols <- lapply(listCols, function(col) {
-  objRaw <- rawConnection(col)
-  numRows <- readInt(objRaw)
-  col <- readCol(objRaw, numRows)
-  close(objRaw)
-  col
-})
-names(cols) <- columns(x)
-do.call(cbind.data.frame, list(cols, stringsAsFactors = 
stringsAsFactors))
-  })
+names <- columns(x)
+ncol <- length(names)
+if (ncol <= 0) {
+  # empty data.frame with 0 columns and 0 rows
+  data.frame()
+} else {
+  # listCols is a list of columns
+  listCols <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", 
"dfToCols", x@sdf)
+  stopifnot(length(listCols) == ncol)
+  
+  # An empty data.frame with 0 columns and number of rows as 
collected
+  nrow <- length(listCols[[1]])
+  if (nrow <= 0) {
+df <- data.frame()
+  } else {
+df <- data.frame(row.names = 1 : nrow)
+  }
+  
+  # Append columns one by one
+  for (colIndex in 1 : ncol) {
+# Note: appending a column of list type into a data.frame so 
that
+# data of complex type can be held. But getting a cell from a 
column
+# of list type returns a list instead of a vector. So for 
columns of
+# non-complex type, append them as vector.
+col <- listCols[[colIndex]]
+if (length(col) <= 0) {
+  df[[names[colIndex]]] <- col
+} else {
+  # TODO: more robust check on column of primitive types
+  vec <- do.call(c, col)
+  if (class(vec) != "list") {
+df[[names[colIndex]]] <- vec  
+  } else {
+# For columns of complex type, be careful to access them.
+# Get a column of complex type returns a list.
+# Get a cell from a column of complex type returns a list 
instead of a vector.
+df[[names[colIndex]]] <- col
+ }
+  }
+}
+df
+  }
+})
 
 #' Limit
 #'

http://git-wip-us.apache.org/repos/asf/spark/blob/71a138cd/R/pkg/R/deserialize.R
--
diff --git a/R/pkg/R/deserialize.R b/R/pkg/R/deserialize.R
index 33bf13e..6cf628e 100644
--- a/R/pkg/R/deserialize.R
+++ b/R/pkg/R/deserialize.R
@@ -48,6 +48,7 @@ readTypedObject <- function(con, type) {
 "r" = readRaw(con),
 "D" = readDat

spark git commit: [SPARK-10231] [MLLIB] update @Since annotation for mllib.classification

2015-08-25 Thread dbtsai
Repository: spark
Updated Branches:
  refs/heads/branch-1.5 c740f5dd2 -> 5a32ed75c


[SPARK-10231] [MLLIB] update @Since annotation for mllib.classification

Update `Since` annotation in `mllib.classification`:

1. add version to classes, objects, constructors, and public variables declared
in constructors (see the sketch after this list)
2. correct some versions
3. remove `Since` on `toString`
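
A minimal sketch of the resulting pattern on a hypothetical model class; the
version numbers are illustrative, and the file is assumed to sit inside Spark's
own source tree, as in the diffs below:

```
package org.apache.spark.mllib.example   // hypothetical package inside the Spark tree

import org.apache.spark.annotation.Since
import org.apache.spark.mllib.linalg.Vector

// The class, its primary constructor, and each public constructor val carry the
// release in which they first appeared; members added later get newer versions.
@Since("0.8.0")
class ExampleModel @Since("1.3.0") (
    @Since("1.0.0") val weights: Vector,
    @Since("1.3.0") val numClasses: Int) extends Serializable {

  @Since("1.0.0")
  def predict(testData: Vector): Double = if (numClasses == 2) 0.0 else 1.0
}
```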

MechCoder dbtsai

Author: Xiangrui Meng 

Closes #8421 from mengxr/SPARK-10231 and squashes the following commits:

b2dce80 [Xiangrui Meng] update @Since annotation for mllib.classification

(cherry picked from commit 16a2be1a84c0a274a60c0a584faaf58b55d4942b)
Signed-off-by: DB Tsai 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5a32ed75
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5a32ed75
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5a32ed75

Branch: refs/heads/branch-1.5
Commit: 5a32ed75c939dc42886ea940aba2b14b89e9f40e
Parents: c740f5d
Author: Xiangrui Meng 
Authored: Tue Aug 25 12:16:23 2015 -0700
Committer: DB Tsai 
Committed: Tue Aug 25 12:16:41 2015 -0700

--
 .../classification/ClassificationModel.scala|  7 ++---
 .../classification/LogisticRegression.scala | 20 +-
 .../spark/mllib/classification/NaiveBayes.scala | 28 +++-
 .../apache/spark/mllib/classification/SVM.scala | 15 +++
 .../StreamingLogisticRegressionWithSGD.scala|  9 ++-
 5 files changed, 58 insertions(+), 21 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/5a32ed75/mllib/src/main/scala/org/apache/spark/mllib/classification/ClassificationModel.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/classification/ClassificationModel.scala
 
b/mllib/src/main/scala/org/apache/spark/mllib/classification/ClassificationModel.scala
index a29b425..85a4132 100644
--- 
a/mllib/src/main/scala/org/apache/spark/mllib/classification/ClassificationModel.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/mllib/classification/ClassificationModel.scala
@@ -30,6 +30,7 @@ import org.apache.spark.rdd.RDD
  * belongs. The categories are represented by double values: 0.0, 1.0, 2.0, 
etc.
  */
 @Experimental
+@Since("0.8.0")
 trait ClassificationModel extends Serializable {
   /**
* Predict values for the given data set using the model trained.
@@ -37,7 +38,7 @@ trait ClassificationModel extends Serializable {
* @param testData RDD representing data points to be predicted
* @return an RDD[Double] where each entry contains the corresponding 
prediction
*/
-  @Since("0.8.0")
+  @Since("1.0.0")
   def predict(testData: RDD[Vector]): RDD[Double]
 
   /**
@@ -46,7 +47,7 @@ trait ClassificationModel extends Serializable {
* @param testData array representing a single data point
* @return predicted category from the trained model
*/
-  @Since("0.8.0")
+  @Since("1.0.0")
   def predict(testData: Vector): Double
 
   /**
@@ -54,7 +55,7 @@ trait ClassificationModel extends Serializable {
* @param testData JavaRDD representing data points to be predicted
* @return a JavaRDD[java.lang.Double] where each entry contains the 
corresponding prediction
*/
-  @Since("0.8.0")
+  @Since("1.0.0")
   def predict(testData: JavaRDD[Vector]): JavaRDD[java.lang.Double] =
 predict(testData.rdd).toJavaRDD().asInstanceOf[JavaRDD[java.lang.Double]]
 }

http://git-wip-us.apache.org/repos/asf/spark/blob/5a32ed75/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala
 
b/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala
index e03e662..5ceff5b 100644
--- 
a/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala
@@ -41,11 +41,12 @@ import org.apache.spark.rdd.RDD
  *   Multinomial Logistic Regression. By default, it is binary 
logistic regression
  *   so numClasses will be set to 2.
  */
-class LogisticRegressionModel (
-override val weights: Vector,
-override val intercept: Double,
-val numFeatures: Int,
-val numClasses: Int)
+@Since("0.8.0")
+class LogisticRegressionModel @Since("1.3.0") (
+@Since("1.0.0") override val weights: Vector,
+@Since("1.0.0") override val intercept: Double,
+@Since("1.3.0") val numFeatures: Int,
+@Since("1.3.0") val numClasses: Int)
   extends GeneralizedLinearModel(weights, intercept) with ClassificationModel 
with Serializable
   with Saveable with PMMLExportable {
 
@@ -75,6 

spark git commit: [SPARK-10231] [MLLIB] update @Since annotation for mllib.classification

2015-08-25 Thread dbtsai
Repository: spark
Updated Branches:
  refs/heads/master 881208a8e -> 16a2be1a8


[SPARK-10231] [MLLIB] update @Since annotation for mllib.classification

Update `Since` annotation in `mllib.classification`:

1. add version to classes, objects, constructors, and public variables declared 
in constructors
2. correct some versions
3. remove `Since` on `toString`

MechCoder dbtsai

Author: Xiangrui Meng 

Closes #8421 from mengxr/SPARK-10231 and squashes the following commits:

b2dce80 [Xiangrui Meng] update @Since annotation for mllib.classification


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/16a2be1a
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/16a2be1a
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/16a2be1a

Branch: refs/heads/master
Commit: 16a2be1a84c0a274a60c0a584faaf58b55d4942b
Parents: 881208a
Author: Xiangrui Meng 
Authored: Tue Aug 25 12:16:23 2015 -0700
Committer: DB Tsai 
Committed: Tue Aug 25 12:16:23 2015 -0700

--
 .../classification/ClassificationModel.scala|  7 ++---
 .../classification/LogisticRegression.scala | 20 +-
 .../spark/mllib/classification/NaiveBayes.scala | 28 +++-
 .../apache/spark/mllib/classification/SVM.scala | 15 +++
 .../StreamingLogisticRegressionWithSGD.scala|  9 ++-
 5 files changed, 58 insertions(+), 21 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/16a2be1a/mllib/src/main/scala/org/apache/spark/mllib/classification/ClassificationModel.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/classification/ClassificationModel.scala
 
b/mllib/src/main/scala/org/apache/spark/mllib/classification/ClassificationModel.scala
index a29b425..85a4132 100644
--- 
a/mllib/src/main/scala/org/apache/spark/mllib/classification/ClassificationModel.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/mllib/classification/ClassificationModel.scala
@@ -30,6 +30,7 @@ import org.apache.spark.rdd.RDD
  * belongs. The categories are represented by double values: 0.0, 1.0, 2.0, 
etc.
  */
 @Experimental
+@Since("0.8.0")
 trait ClassificationModel extends Serializable {
   /**
* Predict values for the given data set using the model trained.
@@ -37,7 +38,7 @@ trait ClassificationModel extends Serializable {
* @param testData RDD representing data points to be predicted
* @return an RDD[Double] where each entry contains the corresponding 
prediction
*/
-  @Since("0.8.0")
+  @Since("1.0.0")
   def predict(testData: RDD[Vector]): RDD[Double]
 
   /**
@@ -46,7 +47,7 @@ trait ClassificationModel extends Serializable {
* @param testData array representing a single data point
* @return predicted category from the trained model
*/
-  @Since("0.8.0")
+  @Since("1.0.0")
   def predict(testData: Vector): Double
 
   /**
@@ -54,7 +55,7 @@ trait ClassificationModel extends Serializable {
* @param testData JavaRDD representing data points to be predicted
* @return a JavaRDD[java.lang.Double] where each entry contains the 
corresponding prediction
*/
-  @Since("0.8.0")
+  @Since("1.0.0")
   def predict(testData: JavaRDD[Vector]): JavaRDD[java.lang.Double] =
 predict(testData.rdd).toJavaRDD().asInstanceOf[JavaRDD[java.lang.Double]]
 }

http://git-wip-us.apache.org/repos/asf/spark/blob/16a2be1a/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala
 
b/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala
index e03e662..5ceff5b 100644
--- 
a/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala
@@ -41,11 +41,12 @@ import org.apache.spark.rdd.RDD
  *   Multinomial Logistic Regression. By default, it is binary 
logistic regression
  *   so numClasses will be set to 2.
  */
-class LogisticRegressionModel (
-override val weights: Vector,
-override val intercept: Double,
-val numFeatures: Int,
-val numClasses: Int)
+@Since("0.8.0")
+class LogisticRegressionModel @Since("1.3.0") (
+@Since("1.0.0") override val weights: Vector,
+@Since("1.0.0") override val intercept: Double,
+@Since("1.3.0") val numFeatures: Int,
+@Since("1.3.0") val numClasses: Int)
   extends GeneralizedLinearModel(weights, intercept) with ClassificationModel 
with Serializable
   with Saveable with PMMLExportable {
 
@@ -75,6 +76,7 @@ class LogisticRegressionModel (
   /**
* Constructs a [[LogisticRegressionModel]] with we

spark git commit: [SPARK-10230] [MLLIB] Rename optimizeAlpha to optimizeDocConcentration

2015-08-25 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/branch-1.5 742c82ed9 -> c740f5dd2


[SPARK-10230] [MLLIB] Rename optimizeAlpha to optimizeDocConcentration

See [discussion](https://github.com/apache/spark/pull/8254#discussion_r37837770)
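
A short migration sketch; the optimizer settings and `corpus` are illustrative
assumptions:

```
import org.apache.spark.mllib.clustering.{LDA, OnlineLDAOptimizer}

// The setter formerly spelled setOptimzeAlpha is now setOptimizeDocConcentration.
val optimizer = new OnlineLDAOptimizer()
  .setMiniBatchFraction(0.05)
  .setOptimizeDocConcentration(true)   // was: setOptimzeAlpha(true)

val lda = new LDA().setK(10).setOptimizer(optimizer)
// val model = lda.run(corpus)         // `corpus` is an assumed RDD[(Long, Vector)]
```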

CC jkbradley

Author: Feynman Liang 

Closes #8422 from feynmanliang/SPARK-10230.

(cherry picked from commit 881208a8e849facf54166bdd69d3634407f952e7)
Signed-off-by: Joseph K. Bradley 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c740f5dd
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c740f5dd
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c740f5dd

Branch: refs/heads/branch-1.5
Commit: c740f5dd20459b491a8c088383c19c11a76c225d
Parents: 742c82e
Author: Feynman Liang 
Authored: Tue Aug 25 11:58:47 2015 -0700
Committer: Joseph K. Bradley 
Committed: Tue Aug 25 11:58:55 2015 -0700

--
 .../spark/mllib/clustering/LDAOptimizer.scala   | 16 
 .../apache/spark/mllib/clustering/LDASuite.scala|  2 +-
 2 files changed, 9 insertions(+), 9 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/c740f5dd/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala
index 5c2aae6..38486e9 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala
@@ -258,7 +258,7 @@ final class OnlineLDAOptimizer extends LDAOptimizer {
   private var tau0: Double = 1024
   private var kappa: Double = 0.51
   private var miniBatchFraction: Double = 0.05
-  private var optimizeAlpha: Boolean = false
+  private var optimizeDocConcentration: Boolean = false
 
   // internal data structure
   private var docs: RDD[(Long, Vector)] = null
@@ -335,20 +335,20 @@ final class OnlineLDAOptimizer extends LDAOptimizer {
   }
 
   /**
-   * Optimize alpha, indicates whether alpha (Dirichlet parameter for 
document-topic distribution)
-   * will be optimized during training.
+   * Optimize docConcentration, indicates whether docConcentration (Dirichlet 
parameter for
+   * document-topic distribution) will be optimized during training.
*/
   @Since("1.5.0")
-  def getOptimzeAlpha: Boolean = this.optimizeAlpha
+  def getOptimizeDocConcentration: Boolean = this.optimizeDocConcentration
 
   /**
-   * Sets whether to optimize alpha parameter during training.
+   * Sets whether to optimize docConcentration parameter during training.
*
* Default: false
*/
   @Since("1.5.0")
-  def setOptimzeAlpha(optimizeAlpha: Boolean): this.type = {
-this.optimizeAlpha = optimizeAlpha
+  def setOptimizeDocConcentration(optimizeDocConcentration: Boolean): 
this.type = {
+this.optimizeDocConcentration = optimizeDocConcentration
 this
   }
 
@@ -458,7 +458,7 @@ final class OnlineLDAOptimizer extends LDAOptimizer {
 
 // Note that this is an optimization to avoid batch.count
 updateLambda(batchResult, (miniBatchFraction * corpusSize).ceil.toInt)
-if (optimizeAlpha) updateAlpha(gammat)
+if (optimizeDocConcentration) updateAlpha(gammat)
 this
   }
 

http://git-wip-us.apache.org/repos/asf/spark/blob/c740f5dd/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
--
diff --git 
a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala 
b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
index 8a714f9..746a76a 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
@@ -423,7 +423,7 @@ class LDASuite extends SparkFunSuite with 
MLlibTestSparkContext {
 val k = 2
 val docs = sc.parallelize(toyData)
 val op = new 
OnlineLDAOptimizer().setMiniBatchFraction(1).setTau0(1024).setKappa(0.51)
-  .setGammaShape(100).setOptimzeAlpha(true).setSampleWithReplacement(false)
+  
.setGammaShape(100).setOptimizeDocConcentration(true).setSampleWithReplacement(false)
 val lda = new LDA().setK(k)
   .setDocConcentration(1D / k)
   .setTopicConcentration(0.01)





spark git commit: [SPARK-10230] [MLLIB] Rename optimizeAlpha to optimizeDocConcentration

2015-08-25 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/master b37f0cc1b -> 881208a8e


[SPARK-10230] [MLLIB] Rename optimizeAlpha to optimizeDocConcentration

See [discussion](https://github.com/apache/spark/pull/8254#discussion_r37837770)

CC jkbradley

Author: Feynman Liang 

Closes #8422 from feynmanliang/SPARK-10230.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/881208a8
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/881208a8
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/881208a8

Branch: refs/heads/master
Commit: 881208a8e849facf54166bdd69d3634407f952e7
Parents: b37f0cc
Author: Feynman Liang 
Authored: Tue Aug 25 11:58:47 2015 -0700
Committer: Joseph K. Bradley 
Committed: Tue Aug 25 11:58:47 2015 -0700

--
 .../spark/mllib/clustering/LDAOptimizer.scala   | 16 
 .../apache/spark/mllib/clustering/LDASuite.scala|  2 +-
 2 files changed, 9 insertions(+), 9 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/881208a8/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala
index 5c2aae6..38486e9 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala
@@ -258,7 +258,7 @@ final class OnlineLDAOptimizer extends LDAOptimizer {
   private var tau0: Double = 1024
   private var kappa: Double = 0.51
   private var miniBatchFraction: Double = 0.05
-  private var optimizeAlpha: Boolean = false
+  private var optimizeDocConcentration: Boolean = false
 
   // internal data structure
   private var docs: RDD[(Long, Vector)] = null
@@ -335,20 +335,20 @@ final class OnlineLDAOptimizer extends LDAOptimizer {
   }
 
   /**
-   * Optimize alpha, indicates whether alpha (Dirichlet parameter for 
document-topic distribution)
-   * will be optimized during training.
+   * Optimize docConcentration, indicates whether docConcentration (Dirichlet 
parameter for
+   * document-topic distribution) will be optimized during training.
*/
   @Since("1.5.0")
-  def getOptimzeAlpha: Boolean = this.optimizeAlpha
+  def getOptimizeDocConcentration: Boolean = this.optimizeDocConcentration
 
   /**
-   * Sets whether to optimize alpha parameter during training.
+   * Sets whether to optimize docConcentration parameter during training.
*
* Default: false
*/
   @Since("1.5.0")
-  def setOptimzeAlpha(optimizeAlpha: Boolean): this.type = {
-this.optimizeAlpha = optimizeAlpha
+  def setOptimizeDocConcentration(optimizeDocConcentration: Boolean): 
this.type = {
+this.optimizeDocConcentration = optimizeDocConcentration
 this
   }
 
@@ -458,7 +458,7 @@ final class OnlineLDAOptimizer extends LDAOptimizer {
 
 // Note that this is an optimization to avoid batch.count
 updateLambda(batchResult, (miniBatchFraction * corpusSize).ceil.toInt)
-if (optimizeAlpha) updateAlpha(gammat)
+if (optimizeDocConcentration) updateAlpha(gammat)
 this
   }
 

http://git-wip-us.apache.org/repos/asf/spark/blob/881208a8/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
--
diff --git 
a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala 
b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
index 8a714f9..746a76a 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
@@ -423,7 +423,7 @@ class LDASuite extends SparkFunSuite with 
MLlibTestSparkContext {
 val k = 2
 val docs = sc.parallelize(toyData)
 val op = new 
OnlineLDAOptimizer().setMiniBatchFraction(1).setTau0(1024).setKappa(0.51)
-  .setGammaShape(100).setOptimzeAlpha(true).setSampleWithReplacement(false)
+  
.setGammaShape(100).setOptimizeDocConcentration(true).setSampleWithReplacement(false)
 val lda = new LDA().setK(k)
   .setDocConcentration(1D / k)
   .setTopicConcentration(0.01)





spark git commit: [SPARK-8531] [ML] Update ML user guide for MinMaxScaler

2015-08-25 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/master 5c08c86bf -> b37f0cc1b


[SPARK-8531] [ML] Update ML user guide for MinMaxScaler

jira: https://issues.apache.org/jira/browse/SPARK-8531

Update ML user guide for MinMaxScaler

Author: Yuhao Yang 
Author: unknown 

Closes #7211 from hhbyyh/minmaxdoc.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b37f0cc1
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b37f0cc1
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b37f0cc1

Branch: refs/heads/master
Commit: b37f0cc1b4c064d6f09edb161250fa8b783de52a
Parents: 5c08c86
Author: Yuhao Yang 
Authored: Tue Aug 25 10:54:03 2015 -0700
Committer: Joseph K. Bradley 
Committed: Tue Aug 25 10:54:03 2015 -0700

--
 docs/ml-features.md | 71 
 1 file changed, 71 insertions(+)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/b37f0cc1/docs/ml-features.md
--
diff --git a/docs/ml-features.md b/docs/ml-features.md
index 642a4b4..62de483 100644
--- a/docs/ml-features.md
+++ b/docs/ml-features.md
@@ -1133,6 +1133,7 @@ val scaledData = scalerModel.transform(dataFrame)
 {% highlight java %}
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.ml.feature.StandardScaler;
+import org.apache.spark.ml.feature.StandardScalerModel;
 import org.apache.spark.mllib.regression.LabeledPoint;
 import org.apache.spark.mllib.util.MLUtils;
 import org.apache.spark.sql.DataFrame;
@@ -1173,6 +1174,76 @@ scaledData = scalerModel.transform(dataFrame)
 
 
 
+## MinMaxScaler
+
+`MinMaxScaler` transforms a dataset of `Vector` rows, rescaling each feature 
to a specific range (often [0, 1]).  It takes parameters:
+
+* `min`: 0.0 by default. Lower bound after transformation, shared by all 
features.
+* `max`: 1.0 by default. Upper bound after transformation, shared by all 
features.
+
+`MinMaxScaler` computes summary statistics on a data set and produces a 
`MinMaxScalerModel`. The model can then transform each feature individually 
such that it is in the given range.
+
+The rescaled value for a feature E is calculated as,
+`\begin{equation}
+  Rescaled(e_i) = \frac{e_i - E_{min}}{E_{max} - E_{min}} * (max - min) + min
+\end{equation}`
+For the case `E_{max} == E_{min}`, `Rescaled(e_i) = 0.5 * (max + min)`
+
+Note that since zero values will probably be transformed to non-zero values, 
output of the transformer will be DenseVector even for sparse input.
+
+The following example demonstrates how to load a dataset in libsvm format and 
then rescale each feature to [0, 1].
+
+
+
+More details can be found in the API docs for
+[MinMaxScaler](api/scala/index.html#org.apache.spark.ml.feature.MinMaxScaler) 
and
+[MinMaxScalerModel](api/scala/index.html#org.apache.spark.ml.feature.MinMaxScalerModel).
+{% highlight scala %}
+import org.apache.spark.ml.feature.MinMaxScaler
+import org.apache.spark.mllib.util.MLUtils
+
+val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
+val dataFrame = sqlContext.createDataFrame(data)
+val scaler = new MinMaxScaler()
+  .setInputCol("features")
+  .setOutputCol("scaledFeatures")
+
+// Compute summary statistics and generate MinMaxScalerModel
+val scalerModel = scaler.fit(dataFrame)
+
+// rescale each feature to range [min, max].
+val scaledData = scalerModel.transform(dataFrame)
+{% endhighlight %}
+
+
+
+More details can be found in the API docs for
+[MinMaxScaler](api/java/org/apache/spark/ml/feature/MinMaxScaler.html) and
+[MinMaxScalerModel](api/java/org/apache/spark/ml/feature/MinMaxScalerModel.html).
+{% highlight java %}
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.ml.feature.MinMaxScaler;
+import org.apache.spark.ml.feature.MinMaxScalerModel;
+import org.apache.spark.mllib.regression.LabeledPoint;
+import org.apache.spark.mllib.util.MLUtils;
+import org.apache.spark.sql.DataFrame;
+
+JavaRDD<LabeledPoint> data =
+  MLUtils.loadLibSVMFile(jsc.sc(), 
"data/mllib/sample_libsvm_data.txt").toJavaRDD();
+DataFrame dataFrame = jsql.createDataFrame(data, LabeledPoint.class);
+MinMaxScaler scaler = new MinMaxScaler()
+  .setInputCol("features")
+  .setOutputCol("scaledFeatures");
+
+// Compute summary statistics and generate MinMaxScalerModel
+MinMaxScalerModel scalerModel = scaler.fit(dataFrame);
+
+// rescale each feature to range [min, max].
+DataFrame scaledData = scalerModel.transform(dataFrame);
+{% endhighlight %}
+
+
+
 ## Bucketizer
 
 `Bucketizer` transforms a column of continuous features to a column of feature 
buckets, where the buckets are specified by users. It takes a parameter:



spark git commit: [SPARK-8531] [ML] Update ML user guide for MinMaxScaler

2015-08-25 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/branch-1.5 0402f1297 -> 742c82ed9


[SPARK-8531] [ML] Update ML user guide for MinMaxScaler

jira: https://issues.apache.org/jira/browse/SPARK-8531

Update ML user guide for MinMaxScaler

Author: Yuhao Yang 
Author: unknown 

Closes #7211 from hhbyyh/minmaxdoc.

(cherry picked from commit b37f0cc1b4c064d6f09edb161250fa8b783de52a)
Signed-off-by: Joseph K. Bradley 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/742c82ed
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/742c82ed
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/742c82ed

Branch: refs/heads/branch-1.5
Commit: 742c82ed97ed3fc60d4f17c4363c52062829ea49
Parents: 0402f12
Author: Yuhao Yang 
Authored: Tue Aug 25 10:54:03 2015 -0700
Committer: Joseph K. Bradley 
Committed: Tue Aug 25 10:54:12 2015 -0700

--
 docs/ml-features.md | 71 
 1 file changed, 71 insertions(+)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/742c82ed/docs/ml-features.md
--
diff --git a/docs/ml-features.md b/docs/ml-features.md
index 642a4b4..62de483 100644
--- a/docs/ml-features.md
+++ b/docs/ml-features.md
@@ -1133,6 +1133,7 @@ val scaledData = scalerModel.transform(dataFrame)
 {% highlight java %}
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.ml.feature.StandardScaler;
+import org.apache.spark.ml.feature.StandardScalerModel;
 import org.apache.spark.mllib.regression.LabeledPoint;
 import org.apache.spark.mllib.util.MLUtils;
 import org.apache.spark.sql.DataFrame;
@@ -1173,6 +1174,76 @@ scaledData = scalerModel.transform(dataFrame)
 
 
 
+## MinMaxScaler
+
+`MinMaxScaler` transforms a dataset of `Vector` rows, rescaling each feature 
to a specific range (often [0, 1]).  It takes parameters:
+
+* `min`: 0.0 by default. Lower bound after transformation, shared by all 
features.
+* `max`: 1.0 by default. Upper bound after transformation, shared by all 
features.
+
+`MinMaxScaler` computes summary statistics on a data set and produces a 
`MinMaxScalerModel`. The model can then transform each feature individually 
such that it is in the given range.
+
+The rescaled value for a feature E is calculated as,
+`\begin{equation}
+  Rescaled(e_i) = \frac{e_i - E_{min}}{E_{max} - E_{min}} * (max - min) + min
+\end{equation}`
+For the case `E_{max} == E_{min}`, `Rescaled(e_i) = 0.5 * (max + min)`
+
+Note that since zero values will probably be transformed to non-zero values, 
output of the transformer will be DenseVector even for sparse input.
+
+The following example demonstrates how to load a dataset in libsvm format and 
then rescale each feature to [0, 1].
+
+
+
+More details can be found in the API docs for
+[MinMaxScaler](api/scala/index.html#org.apache.spark.ml.feature.MinMaxScaler) 
and
+[MinMaxScalerModel](api/scala/index.html#org.apache.spark.ml.feature.MinMaxScalerModel).
+{% highlight scala %}
+import org.apache.spark.ml.feature.MinMaxScaler
+import org.apache.spark.mllib.util.MLUtils
+
+val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
+val dataFrame = sqlContext.createDataFrame(data)
+val scaler = new MinMaxScaler()
+  .setInputCol("features")
+  .setOutputCol("scaledFeatures")
+
+// Compute summary statistics and generate MinMaxScalerModel
+val scalerModel = scaler.fit(dataFrame)
+
+// rescale each feature to range [min, max].
+val scaledData = scalerModel.transform(dataFrame)
+{% endhighlight %}
+
+
+
+More details can be found in the API docs for
+[MinMaxScaler](api/java/org/apache/spark/ml/feature/MinMaxScaler.html) and
+[MinMaxScalerModel](api/java/org/apache/spark/ml/feature/MinMaxScalerModel.html).
+{% highlight java %}
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.ml.feature.MinMaxScaler;
+import org.apache.spark.ml.feature.MinMaxScalerModel;
+import org.apache.spark.mllib.regression.LabeledPoint;
+import org.apache.spark.mllib.util.MLUtils;
+import org.apache.spark.sql.DataFrame;
+
+JavaRDD<LabeledPoint> data =
+  MLUtils.loadLibSVMFile(jsc.sc(), 
"data/mllib/sample_libsvm_data.txt").toJavaRDD();
+DataFrame dataFrame = jsql.createDataFrame(data, LabeledPoint.class);
+MinMaxScaler scaler = new MinMaxScaler()
+  .setInputCol("features")
+  .setOutputCol("scaledFeatures");
+
+// Compute summary statistics and generate MinMaxScalerModel
+MinMaxScalerModel scalerModel = scaler.fit(dataFrame);
+
+// rescale each feature to range [min, max].
+DataFrame scaledData = scalerModel.transform(dataFrame);
+{% endhighlight %}
+
+
+
 ## Bucketizer
 
 `Bucketizer` transforms a column of continuous features to a column of feature 
buckets, where the buckets are specified by users. It takes a parameter:


-

spark git commit: [SPARK-10198] [SQL] Turn off partition verification by default

2015-08-25 Thread marmbrus
Repository: spark
Updated Branches:
  refs/heads/branch-1.5 bdcc8e608 -> 0402f1297


[SPARK-10198] [SQL] Turn off partition verification by default
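
This flips the default of `spark.sql.hive.verifyPartitionPath` from `true` to `false` (see the `SQLConf` diff below). A minimal sketch of restoring the old behaviour, assuming an existing `HiveContext`/`SQLContext` named `sqlContext`:

```
// Re-enable partition path verification for this session (sketch only).
sqlContext.setConf("spark.sql.hive.verifyPartitionPath", "true")

// Or set it at submit time:
// spark-submit --conf spark.sql.hive.verifyPartitionPath=true ...
```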

Author: Michael Armbrust 

Closes #8404 from marmbrus/turnOffPartitionVerification.

(cherry picked from commit 5c08c86bfa43462fb2ca5f7c5980ddfb44dd57f8)
Signed-off-by: Michael Armbrust 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0402f129
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0402f129
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0402f129

Branch: refs/heads/branch-1.5
Commit: 0402f1297c697bfbe8b5c7bfc170fcdc6b2c9de5
Parents: bdcc8e6
Author: Michael Armbrust 
Authored: Tue Aug 25 10:22:54 2015 -0700
Committer: Michael Armbrust 
Committed: Tue Aug 25 10:23:08 2015 -0700

--
 .../scala/org/apache/spark/sql/SQLConf.scala|  2 +-
 .../spark/sql/hive/QueryPartitionSuite.scala| 64 +++-
 2 files changed, 35 insertions(+), 31 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/0402f129/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala
--
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala
index e9de14f..2974055 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala
@@ -312,7 +312,7 @@ private[spark] object SQLConf {
 doc = "When true, enable filter pushdown for ORC files.")
 
   val HIVE_VERIFY_PARTITION_PATH = 
booleanConf("spark.sql.hive.verifyPartitionPath",
-defaultValue = Some(true),
+defaultValue = Some(false),
 doc = "")
 
   val HIVE_METASTORE_PARTITION_PRUNING = 
booleanConf("spark.sql.hive.metastorePartitionPruning",

http://git-wip-us.apache.org/repos/asf/spark/blob/0402f129/sql/hive/src/test/scala/org/apache/spark/sql/hive/QueryPartitionSuite.scala
--
diff --git 
a/sql/hive/src/test/scala/org/apache/spark/sql/hive/QueryPartitionSuite.scala 
b/sql/hive/src/test/scala/org/apache/spark/sql/hive/QueryPartitionSuite.scala
index 017bc2a..1cc8a93 100644
--- 
a/sql/hive/src/test/scala/org/apache/spark/sql/hive/QueryPartitionSuite.scala
+++ 
b/sql/hive/src/test/scala/org/apache/spark/sql/hive/QueryPartitionSuite.scala
@@ -18,50 +18,54 @@
 package org.apache.spark.sql.hive
 
 import com.google.common.io.Files
+import org.apache.spark.sql.test.SQLTestUtils
 
 import org.apache.spark.sql.{QueryTest, _}
 import org.apache.spark.util.Utils
 
 
-class QueryPartitionSuite extends QueryTest {
+class QueryPartitionSuite extends QueryTest with SQLTestUtils {
 
   private lazy val ctx = org.apache.spark.sql.hive.test.TestHive
   import ctx.implicits._
-  import ctx.sql
+
+  protected def _sqlContext = ctx
 
   test("SPARK-5068: query data when path doesn't exist"){
-val testData = ctx.sparkContext.parallelize(
-  (1 to 10).map(i => TestData(i, i.toString))).toDF()
-testData.registerTempTable("testData")
+withSQLConf((SQLConf.HIVE_VERIFY_PARTITION_PATH.key, "true")) {
+  val testData = ctx.sparkContext.parallelize(
+(1 to 10).map(i => TestData(i, i.toString))).toDF()
+  testData.registerTempTable("testData")
 
-val tmpDir = Files.createTempDir()
-// create the table for test
-sql(s"CREATE TABLE table_with_partition(key int,value string) " +
-  s"PARTITIONED by (ds string) location '${tmpDir.toURI.toString}' ")
-sql("INSERT OVERWRITE TABLE table_with_partition  partition (ds='1') " +
-  "SELECT key,value FROM testData")
-sql("INSERT OVERWRITE TABLE table_with_partition  partition (ds='2') " +
-  "SELECT key,value FROM testData")
-sql("INSERT OVERWRITE TABLE table_with_partition  partition (ds='3') " +
-  "SELECT key,value FROM testData")
-sql("INSERT OVERWRITE TABLE table_with_partition  partition (ds='4') " +
-  "SELECT key,value FROM testData")
+  val tmpDir = Files.createTempDir()
+  // create the table for test
+  sql(s"CREATE TABLE table_with_partition(key int,value string) " +
+s"PARTITIONED by (ds string) location '${tmpDir.toURI.toString}' ")
+  sql("INSERT OVERWRITE TABLE table_with_partition  partition (ds='1') " +
+"SELECT key,value FROM testData")
+  sql("INSERT OVERWRITE TABLE table_with_partition  partition (ds='2') " +
+"SELECT key,value FROM testData")
+  sql("INSERT OVERWRITE TABLE table_with_partition  partition (ds='3') " +
+"SELECT key,value FROM testData")
+  sql("INSERT OVERWRITE TABLE table_with_partition  partition (ds='4') " +
+"SELECT key,value FROM testData")
 
-// test for the exist path
-checkAnswer(sq

spark git commit: [SPARK-10198] [SQL] Turn off partition verification by default

2015-08-25 Thread marmbrus
Repository: spark
Updated Branches:
  refs/heads/master 69c9c1771 -> 5c08c86bf


[SPARK-10198] [SQL] Turn off partition verification by default

Author: Michael Armbrust 

Closes #8404 from marmbrus/turnOffPartitionVerification.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5c08c86b
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5c08c86b
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5c08c86b

Branch: refs/heads/master
Commit: 5c08c86bfa43462fb2ca5f7c5980ddfb44dd57f8
Parents: 69c9c17
Author: Michael Armbrust 
Authored: Tue Aug 25 10:22:54 2015 -0700
Committer: Michael Armbrust 
Committed: Tue Aug 25 10:22:54 2015 -0700

--
 .../scala/org/apache/spark/sql/SQLConf.scala|  2 +-
 .../spark/sql/hive/QueryPartitionSuite.scala| 64 +++-
 2 files changed, 35 insertions(+), 31 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/5c08c86b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala
--
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala
index e6f7619..9de75f4 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala
@@ -312,7 +312,7 @@ private[spark] object SQLConf {
 doc = "When true, enable filter pushdown for ORC files.")
 
   val HIVE_VERIFY_PARTITION_PATH = 
booleanConf("spark.sql.hive.verifyPartitionPath",
-defaultValue = Some(true),
+defaultValue = Some(false),
 doc = "")
 
   val HIVE_METASTORE_PARTITION_PRUNING = 
booleanConf("spark.sql.hive.metastorePartitionPruning",

http://git-wip-us.apache.org/repos/asf/spark/blob/5c08c86b/sql/hive/src/test/scala/org/apache/spark/sql/hive/QueryPartitionSuite.scala
--
diff --git 
a/sql/hive/src/test/scala/org/apache/spark/sql/hive/QueryPartitionSuite.scala 
b/sql/hive/src/test/scala/org/apache/spark/sql/hive/QueryPartitionSuite.scala
index 017bc2a..1cc8a93 100644
--- 
a/sql/hive/src/test/scala/org/apache/spark/sql/hive/QueryPartitionSuite.scala
+++ 
b/sql/hive/src/test/scala/org/apache/spark/sql/hive/QueryPartitionSuite.scala
@@ -18,50 +18,54 @@
 package org.apache.spark.sql.hive
 
 import com.google.common.io.Files
+import org.apache.spark.sql.test.SQLTestUtils
 
 import org.apache.spark.sql.{QueryTest, _}
 import org.apache.spark.util.Utils
 
 
-class QueryPartitionSuite extends QueryTest {
+class QueryPartitionSuite extends QueryTest with SQLTestUtils {
 
   private lazy val ctx = org.apache.spark.sql.hive.test.TestHive
   import ctx.implicits._
-  import ctx.sql
+
+  protected def _sqlContext = ctx
 
   test("SPARK-5068: query data when path doesn't exist"){
-val testData = ctx.sparkContext.parallelize(
-  (1 to 10).map(i => TestData(i, i.toString))).toDF()
-testData.registerTempTable("testData")
+withSQLConf((SQLConf.HIVE_VERIFY_PARTITION_PATH.key, "true")) {
+  val testData = ctx.sparkContext.parallelize(
+(1 to 10).map(i => TestData(i, i.toString))).toDF()
+  testData.registerTempTable("testData")
 
-val tmpDir = Files.createTempDir()
-// create the table for test
-sql(s"CREATE TABLE table_with_partition(key int,value string) " +
-  s"PARTITIONED by (ds string) location '${tmpDir.toURI.toString}' ")
-sql("INSERT OVERWRITE TABLE table_with_partition  partition (ds='1') " +
-  "SELECT key,value FROM testData")
-sql("INSERT OVERWRITE TABLE table_with_partition  partition (ds='2') " +
-  "SELECT key,value FROM testData")
-sql("INSERT OVERWRITE TABLE table_with_partition  partition (ds='3') " +
-  "SELECT key,value FROM testData")
-sql("INSERT OVERWRITE TABLE table_with_partition  partition (ds='4') " +
-  "SELECT key,value FROM testData")
+  val tmpDir = Files.createTempDir()
+  // create the table for test
+  sql(s"CREATE TABLE table_with_partition(key int,value string) " +
+s"PARTITIONED by (ds string) location '${tmpDir.toURI.toString}' ")
+  sql("INSERT OVERWRITE TABLE table_with_partition  partition (ds='1') " +
+"SELECT key,value FROM testData")
+  sql("INSERT OVERWRITE TABLE table_with_partition  partition (ds='2') " +
+"SELECT key,value FROM testData")
+  sql("INSERT OVERWRITE TABLE table_with_partition  partition (ds='3') " +
+"SELECT key,value FROM testData")
+  sql("INSERT OVERWRITE TABLE table_with_partition  partition (ds='4') " +
+"SELECT key,value FROM testData")
 
-// test for the exist path
-checkAnswer(sql("select key,value from table_with_partition"),
-  testData.toDF.collect ++ testData.toDF.collect
-   

spark git commit: [SPARK-8400] [ML] Added check in ml.ALS for positive block size parameter setting

2015-08-25 Thread srowen
Repository: spark
Updated Branches:
  refs/heads/branch-1.3 3d2eaf0a7 -> e8b0564e7


[SPARK-8400] [ML] Added check in ml.ALS for positive block size parameter 
setting

Added a check for a positive block size, with a note that auto-configuring with -1 
is not supported.
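
A short sketch of the resulting behaviour (illustrative only, not taken from the patch):

```
import org.apache.spark.ml.recommendation.ALS

val als = new ALS()
  .setNumUserBlocks(10)   // OK: value must be positive
  .setNumItemBlocks(10)

// als.setNumUserBlocks(-1) now fails fast with:
// java.lang.IllegalArgumentException: requirement failed:
//   Number of blocks must be > 0, auto-configuring with -1 is not supported.
```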

Author: Bryan Cutler 

Closes #8363 from BryanCutler/ml.ALS-neg-blocksize-8400-1.3.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e8b0564e
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e8b0564e
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e8b0564e

Branch: refs/heads/branch-1.3
Commit: e8b0564e770b896a6156c7e1eed5bc5200e223f4
Parents: 3d2eaf0
Author: Bryan Cutler 
Authored: Tue Aug 25 12:36:49 2015 +0100
Committer: Sean Owen 
Committed: Tue Aug 25 12:36:49 2015 +0100

--
 .../scala/org/apache/spark/ml/recommendation/ALS.scala| 10 --
 1 file changed, 8 insertions(+), 2 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/e8b0564e/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala 
b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala
index fc0f529..6ce5285 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala
@@ -234,10 +234,16 @@ class ALS extends Estimator[ALSModel] with ALSParams {
   def setRank(value: Int): this.type = set(rank, value)
 
   /** @group setParam */
-  def setNumUserBlocks(value: Int): this.type = set(numUserBlocks, value)
+  def setNumUserBlocks(value: Int): this.type = {
+require(value > 0, "Number of blocks must be > 0, auto-configuring with -1 
is not supported.")
+set(numUserBlocks, value)
+  }
 
   /** @group setParam */
-  def setNumItemBlocks(value: Int): this.type = set(numItemBlocks, value)
+  def setNumItemBlocks(value: Int): this.type = {
+require(value > 0, "Number of blocks must be > 0, auto-configuring with -1 
is not supported.")
+set(numItemBlocks, value)
+  }
 
   /** @group setParam */
   def setImplicitPrefs(value: Boolean): this.type = set(implicitPrefs, value)


-



[4/5] spark git commit: [SPARK-9613] [CORE] Ban use of JavaConversions and migrate all existing uses to JavaConverters

2015-08-25 Thread srowen
http://git-wip-us.apache.org/repos/asf/spark/blob/69c9c177/core/src/main/scala/org/apache/spark/network/netty/NettyBlockRpcServer.scala
--
diff --git 
a/core/src/main/scala/org/apache/spark/network/netty/NettyBlockRpcServer.scala 
b/core/src/main/scala/org/apache/spark/network/netty/NettyBlockRpcServer.scala
index b089da8..7c170a7 100644
--- 
a/core/src/main/scala/org/apache/spark/network/netty/NettyBlockRpcServer.scala
+++ 
b/core/src/main/scala/org/apache/spark/network/netty/NettyBlockRpcServer.scala
@@ -19,7 +19,7 @@ package org.apache.spark.network.netty
 
 import java.nio.ByteBuffer
 
-import scala.collection.JavaConversions._
+import scala.collection.JavaConverters._
 
 import org.apache.spark.Logging
 import org.apache.spark.network.BlockDataManager
@@ -55,7 +55,7 @@ class NettyBlockRpcServer(
   case openBlocks: OpenBlocks =>
 val blocks: Seq[ManagedBuffer] =
   openBlocks.blockIds.map(BlockId.apply).map(blockManager.getBlockData)
-val streamId = streamManager.registerStream(blocks.iterator)
+val streamId = streamManager.registerStream(blocks.iterator.asJava)
 logTrace(s"Registered streamId $streamId with ${blocks.size} buffers")
 responseContext.onSuccess(new StreamHandle(streamId, 
blocks.size).toByteArray)
 

http://git-wip-us.apache.org/repos/asf/spark/blob/69c9c177/core/src/main/scala/org/apache/spark/network/netty/NettyBlockTransferService.scala
--
diff --git 
a/core/src/main/scala/org/apache/spark/network/netty/NettyBlockTransferService.scala
 
b/core/src/main/scala/org/apache/spark/network/netty/NettyBlockTransferService.scala
index d650d5f..ff8aae9 100644
--- 
a/core/src/main/scala/org/apache/spark/network/netty/NettyBlockTransferService.scala
+++ 
b/core/src/main/scala/org/apache/spark/network/netty/NettyBlockTransferService.scala
@@ -17,7 +17,7 @@
 
 package org.apache.spark.network.netty
 
-import scala.collection.JavaConversions._
+import scala.collection.JavaConverters._
 import scala.concurrent.{Future, Promise}
 
 import org.apache.spark.{SecurityManager, SparkConf}
@@ -58,7 +58,7 @@ class NettyBlockTransferService(conf: SparkConf, 
securityManager: SecurityManage
 securityManager.isSaslEncryptionEnabled()))
 }
 transportContext = new TransportContext(transportConf, rpcHandler)
-clientFactory = 
transportContext.createClientFactory(clientBootstrap.toList)
+clientFactory = 
transportContext.createClientFactory(clientBootstrap.toSeq.asJava)
 server = createServer(serverBootstrap.toList)
 appId = conf.getAppId
 logInfo("Server created on " + server.getPort)
@@ -67,7 +67,7 @@ class NettyBlockTransferService(conf: SparkConf, 
securityManager: SecurityManage
   /** Creates and binds the TransportServer, possibly trying multiple ports. */
   private def createServer(bootstraps: List[TransportServerBootstrap]): 
TransportServer = {
 def startService(port: Int): (TransportServer, Int) = {
-  val server = transportContext.createServer(port, bootstraps)
+  val server = transportContext.createServer(port, bootstraps.asJava)
   (server, server.getPort)
 }
 

http://git-wip-us.apache.org/repos/asf/spark/blob/69c9c177/core/src/main/scala/org/apache/spark/network/nio/Connection.scala
--
diff --git a/core/src/main/scala/org/apache/spark/network/nio/Connection.scala 
b/core/src/main/scala/org/apache/spark/network/nio/Connection.scala
index 1499da0..8d9ebad 100644
--- a/core/src/main/scala/org/apache/spark/network/nio/Connection.scala
+++ b/core/src/main/scala/org/apache/spark/network/nio/Connection.scala
@@ -23,7 +23,7 @@ import java.nio.channels._
 import java.util.concurrent.ConcurrentLinkedQueue
 import java.util.LinkedList
 
-import scala.collection.JavaConversions._
+import scala.collection.JavaConverters._
 import scala.collection.mutable.{ArrayBuffer, HashMap}
 import scala.util.control.NonFatal
 
@@ -145,7 +145,7 @@ abstract class Connection(val channel: SocketChannel, val 
selector: Selector,
   }
 
   def callOnExceptionCallbacks(e: Throwable) {
-onExceptionCallbacks foreach {
+onExceptionCallbacks.asScala.foreach {
   callback =>
 try {
   callback(this, e)

http://git-wip-us.apache.org/repos/asf/spark/blob/69c9c177/core/src/main/scala/org/apache/spark/partial/GroupedCountEvaluator.scala
--
diff --git 
a/core/src/main/scala/org/apache/spark/partial/GroupedCountEvaluator.scala 
b/core/src/main/scala/org/apache/spark/partial/GroupedCountEvaluator.scala
index 91b07ce..5afce75 100644
--- a/core/src/main/scala/org/apache/spark/partial/GroupedCountEvaluator.scala
+++ b/core/src/main/scala/org/apache/spark/partial/GroupedCountEvaluator.scala
@@ -19,7 +19,7 @@ package org.apache.spark.partial
 
 i

[3/5] spark git commit: [SPARK-9613] [CORE] Ban use of JavaConversions and migrate all existing uses to JavaConverters

2015-08-25 Thread srowen
http://git-wip-us.apache.org/repos/asf/spark/blob/69c9c177/external/flume/src/main/scala/org/apache/spark/streaming/flume/PollingFlumeTestUtils.scala
--
diff --git 
a/external/flume/src/main/scala/org/apache/spark/streaming/flume/PollingFlumeTestUtils.scala
 
b/external/flume/src/main/scala/org/apache/spark/streaming/flume/PollingFlumeTestUtils.scala
index 91d63d4..a2ab320 100644
--- 
a/external/flume/src/main/scala/org/apache/spark/streaming/flume/PollingFlumeTestUtils.scala
+++ 
b/external/flume/src/main/scala/org/apache/spark/streaming/flume/PollingFlumeTestUtils.scala
@@ -18,9 +18,8 @@
 package org.apache.spark.streaming.flume
 
 import java.util.concurrent._
-import java.util.{List => JList, Map => JMap}
+import java.util.{Map => JMap, Collections}
 
-import scala.collection.JavaConversions._
 import scala.collection.mutable.ArrayBuffer
 
 import com.google.common.base.Charsets.UTF_8
@@ -77,7 +76,7 @@ private[flume] class PollingFlumeTestUtils {
   /**
* Start 2 sinks and return the ports
*/
-  def startMultipleSinks(): JList[Int] = {
+  def startMultipleSinks(): Seq[Int] = {
 channels.clear()
 sinks.clear()
 
@@ -138,8 +137,7 @@ private[flume] class PollingFlumeTestUtils {
   /**
* A Python-friendly method to assert the output
*/
-  def assertOutput(
-  outputHeaders: JList[JMap[String, String]], outputBodies: 
JList[String]): Unit = {
+  def assertOutput(outputHeaders: Seq[JMap[String, String]], outputBodies: 
Seq[String]): Unit = {
 require(outputHeaders.size == outputBodies.size)
 val eventSize = outputHeaders.size
 if (eventSize != totalEventsPerChannel * channels.size) {
@@ -149,12 +147,12 @@ private[flume] class PollingFlumeTestUtils {
 var counter = 0
 for (k <- 0 until channels.size; i <- 0 until totalEventsPerChannel) {
   val eventBodyToVerify = s"${channels(k).getName}-$i"
-  val eventHeaderToVerify: JMap[String, String] = Map[String, 
String](s"test-$i" -> "header")
+  val eventHeaderToVerify: JMap[String, String] = 
Collections.singletonMap(s"test-$i", "header")
   var found = false
   var j = 0
   while (j < eventSize && !found) {
-if (eventBodyToVerify == outputBodies.get(j) &&
-  eventHeaderToVerify == outputHeaders.get(j)) {
+if (eventBodyToVerify == outputBodies(j) &&
+  eventHeaderToVerify == outputHeaders(j)) {
   found = true
   counter += 1
 }
@@ -195,7 +193,7 @@ private[flume] class PollingFlumeTestUtils {
 tx.begin()
 for (j <- 0 until eventsPerBatch) {
   
channel.put(EventBuilder.withBody(s"${channel.getName}-$t".getBytes(UTF_8),
-Map[String, String](s"test-$t" -> "header")))
+Collections.singletonMap(s"test-$t", "header")))
   t += 1
 }
 tx.commit()

http://git-wip-us.apache.org/repos/asf/spark/blob/69c9c177/external/flume/src/test/scala/org/apache/spark/streaming/flume/FlumePollingStreamSuite.scala
--
diff --git 
a/external/flume/src/test/scala/org/apache/spark/streaming/flume/FlumePollingStreamSuite.scala
 
b/external/flume/src/test/scala/org/apache/spark/streaming/flume/FlumePollingStreamSuite.scala
index d5f9a0a..ff2fb8e 100644
--- 
a/external/flume/src/test/scala/org/apache/spark/streaming/flume/FlumePollingStreamSuite.scala
+++ 
b/external/flume/src/test/scala/org/apache/spark/streaming/flume/FlumePollingStreamSuite.scala
@@ -19,7 +19,7 @@ package org.apache.spark.streaming.flume
 
 import java.net.InetSocketAddress
 
-import scala.collection.JavaConversions._
+import scala.collection.JavaConverters._
 import scala.collection.mutable.{SynchronizedBuffer, ArrayBuffer}
 import scala.concurrent.duration._
 import scala.language.postfixOps
@@ -116,9 +116,9 @@ class FlumePollingStreamSuite extends SparkFunSuite with 
BeforeAndAfter with Log
   // The eventually is required to ensure that all data in the batch has 
been processed.
   eventually(timeout(10 seconds), interval(100 milliseconds)) {
 val flattenOutputBuffer = outputBuffer.flatten
-val headers = flattenOutputBuffer.map(_.event.getHeaders.map {
-  case kv => (kv._1.toString, kv._2.toString)
-}).map(mapAsJavaMap)
+val headers = flattenOutputBuffer.map(_.event.getHeaders.asScala.map {
+  case (key, value) => (key.toString, value.toString)
+}).map(_.asJava)
 val bodies = flattenOutputBuffer.map(e => new 
String(e.event.getBody.array(), UTF_8))
 utils.assertOutput(headers, bodies)
   }

http://git-wip-us.apache.org/repos/asf/spark/blob/69c9c177/external/flume/src/test/scala/org/apache/spark/streaming/flume/FlumeStreamSuite.scala
--
diff --git 
a/external/flume/src/test/scala/org/apache/spark/streaming/flume/FlumeStreamSuite.scala
 
b/exter

[5/5] spark git commit: [SPARK-9613] [CORE] Ban use of JavaConversions and migrate all existing uses to JavaConverters

2015-08-25 Thread srowen
[SPARK-9613] [CORE] Ban use of JavaConversions and migrate all existing uses to 
JavaConverters

Replace `JavaConversions` implicits with explicit `JavaConverters` conversions.

Most occurrences I've seen so far are necessary conversions; a few have been 
avoidable. None are in critical code as far as I can see, yet.
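
In a nutshell, `JavaConversions` converted Java and Scala collections implicitly, while `JavaConverters` makes each conversion explicit via `.asScala` / `.asJava`; a small before/after sketch (not taken from the diff):

```
import java.util.{ArrayList => JArrayList}
import scala.collection.JavaConverters._

val javaList = new JArrayList[String]()
javaList.add("a")
javaList.add("b")

// Before (now banned): the conversion happened silently.
//   import scala.collection.JavaConversions._
//   val upper = javaList.map(_.toUpperCase)

// After: the conversion is explicit and visible at the call site.
val upper = javaList.asScala.map(_.toUpperCase)   // scala.collection.mutable.Buffer[String]
val backToJava = upper.asJava                     // java.util.List[String]
```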

Author: Sean Owen 

Closes #8033 from srowen/SPARK-9613.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/69c9c177
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/69c9c177
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/69c9c177

Branch: refs/heads/master
Commit: 69c9c177160e32a2fbc9b36ecc52156077fca6fc
Parents: 7f1e507
Author: Sean Owen 
Authored: Tue Aug 25 12:33:13 2015 +0100
Committer: Sean Owen 
Committed: Tue Aug 25 12:33:13 2015 +0100

--
 .../shuffle/unsafe/UnsafeShuffleWriter.java |   4 +-
 .../org/apache/spark/MapOutputTracker.scala |   4 +-
 .../scala/org/apache/spark/SSLOptions.scala |  11 +-
 .../scala/org/apache/spark/SparkContext.scala   |   4 +-
 .../main/scala/org/apache/spark/TestUtils.scala |   9 +-
 .../apache/spark/api/java/JavaHadoopRDD.scala   |   4 +-
 .../spark/api/java/JavaNewHadoopRDD.scala   |   4 +-
 .../org/apache/spark/api/java/JavaPairRDD.scala |  19 ++--
 .../org/apache/spark/api/java/JavaRDDLike.scala |  75 +
 .../spark/api/java/JavaSparkContext.scala   |  20 ++--
 .../spark/api/python/PythonHadoopUtil.scala |  28 ++---
 .../org/apache/spark/api/python/PythonRDD.scala |  26 ++---
 .../apache/spark/api/python/PythonUtils.scala   |  15 ++-
 .../spark/api/python/PythonWorkerFactory.scala  |  11 +-
 .../org/apache/spark/api/python/SerDeUtil.scala |   3 +-
 .../WriteInputFormatTestDataGenerator.scala |   8 +-
 .../scala/org/apache/spark/api/r/RRDD.scala |  13 ++-
 .../scala/org/apache/spark/api/r/RUtils.scala   |   5 +-
 .../scala/org/apache/spark/api/r/SerDe.scala|   4 +-
 .../spark/broadcast/TorrentBroadcast.scala  |   4 +-
 .../spark/deploy/ExternalShuffleService.scala   |   8 +-
 .../org/apache/spark/deploy/PythonRunner.scala  |   4 +-
 .../org/apache/spark/deploy/RPackageUtils.scala |   4 +-
 .../scala/org/apache/spark/deploy/RRunner.scala |   4 +-
 .../apache/spark/deploy/SparkCuratorUtil.scala  |   4 +-
 .../apache/spark/deploy/SparkHadoopUtil.scala   |  19 ++--
 .../spark/deploy/SparkSubmitArguments.scala |   6 +-
 .../master/ZooKeeperPersistenceEngine.scala |   6 +-
 .../spark/deploy/worker/CommandUtils.scala  |   5 +-
 .../spark/deploy/worker/DriverRunner.scala  |   8 +-
 .../spark/deploy/worker/ExecutorRunner.scala|   7 +-
 .../org/apache/spark/deploy/worker/Worker.scala |   1 -
 .../org/apache/spark/executor/Executor.scala|   6 +-
 .../apache/spark/executor/ExecutorSource.scala  |   4 +-
 .../spark/executor/MesosExecutorBackend.scala   |   6 +-
 .../apache/spark/input/PortableDataStream.scala |  11 +-
 .../spark/input/WholeTextFileInputFormat.scala  |   8 +-
 .../spark/launcher/WorkerCommandBuilder.scala   |   4 +-
 .../apache/spark/metrics/MetricsConfig.scala|  22 ++--
 .../network/netty/NettyBlockRpcServer.scala |   4 +-
 .../netty/NettyBlockTransferService.scala   |   6 +-
 .../apache/spark/network/nio/Connection.scala   |   4 +-
 .../spark/partial/GroupedCountEvaluator.scala   |  10 +-
 .../spark/partial/GroupedMeanEvaluator.scala|  10 +-
 .../spark/partial/GroupedSumEvaluator.scala |  10 +-
 .../org/apache/spark/rdd/PairRDDFunctions.scala |   6 +-
 .../scala/org/apache/spark/rdd/PipedRDD.scala   |   6 +-
 .../org/apache/spark/rdd/SubtractedRDD.scala|   4 +-
 .../spark/scheduler/InputFormatInfo.scala   |   4 +-
 .../scala/org/apache/spark/scheduler/Pool.scala |  10 +-
 .../mesos/CoarseMesosSchedulerBackend.scala |  20 ++--
 .../mesos/MesosClusterPersistenceEngine.scala   |   4 +-
 .../cluster/mesos/MesosClusterScheduler.scala   |  14 +--
 .../cluster/mesos/MesosSchedulerBackend.scala   |  22 ++--
 .../cluster/mesos/MesosSchedulerUtils.scala |  25 ++---
 .../spark/serializer/KryoSerializer.scala   |  10 +-
 .../shuffle/FileShuffleBlockResolver.scala  |   8 +-
 .../storage/BlockManagerMasterEndpoint.scala|   8 +-
 .../scala/org/apache/spark/util/AkkaUtils.scala |   4 +-
 .../org/apache/spark/util/ListenerBus.scala |   7 +-
 .../spark/util/MutableURLClassLoader.scala  |   2 -
 .../apache/spark/util/TimeStampedHashMap.scala  |  10 +-
 .../apache/spark/util/TimeStampedHashSet.scala  |   4 +-
 .../scala/org/apache/spark/util/Utils.scala |  20 ++--
 .../apache/spark/util/collection/Utils.scala|   4 +-
 .../java/org/apache/spark/JavaAPISuite.java |   6 +-
 .../scala/org/apache/spark/SparkConfSuite.scala |   7 +-
 .../spark/deploy/LogUrlsStandaloneSuite.scala   |   1 -
 .../spark/deploy/RPackageUtilsSuite.scala   |   8 +-
 .../deploy/worker/ExecutorRunnerTest.scala  |   5 +-
 .../

[1/5] spark git commit: [SPARK-9613] [CORE] Ban use of JavaConversions and migrate all existing uses to JavaConverters

2015-08-25 Thread srowen
Repository: spark
Updated Branches:
  refs/heads/master 7f1e507bf -> 69c9c1771


http://git-wip-us.apache.org/repos/asf/spark/blob/69c9c177/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala
--
diff --git 
a/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala 
b/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala
index ccf753e..5f897cb 100644
--- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala
+++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala
@@ -21,9 +21,7 @@ import java.util.Collections
 import java.util.concurrent._
 import java.util.regex.Pattern
 
-import org.apache.spark.util.Utils
-
-import scala.collection.JavaConversions._
+import scala.collection.JavaConverters._
 import scala.collection.mutable.{ArrayBuffer, HashMap, HashSet}
 
 import com.google.common.util.concurrent.ThreadFactoryBuilder
@@ -39,8 +37,8 @@ import org.apache.log4j.{Level, Logger}
 import org.apache.spark.{Logging, SecurityManager, SparkConf}
 import org.apache.spark.deploy.yarn.YarnSparkHadoopUtil._
 import org.apache.spark.rpc.RpcEndpointRef
-import org.apache.spark.scheduler.cluster.CoarseGrainedSchedulerBackend
 import org.apache.spark.scheduler.cluster.CoarseGrainedClusterMessages._
+import org.apache.spark.util.Utils
 
 /**
  * YarnAllocator is charged with requesting containers from the YARN 
ResourceManager and deciding
@@ -164,7 +162,7 @@ private[yarn] class YarnAllocator(
* Number of container requests at the given location that have not yet been 
fulfilled.
*/
   private def getNumPendingAtLocation(location: String): Int =
-amClient.getMatchingRequests(RM_REQUEST_PRIORITY, location, 
resource).map(_.size).sum
+amClient.getMatchingRequests(RM_REQUEST_PRIORITY, location, 
resource).asScala.map(_.size).sum
 
   /**
* Request as many executors from the ResourceManager as needed to reach the 
desired total. If
@@ -231,14 +229,14 @@ private[yarn] class YarnAllocator(
   numExecutorsRunning,
   allocateResponse.getAvailableResources))
 
-  handleAllocatedContainers(allocatedContainers)
+  handleAllocatedContainers(allocatedContainers.asScala)
 }
 
 val completedContainers = allocateResponse.getCompletedContainersStatuses()
 if (completedContainers.size > 0) {
   logDebug("Completed %d containers".format(completedContainers.size))
 
-  processCompletedContainers(completedContainers)
+  processCompletedContainers(completedContainers.asScala)
 
   logDebug("Finished processing %d completed containers. Current running 
executor count: %d."
 .format(completedContainers.size, numExecutorsRunning))
@@ -271,7 +269,7 @@ private[yarn] class YarnAllocator(
 val request = createContainerRequest(resource, locality.nodes, 
locality.racks)
 amClient.addContainerRequest(request)
 val nodes = request.getNodes
-val hostStr = if (nodes == null || nodes.isEmpty) "Any" else nodes.last
+val hostStr = if (nodes == null || nodes.isEmpty) "Any" else 
nodes.asScala.last
 logInfo(s"Container request (host: $hostStr, capability: $resource)")
   }
 } else if (missing < 0) {
@@ -280,7 +278,8 @@ private[yarn] class YarnAllocator(
 
   val matchingRequests = amClient.getMatchingRequests(RM_REQUEST_PRIORITY, 
ANY_HOST, resource)
   if (!matchingRequests.isEmpty) {
-
matchingRequests.head.take(numToCancel).foreach(amClient.removeContainerRequest)
+matchingRequests.iterator().next().asScala
+  .take(numToCancel).foreach(amClient.removeContainerRequest)
   } else {
 logWarning("Expected to find pending requests, but found none.")
   }
@@ -459,7 +458,7 @@ private[yarn] class YarnAllocator(
 }
   }
 
-  if (allocatedContainerToHostMap.containsKey(containerId)) {
+  if (allocatedContainerToHostMap.contains(containerId)) {
 val host = allocatedContainerToHostMap.get(containerId).get
 val containerSet = allocatedHostToContainersMap.get(host).get
 

http://git-wip-us.apache.org/repos/asf/spark/blob/69c9c177/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnRMClient.scala
--
diff --git 
a/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnRMClient.scala 
b/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnRMClient.scala
index 4999f9c..df042bf 100644
--- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnRMClient.scala
+++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnRMClient.scala
@@ -19,17 +19,15 @@ package org.apache.spark.deploy.yarn
 
 import java.util.{List => JList}
 
-import scala.collection.JavaConversions._
+import scala.collection.JavaConverters._
 import scala.collection.{Map, Set}
 import scala.util.Try
 
 import org.apache.hadoop.conf.Configuration
-import org.apache.hadoop.yarn.api.App

[2/5] spark git commit: [SPARK-9613] [CORE] Ban use of JavaConversions and migrate all existing uses to JavaConverters

2015-08-25 Thread srowen
http://git-wip-us.apache.org/repos/asf/spark/blob/69c9c177/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
--
diff --git 
a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala 
b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
index 98d21aa..b8da084 100644
--- 
a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
+++ 
b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala
@@ -17,7 +17,7 @@
 
 package org.apache.spark.sql.hive
 
-import scala.collection.JavaConversions._
+import scala.collection.JavaConverters._
 import scala.collection.mutable
 
 import com.google.common.base.Objects
@@ -483,7 +483,7 @@ private[hive] class HiveMetastoreCatalog(val client: 
ClientInterface, hive: Hive
   // are empty.
   val partitions = metastoreRelation.getHiveQlPartitions().map { p =>
 val location = p.getLocation
-val values = 
InternalRow.fromSeq(p.getValues.zip(partitionColumnDataTypes).map {
+val values = 
InternalRow.fromSeq(p.getValues.asScala.zip(partitionColumnDataTypes).map {
   case (rawValue, dataType) => Cast(Literal(rawValue), 
dataType).eval(null)
 })
 ParquetPartition(values, location)
@@ -798,9 +798,9 @@ private[hive] case class MetastoreRelation
 
 val sd = new org.apache.hadoop.hive.metastore.api.StorageDescriptor()
 tTable.setSd(sd)
-sd.setCols(table.schema.map(c => new FieldSchema(c.name, c.hiveType, 
c.comment)))
+sd.setCols(table.schema.map(c => new FieldSchema(c.name, c.hiveType, 
c.comment)).asJava)
 tTable.setPartitionKeys(
-  table.partitionColumns.map(c => new FieldSchema(c.name, c.hiveType, 
c.comment)))
+  table.partitionColumns.map(c => new FieldSchema(c.name, c.hiveType, 
c.comment)).asJava)
 
 table.location.foreach(sd.setLocation)
 table.inputFormat.foreach(sd.setInputFormat)
@@ -852,11 +852,11 @@ private[hive] case class MetastoreRelation
   val tPartition = new org.apache.hadoop.hive.metastore.api.Partition
   tPartition.setDbName(databaseName)
   tPartition.setTableName(tableName)
-  tPartition.setValues(p.values)
+  tPartition.setValues(p.values.asJava)
 
   val sd = new org.apache.hadoop.hive.metastore.api.StorageDescriptor()
   tPartition.setSd(sd)
-  sd.setCols(table.schema.map(c => new FieldSchema(c.name, c.hiveType, 
c.comment)))
+  sd.setCols(table.schema.map(c => new FieldSchema(c.name, c.hiveType, 
c.comment)).asJava)
 
   sd.setLocation(p.storage.location)
   sd.setInputFormat(p.storage.inputFormat)

http://git-wip-us.apache.org/repos/asf/spark/blob/69c9c177/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala
--
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala 
b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala
index ad33dee..d5cd7e9 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala
@@ -20,6 +20,9 @@ package org.apache.spark.sql.hive
 import java.sql.Date
 import java.util.Locale
 
+import scala.collection.JavaConverters._
+import scala.collection.mutable.ArrayBuffer
+
 import org.apache.hadoop.hive.conf.HiveConf
 import org.apache.hadoop.hive.conf.HiveConf.ConfVars
 import org.apache.hadoop.hive.serde.serdeConstants
@@ -48,10 +51,6 @@ import org.apache.spark.sql.types._
 import org.apache.spark.unsafe.types.CalendarInterval
 import org.apache.spark.util.random.RandomSampler
 
-/* Implicit conversions */
-import scala.collection.JavaConversions._
-import scala.collection.mutable.ArrayBuffer
-
 /**
  * Used when we need to start parsing the AST before deciding that we are 
going to pass the command
  * back for Hive to execute natively.  Will be replaced with a native command 
that contains the
@@ -202,7 +201,7 @@ private[hive] object HiveQl extends Logging {
  * Returns a scala.Seq equivalent to [s] or Nil if [s] is null.
  */
 private def nilIfEmpty[A](s: java.util.List[A]): Seq[A] =
-  Option(s).map(_.toSeq).getOrElse(Nil)
+  Option(s).map(_.asScala).getOrElse(Nil)
 
 /**
  * Returns this ASTNode with the text changed to `newText`.
@@ -217,7 +216,7 @@ private[hive] object HiveQl extends Logging {
  */
 def withChildren(newChildren: Seq[ASTNode]): ASTNode = {
   (1 to n.getChildCount).foreach(_ => n.deleteChild(0))
-  n.addChildren(newChildren)
+  n.addChildren(newChildren.asJava)
   n
 }
 
@@ -323,11 +322,11 @@ private[hive] object HiveQl extends Logging {
 assert(tree.asInstanceOf[ASTNode].getText == "TOK_CREATETABLE", "Only 
CREATE TABLE supported.")
 val tableOps = tree.getChildren
 val colList =
-  tableOps
+  tableOps.asScala
 .find(_.asInstance

spark git commit: Fixed a typo in DAGScheduler.

2015-08-25 Thread srowen
Repository: spark
Updated Branches:
  refs/heads/master 5c1489015 -> 7f1e507bf


Fixed a typo in DAGScheduler.

Author: ehnalis 

Closes #8308 from ehnalis/master.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7f1e507b
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7f1e507b
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7f1e507b

Branch: refs/heads/master
Commit: 7f1e507bf7e82bff323c5dec3c1ee044687c4173
Parents: 5c14890
Author: ehnalis 
Authored: Tue Aug 25 12:30:06 2015 +0100
Committer: Sean Owen 
Committed: Tue Aug 25 12:30:06 2015 +0100

--
 .../apache/spark/scheduler/DAGScheduler.scala   | 27 +++-
 1 file changed, 20 insertions(+), 7 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/7f1e507b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
--
diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala 
b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
index 684db66..daf9b0f 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
@@ -152,17 +152,24 @@ class DAGScheduler(
   // may lead to more delay in scheduling if those locations are busy.
   private[scheduler] val REDUCER_PREF_LOCS_FRACTION = 0.2
 
-  // Called by TaskScheduler to report task's starting.
+  /**
+   * Called by the TaskSetManager to report task's starting.
+   */
   def taskStarted(task: Task[_], taskInfo: TaskInfo) {
 eventProcessLoop.post(BeginEvent(task, taskInfo))
   }
 
-  // Called to report that a task has completed and results are being fetched 
remotely.
+  /**
+   * Called by the TaskSetManager to report that a task has completed
+   * and results are being fetched remotely.
+   */
   def taskGettingResult(taskInfo: TaskInfo) {
 eventProcessLoop.post(GettingResultEvent(taskInfo))
   }
 
-  // Called by TaskScheduler to report task completions or failures.
+  /**
+   * Called by the TaskSetManager to report task completions or failures.
+   */
   def taskEnded(
   task: Task[_],
   reason: TaskEndReason,
@@ -188,18 +195,24 @@ class DAGScheduler(
   BlockManagerHeartbeat(blockManagerId), new RpcTimeout(600 seconds, 
"BlockManagerHeartbeat"))
   }
 
-  // Called by TaskScheduler when an executor fails.
+  /**
+   * Called by TaskScheduler implementation when an executor fails.
+   */
   def executorLost(execId: String): Unit = {
 eventProcessLoop.post(ExecutorLost(execId))
   }
 
-  // Called by TaskScheduler when a host is added
+  /**
+   * Called by TaskScheduler implementation when a host is added.
+   */
   def executorAdded(execId: String, host: String): Unit = {
 eventProcessLoop.post(ExecutorAdded(execId, host))
   }
 
-  // Called by TaskScheduler to cancel an entire TaskSet due to either 
repeated failures or
-  // cancellation of the job itself.
+  /**
+   * Called by the TaskSetManager to cancel an entire TaskSet due to either 
repeated failures or
+   * cancellation of the job itself.
+   */
   def taskSetFailed(taskSet: TaskSet, reason: String, exception: 
Option[Throwable]): Unit = {
 eventProcessLoop.post(TaskSetFailed(taskSet, reason, exception))
   }


-



spark git commit: Fixed a typo in DAGScheduler.

2015-08-25 Thread srowen
Repository: spark
Updated Branches:
  refs/heads/branch-1.5 5d6840569 -> bdcc8e608


Fixed a typo in DAGScheduler.

Author: ehnalis 

Closes #8308 from ehnalis/master.

(cherry picked from commit 7f1e507bf7e82bff323c5dec3c1ee044687c4173)
Signed-off-by: Sean Owen 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/bdcc8e60
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/bdcc8e60
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/bdcc8e60

Branch: refs/heads/branch-1.5
Commit: bdcc8e608d9a1160db988faa76808149c28a3b50
Parents: 5d68405
Author: ehnalis 
Authored: Tue Aug 25 12:30:06 2015 +0100
Committer: Sean Owen 
Committed: Tue Aug 25 12:30:18 2015 +0100

--
 .../apache/spark/scheduler/DAGScheduler.scala   | 27 +++-
 1 file changed, 20 insertions(+), 7 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/bdcc8e60/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
--
diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala 
b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
index 591b714..de69911 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
@@ -152,17 +152,24 @@ class DAGScheduler(
   // may lead to more delay in scheduling if those locations are busy.
   private[scheduler] val REDUCER_PREF_LOCS_FRACTION = 0.2
 
-  // Called by TaskScheduler to report task's starting.
+  /**
+   * Called by the TaskSetManager to report task's starting.
+   */
   def taskStarted(task: Task[_], taskInfo: TaskInfo) {
 eventProcessLoop.post(BeginEvent(task, taskInfo))
   }
 
-  // Called to report that a task has completed and results are being fetched 
remotely.
+  /**
+   * Called by the TaskSetManager to report that a task has completed
+   * and results are being fetched remotely.
+   */
   def taskGettingResult(taskInfo: TaskInfo) {
 eventProcessLoop.post(GettingResultEvent(taskInfo))
   }
 
-  // Called by TaskScheduler to report task completions or failures.
+  /**
+   * Called by the TaskSetManager to report task completions or failures.
+   */
   def taskEnded(
   task: Task[_],
   reason: TaskEndReason,
@@ -188,18 +195,24 @@ class DAGScheduler(
   BlockManagerHeartbeat(blockManagerId), new RpcTimeout(600 seconds, 
"BlockManagerHeartbeat"))
   }
 
-  // Called by TaskScheduler when an executor fails.
+  /**
+   * Called by TaskScheduler implementation when an executor fails.
+   */
   def executorLost(execId: String): Unit = {
 eventProcessLoop.post(ExecutorLost(execId))
   }
 
-  // Called by TaskScheduler when a host is added
+  /**
+   * Called by TaskScheduler implementation when a host is added.
+   */
   def executorAdded(execId: String, host: String): Unit = {
 eventProcessLoop.post(ExecutorAdded(execId, host))
   }
 
-  // Called by TaskScheduler to cancel an entire TaskSet due to either 
repeated failures or
-  // cancellation of the job itself.
+  /**
+   * Called by the TaskSetManager to cancel an entire TaskSet due to either 
repeated failures or
+   * cancellation of the job itself.
+   */
   def taskSetFailed(taskSet: TaskSet, reason: String, exception: 
Option[Throwable]): Unit = {
 eventProcessLoop.post(TaskSetFailed(taskSet, reason, exception))
   }


-



spark git commit: [DOC] add missing parameters in SparkContext.scala for scala doc

2015-08-25 Thread srowen
Repository: spark
Updated Branches:
  refs/heads/branch-1.5 73f1dd1b5 -> 5d6840569


[DOC] add missing parameters in SparkContext.scala for scala doc

Author: Zhang, Liye 

Closes #8412 from liyezhang556520/minorDoc.

(cherry picked from commit 5c14890159a5711072bf395f662b2433a389edf9)
Signed-off-by: Sean Owen 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5d684056
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5d684056
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5d684056

Branch: refs/heads/branch-1.5
Commit: 5d6840569761a42624f9852b942e33039d21f46a
Parents: 73f1dd1
Author: Zhang, Liye 
Authored: Tue Aug 25 11:48:55 2015 +0100
Committer: Sean Owen 
Committed: Tue Aug 25 11:49:07 2015 +0100

--
 .../main/scala/org/apache/spark/SparkContext.scala   | 15 ++-
 1 file changed, 14 insertions(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/5d684056/core/src/main/scala/org/apache/spark/SparkContext.scala
--
diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala 
b/core/src/main/scala/org/apache/spark/SparkContext.scala
index 1ddaca8..9849aff 100644
--- a/core/src/main/scala/org/apache/spark/SparkContext.scala
+++ b/core/src/main/scala/org/apache/spark/SparkContext.scala
@@ -114,6 +114,7 @@ class SparkContext(config: SparkConf) extends Logging with 
ExecutorAllocationCli
* :: DeveloperApi ::
* Alternative constructor for setting preferred locations where Spark will 
create executors.
*
+   * @param config a [[org.apache.spark.SparkConf]] object specifying other 
Spark parameters
* @param preferredNodeLocationData used in YARN mode to select nodes to 
launch containers on.
* Can be generated using 
[[org.apache.spark.scheduler.InputFormatInfo.computePreferredLocations]]
* from a list of input files or InputFormats for the application.
@@ -145,6 +146,9 @@ class SparkContext(config: SparkConf) extends Logging with 
ExecutorAllocationCli
* @param jars Collection of JARs to send to the cluster. These can be paths 
on the local file
* system or HDFS, HTTP, HTTPS, or FTP URLs.
* @param environment Environment variables to set on worker nodes.
+   * @param preferredNodeLocationData used in YARN mode to select nodes to 
launch containers on.
+   * Can be generated using 
[[org.apache.spark.scheduler.InputFormatInfo.computePreferredLocations]]
+   * from a list of input files or InputFormats for the application.
*/
   def this(
   master: String,
@@ -841,6 +845,9 @@ class SparkContext(config: SparkConf) extends Logging with 
ExecutorAllocationCli
* @note Small files are preferred, large file is also allowable, but may 
cause bad performance.
* @note On some filesystems, `.../path/*` can be a more efficient way 
to read all files
*   in a directory rather than `.../path/` or `.../path`
+   *
+   * @param path Directory of the input data files; the path can be a comma-separated
+   * list of input paths.
* @param minPartitions A suggestion value of the minimal splitting number 
for input data.
*/
   def wholeTextFiles(
@@ -889,6 +896,9 @@ class SparkContext(config: SparkConf) extends Logging with 
ExecutorAllocationCli
* @note Small files are preferred; very large files may cause bad 
performance.
* @note On some filesystems, `.../path/*` can be a more efficient way 
to read all files
*   in a directory rather than `.../path/` or `.../path`
+   *
+   * @param path Directory of the input data files; the path can be a comma-separated
+   * list of input paths.
* @param minPartitions A suggestion value of the minimal splitting number 
for input data.
*/
   @Experimental
@@ -918,8 +928,11 @@ class SparkContext(config: SparkConf) extends Logging with 
ExecutorAllocationCli
* '''Note:''' We ensure that the byte array for each record in the 
resulting RDD
* has the provided record length.
*
-   * @param path Directory to the input data files
+   * @param path Directory of the input data files; the path can be a comma-separated
+   * list of input paths.
* @param recordLength The length at which to split the records
+   * @param conf Configuration for setting up the dataset.
+   *
* @return An RDD of data with values, represented as byte arrays
*/
   @Experimental
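
As the added `@param path` docs note, these input methods accept a comma-separated list of paths; a small usage sketch, assuming an existing `SparkContext` named `sc` (the HDFS paths are hypothetical):

```
// Read whole text files from two directories in one call.
// The same comma-separated form works for binaryFiles and binaryRecords.
val rdd = sc.wholeTextFiles(
  "hdfs://nn:8020/logs/2015-08-24,hdfs://nn:8020/logs/2015-08-25",
  minPartitions = 4)
// rdd: RDD[(String, String)] of (file path, file content) pairs
```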


-



spark git commit: [DOC] add missing parameters in SparkContext.scala for scala doc

2015-08-25 Thread srowen
Repository: spark
Updated Branches:
  refs/heads/master 0e6368ffa -> 5c1489015


[DOC] add missing parameters in SparkContext.scala for scala doc

Author: Zhang, Liye 

Closes #8412 from liyezhang556520/minorDoc.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5c148901
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5c148901
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5c148901

Branch: refs/heads/master
Commit: 5c14890159a5711072bf395f662b2433a389edf9
Parents: 0e6368f
Author: Zhang, Liye 
Authored: Tue Aug 25 11:48:55 2015 +0100
Committer: Sean Owen 
Committed: Tue Aug 25 11:48:55 2015 +0100

--
 .../main/scala/org/apache/spark/SparkContext.scala   | 15 ++-
 1 file changed, 14 insertions(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/5c148901/core/src/main/scala/org/apache/spark/SparkContext.scala
--
diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala 
b/core/src/main/scala/org/apache/spark/SparkContext.scala
index 1ddaca8..9849aff 100644
--- a/core/src/main/scala/org/apache/spark/SparkContext.scala
+++ b/core/src/main/scala/org/apache/spark/SparkContext.scala
@@ -114,6 +114,7 @@ class SparkContext(config: SparkConf) extends Logging with 
ExecutorAllocationCli
* :: DeveloperApi ::
* Alternative constructor for setting preferred locations where Spark will 
create executors.
*
+   * @param config a [[org.apache.spark.SparkConf]] object specifying other 
Spark parameters
* @param preferredNodeLocationData used in YARN mode to select nodes to 
launch containers on.
* Can be generated using 
[[org.apache.spark.scheduler.InputFormatInfo.computePreferredLocations]]
* from a list of input files or InputFormats for the application.
@@ -145,6 +146,9 @@ class SparkContext(config: SparkConf) extends Logging with 
ExecutorAllocationCli
* @param jars Collection of JARs to send to the cluster. These can be paths 
on the local file
* system or HDFS, HTTP, HTTPS, or FTP URLs.
* @param environment Environment variables to set on worker nodes.
+   * @param preferredNodeLocationData used in YARN mode to select nodes to 
launch containers on.
+   * Can be generated using 
[[org.apache.spark.scheduler.InputFormatInfo.computePreferredLocations]]
+   * from a list of input files or InputFormats for the application.
*/
   def this(
   master: String,
@@ -841,6 +845,9 @@ class SparkContext(config: SparkConf) extends Logging with 
ExecutorAllocationCli
* @note Small files are preferred, large file is also allowable, but may 
cause bad performance.
* @note On some filesystems, `.../path/*` can be a more efficient way 
to read all files
*   in a directory rather than `.../path/` or `.../path`
+   *
+   * @param path Directory of the input data files; the path can be a comma-separated
+   * list of input paths.
* @param minPartitions A suggestion value of the minimal splitting number 
for input data.
*/
   def wholeTextFiles(
@@ -889,6 +896,9 @@ class SparkContext(config: SparkConf) extends Logging with 
ExecutorAllocationCli
* @note Small files are preferred; very large files may cause bad 
performance.
* @note On some filesystems, `.../path/*` can be a more efficient way 
to read all files
*   in a directory rather than `.../path/` or `.../path`
+   *
+   * @param path Directory of the input data files; the path can be a comma-separated
+   * list of input paths.
* @param minPartitions A suggestion value of the minimal splitting number 
for input data.
*/
   @Experimental
@@ -918,8 +928,11 @@ class SparkContext(config: SparkConf) extends Logging with 
ExecutorAllocationCli
* '''Note:''' We ensure that the byte array for each record in the 
resulting RDD
* has the provided record length.
*
-   * @param path Directory to the input data files
+   * @param path Directory of the input data files; the path can be a comma-separated
+   * list of input paths.
* @param recordLength The length at which to split the records
+   * @param conf Configuration for setting up the dataset.
+   *
* @return An RDD of data with values, represented as byte arrays
*/
   @Experimental


-



spark git commit: [SPARK-10197] [SQL] Add null check in wrapperFor (inside HiveInspectors).

2015-08-25 Thread lian
Repository: spark
Updated Branches:
  refs/heads/branch-1.5 a0f22cf29 -> 73f1dd1b5


[SPARK-10197] [SQL] Add null check in wrapperFor (inside HiveInspectors).

https://issues.apache.org/jira/browse/SPARK-10197
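
The fix wraps each primitive converter so that a null input is passed through instead of being cast; schematically (a simplified sketch of the pattern, not the exact Spark code):

```
// Null-safe wrapper: only convert when the value is non-null.
def nullSafe(convert: Any => Any): Any => Any =
  (o: Any) => if (o != null) convert(o) else null

// e.g. the date wrapper in the patch is equivalent to:
//   nullSafe(o => DateTimeUtils.toJavaDate(o.asInstanceOf[Int]))
```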

Author: Yin Huai 

Closes #8407 from yhuai/ORCSPARK-10197.

(cherry picked from commit 0e6368ffaec1965d0c7f89420e04a974675c7f6e)
Signed-off-by: Cheng Lian 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/73f1dd1b
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/73f1dd1b
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/73f1dd1b

Branch: refs/heads/branch-1.5
Commit: 73f1dd1b5acf1c6c37045da25902d7ca5ab795e4
Parents: a0f22cf
Author: Yin Huai 
Authored: Tue Aug 25 16:19:34 2015 +0800
Committer: Cheng Lian 
Committed: Tue Aug 25 16:20:13 2015 +0800

--
 .../apache/spark/sql/hive/HiveInspectors.scala  | 29 
 .../spark/sql/hive/orc/OrcSourceSuite.scala | 29 
 2 files changed, 53 insertions(+), 5 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/73f1dd1b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala
--
diff --git 
a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala 
b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala
index 9824dad..64fffdb 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala
@@ -370,17 +370,36 @@ private[hive] trait HiveInspectors {
   protected def wrapperFor(oi: ObjectInspector, dataType: DataType): Any => 
Any = oi match {
 case _: JavaHiveVarcharObjectInspector =>
   (o: Any) =>
-val s = o.asInstanceOf[UTF8String].toString
-new HiveVarchar(s, s.size)
+if (o != null) {
+  val s = o.asInstanceOf[UTF8String].toString
+  new HiveVarchar(s, s.size)
+} else {
+  null
+}
 
 case _: JavaHiveDecimalObjectInspector =>
-  (o: Any) => HiveDecimal.create(o.asInstanceOf[Decimal].toJavaBigDecimal)
+  (o: Any) =>
+if (o != null) {
+  HiveDecimal.create(o.asInstanceOf[Decimal].toJavaBigDecimal)
+} else {
+  null
+}
 
 case _: JavaDateObjectInspector =>
-  (o: Any) => DateTimeUtils.toJavaDate(o.asInstanceOf[Int])
+  (o: Any) =>
+if (o != null) {
+  DateTimeUtils.toJavaDate(o.asInstanceOf[Int])
+} else {
+  null
+}
 
 case _: JavaTimestampObjectInspector =>
-  (o: Any) => DateTimeUtils.toJavaTimestamp(o.asInstanceOf[Long])
+  (o: Any) =>
+if (o != null) {
+  DateTimeUtils.toJavaTimestamp(o.asInstanceOf[Long])
+} else {
+  null
+}
 
 case soi: StandardStructObjectInspector =>
   val schema = dataType.asInstanceOf[StructType]

http://git-wip-us.apache.org/repos/asf/spark/blob/73f1dd1b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcSourceSuite.scala
--
diff --git 
a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcSourceSuite.scala 
b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcSourceSuite.scala
index 82e08ca..80c3808 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcSourceSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcSourceSuite.scala
@@ -121,6 +121,35 @@ abstract class OrcSuite extends QueryTest with 
BeforeAndAfterAll {
   sql("SELECT * FROM normal_orc_as_source"),
   (6 to 10).map(i => Row(i, s"part-$i")))
   }
+
+  test("write null values") {
+sql("DROP TABLE IF EXISTS orcNullValues")
+
+val df = sql(
+  """
+|SELECT
+|  CAST(null as TINYINT),
+|  CAST(null as SMALLINT),
+|  CAST(null as INT),
+|  CAST(null as BIGINT),
+|  CAST(null as FLOAT),
+|  CAST(null as DOUBLE),
+|  CAST(null as DECIMAL(7,2)),
+|  CAST(null as TIMESTAMP),
+|  CAST(null as DATE),
+|  CAST(null as STRING),
+|  CAST(null as VARCHAR(10))
+|FROM orc_temp_table limit 1
+  """.stripMargin)
+
+df.write.format("orc").saveAsTable("orcNullValues")
+
+checkAnswer(
+  sql("SELECT * FROM orcNullValues"),
+  Row.fromSeq(Seq.fill(11)(null)))
+
+sql("DROP TABLE IF EXISTS orcNullValues")
+  }
 }
 
 class OrcSourceSuite extends OrcSuite {


-



spark git commit: [SPARK-10197] [SQL] Add null check in wrapperFor (inside HiveInspectors).

2015-08-25 Thread lian
Repository: spark
Updated Branches:
  refs/heads/master 7bc9a8c62 -> 0e6368ffa


[SPARK-10197] [SQL] Add null check in wrapperFor (inside HiveInspectors).

https://issues.apache.org/jira/browse/SPARK-10197

Author: Yin Huai 

Closes #8407 from yhuai/ORCSPARK-10197.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0e6368ff
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0e6368ff
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0e6368ff

Branch: refs/heads/master
Commit: 0e6368ffaec1965d0c7f89420e04a974675c7f6e
Parents: 7bc9a8c
Author: Yin Huai 
Authored: Tue Aug 25 16:19:34 2015 +0800
Committer: Cheng Lian 
Committed: Tue Aug 25 16:19:34 2015 +0800

--
 .../apache/spark/sql/hive/HiveInspectors.scala  | 29 
 .../spark/sql/hive/orc/OrcSourceSuite.scala | 29 
 2 files changed, 53 insertions(+), 5 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/0e6368ff/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala
--
diff --git 
a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala 
b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala
index 9824dad..64fffdb 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala
@@ -370,17 +370,36 @@ private[hive] trait HiveInspectors {
   protected def wrapperFor(oi: ObjectInspector, dataType: DataType): Any => 
Any = oi match {
 case _: JavaHiveVarcharObjectInspector =>
   (o: Any) =>
-val s = o.asInstanceOf[UTF8String].toString
-new HiveVarchar(s, s.size)
+if (o != null) {
+  val s = o.asInstanceOf[UTF8String].toString
+  new HiveVarchar(s, s.size)
+} else {
+  null
+}
 
 case _: JavaHiveDecimalObjectInspector =>
-  (o: Any) => HiveDecimal.create(o.asInstanceOf[Decimal].toJavaBigDecimal)
+  (o: Any) =>
+if (o != null) {
+  HiveDecimal.create(o.asInstanceOf[Decimal].toJavaBigDecimal)
+} else {
+  null
+}
 
 case _: JavaDateObjectInspector =>
-  (o: Any) => DateTimeUtils.toJavaDate(o.asInstanceOf[Int])
+  (o: Any) =>
+if (o != null) {
+  DateTimeUtils.toJavaDate(o.asInstanceOf[Int])
+} else {
+  null
+}
 
 case _: JavaTimestampObjectInspector =>
-  (o: Any) => DateTimeUtils.toJavaTimestamp(o.asInstanceOf[Long])
+  (o: Any) =>
+if (o != null) {
+  DateTimeUtils.toJavaTimestamp(o.asInstanceOf[Long])
+} else {
+  null
+}
 
 case soi: StandardStructObjectInspector =>
   val schema = dataType.asInstanceOf[StructType]

http://git-wip-us.apache.org/repos/asf/spark/blob/0e6368ff/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcSourceSuite.scala
--
diff --git 
a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcSourceSuite.scala 
b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcSourceSuite.scala
index 82e08ca..80c3808 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcSourceSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcSourceSuite.scala
@@ -121,6 +121,35 @@ abstract class OrcSuite extends QueryTest with 
BeforeAndAfterAll {
   sql("SELECT * FROM normal_orc_as_source"),
   (6 to 10).map(i => Row(i, s"part-$i")))
   }
+
+  test("write null values") {
+sql("DROP TABLE IF EXISTS orcNullValues")
+
+val df = sql(
+  """
+|SELECT
+|  CAST(null as TINYINT),
+|  CAST(null as SMALLINT),
+|  CAST(null as INT),
+|  CAST(null as BIGINT),
+|  CAST(null as FLOAT),
+|  CAST(null as DOUBLE),
+|  CAST(null as DECIMAL(7,2)),
+|  CAST(null as TIMESTAMP),
+|  CAST(null as DATE),
+|  CAST(null as STRING),
+|  CAST(null as VARCHAR(10))
+|FROM orc_temp_table limit 1
+  """.stripMargin)
+
+df.write.format("orc").saveAsTable("orcNullValues")
+
+checkAnswer(
+  sql("SELECT * FROM orcNullValues"),
+  Row.fromSeq(Seq.fill(11)(null)))
+
+sql("DROP TABLE IF EXISTS orcNullValues")
+  }
 }
 
 class OrcSourceSuite extends OrcSuite {





spark git commit: [SPARK-10195] [SQL] Data sources Filter should not expose internal types

2015-08-25 Thread rxin
Repository: spark
Updated Branches:
  refs/heads/branch-1.5 e5cea566a -> a0f22cf29


[SPARK-10195] [SQL] Data sources Filter should not expose internal types

Spark SQL's data sources API exposes Catalyst's internal types through its 
Filter interfaces. This is a problem because types like UTF8String are not 
stable developer APIs and should not be exposed to third parties.

This issue caused incompatibilities when upgrading our `spark-redshift` library 
to work against Spark 1.5.0.  To avoid these issues in the future we should 
only expose public types through these Filter objects. This patch accomplishes 
this by using CatalystTypeConverters to add the appropriate conversions.
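
To illustrate the kind of conversion this patch adds, here is a self-contained sketch with
hypothetical stand-in types (InternalUTF8, StringType, and EqualTo below are simplified
stand-ins, not Spark's real classes): the filter handed to a data source carries a plain
java.lang.String rather than Catalyst's internal string representation.

```scala
// Stand-in for Catalyst's internal string representation (UTF8String in Spark).
final case class InternalUTF8(bytes: Array[Byte])
sealed trait DataType
case object StringType extends DataType

// Stand-in for CatalystTypeConverters.convertToScala: map an internal value
// to its public external type before it is placed in a sources.Filter.
def convertToScala(value: Any, dataType: DataType): Any = (value, dataType) match {
  case (s: InternalUTF8, StringType) => new String(s.bytes, "UTF-8")
  case (other, _)                    => other
}

// Stand-in for org.apache.spark.sql.sources.EqualTo.
final case class EqualTo(attribute: String, value: Any)

val filter = EqualTo("name",
  convertToScala(InternalUTF8("Smith".getBytes("UTF-8")), StringType))
assert(filter.value == "Smith")  // third-party code sees a plain String
```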

Author: Josh Rosen 

Closes #8403 from JoshRosen/datasources-internal-vs-external-types.

(cherry picked from commit 7bc9a8c6249300ded31ea931c463d0a8f798e193)
Signed-off-by: Reynold Xin 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a0f22cf2
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a0f22cf2
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a0f22cf2

Branch: refs/heads/branch-1.5
Commit: a0f22cf295a1d20814c5be6cc727e39e95a81c27
Parents: e5cea56
Author: Josh Rosen 
Authored: Tue Aug 25 01:06:36 2015 -0700
Committer: Reynold Xin 
Committed: Tue Aug 25 01:06:51 2015 -0700

--
 .../datasources/DataSourceStrategy.scala| 67 ++--
 .../execution/datasources/jdbc/JDBCRDD.scala|  2 +-
 .../datasources/parquet/ParquetFilters.scala| 19 +++---
 .../spark/sql/sources/FilteredScanSuite.scala   |  7 ++
 4 files changed, 54 insertions(+), 41 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/a0f22cf2/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala
--
diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala
 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala
index 2a4c40d..6c1ef6a 100644
--- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala
+++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala
@@ -20,7 +20,8 @@ package org.apache.spark.sql.execution.datasources
 import org.apache.spark.{Logging, TaskContext}
 import org.apache.spark.deploy.SparkHadoopUtil
 import org.apache.spark.rdd.{MapPartitionsRDD, RDD, UnionRDD}
-import org.apache.spark.sql.catalyst.{InternalRow, expressions}
+import org.apache.spark.sql.catalyst.CatalystTypeConverters.convertToScala
+import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow, 
expressions}
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.planning.PhysicalOperation
 import org.apache.spark.sql.catalyst.plans.logical
@@ -344,45 +345,47 @@ private[sql] object DataSourceStrategy extends Strategy 
with Logging {
*/
   protected[sql] def selectFilters(filters: Seq[Expression]) = {
 def translate(predicate: Expression): Option[Filter] = predicate match {
-  case expressions.EqualTo(a: Attribute, Literal(v, _)) =>
-Some(sources.EqualTo(a.name, v))
-  case expressions.EqualTo(Literal(v, _), a: Attribute) =>
-Some(sources.EqualTo(a.name, v))
-
-  case expressions.EqualNullSafe(a: Attribute, Literal(v, _)) =>
-Some(sources.EqualNullSafe(a.name, v))
-  case expressions.EqualNullSafe(Literal(v, _), a: Attribute) =>
-Some(sources.EqualNullSafe(a.name, v))
-
-  case expressions.GreaterThan(a: Attribute, Literal(v, _)) =>
-Some(sources.GreaterThan(a.name, v))
-  case expressions.GreaterThan(Literal(v, _), a: Attribute) =>
-Some(sources.LessThan(a.name, v))
-
-  case expressions.LessThan(a: Attribute, Literal(v, _)) =>
-Some(sources.LessThan(a.name, v))
-  case expressions.LessThan(Literal(v, _), a: Attribute) =>
-Some(sources.GreaterThan(a.name, v))
-
-  case expressions.GreaterThanOrEqual(a: Attribute, Literal(v, _)) =>
-Some(sources.GreaterThanOrEqual(a.name, v))
-  case expressions.GreaterThanOrEqual(Literal(v, _), a: Attribute) =>
-Some(sources.LessThanOrEqual(a.name, v))
-
-  case expressions.LessThanOrEqual(a: Attribute, Literal(v, _)) =>
-Some(sources.LessThanOrEqual(a.name, v))
-  case expressions.LessThanOrEqual(Literal(v, _), a: Attribute) =>
-Some(sources.GreaterThanOrEqual(a.name, v))
+  case expressions.EqualTo(a: Attribute, Literal(v, t)) =>
+Some(sources.EqualTo(a.name, convertToScala(v, t)))
+  case expressions.EqualTo(Literal(v, t), a: Attribute) =>
+Some(sources.EqualTo(a.name, convertToScala(v, t)))
+
+  case 

spark git commit: [SPARK-10195] [SQL] Data sources Filter should not expose internal types

2015-08-25 Thread rxin
Repository: spark
Updated Branches:
  refs/heads/master 2f493f7e3 -> 7bc9a8c62


[SPARK-10195] [SQL] Data sources Filter should not expose internal types

Spark SQL's data sources API exposes Catalyst's internal types through its 
Filter interfaces. This is a problem because types like UTF8String are not 
stable developer APIs and should not be exposed to third parties.

This issue caused incompatibilities when upgrading our `spark-redshift` library 
to work against Spark 1.5.0.  To avoid these issues in the future we should 
only expose public types through these Filter objects. This patch accomplishes 
this by using CatalystTypeConverters to add the appropriate conversions.

Author: Josh Rosen 

Closes #8403 from JoshRosen/datasources-internal-vs-external-types.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7bc9a8c6
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7bc9a8c6
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7bc9a8c6

Branch: refs/heads/master
Commit: 7bc9a8c6249300ded31ea931c463d0a8f798e193
Parents: 2f493f7
Author: Josh Rosen 
Authored: Tue Aug 25 01:06:36 2015 -0700
Committer: Reynold Xin 
Committed: Tue Aug 25 01:06:36 2015 -0700

--
 .../datasources/DataSourceStrategy.scala| 67 ++--
 .../execution/datasources/jdbc/JDBCRDD.scala|  2 +-
 .../datasources/parquet/ParquetFilters.scala| 19 +++---
 .../spark/sql/sources/FilteredScanSuite.scala   |  7 ++
 4 files changed, 54 insertions(+), 41 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/7bc9a8c6/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala
--
diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala
 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala
index 2a4c40d..6c1ef6a 100644
--- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala
+++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala
@@ -20,7 +20,8 @@ package org.apache.spark.sql.execution.datasources
 import org.apache.spark.{Logging, TaskContext}
 import org.apache.spark.deploy.SparkHadoopUtil
 import org.apache.spark.rdd.{MapPartitionsRDD, RDD, UnionRDD}
-import org.apache.spark.sql.catalyst.{InternalRow, expressions}
+import org.apache.spark.sql.catalyst.CatalystTypeConverters.convertToScala
+import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow, 
expressions}
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.planning.PhysicalOperation
 import org.apache.spark.sql.catalyst.plans.logical
@@ -344,45 +345,47 @@ private[sql] object DataSourceStrategy extends Strategy 
with Logging {
*/
   protected[sql] def selectFilters(filters: Seq[Expression]) = {
 def translate(predicate: Expression): Option[Filter] = predicate match {
-  case expressions.EqualTo(a: Attribute, Literal(v, _)) =>
-Some(sources.EqualTo(a.name, v))
-  case expressions.EqualTo(Literal(v, _), a: Attribute) =>
-Some(sources.EqualTo(a.name, v))
-
-  case expressions.EqualNullSafe(a: Attribute, Literal(v, _)) =>
-Some(sources.EqualNullSafe(a.name, v))
-  case expressions.EqualNullSafe(Literal(v, _), a: Attribute) =>
-Some(sources.EqualNullSafe(a.name, v))
-
-  case expressions.GreaterThan(a: Attribute, Literal(v, _)) =>
-Some(sources.GreaterThan(a.name, v))
-  case expressions.GreaterThan(Literal(v, _), a: Attribute) =>
-Some(sources.LessThan(a.name, v))
-
-  case expressions.LessThan(a: Attribute, Literal(v, _)) =>
-Some(sources.LessThan(a.name, v))
-  case expressions.LessThan(Literal(v, _), a: Attribute) =>
-Some(sources.GreaterThan(a.name, v))
-
-  case expressions.GreaterThanOrEqual(a: Attribute, Literal(v, _)) =>
-Some(sources.GreaterThanOrEqual(a.name, v))
-  case expressions.GreaterThanOrEqual(Literal(v, _), a: Attribute) =>
-Some(sources.LessThanOrEqual(a.name, v))
-
-  case expressions.LessThanOrEqual(a: Attribute, Literal(v, _)) =>
-Some(sources.LessThanOrEqual(a.name, v))
-  case expressions.LessThanOrEqual(Literal(v, _), a: Attribute) =>
-Some(sources.GreaterThanOrEqual(a.name, v))
+  case expressions.EqualTo(a: Attribute, Literal(v, t)) =>
+Some(sources.EqualTo(a.name, convertToScala(v, t)))
+  case expressions.EqualTo(Literal(v, t), a: Attribute) =>
+Some(sources.EqualTo(a.name, convertToScala(v, t)))
+
+  case expressions.EqualNullSafe(a: Attribute, Literal(v, t)) =>
+Some(sources.EqualNullSafe(a.name, conv

spark git commit: [SPARK-10177] [SQL] fix reading Timestamp in parquet from Hive

2015-08-25 Thread lian
Repository: spark
Updated Branches:
  refs/heads/branch-1.5 2032d6670 -> e5cea566a


[SPARK-10177] [SQL] fix reading Timestamp in parquet from Hive

We misunderstood the Julian day and nanoseconds-of-day values that Hive/Impala write into 
Parquet (as TimestampType): the two values overlap, so they cannot simply be added 
together.

To avoid confusing rounding during the conversion, we use `2440588` as the Julian Day of 
the Unix epoch (strictly speaking it is 2440587.5).
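
For reference, the conversions after this change behave like the standalone sketch below
(the constants mirror the diff; this is illustrative code, not a drop-in replacement for
DateTimeUtils):

```scala
// Sketch of the Julian-day <-> microseconds-since-epoch conversions after the fix.
object JulianDaySketch {
  final val JULIAN_DAY_OF_EPOCH = 2440588            // 2440587.5 rounded up, to match Hive
  final val SECONDS_PER_DAY     = 60 * 60 * 24L
  final val MICROS_PER_SECOND   = 1000L * 1000L
  final val NANOS_PER_SECOND    = MICROS_PER_SECOND * 1000L

  // Julian day + nanoseconds of that day -> microseconds since the Unix epoch.
  def fromJulianDay(day: Int, nanoseconds: Long): Long = {
    val seconds = (day - JULIAN_DAY_OF_EPOCH).toLong * SECONDS_PER_DAY
    seconds * MICROS_PER_SECOND + nanoseconds / 1000L
  }

  // Microseconds since the Unix epoch -> (Julian day, nanoseconds of that day).
  def toJulianDay(us: Long): (Int, Long) = {
    val seconds      = us / MICROS_PER_SECOND
    val day          = seconds / SECONDS_PER_DAY + JULIAN_DAY_OF_EPOCH
    val secondsInDay = seconds % SECONDS_PER_DAY
    val nanos        = (us % MICROS_PER_SECOND) * 1000L
    (day.toInt, secondsInDay * NANOS_PER_SECOND + nanos)
  }

  def main(args: Array[String]): Unit = {
    val (day, nanos) = toJulianDay(0L)
    assert(day == JULIAN_DAY_OF_EPOCH && nanos == 0L)  // epoch -> day 2440588, 0 ns
    assert(fromJulianDay(day, nanos) == 0L)            // and the round trip is exact
  }
}
```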

Author: Davies Liu 
Author: Cheng Lian 

Closes #8400 from davies/timestamp_parquet.

(cherry picked from commit 2f493f7e3924b769160a16f73cccbebf21973b91)
Signed-off-by: Cheng Lian 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e5cea566
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e5cea566
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e5cea566

Branch: refs/heads/branch-1.5
Commit: e5cea566a32d254adc9424a2f9e79b92eda3e6e4
Parents: 2032d66
Author: Davies Liu 
Authored: Tue Aug 25 16:00:44 2015 +0800
Committer: Cheng Lian 
Committed: Tue Aug 25 16:00:58 2015 +0800

--
 .../apache/spark/sql/catalyst/util/DateTimeUtils.scala |  7 ---
 .../spark/sql/catalyst/util/DateTimeUtilsSuite.scala   | 13 +
 .../spark/sql/hive/ParquetHiveCompatibilitySuite.scala |  2 +-
 3 files changed, 14 insertions(+), 8 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/e5cea566/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
--
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
index 6726204..d652fce 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
@@ -37,7 +37,8 @@ object DateTimeUtils {
   type SQLTimestamp = Long
 
   // see 
http://stackoverflow.com/questions/466321/convert-unix-timestamp-to-julian
-  final val JULIAN_DAY_OF_EPOCH = 2440587  // and .5
+  // it's 2440587.5, rounding up to compatible with Hive
+  final val JULIAN_DAY_OF_EPOCH = 2440588
   final val SECONDS_PER_DAY = 60 * 60 * 24L
   final val MICROS_PER_SECOND = 1000L * 1000L
   final val NANOS_PER_SECOND = MICROS_PER_SECOND * 1000L
@@ -183,7 +184,7 @@ object DateTimeUtils {
*/
   def fromJulianDay(day: Int, nanoseconds: Long): SQLTimestamp = {
 // use Long to avoid rounding errors
-val seconds = (day - JULIAN_DAY_OF_EPOCH).toLong * SECONDS_PER_DAY - 
SECONDS_PER_DAY / 2
+val seconds = (day - JULIAN_DAY_OF_EPOCH).toLong * SECONDS_PER_DAY
 seconds * MICROS_PER_SECOND + nanoseconds / 1000L
   }
 
@@ -191,7 +192,7 @@ object DateTimeUtils {
* Returns Julian day and nanoseconds in a day from the number of 
microseconds
*/
   def toJulianDay(us: SQLTimestamp): (Int, Long) = {
-val seconds = us / MICROS_PER_SECOND + SECONDS_PER_DAY / 2
+val seconds = us / MICROS_PER_SECOND
 val day = seconds / SECONDS_PER_DAY + JULIAN_DAY_OF_EPOCH
 val secondsInDay = seconds % SECONDS_PER_DAY
 val nanos = (us % MICROS_PER_SECOND) * 1000L

http://git-wip-us.apache.org/repos/asf/spark/blob/e5cea566/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala
--
diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala
 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala
index d18fa4d..1596bb7 100644
--- 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala
+++ 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala
@@ -49,13 +49,18 @@ class DateTimeUtilsSuite extends SparkFunSuite {
   test("us and julian day") {
 val (d, ns) = toJulianDay(0)
 assert(d === JULIAN_DAY_OF_EPOCH)
-assert(ns === SECONDS_PER_DAY / 2 * NANOS_PER_SECOND)
+assert(ns === 0)
 assert(fromJulianDay(d, ns) == 0L)
 
-val t = new Timestamp(6139477861L) // (2015, 6, 11, 10, 10, 10, 100)
+val t = Timestamp.valueOf("2015-06-11 10:10:10.100")
 val (d1, ns1) = toJulianDay(fromJavaTimestamp(t))
-val t2 = toJavaTimestamp(fromJulianDay(d1, ns1))
-assert(t.equals(t2))
+val t1 = toJavaTimestamp(fromJulianDay(d1, ns1))
+assert(t.equals(t1))
+
+val t2 = Timestamp.valueOf("2015-06-11 20:10:10.100")
+val (d2, ns2) = toJulianDay(fromJavaTimestamp(t2))
+val t22 = toJavaTimestamp(fromJulianDay(d2, ns2))
+assert(t2.equals(t22))
   }
 
   test("SPARK-6785: java date conversion before and

spark git commit: [SPARK-10177] [SQL] fix reading Timestamp in parquet from Hive

2015-08-25 Thread lian
Repository: spark
Updated Branches:
  refs/heads/master 1fc37581a -> 2f493f7e3


[SPARK-10177] [SQL] fix reading Timestamp in parquet from Hive

We misunderstood the Julian day and nanoseconds-of-day values that Hive/Impala write into 
Parquet (as TimestampType): the two values overlap, so they cannot simply be added 
together.

To avoid confusing rounding during the conversion, we use `2440588` as the Julian Day of 
the Unix epoch (strictly speaking it is 2440587.5).

Author: Davies Liu 
Author: Cheng Lian 

Closes #8400 from davies/timestamp_parquet.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2f493f7e
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2f493f7e
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2f493f7e

Branch: refs/heads/master
Commit: 2f493f7e3924b769160a16f73cccbebf21973b91
Parents: 1fc3758
Author: Davies Liu 
Authored: Tue Aug 25 16:00:44 2015 +0800
Committer: Cheng Lian 
Committed: Tue Aug 25 16:00:44 2015 +0800

--
 .../apache/spark/sql/catalyst/util/DateTimeUtils.scala |  7 ---
 .../spark/sql/catalyst/util/DateTimeUtilsSuite.scala   | 13 +
 .../spark/sql/hive/ParquetHiveCompatibilitySuite.scala |  2 +-
 3 files changed, 14 insertions(+), 8 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/2f493f7e/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
--
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
index 6726204..d652fce 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
@@ -37,7 +37,8 @@ object DateTimeUtils {
   type SQLTimestamp = Long
 
   // see 
http://stackoverflow.com/questions/466321/convert-unix-timestamp-to-julian
-  final val JULIAN_DAY_OF_EPOCH = 2440587  // and .5
+  // it's 2440587.5, rounding up to compatible with Hive
+  final val JULIAN_DAY_OF_EPOCH = 2440588
   final val SECONDS_PER_DAY = 60 * 60 * 24L
   final val MICROS_PER_SECOND = 1000L * 1000L
   final val NANOS_PER_SECOND = MICROS_PER_SECOND * 1000L
@@ -183,7 +184,7 @@ object DateTimeUtils {
*/
   def fromJulianDay(day: Int, nanoseconds: Long): SQLTimestamp = {
 // use Long to avoid rounding errors
-val seconds = (day - JULIAN_DAY_OF_EPOCH).toLong * SECONDS_PER_DAY - 
SECONDS_PER_DAY / 2
+val seconds = (day - JULIAN_DAY_OF_EPOCH).toLong * SECONDS_PER_DAY
 seconds * MICROS_PER_SECOND + nanoseconds / 1000L
   }
 
@@ -191,7 +192,7 @@ object DateTimeUtils {
* Returns Julian day and nanoseconds in a day from the number of 
microseconds
*/
   def toJulianDay(us: SQLTimestamp): (Int, Long) = {
-val seconds = us / MICROS_PER_SECOND + SECONDS_PER_DAY / 2
+val seconds = us / MICROS_PER_SECOND
 val day = seconds / SECONDS_PER_DAY + JULIAN_DAY_OF_EPOCH
 val secondsInDay = seconds % SECONDS_PER_DAY
 val nanos = (us % MICROS_PER_SECOND) * 1000L

http://git-wip-us.apache.org/repos/asf/spark/blob/2f493f7e/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala
--
diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala
 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala
index d18fa4d..1596bb7 100644
--- 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala
+++ 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala
@@ -49,13 +49,18 @@ class DateTimeUtilsSuite extends SparkFunSuite {
   test("us and julian day") {
 val (d, ns) = toJulianDay(0)
 assert(d === JULIAN_DAY_OF_EPOCH)
-assert(ns === SECONDS_PER_DAY / 2 * NANOS_PER_SECOND)
+assert(ns === 0)
 assert(fromJulianDay(d, ns) == 0L)
 
-val t = new Timestamp(6139477861L) // (2015, 6, 11, 10, 10, 10, 100)
+val t = Timestamp.valueOf("2015-06-11 10:10:10.100")
 val (d1, ns1) = toJulianDay(fromJavaTimestamp(t))
-val t2 = toJavaTimestamp(fromJulianDay(d1, ns1))
-assert(t.equals(t2))
+val t1 = toJavaTimestamp(fromJulianDay(d1, ns1))
+assert(t.equals(t1))
+
+val t2 = Timestamp.valueOf("2015-06-11 20:10:10.100")
+val (d2, ns2) = toJulianDay(fromJavaTimestamp(t2))
+val t22 = toJavaTimestamp(fromJulianDay(d2, ns2))
+assert(t2.equals(t22))
   }
 
   test("SPARK-6785: java date conversion before and after epoch") {

http://git-wip-us.apache.org/repos/asf/spark/blob/2f493f7e/sql/hive/src/test/scala/org/

spark git commit: [SPARK-10210] [STREAMING] Filter out non-existent blocks before creating BlockRDD

2015-08-25 Thread tdas
Repository: spark
Updated Branches:
  refs/heads/branch-1.5 4841ebb18 -> 2032d6670


[SPARK-10210] [STREAMING] Filter out non-existent blocks before creating 
BlockRDD

When the write ahead log is not enabled, a recovered streaming driver still tries to run 
jobs using pre-failure block ids, and fails because those blocks no longer exist in memory 
(and cannot be recovered, since the receiver WAL is not enabled).

This occurs because the driver-side WAL of ReceivedBlockTracker recovers the past block 
information, and ReceiverInputDStream creates BlockRDDs even if those blocks do not exist.

The solution in this PR is to filter out block ids that do not exist before creating the 
BlockRDD. In addition, it adds unit tests to verify other logic in ReceiverInputDStream.
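
In essence, the fix adds a presence check before the BlockRDD is built. A simplified
sketch of that filtering step (BlockId and isPresentInBlockManager are stand-ins for
Spark's BlockId and the BlockManagerMaster lookup used in the diff below):

```scala
// Sketch: keep only the recovered block ids that the block manager still knows
// about, and warn when some of them had to be dropped.
final case class BlockId(name: String)  // stand-in for Spark's BlockId

def filterValidBlockIds(
    blockIds: Seq[BlockId],
    isPresentInBlockManager: BlockId => Boolean): Seq[BlockId] = {
  val validBlockIds = blockIds.filter(isPresentInBlockManager)
  if (validBlockIds.size != blockIds.size) {
    println(s"WARN: ${blockIds.size - validBlockIds.size} block(s) were not found in " +
      "memory and will be skipped; enable the Write Ahead Log to prevent such data loss.")
  }
  validBlockIds
}

// Two ids recovered from the driver-side WAL, only one still present in memory.
val present = Set(BlockId("input-0-1"))
val valid = filterValidBlockIds(
  Seq(BlockId("input-0-1"), BlockId("input-0-2")), present.contains)
assert(valid == Seq(BlockId("input-0-1")))
```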

Author: Tathagata Das 

Closes #8405 from tdas/SPARK-10210.

(cherry picked from commit 1fc37581a52530bac5d555dbf14927a5780c3b75)
Signed-off-by: Tathagata Das 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2032d667
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2032d667
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2032d667

Branch: refs/heads/branch-1.5
Commit: 2032d66706d165079550f06bf695e0b08be7e143
Parents: 4841ebb
Author: Tathagata Das 
Authored: Tue Aug 25 00:35:51 2015 -0700
Committer: Tathagata Das 
Committed: Tue Aug 25 00:36:01 2015 -0700

--
 .../dstream/ReceiverInputDStream.scala  |  10 +-
 .../rdd/WriteAheadLogBackedBlockRDD.scala   |   2 +-
 .../streaming/ReceiverInputDStreamSuite.scala   | 156 +++
 3 files changed, 166 insertions(+), 2 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/2032d667/streaming/src/main/scala/org/apache/spark/streaming/dstream/ReceiverInputDStream.scala
--
diff --git 
a/streaming/src/main/scala/org/apache/spark/streaming/dstream/ReceiverInputDStream.scala
 
b/streaming/src/main/scala/org/apache/spark/streaming/dstream/ReceiverInputDStream.scala
index a158009..6c139f3 100644
--- 
a/streaming/src/main/scala/org/apache/spark/streaming/dstream/ReceiverInputDStream.scala
+++ 
b/streaming/src/main/scala/org/apache/spark/streaming/dstream/ReceiverInputDStream.scala
@@ -116,7 +116,15 @@ abstract class ReceiverInputDStream[T: 
ClassTag](@transient ssc_ : StreamingCont
 logWarning("Some blocks have Write Ahead Log information; this is 
unexpected")
   }
 }
-new BlockRDD[T](ssc.sc, blockIds)
+val validBlockIds = blockIds.filter { id =>
+  ssc.sparkContext.env.blockManager.master.contains(id)
+}
+if (validBlockIds.size != blockIds.size) {
+  logWarning("Some blocks could not be recovered as they were not 
found in memory. " +
+"To prevent such data loss, enabled Write Ahead Log (see 
programming guide " +
+"for more details.")
+}
+new BlockRDD[T](ssc.sc, validBlockIds)
   }
 } else {
   // If no block is ready now, creating WriteAheadLogBackedBlockRDD or 
BlockRDD

http://git-wip-us.apache.org/repos/asf/spark/blob/2032d667/streaming/src/main/scala/org/apache/spark/streaming/rdd/WriteAheadLogBackedBlockRDD.scala
--
diff --git 
a/streaming/src/main/scala/org/apache/spark/streaming/rdd/WriteAheadLogBackedBlockRDD.scala
 
b/streaming/src/main/scala/org/apache/spark/streaming/rdd/WriteAheadLogBackedBlockRDD.scala
index 620b8a3..e081ffe 100644
--- 
a/streaming/src/main/scala/org/apache/spark/streaming/rdd/WriteAheadLogBackedBlockRDD.scala
+++ 
b/streaming/src/main/scala/org/apache/spark/streaming/rdd/WriteAheadLogBackedBlockRDD.scala
@@ -75,7 +75,7 @@ private[streaming]
 class WriteAheadLogBackedBlockRDD[T: ClassTag](
 @transient sc: SparkContext,
 @transient blockIds: Array[BlockId],
-@transient walRecordHandles: Array[WriteAheadLogRecordHandle],
+@transient val walRecordHandles: Array[WriteAheadLogRecordHandle],
 @transient isBlockIdValid: Array[Boolean] = Array.empty,
 storeInBlockManager: Boolean = false,
 storageLevel: StorageLevel = StorageLevel.MEMORY_ONLY_SER)

http://git-wip-us.apache.org/repos/asf/spark/blob/2032d667/streaming/src/test/scala/org/apache/spark/streaming/ReceiverInputDStreamSuite.scala
--
diff --git 
a/streaming/src/test/scala/org/apache/spark/streaming/ReceiverInputDStreamSuite.scala
 
b/streaming/src/test/scala/org/apache/spark/streaming/ReceiverInputDStreamSuite.scala
new file mode 100644
index 000..6d388d9
--- /dev/null
+++ 
b/streaming/src/test/scala/org/apache/spark/streaming/ReceiverInputDStreamSuite.scala
@@ -0,0 +1,156 @@
+/*
+ * Licensed to the Apache Software Founda

spark git commit: [SPARK-10210] [STREAMING] Filter out non-existent blocks before creating BlockRDD

2015-08-25 Thread tdas
Repository: spark
Updated Branches:
  refs/heads/master 57b960bf3 -> 1fc37581a


[SPARK-10210] [STREAMING] Filter out non-existent blocks before creating 
BlockRDD

When the write ahead log is not enabled, a recovered streaming driver still tries to run 
jobs using pre-failure block ids, and fails because those blocks no longer exist in memory 
(and cannot be recovered, since the receiver WAL is not enabled).

This occurs because the driver-side WAL of ReceivedBlockTracker recovers the past block 
information, and ReceiverInputDStream creates BlockRDDs even if those blocks do not exist.

The solution in this PR is to filter out block ids that do not exist before creating the 
BlockRDD. In addition, it adds unit tests to verify other logic in ReceiverInputDStream.

Author: Tathagata Das 

Closes #8405 from tdas/SPARK-10210.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1fc37581
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1fc37581
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1fc37581

Branch: refs/heads/master
Commit: 1fc37581a52530bac5d555dbf14927a5780c3b75
Parents: 57b960b
Author: Tathagata Das 
Authored: Tue Aug 25 00:35:51 2015 -0700
Committer: Tathagata Das 
Committed: Tue Aug 25 00:35:51 2015 -0700

--
 .../dstream/ReceiverInputDStream.scala  |  10 +-
 .../rdd/WriteAheadLogBackedBlockRDD.scala   |   2 +-
 .../streaming/ReceiverInputDStreamSuite.scala   | 156 +++
 3 files changed, 166 insertions(+), 2 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/1fc37581/streaming/src/main/scala/org/apache/spark/streaming/dstream/ReceiverInputDStream.scala
--
diff --git 
a/streaming/src/main/scala/org/apache/spark/streaming/dstream/ReceiverInputDStream.scala
 
b/streaming/src/main/scala/org/apache/spark/streaming/dstream/ReceiverInputDStream.scala
index a158009..6c139f3 100644
--- 
a/streaming/src/main/scala/org/apache/spark/streaming/dstream/ReceiverInputDStream.scala
+++ 
b/streaming/src/main/scala/org/apache/spark/streaming/dstream/ReceiverInputDStream.scala
@@ -116,7 +116,15 @@ abstract class ReceiverInputDStream[T: 
ClassTag](@transient ssc_ : StreamingCont
 logWarning("Some blocks have Write Ahead Log information; this is 
unexpected")
   }
 }
-new BlockRDD[T](ssc.sc, blockIds)
+val validBlockIds = blockIds.filter { id =>
+  ssc.sparkContext.env.blockManager.master.contains(id)
+}
+if (validBlockIds.size != blockIds.size) {
+  logWarning("Some blocks could not be recovered as they were not 
found in memory. " +
+"To prevent such data loss, enabled Write Ahead Log (see 
programming guide " +
+"for more details.")
+}
+new BlockRDD[T](ssc.sc, validBlockIds)
   }
 } else {
   // If no block is ready now, creating WriteAheadLogBackedBlockRDD or 
BlockRDD

http://git-wip-us.apache.org/repos/asf/spark/blob/1fc37581/streaming/src/main/scala/org/apache/spark/streaming/rdd/WriteAheadLogBackedBlockRDD.scala
--
diff --git 
a/streaming/src/main/scala/org/apache/spark/streaming/rdd/WriteAheadLogBackedBlockRDD.scala
 
b/streaming/src/main/scala/org/apache/spark/streaming/rdd/WriteAheadLogBackedBlockRDD.scala
index 620b8a3..e081ffe 100644
--- 
a/streaming/src/main/scala/org/apache/spark/streaming/rdd/WriteAheadLogBackedBlockRDD.scala
+++ 
b/streaming/src/main/scala/org/apache/spark/streaming/rdd/WriteAheadLogBackedBlockRDD.scala
@@ -75,7 +75,7 @@ private[streaming]
 class WriteAheadLogBackedBlockRDD[T: ClassTag](
 @transient sc: SparkContext,
 @transient blockIds: Array[BlockId],
-@transient walRecordHandles: Array[WriteAheadLogRecordHandle],
+@transient val walRecordHandles: Array[WriteAheadLogRecordHandle],
 @transient isBlockIdValid: Array[Boolean] = Array.empty,
 storeInBlockManager: Boolean = false,
 storageLevel: StorageLevel = StorageLevel.MEMORY_ONLY_SER)

http://git-wip-us.apache.org/repos/asf/spark/blob/1fc37581/streaming/src/test/scala/org/apache/spark/streaming/ReceiverInputDStreamSuite.scala
--
diff --git 
a/streaming/src/test/scala/org/apache/spark/streaming/ReceiverInputDStreamSuite.scala
 
b/streaming/src/test/scala/org/apache/spark/streaming/ReceiverInputDStreamSuite.scala
new file mode 100644
index 000..6d388d9
--- /dev/null
+++ 
b/streaming/src/test/scala/org/apache/spark/streaming/ReceiverInputDStreamSuite.scala
@@ -0,0 +1,156 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * t

spark git commit: [SPARK-6196] [BUILD] Remove MapR profiles in favor of hadoop-provided

2015-08-25 Thread srowen
Repository: spark
Updated Branches:
  refs/heads/branch-1.5 76d920f2b -> 4841ebb18


[SPARK-6196] [BUILD] Remove MapR profiles in favor of hadoop-provided

Follow up to https://github.com/apache/spark/pull/7047

pwendell mentioned that MapR should use `hadoop-provided` now, and indeed the new build 
script no longer produces `mapr3`/`mapr4` artifacts. Hence the appropriate action seems 
to be to remove these profiles, which are no longer used.

CC trystanleftwich

Author: Sean Owen 

Closes #8338 from srowen/SPARK-6196.

(cherry picked from commit 57b960bf3706728513f9e089455a533f0244312e)
Signed-off-by: Sean Owen 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4841ebb1
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4841ebb1
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4841ebb1

Branch: refs/heads/branch-1.5
Commit: 4841ebb1861025067a1108c11f64bb144427a308
Parents: 76d920f
Author: Sean Owen 
Authored: Tue Aug 25 08:32:20 2015 +0100
Committer: Sean Owen 
Committed: Tue Aug 25 08:32:31 2015 +0100

--
 pom.xml | 38 --
 1 file changed, 38 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/4841ebb1/pom.xml
--
diff --git a/pom.xml b/pom.xml
index ccfa1ea..a597f86 100644
--- a/pom.xml
+++ b/pom.xml
@@ -2382,44 +2382,6 @@
 
 
 
-  mapr3
-  
-1.0.3-mapr-3.0.3
-2.4.1-mapr-1408
-0.98.4-mapr-1408
-3.4.5-mapr-1406
-  
-
-
-
-  mapr4
-  
-2.4.1-mapr-1408
-2.4.1-mapr-1408
-0.98.4-mapr-1408
-3.4.5-mapr-1406
-  
-  
-
-  org.apache.curator
-  curator-recipes
-  ${curator.version}
-  
-
-  org.apache.zookeeper
-  zookeeper
-
-  
-
-
-  org.apache.zookeeper
-  zookeeper
-  3.4.5-mapr-1406
-
-  
-
-
-
   hive-thriftserver
   
 sql/hive-thriftserver





spark git commit: [SPARK-6196] [BUILD] Remove MapR profiles in favor of hadoop-provided

2015-08-25 Thread srowen
Repository: spark
Updated Branches:
  refs/heads/master d4549fe58 -> 57b960bf3


[SPARK-6196] [BUILD] Remove MapR profiles in favor of hadoop-provided

Follow up to https://github.com/apache/spark/pull/7047

pwendell mentioned that MapR should use `hadoop-provided` now, and indeed the new build 
script no longer produces `mapr3`/`mapr4` artifacts. Hence the appropriate action seems 
to be to remove these profiles, which are no longer used.

CC trystanleftwich

Author: Sean Owen 

Closes #8338 from srowen/SPARK-6196.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/57b960bf
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/57b960bf
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/57b960bf

Branch: refs/heads/master
Commit: 57b960bf3706728513f9e089455a533f0244312e
Parents: d4549fe
Author: Sean Owen 
Authored: Tue Aug 25 08:32:20 2015 +0100
Committer: Sean Owen 
Committed: Tue Aug 25 08:32:20 2015 +0100

--
 pom.xml | 38 --
 1 file changed, 38 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/57b960bf/pom.xml
--
diff --git a/pom.xml b/pom.xml
index d5945f2..0716016 100644
--- a/pom.xml
+++ b/pom.xml
@@ -2387,44 +2387,6 @@
 
 
 
-  mapr3
-  
-1.0.3-mapr-3.0.3
-2.4.1-mapr-1408
-0.98.4-mapr-1408
-3.4.5-mapr-1406
-  
-
-
-
-  mapr4
-  
-2.4.1-mapr-1408
-2.4.1-mapr-1408
-0.98.4-mapr-1408
-3.4.5-mapr-1406
-  
-  
-
-  org.apache.curator
-  curator-recipes
-  ${curator.version}
-  
-
-  org.apache.zookeeper
-  zookeeper
-
-  
-
-
-  org.apache.zookeeper
-  zookeeper
-  3.4.5-mapr-1406
-
-  
-
-
-
   hive-thriftserver
   
 sql/hive-thriftserver





spark git commit: [SPARK-10214] [SPARKR] [DOCS] Improve SparkR Column, DataFrame API docs

2015-08-25 Thread shivaram
Repository: spark
Updated Branches:
  refs/heads/branch-1.5 b7c4ff144 -> 76d920f2b


[SPARK-10214] [SPARKR] [DOCS] Improve SparkR Column, DataFrame API docs

cc: shivaram

## Summary

- Add name tags to each method in DataFrame.R and column.R
- Replace `rdname column` with `rdname {each_func}`, e.g. for the alias method:
  `rdname column` => `rdname alias`

## Generated PDF File
https://drive.google.com/file/d/0B9biIZIU47lLNHN2aFpnQXlSeGs/view?usp=sharing

## JIRA
[[SPARK-10214] Improve SparkR Column, DataFrame API docs - ASF 
JIRA](https://issues.apache.org/jira/browse/SPARK-10214)

Author: Yu ISHIKAWA 

Closes #8414 from yu-iskw/SPARK-10214.

(cherry picked from commit d4549fe58fa0d781e0e891bceff893420cb1d598)
Signed-off-by: Shivaram Venkataraman 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/76d920f2
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/76d920f2
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/76d920f2

Branch: refs/heads/branch-1.5
Commit: 76d920f2b814304051dd76f0ca78301e872fc811
Parents: b7c4ff1
Author: Yu ISHIKAWA 
Authored: Tue Aug 25 00:28:51 2015 -0700
Committer: Shivaram Venkataraman 
Committed: Tue Aug 25 00:28:58 2015 -0700

--
 R/pkg/R/DataFrame.R | 101 +--
 R/pkg/R/column.R|  40 +--
 R/pkg/R/generics.R  |   2 +-
 3 files changed, 109 insertions(+), 34 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/76d920f2/R/pkg/R/DataFrame.R
--
diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index 8956032..10f3c4e 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -27,9 +27,10 @@ setOldClass("jobj")
 #'  \code{jsonFile}, \code{table} etc.
 #' @rdname DataFrame
 #' @seealso jsonFile, table
+#' @docType class
 #'
-#' @param env An R environment that stores bookkeeping states of the DataFrame
-#' @param sdf A Java object reference to the backing Scala DataFrame
+#' @slot env An R environment that stores bookkeeping states of the DataFrame
+#' @slot sdf A Java object reference to the backing Scala DataFrame
 #' @export
 setClass("DataFrame",
  slots = list(env = "environment",
@@ -61,6 +62,7 @@ dataFrame <- function(sdf, isCached = FALSE) {
 #' @param x A SparkSQL DataFrame
 #'
 #' @rdname printSchema
+#' @name printSchema
 #' @export
 #' @examples
 #'\dontrun{
@@ -84,6 +86,7 @@ setMethod("printSchema",
 #' @param x A SparkSQL DataFrame
 #'
 #' @rdname schema
+#' @name schema
 #' @export
 #' @examples
 #'\dontrun{
@@ -106,6 +109,7 @@ setMethod("schema",
 #' @param x A SparkSQL DataFrame
 #' @param extended Logical. If extended is False, explain() only prints the 
physical plan.
 #' @rdname explain
+#' @name explain
 #' @export
 #' @examples
 #'\dontrun{
@@ -135,6 +139,7 @@ setMethod("explain",
 #' @param x A SparkSQL DataFrame
 #'
 #' @rdname isLocal
+#' @name isLocal
 #' @export
 #' @examples
 #'\dontrun{
@@ -158,6 +163,7 @@ setMethod("isLocal",
 #' @param numRows The number of rows to print. Defaults to 20.
 #'
 #' @rdname showDF
+#' @name showDF
 #' @export
 #' @examples
 #'\dontrun{
@@ -181,6 +187,7 @@ setMethod("showDF",
 #' @param x A SparkSQL DataFrame
 #'
 #' @rdname show
+#' @name show
 #' @export
 #' @examples
 #'\dontrun{
@@ -206,6 +213,7 @@ setMethod("show", "DataFrame",
 #' @param x A SparkSQL DataFrame
 #'
 #' @rdname dtypes
+#' @name dtypes
 #' @export
 #' @examples
 #'\dontrun{
@@ -230,6 +238,8 @@ setMethod("dtypes",
 #' @param x A SparkSQL DataFrame
 #'
 #' @rdname columns
+#' @name columns
+#' @aliases names
 #' @export
 #' @examples
 #'\dontrun{
@@ -248,7 +258,7 @@ setMethod("columns",
   })
 
 #' @rdname columns
-#' @aliases names,DataFrame,function-method
+#' @name names
 setMethod("names",
   signature(x = "DataFrame"),
   function(x) {
@@ -256,6 +266,7 @@ setMethod("names",
   })
 
 #' @rdname columns
+#' @name names<-
 setMethod("names<-",
   signature(x = "DataFrame"),
   function(x, value) {
@@ -273,6 +284,7 @@ setMethod("names<-",
 #' @param tableName A character vector containing the name of the table
 #'
 #' @rdname registerTempTable
+#' @name registerTempTable
 #' @export
 #' @examples
 #'\dontrun{
@@ -299,6 +311,7 @@ setMethod("registerTempTable",
 #' the existing rows in the table.
 #'
 #' @rdname insertInto
+#' @name insertInto
 #' @export
 #' @examples
 #'\dontrun{
@@ -321,7 +334,8 @@ setMethod("insertInto",
 #'
 #' @param x A SparkSQL DataFrame
 #'
-#' @rdname cache-methods
+#' @rdname cache
+#' @name cache
 #' @export
 #' @examples
 #'\dontrun{
@@ -347,6 +361,7 @@ setMethod("cache",
 #'
 #' @param x The DataFrame to persist
 #' @rdname persist
+#' @name persist
 #' @export
 #' @examples
 #'\dontrun{
@@ -372,6 +387,7

spark git commit: [SPARK-10214] [SPARKR] [DOCS] Improve SparkR Column, DataFrame API docs

2015-08-25 Thread shivaram
Repository: spark
Updated Branches:
  refs/heads/master 82268f07a -> d4549fe58


[SPARK-10214] [SPARKR] [DOCS] Improve SparkR Column, DataFrame API docs

cc: shivaram

## Summary

- Add name tags to each method in DataFrame.R and column.R
- Replace `rdname column` with `rdname {each_func}`, e.g. for the alias method:
  `rdname column` => `rdname alias`

## Generated PDF File
https://drive.google.com/file/d/0B9biIZIU47lLNHN2aFpnQXlSeGs/view?usp=sharing

## JIRA
[[SPARK-10214] Improve SparkR Column, DataFrame API docs - ASF 
JIRA](https://issues.apache.org/jira/browse/SPARK-10214)

Author: Yu ISHIKAWA 

Closes #8414 from yu-iskw/SPARK-10214.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d4549fe5
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d4549fe5
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d4549fe5

Branch: refs/heads/master
Commit: d4549fe58fa0d781e0e891bceff893420cb1d598
Parents: 82268f0
Author: Yu ISHIKAWA 
Authored: Tue Aug 25 00:28:51 2015 -0700
Committer: Shivaram Venkataraman 
Committed: Tue Aug 25 00:28:51 2015 -0700

--
 R/pkg/R/DataFrame.R | 101 +--
 R/pkg/R/column.R|  40 +--
 R/pkg/R/generics.R  |   2 +-
 3 files changed, 109 insertions(+), 34 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/d4549fe5/R/pkg/R/DataFrame.R
--
diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index 8956032..10f3c4e 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -27,9 +27,10 @@ setOldClass("jobj")
 #'  \code{jsonFile}, \code{table} etc.
 #' @rdname DataFrame
 #' @seealso jsonFile, table
+#' @docType class
 #'
-#' @param env An R environment that stores bookkeeping states of the DataFrame
-#' @param sdf A Java object reference to the backing Scala DataFrame
+#' @slot env An R environment that stores bookkeeping states of the DataFrame
+#' @slot sdf A Java object reference to the backing Scala DataFrame
 #' @export
 setClass("DataFrame",
  slots = list(env = "environment",
@@ -61,6 +62,7 @@ dataFrame <- function(sdf, isCached = FALSE) {
 #' @param x A SparkSQL DataFrame
 #'
 #' @rdname printSchema
+#' @name printSchema
 #' @export
 #' @examples
 #'\dontrun{
@@ -84,6 +86,7 @@ setMethod("printSchema",
 #' @param x A SparkSQL DataFrame
 #'
 #' @rdname schema
+#' @name schema
 #' @export
 #' @examples
 #'\dontrun{
@@ -106,6 +109,7 @@ setMethod("schema",
 #' @param x A SparkSQL DataFrame
 #' @param extended Logical. If extended is False, explain() only prints the 
physical plan.
 #' @rdname explain
+#' @name explain
 #' @export
 #' @examples
 #'\dontrun{
@@ -135,6 +139,7 @@ setMethod("explain",
 #' @param x A SparkSQL DataFrame
 #'
 #' @rdname isLocal
+#' @name isLocal
 #' @export
 #' @examples
 #'\dontrun{
@@ -158,6 +163,7 @@ setMethod("isLocal",
 #' @param numRows The number of rows to print. Defaults to 20.
 #'
 #' @rdname showDF
+#' @name showDF
 #' @export
 #' @examples
 #'\dontrun{
@@ -181,6 +187,7 @@ setMethod("showDF",
 #' @param x A SparkSQL DataFrame
 #'
 #' @rdname show
+#' @name show
 #' @export
 #' @examples
 #'\dontrun{
@@ -206,6 +213,7 @@ setMethod("show", "DataFrame",
 #' @param x A SparkSQL DataFrame
 #'
 #' @rdname dtypes
+#' @name dtypes
 #' @export
 #' @examples
 #'\dontrun{
@@ -230,6 +238,8 @@ setMethod("dtypes",
 #' @param x A SparkSQL DataFrame
 #'
 #' @rdname columns
+#' @name columns
+#' @aliases names
 #' @export
 #' @examples
 #'\dontrun{
@@ -248,7 +258,7 @@ setMethod("columns",
   })
 
 #' @rdname columns
-#' @aliases names,DataFrame,function-method
+#' @name names
 setMethod("names",
   signature(x = "DataFrame"),
   function(x) {
@@ -256,6 +266,7 @@ setMethod("names",
   })
 
 #' @rdname columns
+#' @name names<-
 setMethod("names<-",
   signature(x = "DataFrame"),
   function(x, value) {
@@ -273,6 +284,7 @@ setMethod("names<-",
 #' @param tableName A character vector containing the name of the table
 #'
 #' @rdname registerTempTable
+#' @name registerTempTable
 #' @export
 #' @examples
 #'\dontrun{
@@ -299,6 +311,7 @@ setMethod("registerTempTable",
 #' the existing rows in the table.
 #'
 #' @rdname insertInto
+#' @name insertInto
 #' @export
 #' @examples
 #'\dontrun{
@@ -321,7 +334,8 @@ setMethod("insertInto",
 #'
 #' @param x A SparkSQL DataFrame
 #'
-#' @rdname cache-methods
+#' @rdname cache
+#' @name cache
 #' @export
 #' @examples
 #'\dontrun{
@@ -347,6 +361,7 @@ setMethod("cache",
 #'
 #' @param x The DataFrame to persist
 #' @rdname persist
+#' @name persist
 #' @export
 #' @examples
 #'\dontrun{
@@ -372,6 +387,7 @@ setMethod("persist",
 #' @param x The DataFrame to unpersist
 #' @param blocking Whether to block until all bloc

spark git commit: [SPARK-9293] [SPARK-9813] Analysis should check that set operations are only performed on tables with equal numbers of columns

2015-08-25 Thread marmbrus
Repository: spark
Updated Branches:
  refs/heads/branch-1.5 95a14e9f2 -> b7c4ff144


[SPARK-9293] [SPARK-9813] Analysis should check that set operations are only 
performed on tables with equal numbers of columns

This patch adds an analyzer rule to ensure that set operations (union, 
intersect, and except) are only applied to tables with the same number of 
columns. Without this rule, there are scenarios where invalid queries can 
return incorrect results instead of failing with error messages; SPARK-9813 
provides one example of this problem. In other cases, the invalid query can 
crash at runtime with extremely confusing exceptions.

I also performed a bit of cleanup to refactor some of those logical operators' 
code into a common `SetOperation` base class.
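
For example, a query like `SELECT a FROM t1 UNION SELECT a, b FROM t2` (hypothetical
tables) now fails analysis with a clear message instead of misbehaving later. A standalone
sketch of the arity check itself (Attribute, LogicalPlan, and Relation are simplified
stand-ins for the Catalyst classes):

```scala
// Standalone sketch of the new check: a set operation is only valid when both
// sides produce the same number of columns (simplified stand-in types).
final case class Attribute(name: String)
sealed trait LogicalPlan { def output: Seq[Attribute] }
final case class Relation(output: Seq[Attribute]) extends LogicalPlan

def checkSetOperation(nodeName: String, left: LogicalPlan, right: LogicalPlan): Unit =
  if (left.output.length != right.output.length) {
    throw new IllegalArgumentException(
      s"$nodeName can only be performed on tables with the same number of columns, " +
        s"but the left table has ${left.output.length} columns and the right has " +
        s"${right.output.length}")
  }

// e.g. SELECT a FROM t1 UNION SELECT a, b FROM t2  -- rejected during analysis
val t1 = Relation(Seq(Attribute("a")))
val t2 = Relation(Seq(Attribute("a"), Attribute("b")))
checkSetOperation("Union", t1, t2)  // throws instead of producing a wrong answer later
```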

Author: Josh Rosen 

Closes #7631 from JoshRosen/SPARK-9293.

(cherry picked from commit 82268f07abfa658869df2354ae72f8d6ddd119e8)
Signed-off-by: Michael Armbrust 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b7c4ff14
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b7c4ff14
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b7c4ff14

Branch: refs/heads/branch-1.5
Commit: b7c4ff144e783e80adc1efc2c28965c2e739dd5e
Parents: 95a14e9
Author: Josh Rosen 
Authored: Tue Aug 25 00:04:10 2015 -0700
Committer: Michael Armbrust 
Committed: Tue Aug 25 00:04:23 2015 -0700

--
 .../sql/catalyst/analysis/CheckAnalysis.scala   |  6 
 .../catalyst/analysis/HiveTypeCoercion.scala| 14 +++-
 .../catalyst/plans/logical/basicOperators.scala | 38 +---
 .../catalyst/analysis/AnalysisErrorSuite.scala  | 18 ++
 .../spark/sql/hive/HiveMetastoreCatalog.scala   |  2 +-
 .../hive/execution/InsertIntoHiveTable.scala|  2 +-
 6 files changed, 48 insertions(+), 32 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/b7c4ff14/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
--
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
index 39f554c..7701fd0 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
@@ -137,6 +137,12 @@ trait CheckAnalysis {
   }
 }
 
+  case s @ SetOperation(left, right) if left.output.length != 
right.output.length =>
+failAnalysis(
+  s"${s.nodeName} can only be performed on tables with the same 
number of columns, " +
+   s"but the left table has ${left.output.length} columns and the 
right has " +
+   s"${right.output.length}")
+
   case _ => // Fallbacks to the following checks
 }
 

http://git-wip-us.apache.org/repos/asf/spark/blob/b7c4ff14/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
--
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
index 2cb067f..a1aa2a2 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
@@ -203,6 +203,7 @@ object HiveTypeCoercion {
 planName: String,
 left: LogicalPlan,
 right: LogicalPlan): (LogicalPlan, LogicalPlan) = {
+  require(left.output.length == right.output.length)
 
   val castedTypes = left.output.zip(right.output).map {
 case (lhs, rhs) if lhs.dataType != rhs.dataType =>
@@ -229,15 +230,10 @@ object HiveTypeCoercion {
 def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators {
   case p if p.analyzed => p
 
-  case u @ Union(left, right) if u.childrenResolved && !u.resolved =>
-val (newLeft, newRight) = widenOutputTypes(u.nodeName, left, right)
-Union(newLeft, newRight)
-  case e @ Except(left, right) if e.childrenResolved && !e.resolved =>
-val (newLeft, newRight) = widenOutputTypes(e.nodeName, left, right)
-Except(newLeft, newRight)
-  case i @ Intersect(left, right) if i.childrenResolved && !i.resolved =>
-val (newLeft, newRight) = widenOutputTypes(i.nodeName, left, right)
-Intersect(newLeft, newRight)
+  case s @ SetOperation(left, right) if s.childrenResolved
+  && left.output.length == right.output.length && !s.resolved =>
+val (ne

spark git commit: [SPARK-9293] [SPARK-9813] Analysis should check that set operations are only performed on tables with equal numbers of columns

2015-08-25 Thread marmbrus
Repository: spark
Updated Branches:
  refs/heads/master bf03fe68d -> 82268f07a


[SPARK-9293] [SPARK-9813] Analysis should check that set operations are only 
performed on tables with equal numbers of columns

This patch adds an analyzer rule to ensure that set operations (union, 
intersect, and except) are only applied to tables with the same number of 
columns. Without this rule, there are scenarios where invalid queries can 
return incorrect results instead of failing with error messages; SPARK-9813 
provides one example of this problem. In other cases, the invalid query can 
crash at runtime with extremely confusing exceptions.

I also performed a bit of cleanup to refactor some of those logical operators' 
code into a common `SetOperation` base class.

Author: Josh Rosen 

Closes #7631 from JoshRosen/SPARK-9293.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/82268f07
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/82268f07
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/82268f07

Branch: refs/heads/master
Commit: 82268f07abfa658869df2354ae72f8d6ddd119e8
Parents: bf03fe6
Author: Josh Rosen 
Authored: Tue Aug 25 00:04:10 2015 -0700
Committer: Michael Armbrust 
Committed: Tue Aug 25 00:04:10 2015 -0700

--
 .../sql/catalyst/analysis/CheckAnalysis.scala   |  6 
 .../catalyst/analysis/HiveTypeCoercion.scala| 14 +++-
 .../catalyst/plans/logical/basicOperators.scala | 38 +---
 .../catalyst/analysis/AnalysisErrorSuite.scala  | 18 ++
 .../spark/sql/hive/HiveMetastoreCatalog.scala   |  2 +-
 .../hive/execution/InsertIntoHiveTable.scala|  2 +-
 6 files changed, 48 insertions(+), 32 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/82268f07/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
--
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
index 39f554c..7701fd0 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
@@ -137,6 +137,12 @@ trait CheckAnalysis {
   }
 }
 
+  case s @ SetOperation(left, right) if left.output.length != 
right.output.length =>
+failAnalysis(
+  s"${s.nodeName} can only be performed on tables with the same 
number of columns, " +
+   s"but the left table has ${left.output.length} columns and the 
right has " +
+   s"${right.output.length}")
+
   case _ => // Fallbacks to the following checks
 }
 

http://git-wip-us.apache.org/repos/asf/spark/blob/82268f07/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
--
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
index 2cb067f..a1aa2a2 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala
@@ -203,6 +203,7 @@ object HiveTypeCoercion {
 planName: String,
 left: LogicalPlan,
 right: LogicalPlan): (LogicalPlan, LogicalPlan) = {
+  require(left.output.length == right.output.length)
 
   val castedTypes = left.output.zip(right.output).map {
 case (lhs, rhs) if lhs.dataType != rhs.dataType =>
@@ -229,15 +230,10 @@ object HiveTypeCoercion {
 def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators {
   case p if p.analyzed => p
 
-  case u @ Union(left, right) if u.childrenResolved && !u.resolved =>
-val (newLeft, newRight) = widenOutputTypes(u.nodeName, left, right)
-Union(newLeft, newRight)
-  case e @ Except(left, right) if e.childrenResolved && !e.resolved =>
-val (newLeft, newRight) = widenOutputTypes(e.nodeName, left, right)
-Except(newLeft, newRight)
-  case i @ Intersect(left, right) if i.childrenResolved && !i.resolved =>
-val (newLeft, newRight) = widenOutputTypes(i.nodeName, left, right)
-Intersect(newLeft, newRight)
+  case s @ SetOperation(left, right) if s.childrenResolved
+  && left.output.length == right.output.length && !s.resolved =>
+val (newLeft, newRight) = widenOutputTypes(s.nodeName, left, right)
+s.makeCopy(Array(newLeft, newRight))