spark git commit: [SPARK-9316] [SPARKR] Add support for filtering using `[` (synonym for filter / select)
Repository: spark
Updated Branches:
  refs/heads/branch-1.5 21a10a86d -> 5220db9e3

[SPARK-9316] [SPARKR] Add support for filtering using `[` (synonym for filter / select)

Add support for

```
df[df$name == "Smith", c(1,2)]
df[df$age %in% c(19, 30), 1:2]
```

shivaram

Author: felixcheung

Closes #8394 from felixcheung/rsubset.

(cherry picked from commit 75d4773aa50e24972c533e8b48697fde586429eb)
Signed-off-by: Shivaram Venkataraman

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5220db9e
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5220db9e
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5220db9e

Branch: refs/heads/branch-1.5
Commit: 5220db9e352b5d5eae59cead9478ca0a9f73f16b
Parents: 21a10a8
Author: felixcheung
Authored: Tue Aug 25 23:48:16 2015 -0700
Committer: Shivaram Venkataraman
Committed: Tue Aug 25 23:48:27 2015 -0700

--
 R/pkg/R/DataFrame.R              | 22 +-
 R/pkg/inst/tests/test_sparkSQL.R | 27 +++
 2 files changed, 48 insertions(+), 1 deletion(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/5220db9e/R/pkg/R/DataFrame.R
--
diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index 10f3c4e..1d870ec 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -954,9 +954,11 @@ setMethod("$<-", signature(x = "DataFrame"),
             x
           })
 
+setClassUnion("numericOrcharacter", c("numeric", "character"))
+
 #' @rdname select
 #' @name [[
-setMethod("[[", signature(x = "DataFrame"),
+setMethod("[[", signature(x = "DataFrame", i = "numericOrcharacter"),
           function(x, i) {
             if (is.numeric(i)) {
               cols <- columns(x)
@@ -979,6 +981,20 @@ setMethod("[", signature(x = "DataFrame", i = "missing"),
             select(x, j)
           })
 
+#' @rdname select
+#' @name [
+setMethod("[", signature(x = "DataFrame", i = "Column"),
+          function(x, i, j, ...) {
+            # It could handle i as "character" but it seems confusing and not required
+            # https://stat.ethz.ch/R-manual/R-devel/library/base/html/Extract.data.frame.html
+            filtered <- filter(x, i)
+            if (!missing(j)) {
+              filtered[, j]
+            } else {
+              filtered
+            }
+          })
+
 #' Select
 #'
 #' Selects a set of columns with names or Column expressions.
@@ -997,8 +1013,12 @@ setMethod("[", signature(x = "DataFrame", i = "missing"),
 #' # Columns can also be selected using `[[` and `[`
 #' df[[2]] == df[["age"]]
 #' df[,2] == df[,"age"]
+#' df[,c("name", "age")]
 #' # Similar to R data frames columns can also be selected using `$`
 #' df$age
+#' # It can also be subset on rows and Columns
+#' df[df$name == "Smith", c(1,2)]
+#' df[df$age %in% c(19, 30), 1:2]
 #' }
 setMethod("select", signature(x = "DataFrame", col = "character"),
           function(x, col, ...) {

http://git-wip-us.apache.org/repos/asf/spark/blob/5220db9e/R/pkg/inst/tests/test_sparkSQL.R
--
diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R
index 556b8c5..ee48a3d 100644
--- a/R/pkg/inst/tests/test_sparkSQL.R
+++ b/R/pkg/inst/tests/test_sparkSQL.R
@@ -587,6 +587,33 @@ test_that("select with column", {
   expect_equal(collect(select(df3, "x"))[[1, 1]], "x")
 })
 
+test_that("subsetting", {
+  # jsonFile returns columns in random order
+  df <- select(jsonFile(sqlContext, jsonPath), "name", "age")
+  filtered <- df[df$age > 20,]
+  expect_equal(count(filtered), 1)
+  expect_equal(columns(filtered), c("name", "age"))
+  expect_equal(collect(filtered)$name, "Andy")
+
+  df2 <- df[df$age == 19, 1]
+  expect_is(df2, "DataFrame")
+  expect_equal(count(df2), 1)
+  expect_equal(columns(df2), c("name"))
+  expect_equal(collect(df2)$name, "Justin")
+
+  df3 <- df[df$age > 20, 2]
+  expect_equal(count(df3), 1)
+  expect_equal(columns(df3), c("age"))
+
+  df4 <- df[df$age %in% c(19, 30), 1:2]
+  expect_equal(count(df4), 2)
+  expect_equal(columns(df4), c("name", "age"))
+
+  df5 <- df[df$age %in% c(19), c(1,2)]
+  expect_equal(count(df5), 1)
+  expect_equal(columns(df5), c("name", "age"))
+})
+
 test_that("selectExpr() on a DataFrame", {
   df <- jsonFile(sqlContext, jsonPath)
   selected <- selectExpr(df, "age * 2")
spark git commit: [SPARK-9316] [SPARKR] Add support for filtering using `[` (synonym for filter / select)
Repository: spark
Updated Branches:
  refs/heads/master 321d77596 -> 75d4773aa

[SPARK-9316] [SPARKR] Add support for filtering using `[` (synonym for filter / select)

Add support for

```
df[df$name == "Smith", c(1,2)]
df[df$age %in% c(19, 30), 1:2]
```

shivaram

Author: felixcheung

Closes #8394 from felixcheung/rsubset.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/75d4773a
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/75d4773a
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/75d4773a

Branch: refs/heads/master
Commit: 75d4773aa50e24972c533e8b48697fde586429eb
Parents: 321d775
Author: felixcheung
Authored: Tue Aug 25 23:48:16 2015 -0700
Committer: Shivaram Venkataraman
Committed: Tue Aug 25 23:48:16 2015 -0700

--
 R/pkg/R/DataFrame.R              | 22 +-
 R/pkg/inst/tests/test_sparkSQL.R | 27 +++
 2 files changed, 48 insertions(+), 1 deletion(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/75d4773a/R/pkg/R/DataFrame.R
--
diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index ae1d912..a5162de 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -985,9 +985,11 @@ setMethod("$<-", signature(x = "DataFrame"),
             x
           })
 
+setClassUnion("numericOrcharacter", c("numeric", "character"))
+
 #' @rdname select
 #' @name [[
-setMethod("[[", signature(x = "DataFrame"),
+setMethod("[[", signature(x = "DataFrame", i = "numericOrcharacter"),
           function(x, i) {
             if (is.numeric(i)) {
               cols <- columns(x)
@@ -1010,6 +1012,20 @@ setMethod("[", signature(x = "DataFrame", i = "missing"),
             select(x, j)
           })
 
+#' @rdname select
+#' @name [
+setMethod("[", signature(x = "DataFrame", i = "Column"),
+          function(x, i, j, ...) {
+            # It could handle i as "character" but it seems confusing and not required
+            # https://stat.ethz.ch/R-manual/R-devel/library/base/html/Extract.data.frame.html
+            filtered <- filter(x, i)
+            if (!missing(j)) {
+              filtered[, j]
+            } else {
+              filtered
+            }
+          })
+
 #' Select
 #'
 #' Selects a set of columns with names or Column expressions.
@@ -1028,8 +1044,12 @@ setMethod("[", signature(x = "DataFrame", i = "missing"),
 #' # Columns can also be selected using `[[` and `[`
 #' df[[2]] == df[["age"]]
 #' df[,2] == df[,"age"]
+#' df[,c("name", "age")]
 #' # Similar to R data frames columns can also be selected using `$`
 #' df$age
+#' # It can also be subset on rows and Columns
+#' df[df$name == "Smith", c(1,2)]
+#' df[df$age %in% c(19, 30), 1:2]
 #' }
 setMethod("select", signature(x = "DataFrame", col = "character"),
           function(x, col, ...) {

http://git-wip-us.apache.org/repos/asf/spark/blob/75d4773a/R/pkg/inst/tests/test_sparkSQL.R
--
diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R
index 556b8c5..ee48a3d 100644
--- a/R/pkg/inst/tests/test_sparkSQL.R
+++ b/R/pkg/inst/tests/test_sparkSQL.R
@@ -587,6 +587,33 @@ test_that("select with column", {
   expect_equal(collect(select(df3, "x"))[[1, 1]], "x")
 })
 
+test_that("subsetting", {
+  # jsonFile returns columns in random order
+  df <- select(jsonFile(sqlContext, jsonPath), "name", "age")
+  filtered <- df[df$age > 20,]
+  expect_equal(count(filtered), 1)
+  expect_equal(columns(filtered), c("name", "age"))
+  expect_equal(collect(filtered)$name, "Andy")
+
+  df2 <- df[df$age == 19, 1]
+  expect_is(df2, "DataFrame")
+  expect_equal(count(df2), 1)
+  expect_equal(columns(df2), c("name"))
+  expect_equal(collect(df2)$name, "Justin")
+
+  df3 <- df[df$age > 20, 2]
+  expect_equal(count(df3), 1)
+  expect_equal(columns(df3), c("age"))
+
+  df4 <- df[df$age %in% c(19, 30), 1:2]
+  expect_equal(count(df4), 2)
+  expect_equal(columns(df4), c("name", "age"))
+
+  df5 <- df[df$age %in% c(19), c(1,2)]
+  expect_equal(count(df5), 1)
+  expect_equal(columns(df5), c("name", "age"))
+})
+
 test_that("selectExpr() on a DataFrame", {
   df <- jsonFile(sqlContext, jsonPath)
   selected <- selectExpr(df, "age * 2")
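The new `[` method is a thin composition of the existing verbs: `df[i, j]` filters rows by the `Column` predicate `i` and then selects the columns `j`; omitting `j` is plain `filter`. For readers working in the Scala API, here is a minimal sketch of the equivalent call chain (illustrative only; it assumes a local Spark 1.5 build, and the sample rows are made up to match the `name`/`age` schema used in the tests):

```
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext

object SubsettingEquivalent {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("subset-demo").setMaster("local[2]"))
    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._

    // Hypothetical sample data mirroring the people.json fixture used in the tests.
    val df = sc.parallelize(Seq(("Michael", 29), ("Andy", 30), ("Justin", 19)))
      .toDF("name", "age")

    // SparkR: df[df$age %in% c(19, 30), 1:2]
    // is filter-then-select, which in the Scala DataFrame API reads:
    val subset = df.filter(df("age").isin(19, 30)).select("name", "age")
    subset.show()

    sc.stop()
  }
}
```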
spark git commit: [SPARK-10236] [MLLIB] update since versions in mllib.feature
Repository: spark
Updated Branches:
  refs/heads/master 4657fa1f3 -> 321d77596

[SPARK-10236] [MLLIB] update since versions in mllib.feature

Same as #8421 but for `mllib.feature`.

cc dbtsai

Author: Xiangrui Meng

Closes #8449 from mengxr/SPARK-10236.feature and squashes the following commits:

0e8d658 [Xiangrui Meng] remove unnecessary comment
ad70b03 [Xiangrui Meng] update since versions in mllib.feature

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/321d7759
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/321d7759
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/321d7759

Branch: refs/heads/master
Commit: 321d7759691bed9867b1f0470f12eab2faa50aff
Parents: 4657fa1
Author: Xiangrui Meng
Authored: Tue Aug 25 23:45:41 2015 -0700
Committer: DB Tsai
Committed: Tue Aug 25 23:45:41 2015 -0700

--
 .../mllib/clustering/PowerIterationClustering.scala     |  2 --
 .../org/apache/spark/mllib/feature/ChiSqSelector.scala  |  4 ++--
 .../apache/spark/mllib/feature/ElementwiseProduct.scala |  3 ++-
 .../main/scala/org/apache/spark/mllib/feature/IDF.scala |  6 --
 .../org/apache/spark/mllib/feature/Normalizer.scala     |  2 +-
 .../main/scala/org/apache/spark/mllib/feature/PCA.scala |  7 +--
 .../org/apache/spark/mllib/feature/StandardScaler.scala | 12 ++--
 .../scala/org/apache/spark/mllib/feature/Word2Vec.scala |  1 +
 8 files changed, 21 insertions(+), 16 deletions(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/321d7759/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala
index da234bd..6c76e26 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala
@@ -71,8 +71,6 @@ object PowerIterationClusteringModel extends Loader[PowerIterationClusteringMode
     private[clustering] val thisClassName =
       "org.apache.spark.mllib.clustering.PowerIterationClusteringModel"
 
-    /**
-     */
     @Since("1.4.0")
     def save(sc: SparkContext, model: PowerIterationClusteringModel, path: String): Unit = {
       val sqlContext = new SQLContext(sc)

http://git-wip-us.apache.org/repos/asf/spark/blob/321d7759/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
index fdd974d..4743cfd 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
@@ -33,7 +33,7 @@ import org.apache.spark.rdd.RDD
  */
 @Since("1.3.0")
 @Experimental
-class ChiSqSelectorModel (
+class ChiSqSelectorModel @Since("1.3.0") (
     @Since("1.3.0") val selectedFeatures: Array[Int]) extends VectorTransformer {
 
   require(isSorted(selectedFeatures), "Array has to be sorted asc")
 
@@ -112,7 +112,7 @@ class ChiSqSelectorModel (
  */
 @Since("1.3.0")
 @Experimental
-class ChiSqSelector (
+class ChiSqSelector @Since("1.3.0") (
     @Since("1.3.0") val numTopFeatures: Int) extends Serializable {
 
   /**

http://git-wip-us.apache.org/repos/asf/spark/blob/321d7759/mllib/src/main/scala/org/apache/spark/mllib/feature/ElementwiseProduct.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/ElementwiseProduct.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/ElementwiseProduct.scala
index 33e2d17..d0a6cf6 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/ElementwiseProduct.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/ElementwiseProduct.scala
@@ -29,7 +29,8 @@ import org.apache.spark.mllib.linalg._
  */
 @Since("1.4.0")
 @Experimental
-class ElementwiseProduct(val scalingVec: Vector) extends VectorTransformer {
+class ElementwiseProduct @Since("1.4.0") (
+    @Since("1.4.0") val scalingVec: Vector) extends VectorTransformer {
 
   /**
    * Does the hadamard product transformation.

http://git-wip-us.apache.org/repos/asf/spark/blob/321d7759/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala
index d5353
spark git commit: [SPARK-10236] [MLLIB] update since versions in mllib.feature
Repository: spark
Updated Branches:
  refs/heads/branch-1.5 08d390f45 -> 21a10a86d

[SPARK-10236] [MLLIB] update since versions in mllib.feature

Same as #8421 but for `mllib.feature`.

cc dbtsai

Author: Xiangrui Meng

Closes #8449 from mengxr/SPARK-10236.feature and squashes the following commits:

0e8d658 [Xiangrui Meng] remove unnecessary comment
ad70b03 [Xiangrui Meng] update since versions in mllib.feature

(cherry picked from commit 321d7759691bed9867b1f0470f12eab2faa50aff)
Signed-off-by: DB Tsai

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/21a10a86
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/21a10a86
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/21a10a86

Branch: refs/heads/branch-1.5
Commit: 21a10a86d20ec1a6fea42286b4d2aae9ce7e848d
Parents: 08d390f
Author: Xiangrui Meng
Authored: Tue Aug 25 23:45:41 2015 -0700
Committer: DB Tsai
Committed: Tue Aug 25 23:45:53 2015 -0700

--
 .../mllib/clustering/PowerIterationClustering.scala     |  2 --
 .../org/apache/spark/mllib/feature/ChiSqSelector.scala  |  4 ++--
 .../apache/spark/mllib/feature/ElementwiseProduct.scala |  3 ++-
 .../main/scala/org/apache/spark/mllib/feature/IDF.scala |  6 --
 .../org/apache/spark/mllib/feature/Normalizer.scala     |  2 +-
 .../main/scala/org/apache/spark/mllib/feature/PCA.scala |  7 +--
 .../org/apache/spark/mllib/feature/StandardScaler.scala | 12 ++--
 .../scala/org/apache/spark/mllib/feature/Word2Vec.scala |  1 +
 8 files changed, 21 insertions(+), 16 deletions(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/21a10a86/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala
index da234bd..6c76e26 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/PowerIterationClustering.scala
@@ -71,8 +71,6 @@ object PowerIterationClusteringModel extends Loader[PowerIterationClusteringMode
     private[clustering] val thisClassName =
       "org.apache.spark.mllib.clustering.PowerIterationClusteringModel"
 
-    /**
-     */
     @Since("1.4.0")
     def save(sc: SparkContext, model: PowerIterationClusteringModel, path: String): Unit = {
       val sqlContext = new SQLContext(sc)

http://git-wip-us.apache.org/repos/asf/spark/blob/21a10a86/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
index fdd974d..4743cfd 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala
@@ -33,7 +33,7 @@ import org.apache.spark.rdd.RDD
  */
 @Since("1.3.0")
 @Experimental
-class ChiSqSelectorModel (
+class ChiSqSelectorModel @Since("1.3.0") (
     @Since("1.3.0") val selectedFeatures: Array[Int]) extends VectorTransformer {
 
   require(isSorted(selectedFeatures), "Array has to be sorted asc")
 
@@ -112,7 +112,7 @@ class ChiSqSelectorModel (
  */
 @Since("1.3.0")
 @Experimental
-class ChiSqSelector (
+class ChiSqSelector @Since("1.3.0") (
     @Since("1.3.0") val numTopFeatures: Int) extends Serializable {
 
   /**

http://git-wip-us.apache.org/repos/asf/spark/blob/21a10a86/mllib/src/main/scala/org/apache/spark/mllib/feature/ElementwiseProduct.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/ElementwiseProduct.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/ElementwiseProduct.scala
index 33e2d17..d0a6cf6 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/ElementwiseProduct.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/ElementwiseProduct.scala
@@ -29,7 +29,8 @@ import org.apache.spark.mllib.linalg._
  */
 @Since("1.4.0")
 @Experimental
-class ElementwiseProduct(val scalingVec: Vector) extends VectorTransformer {
+class ElementwiseProduct @Since("1.4.0") (
+    @Since("1.4.0") val scalingVec: Vector) extends VectorTransformer {
 
   /**
    * Does the hadamard product transformation.

http://git-wip-us.apache.org/repos/asf/spark/blob/21a10a86/mllib/src/main/scala/org/apache/spark/mllib/feature/IDF.scala
--
diff --git a/mllib/src/main/scala/org/apache/spar
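Every one of these "update since versions" commits applies the same mechanical pattern: where a class already carries a class-level `@Since`, the primary constructor and each public constructor `val` receive their own annotations, so that a first-appearance version is recorded for every API element rather than only for the class. A minimal sketch of the pattern, using a simplified stand-in for `org.apache.spark.annotation.Since` (illustrative only; `ExampleModel` is hypothetical):

```
import scala.annotation.StaticAnnotation

// Simplified stand-in for org.apache.spark.annotation.Since (illustration only):
// it records the Spark version in which an API element first appeared.
class Since(version: String) extends StaticAnnotation

// Before: class ChiSqSelectorModel (val selectedFeatures: Array[Int]) ...
// After, as in the diffs above: the primary constructor and the public val
// are annotated individually, in addition to the class itself.
class ExampleModel @Since("1.3.0") (
    @Since("1.3.0") val selectedFeatures: Array[Int]) extends Serializable
```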
spark git commit: [SPARK-10235] [MLLIB] update since versions in mllib.regression
Repository: spark
Updated Branches:
  refs/heads/master fb7e12fe2 -> 4657fa1f3

[SPARK-10235] [MLLIB] update since versions in mllib.regression

Same as #8421 but for `mllib.regression`.

cc freeman-lab dbtsai

Author: Xiangrui Meng

Closes #8426 from mengxr/SPARK-10235 and squashes the following commits:

6cd28e4 [Xiangrui Meng] update since versions in mllib.regression

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4657fa1f
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4657fa1f
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4657fa1f

Branch: refs/heads/master
Commit: 4657fa1f37d41dd4c7240a960342b68c7c591f48
Parents: fb7e12f
Author: Xiangrui Meng
Authored: Tue Aug 25 22:49:33 2015 -0700
Committer: DB Tsai
Committed: Tue Aug 25 22:49:33 2015 -0700

--
 .../regression/GeneralizedLinearAlgorithm.scala     |  6 --
 .../spark/mllib/regression/IsotonicRegression.scala | 16 +---
 .../spark/mllib/regression/LabeledPoint.scala       |  5 +++--
 .../org/apache/spark/mllib/regression/Lasso.scala   |  9 ++---
 .../spark/mllib/regression/LinearRegression.scala   |  9 ++---
 .../spark/mllib/regression/RidgeRegression.scala    | 12 +++-
 .../mllib/regression/StreamingLinearAlgorithm.scala |  8 +++-
 .../StreamingLinearRegressionWithSGD.scala          | 11 +--
 8 files changed, 47 insertions(+), 29 deletions(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/4657fa1f/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala
index 509f6a2..7e3b4d5 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala
@@ -38,7 +38,9 @@ import org.apache.spark.storage.StorageLevel
  */
 @Since("0.8.0")
 @DeveloperApi
-abstract class GeneralizedLinearModel(val weights: Vector, val intercept: Double)
+abstract class GeneralizedLinearModel @Since("1.0.0") (
+    @Since("1.0.0") val weights: Vector,
+    @Since("0.8.0") val intercept: Double)
   extends Serializable {
 
   /**
@@ -107,7 +109,7 @@ abstract class GeneralizedLinearAlgorithm[M <: GeneralizedLinearModel]
    * The optimizer to solve the problem.
    *
    */
-  @Since("1.0.0")
+  @Since("0.8.0")
   def optimizer: Optimizer
 
   /** Whether to add intercept (default: false).
    */

http://git-wip-us.apache.org/repos/asf/spark/blob/4657fa1f/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala
index 31ca7c2..877d31b 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala
@@ -50,10 +50,10 @@ import org.apache.spark.sql.SQLContext
  */
 @Since("1.3.0")
 @Experimental
-class IsotonicRegressionModel (
-    val boundaries: Array[Double],
-    val predictions: Array[Double],
-    val isotonic: Boolean) extends Serializable with Saveable {
+class IsotonicRegressionModel @Since("1.3.0") (
+    @Since("1.3.0") val boundaries: Array[Double],
+    @Since("1.3.0") val predictions: Array[Double],
+    @Since("1.3.0") val isotonic: Boolean) extends Serializable with Saveable {
 
   private val predictionOrd = if (isotonic) Ordering[Double] else Ordering[Double].reverse
 
@@ -63,7 +63,6 @@ class IsotonicRegressionModel (
 
   /**
    * A Java-friendly constructor that takes two Iterable parameters and one Boolean parameter.
-   *
    */
   @Since("1.4.0")
   def this(boundaries: java.lang.Iterable[Double],
@@ -214,8 +213,6 @@ object IsotonicRegressionModel extends Loader[IsotonicRegressionModel] {
     }
   }
 
-  /**
-   */
   @Since("1.4.0")
   override def load(sc: SparkContext, path: String): IsotonicRegressionModel = {
     implicit val formats = DefaultFormats
@@ -256,6 +253,7 @@ object IsotonicRegressionModel extends Loader[IsotonicRegressionModel] {
  * @see [[http://en.wikipedia.org/wiki/Isotonic_regression Isotonic regression (Wikipedia)]]
  */
 @Experimental
+@Since("1.3.0")
 class IsotonicRegression private (private var isotonic: Boolean) extends Serializable {
 
   /**
@@ -263,6 +261,7 @@ class IsotonicRegression private (private var isotonic: Boolean) extends Seriali
    *
    *
spark git commit: [SPARK-10235] [MLLIB] update since versions in mllib.regression
Repository: spark
Updated Branches:
  refs/heads/branch-1.5 6d8ebc801 -> 08d390f45

[SPARK-10235] [MLLIB] update since versions in mllib.regression

Same as #8421 but for `mllib.regression`.

cc freeman-lab dbtsai

Author: Xiangrui Meng

Closes #8426 from mengxr/SPARK-10235 and squashes the following commits:

6cd28e4 [Xiangrui Meng] update since versions in mllib.regression

(cherry picked from commit 4657fa1f37d41dd4c7240a960342b68c7c591f48)
Signed-off-by: DB Tsai

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/08d390f4
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/08d390f4
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/08d390f4

Branch: refs/heads/branch-1.5
Commit: 08d390f457f80ffdc2dfce61ea579d9026047f12
Parents: 6d8ebc8
Author: Xiangrui Meng
Authored: Tue Aug 25 22:49:33 2015 -0700
Committer: DB Tsai
Committed: Tue Aug 25 22:49:46 2015 -0700

--
 .../regression/GeneralizedLinearAlgorithm.scala     |  6 --
 .../spark/mllib/regression/IsotonicRegression.scala | 16 +---
 .../spark/mllib/regression/LabeledPoint.scala       |  5 +++--
 .../org/apache/spark/mllib/regression/Lasso.scala   |  9 ++---
 .../spark/mllib/regression/LinearRegression.scala   |  9 ++---
 .../spark/mllib/regression/RidgeRegression.scala    | 12 +++-
 .../mllib/regression/StreamingLinearAlgorithm.scala |  8 +++-
 .../StreamingLinearRegressionWithSGD.scala          | 11 +--
 8 files changed, 47 insertions(+), 29 deletions(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/08d390f4/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala
index 509f6a2..7e3b4d5 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala
@@ -38,7 +38,9 @@ import org.apache.spark.storage.StorageLevel
  */
 @Since("0.8.0")
 @DeveloperApi
-abstract class GeneralizedLinearModel(val weights: Vector, val intercept: Double)
+abstract class GeneralizedLinearModel @Since("1.0.0") (
+    @Since("1.0.0") val weights: Vector,
+    @Since("0.8.0") val intercept: Double)
   extends Serializable {
 
   /**
@@ -107,7 +109,7 @@ abstract class GeneralizedLinearAlgorithm[M <: GeneralizedLinearModel]
    * The optimizer to solve the problem.
    *
    */
-  @Since("1.0.0")
+  @Since("0.8.0")
   def optimizer: Optimizer
 
   /** Whether to add intercept (default: false).
    */

http://git-wip-us.apache.org/repos/asf/spark/blob/08d390f4/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala
index 31ca7c2..877d31b 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala
@@ -50,10 +50,10 @@ import org.apache.spark.sql.SQLContext
  */
 @Since("1.3.0")
 @Experimental
-class IsotonicRegressionModel (
-    val boundaries: Array[Double],
-    val predictions: Array[Double],
-    val isotonic: Boolean) extends Serializable with Saveable {
+class IsotonicRegressionModel @Since("1.3.0") (
+    @Since("1.3.0") val boundaries: Array[Double],
+    @Since("1.3.0") val predictions: Array[Double],
+    @Since("1.3.0") val isotonic: Boolean) extends Serializable with Saveable {
 
   private val predictionOrd = if (isotonic) Ordering[Double] else Ordering[Double].reverse
 
@@ -63,7 +63,6 @@ class IsotonicRegressionModel (
 
   /**
    * A Java-friendly constructor that takes two Iterable parameters and one Boolean parameter.
-   *
   */
   @Since("1.4.0")
   def this(boundaries: java.lang.Iterable[Double],
@@ -214,8 +213,6 @@ object IsotonicRegressionModel extends Loader[IsotonicRegressionModel] {
     }
   }
 
-  /**
-   */
   @Since("1.4.0")
   override def load(sc: SparkContext, path: String): IsotonicRegressionModel = {
     implicit val formats = DefaultFormats
@@ -256,6 +253,7 @@ object IsotonicRegressionModel extends Loader[IsotonicRegressionModel] {
  * @see [[http://en.wikipedia.org/wiki/Isotonic_regression Isotonic regression (Wikipedia)]]
  */
 @Experimental
+@Since("1.3.0")
 class IsotonicRegression private (private var isotonic: Boolean) extends Serializable {
 
   /**
@@ -263,6 +
spark git commit: [SPARK-10243] [MLLIB] update since versions in mllib.tree
Repository: spark
Updated Branches:
  refs/heads/branch-1.5 be0c9915c -> 6d8ebc801

[SPARK-10243] [MLLIB] update since versions in mllib.tree

Same as #8421 but for `mllib.tree`.

cc jkbradley

Author: Xiangrui Meng

Closes #8442 from mengxr/SPARK-10236.

(cherry picked from commit fb7e12fe2e14af8de4c206ca8096b2e8113bfddc)
Signed-off-by: Xiangrui Meng

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6d8ebc80
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6d8ebc80
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6d8ebc80

Branch: refs/heads/branch-1.5
Commit: 6d8ebc801799714d297c83be6935b37e26dc2df7
Parents: be0c991
Author: Xiangrui Meng
Authored: Tue Aug 25 22:35:49 2015 -0700
Committer: Xiangrui Meng
Committed: Tue Aug 25 22:35:56 2015 -0700

--
 .../apache/spark/mllib/tree/DecisionTree.scala  |  3 +-
 .../spark/mllib/tree/GradientBoostedTrees.scala |  2 +-
 .../spark/mllib/tree/configuration/Algo.scala   |  2 ++
 .../tree/configuration/BoostingStrategy.scala   | 12
 .../mllib/tree/configuration/FeatureType.scala  |  2 ++
 .../tree/configuration/QuantileStrategy.scala   |  2 ++
 .../mllib/tree/configuration/Strategy.scala     | 29 ++--
 .../mllib/tree/model/DecisionTreeModel.scala    |  5 +++-
 .../apache/spark/mllib/tree/model/Node.scala    | 18 ++--
 .../apache/spark/mllib/tree/model/Predict.scala |  6 ++--
 .../apache/spark/mllib/tree/model/Split.scala   |  8 +++---
 .../mllib/tree/model/treeEnsembleModels.scala   | 12
 12 files changed, 57 insertions(+), 44 deletions(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/6d8ebc80/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala
index 9728410..4a77d4a 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala
@@ -46,7 +46,8 @@ import org.apache.spark.util.random.XORShiftRandom
  */
 @Since("1.0.0")
 @Experimental
-class DecisionTree (private val strategy: Strategy) extends Serializable with Logging {
+class DecisionTree @Since("1.0.0") (private val strategy: Strategy)
+  extends Serializable with Logging {
 
   strategy.assertValid()

http://git-wip-us.apache.org/repos/asf/spark/blob/6d8ebc80/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala
index e750408..95ed48c 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala
@@ -51,7 +51,7 @@ import org.apache.spark.storage.StorageLevel
  */
 @Since("1.2.0")
 @Experimental
-class GradientBoostedTrees(private val boostingStrategy: BoostingStrategy)
+class GradientBoostedTrees @Since("1.2.0") (private val boostingStrategy: BoostingStrategy)
   extends Serializable with Logging {
 
   /**

http://git-wip-us.apache.org/repos/asf/spark/blob/6d8ebc80/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Algo.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Algo.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Algo.scala
index 8301ad1..853c731 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Algo.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Algo.scala
@@ -26,7 +26,9 @@ import org.apache.spark.annotation.{Experimental, Since}
 @Since("1.0.0")
 @Experimental
 object Algo extends Enumeration {
+  @Since("1.0.0")
   type Algo = Value
+  @Since("1.0.0")
   val Classification, Regression = Value
 
   private[mllib] def fromString(name: String): Algo = name match {

http://git-wip-us.apache.org/repos/asf/spark/blob/6d8ebc80/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala
index 7c56998..b5c72fb 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/configurati
spark git commit: [SPARK-10243] [MLLIB] update since versions in mllib.tree
Repository: spark
Updated Branches:
  refs/heads/master d703372f8 -> fb7e12fe2

[SPARK-10243] [MLLIB] update since versions in mllib.tree

Same as #8421 but for `mllib.tree`.

cc jkbradley

Author: Xiangrui Meng

Closes #8442 from mengxr/SPARK-10236.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/fb7e12fe
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/fb7e12fe
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/fb7e12fe

Branch: refs/heads/master
Commit: fb7e12fe2e14af8de4c206ca8096b2e8113bfddc
Parents: d703372
Author: Xiangrui Meng
Authored: Tue Aug 25 22:35:49 2015 -0700
Committer: Xiangrui Meng
Committed: Tue Aug 25 22:35:49 2015 -0700

--
 .../apache/spark/mllib/tree/DecisionTree.scala  |  3 +-
 .../spark/mllib/tree/GradientBoostedTrees.scala |  2 +-
 .../spark/mllib/tree/configuration/Algo.scala   |  2 ++
 .../tree/configuration/BoostingStrategy.scala   | 12
 .../mllib/tree/configuration/FeatureType.scala  |  2 ++
 .../tree/configuration/QuantileStrategy.scala   |  2 ++
 .../mllib/tree/configuration/Strategy.scala     | 29 ++--
 .../mllib/tree/model/DecisionTreeModel.scala    |  5 +++-
 .../apache/spark/mllib/tree/model/Node.scala    | 18 ++--
 .../apache/spark/mllib/tree/model/Predict.scala |  6 ++--
 .../apache/spark/mllib/tree/model/Split.scala   |  8 +++---
 .../mllib/tree/model/treeEnsembleModels.scala   | 12
 12 files changed, 57 insertions(+), 44 deletions(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/fb7e12fe/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala
index 9728410..4a77d4a 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala
@@ -46,7 +46,8 @@ import org.apache.spark.util.random.XORShiftRandom
  */
 @Since("1.0.0")
 @Experimental
-class DecisionTree (private val strategy: Strategy) extends Serializable with Logging {
+class DecisionTree @Since("1.0.0") (private val strategy: Strategy)
+  extends Serializable with Logging {
 
   strategy.assertValid()

http://git-wip-us.apache.org/repos/asf/spark/blob/fb7e12fe/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala
index e750408..95ed48c 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala
@@ -51,7 +51,7 @@ import org.apache.spark.storage.StorageLevel
  */
 @Since("1.2.0")
 @Experimental
-class GradientBoostedTrees(private val boostingStrategy: BoostingStrategy)
+class GradientBoostedTrees @Since("1.2.0") (private val boostingStrategy: BoostingStrategy)
   extends Serializable with Logging {
 
   /**

http://git-wip-us.apache.org/repos/asf/spark/blob/fb7e12fe/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Algo.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Algo.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Algo.scala
index 8301ad1..853c731 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Algo.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Algo.scala
@@ -26,7 +26,9 @@ import org.apache.spark.annotation.{Experimental, Since}
 @Since("1.0.0")
 @Experimental
 object Algo extends Enumeration {
+  @Since("1.0.0")
   type Algo = Value
+  @Since("1.0.0")
   val Classification, Regression = Value
 
   private[mllib] def fromString(name: String): Algo = name match {

http://git-wip-us.apache.org/repos/asf/spark/blob/fb7e12fe/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala
index 7c56998..b5c72fb 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala
@@ -41,14 +41,14 @@ import org.apache.spark.mllib.tree.loss.{LogLoss, SquaredErro
spark git commit: [SPARK-10234] [MLLIB] update since version in mllib.clustering
Repository: spark
Updated Branches:
  refs/heads/branch-1.5 b7766699a -> be0c9915c

[SPARK-10234] [MLLIB] update since version in mllib.clustering

Same as #8421 but for `mllib.clustering`.

cc feynmanliang yu-iskw

Author: Xiangrui Meng

Closes #8435 from mengxr/SPARK-10234.

(cherry picked from commit d703372f86d6a59383ba8569fcd9d379849cffbf)
Signed-off-by: Xiangrui Meng

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/be0c9915
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/be0c9915
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/be0c9915

Branch: refs/heads/branch-1.5
Commit: be0c9915c0084a187933f338e51e606dc68e93af
Parents: b776669
Author: Xiangrui Meng
Authored: Tue Aug 25 22:33:48 2015 -0700
Committer: Xiangrui Meng
Committed: Tue Aug 25 22:33:55 2015 -0700

--
 .../mllib/clustering/GaussianMixture.scala      |  1 +
 .../mllib/clustering/GaussianMixtureModel.scala |  8 +++---
 .../apache/spark/mllib/clustering/KMeans.scala  |  1 +
 .../spark/mllib/clustering/KMeansModel.scala    |  4 +--
 .../spark/mllib/clustering/LDAModel.scala       | 28 +++-
 .../clustering/PowerIterationClustering.scala   | 10 ---
 .../mllib/clustering/StreamingKMeans.scala      | 15 ++-
 7 files changed, 44 insertions(+), 23 deletions(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/be0c9915/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala
index daa947e..f82bd82 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala
@@ -53,6 +53,7 @@ import org.apache.spark.util.Utils
  * @param maxIterations The maximum number of iterations to perform
  */
 @Experimental
+@Since("1.3.0")
 class GaussianMixture private (
     private var k: Int,
     private var convergenceTol: Double,

http://git-wip-us.apache.org/repos/asf/spark/blob/be0c9915/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
index 1a10a8b..7f6163e 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
@@ -46,9 +46,9 @@ import org.apache.spark.sql.{SQLContext, Row}
  */
 @Since("1.3.0")
 @Experimental
-class GaussianMixtureModel(
-  val weights: Array[Double],
-  val gaussians: Array[MultivariateGaussian]) extends Serializable with Saveable {
+class GaussianMixtureModel @Since("1.3.0") (
+  @Since("1.3.0") val weights: Array[Double],
+  @Since("1.3.0") val gaussians: Array[MultivariateGaussian]) extends Serializable with Saveable {
 
   require(weights.length == gaussians.length, "Length of weight and Gaussian arrays must match")
 
@@ -178,7 +178,7 @@ object GaussianMixtureModel extends Loader[GaussianMixtureModel] {
         (weight, new MultivariateGaussian(mu, sigma))
       }.unzip
 
-      return new GaussianMixtureModel(weights.toArray, gaussians.toArray)
+      new GaussianMixtureModel(weights.toArray, gaussians.toArray)
     }
   }

http://git-wip-us.apache.org/repos/asf/spark/blob/be0c9915/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala
index 3e9545a..46920ff 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala
@@ -37,6 +37,7 @@ import org.apache.spark.util.random.XORShiftRandom
  * This is an iterative algorithm that will make multiple passes over the data, so any RDDs given
  * to it should be cached by the user.
  */
+@Since("0.8.0")
 class KMeans private (
     private var k: Int,
     private var maxIterations: Int,

http://git-wip-us.apache.org/repos/asf/spark/blob/be0c9915/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMean
spark git commit: [SPARK-10234] [MLLIB] update since version in mllib.clustering
Repository: spark
Updated Branches:
  refs/heads/master c3a54843c -> d703372f8

[SPARK-10234] [MLLIB] update since version in mllib.clustering

Same as #8421 but for `mllib.clustering`.

cc feynmanliang yu-iskw

Author: Xiangrui Meng

Closes #8435 from mengxr/SPARK-10234.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d703372f
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d703372f
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d703372f

Branch: refs/heads/master
Commit: d703372f86d6a59383ba8569fcd9d379849cffbf
Parents: c3a5484
Author: Xiangrui Meng
Authored: Tue Aug 25 22:33:48 2015 -0700
Committer: Xiangrui Meng
Committed: Tue Aug 25 22:33:48 2015 -0700

--
 .../mllib/clustering/GaussianMixture.scala      |  1 +
 .../mllib/clustering/GaussianMixtureModel.scala |  8 +++---
 .../apache/spark/mllib/clustering/KMeans.scala  |  1 +
 .../spark/mllib/clustering/KMeansModel.scala    |  4 +--
 .../spark/mllib/clustering/LDAModel.scala       | 28 +++-
 .../clustering/PowerIterationClustering.scala   | 10 ---
 .../mllib/clustering/StreamingKMeans.scala      | 15 ++-
 7 files changed, 44 insertions(+), 23 deletions(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/d703372f/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala
index daa947e..f82bd82 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala
@@ -53,6 +53,7 @@ import org.apache.spark.util.Utils
  * @param maxIterations The maximum number of iterations to perform
  */
 @Experimental
+@Since("1.3.0")
 class GaussianMixture private (
     private var k: Int,
     private var convergenceTol: Double,

http://git-wip-us.apache.org/repos/asf/spark/blob/d703372f/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
index 1a10a8b..7f6163e 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
@@ -46,9 +46,9 @@ import org.apache.spark.sql.{SQLContext, Row}
  */
 @Since("1.3.0")
 @Experimental
-class GaussianMixtureModel(
-  val weights: Array[Double],
-  val gaussians: Array[MultivariateGaussian]) extends Serializable with Saveable {
+class GaussianMixtureModel @Since("1.3.0") (
+  @Since("1.3.0") val weights: Array[Double],
+  @Since("1.3.0") val gaussians: Array[MultivariateGaussian]) extends Serializable with Saveable {
 
   require(weights.length == gaussians.length, "Length of weight and Gaussian arrays must match")
 
@@ -178,7 +178,7 @@ object GaussianMixtureModel extends Loader[GaussianMixtureModel] {
         (weight, new MultivariateGaussian(mu, sigma))
       }.unzip
 
-      return new GaussianMixtureModel(weights.toArray, gaussians.toArray)
+      new GaussianMixtureModel(weights.toArray, gaussians.toArray)
     }
   }

http://git-wip-us.apache.org/repos/asf/spark/blob/d703372f/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala
index 3e9545a..46920ff 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala
@@ -37,6 +37,7 @@ import org.apache.spark.util.random.XORShiftRandom
  * This is an iterative algorithm that will make multiple passes over the data, so any RDDs given
  * to it should be cached by the user.
  */
+@Since("0.8.0")
 class KMeans private (
     private var k: Int,
     private var maxIterations: Int,

http://git-wip-us.apache.org/repos/asf/spark/blob/d703372f/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala
index e425ecd..a741584 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMea
spark git commit: [SPARK-10240] [SPARK-10242] [MLLIB] update since versions in mllib.random and mllib.stat
Repository: spark
Updated Branches:
  refs/heads/branch-1.5 46750b912 -> b7766699a

[SPARK-10240] [SPARK-10242] [MLLIB] update since versions in mllib.random and mllib.stat

The same as #8241 but for `mllib.stat` and `mllib.random`.

cc feynmanliang

Author: Xiangrui Meng

Closes #8439 from mengxr/SPARK-10242.

(cherry picked from commit c3a54843c0c8a14059da4e6716c1ad45c69bbe6c)
Signed-off-by: Xiangrui Meng

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b7766699
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b7766699
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b7766699

Branch: refs/heads/branch-1.5
Commit: b7766699aef65586b0c3af96fb625efaa218d2b2
Parents: 46750b9
Author: Xiangrui Meng
Authored: Tue Aug 25 22:31:23 2015 -0700
Committer: Xiangrui Meng
Committed: Tue Aug 25 22:31:33 2015 -0700

--
 .../mllib/random/RandomDataGenerator.scala     | 43 ++--
 .../apache/spark/mllib/random/RandomRDDs.scala | 69 +---
 .../distribution/MultivariateGaussian.scala    |  6 +-
 .../spark/mllib/stat/test/TestResult.scala     | 24 ---
 4 files changed, 117 insertions(+), 25 deletions(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/b7766699/mllib/src/main/scala/org/apache/spark/mllib/random/RandomDataGenerator.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/random/RandomDataGenerator.scala b/mllib/src/main/scala/org/apache/spark/mllib/random/RandomDataGenerator.scala
index 9349eca..a2d85a6 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/random/RandomDataGenerator.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/random/RandomDataGenerator.scala
@@ -20,7 +20,7 @@ package org.apache.spark.mllib.random
 import org.apache.commons.math3.distribution.{ExponentialDistribution, GammaDistribution,
   LogNormalDistribution, PoissonDistribution}
 
-import org.apache.spark.annotation.DeveloperApi
+import org.apache.spark.annotation.{Since, DeveloperApi}
 import org.apache.spark.util.random.{XORShiftRandom, Pseudorandom}
 
 /**
@@ -28,17 +28,20 @@ import org.apache.spark.util.random.{XORShiftRandom, Pseudorandom}
  * Trait for random data generators that generate i.i.d. data.
  */
 @DeveloperApi
+@Since("1.1.0")
 trait RandomDataGenerator[T] extends Pseudorandom with Serializable {
 
   /**
    * Returns an i.i.d. sample as a generic type from an underlying distribution.
    */
+  @Since("1.1.0")
   def nextValue(): T
 
   /**
    * Returns a copy of the RandomDataGenerator with a new instance of the rng object used in the
   * class when applicable for non-locking concurrent usage.
   */
+  @Since("1.1.0")
  def copy(): RandomDataGenerator[T]
 }
 
@@ -47,17 +50,21 @@ trait RandomDataGenerator[T] extends Pseudorandom with Serializable {
  * Generates i.i.d. samples from U[0.0, 1.0]
  */
 @DeveloperApi
+@Since("1.1.0")
 class UniformGenerator extends RandomDataGenerator[Double] {
 
   // XORShiftRandom for better performance. Thread safety isn't necessary here.
   private val random = new XORShiftRandom()
 
+  @Since("1.1.0")
   override def nextValue(): Double = {
     random.nextDouble()
   }
 
+  @Since("1.1.0")
   override def setSeed(seed: Long): Unit = random.setSeed(seed)
 
+  @Since("1.1.0")
   override def copy(): UniformGenerator = new UniformGenerator()
 }
 
@@ -66,17 +73,21 @@ class UniformGenerator extends RandomDataGenerator[Double] {
  * Generates i.i.d. samples from the standard normal distribution.
  */
 @DeveloperApi
+@Since("1.1.0")
 class StandardNormalGenerator extends RandomDataGenerator[Double] {
 
   // XORShiftRandom for better performance. Thread safety isn't necessary here.
   private val random = new XORShiftRandom()
 
+  @Since("1.1.0")
   override def nextValue(): Double = {
     random.nextGaussian()
   }
 
+  @Since("1.1.0")
   override def setSeed(seed: Long): Unit = random.setSeed(seed)
 
+  @Since("1.1.0")
   override def copy(): StandardNormalGenerator = new StandardNormalGenerator()
 }
 
@@ -87,16 +98,21 @@ class StandardNormalGenerator extends RandomDataGenerator[Double] {
  * @param mean mean for the Poisson distribution.
  */
 @DeveloperApi
-class PoissonGenerator(val mean: Double) extends RandomDataGenerator[Double] {
+@Since("1.1.0")
+class PoissonGenerator @Since("1.1.0") (
+    @Since("1.1.0") val mean: Double) extends RandomDataGenerator[Double] {
 
   private val rng = new PoissonDistribution(mean)
 
+  @Since("1.1.0")
   override def nextValue(): Double = rng.sample()
 
+  @Since("1.1.0")
   override def setSeed(seed: Long) {
     rng.reseedRandomGenerator(seed)
   }
 
+  @Since("1.1.0")
   override def copy(): PoissonGenerator = new PoissonGenerator(mean)
 }
 
@@ -107,16 +123,21 @@ class PoissonGenerat
spark git commit: [SPARK-10240] [SPARK-10242] [MLLIB] update since versions in mllib.random and mllib.stat
Repository: spark
Updated Branches:
  refs/heads/master ab431f8a9 -> c3a54843c

[SPARK-10240] [SPARK-10242] [MLLIB] update since versions in mllib.random and mllib.stat

The same as #8241 but for `mllib.stat` and `mllib.random`.

cc feynmanliang

Author: Xiangrui Meng

Closes #8439 from mengxr/SPARK-10242.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c3a54843
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c3a54843
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c3a54843

Branch: refs/heads/master
Commit: c3a54843c0c8a14059da4e6716c1ad45c69bbe6c
Parents: ab431f8
Author: Xiangrui Meng
Authored: Tue Aug 25 22:31:23 2015 -0700
Committer: Xiangrui Meng
Committed: Tue Aug 25 22:31:23 2015 -0700

--
 .../mllib/random/RandomDataGenerator.scala     | 43 ++--
 .../apache/spark/mllib/random/RandomRDDs.scala | 69 +---
 .../distribution/MultivariateGaussian.scala    |  6 +-
 .../spark/mllib/stat/test/TestResult.scala     | 24 ---
 4 files changed, 117 insertions(+), 25 deletions(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/c3a54843/mllib/src/main/scala/org/apache/spark/mllib/random/RandomDataGenerator.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/random/RandomDataGenerator.scala b/mllib/src/main/scala/org/apache/spark/mllib/random/RandomDataGenerator.scala
index 9349eca..a2d85a6 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/random/RandomDataGenerator.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/random/RandomDataGenerator.scala
@@ -20,7 +20,7 @@ package org.apache.spark.mllib.random
 import org.apache.commons.math3.distribution.{ExponentialDistribution, GammaDistribution,
   LogNormalDistribution, PoissonDistribution}
 
-import org.apache.spark.annotation.DeveloperApi
+import org.apache.spark.annotation.{Since, DeveloperApi}
 import org.apache.spark.util.random.{XORShiftRandom, Pseudorandom}
 
 /**
@@ -28,17 +28,20 @@ import org.apache.spark.util.random.{XORShiftRandom, Pseudorandom}
  * Trait for random data generators that generate i.i.d. data.
  */
 @DeveloperApi
+@Since("1.1.0")
 trait RandomDataGenerator[T] extends Pseudorandom with Serializable {
 
   /**
    * Returns an i.i.d. sample as a generic type from an underlying distribution.
    */
+  @Since("1.1.0")
   def nextValue(): T
 
   /**
    * Returns a copy of the RandomDataGenerator with a new instance of the rng object used in the
   * class when applicable for non-locking concurrent usage.
   */
+  @Since("1.1.0")
  def copy(): RandomDataGenerator[T]
 }
 
@@ -47,17 +50,21 @@ trait RandomDataGenerator[T] extends Pseudorandom with Serializable {
  * Generates i.i.d. samples from U[0.0, 1.0]
  */
 @DeveloperApi
+@Since("1.1.0")
 class UniformGenerator extends RandomDataGenerator[Double] {
 
   // XORShiftRandom for better performance. Thread safety isn't necessary here.
   private val random = new XORShiftRandom()
 
+  @Since("1.1.0")
   override def nextValue(): Double = {
     random.nextDouble()
   }
 
+  @Since("1.1.0")
   override def setSeed(seed: Long): Unit = random.setSeed(seed)
 
+  @Since("1.1.0")
   override def copy(): UniformGenerator = new UniformGenerator()
 }
 
@@ -66,17 +73,21 @@ class UniformGenerator extends RandomDataGenerator[Double] {
  * Generates i.i.d. samples from the standard normal distribution.
  */
 @DeveloperApi
+@Since("1.1.0")
 class StandardNormalGenerator extends RandomDataGenerator[Double] {
 
   // XORShiftRandom for better performance. Thread safety isn't necessary here.
   private val random = new XORShiftRandom()
 
+  @Since("1.1.0")
   override def nextValue(): Double = {
     random.nextGaussian()
   }
 
+  @Since("1.1.0")
   override def setSeed(seed: Long): Unit = random.setSeed(seed)
 
+  @Since("1.1.0")
   override def copy(): StandardNormalGenerator = new StandardNormalGenerator()
 }
 
@@ -87,16 +98,21 @@ class StandardNormalGenerator extends RandomDataGenerator[Double] {
  * @param mean mean for the Poisson distribution.
  */
 @DeveloperApi
-class PoissonGenerator(val mean: Double) extends RandomDataGenerator[Double] {
+@Since("1.1.0")
+class PoissonGenerator @Since("1.1.0") (
+    @Since("1.1.0") val mean: Double) extends RandomDataGenerator[Double] {
 
   private val rng = new PoissonDistribution(mean)
 
+  @Since("1.1.0")
   override def nextValue(): Double = rng.sample()
 
+  @Since("1.1.0")
   override def setSeed(seed: Long) {
     rng.reseedRandomGenerator(seed)
   }
 
+  @Since("1.1.0")
   override def copy(): PoissonGenerator = new PoissonGenerator(mean)
 }
 
@@ -107,16 +123,21 @@ class PoissonGenerator(val mean: Double) extends RandomDataGenerator[Double] {
  * @param mean mean for the exponential distrib
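As the diff above shows, `RandomDataGenerator[T]` is a small public contract: `nextValue()` produces one i.i.d. sample, `setSeed(seed)` is inherited from `Pseudorandom`, and `copy()` must return a generator with a fresh RNG so copies can be used concurrently without locking. A hedged sketch of a custom generator against that contract (the `SignGenerator` class is hypothetical; it assumes spark-mllib 1.5 on the classpath):

```
import java.util.Random

import org.apache.spark.mllib.random.RandomDataGenerator

// Hypothetical generator of i.i.d. samples drawn uniformly from {-1.0, +1.0}.
class SignGenerator extends RandomDataGenerator[Double] {

  // java.util.Random instead of the XORShiftRandom used by the built-in
  // generators, which is internal to Spark.
  private val random = new Random()

  override def nextValue(): Double = if (random.nextBoolean()) 1.0 else -1.0

  override def setSeed(seed: Long): Unit = random.setSeed(seed)

  // A fresh instance, so each copy owns its own RNG state.
  override def copy(): SignGenerator = new SignGenerator()
}
```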
spark git commit: [SPARK-10238] [MLLIB] update since versions in mllib.linalg
Repository: spark Updated Branches: refs/heads/branch-1.5 af98e51f2 -> 46750b912 [SPARK-10238] [MLLIB] update since versions in mllib.linalg Same as #8421 but for `mllib.linalg`. cc dbtsai Author: Xiangrui Meng Closes #8440 from mengxr/SPARK-10238 and squashes the following commits: b38437e [Xiangrui Meng] update since versions in mllib.linalg (cherry picked from commit ab431f8a970b85fba34ccb506c0f8815e55c63bf) Signed-off-by: DB Tsai Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/46750b91 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/46750b91 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/46750b91 Branch: refs/heads/branch-1.5 Commit: 46750b912781433b6ce0845ac22805cde975361e Parents: af98e51 Author: Xiangrui Meng Authored: Tue Aug 25 20:07:56 2015 -0700 Committer: DB Tsai Committed: Tue Aug 25 20:08:09 2015 -0700 -- .../apache/spark/mllib/linalg/Matrices.scala| 44 +--- .../linalg/SingularValueDecomposition.scala | 1 + .../org/apache/spark/mllib/linalg/Vectors.scala | 25 --- .../mllib/linalg/distributed/BlockMatrix.scala | 10 +++-- .../linalg/distributed/CoordinateMatrix.scala | 4 +- .../linalg/distributed/DistributedMatrix.scala | 2 + .../linalg/distributed/IndexedRowMatrix.scala | 4 +- .../mllib/linalg/distributed/RowMatrix.scala| 5 ++- 8 files changed, 64 insertions(+), 31 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/46750b91/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala index 28b5b46..c02ba42 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala @@ -32,18 +32,23 @@ import org.apache.spark.sql.types._ * Trait for a local matrix. */ @SQLUserDefinedType(udt = classOf[MatrixUDT]) +@Since("1.0.0") sealed trait Matrix extends Serializable { /** Number of rows. */ + @Since("1.0.0") def numRows: Int /** Number of columns. */ + @Since("1.0.0") def numCols: Int /** Flag that keeps track whether the matrix is transposed or not. False by default. */ + @Since("1.3.0") val isTransposed: Boolean = false /** Converts to a dense array in column major. */ + @Since("1.0.0") def toArray: Array[Double] = { val newArray = new Array[Double](numRows * numCols) foreachActive { (i, j, v) => @@ -56,6 +61,7 @@ sealed trait Matrix extends Serializable { private[mllib] def toBreeze: BM[Double] /** Gets the (i, j)-th element. */ + @Since("1.3.0") def apply(i: Int, j: Int): Double /** Return the index for the (i, j)-th element in the backing array. */ @@ -65,12 +71,15 @@ sealed trait Matrix extends Serializable { private[mllib] def update(i: Int, j: Int, v: Double): Unit /** Get a deep copy of the matrix. */ + @Since("1.2.0") def copy: Matrix /** Transpose the Matrix. Returns a new `Matrix` instance sharing the same underlying data. */ + @Since("1.3.0") def transpose: Matrix /** Convenience method for `Matrix`-`DenseMatrix` multiplication. */ + @Since("1.2.0") def multiply(y: DenseMatrix): DenseMatrix = { val C: DenseMatrix = DenseMatrix.zeros(numRows, y.numCols) BLAS.gemm(1.0, this, y, 0.0, C) @@ -78,11 +87,13 @@ sealed trait Matrix extends Serializable { } /** Convenience method for `Matrix`-`DenseVector` multiplication. For binary compatibility. 
*/ + @Since("1.2.0") def multiply(y: DenseVector): DenseVector = { multiply(y.asInstanceOf[Vector]) } /** Convenience method for `Matrix`-`Vector` multiplication. */ + @Since("1.4.0") def multiply(y: Vector): DenseVector = { val output = new DenseVector(new Array[Double](numRows)) BLAS.gemv(1.0, this, y, 0.0, output) @@ -93,6 +104,7 @@ sealed trait Matrix extends Serializable { override def toString: String = toBreeze.toString() /** A human readable representation of the matrix with maximum lines and width */ + @Since("1.4.0") def toString(maxLines: Int, maxLineWidth: Int): String = toBreeze.toString(maxLines, maxLineWidth) /** Map the values of this matrix using a function. Generates a new matrix. Performs the @@ -118,11 +130,13 @@ sealed trait Matrix extends Serializable { /** * Find the number of non-zero active values. */ + @Since("1.5.0") def numNonzeros: Int /** * Find the number of values stored explicitly. These values can be zero as well. */ + @Since("1.5.0") def numActives: Int } @@ -230,11 +244,11 @@ private[spark] class MatrixUD
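A short sketch of the `Matrix` members annotated above (illustrative only, not part of the commit; assumes spark-mllib 1.5):

```scala
import org.apache.spark.mllib.linalg.{DenseVector, Matrices}

// 2x2 dense matrix in column-major order: rows are [1.0, 3.0] and [2.0, 4.0]
val m = Matrices.dense(2, 2, Array(1.0, 2.0, 3.0, 4.0))

val e = m(0, 1)      // element (0, 1) = 3.0, @Since("1.3.0")
val mt = m.transpose // shares the underlying data, @Since("1.3.0")
val y = m.multiply(new DenseVector(Array(1.0, 1.0))) // [4.0, 6.0], @Since("1.2.0")
```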
spark git commit: [SPARK-10238] [MLLIB] update since versions in mllib.linalg
Repository: spark Updated Branches: refs/heads/master 8668ead2e -> ab431f8a9 [SPARK-10238] [MLLIB] update since versions in mllib.linalg Same as #8421 but for `mllib.linalg`. cc dbtsai Author: Xiangrui Meng Closes #8440 from mengxr/SPARK-10238 and squashes the following commits: b38437e [Xiangrui Meng] update since versions in mllib.linalg Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ab431f8a Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ab431f8a Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ab431f8a Branch: refs/heads/master Commit: ab431f8a970b85fba34ccb506c0f8815e55c63bf Parents: 8668ead Author: Xiangrui Meng Authored: Tue Aug 25 20:07:56 2015 -0700 Committer: DB Tsai Committed: Tue Aug 25 20:07:56 2015 -0700 -- .../apache/spark/mllib/linalg/Matrices.scala| 44 +--- .../linalg/SingularValueDecomposition.scala | 1 + .../org/apache/spark/mllib/linalg/Vectors.scala | 25 --- .../mllib/linalg/distributed/BlockMatrix.scala | 10 +++-- .../linalg/distributed/CoordinateMatrix.scala | 4 +- .../linalg/distributed/DistributedMatrix.scala | 2 + .../linalg/distributed/IndexedRowMatrix.scala | 4 +- .../mllib/linalg/distributed/RowMatrix.scala| 5 ++- 8 files changed, 64 insertions(+), 31 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/ab431f8a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala index 28b5b46..c02ba42 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala @@ -32,18 +32,23 @@ import org.apache.spark.sql.types._ * Trait for a local matrix. */ @SQLUserDefinedType(udt = classOf[MatrixUDT]) +@Since("1.0.0") sealed trait Matrix extends Serializable { /** Number of rows. */ + @Since("1.0.0") def numRows: Int /** Number of columns. */ + @Since("1.0.0") def numCols: Int /** Flag that keeps track whether the matrix is transposed or not. False by default. */ + @Since("1.3.0") val isTransposed: Boolean = false /** Converts to a dense array in column major. */ + @Since("1.0.0") def toArray: Array[Double] = { val newArray = new Array[Double](numRows * numCols) foreachActive { (i, j, v) => @@ -56,6 +61,7 @@ sealed trait Matrix extends Serializable { private[mllib] def toBreeze: BM[Double] /** Gets the (i, j)-th element. */ + @Since("1.3.0") def apply(i: Int, j: Int): Double /** Return the index for the (i, j)-th element in the backing array. */ @@ -65,12 +71,15 @@ sealed trait Matrix extends Serializable { private[mllib] def update(i: Int, j: Int, v: Double): Unit /** Get a deep copy of the matrix. */ + @Since("1.2.0") def copy: Matrix /** Transpose the Matrix. Returns a new `Matrix` instance sharing the same underlying data. */ + @Since("1.3.0") def transpose: Matrix /** Convenience method for `Matrix`-`DenseMatrix` multiplication. */ + @Since("1.2.0") def multiply(y: DenseMatrix): DenseMatrix = { val C: DenseMatrix = DenseMatrix.zeros(numRows, y.numCols) BLAS.gemm(1.0, this, y, 0.0, C) @@ -78,11 +87,13 @@ sealed trait Matrix extends Serializable { } /** Convenience method for `Matrix`-`DenseVector` multiplication. For binary compatibility. */ + @Since("1.2.0") def multiply(y: DenseVector): DenseVector = { multiply(y.asInstanceOf[Vector]) } /** Convenience method for `Matrix`-`Vector` multiplication. 
*/ + @Since("1.4.0") def multiply(y: Vector): DenseVector = { val output = new DenseVector(new Array[Double](numRows)) BLAS.gemv(1.0, this, y, 0.0, output) @@ -93,6 +104,7 @@ sealed trait Matrix extends Serializable { override def toString: String = toBreeze.toString() /** A human readable representation of the matrix with maximum lines and width */ + @Since("1.4.0") def toString(maxLines: Int, maxLineWidth: Int): String = toBreeze.toString(maxLines, maxLineWidth) /** Map the values of this matrix using a function. Generates a new matrix. Performs the @@ -118,11 +130,13 @@ sealed trait Matrix extends Serializable { /** * Find the number of non-zero active values. */ + @Since("1.5.0") def numNonzeros: Int /** * Find the number of values stored explicitly. These values can be zero as well. */ + @Since("1.5.0") def numActives: Int } @@ -230,11 +244,11 @@ private[spark] class MatrixUDT extends UserDefinedType[Matrix] { */ @Since("1.0.0") @SQLUserDefinedType(udt = classOf[MatrixUD
spark git commit: [SPARK-10233] [MLLIB] update since version in mllib.evaluation
Repository: spark Updated Branches: refs/heads/branch-1.5 5cf266fde -> af98e51f2 [SPARK-10233] [MLLIB] update since version in mllib.evaluation Same as #8421 but for `mllib.evaluation`. cc avulanov Author: Xiangrui Meng Closes #8423 from mengxr/SPARK-10233. (cherry picked from commit 8668ead2e7097b9591069599fbfccf67c53db659) Signed-off-by: Xiangrui Meng Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/af98e51f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/af98e51f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/af98e51f Branch: refs/heads/branch-1.5 Commit: af98e51f273d95e0fc19da1eca32a5f87a8c5576 Parents: 5cf266f Author: Xiangrui Meng Authored: Tue Aug 25 18:17:54 2015 -0700 Committer: Xiangrui Meng Committed: Tue Aug 25 18:18:27 2015 -0700 -- .../mllib/evaluation/BinaryClassificationMetrics.scala | 8 .../spark/mllib/evaluation/MulticlassMetrics.scala | 11 ++- .../spark/mllib/evaluation/MultilabelMetrics.scala | 12 +++- .../spark/mllib/evaluation/RegressionMetrics.scala | 3 ++- 4 files changed, 27 insertions(+), 7 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/af98e51f/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala index 76ae847..508fe53 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala @@ -42,11 +42,11 @@ import org.apache.spark.sql.DataFrame *be smaller as a result, meaning there may be an extra sample at *partition boundaries. */ -@Since("1.3.0") +@Since("1.0.0") @Experimental -class BinaryClassificationMetrics( -val scoreAndLabels: RDD[(Double, Double)], -val numBins: Int) extends Logging { +class BinaryClassificationMetrics @Since("1.3.0") ( +@Since("1.3.0") val scoreAndLabels: RDD[(Double, Double)], +@Since("1.3.0") val numBins: Int) extends Logging { require(numBins >= 0, "numBins must be nonnegative") http://git-wip-us.apache.org/repos/asf/spark/blob/af98e51f/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala index 02e89d9..00e8376 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala @@ -33,7 +33,7 @@ import org.apache.spark.sql.DataFrame */ @Since("1.1.0") @Experimental -class MulticlassMetrics(predictionAndLabels: RDD[(Double, Double)]) { +class MulticlassMetrics @Since("1.1.0") (predictionAndLabels: RDD[(Double, Double)]) { /** * An auxiliary constructor taking a DataFrame. 
@@ -140,6 +140,7 @@ class MulticlassMetrics(predictionAndLabels: RDD[(Double, Double)]) { /** * Returns precision */ + @Since("1.1.0") lazy val precision: Double = tpByClass.values.sum.toDouble / labelCount /** @@ -148,23 +149,27 @@ class MulticlassMetrics(predictionAndLabels: RDD[(Double, Double)]) { * because sum of all false positives is equal to sum * of all false negatives) */ + @Since("1.1.0") lazy val recall: Double = precision /** * Returns f-measure * (equals to precision and recall because precision equals recall) */ + @Since("1.1.0") lazy val fMeasure: Double = precision /** * Returns weighted true positive rate * (equals to precision, recall and f-measure) */ + @Since("1.1.0") lazy val weightedTruePositiveRate: Double = weightedRecall /** * Returns weighted false positive rate */ + @Since("1.1.0") lazy val weightedFalsePositiveRate: Double = labelCountByClass.map { case (category, count) => falsePositiveRate(category) * count.toDouble / labelCount }.sum @@ -173,6 +178,7 @@ class MulticlassMetrics(predictionAndLabels: RDD[(Double, Double)]) { * Returns weighted averaged recall * (equals to precision, recall and f-measure) */ + @Since("1.1.0") lazy val weightedRecall: Double = labelCountByClass.map { case (category, count) => recall(category) * count.toDouble / labelCount }.sum @@ -180,6 +186,7 @@ class MulticlassMetrics(predictionAndLabels: RDD[(Double, Double)
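A minimal sketch of the `MulticlassMetrics` members annotated above (illustrative only, not part of the commit; assumes a `SparkContext` named `sc`):

```scala
import org.apache.spark.mllib.evaluation.MulticlassMetrics

// (prediction, label) pairs; one of the four predictions is wrong
val predictionAndLabels = sc.parallelize(
  Seq((0.0, 0.0), (1.0, 1.0), (1.0, 1.0), (0.0, 1.0)))
val metrics = new MulticlassMetrics(predictionAndLabels)

metrics.precision                 // 0.75; recall and fMeasure equal it here
metrics.weightedRecall            // per-class recall weighted by label frequency
metrics.weightedFalsePositiveRate
```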
spark git commit: [SPARK-10233] [MLLIB] update since version in mllib.evaluation
Repository: spark Updated Branches: refs/heads/master 125205cdb -> 8668ead2e [SPARK-10233] [MLLIB] update since version in mllib.evaluation Same as #8421 but for `mllib.evaluation`. cc avulanov Author: Xiangrui Meng Closes #8423 from mengxr/SPARK-10233. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8668ead2 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8668ead2 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8668ead2 Branch: refs/heads/master Commit: 8668ead2e7097b9591069599fbfccf67c53db659 Parents: 125205c Author: Xiangrui Meng Authored: Tue Aug 25 18:17:54 2015 -0700 Committer: Xiangrui Meng Committed: Tue Aug 25 18:17:54 2015 -0700 -- .../mllib/evaluation/BinaryClassificationMetrics.scala | 8 .../spark/mllib/evaluation/MulticlassMetrics.scala | 11 ++- .../spark/mllib/evaluation/MultilabelMetrics.scala | 12 +++- .../spark/mllib/evaluation/RegressionMetrics.scala | 3 ++- 4 files changed, 27 insertions(+), 7 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/8668ead2/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala index 76ae847..508fe53 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala @@ -42,11 +42,11 @@ import org.apache.spark.sql.DataFrame *be smaller as a result, meaning there may be an extra sample at *partition boundaries. */ -@Since("1.3.0") +@Since("1.0.0") @Experimental -class BinaryClassificationMetrics( -val scoreAndLabels: RDD[(Double, Double)], -val numBins: Int) extends Logging { +class BinaryClassificationMetrics @Since("1.3.0") ( +@Since("1.3.0") val scoreAndLabels: RDD[(Double, Double)], +@Since("1.3.0") val numBins: Int) extends Logging { require(numBins >= 0, "numBins must be nonnegative") http://git-wip-us.apache.org/repos/asf/spark/blob/8668ead2/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala index 02e89d9..00e8376 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala @@ -33,7 +33,7 @@ import org.apache.spark.sql.DataFrame */ @Since("1.1.0") @Experimental -class MulticlassMetrics(predictionAndLabels: RDD[(Double, Double)]) { +class MulticlassMetrics @Since("1.1.0") (predictionAndLabels: RDD[(Double, Double)]) { /** * An auxiliary constructor taking a DataFrame. 
@@ -140,6 +140,7 @@ class MulticlassMetrics(predictionAndLabels: RDD[(Double, Double)]) { /** * Returns precision */ + @Since("1.1.0") lazy val precision: Double = tpByClass.values.sum.toDouble / labelCount /** @@ -148,23 +149,27 @@ class MulticlassMetrics(predictionAndLabels: RDD[(Double, Double)]) { * because sum of all false positives is equal to sum * of all false negatives) */ + @Since("1.1.0") lazy val recall: Double = precision /** * Returns f-measure * (equals to precision and recall because precision equals recall) */ + @Since("1.1.0") lazy val fMeasure: Double = precision /** * Returns weighted true positive rate * (equals to precision, recall and f-measure) */ + @Since("1.1.0") lazy val weightedTruePositiveRate: Double = weightedRecall /** * Returns weighted false positive rate */ + @Since("1.1.0") lazy val weightedFalsePositiveRate: Double = labelCountByClass.map { case (category, count) => falsePositiveRate(category) * count.toDouble / labelCount }.sum @@ -173,6 +178,7 @@ class MulticlassMetrics(predictionAndLabels: RDD[(Double, Double)]) { * Returns weighted averaged recall * (equals to precision, recall and f-measure) */ + @Since("1.1.0") lazy val weightedRecall: Double = labelCountByClass.map { case (category, count) => recall(category) * count.toDouble / labelCount }.sum @@ -180,6 +186,7 @@ class MulticlassMetrics(predictionAndLabels: RDD[(Double, Double)]) { /** * Returns weighted averaged precision */ + @Since("1.1.0") lazy val weightedPrecisio
Git Push Summary
Repository: spark Updated Tags: refs/tags/v1.4.1-rc4 [deleted] dbaa5c294
Git Push Summary
Repository: spark Updated Tags: refs/tags/v1.4.1-rc2 [deleted] 07b95c7ad
Git Push Summary
Repository: spark Updated Tags: refs/tags/v1.4.1-rc3 [deleted] 3e8ae3894
Git Push Summary
Repository: spark Updated Tags: refs/tags/v1.4.1-rc1 [deleted] 60e08e507
spark git commit: [SPARK-9888] [MLLIB] User guide for new LDA features
Repository: spark Updated Branches: refs/heads/branch-1.5 4c03cb4da -> 5cf266fde [SPARK-9888] [MLLIB] User guide for new LDA features * Adds two new sections to LDA's user guide: one for each optimizer/model * Documents new features added to LDA (e.g. topXXXperXXX, asymmetric priors, hyperparam optimization) * Cleans up a TODO and sets a default parameter in LDA code jkbradley hhbyyh Author: Feynman Liang Closes #8254 from feynmanliang/SPARK-9888. (cherry picked from commit 125205cdb35530cdb4a8fff3e1ee49cf4a299583) Signed-off-by: Joseph K. Bradley Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5cf266fd Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5cf266fd Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5cf266fd Branch: refs/heads/branch-1.5 Commit: 5cf266fdeb6632622642e5d9bc056a76680b1970 Parents: 4c03cb4 Author: Feynman Liang Authored: Tue Aug 25 17:39:20 2015 -0700 Committer: Joseph K. Bradley Committed: Tue Aug 25 17:39:40 2015 -0700 -- docs/mllib-clustering.md| 135 --- .../spark/mllib/clustering/LDAModel.scala | 1 - .../spark/mllib/clustering/LDASuite.scala | 1 + 3 files changed, 117 insertions(+), 20 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/5cf266fd/docs/mllib-clustering.md -- diff --git a/docs/mllib-clustering.md b/docs/mllib-clustering.md index fd9ab25..3fb35d3 100644 --- a/docs/mllib-clustering.md +++ b/docs/mllib-clustering.md @@ -438,28 +438,125 @@ sameModel = PowerIterationClusteringModel.load(sc, "myModelPath") is a topic model which infers topics from a collection of text documents. LDA can be thought of as a clustering algorithm as follows: -* Topics correspond to cluster centers, and documents correspond to examples (rows) in a dataset. -* Topics and documents both exist in a feature space, where feature vectors are vectors of word counts. -* Rather than estimating a clustering using a traditional distance, LDA uses a function based - on a statistical model of how text documents are generated. - -LDA takes in a collection of documents as vectors of word counts. -It supports different inference algorithms via `setOptimizer` function. EMLDAOptimizer learns clustering using [expectation-maximization](http://en.wikipedia.org/wiki/Expectation%E2%80%93maximization_algorithm) -on the likelihood function and yields comprehensive results, while OnlineLDAOptimizer uses iterative mini-batch sampling for [online variational inference](https://www.cs.princeton.edu/~blei/papers/HoffmanBleiBach2010b.pdf) and is generally memory friendly. After fitting on the documents, LDA provides: -* Topics: Inferred topics, each of which is a probability distribution over terms (words). -* Topic distributions for documents: For each non empty document in the training set, LDA gives a probability distribution over topics. (EM only). Note that for empty documents, we don't create the topic distributions. (EM only) +* Topics correspond to cluster centers, and documents correspond to +examples (rows) in a dataset. +* Topics and documents both exist in a feature space, where feature +vectors are vectors of word counts (bag of words). +* Rather than estimating a clustering using a traditional distance, LDA +uses a function based on a statistical model of how text documents are +generated. + +LDA supports different inference algorithms via `setOptimizer` function.
+`EMLDAOptimizer` learns clustering using +[expectation-maximization](http://en.wikipedia.org/wiki/Expectation%E2%80%93maximization_algorithm) +on the likelihood function and yields comprehensive results, while +`OnlineLDAOptimizer` uses iterative mini-batch sampling for [online +variational +inference](https://www.cs.princeton.edu/~blei/papers/HoffmanBleiBach2010b.pdf) +and is generally memory friendly. -LDA takes the following parameters: +LDA takes in a collection of documents as vectors of word counts and the +following parameters (set using the builder pattern): * `k`: Number of topics (i.e., cluster centers) -* `maxIterations`: Limit on the number of iterations of EM used for learning -* `docConcentration`: Hyperparameter for prior over documents' distributions over topics. Currently must be > 1, where larger values encourage smoother inferred distributions. -* `topicConcentration`: Hyperparameter for prior over topics' distributions over terms (words). Currently must be > 1, where larger values encourage smoother inferred distributions. -* `checkpointInterval`: If using checkpointing (set in the Spark configuration), this parameter specifies the frequency with which checkpoints will be created. If `maxIterations` is large, using checkpoi
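A minimal sketch of choosing an optimizer via `setOptimizer`, as the new guide sections describe (illustrative only, not part of the commit; assumes a `SparkContext` named `sc` and a toy corpus):

```scala
import org.apache.spark.mllib.clustering.{LDA, OnlineLDAOptimizer}
import org.apache.spark.mllib.linalg.Vectors

// (docId, termCounts) pairs over a 3-word vocabulary
val corpus = sc.parallelize(Seq(
  (0L, Vectors.dense(1.0, 2.0, 0.0)),
  (1L, Vectors.dense(0.0, 1.0, 3.0))))

val emModel = new LDA().setK(2).run(corpus) // EMLDAOptimizer is the default
val onlineModel = new LDA()
  .setK(2)
  .setOptimizer(new OnlineLDAOptimizer().setMiniBatchFraction(0.8))
  .run(corpus)
```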
spark git commit: [SPARK-9888] [MLLIB] User guide for new LDA features
Repository: spark Updated Branches: refs/heads/master 7467b52ed -> 125205cdb [SPARK-9888] [MLLIB] User guide for new LDA features * Adds two new sections to LDA's user guide: one for each optimizer/model * Documents new features added to LDA (e.g. topXXXperXXX, asymmetric priors, hyperparam optimization) * Cleans up a TODO and sets a default parameter in LDA code jkbradley hhbyyh Author: Feynman Liang Closes #8254 from feynmanliang/SPARK-9888. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/125205cd Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/125205cd Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/125205cd Branch: refs/heads/master Commit: 125205cdb35530cdb4a8fff3e1ee49cf4a299583 Parents: 7467b52 Author: Feynman Liang Authored: Tue Aug 25 17:39:20 2015 -0700 Committer: Joseph K. Bradley Committed: Tue Aug 25 17:39:20 2015 -0700 -- docs/mllib-clustering.md| 135 --- .../spark/mllib/clustering/LDAModel.scala | 1 - .../spark/mllib/clustering/LDASuite.scala | 1 + 3 files changed, 117 insertions(+), 20 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/125205cd/docs/mllib-clustering.md -- diff --git a/docs/mllib-clustering.md b/docs/mllib-clustering.md index fd9ab25..3fb35d3 100644 --- a/docs/mllib-clustering.md +++ b/docs/mllib-clustering.md @@ -438,28 +438,125 @@ sameModel = PowerIterationClusteringModel.load(sc, "myModelPath") is a topic model which infers topics from a collection of text documents. LDA can be thought of as a clustering algorithm as follows: -* Topics correspond to cluster centers, and documents correspond to examples (rows) in a dataset. -* Topics and documents both exist in a feature space, where feature vectors are vectors of word counts. -* Rather than estimating a clustering using a traditional distance, LDA uses a function based - on a statistical model of how text documents are generated. - -LDA takes in a collection of documents as vectors of word counts. -It supports different inference algorithms via `setOptimizer` function. EMLDAOptimizer learns clustering using [expectation-maximization](http://en.wikipedia.org/wiki/Expectation%E2%80%93maximization_algorithm) -on the likelihood function and yields comprehensive results, while OnlineLDAOptimizer uses iterative mini-batch sampling for [online variational inference](https://www.cs.princeton.edu/~blei/papers/HoffmanBleiBach2010b.pdf) and is generally memory friendly. After fitting on the documents, LDA provides: -* Topics: Inferred topics, each of which is a probability distribution over terms (words). -* Topic distributions for documents: For each non empty document in the training set, LDA gives a probability distribution over topics. (EM only). Note that for empty documents, we don't create the topic distributions. (EM only) +* Topics correspond to cluster centers, and documents correspond to +examples (rows) in a dataset. +* Topics and documents both exist in a feature space, where feature +vectors are vectors of word counts (bag of words). +* Rather than estimating a clustering using a traditional distance, LDA +uses a function based on a statistical model of how text documents are +generated. + +LDA supports different inference algorithms via `setOptimizer` function.
+`EMLDAOptimizer` learns clustering using +[expectation-maximization](http://en.wikipedia.org/wiki/Expectation%E2%80%93maximization_algorithm) +on the likelihood function and yields comprehensive results, while +`OnlineLDAOptimizer` uses iterative mini-batch sampling for [online +variational +inference](https://www.cs.princeton.edu/~blei/papers/HoffmanBleiBach2010b.pdf) +and is generally memory friendly. -LDA takes the following parameters: +LDA takes in a collection of documents as vectors of word counts and the +following parameters (set using the builder pattern): * `k`: Number of topics (i.e., cluster centers) -* `maxIterations`: Limit on the number of iterations of EM used for learning -* `docConcentration`: Hyperparameter for prior over documents' distributions over topics. Currently must be > 1, where larger values encourage smoother inferred distributions. -* `topicConcentration`: Hyperparameter for prior over topics' distributions over terms (words). Currently must be > 1, where larger values encourage smoother inferred distributions. -* `checkpointInterval`: If using checkpointing (set in the Spark configuration), this parameter specifies the frequency with which checkpoints will be created. If `maxIterations` is large, using checkpointing can help reduce shuffle file sizes on disk and help with failure recovery. - -*Note*: LDA is a new featur
[2/2] spark git commit: Preparing development version 1.5.1-SNAPSHOT
Preparing development version 1.5.1-SNAPSHOT Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4c03cb4d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4c03cb4d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4c03cb4d Branch: refs/heads/branch-1.5 Commit: 4c03cb4da846bf3ea4cd99f593d74c4a817a7d2d Parents: 7277713 Author: Patrick Wendell Authored: Tue Aug 25 15:56:44 2015 -0700 Committer: Patrick Wendell Committed: Tue Aug 25 15:56:44 2015 -0700 -- assembly/pom.xml| 2 +- bagel/pom.xml | 2 +- core/pom.xml| 2 +- examples/pom.xml| 2 +- external/flume-assembly/pom.xml | 2 +- external/flume-sink/pom.xml | 2 +- external/flume/pom.xml | 2 +- external/kafka-assembly/pom.xml | 2 +- external/kafka/pom.xml | 2 +- external/mqtt-assembly/pom.xml | 2 +- external/mqtt/pom.xml | 2 +- external/twitter/pom.xml| 2 +- external/zeromq/pom.xml | 2 +- extras/java8-tests/pom.xml | 2 +- extras/kinesis-asl-assembly/pom.xml | 2 +- extras/kinesis-asl/pom.xml | 2 +- extras/spark-ganglia-lgpl/pom.xml | 2 +- graphx/pom.xml | 2 +- launcher/pom.xml| 2 +- mllib/pom.xml | 2 +- network/common/pom.xml | 2 +- network/shuffle/pom.xml | 2 +- network/yarn/pom.xml| 2 +- pom.xml | 2 +- repl/pom.xml| 2 +- sql/catalyst/pom.xml| 2 +- sql/core/pom.xml| 2 +- sql/hive-thriftserver/pom.xml | 2 +- sql/hive/pom.xml| 2 +- streaming/pom.xml | 2 +- tools/pom.xml | 2 +- unsafe/pom.xml | 2 +- yarn/pom.xml| 2 +- 33 files changed, 33 insertions(+), 33 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4c03cb4d/assembly/pom.xml -- diff --git a/assembly/pom.xml b/assembly/pom.xml index 3ef7d6f..7b41ebb 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 -1.5.0 +1.5.1-SNAPSHOT ../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/4c03cb4d/bagel/pom.xml -- diff --git a/bagel/pom.xml b/bagel/pom.xml index 684e07b..16bf17c 100644 --- a/bagel/pom.xml +++ b/bagel/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 -1.5.0 +1.5.1-SNAPSHOT ../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/4c03cb4d/core/pom.xml -- diff --git a/core/pom.xml b/core/pom.xml index 6b082ad..beb547f 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 -1.5.0 +1.5.1-SNAPSHOT ../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/4c03cb4d/examples/pom.xml -- diff --git a/examples/pom.xml b/examples/pom.xml index 9ef1eda..3926b79 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 -1.5.0 +1.5.1-SNAPSHOT ../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/4c03cb4d/external/flume-assembly/pom.xml -- diff --git a/external/flume-assembly/pom.xml b/external/flume-assembly/pom.xml index aa7021d..5eda12d 100644 --- a/external/flume-assembly/pom.xml +++ b/external/flume-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 -1.5.0 +1.5.1-SNAPSHOT ../../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/4c03cb4d/external/flume-sink/pom.xml -- diff --git a/external/flume-sink/pom.xml b/external/flume-sink/pom.xml index 7d72f78..33f2cd7 100644 --- a/external/flume-sink/pom.xml +++ b/external/flume-sink/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 -1.5.0 +1.5.1-SNAPSHOT ../../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/4c03cb4d/external/flume/pom.xml -- diff --git a/external/flume/pom.xml b/external/flume/pom.xml index 38683e3..670c783 100644 --- 
a/external/flume/pom.xml +++ b/external/flume/pom.xml @@ -21,7 +21,7 @@ org.apache.spark s
[1/2] spark git commit: Preparing Spark release v1.5.0-rc2
Repository: spark Updated Branches: refs/heads/branch-1.5 ab7d46d1d -> 4c03cb4da Preparing Spark release v1.5.0-rc2 Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/72777135 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/72777135 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/72777135 Branch: refs/heads/branch-1.5 Commit: 727771352855dbb780008c449a877f5aaa5fc27a Parents: ab7d46d Author: Patrick Wendell Authored: Tue Aug 25 15:56:37 2015 -0700 Committer: Patrick Wendell Committed: Tue Aug 25 15:56:37 2015 -0700 -- assembly/pom.xml| 2 +- bagel/pom.xml | 2 +- core/pom.xml| 2 +- examples/pom.xml| 2 +- external/flume-assembly/pom.xml | 2 +- external/flume-sink/pom.xml | 2 +- external/flume/pom.xml | 2 +- external/kafka-assembly/pom.xml | 2 +- external/kafka/pom.xml | 2 +- external/mqtt-assembly/pom.xml | 2 +- external/mqtt/pom.xml | 2 +- external/twitter/pom.xml| 2 +- external/zeromq/pom.xml | 2 +- extras/java8-tests/pom.xml | 2 +- extras/kinesis-asl-assembly/pom.xml | 2 +- extras/kinesis-asl/pom.xml | 2 +- extras/spark-ganglia-lgpl/pom.xml | 2 +- graphx/pom.xml | 2 +- launcher/pom.xml| 2 +- mllib/pom.xml | 2 +- network/common/pom.xml | 2 +- network/shuffle/pom.xml | 2 +- network/yarn/pom.xml| 2 +- pom.xml | 2 +- repl/pom.xml| 2 +- sql/catalyst/pom.xml| 2 +- sql/core/pom.xml| 2 +- sql/hive-thriftserver/pom.xml | 2 +- sql/hive/pom.xml| 2 +- streaming/pom.xml | 2 +- tools/pom.xml | 2 +- unsafe/pom.xml | 2 +- yarn/pom.xml| 2 +- 33 files changed, 33 insertions(+), 33 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/72777135/assembly/pom.xml -- diff --git a/assembly/pom.xml b/assembly/pom.xml index e9c6d26..3ef7d6f 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 -1.5.0-SNAPSHOT +1.5.0 ../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/72777135/bagel/pom.xml -- diff --git a/bagel/pom.xml b/bagel/pom.xml index ed5c37e..684e07b 100644 --- a/bagel/pom.xml +++ b/bagel/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 -1.5.0-SNAPSHOT +1.5.0 ../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/72777135/core/pom.xml -- diff --git a/core/pom.xml b/core/pom.xml index 4f79d71..6b082ad 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 -1.5.0-SNAPSHOT +1.5.0 ../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/72777135/examples/pom.xml -- diff --git a/examples/pom.xml b/examples/pom.xml index e6884b0..9ef1eda 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 -1.5.0-SNAPSHOT +1.5.0 ../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/72777135/external/flume-assembly/pom.xml -- diff --git a/external/flume-assembly/pom.xml b/external/flume-assembly/pom.xml index 561ed4b..aa7021d 100644 --- a/external/flume-assembly/pom.xml +++ b/external/flume-assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 -1.5.0-SNAPSHOT +1.5.0 ../../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/72777135/external/flume-sink/pom.xml -- diff --git a/external/flume-sink/pom.xml b/external/flume-sink/pom.xml index 0664cfb..7d72f78 100644 --- a/external/flume-sink/pom.xml +++ b/external/flume-sink/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.10 -1.5.0-SNAPSHOT +1.5.0 ../../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/72777135/external/flume/pom.xml -- diff --git a/external/flume/pom.xml 
b/external/flume/pom.xml index 14f7daa..38683e3 100644 --- a/external/flume/pom.xml +++
Git Push Summary
Repository: spark Updated Tags: refs/tags/v1.5.0-rc2 [created] 727771352
spark git commit: [SPARK-10215] [SQL] Fix precision of division (follow the rule in Hive)
Repository: spark Updated Branches: refs/heads/branch-1.5 8925896b1 -> ab7d46d1d [SPARK-10215] [SQL] Fix precision of division (follow the rule in Hive) Follow the rule in Hive for decimal division. see https://github.com/apache/hive/blob/ac755ebe26361a4647d53db2a28500f71697b276/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPDivide.java#L113 cc chenghao-intel Author: Davies Liu Closes #8415 from davies/decimal_div2. (cherry picked from commit 7467b52ed07f174d93dfc4cb544dc4b69a2c2826) Signed-off-by: Yin Huai Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ab7d46d1 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ab7d46d1 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ab7d46d1 Branch: refs/heads/branch-1.5 Commit: ab7d46d1d6e7e6705a3348a0cab2d05fe62951cf Parents: 8925896 Author: Davies Liu Authored: Tue Aug 25 15:19:41 2015 -0700 Committer: Yin Huai Committed: Tue Aug 25 15:20:42 2015 -0700 -- .../catalyst/analysis/HiveTypeCoercion.scala| 10 ++-- .../sql/catalyst/analysis/AnalysisSuite.scala | 9 --- .../analysis/DecimalPrecisionSuite.scala| 8 +++ .../org/apache/spark/sql/SQLQuerySuite.scala| 25 ++-- 4 files changed, 39 insertions(+), 13 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/ab7d46d1/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala index a1aa2a2..87c11ab 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala @@ -396,8 +396,14 @@ object HiveTypeCoercion { resultType) case Divide(e1 @ DecimalType.Expression(p1, s1), e2 @ DecimalType.Expression(p2, s2)) => - val resultType = DecimalType.bounded(p1 - s1 + s2 + max(6, s1 + p2 + 1), -max(6, s1 + p2 + 1)) + var intDig = min(DecimalType.MAX_SCALE, p1 - s1 + s2) + var decDig = min(DecimalType.MAX_SCALE, max(6, s1 + p2 + 1)) + val diff = (intDig + decDig) - DecimalType.MAX_SCALE + if (diff > 0) { +decDig -= diff / 2 + 1 +intDig = DecimalType.MAX_SCALE - decDig + } + val resultType = DecimalType.bounded(intDig + decDig, decDig) val widerType = widerDecimalType(p1, s1, p2, s2) CheckOverflow(Divide(promotePrecision(e1, widerType), promotePrecision(e2, widerType)), resultType) http://git-wip-us.apache.org/repos/asf/spark/blob/ab7d46d1/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala -- diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala index 1e0cc81..820b336 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala @@ -17,15 +17,14 @@ package org.apache.spark.sql.catalyst.analysis +import org.apache.spark.sql.catalyst.dsl.expressions._ +import org.apache.spark.sql.catalyst.dsl.plans._ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.types._ -import org.apache.spark.sql.catalyst.SimpleCatalystConf -import org.apache.spark.sql.catalyst.dsl.expressions._ -import 
org.apache.spark.sql.catalyst.dsl.plans._ class AnalysisSuite extends AnalysisTest { - import TestRelations._ + import org.apache.spark.sql.catalyst.analysis.TestRelations._ test("union project *") { val plan = (1 to 100) @@ -96,7 +95,7 @@ class AnalysisSuite extends AnalysisTest { assert(pl(1).dataType == DoubleType) assert(pl(2).dataType == DoubleType) // StringType will be promoted into Decimal(38, 18) -assert(pl(3).dataType == DecimalType(38, 29)) +assert(pl(3).dataType == DecimalType(38, 22)) assert(pl(4).dataType == DoubleType) } http://git-wip-us.apache.org/repos/asf/spark/blob/ab7d46d1/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecisionSuite.scala -- diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecisionSuite.scala
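The new rule is easier to follow as standalone arithmetic. A transcription of the patched branch, runnable outside Spark (assumes `DecimalType.MAX_SCALE` = 38, its value in 1.5):

```scala
import scala.math.{max, min}

val MAX_SCALE = 38

// Result (precision, scale) for Decimal(p1, s1) / Decimal(p2, s2)
def divisionResultType(p1: Int, s1: Int, p2: Int, s2: Int): (Int, Int) = {
  var intDig = min(MAX_SCALE, p1 - s1 + s2)
  var decDig = min(MAX_SCALE, max(6, s1 + p2 + 1))
  val diff = (intDig + decDig) - MAX_SCALE
  if (diff > 0) {
    // Over the 38-digit cap: give up fractional digits first, as Hive does
    decDig -= diff / 2 + 1
    intDig = MAX_SCALE - decDig
  }
  (intDig + decDig, decDig)
}

divisionResultType(10, 2, 5, 3)    // (19, 8): within the cap, no adjustment
divisionResultType(38, 18, 38, 18) // (38, 18): trimmed to fit 38 total digits
```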
spark git commit: [SPARK-10215] [SQL] Fix precision of division (follow the rule in Hive)
Repository: spark Updated Branches: refs/heads/master ec89bd840 -> 7467b52ed [SPARK-10215] [SQL] Fix precision of division (follow the rule in Hive) Follow the rule in Hive for decimal division. see https://github.com/apache/hive/blob/ac755ebe26361a4647d53db2a28500f71697b276/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPDivide.java#L113 cc chenghao-intel Author: Davies Liu Closes #8415 from davies/decimal_div2. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7467b52e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7467b52e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7467b52e Branch: refs/heads/master Commit: 7467b52ed07f174d93dfc4cb544dc4b69a2c2826 Parents: ec89bd8 Author: Davies Liu Authored: Tue Aug 25 15:19:41 2015 -0700 Committer: Yin Huai Committed: Tue Aug 25 15:20:24 2015 -0700 -- .../catalyst/analysis/HiveTypeCoercion.scala| 10 ++-- .../sql/catalyst/analysis/AnalysisSuite.scala | 9 --- .../analysis/DecimalPrecisionSuite.scala| 8 +++ .../org/apache/spark/sql/SQLQuerySuite.scala| 25 ++-- 4 files changed, 39 insertions(+), 13 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/7467b52e/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala index a1aa2a2..87c11ab 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala @@ -396,8 +396,14 @@ object HiveTypeCoercion { resultType) case Divide(e1 @ DecimalType.Expression(p1, s1), e2 @ DecimalType.Expression(p2, s2)) => - val resultType = DecimalType.bounded(p1 - s1 + s2 + max(6, s1 + p2 + 1), -max(6, s1 + p2 + 1)) + var intDig = min(DecimalType.MAX_SCALE, p1 - s1 + s2) + var decDig = min(DecimalType.MAX_SCALE, max(6, s1 + p2 + 1)) + val diff = (intDig + decDig) - DecimalType.MAX_SCALE + if (diff > 0) { +decDig -= diff / 2 + 1 +intDig = DecimalType.MAX_SCALE - decDig + } + val resultType = DecimalType.bounded(intDig + decDig, decDig) val widerType = widerDecimalType(p1, s1, p2, s2) CheckOverflow(Divide(promotePrecision(e1, widerType), promotePrecision(e2, widerType)), resultType) http://git-wip-us.apache.org/repos/asf/spark/blob/7467b52e/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala -- diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala index 1e0cc81..820b336 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala @@ -17,15 +17,14 @@ package org.apache.spark.sql.catalyst.analysis +import org.apache.spark.sql.catalyst.dsl.expressions._ +import org.apache.spark.sql.catalyst.dsl.plans._ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.types._ -import org.apache.spark.sql.catalyst.SimpleCatalystConf -import org.apache.spark.sql.catalyst.dsl.expressions._ -import org.apache.spark.sql.catalyst.dsl.plans._ class AnalysisSuite extends AnalysisTest { - import 
TestRelations._ + import org.apache.spark.sql.catalyst.analysis.TestRelations._ test("union project *") { val plan = (1 to 100) @@ -96,7 +95,7 @@ class AnalysisSuite extends AnalysisTest { assert(pl(1).dataType == DoubleType) assert(pl(2).dataType == DoubleType) // StringType will be promoted into Decimal(38, 18) -assert(pl(3).dataType == DecimalType(38, 29)) +assert(pl(3).dataType == DecimalType(38, 22)) assert(pl(4).dataType == DoubleType) } http://git-wip-us.apache.org/repos/asf/spark/blob/7467b52e/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecisionSuite.scala -- diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecisionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecisionSuite.scala index
spark git commit: [SPARK-10245] [SQL] Fix decimal literals with precision < scale
Repository: spark Updated Branches: refs/heads/branch-1.5 6f05b7aeb -> 8925896b1 [SPARK-10245] [SQL] Fix decimal literals with precision < scale In BigDecimal or java.math.BigDecimal, the precision could be smaller than the scale; for example, BigDecimal("0.001") has precision = 1 and scale = 3. But DecimalType requires that the precision be at least as large as the scale, so we should use the maximum of precision and scale when inferring the schema from a decimal literal. Author: Davies Liu Closes #8428 from davies/smaller_decimal. (cherry picked from commit ec89bd840a6862751999d612f586a962cae63f6d) Signed-off-by: Yin Huai Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8925896b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8925896b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8925896b Branch: refs/heads/branch-1.5 Commit: 8925896b1eb0a13d723d38fb263d3bec0a01ec10 Parents: 6f05b7a Author: Davies Liu Authored: Tue Aug 25 14:55:34 2015 -0700 Committer: Yin Huai Committed: Tue Aug 25 14:55:45 2015 -0700 -- .../apache/spark/sql/catalyst/expressions/literals.scala | 7 --- .../sql/catalyst/expressions/LiteralExpressionSuite.scala | 8 +--- .../test/scala/org/apache/spark/sql/SQLQuerySuite.scala | 10 ++ 3 files changed, 19 insertions(+), 6 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/8925896b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala index 34bad23..8c0c5d5 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala @@ -36,9 +36,10 @@ object Literal { case s: Short => Literal(s, ShortType) case s: String => Literal(UTF8String.fromString(s), StringType) case b: Boolean => Literal(b, BooleanType) -case d: BigDecimal => Literal(Decimal(d), DecimalType(d.precision, d.scale)) -case d: java.math.BigDecimal => Literal(Decimal(d), DecimalType(d.precision(), d.scale())) -case d: Decimal => Literal(d, DecimalType(d.precision, d.scale)) +case d: BigDecimal => Literal(Decimal(d), DecimalType(Math.max(d.precision, d.scale), d.scale)) +case d: java.math.BigDecimal => + Literal(Decimal(d), DecimalType(Math.max(d.precision, d.scale), d.scale())) +case d: Decimal => Literal(d, DecimalType(Math.max(d.precision, d.scale), d.scale)) case t: Timestamp => Literal(DateTimeUtils.fromJavaTimestamp(t), TimestampType) case d: Date => Literal(DateTimeUtils.fromJavaDate(d), DateType) case a: Array[Byte] => Literal(a, BinaryType) http://git-wip-us.apache.org/repos/asf/spark/blob/8925896b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala -- diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala index f6404d2..015eb18 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala @@ -83,12 +83,14 @@ class LiteralExpressionSuite extends SparkFunSuite with ExpressionEvalHelper { } test("decimal") { -List(0.0, 1.2, 1.,
5).foreach { d => +List(-0.0001, 0.0, 0.001, 1.2, 1., 5).foreach { d => checkEvaluation(Literal(Decimal(d)), Decimal(d)) checkEvaluation(Literal(Decimal(d.toInt)), Decimal(d.toInt)) checkEvaluation(Literal(Decimal(d.toLong)), Decimal(d.toLong)) - checkEvaluation(Literal(Decimal((d * 1000L).toLong, 10, 1)), -Decimal((d * 1000L).toLong, 10, 1)) + checkEvaluation(Literal(Decimal((d * 1000L).toLong, 10, 3)), +Decimal((d * 1000L).toLong, 10, 3)) + checkEvaluation(Literal(BigDecimal(d.toString)), Decimal(d)) + checkEvaluation(Literal(new java.math.BigDecimal(d.toString)), Decimal(d)) } } http://git-wip-us.apache.org/repos/asf/spark/blob/8925896b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index dcb4e83..aa07665 100644 ---
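A minimal check of the fixed behavior against the catalyst internals touched above (illustrative only; the exact assertion is an assumption, not taken from the commit's tests):

```scala
import org.apache.spark.sql.catalyst.expressions.Literal
import org.apache.spark.sql.types.DecimalType

// BigDecimal("0.001") has precision = 1 but scale = 3; the inferred type now
// uses max(precision, scale) so that precision >= scale always holds.
val lit = Literal(BigDecimal("0.001"))
assert(lit.dataType == DecimalType(3, 3))
```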
spark git commit: [SPARK-10245] [SQL] Fix decimal literals with precision < scale
Repository: spark Updated Branches: refs/heads/master 00ae4be97 -> ec89bd840 [SPARK-10245] [SQL] Fix decimal literals with precision < scale In BigDecimal or java.math.BigDecimal, the precision could be smaller than the scale; for example, BigDecimal("0.001") has precision = 1 and scale = 3. But DecimalType requires that the precision be at least as large as the scale, so we should use the maximum of precision and scale when inferring the schema from a decimal literal. Author: Davies Liu Closes #8428 from davies/smaller_decimal. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ec89bd84 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ec89bd84 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ec89bd84 Branch: refs/heads/master Commit: ec89bd840a6862751999d612f586a962cae63f6d Parents: 00ae4be Author: Davies Liu Authored: Tue Aug 25 14:55:34 2015 -0700 Committer: Yin Huai Committed: Tue Aug 25 14:55:34 2015 -0700 -- .../apache/spark/sql/catalyst/expressions/literals.scala | 7 --- .../sql/catalyst/expressions/LiteralExpressionSuite.scala | 8 +--- .../test/scala/org/apache/spark/sql/SQLQuerySuite.scala | 10 ++ 3 files changed, 19 insertions(+), 6 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/ec89bd84/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala index 34bad23..8c0c5d5 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala @@ -36,9 +36,10 @@ object Literal { case s: Short => Literal(s, ShortType) case s: String => Literal(UTF8String.fromString(s), StringType) case b: Boolean => Literal(b, BooleanType) -case d: BigDecimal => Literal(Decimal(d), DecimalType(d.precision, d.scale)) -case d: java.math.BigDecimal => Literal(Decimal(d), DecimalType(d.precision(), d.scale())) -case d: Decimal => Literal(d, DecimalType(d.precision, d.scale)) +case d: BigDecimal => Literal(Decimal(d), DecimalType(Math.max(d.precision, d.scale), d.scale)) +case d: java.math.BigDecimal => + Literal(Decimal(d), DecimalType(Math.max(d.precision, d.scale), d.scale())) +case d: Decimal => Literal(d, DecimalType(Math.max(d.precision, d.scale), d.scale)) case t: Timestamp => Literal(DateTimeUtils.fromJavaTimestamp(t), TimestampType) case d: Date => Literal(DateTimeUtils.fromJavaDate(d), DateType) case a: Array[Byte] => Literal(a, BinaryType) http://git-wip-us.apache.org/repos/asf/spark/blob/ec89bd84/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala -- diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala index f6404d2..015eb18 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala @@ -83,12 +83,14 @@ class LiteralExpressionSuite extends SparkFunSuite with ExpressionEvalHelper { } test("decimal") { -List(0.0, 1.2, 1., 5).foreach { d => +List(-0.0001, 0.0, 0.001, 1.2, 1., 5).foreach { d => checkEvaluation(Literal(Decimal(d)),
Decimal(d)) checkEvaluation(Literal(Decimal(d.toInt)), Decimal(d.toInt)) checkEvaluation(Literal(Decimal(d.toLong)), Decimal(d.toLong)) - checkEvaluation(Literal(Decimal((d * 1000L).toLong, 10, 1)), -Decimal((d * 1000L).toLong, 10, 1)) + checkEvaluation(Literal(Decimal((d * 1000L).toLong, 10, 3)), +Decimal((d * 1000L).toLong, 10, 3)) + checkEvaluation(Literal(BigDecimal(d.toString)), Decimal(d)) + checkEvaluation(Literal(new java.math.BigDecimal(d.toString)), Decimal(d)) } } http://git-wip-us.apache.org/repos/asf/spark/blob/ec89bd84/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index dcb4e83..aa07665 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/ap
spark git commit: [SPARK-10239] [SPARK-10244] [MLLIB] update since versions in mllib.pmml and mllib.util
Repository: spark Updated Branches: refs/heads/branch-1.5 055387c08 -> 6f05b7aeb [SPARK-10239] [SPARK-10244] [MLLIB] update since versions in mllib.pmml and mllib.util Same as #8421 but for `mllib.pmml` and `mllib.util`. cc dbtsai Author: Xiangrui Meng Closes #8430 from mengxr/SPARK-10239 and squashes the following commits: a189acf [Xiangrui Meng] update since versions in mllib.pmml and mllib.util (cherry picked from commit 00ae4be97f7b205432db2967ba6d506286ef2ca6) Signed-off-by: DB Tsai Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6f05b7ae Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6f05b7ae Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6f05b7ae Branch: refs/heads/branch-1.5 Commit: 6f05b7aebd66a00e2556a29b35084e81ac526406 Parents: 055387c Author: Xiangrui Meng Authored: Tue Aug 25 14:11:38 2015 -0700 Committer: DB Tsai Committed: Tue Aug 25 14:11:50 2015 -0700 -- .../org/apache/spark/mllib/pmml/PMMLExportable.scala | 7 ++- .../org/apache/spark/mllib/util/DataValidators.scala | 7 +-- .../org/apache/spark/mllib/util/KMeansDataGenerator.scala | 5 - .../org/apache/spark/mllib/util/LinearDataGenerator.scala | 10 -- .../mllib/util/LogisticRegressionDataGenerator.scala | 5 - .../org/apache/spark/mllib/util/MFDataGenerator.scala | 4 +++- .../main/scala/org/apache/spark/mllib/util/MLUtils.scala | 2 ++ .../org/apache/spark/mllib/util/SVMDataGenerator.scala| 6 -- .../scala/org/apache/spark/mllib/util/modelSaveLoad.scala | 6 +- 9 files changed, 41 insertions(+), 11 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/6f05b7ae/mllib/src/main/scala/org/apache/spark/mllib/pmml/PMMLExportable.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/pmml/PMMLExportable.scala b/mllib/src/main/scala/org/apache/spark/mllib/pmml/PMMLExportable.scala index 5e882d4..274ac7c 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/pmml/PMMLExportable.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/pmml/PMMLExportable.scala @@ -23,7 +23,7 @@ import javax.xml.transform.stream.StreamResult import org.jpmml.model.JAXBUtil import org.apache.spark.SparkContext -import org.apache.spark.annotation.{DeveloperApi, Experimental} +import org.apache.spark.annotation.{DeveloperApi, Experimental, Since} import org.apache.spark.mllib.pmml.export.PMMLModelExportFactory /** @@ -33,6 +33,7 @@ import org.apache.spark.mllib.pmml.export.PMMLModelExportFactory * developed by the Data Mining Group (www.dmg.org). 
*/ @DeveloperApi +@Since("1.4.0") trait PMMLExportable { /** @@ -48,6 +49,7 @@ trait PMMLExportable { * Export the model to a local file in PMML format */ @Experimental + @Since("1.4.0") def toPMML(localPath: String): Unit = { toPMML(new StreamResult(new File(localPath))) } @@ -57,6 +59,7 @@ trait PMMLExportable { * Export the model to a directory on a distributed file system in PMML format */ @Experimental + @Since("1.4.0") def toPMML(sc: SparkContext, path: String): Unit = { val pmml = toPMML() sc.parallelize(Array(pmml), 1).saveAsTextFile(path) @@ -67,6 +70,7 @@ trait PMMLExportable { * Export the model to the OutputStream in PMML format */ @Experimental + @Since("1.4.0") def toPMML(outputStream: OutputStream): Unit = { toPMML(new StreamResult(outputStream)) } @@ -76,6 +80,7 @@ trait PMMLExportable { * Export the model to a String in PMML format */ @Experimental + @Since("1.4.0") def toPMML(): String = { val writer = new StringWriter toPMML(new StreamResult(writer)) http://git-wip-us.apache.org/repos/asf/spark/blob/6f05b7ae/mllib/src/main/scala/org/apache/spark/mllib/util/DataValidators.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/DataValidators.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/DataValidators.scala index be335a1..dffe6e7 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/DataValidators.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/DataValidators.scala @@ -17,16 +17,17 @@ package org.apache.spark.mllib.util -import org.apache.spark.annotation.DeveloperApi import org.apache.spark.Logging -import org.apache.spark.rdd.RDD +import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.mllib.regression.LabeledPoint +import org.apache.spark.rdd.RDD /** * :: DeveloperApi :: * A collection of methods used to validate data before applying ML algorithms. */ @DeveloperApi +@Since("0.8.0") object DataValidators extends Logging { /** @
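A minimal sketch of `PMMLExportable` in use (illustrative only, not part of the commit; assumes a `SparkContext` named `sc`; `KMeansModel` mixes in the trait):

```scala
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors

val points = sc.parallelize(Seq(
  Vectors.dense(0.0, 0.0), Vectors.dense(9.0, 9.0)))
val model = KMeans.train(points, k = 2, maxIterations = 10)

model.toPMML("/tmp/kmeans.pmml") // export to a local file, @Since("1.4.0")
val pmml = model.toPMML()        // or get the PMML document as a String
```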
spark git commit: [SPARK-10239] [SPARK-10244] [MLLIB] update since versions in mllib.pmml and mllib.util
Repository: spark Updated Branches: refs/heads/master 920590787 -> 00ae4be97 [SPARK-10239] [SPARK-10244] [MLLIB] update since versions in mllib.pmml and mllib.util Same as #8421 but for `mllib.pmml` and `mllib.util`. cc dbtsai Author: Xiangrui Meng Closes #8430 from mengxr/SPARK-10239 and squashes the following commits: a189acf [Xiangrui Meng] update since versions in mllib.pmml and mllib.util Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/00ae4be9 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/00ae4be9 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/00ae4be9 Branch: refs/heads/master Commit: 00ae4be97f7b205432db2967ba6d506286ef2ca6 Parents: 9205907 Author: Xiangrui Meng Authored: Tue Aug 25 14:11:38 2015 -0700 Committer: DB Tsai Committed: Tue Aug 25 14:11:38 2015 -0700 -- .../org/apache/spark/mllib/pmml/PMMLExportable.scala | 7 ++- .../org/apache/spark/mllib/util/DataValidators.scala | 7 +-- .../org/apache/spark/mllib/util/KMeansDataGenerator.scala | 5 - .../org/apache/spark/mllib/util/LinearDataGenerator.scala | 10 -- .../mllib/util/LogisticRegressionDataGenerator.scala | 5 - .../org/apache/spark/mllib/util/MFDataGenerator.scala | 4 +++- .../main/scala/org/apache/spark/mllib/util/MLUtils.scala | 2 ++ .../org/apache/spark/mllib/util/SVMDataGenerator.scala| 6 -- .../scala/org/apache/spark/mllib/util/modelSaveLoad.scala | 6 +- 9 files changed, 41 insertions(+), 11 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/00ae4be9/mllib/src/main/scala/org/apache/spark/mllib/pmml/PMMLExportable.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/pmml/PMMLExportable.scala b/mllib/src/main/scala/org/apache/spark/mllib/pmml/PMMLExportable.scala index 5e882d4..274ac7c 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/pmml/PMMLExportable.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/pmml/PMMLExportable.scala @@ -23,7 +23,7 @@ import javax.xml.transform.stream.StreamResult import org.jpmml.model.JAXBUtil import org.apache.spark.SparkContext -import org.apache.spark.annotation.{DeveloperApi, Experimental} +import org.apache.spark.annotation.{DeveloperApi, Experimental, Since} import org.apache.spark.mllib.pmml.export.PMMLModelExportFactory /** @@ -33,6 +33,7 @@ import org.apache.spark.mllib.pmml.export.PMMLModelExportFactory * developed by the Data Mining Group (www.dmg.org). 
*/ @DeveloperApi +@Since("1.4.0") trait PMMLExportable { /** @@ -48,6 +49,7 @@ trait PMMLExportable { * Export the model to a local file in PMML format */ @Experimental + @Since("1.4.0") def toPMML(localPath: String): Unit = { toPMML(new StreamResult(new File(localPath))) } @@ -57,6 +59,7 @@ trait PMMLExportable { * Export the model to a directory on a distributed file system in PMML format */ @Experimental + @Since("1.4.0") def toPMML(sc: SparkContext, path: String): Unit = { val pmml = toPMML() sc.parallelize(Array(pmml), 1).saveAsTextFile(path) @@ -67,6 +70,7 @@ trait PMMLExportable { * Export the model to the OutputStream in PMML format */ @Experimental + @Since("1.4.0") def toPMML(outputStream: OutputStream): Unit = { toPMML(new StreamResult(outputStream)) } @@ -76,6 +80,7 @@ trait PMMLExportable { * Export the model to a String in PMML format */ @Experimental + @Since("1.4.0") def toPMML(): String = { val writer = new StringWriter toPMML(new StreamResult(writer)) http://git-wip-us.apache.org/repos/asf/spark/blob/00ae4be9/mllib/src/main/scala/org/apache/spark/mllib/util/DataValidators.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/DataValidators.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/DataValidators.scala index be335a1..dffe6e7 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/DataValidators.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/DataValidators.scala @@ -17,16 +17,17 @@ package org.apache.spark.mllib.util -import org.apache.spark.annotation.DeveloperApi import org.apache.spark.Logging -import org.apache.spark.rdd.RDD +import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.mllib.regression.LabeledPoint +import org.apache.spark.rdd.RDD /** * :: DeveloperApi :: * A collection of methods used to validate data before applying ML algorithms. */ @DeveloperApi +@Since("0.8.0") object DataValidators extends Logging { /** @@ -34,6 +35,7 @@ object DataValidators extends Logging { * * @return True if labels are all ze
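For context, the `toPMML` overloads annotated above are mixed into MLlib models via the `PMMLExportable` trait. A minimal sketch of how they are typically called, assuming a Spark 1.4+ `SparkContext` named `sc` (`KMeansModel` is one model that extends the trait; the paths are illustrative):

```scala
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors

// Train any model that mixes in PMMLExportable; KMeansModel is one such model.
val points = sc.parallelize(Seq(Vectors.dense(1.0, 1.0), Vectors.dense(9.0, 8.0)))
val model = KMeans.train(points, k = 2, maxIterations = 10)

println(model.toPMML())                 // export to a String
model.toPMML("/tmp/kmeans.xml")         // export to a local file (hypothetical path)
model.toPMML(sc, "hdfs:///tmp/kmeans")  // export to a distributed file system
```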
spark git commit: [SPARK-9797] [MLLIB] [DOC] StreamingLinearRegressionWithSGD.setConvergenceTol default value
Repository: spark Updated Branches: refs/heads/branch-1.5 186326df2 -> 055387c08 [SPARK-9797] [MLLIB] [DOC] StreamingLinearRegressionWithSGD.setConvergenceTol default value Adds default convergence tolerance (0.001, set in `GradientDescent.convergenceTol`) to `setConvergenceTol`'s scaladoc Author: Feynman Liang Closes #8424 from feynmanliang/SPARK-9797. (cherry picked from commit 9205907876cf65695e56c2a94bedd83df3675c03) Signed-off-by: Joseph K. Bradley Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/055387c0 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/055387c0 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/055387c0 Branch: refs/heads/branch-1.5 Commit: 055387c087989c8790b6761429b68416ecee3a33 Parents: 186326d Author: Feynman Liang Authored: Tue Aug 25 13:23:15 2015 -0700 Committer: Joseph K. Bradley Committed: Tue Aug 25 13:23:25 2015 -0700 -- .../spark/mllib/regression/StreamingLinearRegressionWithSGD.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/055387c0/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearRegressionWithSGD.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearRegressionWithSGD.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearRegressionWithSGD.scala index 537a052..26654e4 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearRegressionWithSGD.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearRegressionWithSGD.scala @@ -93,7 +93,7 @@ class StreamingLinearRegressionWithSGD private[mllib] ( } /** - * Set the convergence tolerance. + * Set the convergence tolerance. Default: 0.001. */ def setConvergenceTol(tolerance: Double): this.type = { this.algorithm.optimizer.setConvergenceTol(tolerance) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-9797] [MLLIB] [DOC] StreamingLinearRegressionWithSGD.setConvergenceTol default value
Repository: spark Updated Branches: refs/heads/master c619c7552 -> 920590787 [SPARK-9797] [MLLIB] [DOC] StreamingLinearRegressionWithSGD.setConvergenceTol default value Adds default convergence tolerance (0.001, set in `GradientDescent.convergenceTol`) to `setConvergenceTol`'s scaladoc Author: Feynman Liang Closes #8424 from feynmanliang/SPARK-9797. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/92059078 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/92059078 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/92059078 Branch: refs/heads/master Commit: 9205907876cf65695e56c2a94bedd83df3675c03 Parents: c619c75 Author: Feynman Liang Authored: Tue Aug 25 13:23:15 2015 -0700 Committer: Joseph K. Bradley Committed: Tue Aug 25 13:23:15 2015 -0700 -- .../spark/mllib/regression/StreamingLinearRegressionWithSGD.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/92059078/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearRegressionWithSGD.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearRegressionWithSGD.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearRegressionWithSGD.scala index 537a052..26654e4 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearRegressionWithSGD.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearRegressionWithSGD.scala @@ -93,7 +93,7 @@ class StreamingLinearRegressionWithSGD private[mllib] ( } /** - * Set the convergence tolerance. + * Set the convergence tolerance. Default: 0.001. */ def setConvergenceTol(tolerance: Double): this.type = { this.algorithm.optimizer.setConvergenceTol(tolerance) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
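A minimal sketch of where this newly documented default shows up in user code, assuming Spark 1.5 MLlib (the vector size 3 is arbitrary): leaving `setConvergenceTol` uncalled is equivalent to the explicit call below.

```scala
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.StreamingLinearRegressionWithSGD

val model = new StreamingLinearRegressionWithSGD()
  .setInitialWeights(Vectors.zeros(3))
  .setConvergenceTol(0.001) // same as the default taken from GradientDescent.convergenceTol
```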
spark git commit: [SPARK-10237] [MLLIB] update since versions in mllib.fpm
Repository: spark Updated Branches: refs/heads/master c0e9ff158 -> c619c7552 [SPARK-10237] [MLLIB] update since versions in mllib.fpm Same as #8421 but for `mllib.fpm`. cc feynmanliang Author: Xiangrui Meng Closes #8429 from mengxr/SPARK-10237. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c619c755 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c619c755 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c619c755 Branch: refs/heads/master Commit: c619c7552f22d28cfa321ce671fc9ca854dd655f Parents: c0e9ff1 Author: Xiangrui Meng Authored: Tue Aug 25 13:22:38 2015 -0700 Committer: Xiangrui Meng Committed: Tue Aug 25 13:22:38 2015 -0700 -- .../spark/mllib/fpm/AssociationRules.scala | 7 -- .../org/apache/spark/mllib/fpm/FPGrowth.scala | 9 ++-- .../org/apache/spark/mllib/fpm/PrefixSpan.scala | 23 +--- 3 files changed, 32 insertions(+), 7 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/c619c755/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala b/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala index ba3b447..95c688c 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala @@ -82,12 +82,15 @@ class AssociationRules private[fpm] ( }.filter(_.confidence >= minConfidence) } + /** Java-friendly version of [[run]]. */ + @Since("1.5.0") def run[Item](freqItemsets: JavaRDD[FreqItemset[Item]]): JavaRDD[Rule[Item]] = { val tag = fakeClassTag[Item] run(freqItemsets.rdd)(tag) } } +@Since("1.5.0") object AssociationRules { /** @@ -104,8 +107,8 @@ object AssociationRules { @Since("1.5.0") @Experimental class Rule[Item] private[fpm] ( - val antecedent: Array[Item], - val consequent: Array[Item], + @Since("1.5.0") val antecedent: Array[Item], + @Since("1.5.0") val consequent: Array[Item], freqUnion: Double, freqAntecedent: Double) extends Serializable { http://git-wip-us.apache.org/repos/asf/spark/blob/c619c755/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala b/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala index e37f806..aea5c4f 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala @@ -42,7 +42,8 @@ import org.apache.spark.storage.StorageLevel */ @Since("1.3.0") @Experimental -class FPGrowthModel[Item: ClassTag](val freqItemsets: RDD[FreqItemset[Item]]) extends Serializable { +class FPGrowthModel[Item: ClassTag] @Since("1.3.0") ( +@Since("1.3.0") val freqItemsets: RDD[FreqItemset[Item]]) extends Serializable { /** * Generates association rules for the [[Item]]s in [[freqItemsets]]. * @param confidence minimal confidence of the rules produced @@ -126,6 +127,8 @@ class FPGrowth private ( new FPGrowthModel(freqItemsets) } + /** Java-friendly version of [[run]]. 
*/ + @Since("1.3.0") def run[Item, Basket <: JavaIterable[Item]](data: JavaRDD[Basket]): FPGrowthModel[Item] = { implicit val tag = fakeClassTag[Item] run(data.rdd.map(_.asScala.toArray)) @@ -226,7 +229,9 @@ object FPGrowth { * */ @Since("1.3.0") - class FreqItemset[Item](val items: Array[Item], val freq: Long) extends Serializable { + class FreqItemset[Item] @Since("1.3.0") ( + @Since("1.3.0") val items: Array[Item], + @Since("1.3.0") val freq: Long) extends Serializable { /** * Returns items in a Java List. http://git-wip-us.apache.org/repos/asf/spark/blob/c619c755/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala b/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala index dc4ae1d..97916da 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala @@ -25,7 +25,7 @@ import scala.collection.JavaConverters._ import scala.reflect.ClassTag import org.apache.spark.Logging -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.api.java.JavaRDD import org.apache.spark.api.java.JavaSparkContext.fakeClassTag import org.apac
spark git commit: [SPARK-10237] [MLLIB] update since versions in mllib.fpm
Repository: spark Updated Branches: refs/heads/branch-1.5 95e44b4df -> 186326df2 [SPARK-10237] [MLLIB] update since versions in mllib.fpm Same as #8421 but for `mllib.fpm`. cc feynmanliang Author: Xiangrui Meng Closes #8429 from mengxr/SPARK-10237. (cherry picked from commit c619c7552f22d28cfa321ce671fc9ca854dd655f) Signed-off-by: Xiangrui Meng Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/186326df Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/186326df Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/186326df Branch: refs/heads/branch-1.5 Commit: 186326df21daf8d8271a522f2569eb5cd7be1442 Parents: 95e44b4 Author: Xiangrui Meng Authored: Tue Aug 25 13:22:38 2015 -0700 Committer: Xiangrui Meng Committed: Tue Aug 25 13:22:45 2015 -0700 -- .../spark/mllib/fpm/AssociationRules.scala | 7 -- .../org/apache/spark/mllib/fpm/FPGrowth.scala | 9 ++-- .../org/apache/spark/mllib/fpm/PrefixSpan.scala | 23 +--- 3 files changed, 32 insertions(+), 7 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/186326df/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala b/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala index ba3b447..95c688c 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala @@ -82,12 +82,15 @@ class AssociationRules private[fpm] ( }.filter(_.confidence >= minConfidence) } + /** Java-friendly version of [[run]]. */ + @Since("1.5.0") def run[Item](freqItemsets: JavaRDD[FreqItemset[Item]]): JavaRDD[Rule[Item]] = { val tag = fakeClassTag[Item] run(freqItemsets.rdd)(tag) } } +@Since("1.5.0") object AssociationRules { /** @@ -104,8 +107,8 @@ object AssociationRules { @Since("1.5.0") @Experimental class Rule[Item] private[fpm] ( - val antecedent: Array[Item], - val consequent: Array[Item], + @Since("1.5.0") val antecedent: Array[Item], + @Since("1.5.0") val consequent: Array[Item], freqUnion: Double, freqAntecedent: Double) extends Serializable { http://git-wip-us.apache.org/repos/asf/spark/blob/186326df/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala b/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala index e37f806..aea5c4f 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala @@ -42,7 +42,8 @@ import org.apache.spark.storage.StorageLevel */ @Since("1.3.0") @Experimental -class FPGrowthModel[Item: ClassTag](val freqItemsets: RDD[FreqItemset[Item]]) extends Serializable { +class FPGrowthModel[Item: ClassTag] @Since("1.3.0") ( +@Since("1.3.0") val freqItemsets: RDD[FreqItemset[Item]]) extends Serializable { /** * Generates association rules for the [[Item]]s in [[freqItemsets]]. * @param confidence minimal confidence of the rules produced @@ -126,6 +127,8 @@ class FPGrowth private ( new FPGrowthModel(freqItemsets) } + /** Java-friendly version of [[run]]. 
*/ + @Since("1.3.0") def run[Item, Basket <: JavaIterable[Item]](data: JavaRDD[Basket]): FPGrowthModel[Item] = { implicit val tag = fakeClassTag[Item] run(data.rdd.map(_.asScala.toArray)) @@ -226,7 +229,9 @@ object FPGrowth { * */ @Since("1.3.0") - class FreqItemset[Item](val items: Array[Item], val freq: Long) extends Serializable { + class FreqItemset[Item] @Since("1.3.0") ( + @Since("1.3.0") val items: Array[Item], + @Since("1.3.0") val freq: Long) extends Serializable { /** * Returns items in a Java List. http://git-wip-us.apache.org/repos/asf/spark/blob/186326df/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala b/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala index dc4ae1d..97916da 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala @@ -25,7 +25,7 @@ import scala.collection.JavaConverters._ import scala.reflect.ClassTag import org.apache.spark.Logging -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.ap
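The `FreqItemset` and `FPGrowthModel` members annotated in this diff are the ones user code touches directly. A small sketch, assuming a `SparkContext` named `sc` and made-up transactions:

```scala
import org.apache.spark.mllib.fpm.FPGrowth

val transactions = sc.parallelize(Seq(
  Array("a", "b", "c"),
  Array("a", "b"),
  Array("b", "c")))

val model = new FPGrowth().setMinSupport(0.5).setNumPartitions(2).run(transactions)
model.freqItemsets.collect().foreach { itemset =>
  // items and freq are the @Since("1.3.0") constructor vals annotated above
  println(itemset.items.mkString("[", ",", "]") + " -> " + itemset.freq)
}
```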
spark git commit: [SPARK-9800] Adds docs for GradientDescent$.runMiniBatchSGD alias
Repository: spark Updated Branches: refs/heads/master 71a138cd0 -> c0e9ff158 [SPARK-9800] Adds docs for GradientDescent$.runMiniBatchSGD alias * Adds doc for alias of runMiniBatchSGD documenting default value for convergenceTol * Cleans up a note in code Author: Feynman Liang Closes #8425 from feynmanliang/SPARK-9800. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c0e9ff15 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c0e9ff15 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c0e9ff15 Branch: refs/heads/master Commit: c0e9ff1588b4d9313cc6ec6e00e5c7663eb67910 Parents: 71a138c Author: Feynman Liang Authored: Tue Aug 25 13:21:05 2015 -0700 Committer: Joseph K. Bradley Committed: Tue Aug 25 13:21:05 2015 -0700 -- .../org/apache/spark/mllib/optimization/GradientDescent.scala | 5 - 1 file changed, 4 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/c0e9ff15/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala b/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala index 8f0d1e4..3b663b5 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala @@ -235,7 +235,7 @@ object GradientDescent extends Logging { if (miniBatchSize > 0) { /** - * NOTE(Xinghao): lossSum is computed using the weights from the previous iteration + * lossSum is computed using the weights from the previous iteration * and regVal is the regularization value computed in the previous iteration as well. */ stochasticLossHistory.append(lossSum / miniBatchSize + regVal) @@ -264,6 +264,9 @@ object GradientDescent extends Logging { } + /** + * Alias of [[runMiniBatchSGD]] with convergenceTol set to default value of 0.001. + */ def runMiniBatchSGD( data: RDD[(Double, Vector)], gradient: Gradient, - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-9800] Adds docs for GradientDescent$.runMiniBatchSGD alias
Repository: spark Updated Branches: refs/heads/branch-1.5 5a32ed75c -> 95e44b4df [SPARK-9800] Adds docs for GradientDescent$.runMiniBatchSGD alias * Adds doc for alias of runMiniBatchSGD documenting default value for convergenceTol * Cleans up a note in code Author: Feynman Liang Closes #8425 from feynmanliang/SPARK-9800. (cherry picked from commit c0e9ff1588b4d9313cc6ec6e00e5c7663eb67910) Signed-off-by: Joseph K. Bradley Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/95e44b4d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/95e44b4d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/95e44b4d Branch: refs/heads/branch-1.5 Commit: 95e44b4df81b09803be2fde8c4e2566be0c8fdbc Parents: 5a32ed7 Author: Feynman Liang Authored: Tue Aug 25 13:21:05 2015 -0700 Committer: Joseph K. Bradley Committed: Tue Aug 25 13:21:16 2015 -0700 -- .../org/apache/spark/mllib/optimization/GradientDescent.scala | 5 - 1 file changed, 4 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/95e44b4d/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala b/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala index 8f0d1e4..3b663b5 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala @@ -235,7 +235,7 @@ object GradientDescent extends Logging { if (miniBatchSize > 0) { /** - * NOTE(Xinghao): lossSum is computed using the weights from the previous iteration + * lossSum is computed using the weights from the previous iteration * and regVal is the regularization value computed in the previous iteration as well. */ stochasticLossHistory.append(lossSum / miniBatchSize + regVal) @@ -264,6 +264,9 @@ object GradientDescent extends Logging { } + /** + * Alias of [[runMiniBatchSGD]] with convergenceTol set to default value of 0.001. + */ def runMiniBatchSGD( data: RDD[(Double, Vector)], gradient: Gradient, - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
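The alias documented here is the `runMiniBatchSGD` overload without a `convergenceTol` argument. A sketch of a call site, assuming an existing `RDD[(Double, Vector)]` named `data` and a feature count `numFeatures` (both hypothetical):

```scala
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.optimization.{GradientDescent, LogisticGradient, SquaredL2Updater}

// Calls the alias: convergenceTol is implicitly 0.001 here.
val (weights, lossHistory) = GradientDescent.runMiniBatchSGD(
  data,
  new LogisticGradient(),
  new SquaredL2Updater(),
  1.0,  // stepSize
  100,  // numIterations
  0.1,  // regParam
  1.0,  // miniBatchFraction
  Vectors.zeros(numFeatures))
```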
spark git commit: [SPARK-10048] [SPARKR] Support arbitrary nested Java array in serde.
Repository: spark Updated Branches: refs/heads/master 16a2be1a8 -> 71a138cd0 [SPARK-10048] [SPARKR] Support arbitrary nested Java array in serde. This PR: 1. supports transferring arbitrary nested array from JVM to R side in SerDe; 2. based on 1, collect() implementation is improved. Now it can support collecting data of complex types from a DataFrame. Author: Sun Rui Closes #8276 from sun-rui/SPARK-10048. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/71a138cd Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/71a138cd Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/71a138cd Branch: refs/heads/master Commit: 71a138cd0e0a14e8426f97877e3b52a562bbd02c Parents: 16a2be1 Author: Sun Rui Authored: Tue Aug 25 13:14:10 2015 -0700 Committer: Shivaram Venkataraman Committed: Tue Aug 25 13:14:10 2015 -0700 -- R/pkg/R/DataFrame.R | 55 ++--- R/pkg/R/deserialize.R | 72 +++- R/pkg/R/serialize.R | 10 +-- R/pkg/inst/tests/test_Serde.R | 77 ++ R/pkg/inst/worker/worker.R | 4 +- .../apache/spark/api/r/RBackendHandler.scala| 7 ++ .../scala/org/apache/spark/api/r/SerDe.scala| 86 .../org/apache/spark/sql/api/r/SQLUtils.scala | 32 +--- 8 files changed, 216 insertions(+), 127 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/71a138cd/R/pkg/R/DataFrame.R -- diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 10f3c4e..ae1d912 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -652,18 +652,49 @@ setMethod("dim", setMethod("collect", signature(x = "DataFrame"), function(x, stringsAsFactors = FALSE) { -# listCols is a list of raw vectors, one per column -listCols <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "dfToCols", x@sdf) -cols <- lapply(listCols, function(col) { - objRaw <- rawConnection(col) - numRows <- readInt(objRaw) - col <- readCol(objRaw, numRows) - close(objRaw) - col -}) -names(cols) <- columns(x) -do.call(cbind.data.frame, list(cols, stringsAsFactors = stringsAsFactors)) - }) +names <- columns(x) +ncol <- length(names) +if (ncol <= 0) { + # empty data.frame with 0 columns and 0 rows + data.frame() +} else { + # listCols is a list of columns + listCols <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "dfToCols", x@sdf) + stopifnot(length(listCols) == ncol) + + # An empty data.frame with 0 columns and number of rows as collected + nrow <- length(listCols[[1]]) + if (nrow <= 0) { +df <- data.frame() + } else { +df <- data.frame(row.names = 1 : nrow) + } + + # Append columns one by one + for (colIndex in 1 : ncol) { +# Note: appending a column of list type into a data.frame so that +# data of complex type can be held. But getting a cell from a column +# of list type returns a list instead of a vector. So for columns of +# non-complex type, append them as vector. +col <- listCols[[colIndex]] +if (length(col) <= 0) { + df[[names[colIndex]]] <- col +} else { + # TODO: more robust check on column of primitive types + vec <- do.call(c, col) + if (class(vec) != "list") { +df[[names[colIndex]]] <- vec + } else { +# For columns of complex type, be careful to access them. +# Get a column of complex type returns a list. +# Get a cell from a column of complex type returns a list instead of a vector. 
+df[[names[colIndex]]] <- col + } + } +} +df + } +}) #' Limit #' http://git-wip-us.apache.org/repos/asf/spark/blob/71a138cd/R/pkg/R/deserialize.R -- diff --git a/R/pkg/R/deserialize.R b/R/pkg/R/deserialize.R index 33bf13e..6cf628e 100644 --- a/R/pkg/R/deserialize.R +++ b/R/pkg/R/deserialize.R @@ -48,6 +48,7 @@ readTypedObject <- function(con, type) { "r" = readRaw(con), "D" = readDat
spark git commit: [SPARK-10231] [MLLIB] update @Since annotation for mllib.classification
Repository: spark Updated Branches: refs/heads/branch-1.5 c740f5dd2 -> 5a32ed75c [SPARK-10231] [MLLIB] update @Since annotation for mllib.classification Update `Since` annotation in `mllib.classification`: 1. add version to classes, objects, constructors, and public variables declared in constructors 2. correct some versions 3. remove `Since` on `toString` MechCoder dbtsai Author: Xiangrui Meng Closes #8421 from mengxr/SPARK-10231 and squashes the following commits: b2dce80 [Xiangrui Meng] update @Since annotation for mllib.classification (cherry picked from commit 16a2be1a84c0a274a60c0a584faaf58b55d4942b) Signed-off-by: DB Tsai Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5a32ed75 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5a32ed75 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5a32ed75 Branch: refs/heads/branch-1.5 Commit: 5a32ed75c939dc42886ea940aba2b14b89e9f40e Parents: c740f5d Author: Xiangrui Meng Authored: Tue Aug 25 12:16:23 2015 -0700 Committer: DB Tsai Committed: Tue Aug 25 12:16:41 2015 -0700 -- .../classification/ClassificationModel.scala| 7 ++--- .../classification/LogisticRegression.scala | 20 +- .../spark/mllib/classification/NaiveBayes.scala | 28 +++- .../apache/spark/mllib/classification/SVM.scala | 15 +++ .../StreamingLogisticRegressionWithSGD.scala| 9 ++- 5 files changed, 58 insertions(+), 21 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/5a32ed75/mllib/src/main/scala/org/apache/spark/mllib/classification/ClassificationModel.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/ClassificationModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/ClassificationModel.scala index a29b425..85a4132 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/classification/ClassificationModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/ClassificationModel.scala @@ -30,6 +30,7 @@ import org.apache.spark.rdd.RDD * belongs. The categories are represented by double values: 0.0, 1.0, 2.0, etc. */ @Experimental +@Since("0.8.0") trait ClassificationModel extends Serializable { /** * Predict values for the given data set using the model trained. 
@@ -37,7 +38,7 @@ trait ClassificationModel extends Serializable { * @param testData RDD representing data points to be predicted * @return an RDD[Double] where each entry contains the corresponding prediction */ - @Since("0.8.0") + @Since("1.0.0") def predict(testData: RDD[Vector]): RDD[Double] /** @@ -46,7 +47,7 @@ trait ClassificationModel extends Serializable { * @param testData array representing a single data point * @return predicted category from the trained model */ - @Since("0.8.0") + @Since("1.0.0") def predict(testData: Vector): Double /** @@ -54,7 +55,7 @@ trait ClassificationModel extends Serializable { * @param testData JavaRDD representing data points to be predicted * @return a JavaRDD[java.lang.Double] where each entry contains the corresponding prediction */ - @Since("0.8.0") + @Since("1.0.0") def predict(testData: JavaRDD[Vector]): JavaRDD[java.lang.Double] = predict(testData.rdd).toJavaRDD().asInstanceOf[JavaRDD[java.lang.Double]] } http://git-wip-us.apache.org/repos/asf/spark/blob/5a32ed75/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala index e03e662..5ceff5b 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala @@ -41,11 +41,12 @@ import org.apache.spark.rdd.RDD * Multinomial Logistic Regression. By default, it is binary logistic regression * so numClasses will be set to 2. */ -class LogisticRegressionModel ( -override val weights: Vector, -override val intercept: Double, -val numFeatures: Int, -val numClasses: Int) +@Since("0.8.0") +class LogisticRegressionModel @Since("1.3.0") ( +@Since("1.0.0") override val weights: Vector, +@Since("1.0.0") override val intercept: Double, +@Since("1.3.0") val numFeatures: Int, +@Since("1.3.0") val numClasses: Int) extends GeneralizedLinearModel(weights, intercept) with ClassificationModel with Serializable with Saveable with PMMLExportable { @@ -75,6
spark git commit: [SPARK-10231] [MLLIB] update @Since annotation for mllib.classification
Repository: spark Updated Branches: refs/heads/master 881208a8e -> 16a2be1a8 [SPARK-10231] [MLLIB] update @Since annotation for mllib.classification Update `Since` annotation in `mllib.classification`: 1. add version to classes, objects, constructors, and public variables declared in constructors 2. correct some versions 3. remove `Since` on `toString` MechCoder dbtsai Author: Xiangrui Meng Closes #8421 from mengxr/SPARK-10231 and squashes the following commits: b2dce80 [Xiangrui Meng] update @Since annotation for mllib.classification Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/16a2be1a Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/16a2be1a Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/16a2be1a Branch: refs/heads/master Commit: 16a2be1a84c0a274a60c0a584faaf58b55d4942b Parents: 881208a Author: Xiangrui Meng Authored: Tue Aug 25 12:16:23 2015 -0700 Committer: DB Tsai Committed: Tue Aug 25 12:16:23 2015 -0700 -- .../classification/ClassificationModel.scala| 7 ++--- .../classification/LogisticRegression.scala | 20 +- .../spark/mllib/classification/NaiveBayes.scala | 28 +++- .../apache/spark/mllib/classification/SVM.scala | 15 +++ .../StreamingLogisticRegressionWithSGD.scala| 9 ++- 5 files changed, 58 insertions(+), 21 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/16a2be1a/mllib/src/main/scala/org/apache/spark/mllib/classification/ClassificationModel.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/ClassificationModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/ClassificationModel.scala index a29b425..85a4132 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/classification/ClassificationModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/ClassificationModel.scala @@ -30,6 +30,7 @@ import org.apache.spark.rdd.RDD * belongs. The categories are represented by double values: 0.0, 1.0, 2.0, etc. */ @Experimental +@Since("0.8.0") trait ClassificationModel extends Serializable { /** * Predict values for the given data set using the model trained. 
@@ -37,7 +38,7 @@ trait ClassificationModel extends Serializable { * @param testData RDD representing data points to be predicted * @return an RDD[Double] where each entry contains the corresponding prediction */ - @Since("0.8.0") + @Since("1.0.0") def predict(testData: RDD[Vector]): RDD[Double] /** @@ -46,7 +47,7 @@ trait ClassificationModel extends Serializable { * @param testData array representing a single data point * @return predicted category from the trained model */ - @Since("0.8.0") + @Since("1.0.0") def predict(testData: Vector): Double /** @@ -54,7 +55,7 @@ trait ClassificationModel extends Serializable { * @param testData JavaRDD representing data points to be predicted * @return a JavaRDD[java.lang.Double] where each entry contains the corresponding prediction */ - @Since("0.8.0") + @Since("1.0.0") def predict(testData: JavaRDD[Vector]): JavaRDD[java.lang.Double] = predict(testData.rdd).toJavaRDD().asInstanceOf[JavaRDD[java.lang.Double]] } http://git-wip-us.apache.org/repos/asf/spark/blob/16a2be1a/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala index e03e662..5ceff5b 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala @@ -41,11 +41,12 @@ import org.apache.spark.rdd.RDD * Multinomial Logistic Regression. By default, it is binary logistic regression * so numClasses will be set to 2. */ -class LogisticRegressionModel ( -override val weights: Vector, -override val intercept: Double, -val numFeatures: Int, -val numClasses: Int) +@Since("0.8.0") +class LogisticRegressionModel @Since("1.3.0") ( +@Since("1.0.0") override val weights: Vector, +@Since("1.0.0") override val intercept: Double, +@Since("1.3.0") val numFeatures: Int, +@Since("1.3.0") val numClasses: Int) extends GeneralizedLinearModel(weights, intercept) with ClassificationModel with Serializable with Saveable with PMMLExportable { @@ -75,6 +76,7 @@ class LogisticRegressionModel ( /** * Constructs a [[LogisticRegressionModel]] with we
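The annotation pattern being applied is easiest to see in isolation. A schematic sketch of the convention from the `LogisticRegressionModel` hunk above (names are illustrative, and `Since` is a Spark-internal annotation in this era, so this only compiles inside the Spark source tree): version the class, the primary constructor, and each public constructor parameter.

```scala
import org.apache.spark.annotation.Since

@Since("0.8.0")                       // version of the class itself
class ExampleModel @Since("1.3.0") (  // version of this constructor
    @Since("1.0.0") val weights: Array[Double],
    @Since("1.3.0") val numClasses: Int) extends Serializable
```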
spark git commit: [SPARK-10230] [MLLIB] Rename optimizeAlpha to optimizeDocConcentration
Repository: spark Updated Branches: refs/heads/branch-1.5 742c82ed9 -> c740f5dd2 [SPARK-10230] [MLLIB] Rename optimizeAlpha to optimizeDocConcentration See [discussion](https://github.com/apache/spark/pull/8254#discussion_r37837770) CC jkbradley Author: Feynman Liang Closes #8422 from feynmanliang/SPARK-10230. (cherry picked from commit 881208a8e849facf54166bdd69d3634407f952e7) Signed-off-by: Joseph K. Bradley Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c740f5dd Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c740f5dd Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c740f5dd Branch: refs/heads/branch-1.5 Commit: c740f5dd20459b491a8c088383c19c11a76c225d Parents: 742c82e Author: Feynman Liang Authored: Tue Aug 25 11:58:47 2015 -0700 Committer: Joseph K. Bradley Committed: Tue Aug 25 11:58:55 2015 -0700 -- .../spark/mllib/clustering/LDAOptimizer.scala | 16 .../apache/spark/mllib/clustering/LDASuite.scala| 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/c740f5dd/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala index 5c2aae6..38486e9 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala @@ -258,7 +258,7 @@ final class OnlineLDAOptimizer extends LDAOptimizer { private var tau0: Double = 1024 private var kappa: Double = 0.51 private var miniBatchFraction: Double = 0.05 - private var optimizeAlpha: Boolean = false + private var optimizeDocConcentration: Boolean = false // internal data structure private var docs: RDD[(Long, Vector)] = null @@ -335,20 +335,20 @@ final class OnlineLDAOptimizer extends LDAOptimizer { } /** - * Optimize alpha, indicates whether alpha (Dirichlet parameter for document-topic distribution) - * will be optimized during training. + * Optimize docConcentration, indicates whether docConcentration (Dirichlet parameter for + * document-topic distribution) will be optimized during training. */ @Since("1.5.0") - def getOptimzeAlpha: Boolean = this.optimizeAlpha + def getOptimizeDocConcentration: Boolean = this.optimizeDocConcentration /** - * Sets whether to optimize alpha parameter during training. + * Sets whether to optimize docConcentration parameter during training. 
* * Default: false */ @Since("1.5.0") - def setOptimzeAlpha(optimizeAlpha: Boolean): this.type = { -this.optimizeAlpha = optimizeAlpha + def setOptimizeDocConcentration(optimizeDocConcentration: Boolean): this.type = { +this.optimizeDocConcentration = optimizeDocConcentration this } @@ -458,7 +458,7 @@ final class OnlineLDAOptimizer extends LDAOptimizer { // Note that this is an optimization to avoid batch.count updateLambda(batchResult, (miniBatchFraction * corpusSize).ceil.toInt) -if (optimizeAlpha) updateAlpha(gammat) +if (optimizeDocConcentration) updateAlpha(gammat) this } http://git-wip-us.apache.org/repos/asf/spark/blob/c740f5dd/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala -- diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala index 8a714f9..746a76a 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala @@ -423,7 +423,7 @@ class LDASuite extends SparkFunSuite with MLlibTestSparkContext { val k = 2 val docs = sc.parallelize(toyData) val op = new OnlineLDAOptimizer().setMiniBatchFraction(1).setTau0(1024).setKappa(0.51) - .setGammaShape(100).setOptimzeAlpha(true).setSampleWithReplacement(false) + .setGammaShape(100).setOptimizeDocConcentration(true).setSampleWithReplacement(false) val lda = new LDA().setK(k) .setDocConcentration(1D / k) .setTopicConcentration(0.01) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-10230] [MLLIB] Rename optimizeAlpha to optimizeDocConcentration
Repository: spark Updated Branches: refs/heads/master b37f0cc1b -> 881208a8e [SPARK-10230] [MLLIB] Rename optimizeAlpha to optimizeDocConcentration See [discussion](https://github.com/apache/spark/pull/8254#discussion_r37837770) CC jkbradley Author: Feynman Liang Closes #8422 from feynmanliang/SPARK-10230. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/881208a8 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/881208a8 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/881208a8 Branch: refs/heads/master Commit: 881208a8e849facf54166bdd69d3634407f952e7 Parents: b37f0cc Author: Feynman Liang Authored: Tue Aug 25 11:58:47 2015 -0700 Committer: Joseph K. Bradley Committed: Tue Aug 25 11:58:47 2015 -0700 -- .../spark/mllib/clustering/LDAOptimizer.scala | 16 .../apache/spark/mllib/clustering/LDASuite.scala| 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/881208a8/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala index 5c2aae6..38486e9 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala @@ -258,7 +258,7 @@ final class OnlineLDAOptimizer extends LDAOptimizer { private var tau0: Double = 1024 private var kappa: Double = 0.51 private var miniBatchFraction: Double = 0.05 - private var optimizeAlpha: Boolean = false + private var optimizeDocConcentration: Boolean = false // internal data structure private var docs: RDD[(Long, Vector)] = null @@ -335,20 +335,20 @@ final class OnlineLDAOptimizer extends LDAOptimizer { } /** - * Optimize alpha, indicates whether alpha (Dirichlet parameter for document-topic distribution) - * will be optimized during training. + * Optimize docConcentration, indicates whether docConcentration (Dirichlet parameter for + * document-topic distribution) will be optimized during training. */ @Since("1.5.0") - def getOptimzeAlpha: Boolean = this.optimizeAlpha + def getOptimizeDocConcentration: Boolean = this.optimizeDocConcentration /** - * Sets whether to optimize alpha parameter during training. + * Sets whether to optimize docConcentration parameter during training. 
* * Default: false */ @Since("1.5.0") - def setOptimzeAlpha(optimizeAlpha: Boolean): this.type = { -this.optimizeAlpha = optimizeAlpha + def setOptimizeDocConcentration(optimizeDocConcentration: Boolean): this.type = { +this.optimizeDocConcentration = optimizeDocConcentration this } @@ -458,7 +458,7 @@ final class OnlineLDAOptimizer extends LDAOptimizer { // Note that this is an optimization to avoid batch.count updateLambda(batchResult, (miniBatchFraction * corpusSize).ceil.toInt) -if (optimizeAlpha) updateAlpha(gammat) +if (optimizeDocConcentration) updateAlpha(gammat) this } http://git-wip-us.apache.org/repos/asf/spark/blob/881208a8/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala -- diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala index 8a714f9..746a76a 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala @@ -423,7 +423,7 @@ class LDASuite extends SparkFunSuite with MLlibTestSparkContext { val k = 2 val docs = sc.parallelize(toyData) val op = new OnlineLDAOptimizer().setMiniBatchFraction(1).setTau0(1024).setKappa(0.51) - .setGammaShape(100).setOptimzeAlpha(true).setSampleWithReplacement(false) + .setGammaShape(100).setOptimizeDocConcentration(true).setSampleWithReplacement(false) val lda = new LDA().setK(k) .setDocConcentration(1D / k) .setTopicConcentration(0.01) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
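A sketch of the renamed setter in use, assuming a document corpus `RDD[(Long, Vector)]` named `corpus` (hypothetical):

```scala
import org.apache.spark.mllib.clustering.{LDA, OnlineLDAOptimizer}

val optimizer = new OnlineLDAOptimizer()
  .setMiniBatchFraction(0.05)
  .setOptimizeDocConcentration(true) // formerly setOptimzeAlpha, renamed by this patch

val ldaModel = new LDA()
  .setK(10)
  .setOptimizer(optimizer)
  .run(corpus)
```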
spark git commit: [SPARK-8531] [ML] Update ML user guide for MinMaxScaler
Repository: spark Updated Branches: refs/heads/master 5c08c86bf -> b37f0cc1b [SPARK-8531] [ML] Update ML user guide for MinMaxScaler jira: https://issues.apache.org/jira/browse/SPARK-8531 Update ML user guide for MinMaxScaler Author: Yuhao Yang Author: unknown Closes #7211 from hhbyyh/minmaxdoc. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b37f0cc1 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b37f0cc1 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b37f0cc1 Branch: refs/heads/master Commit: b37f0cc1b4c064d6f09edb161250fa8b783de52a Parents: 5c08c86 Author: Yuhao Yang Authored: Tue Aug 25 10:54:03 2015 -0700 Committer: Joseph K. Bradley Committed: Tue Aug 25 10:54:03 2015 -0700 -- docs/ml-features.md | 71 1 file changed, 71 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b37f0cc1/docs/ml-features.md -- diff --git a/docs/ml-features.md b/docs/ml-features.md index 642a4b4..62de483 100644 --- a/docs/ml-features.md +++ b/docs/ml-features.md @@ -1133,6 +1133,7 @@ val scaledData = scalerModel.transform(dataFrame) {% highlight java %} import org.apache.spark.api.java.JavaRDD; import org.apache.spark.ml.feature.StandardScaler; +import org.apache.spark.ml.feature.StandardScalerModel; import org.apache.spark.mllib.regression.LabeledPoint; import org.apache.spark.mllib.util.MLUtils; import org.apache.spark.sql.DataFrame; @@ -1173,6 +1174,76 @@ scaledData = scalerModel.transform(dataFrame) +## MinMaxScaler + +`MinMaxScaler` transforms a dataset of `Vector` rows, rescaling each feature to a specific range (often [0, 1]). It takes parameters: + +* `min`: 0.0 by default. Lower bound after transformation, shared by all features. +* `max`: 1.0 by default. Upper bound after transformation, shared by all features. + +`MinMaxScaler` computes summary statistics on a data set and produces a `MinMaxScalerModel`. The model can then transform each feature individually such that it is in the given range. + +The rescaled value for a feature E is calculated as, +`\begin{equation} + Rescaled(e_i) = \frac{e_i - E_{min}}{E_{max} - E_{min}} * (max - min) + min +\end{equation}` +For the case `E_{max} == E_{min}`, `Rescaled(e_i) = 0.5 * (max + min)` + +Note that since zero values will probably be transformed to non-zero values, output of the transformer will be DenseVector even for sparse input. + +The following example demonstrates how to load a dataset in libsvm format and then rescale each feature to [0, 1]. + + + +More details can be found in the API docs for +[MinMaxScaler](api/scala/index.html#org.apache.spark.ml.feature.MinMaxScaler) and +[MinMaxScalerModel](api/scala/index.html#org.apache.spark.ml.feature.MinMaxScalerModel). +{% highlight scala %} +import org.apache.spark.ml.feature.MinMaxScaler +import org.apache.spark.mllib.util.MLUtils + +val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt") +val dataFrame = sqlContext.createDataFrame(data) +val scaler = new MinMaxScaler() + .setInputCol("features") + .setOutputCol("scaledFeatures") + +// Compute summary statistics and generate MinMaxScalerModel +val scalerModel = scaler.fit(dataFrame) + +// rescale each feature to range [min, max]. +val scaledData = scalerModel.transform(dataFrame) +{% endhighlight %} + + + +More details can be found in the API docs for +[MinMaxScaler](api/java/org/apache/spark/ml/feature/MinMaxScaler.html) and +[MinMaxScalerModel](api/java/org/apache/spark/ml/feature/MinMaxScalerModel.html). 
+{% highlight java %} +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.ml.feature.MinMaxScaler; +import org.apache.spark.ml.feature.MinMaxScalerModel; +import org.apache.spark.mllib.regression.LabeledPoint; +import org.apache.spark.mllib.util.MLUtils; +import org.apache.spark.sql.DataFrame; + +JavaRDD data = + MLUtils.loadLibSVMFile(jsc.sc(), "data/mllib/sample_libsvm_data.txt").toJavaRDD(); +DataFrame dataFrame = jsql.createDataFrame(data, LabeledPoint.class); +MinMaxScaler scaler = new MinMaxScaler() + .setInputCol("features") + .setOutputCol("scaledFeatures"); + +// Compute summary statistics and generate MinMaxScalerModel +MinMaxScalerModel scalerModel = scaler.fit(dataFrame); + +// rescale each feature to range [min, max]. +DataFrame scaledData = scalerModel.transform(dataFrame); +{% endhighlight %} + + + ## Bucketizer `Bucketizer` transforms a column of continuous features to a column of feature buckets, where the buckets are specified by users. It takes a parameter: - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional command
spark git commit: [SPARK-8531] [ML] Update ML user guide for MinMaxScaler
Repository: spark Updated Branches: refs/heads/branch-1.5 0402f1297 -> 742c82ed9 [SPARK-8531] [ML] Update ML user guide for MinMaxScaler jira: https://issues.apache.org/jira/browse/SPARK-8531 Update ML user guide for MinMaxScaler Author: Yuhao Yang Author: unknown Closes #7211 from hhbyyh/minmaxdoc. (cherry picked from commit b37f0cc1b4c064d6f09edb161250fa8b783de52a) Signed-off-by: Joseph K. Bradley Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/742c82ed Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/742c82ed Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/742c82ed Branch: refs/heads/branch-1.5 Commit: 742c82ed97ed3fc60d4f17c4363c52062829ea49 Parents: 0402f12 Author: Yuhao Yang Authored: Tue Aug 25 10:54:03 2015 -0700 Committer: Joseph K. Bradley Committed: Tue Aug 25 10:54:12 2015 -0700 -- docs/ml-features.md | 71 1 file changed, 71 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/742c82ed/docs/ml-features.md -- diff --git a/docs/ml-features.md b/docs/ml-features.md index 642a4b4..62de483 100644 --- a/docs/ml-features.md +++ b/docs/ml-features.md @@ -1133,6 +1133,7 @@ val scaledData = scalerModel.transform(dataFrame) {% highlight java %} import org.apache.spark.api.java.JavaRDD; import org.apache.spark.ml.feature.StandardScaler; +import org.apache.spark.ml.feature.StandardScalerModel; import org.apache.spark.mllib.regression.LabeledPoint; import org.apache.spark.mllib.util.MLUtils; import org.apache.spark.sql.DataFrame; @@ -1173,6 +1174,76 @@ scaledData = scalerModel.transform(dataFrame) +## MinMaxScaler + +`MinMaxScaler` transforms a dataset of `Vector` rows, rescaling each feature to a specific range (often [0, 1]). It takes parameters: + +* `min`: 0.0 by default. Lower bound after transformation, shared by all features. +* `max`: 1.0 by default. Upper bound after transformation, shared by all features. + +`MinMaxScaler` computes summary statistics on a data set and produces a `MinMaxScalerModel`. The model can then transform each feature individually such that it is in the given range. + +The rescaled value for a feature E is calculated as, +`\begin{equation} + Rescaled(e_i) = \frac{e_i - E_{min}}{E_{max} - E_{min}} * (max - min) + min +\end{equation}` +For the case `E_{max} == E_{min}`, `Rescaled(e_i) = 0.5 * (max + min)` + +Note that since zero values will probably be transformed to non-zero values, output of the transformer will be DenseVector even for sparse input. + +The following example demonstrates how to load a dataset in libsvm format and then rescale each feature to [0, 1]. + + + +More details can be found in the API docs for +[MinMaxScaler](api/scala/index.html#org.apache.spark.ml.feature.MinMaxScaler) and +[MinMaxScalerModel](api/scala/index.html#org.apache.spark.ml.feature.MinMaxScalerModel). +{% highlight scala %} +import org.apache.spark.ml.feature.MinMaxScaler +import org.apache.spark.mllib.util.MLUtils + +val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt") +val dataFrame = sqlContext.createDataFrame(data) +val scaler = new MinMaxScaler() + .setInputCol("features") + .setOutputCol("scaledFeatures") + +// Compute summary statistics and generate MinMaxScalerModel +val scalerModel = scaler.fit(dataFrame) + +// rescale each feature to range [min, max]. 
+val scaledData = scalerModel.transform(dataFrame) +{% endhighlight %} + + + +More details can be found in the API docs for +[MinMaxScaler](api/java/org/apache/spark/ml/feature/MinMaxScaler.html) and +[MinMaxScalerModel](api/java/org/apache/spark/ml/feature/MinMaxScalerModel.html). +{% highlight java %} +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.ml.feature.MinMaxScaler; +import org.apache.spark.ml.feature.MinMaxScalerModel; +import org.apache.spark.mllib.regression.LabeledPoint; +import org.apache.spark.mllib.util.MLUtils; +import org.apache.spark.sql.DataFrame; + +JavaRDD data = + MLUtils.loadLibSVMFile(jsc.sc(), "data/mllib/sample_libsvm_data.txt").toJavaRDD(); +DataFrame dataFrame = jsql.createDataFrame(data, LabeledPoint.class); +MinMaxScaler scaler = new MinMaxScaler() + .setInputCol("features") + .setOutputCol("scaledFeatures"); + +// Compute summary statistics and generate MinMaxScalerModel +MinMaxScalerModel scalerModel = scaler.fit(dataFrame); + +// rescale each feature to range [min, max]. +DataFrame scaledData = scalerModel.transform(dataFrame); +{% endhighlight %} + + + ## Bucketizer `Bucketizer` transforms a column of continuous features to a column of feature buckets, where the buckets are specified by users. It takes a parameter: -
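As a quick numeric check of the rescaling formula in the new doc section, take a feature column with `E_{min} = 0` and `E_{max} = 10` under the default range `[min, max] = [0, 1]` (values chosen for illustration):

`\begin{equation} Rescaled(2.5) = \frac{2.5 - 0}{10 - 0} * (1 - 0) + 0 = 0.25 \end{equation}`

And in the degenerate case `E_{max} == E_{min}`, every value maps to `0.5 * (max + min) = 0.5`, as the doc notes.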
spark git commit: [SPARK-10198] [SQL] Turn off partition verification by default
Repository: spark Updated Branches: refs/heads/branch-1.5 bdcc8e608 -> 0402f1297 [SPARK-10198] [SQL] Turn off partition verification by default Author: Michael Armbrust Closes #8404 from marmbrus/turnOffPartitionVerification. (cherry picked from commit 5c08c86bfa43462fb2ca5f7c5980ddfb44dd57f8) Signed-off-by: Michael Armbrust Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0402f129 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0402f129 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0402f129 Branch: refs/heads/branch-1.5 Commit: 0402f1297c697bfbe8b5c7bfc170fcdc6b2c9de5 Parents: bdcc8e6 Author: Michael Armbrust Authored: Tue Aug 25 10:22:54 2015 -0700 Committer: Michael Armbrust Committed: Tue Aug 25 10:23:08 2015 -0700 -- .../scala/org/apache/spark/sql/SQLConf.scala| 2 +- .../spark/sql/hive/QueryPartitionSuite.scala| 64 +++- 2 files changed, 35 insertions(+), 31 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/0402f129/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala index e9de14f..2974055 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala @@ -312,7 +312,7 @@ private[spark] object SQLConf { doc = "When true, enable filter pushdown for ORC files.") val HIVE_VERIFY_PARTITION_PATH = booleanConf("spark.sql.hive.verifyPartitionPath", -defaultValue = Some(true), +defaultValue = Some(false), doc = "") val HIVE_METASTORE_PARTITION_PRUNING = booleanConf("spark.sql.hive.metastorePartitionPruning", http://git-wip-us.apache.org/repos/asf/spark/blob/0402f129/sql/hive/src/test/scala/org/apache/spark/sql/hive/QueryPartitionSuite.scala -- diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/QueryPartitionSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/QueryPartitionSuite.scala index 017bc2a..1cc8a93 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/QueryPartitionSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/QueryPartitionSuite.scala @@ -18,50 +18,54 @@ package org.apache.spark.sql.hive import com.google.common.io.Files +import org.apache.spark.sql.test.SQLTestUtils import org.apache.spark.sql.{QueryTest, _} import org.apache.spark.util.Utils -class QueryPartitionSuite extends QueryTest { +class QueryPartitionSuite extends QueryTest with SQLTestUtils { private lazy val ctx = org.apache.spark.sql.hive.test.TestHive import ctx.implicits._ - import ctx.sql + + protected def _sqlContext = ctx test("SPARK-5068: query data when path doesn't exist"){ -val testData = ctx.sparkContext.parallelize( - (1 to 10).map(i => TestData(i, i.toString))).toDF() -testData.registerTempTable("testData") +withSQLConf((SQLConf.HIVE_VERIFY_PARTITION_PATH.key, "true")) { + val testData = ctx.sparkContext.parallelize( +(1 to 10).map(i => TestData(i, i.toString))).toDF() + testData.registerTempTable("testData") -val tmpDir = Files.createTempDir() -// create the table for test -sql(s"CREATE TABLE table_with_partition(key int,value string) " + - s"PARTITIONED by (ds string) location '${tmpDir.toURI.toString}' ") -sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='1') " + - "SELECT key,value FROM testData") -sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='2') " + - "SELECT key,value FROM testData") -sql("INSERT 
OVERWRITE TABLE table_with_partition partition (ds='3') " + - "SELECT key,value FROM testData") -sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='4') " + - "SELECT key,value FROM testData") + val tmpDir = Files.createTempDir() + // create the table for test + sql(s"CREATE TABLE table_with_partition(key int,value string) " + +s"PARTITIONED by (ds string) location '${tmpDir.toURI.toString}' ") + sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='1') " + +"SELECT key,value FROM testData") + sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='2') " + +"SELECT key,value FROM testData") + sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='3') " + +"SELECT key,value FROM testData") + sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='4') " + +"SELECT key,value FROM testData") -// test for the exist path -checkAnswer(sq
spark git commit: [SPARK-10198] [SQL] Turn off partition verification by default
Repository: spark Updated Branches: refs/heads/master 69c9c1771 -> 5c08c86bf [SPARK-10198] [SQL] Turn off partition verification by default Author: Michael Armbrust Closes #8404 from marmbrus/turnOffPartitionVerification. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5c08c86b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5c08c86b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5c08c86b Branch: refs/heads/master Commit: 5c08c86bfa43462fb2ca5f7c5980ddfb44dd57f8 Parents: 69c9c17 Author: Michael Armbrust Authored: Tue Aug 25 10:22:54 2015 -0700 Committer: Michael Armbrust Committed: Tue Aug 25 10:22:54 2015 -0700 -- .../scala/org/apache/spark/sql/SQLConf.scala| 2 +- .../spark/sql/hive/QueryPartitionSuite.scala| 64 +++- 2 files changed, 35 insertions(+), 31 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/5c08c86b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala index e6f7619..9de75f4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala @@ -312,7 +312,7 @@ private[spark] object SQLConf { doc = "When true, enable filter pushdown for ORC files.") val HIVE_VERIFY_PARTITION_PATH = booleanConf("spark.sql.hive.verifyPartitionPath", -defaultValue = Some(true), +defaultValue = Some(false), doc = "") val HIVE_METASTORE_PARTITION_PRUNING = booleanConf("spark.sql.hive.metastorePartitionPruning", http://git-wip-us.apache.org/repos/asf/spark/blob/5c08c86b/sql/hive/src/test/scala/org/apache/spark/sql/hive/QueryPartitionSuite.scala -- diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/QueryPartitionSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/QueryPartitionSuite.scala index 017bc2a..1cc8a93 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/QueryPartitionSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/QueryPartitionSuite.scala @@ -18,50 +18,54 @@ package org.apache.spark.sql.hive import com.google.common.io.Files +import org.apache.spark.sql.test.SQLTestUtils import org.apache.spark.sql.{QueryTest, _} import org.apache.spark.util.Utils -class QueryPartitionSuite extends QueryTest { +class QueryPartitionSuite extends QueryTest with SQLTestUtils { private lazy val ctx = org.apache.spark.sql.hive.test.TestHive import ctx.implicits._ - import ctx.sql + + protected def _sqlContext = ctx test("SPARK-5068: query data when path doesn't exist"){ -val testData = ctx.sparkContext.parallelize( - (1 to 10).map(i => TestData(i, i.toString))).toDF() -testData.registerTempTable("testData") +withSQLConf((SQLConf.HIVE_VERIFY_PARTITION_PATH.key, "true")) { + val testData = ctx.sparkContext.parallelize( +(1 to 10).map(i => TestData(i, i.toString))).toDF() + testData.registerTempTable("testData") -val tmpDir = Files.createTempDir() -// create the table for test -sql(s"CREATE TABLE table_with_partition(key int,value string) " + - s"PARTITIONED by (ds string) location '${tmpDir.toURI.toString}' ") -sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='1') " + - "SELECT key,value FROM testData") -sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='2') " + - "SELECT key,value FROM testData") -sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='3') " + - "SELECT key,value FROM testData") -sql("INSERT 
OVERWRITE TABLE table_with_partition partition (ds='4') " + - "SELECT key,value FROM testData") + val tmpDir = Files.createTempDir() + // create the table for test + sql(s"CREATE TABLE table_with_partition(key int,value string) " + +s"PARTITIONED by (ds string) location '${tmpDir.toURI.toString}' ") + sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='1') " + +"SELECT key,value FROM testData") + sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='2') " + +"SELECT key,value FROM testData") + sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='3') " + +"SELECT key,value FROM testData") + sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='4') " + +"SELECT key,value FROM testData") -// test for the exist path -checkAnswer(sql("select key,value from table_with_partition"), - testData.toDF.collect ++ testData.toDF.collect -
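The flag remains available for jobs that depend on the old behavior; the updated SPARK-5068 test re-enables it explicitly via `withSQLConf`. A minimal sketch of doing the same per session, assuming a Spark 1.5 `HiveContext` named `sqlContext` (a hypothetical session, not part of the patch):

```
// Re-enable the partition-path verification this commit turns off by default.
sqlContext.setConf("spark.sql.hive.verifyPartitionPath", "true")

// With the flag on, partitions whose directories were deleted are skipped
// rather than surfacing as read errors (the behavior the SPARK-5068 test checks).
val rows = sqlContext.sql("SELECT key, value FROM table_with_partition").collect()
```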
spark git commit: [SPARK-8400] [ML] Added check in ml.ALS for positive block size parameter setting
Repository: spark Updated Branches: refs/heads/branch-1.3 3d2eaf0a7 -> e8b0564e7 [SPARK-8400] [ML] Added check in ml.ALS for positive block size parameter setting Added check for positive block size with a note that -1 for auto-configuring is not supported Author: Bryan Cutler Closes #8363 from BryanCutler/ml.ALS-neg-blocksize-8400-1.3. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e8b0564e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e8b0564e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e8b0564e Branch: refs/heads/branch-1.3 Commit: e8b0564e770b896a6156c7e1eed5bc5200e223f4 Parents: 3d2eaf0 Author: Bryan Cutler Authored: Tue Aug 25 12:36:49 2015 +0100 Committer: Sean Owen Committed: Tue Aug 25 12:36:49 2015 +0100 -- .../scala/org/apache/spark/ml/recommendation/ALS.scala| 10 -- 1 file changed, 8 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e8b0564e/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala index fc0f529..6ce5285 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/recommendation/ALS.scala @@ -234,10 +234,16 @@ class ALS extends Estimator[ALSModel] with ALSParams { def setRank(value: Int): this.type = set(rank, value) /** @group setParam */ - def setNumUserBlocks(value: Int): this.type = set(numUserBlocks, value) + def setNumUserBlocks(value: Int): this.type = { +require(value > 0, "Number of blocks must be > 0, auto-configuring with -1 is not supported.") +set(numUserBlocks, value) + } /** @group setParam */ - def setNumItemBlocks(value: Int): this.type = set(numItemBlocks, value) + def setNumItemBlocks(value: Int): this.type = { +require(value > 0, "Number of blocks must be > 0, auto-configuring with -1 is not supported.") +set(numItemBlocks, value) + } /** @group setParam */ def setImplicitPrefs(value: Boolean): this.type = set(implicitPrefs, value) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
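In effect, `require` makes a non-positive block count fail fast with an `IllegalArgumentException` instead of propagating into training. A short sketch of the guarded setters:

```
import org.apache.spark.ml.recommendation.ALS

val als = new ALS().setRank(10)
als.setNumUserBlocks(4)      // accepted: block counts must be > 0
als.setNumItemBlocks(4)
// als.setNumUserBlocks(-1)  // now throws IllegalArgumentException:
//                           // auto-configuring with -1 is not supported in ml.ALS
```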
[4/5] spark git commit: [SPARK-9613] [CORE] Ban use of JavaConversions and migrate all existing uses to JavaConverters
http://git-wip-us.apache.org/repos/asf/spark/blob/69c9c177/core/src/main/scala/org/apache/spark/network/netty/NettyBlockRpcServer.scala -- diff --git a/core/src/main/scala/org/apache/spark/network/netty/NettyBlockRpcServer.scala b/core/src/main/scala/org/apache/spark/network/netty/NettyBlockRpcServer.scala index b089da8..7c170a7 100644 --- a/core/src/main/scala/org/apache/spark/network/netty/NettyBlockRpcServer.scala +++ b/core/src/main/scala/org/apache/spark/network/netty/NettyBlockRpcServer.scala @@ -19,7 +19,7 @@ package org.apache.spark.network.netty import java.nio.ByteBuffer -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import org.apache.spark.Logging import org.apache.spark.network.BlockDataManager @@ -55,7 +55,7 @@ class NettyBlockRpcServer( case openBlocks: OpenBlocks => val blocks: Seq[ManagedBuffer] = openBlocks.blockIds.map(BlockId.apply).map(blockManager.getBlockData) -val streamId = streamManager.registerStream(blocks.iterator) +val streamId = streamManager.registerStream(blocks.iterator.asJava) logTrace(s"Registered streamId $streamId with ${blocks.size} buffers") responseContext.onSuccess(new StreamHandle(streamId, blocks.size).toByteArray) http://git-wip-us.apache.org/repos/asf/spark/blob/69c9c177/core/src/main/scala/org/apache/spark/network/netty/NettyBlockTransferService.scala -- diff --git a/core/src/main/scala/org/apache/spark/network/netty/NettyBlockTransferService.scala b/core/src/main/scala/org/apache/spark/network/netty/NettyBlockTransferService.scala index d650d5f..ff8aae9 100644 --- a/core/src/main/scala/org/apache/spark/network/netty/NettyBlockTransferService.scala +++ b/core/src/main/scala/org/apache/spark/network/netty/NettyBlockTransferService.scala @@ -17,7 +17,7 @@ package org.apache.spark.network.netty -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import scala.concurrent.{Future, Promise} import org.apache.spark.{SecurityManager, SparkConf} @@ -58,7 +58,7 @@ class NettyBlockTransferService(conf: SparkConf, securityManager: SecurityManage securityManager.isSaslEncryptionEnabled())) } transportContext = new TransportContext(transportConf, rpcHandler) -clientFactory = transportContext.createClientFactory(clientBootstrap.toList) +clientFactory = transportContext.createClientFactory(clientBootstrap.toSeq.asJava) server = createServer(serverBootstrap.toList) appId = conf.getAppId logInfo("Server created on " + server.getPort) @@ -67,7 +67,7 @@ class NettyBlockTransferService(conf: SparkConf, securityManager: SecurityManage /** Creates and binds the TransportServer, possibly trying multiple ports. 
*/ private def createServer(bootstraps: List[TransportServerBootstrap]): TransportServer = { def startService(port: Int): (TransportServer, Int) = { - val server = transportContext.createServer(port, bootstraps) + val server = transportContext.createServer(port, bootstraps.asJava) (server, server.getPort) } http://git-wip-us.apache.org/repos/asf/spark/blob/69c9c177/core/src/main/scala/org/apache/spark/network/nio/Connection.scala -- diff --git a/core/src/main/scala/org/apache/spark/network/nio/Connection.scala b/core/src/main/scala/org/apache/spark/network/nio/Connection.scala index 1499da0..8d9ebad 100644 --- a/core/src/main/scala/org/apache/spark/network/nio/Connection.scala +++ b/core/src/main/scala/org/apache/spark/network/nio/Connection.scala @@ -23,7 +23,7 @@ import java.nio.channels._ import java.util.concurrent.ConcurrentLinkedQueue import java.util.LinkedList -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import scala.collection.mutable.{ArrayBuffer, HashMap} import scala.util.control.NonFatal @@ -145,7 +145,7 @@ abstract class Connection(val channel: SocketChannel, val selector: Selector, } def callOnExceptionCallbacks(e: Throwable) { -onExceptionCallbacks foreach { +onExceptionCallbacks.asScala.foreach { callback => try { callback(this, e) http://git-wip-us.apache.org/repos/asf/spark/blob/69c9c177/core/src/main/scala/org/apache/spark/partial/GroupedCountEvaluator.scala -- diff --git a/core/src/main/scala/org/apache/spark/partial/GroupedCountEvaluator.scala b/core/src/main/scala/org/apache/spark/partial/GroupedCountEvaluator.scala index 91b07ce..5afce75 100644 --- a/core/src/main/scala/org/apache/spark/partial/GroupedCountEvaluator.scala +++ b/core/src/main/scala/org/apache/spark/partial/GroupedCountEvaluator.scala @@ -19,7 +19,7 @@ package org.apache.spark.partial i
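The pattern across these files is uniform: conversions that `JavaConversions` injected implicitly are now written out with `asScala`/`asJava` at the Java boundary. A self-contained sketch of the two directions used above (the values are illustrative, not Spark's):

```
import scala.collection.JavaConverters._

val buffers: Seq[String] = Seq("block-1", "block-2")

// Handing a Scala iterator to a Java API, as in registerStream above:
val javaIter: java.util.Iterator[String] = buffers.iterator.asJava

// Handing a Scala sequence to a Java-typed parameter, as in createClientFactory:
val javaList: java.util.List[String] = buffers.asJava
```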
[3/5] spark git commit: [SPARK-9613] [CORE] Ban use of JavaConversions and migrate all existing uses to JavaConverters
http://git-wip-us.apache.org/repos/asf/spark/blob/69c9c177/external/flume/src/main/scala/org/apache/spark/streaming/flume/PollingFlumeTestUtils.scala -- diff --git a/external/flume/src/main/scala/org/apache/spark/streaming/flume/PollingFlumeTestUtils.scala b/external/flume/src/main/scala/org/apache/spark/streaming/flume/PollingFlumeTestUtils.scala index 91d63d4..a2ab320 100644 --- a/external/flume/src/main/scala/org/apache/spark/streaming/flume/PollingFlumeTestUtils.scala +++ b/external/flume/src/main/scala/org/apache/spark/streaming/flume/PollingFlumeTestUtils.scala @@ -18,9 +18,8 @@ package org.apache.spark.streaming.flume import java.util.concurrent._ -import java.util.{List => JList, Map => JMap} +import java.util.{Map => JMap, Collections} -import scala.collection.JavaConversions._ import scala.collection.mutable.ArrayBuffer import com.google.common.base.Charsets.UTF_8 @@ -77,7 +76,7 @@ private[flume] class PollingFlumeTestUtils { /** * Start 2 sinks and return the ports */ - def startMultipleSinks(): JList[Int] = { + def startMultipleSinks(): Seq[Int] = { channels.clear() sinks.clear() @@ -138,8 +137,7 @@ private[flume] class PollingFlumeTestUtils { /** * A Python-friendly method to assert the output */ - def assertOutput( - outputHeaders: JList[JMap[String, String]], outputBodies: JList[String]): Unit = { + def assertOutput(outputHeaders: Seq[JMap[String, String]], outputBodies: Seq[String]): Unit = { require(outputHeaders.size == outputBodies.size) val eventSize = outputHeaders.size if (eventSize != totalEventsPerChannel * channels.size) { @@ -149,12 +147,12 @@ private[flume] class PollingFlumeTestUtils { var counter = 0 for (k <- 0 until channels.size; i <- 0 until totalEventsPerChannel) { val eventBodyToVerify = s"${channels(k).getName}-$i" - val eventHeaderToVerify: JMap[String, String] = Map[String, String](s"test-$i" -> "header") + val eventHeaderToVerify: JMap[String, String] = Collections.singletonMap(s"test-$i", "header") var found = false var j = 0 while (j < eventSize && !found) { -if (eventBodyToVerify == outputBodies.get(j) && - eventHeaderToVerify == outputHeaders.get(j)) { +if (eventBodyToVerify == outputBodies(j) && + eventHeaderToVerify == outputHeaders(j)) { found = true counter += 1 } @@ -195,7 +193,7 @@ private[flume] class PollingFlumeTestUtils { tx.begin() for (j <- 0 until eventsPerBatch) { channel.put(EventBuilder.withBody(s"${channel.getName}-$t".getBytes(UTF_8), -Map[String, String](s"test-$t" -> "header"))) +Collections.singletonMap(s"test-$t", "header"))) t += 1 } tx.commit() http://git-wip-us.apache.org/repos/asf/spark/blob/69c9c177/external/flume/src/test/scala/org/apache/spark/streaming/flume/FlumePollingStreamSuite.scala -- diff --git a/external/flume/src/test/scala/org/apache/spark/streaming/flume/FlumePollingStreamSuite.scala b/external/flume/src/test/scala/org/apache/spark/streaming/flume/FlumePollingStreamSuite.scala index d5f9a0a..ff2fb8e 100644 --- a/external/flume/src/test/scala/org/apache/spark/streaming/flume/FlumePollingStreamSuite.scala +++ b/external/flume/src/test/scala/org/apache/spark/streaming/flume/FlumePollingStreamSuite.scala @@ -19,7 +19,7 @@ package org.apache.spark.streaming.flume import java.net.InetSocketAddress -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import scala.collection.mutable.{SynchronizedBuffer, ArrayBuffer} import scala.concurrent.duration._ import scala.language.postfixOps @@ -116,9 +116,9 @@ class FlumePollingStreamSuite extends SparkFunSuite with BeforeAndAfter with Log 
// The eventually is required to ensure that all data in the batch has been processed. eventually(timeout(10 seconds), interval(100 milliseconds)) { val flattenOutputBuffer = outputBuffer.flatten -val headers = flattenOutputBuffer.map(_.event.getHeaders.map { - case kv => (kv._1.toString, kv._2.toString) -}).map(mapAsJavaMap) +val headers = flattenOutputBuffer.map(_.event.getHeaders.asScala.map { + case (key, value) => (key.toString, value.toString) +}).map(_.asJava) val bodies = flattenOutputBuffer.map(e => new String(e.event.getBody.array(), UTF_8)) utils.assertOutput(headers, bodies) } http://git-wip-us.apache.org/repos/asf/spark/blob/69c9c177/external/flume/src/test/scala/org/apache/spark/streaming/flume/FlumeStreamSuite.scala -- diff --git a/external/flume/src/test/scala/org/apache/spark/streaming/flume/FlumeStreamSuite.scala b/exter
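Two idioms recur in the Flume changes: building a Java map directly instead of implicitly converting a Scala `Map`, and taking an explicit Scala view of a Java map before transforming it. A standalone sketch (not the actual test utils):

```
import java.util.Collections
import scala.collection.JavaConverters._

// A single-entry java.util.Map, replacing the old implicit Map(...) conversion:
val header: java.util.Map[String, String] = Collections.singletonMap("test-0", "header")

// An explicit Scala view over a Java map, as the assertion code now takes:
val normalized = header.asScala.map { case (k, v) => (k, v.toUpperCase) }
```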
[5/5] spark git commit: [SPARK-9613] [CORE] Ban use of JavaConversions and migrate all existing uses to JavaConverters
[SPARK-9613] [CORE] Ban use of JavaConversions and migrate all existing uses to JavaConverters Replace `JavaConversions` implicits with `JavaConverters` Most occurrences I've seen so far are necessary conversions; a few have been avoidable. None are in critical code as far as I see, yet. Author: Sean Owen Closes #8033 from srowen/SPARK-9613. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/69c9c177 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/69c9c177 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/69c9c177 Branch: refs/heads/master Commit: 69c9c177160e32a2fbc9b36ecc52156077fca6fc Parents: 7f1e507 Author: Sean Owen Authored: Tue Aug 25 12:33:13 2015 +0100 Committer: Sean Owen Committed: Tue Aug 25 12:33:13 2015 +0100 -- .../shuffle/unsafe/UnsafeShuffleWriter.java | 4 +- .../org/apache/spark/MapOutputTracker.scala | 4 +- .../scala/org/apache/spark/SSLOptions.scala | 11 +- .../scala/org/apache/spark/SparkContext.scala | 4 +- .../main/scala/org/apache/spark/TestUtils.scala | 9 +- .../apache/spark/api/java/JavaHadoopRDD.scala | 4 +- .../spark/api/java/JavaNewHadoopRDD.scala | 4 +- .../org/apache/spark/api/java/JavaPairRDD.scala | 19 ++-- .../org/apache/spark/api/java/JavaRDDLike.scala | 75 + .../spark/api/java/JavaSparkContext.scala | 20 ++-- .../spark/api/python/PythonHadoopUtil.scala | 28 ++--- .../org/apache/spark/api/python/PythonRDD.scala | 26 ++--- .../apache/spark/api/python/PythonUtils.scala | 15 ++- .../spark/api/python/PythonWorkerFactory.scala | 11 +- .../org/apache/spark/api/python/SerDeUtil.scala | 3 +- .../WriteInputFormatTestDataGenerator.scala | 8 +- .../scala/org/apache/spark/api/r/RRDD.scala | 13 ++- .../scala/org/apache/spark/api/r/RUtils.scala | 5 +- .../scala/org/apache/spark/api/r/SerDe.scala| 4 +- .../spark/broadcast/TorrentBroadcast.scala | 4 +- .../spark/deploy/ExternalShuffleService.scala | 8 +- .../org/apache/spark/deploy/PythonRunner.scala | 4 +- .../org/apache/spark/deploy/RPackageUtils.scala | 4 +- .../scala/org/apache/spark/deploy/RRunner.scala | 4 +- .../apache/spark/deploy/SparkCuratorUtil.scala | 4 +- .../apache/spark/deploy/SparkHadoopUtil.scala | 19 ++-- .../spark/deploy/SparkSubmitArguments.scala | 6 +- .../master/ZooKeeperPersistenceEngine.scala | 6 +- .../spark/deploy/worker/CommandUtils.scala | 5 +- .../spark/deploy/worker/DriverRunner.scala | 8 +- .../spark/deploy/worker/ExecutorRunner.scala| 7 +- .../org/apache/spark/deploy/worker/Worker.scala | 1 - .../org/apache/spark/executor/Executor.scala| 6 +- .../apache/spark/executor/ExecutorSource.scala | 4 +- .../spark/executor/MesosExecutorBackend.scala | 6 +- .../apache/spark/input/PortableDataStream.scala | 11 +- .../spark/input/WholeTextFileInputFormat.scala | 8 +- .../spark/launcher/WorkerCommandBuilder.scala | 4 +- .../apache/spark/metrics/MetricsConfig.scala| 22 ++-- .../network/netty/NettyBlockRpcServer.scala | 4 +- .../netty/NettyBlockTransferService.scala | 6 +- .../apache/spark/network/nio/Connection.scala | 4 +- .../spark/partial/GroupedCountEvaluator.scala | 10 +- .../spark/partial/GroupedMeanEvaluator.scala| 10 +- .../spark/partial/GroupedSumEvaluator.scala | 10 +- .../org/apache/spark/rdd/PairRDDFunctions.scala | 6 +- .../scala/org/apache/spark/rdd/PipedRDD.scala | 6 +- .../org/apache/spark/rdd/SubtractedRDD.scala| 4 +- .../spark/scheduler/InputFormatInfo.scala | 4 +- .../scala/org/apache/spark/scheduler/Pool.scala | 10 +- .../mesos/CoarseMesosSchedulerBackend.scala | 20 ++-- 
.../mesos/MesosClusterPersistenceEngine.scala | 4 +- .../cluster/mesos/MesosClusterScheduler.scala | 14 +-- .../cluster/mesos/MesosSchedulerBackend.scala | 22 ++-- .../cluster/mesos/MesosSchedulerUtils.scala | 25 ++--- .../spark/serializer/KryoSerializer.scala | 10 +- .../shuffle/FileShuffleBlockResolver.scala | 8 +- .../storage/BlockManagerMasterEndpoint.scala| 8 +- .../scala/org/apache/spark/util/AkkaUtils.scala | 4 +- .../org/apache/spark/util/ListenerBus.scala | 7 +- .../spark/util/MutableURLClassLoader.scala | 2 - .../apache/spark/util/TimeStampedHashMap.scala | 10 +- .../apache/spark/util/TimeStampedHashSet.scala | 4 +- .../scala/org/apache/spark/util/Utils.scala | 20 ++-- .../apache/spark/util/collection/Utils.scala| 4 +- .../java/org/apache/spark/JavaAPISuite.java | 6 +- .../scala/org/apache/spark/SparkConfSuite.scala | 7 +- .../spark/deploy/LogUrlsStandaloneSuite.scala | 1 - .../spark/deploy/RPackageUtilsSuite.scala | 8 +- .../deploy/worker/ExecutorRunnerTest.scala | 5 +- .../
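The difference the ban enforces, in miniature: under `JavaConversions` a Java collection silently behaved like a Scala one, while `JavaConverters` makes every boundary crossing explicit and greppable. A sketch:

```
import scala.collection.JavaConverters._

// Before (JavaConversions, now banned):
//   import scala.collection.JavaConversions._
//   javaList.foreach(println)   // implicit wrapping, invisible at the call site

// After (JavaConverters): the conversion is visible where it happens.
def totalSize(requests: java.util.List[java.util.List[String]]): Int =
  requests.asScala.map(_.size).sum
```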
[1/5] spark git commit: [SPARK-9613] [CORE] Ban use of JavaConversions and migrate all existing uses to JavaConverters
Repository: spark Updated Branches: refs/heads/master 7f1e507bf -> 69c9c1771 http://git-wip-us.apache.org/repos/asf/spark/blob/69c9c177/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala -- diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala index ccf753e..5f897cb 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala @@ -21,9 +21,7 @@ import java.util.Collections import java.util.concurrent._ import java.util.regex.Pattern -import org.apache.spark.util.Utils - -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import scala.collection.mutable.{ArrayBuffer, HashMap, HashSet} import com.google.common.util.concurrent.ThreadFactoryBuilder @@ -39,8 +37,8 @@ import org.apache.log4j.{Level, Logger} import org.apache.spark.{Logging, SecurityManager, SparkConf} import org.apache.spark.deploy.yarn.YarnSparkHadoopUtil._ import org.apache.spark.rpc.RpcEndpointRef -import org.apache.spark.scheduler.cluster.CoarseGrainedSchedulerBackend import org.apache.spark.scheduler.cluster.CoarseGrainedClusterMessages._ +import org.apache.spark.util.Utils /** * YarnAllocator is charged with requesting containers from the YARN ResourceManager and deciding @@ -164,7 +162,7 @@ private[yarn] class YarnAllocator( * Number of container requests at the given location that have not yet been fulfilled. */ private def getNumPendingAtLocation(location: String): Int = -amClient.getMatchingRequests(RM_REQUEST_PRIORITY, location, resource).map(_.size).sum +amClient.getMatchingRequests(RM_REQUEST_PRIORITY, location, resource).asScala.map(_.size).sum /** * Request as many executors from the ResourceManager as needed to reach the desired total. If @@ -231,14 +229,14 @@ private[yarn] class YarnAllocator( numExecutorsRunning, allocateResponse.getAvailableResources)) - handleAllocatedContainers(allocatedContainers) + handleAllocatedContainers(allocatedContainers.asScala) } val completedContainers = allocateResponse.getCompletedContainersStatuses() if (completedContainers.size > 0) { logDebug("Completed %d containers".format(completedContainers.size)) - processCompletedContainers(completedContainers) + processCompletedContainers(completedContainers.asScala) logDebug("Finished processing %d completed containers. Current running executor count: %d." 
.format(completedContainers.size, numExecutorsRunning)) @@ -271,7 +269,7 @@ private[yarn] class YarnAllocator( val request = createContainerRequest(resource, locality.nodes, locality.racks) amClient.addContainerRequest(request) val nodes = request.getNodes -val hostStr = if (nodes == null || nodes.isEmpty) "Any" else nodes.last +val hostStr = if (nodes == null || nodes.isEmpty) "Any" else nodes.asScala.last logInfo(s"Container request (host: $hostStr, capability: $resource)") } } else if (missing < 0) { @@ -280,7 +278,8 @@ private[yarn] class YarnAllocator( val matchingRequests = amClient.getMatchingRequests(RM_REQUEST_PRIORITY, ANY_HOST, resource) if (!matchingRequests.isEmpty) { - matchingRequests.head.take(numToCancel).foreach(amClient.removeContainerRequest) +matchingRequests.iterator().next().asScala + .take(numToCancel).foreach(amClient.removeContainerRequest) } else { logWarning("Expected to find pending requests, but found none.") } @@ -459,7 +458,7 @@ private[yarn] class YarnAllocator( } } - if (allocatedContainerToHostMap.containsKey(containerId)) { + if (allocatedContainerToHostMap.contains(containerId)) { val host = allocatedContainerToHostMap.get(containerId).get val containerSet = allocatedHostToContainersMap.get(host).get http://git-wip-us.apache.org/repos/asf/spark/blob/69c9c177/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnRMClient.scala -- diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnRMClient.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnRMClient.scala index 4999f9c..df042bf 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnRMClient.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnRMClient.scala @@ -19,17 +19,15 @@ package org.apache.spark.deploy.yarn import java.util.{List => JList} -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import scala.collection.{Map, Set} import scala.util.Try import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.yarn.api.App
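Removing the implicits also uncovers calls that only compiled because of them; `containsKey` is a `java.util.Map` method that the old implicit conversion made available on a Scala map, hence the switch to `contains` above. A sketch of both fixes (illustrative values):

```
import scala.collection.JavaConverters._
import scala.collection.mutable

val allocated = mutable.HashMap("container-1" -> "host-a")
allocated.contains("container-1")        // Scala API, as in the fixed code
// allocated.containsKey("container-1")  // compiled only via JavaConversions

// Java lists returned by the YARN client are now navigated explicitly:
val nodes = java.util.Arrays.asList("host-a", "host-b")
val hostStr = nodes.asScala.last         // "host-b"
```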
[2/5] spark git commit: [SPARK-9613] [CORE] Ban use of JavaConversions and migrate all existing uses to JavaConverters
http://git-wip-us.apache.org/repos/asf/spark/blob/69c9c177/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala -- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala index 98d21aa..b8da084 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.hive -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import scala.collection.mutable import com.google.common.base.Objects @@ -483,7 +483,7 @@ private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: Hive // are empty. val partitions = metastoreRelation.getHiveQlPartitions().map { p => val location = p.getLocation -val values = InternalRow.fromSeq(p.getValues.zip(partitionColumnDataTypes).map { +val values = InternalRow.fromSeq(p.getValues.asScala.zip(partitionColumnDataTypes).map { case (rawValue, dataType) => Cast(Literal(rawValue), dataType).eval(null) }) ParquetPartition(values, location) @@ -798,9 +798,9 @@ private[hive] case class MetastoreRelation val sd = new org.apache.hadoop.hive.metastore.api.StorageDescriptor() tTable.setSd(sd) -sd.setCols(table.schema.map(c => new FieldSchema(c.name, c.hiveType, c.comment))) +sd.setCols(table.schema.map(c => new FieldSchema(c.name, c.hiveType, c.comment)).asJava) tTable.setPartitionKeys( - table.partitionColumns.map(c => new FieldSchema(c.name, c.hiveType, c.comment))) + table.partitionColumns.map(c => new FieldSchema(c.name, c.hiveType, c.comment)).asJava) table.location.foreach(sd.setLocation) table.inputFormat.foreach(sd.setInputFormat) @@ -852,11 +852,11 @@ private[hive] case class MetastoreRelation val tPartition = new org.apache.hadoop.hive.metastore.api.Partition tPartition.setDbName(databaseName) tPartition.setTableName(tableName) - tPartition.setValues(p.values) + tPartition.setValues(p.values.asJava) val sd = new org.apache.hadoop.hive.metastore.api.StorageDescriptor() tPartition.setSd(sd) - sd.setCols(table.schema.map(c => new FieldSchema(c.name, c.hiveType, c.comment))) + sd.setCols(table.schema.map(c => new FieldSchema(c.name, c.hiveType, c.comment)).asJava) sd.setLocation(p.storage.location) sd.setInputFormat(p.storage.inputFormat) http://git-wip-us.apache.org/repos/asf/spark/blob/69c9c177/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala -- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala index ad33dee..d5cd7e9 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala @@ -20,6 +20,9 @@ package org.apache.spark.sql.hive import java.sql.Date import java.util.Locale +import scala.collection.JavaConverters._ +import scala.collection.mutable.ArrayBuffer + import org.apache.hadoop.hive.conf.HiveConf import org.apache.hadoop.hive.conf.HiveConf.ConfVars import org.apache.hadoop.hive.serde.serdeConstants @@ -48,10 +51,6 @@ import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.CalendarInterval import org.apache.spark.util.random.RandomSampler -/* Implicit conversions */ -import scala.collection.JavaConversions._ -import scala.collection.mutable.ArrayBuffer - /** * Used when we need to start parsing the AST before deciding that we are 
going to pass the command * back for Hive to execute natively. Will be replaced with a native command that contains the @@ -202,7 +201,7 @@ private[hive] object HiveQl extends Logging { * Returns a scala.Seq equivalent to [s] or Nil if [s] is null. */ private def nilIfEmpty[A](s: java.util.List[A]): Seq[A] = - Option(s).map(_.toSeq).getOrElse(Nil) + Option(s).map(_.asScala).getOrElse(Nil) /** * Returns this ASTNode with the text changed to `newText`. @@ -217,7 +216,7 @@ private[hive] object HiveQl extends Logging { */ def withChildren(newChildren: Seq[ASTNode]): ASTNode = { (1 to n.getChildCount).foreach(_ => n.deleteChild(0)) - n.addChildren(newChildren) + n.addChildren(newChildren.asJava) n } @@ -323,11 +322,11 @@ private[hive] object HiveQl extends Logging { assert(tree.asInstanceOf[ASTNode].getText == "TOK_CREATETABLE", "Only CREATE TABLE supported.") val tableOps = tree.getChildren val colList = - tableOps + tableOps.asScala .find(_.asInstance
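The Hive metastore API is Java-bean style, so every Scala `Seq` produced by `map` needs a trailing `asJava` before reaching a setter. A sketch against stand-in classes (the bean below is hypothetical, not Hive's):

```
import scala.collection.JavaConverters._

// Hypothetical stand-ins for the metastore classes used above:
class FieldSchema(val name: String, val hiveType: String)
class StorageDescriptor { def setCols(cols: java.util.List[FieldSchema]): Unit = () }

val schema = Seq(("key", "int"), ("value", "string"))
val sd = new StorageDescriptor
sd.setCols(schema.map { case (n, t) => new FieldSchema(n, t) }.asJava)
```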
spark git commit: Fixed a typo in DAGScheduler.
Repository: spark Updated Branches: refs/heads/master 5c1489015 -> 7f1e507bf Fixed a typo in DAGScheduler. Author: ehnalis Closes #8308 from ehnalis/master. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7f1e507b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7f1e507b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7f1e507b Branch: refs/heads/master Commit: 7f1e507bf7e82bff323c5dec3c1ee044687c4173 Parents: 5c14890 Author: ehnalis Authored: Tue Aug 25 12:30:06 2015 +0100 Committer: Sean Owen Committed: Tue Aug 25 12:30:06 2015 +0100 -- .../apache/spark/scheduler/DAGScheduler.scala | 27 +++- 1 file changed, 20 insertions(+), 7 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/7f1e507b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala -- diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala index 684db66..daf9b0f 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala @@ -152,17 +152,24 @@ class DAGScheduler( // may lead to more delay in scheduling if those locations are busy. private[scheduler] val REDUCER_PREF_LOCS_FRACTION = 0.2 - // Called by TaskScheduler to report task's starting. + /** + * Called by the TaskSetManager to report task's starting. + */ def taskStarted(task: Task[_], taskInfo: TaskInfo) { eventProcessLoop.post(BeginEvent(task, taskInfo)) } - // Called to report that a task has completed and results are being fetched remotely. + /** + * Called by the TaskSetManager to report that a task has completed + * and results are being fetched remotely. + */ def taskGettingResult(taskInfo: TaskInfo) { eventProcessLoop.post(GettingResultEvent(taskInfo)) } - // Called by TaskScheduler to report task completions or failures. + /** + * Called by the TaskSetManager to report task completions or failures. + */ def taskEnded( task: Task[_], reason: TaskEndReason, @@ -188,18 +195,24 @@ class DAGScheduler( BlockManagerHeartbeat(blockManagerId), new RpcTimeout(600 seconds, "BlockManagerHeartbeat")) } - // Called by TaskScheduler when an executor fails. + /** + * Called by TaskScheduler implementation when an executor fails. + */ def executorLost(execId: String): Unit = { eventProcessLoop.post(ExecutorLost(execId)) } - // Called by TaskScheduler when a host is added + /** + * Called by TaskScheduler implementation when a host is added. + */ def executorAdded(execId: String, host: String): Unit = { eventProcessLoop.post(ExecutorAdded(execId, host)) } - // Called by TaskScheduler to cancel an entire TaskSet due to either repeated failures or - // cancellation of the job itself. + /** + * Called by the TaskSetManager to cancel an entire TaskSet due to either repeated failures or + * cancellation of the job itself. + */ def taskSetFailed(taskSet: TaskSet, reason: String, exception: Option[Throwable]): Unit = { eventProcessLoop.post(TaskSetFailed(taskSet, reason, exception)) } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
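Beyond the wording fix, the substance of the change is converting `//` line comments on these scheduler callbacks into `/** ... */` doc comments, so the descriptions appear in the generated API docs. In miniature:

```
// Invisible to scaladoc:
// Called by the TaskSetManager to report task's starting.
def taskStartedOld(): Unit = ()

/**
 * Called by the TaskSetManager to report task's starting.
 * Picked up by scaladoc and rendered in the API documentation.
 */
def taskStartedNew(): Unit = ()
```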
spark git commit: Fixed a typo in DAGScheduler.
Repository: spark Updated Branches: refs/heads/branch-1.5 5d6840569 -> bdcc8e608 Fixed a typo in DAGScheduler. Author: ehnalis Closes #8308 from ehnalis/master. (cherry picked from commit 7f1e507bf7e82bff323c5dec3c1ee044687c4173) Signed-off-by: Sean Owen Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/bdcc8e60 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/bdcc8e60 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/bdcc8e60 Branch: refs/heads/branch-1.5 Commit: bdcc8e608d9a1160db988faa76808149c28a3b50 Parents: 5d68405 Author: ehnalis Authored: Tue Aug 25 12:30:06 2015 +0100 Committer: Sean Owen Committed: Tue Aug 25 12:30:18 2015 +0100 -- .../apache/spark/scheduler/DAGScheduler.scala | 27 +++- 1 file changed, 20 insertions(+), 7 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/bdcc8e60/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala -- diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala index 591b714..de69911 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala @@ -152,17 +152,24 @@ class DAGScheduler( // may lead to more delay in scheduling if those locations are busy. private[scheduler] val REDUCER_PREF_LOCS_FRACTION = 0.2 - // Called by TaskScheduler to report task's starting. + /** + * Called by the TaskSetManager to report task's starting. + */ def taskStarted(task: Task[_], taskInfo: TaskInfo) { eventProcessLoop.post(BeginEvent(task, taskInfo)) } - // Called to report that a task has completed and results are being fetched remotely. + /** + * Called by the TaskSetManager to report that a task has completed + * and results are being fetched remotely. + */ def taskGettingResult(taskInfo: TaskInfo) { eventProcessLoop.post(GettingResultEvent(taskInfo)) } - // Called by TaskScheduler to report task completions or failures. + /** + * Called by the TaskSetManager to report task completions or failures. + */ def taskEnded( task: Task[_], reason: TaskEndReason, @@ -188,18 +195,24 @@ class DAGScheduler( BlockManagerHeartbeat(blockManagerId), new RpcTimeout(600 seconds, "BlockManagerHeartbeat")) } - // Called by TaskScheduler when an executor fails. + /** + * Called by TaskScheduler implementation when an executor fails. + */ def executorLost(execId: String): Unit = { eventProcessLoop.post(ExecutorLost(execId)) } - // Called by TaskScheduler when a host is added + /** + * Called by TaskScheduler implementation when a host is added. + */ def executorAdded(execId: String, host: String): Unit = { eventProcessLoop.post(ExecutorAdded(execId, host)) } - // Called by TaskScheduler to cancel an entire TaskSet due to either repeated failures or - // cancellation of the job itself. + /** + * Called by the TaskSetManager to cancel an entire TaskSet due to either repeated failures or + * cancellation of the job itself. + */ def taskSetFailed(taskSet: TaskSet, reason: String, exception: Option[Throwable]): Unit = { eventProcessLoop.post(TaskSetFailed(taskSet, reason, exception)) } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [DOC] add missing parameters in SparkContext.scala for scala doc
Repository: spark Updated Branches: refs/heads/branch-1.5 73f1dd1b5 -> 5d6840569 [DOC] add missing parameters in SparkContext.scala for scala doc Author: Zhang, Liye Closes #8412 from liyezhang556520/minorDoc. (cherry picked from commit 5c14890159a5711072bf395f662b2433a389edf9) Signed-off-by: Sean Owen Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5d684056 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5d684056 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5d684056 Branch: refs/heads/branch-1.5 Commit: 5d6840569761a42624f9852b942e33039d21f46a Parents: 73f1dd1 Author: Zhang, Liye Authored: Tue Aug 25 11:48:55 2015 +0100 Committer: Sean Owen Committed: Tue Aug 25 11:49:07 2015 +0100 -- .../main/scala/org/apache/spark/SparkContext.scala | 15 ++- 1 file changed, 14 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/5d684056/core/src/main/scala/org/apache/spark/SparkContext.scala -- diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index 1ddaca8..9849aff 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -114,6 +114,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli * :: DeveloperApi :: * Alternative constructor for setting preferred locations where Spark will create executors. * + * @param config a [[org.apache.spark.SparkConf]] object specifying other Spark parameters * @param preferredNodeLocationData used in YARN mode to select nodes to launch containers on. * Can be generated using [[org.apache.spark.scheduler.InputFormatInfo.computePreferredLocations]] * from a list of input files or InputFormats for the application. @@ -145,6 +146,9 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli * @param jars Collection of JARs to send to the cluster. These can be paths on the local file * system or HDFS, HTTP, HTTPS, or FTP URLs. * @param environment Environment variables to set on worker nodes. + * @param preferredNodeLocationData used in YARN mode to select nodes to launch containers on. + * Can be generated using [[org.apache.spark.scheduler.InputFormatInfo.computePreferredLocations]] + * from a list of input files or InputFormats for the application. */ def this( master: String, @@ -841,6 +845,9 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli * @note Small files are preferred, large file is also allowable, but may cause bad performance. * @note On some filesystems, `.../path/*` can be a more efficient way to read all files * in a directory rather than `.../path/` or `.../path` + * + * @param path Directory to the input data files, the path can be comma separated paths as the + * list of inputs. * @param minPartitions A suggestion value of the minimal splitting number for input data. */ def wholeTextFiles( @@ -889,6 +896,9 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli * @note Small files are preferred; very large files may cause bad performance. * @note On some filesystems, `.../path/*` can be a more efficient way to read all files * in a directory rather than `.../path/` or `.../path` + * + * @param path Directory to the input data files, the path can be comma separated paths as the + * list of inputs. 
* @param minPartitions A suggestion value of the minimal splitting number for input data. */ @Experimental @@ -918,8 +928,11 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli * '''Note:''' We ensure that the byte array for each record in the resulting RDD * has the provided record length. * - * @param path Directory to the input data files + * @param path Directory to the input data files, the path can be comma separated paths as the + * list of inputs. * @param recordLength The length at which to split the records + * @param conf Configuration for setting up the dataset. + * * @return An RDD of data with values, represented as byte arrays */ @Experimental - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
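The most actionable of the newly documented details: `path` accepts a comma-separated list of inputs. A sketch, assuming an existing `SparkContext` named `sc` and two placeholder HDFS directories:

```
// Both directories are read into one RDD of (path, content) pairs.
val logs = sc.wholeTextFiles("hdfs://ns/logs/2015-08-24,hdfs://ns/logs/2015-08-25")
logs.keys.take(5).foreach(println)
```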
spark git commit: [DOC] add missing parameters in SparkContext.scala for scala doc
Repository: spark Updated Branches: refs/heads/master 0e6368ffa -> 5c1489015 [DOC] add missing parameters in SparkContext.scala for scala doc Author: Zhang, Liye Closes #8412 from liyezhang556520/minorDoc. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5c148901 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5c148901 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5c148901 Branch: refs/heads/master Commit: 5c14890159a5711072bf395f662b2433a389edf9 Parents: 0e6368f Author: Zhang, Liye Authored: Tue Aug 25 11:48:55 2015 +0100 Committer: Sean Owen Committed: Tue Aug 25 11:48:55 2015 +0100 -- .../main/scala/org/apache/spark/SparkContext.scala | 15 ++- 1 file changed, 14 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/5c148901/core/src/main/scala/org/apache/spark/SparkContext.scala -- diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index 1ddaca8..9849aff 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -114,6 +114,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli * :: DeveloperApi :: * Alternative constructor for setting preferred locations where Spark will create executors. * + * @param config a [[org.apache.spark.SparkConf]] object specifying other Spark parameters * @param preferredNodeLocationData used in YARN mode to select nodes to launch containers on. * Can be generated using [[org.apache.spark.scheduler.InputFormatInfo.computePreferredLocations]] * from a list of input files or InputFormats for the application. @@ -145,6 +146,9 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli * @param jars Collection of JARs to send to the cluster. These can be paths on the local file * system or HDFS, HTTP, HTTPS, or FTP URLs. * @param environment Environment variables to set on worker nodes. + * @param preferredNodeLocationData used in YARN mode to select nodes to launch containers on. + * Can be generated using [[org.apache.spark.scheduler.InputFormatInfo.computePreferredLocations]] + * from a list of input files or InputFormats for the application. */ def this( master: String, @@ -841,6 +845,9 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli * @note Small files are preferred, large file is also allowable, but may cause bad performance. * @note On some filesystems, `.../path/*` can be a more efficient way to read all files * in a directory rather than `.../path/` or `.../path` + * + * @param path Directory to the input data files, the path can be comma separated paths as the + * list of inputs. * @param minPartitions A suggestion value of the minimal splitting number for input data. */ def wholeTextFiles( @@ -889,6 +896,9 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli * @note Small files are preferred; very large files may cause bad performance. * @note On some filesystems, `.../path/*` can be a more efficient way to read all files * in a directory rather than `.../path/` or `.../path` + * + * @param path Directory to the input data files, the path can be comma separated paths as the + * list of inputs. * @param minPartitions A suggestion value of the minimal splitting number for input data. 
*/ @Experimental @@ -918,8 +928,11 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli * '''Note:''' We ensure that the byte array for each record in the resulting RDD * has the provided record length. * - * @param path Directory to the input data files + * @param path Directory to the input data files, the path can be comma separated paths as the + * list of inputs. * @param recordLength The length at which to split the records + * @param conf Configuration for setting up the dataset. + * * @return An RDD of data with values, represented as byte arrays */ @Experimental - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-10197] [SQL] Add null check in wrapperFor (inside HiveInspectors).
Repository: spark Updated Branches: refs/heads/branch-1.5 a0f22cf29 -> 73f1dd1b5 [SPARK-10197] [SQL] Add null check in wrapperFor (inside HiveInspectors). https://issues.apache.org/jira/browse/SPARK-10197 Author: Yin Huai Closes #8407 from yhuai/ORCSPARK-10197. (cherry picked from commit 0e6368ffaec1965d0c7f89420e04a974675c7f6e) Signed-off-by: Cheng Lian Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/73f1dd1b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/73f1dd1b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/73f1dd1b Branch: refs/heads/branch-1.5 Commit: 73f1dd1b5acf1c6c37045da25902d7ca5ab795e4 Parents: a0f22cf Author: Yin Huai Authored: Tue Aug 25 16:19:34 2015 +0800 Committer: Cheng Lian Committed: Tue Aug 25 16:20:13 2015 +0800 -- .../apache/spark/sql/hive/HiveInspectors.scala | 29 .../spark/sql/hive/orc/OrcSourceSuite.scala | 29 2 files changed, 53 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/73f1dd1b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala -- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala index 9824dad..64fffdb 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala @@ -370,17 +370,36 @@ private[hive] trait HiveInspectors { protected def wrapperFor(oi: ObjectInspector, dataType: DataType): Any => Any = oi match { case _: JavaHiveVarcharObjectInspector => (o: Any) => -val s = o.asInstanceOf[UTF8String].toString -new HiveVarchar(s, s.size) +if (o != null) { + val s = o.asInstanceOf[UTF8String].toString + new HiveVarchar(s, s.size) +} else { + null +} case _: JavaHiveDecimalObjectInspector => - (o: Any) => HiveDecimal.create(o.asInstanceOf[Decimal].toJavaBigDecimal) + (o: Any) => +if (o != null) { + HiveDecimal.create(o.asInstanceOf[Decimal].toJavaBigDecimal) +} else { + null +} case _: JavaDateObjectInspector => - (o: Any) => DateTimeUtils.toJavaDate(o.asInstanceOf[Int]) + (o: Any) => +if (o != null) { + DateTimeUtils.toJavaDate(o.asInstanceOf[Int]) +} else { + null +} case _: JavaTimestampObjectInspector => - (o: Any) => DateTimeUtils.toJavaTimestamp(o.asInstanceOf[Long]) + (o: Any) => +if (o != null) { + DateTimeUtils.toJavaTimestamp(o.asInstanceOf[Long]) +} else { + null +} case soi: StandardStructObjectInspector => val schema = dataType.asInstanceOf[StructType] http://git-wip-us.apache.org/repos/asf/spark/blob/73f1dd1b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcSourceSuite.scala -- diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcSourceSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcSourceSuite.scala index 82e08ca..80c3808 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcSourceSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcSourceSuite.scala @@ -121,6 +121,35 @@ abstract class OrcSuite extends QueryTest with BeforeAndAfterAll { sql("SELECT * FROM normal_orc_as_source"), (6 to 10).map(i => Row(i, s"part-$i"))) } + + test("write null values") { +sql("DROP TABLE IF EXISTS orcNullValues") + +val df = sql( + """ +|SELECT +| CAST(null as TINYINT), +| CAST(null as SMALLINT), +| CAST(null as INT), +| CAST(null as BIGINT), +| CAST(null as FLOAT), +| CAST(null as DOUBLE), +| CAST(null as DECIMAL(7,2)), +| CAST(null as 
TIMESTAMP), +| CAST(null as DATE), +| CAST(null as STRING), +| CAST(null as VARCHAR(10)) +|FROM orc_temp_table limit 1 + """.stripMargin) + +df.write.format("orc").saveAsTable("orcNullValues") + +checkAnswer( + sql("SELECT * FROM orcNullValues"), + Row.fromSeq(Seq.fill(11)(null))) + +sql("DROP TABLE IF EXISTS orcNullValues") + } } class OrcSourceSuite extends OrcSuite { - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
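Every branch of `wrapperFor` gains the same guard. Its shape, distilled into a generic helper (a sketch only; the patch inlines the checks rather than introducing this helper):

```
// Hypothetical helper capturing the pattern added in this patch:
def nullSafe(wrap: Any => Any): Any => Any =
  (o: Any) => if (o != null) wrap(o) else null

// e.g. the date wrapper is equivalent to:
//   nullSafe(o => DateTimeUtils.toJavaDate(o.asInstanceOf[Int]))
```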
spark git commit: [SPARK-10197] [SQL] Add null check in wrapperFor (inside HiveInspectors).
Repository: spark Updated Branches: refs/heads/master 7bc9a8c62 -> 0e6368ffa [SPARK-10197] [SQL] Add null check in wrapperFor (inside HiveInspectors). https://issues.apache.org/jira/browse/SPARK-10197 Author: Yin Huai Closes #8407 from yhuai/ORCSPARK-10197. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0e6368ff Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0e6368ff Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0e6368ff Branch: refs/heads/master Commit: 0e6368ffaec1965d0c7f89420e04a974675c7f6e Parents: 7bc9a8c Author: Yin Huai Authored: Tue Aug 25 16:19:34 2015 +0800 Committer: Cheng Lian Committed: Tue Aug 25 16:19:34 2015 +0800 -- .../apache/spark/sql/hive/HiveInspectors.scala | 29 .../spark/sql/hive/orc/OrcSourceSuite.scala | 29 2 files changed, 53 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/0e6368ff/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala -- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala index 9824dad..64fffdb 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala @@ -370,17 +370,36 @@ private[hive] trait HiveInspectors { protected def wrapperFor(oi: ObjectInspector, dataType: DataType): Any => Any = oi match { case _: JavaHiveVarcharObjectInspector => (o: Any) => -val s = o.asInstanceOf[UTF8String].toString -new HiveVarchar(s, s.size) +if (o != null) { + val s = o.asInstanceOf[UTF8String].toString + new HiveVarchar(s, s.size) +} else { + null +} case _: JavaHiveDecimalObjectInspector => - (o: Any) => HiveDecimal.create(o.asInstanceOf[Decimal].toJavaBigDecimal) + (o: Any) => +if (o != null) { + HiveDecimal.create(o.asInstanceOf[Decimal].toJavaBigDecimal) +} else { + null +} case _: JavaDateObjectInspector => - (o: Any) => DateTimeUtils.toJavaDate(o.asInstanceOf[Int]) + (o: Any) => +if (o != null) { + DateTimeUtils.toJavaDate(o.asInstanceOf[Int]) +} else { + null +} case _: JavaTimestampObjectInspector => - (o: Any) => DateTimeUtils.toJavaTimestamp(o.asInstanceOf[Long]) + (o: Any) => +if (o != null) { + DateTimeUtils.toJavaTimestamp(o.asInstanceOf[Long]) +} else { + null +} case soi: StandardStructObjectInspector => val schema = dataType.asInstanceOf[StructType] http://git-wip-us.apache.org/repos/asf/spark/blob/0e6368ff/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcSourceSuite.scala -- diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcSourceSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcSourceSuite.scala index 82e08ca..80c3808 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcSourceSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcSourceSuite.scala @@ -121,6 +121,35 @@ abstract class OrcSuite extends QueryTest with BeforeAndAfterAll { sql("SELECT * FROM normal_orc_as_source"), (6 to 10).map(i => Row(i, s"part-$i"))) } + + test("write null values") { +sql("DROP TABLE IF EXISTS orcNullValues") + +val df = sql( + """ +|SELECT +| CAST(null as TINYINT), +| CAST(null as SMALLINT), +| CAST(null as INT), +| CAST(null as BIGINT), +| CAST(null as FLOAT), +| CAST(null as DOUBLE), +| CAST(null as DECIMAL(7,2)), +| CAST(null as TIMESTAMP), +| CAST(null as DATE), +| CAST(null as STRING), +| CAST(null as VARCHAR(10)) +|FROM 
orc_temp_table limit 1 + """.stripMargin) + +df.write.format("orc").saveAsTable("orcNullValues") + +checkAnswer( + sql("SELECT * FROM orcNullValues"), + Row.fromSeq(Seq.fill(11)(null))) + +sql("DROP TABLE IF EXISTS orcNullValues") + } } class OrcSourceSuite extends OrcSuite { - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-10195] [SQL] Data sources Filter should not expose internal types
Repository: spark Updated Branches: refs/heads/branch-1.5 e5cea566a -> a0f22cf29 [SPARK-10195] [SQL] Data sources Filter should not expose internal types Spark SQL's data sources API exposes Catalyst's internal types through its Filter interfaces. This is a problem because types like UTF8String are not stable developer APIs and should not be exposed to third-parties. This issue caused incompatibilities when upgrading our `spark-redshift` library to work against Spark 1.5.0. To avoid these issues in the future we should only expose public types through these Filter objects. This patch accomplishes this by using CatalystTypeConverters to add the appropriate conversions. Author: Josh Rosen Closes #8403 from JoshRosen/datasources-internal-vs-external-types. (cherry picked from commit 7bc9a8c6249300ded31ea931c463d0a8f798e193) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a0f22cf2 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a0f22cf2 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a0f22cf2 Branch: refs/heads/branch-1.5 Commit: a0f22cf295a1d20814c5be6cc727e39e95a81c27 Parents: e5cea56 Author: Josh Rosen Authored: Tue Aug 25 01:06:36 2015 -0700 Committer: Reynold Xin Committed: Tue Aug 25 01:06:51 2015 -0700 -- .../datasources/DataSourceStrategy.scala| 67 ++-- .../execution/datasources/jdbc/JDBCRDD.scala| 2 +- .../datasources/parquet/ParquetFilters.scala| 19 +++--- .../spark/sql/sources/FilteredScanSuite.scala | 7 ++ 4 files changed, 54 insertions(+), 41 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/a0f22cf2/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala index 2a4c40d..6c1ef6a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala @@ -20,7 +20,8 @@ package org.apache.spark.sql.execution.datasources import org.apache.spark.{Logging, TaskContext} import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.rdd.{MapPartitionsRDD, RDD, UnionRDD} -import org.apache.spark.sql.catalyst.{InternalRow, expressions} +import org.apache.spark.sql.catalyst.CatalystTypeConverters.convertToScala +import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow, expressions} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.planning.PhysicalOperation import org.apache.spark.sql.catalyst.plans.logical @@ -344,45 +345,47 @@ private[sql] object DataSourceStrategy extends Strategy with Logging { */ protected[sql] def selectFilters(filters: Seq[Expression]) = { def translate(predicate: Expression): Option[Filter] = predicate match { - case expressions.EqualTo(a: Attribute, Literal(v, _)) => -Some(sources.EqualTo(a.name, v)) - case expressions.EqualTo(Literal(v, _), a: Attribute) => -Some(sources.EqualTo(a.name, v)) - - case expressions.EqualNullSafe(a: Attribute, Literal(v, _)) => -Some(sources.EqualNullSafe(a.name, v)) - case expressions.EqualNullSafe(Literal(v, _), a: Attribute) => -Some(sources.EqualNullSafe(a.name, v)) - - case expressions.GreaterThan(a: Attribute, Literal(v, _)) => -Some(sources.GreaterThan(a.name, v)) - case 
expressions.GreaterThan(Literal(v, _), a: Attribute) => -Some(sources.LessThan(a.name, v)) - - case expressions.LessThan(a: Attribute, Literal(v, _)) => -Some(sources.LessThan(a.name, v)) - case expressions.LessThan(Literal(v, _), a: Attribute) => -Some(sources.GreaterThan(a.name, v)) - - case expressions.GreaterThanOrEqual(a: Attribute, Literal(v, _)) => -Some(sources.GreaterThanOrEqual(a.name, v)) - case expressions.GreaterThanOrEqual(Literal(v, _), a: Attribute) => -Some(sources.LessThanOrEqual(a.name, v)) - - case expressions.LessThanOrEqual(a: Attribute, Literal(v, _)) => -Some(sources.LessThanOrEqual(a.name, v)) - case expressions.LessThanOrEqual(Literal(v, _), a: Attribute) => -Some(sources.GreaterThanOrEqual(a.name, v)) + case expressions.EqualTo(a: Attribute, Literal(v, t)) => +Some(sources.EqualTo(a.name, convertToScala(v, t))) + case expressions.EqualTo(Literal(v, t), a: Attribute) => +Some(sources.EqualTo(a.name, convertToScala(v, t))) + + case
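Concretely, a Catalyst `UTF8String` literal now reaches data source implementations as a plain `java.lang.String`. A sketch assuming Spark 1.5's internal classes are on the classpath:

```
import org.apache.spark.sql.catalyst.CatalystTypeConverters.convertToScala
import org.apache.spark.sql.types.StringType
import org.apache.spark.unsafe.types.UTF8String

val internal = UTF8String.fromString("Smith")        // Catalyst-internal value
val external = convertToScala(internal, StringType)  // java.lang.String "Smith"
// sources.EqualTo("name", external) is what a data source now receives.
```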
spark git commit: [SPARK-10195] [SQL] Data sources Filter should not expose internal types
Repository: spark Updated Branches: refs/heads/master 2f493f7e3 -> 7bc9a8c62 [SPARK-10195] [SQL] Data sources Filter should not expose internal types Spark SQL's data sources API exposes Catalyst's internal types through its Filter interfaces. This is a problem because types like UTF8String are not stable developer APIs and should not be exposed to third parties. This issue caused incompatibilities when upgrading our `spark-redshift` library to work against Spark 1.5.0. To avoid these issues in the future, we should only expose public types through these Filter objects. This patch accomplishes this by using CatalystTypeConverters to add the appropriate conversions. Author: Josh Rosen Closes #8403 from JoshRosen/datasources-internal-vs-external-types. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7bc9a8c6 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7bc9a8c6 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7bc9a8c6 Branch: refs/heads/master Commit: 7bc9a8c6249300ded31ea931c463d0a8f798e193 Parents: 2f493f7 Author: Josh Rosen Authored: Tue Aug 25 01:06:36 2015 -0700 Committer: Reynold Xin Committed: Tue Aug 25 01:06:36 2015 -0700 -- .../datasources/DataSourceStrategy.scala | 67 ++-- .../execution/datasources/jdbc/JDBCRDD.scala | 2 +- .../datasources/parquet/ParquetFilters.scala | 19 +++--- .../spark/sql/sources/FilteredScanSuite.scala | 7 ++ 4 files changed, 54 insertions(+), 41 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/7bc9a8c6/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala --

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala
index 2a4c40d..6c1ef6a 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala
@@ -20,7 +20,8 @@ package org.apache.spark.sql.execution.datasources
 import org.apache.spark.{Logging, TaskContext}
 import org.apache.spark.deploy.SparkHadoopUtil
 import org.apache.spark.rdd.{MapPartitionsRDD, RDD, UnionRDD}
-import org.apache.spark.sql.catalyst.{InternalRow, expressions}
+import org.apache.spark.sql.catalyst.CatalystTypeConverters.convertToScala
+import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow, expressions}
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.planning.PhysicalOperation
 import org.apache.spark.sql.catalyst.plans.logical
@@ -344,45 +345,47 @@ private[sql] object DataSourceStrategy extends Strategy with Logging {
    */
   protected[sql] def selectFilters(filters: Seq[Expression]) = {
     def translate(predicate: Expression): Option[Filter] = predicate match {
-      case expressions.EqualTo(a: Attribute, Literal(v, _)) =>
-        Some(sources.EqualTo(a.name, v))
-      case expressions.EqualTo(Literal(v, _), a: Attribute) =>
-        Some(sources.EqualTo(a.name, v))
-
-      case expressions.EqualNullSafe(a: Attribute, Literal(v, _)) =>
-        Some(sources.EqualNullSafe(a.name, v))
-      case expressions.EqualNullSafe(Literal(v, _), a: Attribute) =>
-        Some(sources.EqualNullSafe(a.name, v))
-
-      case expressions.GreaterThan(a: Attribute, Literal(v, _)) =>
-        Some(sources.GreaterThan(a.name, v))
-      case expressions.GreaterThan(Literal(v, _), a: Attribute) =>
-        Some(sources.LessThan(a.name, v))
-
-      case expressions.LessThan(a: Attribute, Literal(v, _)) =>
-        Some(sources.LessThan(a.name, v))
-      case expressions.LessThan(Literal(v, _), a: Attribute) =>
-        Some(sources.GreaterThan(a.name, v))
-
-      case expressions.GreaterThanOrEqual(a: Attribute, Literal(v, _)) =>
-        Some(sources.GreaterThanOrEqual(a.name, v))
-      case expressions.GreaterThanOrEqual(Literal(v, _), a: Attribute) =>
-        Some(sources.LessThanOrEqual(a.name, v))
-
-      case expressions.LessThanOrEqual(a: Attribute, Literal(v, _)) =>
-        Some(sources.LessThanOrEqual(a.name, v))
-      case expressions.LessThanOrEqual(Literal(v, _), a: Attribute) =>
-        Some(sources.GreaterThanOrEqual(a.name, v))
+      case expressions.EqualTo(a: Attribute, Literal(v, t)) =>
+        Some(sources.EqualTo(a.name, convertToScala(v, t)))
+      case expressions.EqualTo(Literal(v, t), a: Attribute) =>
+        Some(sources.EqualTo(a.name, convertToScala(v, t)))
+
+      case expressions.EqualNullSafe(a: Attribute, Literal(v, t)) =>
+        Some(sources.EqualNullSafe(a.name, convertToScala(v, t)))
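For data source developers, the effect of the patch is that filter values now arrive as public Scala/Java types rather than Catalyst internals. A minimal sketch of the conversion it relies on (the wrapper object is illustrative; assumes Spark 1.5 artifacts on the classpath):

```
import org.apache.spark.sql.catalyst.CatalystTypeConverters.convertToScala
import org.apache.spark.sql.sources.EqualTo
import org.apache.spark.sql.types.StringType
import org.apache.spark.unsafe.types.UTF8String

object FilterConversionSketch {
  def main(args: Array[String]): Unit = {
    // Inside Catalyst, a string literal is held as UTF8String, an internal type.
    val internal: Any = UTF8String.fromString("Smith")
    // convertToScala maps the internal value back to a public type
    // (java.lang.String here) before it is handed to a third-party Filter.
    val external = convertToScala(internal, StringType)
    println(EqualTo("name", external)) // EqualTo(name,Smith), value is a plain String
  }
}
```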
spark git commit: [SPARK-10177] [SQL] fix reading Timestamp in parquet from Hive
Repository: spark Updated Branches: refs/heads/branch-1.5 2032d6670 -> e5cea566a [SPARK-10177] [SQL] fix reading Timestamp in parquet from Hive We misunderstood how the Julian day and the nanoseconds of the day are stored in parquet (as TimestampType) by Hive/Impala: the two fields overlap, so they cannot simply be added together. To avoid confusing rounding during the conversion, we use `2440588` as the Julian day of the Unix epoch (which is really 2440587.5). Author: Davies Liu Author: Cheng Lian Closes #8400 from davies/timestamp_parquet. (cherry picked from commit 2f493f7e3924b769160a16f73cccbebf21973b91) Signed-off-by: Cheng Lian Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e5cea566 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e5cea566 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e5cea566 Branch: refs/heads/branch-1.5 Commit: e5cea566a32d254adc9424a2f9e79b92eda3e6e4 Parents: 2032d66 Author: Davies Liu Authored: Tue Aug 25 16:00:44 2015 +0800 Committer: Cheng Lian Committed: Tue Aug 25 16:00:58 2015 +0800 -- .../apache/spark/sql/catalyst/util/DateTimeUtils.scala | 7 --- .../spark/sql/catalyst/util/DateTimeUtilsSuite.scala | 13 + .../spark/sql/hive/ParquetHiveCompatibilitySuite.scala | 2 +- 3 files changed, 14 insertions(+), 8 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e5cea566/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala --

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
index 6726204..d652fce 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
@@ -37,7 +37,8 @@ object DateTimeUtils {
   type SQLTimestamp = Long
 
   // see http://stackoverflow.com/questions/466321/convert-unix-timestamp-to-julian
-  final val JULIAN_DAY_OF_EPOCH = 2440587  // and .5
+  // it's 2440587.5, rounding up to compatible with Hive
+  final val JULIAN_DAY_OF_EPOCH = 2440588
   final val SECONDS_PER_DAY = 60 * 60 * 24L
   final val MICROS_PER_SECOND = 1000L * 1000L
   final val NANOS_PER_SECOND = MICROS_PER_SECOND * 1000L
@@ -183,7 +184,7 @@ object DateTimeUtils {
    */
   def fromJulianDay(day: Int, nanoseconds: Long): SQLTimestamp = {
     // use Long to avoid rounding errors
-    val seconds = (day - JULIAN_DAY_OF_EPOCH).toLong * SECONDS_PER_DAY - SECONDS_PER_DAY / 2
+    val seconds = (day - JULIAN_DAY_OF_EPOCH).toLong * SECONDS_PER_DAY
     seconds * MICROS_PER_SECOND + nanoseconds / 1000L
   }
 
@@ -191,7 +192,7 @@ object DateTimeUtils {
    * Returns Julian day and nanoseconds in a day from the number of microseconds
    */
   def toJulianDay(us: SQLTimestamp): (Int, Long) = {
-    val seconds = us / MICROS_PER_SECOND + SECONDS_PER_DAY / 2
+    val seconds = us / MICROS_PER_SECOND
     val day = seconds / SECONDS_PER_DAY + JULIAN_DAY_OF_EPOCH
     val secondsInDay = seconds % SECONDS_PER_DAY
     val nanos = (us % MICROS_PER_SECOND) * 1000L

http://git-wip-us.apache.org/repos/asf/spark/blob/e5cea566/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala --

diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala
index d18fa4d..1596bb7 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala
@@ -49,13 +49,18 @@ class DateTimeUtilsSuite extends SparkFunSuite {
   test("us and julian day") {
     val (d, ns) = toJulianDay(0)
     assert(d === JULIAN_DAY_OF_EPOCH)
-    assert(ns === SECONDS_PER_DAY / 2 * NANOS_PER_SECOND)
+    assert(ns === 0)
     assert(fromJulianDay(d, ns) == 0L)
 
-    val t = new Timestamp(6139477861L) // (2015, 6, 11, 10, 10, 10, 100)
+    val t = Timestamp.valueOf("2015-06-11 10:10:10.100")
     val (d1, ns1) = toJulianDay(fromJavaTimestamp(t))
-    val t2 = toJavaTimestamp(fromJulianDay(d1, ns1))
-    assert(t.equals(t2))
+    val t1 = toJavaTimestamp(fromJulianDay(d1, ns1))
+    assert(t.equals(t1))
+
+    val t2 = Timestamp.valueOf("2015-06-11 20:10:10.100")
+    val (d2, ns2) = toJulianDay(fromJavaTimestamp(t2))
+    val t22 = toJavaTimestamp(fromJulianDay(d2, ns2))
+    assert(t2.equals(t22))
   }
 
   test("SPARK-6785: java date conversion before and after epoch") {
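The fix boils down to simple day/microsecond arithmetic. A self-contained sketch of the corrected conversions, with the constants and logic inlined from the patched `DateTimeUtils` (the wrapper object is illustrative):

```
object JulianDaySketch {
  // Julian day of the Unix epoch, rounded up from 2440587.5 to match Hive.
  final val JULIAN_DAY_OF_EPOCH = 2440588
  final val SECONDS_PER_DAY = 60 * 60 * 24L
  final val MICROS_PER_SECOND = 1000L * 1000L
  final val NANOS_PER_SECOND = MICROS_PER_SECOND * 1000L

  // Microseconds since the epoch from a Julian day plus nanoseconds in that day.
  def fromJulianDay(day: Int, nanoseconds: Long): Long = {
    val seconds = (day - JULIAN_DAY_OF_EPOCH).toLong * SECONDS_PER_DAY
    seconds * MICROS_PER_SECOND + nanoseconds / 1000L
  }

  // The inverse: Julian day and nanoseconds in the day from microseconds.
  def toJulianDay(us: Long): (Int, Long) = {
    val seconds = us / MICROS_PER_SECOND
    val day = (seconds / SECONDS_PER_DAY + JULIAN_DAY_OF_EPOCH).toInt
    val secondsInDay = seconds % SECONDS_PER_DAY
    (day, secondsInDay * NANOS_PER_SECOND + (us % MICROS_PER_SECOND) * 1000L)
  }

  def main(args: Array[String]): Unit = {
    // With the rounded-up epoch, day 2440588 starts exactly at microsecond 0,
    // so no half-day offset is needed in either direction.
    val (d, ns) = toJulianDay(0L)
    assert(d == JULIAN_DAY_OF_EPOCH && ns == 0L)
    assert(fromJulianDay(d, ns) == 0L)
  }
}
```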
spark git commit: [SPARK-10177] [SQL] fix reading Timestamp in parquet from Hive
Repository: spark Updated Branches: refs/heads/master 1fc37581a -> 2f493f7e3 [SPARK-10177] [SQL] fix reading Timestamp in parquet from Hive We misunderstood how the Julian day and the nanoseconds of the day are stored in parquet (as TimestampType) by Hive/Impala: the two fields overlap, so they cannot simply be added together. To avoid confusing rounding during the conversion, we use `2440588` as the Julian day of the Unix epoch (which is really 2440587.5). Author: Davies Liu Author: Cheng Lian Closes #8400 from davies/timestamp_parquet. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2f493f7e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2f493f7e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2f493f7e Branch: refs/heads/master Commit: 2f493f7e3924b769160a16f73cccbebf21973b91 Parents: 1fc3758 Author: Davies Liu Authored: Tue Aug 25 16:00:44 2015 +0800 Committer: Cheng Lian Committed: Tue Aug 25 16:00:44 2015 +0800 -- .../apache/spark/sql/catalyst/util/DateTimeUtils.scala | 7 --- .../spark/sql/catalyst/util/DateTimeUtilsSuite.scala | 13 + .../spark/sql/hive/ParquetHiveCompatibilitySuite.scala | 2 +- 3 files changed, 14 insertions(+), 8 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/2f493f7e/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala --

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
index 6726204..d652fce 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
@@ -37,7 +37,8 @@ object DateTimeUtils {
   type SQLTimestamp = Long
 
   // see http://stackoverflow.com/questions/466321/convert-unix-timestamp-to-julian
-  final val JULIAN_DAY_OF_EPOCH = 2440587  // and .5
+  // it's 2440587.5, rounding up to compatible with Hive
+  final val JULIAN_DAY_OF_EPOCH = 2440588
   final val SECONDS_PER_DAY = 60 * 60 * 24L
   final val MICROS_PER_SECOND = 1000L * 1000L
   final val NANOS_PER_SECOND = MICROS_PER_SECOND * 1000L
@@ -183,7 +184,7 @@ object DateTimeUtils {
    */
   def fromJulianDay(day: Int, nanoseconds: Long): SQLTimestamp = {
     // use Long to avoid rounding errors
-    val seconds = (day - JULIAN_DAY_OF_EPOCH).toLong * SECONDS_PER_DAY - SECONDS_PER_DAY / 2
+    val seconds = (day - JULIAN_DAY_OF_EPOCH).toLong * SECONDS_PER_DAY
     seconds * MICROS_PER_SECOND + nanoseconds / 1000L
   }
 
@@ -191,7 +192,7 @@ object DateTimeUtils {
    * Returns Julian day and nanoseconds in a day from the number of microseconds
    */
   def toJulianDay(us: SQLTimestamp): (Int, Long) = {
-    val seconds = us / MICROS_PER_SECOND + SECONDS_PER_DAY / 2
+    val seconds = us / MICROS_PER_SECOND
     val day = seconds / SECONDS_PER_DAY + JULIAN_DAY_OF_EPOCH
     val secondsInDay = seconds % SECONDS_PER_DAY
     val nanos = (us % MICROS_PER_SECOND) * 1000L

http://git-wip-us.apache.org/repos/asf/spark/blob/2f493f7e/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala --

diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala
index d18fa4d..1596bb7 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala
@@ -49,13 +49,18 @@ class DateTimeUtilsSuite extends SparkFunSuite {
   test("us and julian day") {
     val (d, ns) = toJulianDay(0)
     assert(d === JULIAN_DAY_OF_EPOCH)
-    assert(ns === SECONDS_PER_DAY / 2 * NANOS_PER_SECOND)
+    assert(ns === 0)
     assert(fromJulianDay(d, ns) == 0L)
 
-    val t = new Timestamp(6139477861L) // (2015, 6, 11, 10, 10, 10, 100)
+    val t = Timestamp.valueOf("2015-06-11 10:10:10.100")
     val (d1, ns1) = toJulianDay(fromJavaTimestamp(t))
-    val t2 = toJavaTimestamp(fromJulianDay(d1, ns1))
-    assert(t.equals(t2))
+    val t1 = toJavaTimestamp(fromJulianDay(d1, ns1))
+    assert(t.equals(t1))
+
+    val t2 = Timestamp.valueOf("2015-06-11 20:10:10.100")
+    val (d2, ns2) = toJulianDay(fromJavaTimestamp(t2))
+    val t22 = toJavaTimestamp(fromJulianDay(d2, ns2))
+    assert(t2.equals(t22))
   }
 
   test("SPARK-6785: java date conversion before and after epoch") {

http://git-wip-us.apache.org/repos/asf/spark/blob/2f493f7e/sql/hive/src/test/scala/org/
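The round trip the new tests exercise can also be checked standalone. A sketch using the same `DateTimeUtils` methods that appear in the diff (assumes Spark 1.5's spark-catalyst artifact on the classpath; the object name is illustrative):

```
import java.sql.Timestamp
import org.apache.spark.sql.catalyst.util.DateTimeUtils._

object TimestampRoundTripSketch {
  def main(args: Array[String]): Unit = {
    // Microseconds -> (Julian day, nanoseconds of day) -> microseconds must be
    // lossless; before the fix the half-day offset broke this for Hive data.
    val t = Timestamp.valueOf("2015-06-11 10:10:10.100")
    val (day, nanos) = toJulianDay(fromJavaTimestamp(t))
    val back = toJavaTimestamp(fromJulianDay(day, nanos))
    assert(t.equals(back), s"expected $t, got $back")
  }
}
```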
spark git commit: [SPARK-10210] [STREAMING] Filter out non-existent blocks before creating BlockRDD
Repository: spark Updated Branches: refs/heads/branch-1.5 4841ebb18 -> 2032d6670 [SPARK-10210] [STREAMING] Filter out non-existent blocks before creating BlockRDD When the write ahead log is not enabled, a recovered streaming driver still tries to run jobs using pre-failure block ids, and fails because those blocks no longer exist in memory (and cannot be recovered, as the receiver WAL is not enabled). This occurs because the driver-side WAL of ReceivedBlockTracker recovers the past block information, and ReceiverInputDStream creates BlockRDDs even if those blocks do not exist. The solution in this PR is to filter out block ids that do not exist before creating the BlockRDD. In addition, it adds unit tests to verify other logic in ReceiverInputDStream. Author: Tathagata Das Closes #8405 from tdas/SPARK-10210. (cherry picked from commit 1fc37581a52530bac5d555dbf14927a5780c3b75) Signed-off-by: Tathagata Das Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2032d667 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2032d667 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2032d667 Branch: refs/heads/branch-1.5 Commit: 2032d66706d165079550f06bf695e0b08be7e143 Parents: 4841ebb Author: Tathagata Das Authored: Tue Aug 25 00:35:51 2015 -0700 Committer: Tathagata Das Committed: Tue Aug 25 00:36:01 2015 -0700 -- .../dstream/ReceiverInputDStream.scala | 10 +- .../rdd/WriteAheadLogBackedBlockRDD.scala | 2 +- .../streaming/ReceiverInputDStreamSuite.scala | 156 +++ 3 files changed, 166 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/2032d667/streaming/src/main/scala/org/apache/spark/streaming/dstream/ReceiverInputDStream.scala --

diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/ReceiverInputDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/ReceiverInputDStream.scala
index a158009..6c139f3 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/ReceiverInputDStream.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/ReceiverInputDStream.scala
@@ -116,7 +116,15 @@ abstract class ReceiverInputDStream[T: ClassTag](@transient ssc_ : StreamingCont
             logWarning("Some blocks have Write Ahead Log information; this is unexpected")
           }
         }
-        new BlockRDD[T](ssc.sc, blockIds)
+        val validBlockIds = blockIds.filter { id =>
+          ssc.sparkContext.env.blockManager.master.contains(id)
+        }
+        if (validBlockIds.size != blockIds.size) {
+          logWarning("Some blocks could not be recovered as they were not found in memory. " +
+            "To prevent such data loss, enabled Write Ahead Log (see programming guide " +
+            "for more details.")
+        }
+        new BlockRDD[T](ssc.sc, validBlockIds)
       }
     } else {
       // If no block is ready now, creating WriteAheadLogBackedBlockRDD or BlockRDD

http://git-wip-us.apache.org/repos/asf/spark/blob/2032d667/streaming/src/main/scala/org/apache/spark/streaming/rdd/WriteAheadLogBackedBlockRDD.scala --

diff --git a/streaming/src/main/scala/org/apache/spark/streaming/rdd/WriteAheadLogBackedBlockRDD.scala b/streaming/src/main/scala/org/apache/spark/streaming/rdd/WriteAheadLogBackedBlockRDD.scala
index 620b8a3..e081ffe 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/rdd/WriteAheadLogBackedBlockRDD.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/rdd/WriteAheadLogBackedBlockRDD.scala
@@ -75,7 +75,7 @@ private[streaming] class WriteAheadLogBackedBlockRDD[T: ClassTag](
     @transient sc: SparkContext,
     @transient blockIds: Array[BlockId],
-    @transient walRecordHandles: Array[WriteAheadLogRecordHandle],
+    @transient val walRecordHandles: Array[WriteAheadLogRecordHandle],
     @transient isBlockIdValid: Array[Boolean] = Array.empty,
     storeInBlockManager: Boolean = false,
     storageLevel: StorageLevel = StorageLevel.MEMORY_ONLY_SER)

http://git-wip-us.apache.org/repos/asf/spark/blob/2032d667/streaming/src/test/scala/org/apache/spark/streaming/ReceiverInputDStreamSuite.scala --

diff --git a/streaming/src/test/scala/org/apache/spark/streaming/ReceiverInputDStreamSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/ReceiverInputDStreamSuite.scala
new file mode 100644
index 000..6d388d9
--- /dev/null
+++ b/streaming/src/test/scala/org/apache/spark/streaming/ReceiverInputDStreamSuite.scala
@@ -0,0 +1,156 @@
+/*
+ * Licensed to the Apache Software Founda
spark git commit: [SPARK-10210] [STREAMING] Filter out non-existent blocks before creating BlockRDD
Repository: spark Updated Branches: refs/heads/master 57b960bf3 -> 1fc37581a [SPARK-10210] [STREAMING] Filter out non-existent blocks before creating BlockRDD When the write ahead log is not enabled, a recovered streaming driver still tries to run jobs using pre-failure block ids, and fails because those blocks no longer exist in memory (and cannot be recovered, as the receiver WAL is not enabled). This occurs because the driver-side WAL of ReceivedBlockTracker recovers the past block information, and ReceiverInputDStream creates BlockRDDs even if those blocks do not exist. The solution in this PR is to filter out block ids that do not exist before creating the BlockRDD. In addition, it adds unit tests to verify other logic in ReceiverInputDStream. Author: Tathagata Das Closes #8405 from tdas/SPARK-10210. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1fc37581 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1fc37581 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1fc37581 Branch: refs/heads/master Commit: 1fc37581a52530bac5d555dbf14927a5780c3b75 Parents: 57b960b Author: Tathagata Das Authored: Tue Aug 25 00:35:51 2015 -0700 Committer: Tathagata Das Committed: Tue Aug 25 00:35:51 2015 -0700 -- .../dstream/ReceiverInputDStream.scala | 10 +- .../rdd/WriteAheadLogBackedBlockRDD.scala | 2 +- .../streaming/ReceiverInputDStreamSuite.scala | 156 +++ 3 files changed, 166 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/1fc37581/streaming/src/main/scala/org/apache/spark/streaming/dstream/ReceiverInputDStream.scala --

diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/ReceiverInputDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/ReceiverInputDStream.scala
index a158009..6c139f3 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/ReceiverInputDStream.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/ReceiverInputDStream.scala
@@ -116,7 +116,15 @@ abstract class ReceiverInputDStream[T: ClassTag](@transient ssc_ : StreamingCont
             logWarning("Some blocks have Write Ahead Log information; this is unexpected")
           }
         }
-        new BlockRDD[T](ssc.sc, blockIds)
+        val validBlockIds = blockIds.filter { id =>
+          ssc.sparkContext.env.blockManager.master.contains(id)
+        }
+        if (validBlockIds.size != blockIds.size) {
+          logWarning("Some blocks could not be recovered as they were not found in memory. " +
+            "To prevent such data loss, enabled Write Ahead Log (see programming guide " +
+            "for more details.")
+        }
+        new BlockRDD[T](ssc.sc, validBlockIds)
       }
     } else {
       // If no block is ready now, creating WriteAheadLogBackedBlockRDD or BlockRDD

http://git-wip-us.apache.org/repos/asf/spark/blob/1fc37581/streaming/src/main/scala/org/apache/spark/streaming/rdd/WriteAheadLogBackedBlockRDD.scala --

diff --git a/streaming/src/main/scala/org/apache/spark/streaming/rdd/WriteAheadLogBackedBlockRDD.scala b/streaming/src/main/scala/org/apache/spark/streaming/rdd/WriteAheadLogBackedBlockRDD.scala
index 620b8a3..e081ffe 100644
--- a/streaming/src/main/scala/org/apache/spark/streaming/rdd/WriteAheadLogBackedBlockRDD.scala
+++ b/streaming/src/main/scala/org/apache/spark/streaming/rdd/WriteAheadLogBackedBlockRDD.scala
@@ -75,7 +75,7 @@ private[streaming] class WriteAheadLogBackedBlockRDD[T: ClassTag](
     @transient sc: SparkContext,
     @transient blockIds: Array[BlockId],
-    @transient walRecordHandles: Array[WriteAheadLogRecordHandle],
+    @transient val walRecordHandles: Array[WriteAheadLogRecordHandle],
     @transient isBlockIdValid: Array[Boolean] = Array.empty,
     storeInBlockManager: Boolean = false,
     storageLevel: StorageLevel = StorageLevel.MEMORY_ONLY_SER)

http://git-wip-us.apache.org/repos/asf/spark/blob/1fc37581/streaming/src/test/scala/org/apache/spark/streaming/ReceiverInputDStreamSuite.scala --

diff --git a/streaming/src/test/scala/org/apache/spark/streaming/ReceiverInputDStreamSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/ReceiverInputDStreamSuite.scala
new file mode 100644
index 000..6d388d9
--- /dev/null
+++ b/streaming/src/test/scala/org/apache/spark/streaming/ReceiverInputDStreamSuite.scala
@@ -0,0 +1,156 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * t
spark git commit: [SPARK-6196] [BUILD] Remove MapR profiles in favor of hadoop-provided
Repository: spark Updated Branches: refs/heads/branch-1.5 76d920f2b -> 4841ebb18 [SPARK-6196] [BUILD] Remove MapR profiles in favor of hadoop-provided Follow-up to https://github.com/apache/spark/pull/7047. pwendell mentioned that MapR should use `hadoop-provided` now, and indeed the new build script does not produce `mapr3`/`mapr4` artifacts anymore. Hence the action seems to be to remove the profiles, which are now unused. CC trystanleftwich Author: Sean Owen Closes #8338 from srowen/SPARK-6196. (cherry picked from commit 57b960bf3706728513f9e089455a533f0244312e) Signed-off-by: Sean Owen Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4841ebb1 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4841ebb1 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4841ebb1 Branch: refs/heads/branch-1.5 Commit: 4841ebb1861025067a1108c11f64bb144427a308 Parents: 76d920f Author: Sean Owen Authored: Tue Aug 25 08:32:20 2015 +0100 Committer: Sean Owen Committed: Tue Aug 25 08:32:31 2015 +0100 -- pom.xml | 38 -- 1 file changed, 38 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4841ebb1/pom.xml -- diff --git a/pom.xml b/pom.xml index ccfa1ea..a597f86 100644 --- a/pom.xml +++ b/pom.xml @@ -2382,44 +2382,6 @@ - mapr3 - -1.0.3-mapr-3.0.3 -2.4.1-mapr-1408 -0.98.4-mapr-1408 -3.4.5-mapr-1406 - - - - - mapr4 - -2.4.1-mapr-1408 -2.4.1-mapr-1408 -0.98.4-mapr-1408 -3.4.5-mapr-1406 - - - - org.apache.curator - curator-recipes - ${curator.version} - - - org.apache.zookeeper - zookeeper - - - - - org.apache.zookeeper - zookeeper - 3.4.5-mapr-1406 - - - - - hive-thriftserver sql/hive-thriftserver - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-6196] [BUILD] Remove MapR profiles in favor of hadoop-provided
Repository: spark Updated Branches: refs/heads/master d4549fe58 -> 57b960bf3 [SPARK-6196] [BUILD] Remove MapR profiles in favor of hadoop-provided Follow-up to https://github.com/apache/spark/pull/7047. pwendell mentioned that MapR should use `hadoop-provided` now, and indeed the new build script does not produce `mapr3`/`mapr4` artifacts anymore. Hence the action seems to be to remove the profiles, which are now unused. CC trystanleftwich Author: Sean Owen Closes #8338 from srowen/SPARK-6196. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/57b960bf Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/57b960bf Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/57b960bf Branch: refs/heads/master Commit: 57b960bf3706728513f9e089455a533f0244312e Parents: d4549fe Author: Sean Owen Authored: Tue Aug 25 08:32:20 2015 +0100 Committer: Sean Owen Committed: Tue Aug 25 08:32:20 2015 +0100 -- pom.xml | 38 -- 1 file changed, 38 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/57b960bf/pom.xml -- diff --git a/pom.xml b/pom.xml index d5945f2..0716016 100644 --- a/pom.xml +++ b/pom.xml @@ -2387,44 +2387,6 @@ - mapr3 - -1.0.3-mapr-3.0.3 -2.4.1-mapr-1408 -0.98.4-mapr-1408 -3.4.5-mapr-1406 - - - - - mapr4 - -2.4.1-mapr-1408 -2.4.1-mapr-1408 -0.98.4-mapr-1408 -3.4.5-mapr-1406 - - - - org.apache.curator - curator-recipes - ${curator.version} - - - org.apache.zookeeper - zookeeper - - - - - org.apache.zookeeper - zookeeper - 3.4.5-mapr-1406 - - - - - hive-thriftserver sql/hive-thriftserver - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-10214] [SPARKR] [DOCS] Improve SparkR Column, DataFrame API docs
Repository: spark Updated Branches: refs/heads/branch-1.5 b7c4ff144 -> 76d920f2b [SPARK-10214] [SPARKR] [DOCS] Improve SparkR Column, DataFrame API docs cc: shivaram ## Summary - Add name tags to each methods in DataFrame.R and column.R - Replace `rdname column` with `rdname {each_func}`. i.e. alias method : `rdname column` => `rdname alias` ## Generated PDF File https://drive.google.com/file/d/0B9biIZIU47lLNHN2aFpnQXlSeGs/view?usp=sharing ## JIRA [[SPARK-10214] Improve SparkR Column, DataFrame API docs - ASF JIRA](https://issues.apache.org/jira/browse/SPARK-10214) Author: Yu ISHIKAWA Closes #8414 from yu-iskw/SPARK-10214. (cherry picked from commit d4549fe58fa0d781e0e891bceff893420cb1d598) Signed-off-by: Shivaram Venkataraman Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/76d920f2 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/76d920f2 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/76d920f2 Branch: refs/heads/branch-1.5 Commit: 76d920f2b814304051dd76f0ca78301e872fc811 Parents: b7c4ff1 Author: Yu ISHIKAWA Authored: Tue Aug 25 00:28:51 2015 -0700 Committer: Shivaram Venkataraman Committed: Tue Aug 25 00:28:58 2015 -0700 -- R/pkg/R/DataFrame.R | 101 +-- R/pkg/R/column.R| 40 +-- R/pkg/R/generics.R | 2 +- 3 files changed, 109 insertions(+), 34 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/76d920f2/R/pkg/R/DataFrame.R -- diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 8956032..10f3c4e 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -27,9 +27,10 @@ setOldClass("jobj") #' \code{jsonFile}, \code{table} etc. #' @rdname DataFrame #' @seealso jsonFile, table +#' @docType class #' -#' @param env An R environment that stores bookkeeping states of the DataFrame -#' @param sdf A Java object reference to the backing Scala DataFrame +#' @slot env An R environment that stores bookkeeping states of the DataFrame +#' @slot sdf A Java object reference to the backing Scala DataFrame #' @export setClass("DataFrame", slots = list(env = "environment", @@ -61,6 +62,7 @@ dataFrame <- function(sdf, isCached = FALSE) { #' @param x A SparkSQL DataFrame #' #' @rdname printSchema +#' @name printSchema #' @export #' @examples #'\dontrun{ @@ -84,6 +86,7 @@ setMethod("printSchema", #' @param x A SparkSQL DataFrame #' #' @rdname schema +#' @name schema #' @export #' @examples #'\dontrun{ @@ -106,6 +109,7 @@ setMethod("schema", #' @param x A SparkSQL DataFrame #' @param extended Logical. If extended is False, explain() only prints the physical plan. #' @rdname explain +#' @name explain #' @export #' @examples #'\dontrun{ @@ -135,6 +139,7 @@ setMethod("explain", #' @param x A SparkSQL DataFrame #' #' @rdname isLocal +#' @name isLocal #' @export #' @examples #'\dontrun{ @@ -158,6 +163,7 @@ setMethod("isLocal", #' @param numRows The number of rows to print. Defaults to 20. 
#' #' @rdname showDF +#' @name showDF #' @export #' @examples #'\dontrun{ @@ -181,6 +187,7 @@ setMethod("showDF", #' @param x A SparkSQL DataFrame #' #' @rdname show +#' @name show #' @export #' @examples #'\dontrun{ @@ -206,6 +213,7 @@ setMethod("show", "DataFrame", #' @param x A SparkSQL DataFrame #' #' @rdname dtypes +#' @name dtypes #' @export #' @examples #'\dontrun{ @@ -230,6 +238,8 @@ setMethod("dtypes", #' @param x A SparkSQL DataFrame #' #' @rdname columns +#' @name columns +#' @aliases names #' @export #' @examples #'\dontrun{ @@ -248,7 +258,7 @@ setMethod("columns", }) #' @rdname columns -#' @aliases names,DataFrame,function-method +#' @name names setMethod("names", signature(x = "DataFrame"), function(x) { @@ -256,6 +266,7 @@ setMethod("names", }) #' @rdname columns +#' @name names<- setMethod("names<-", signature(x = "DataFrame"), function(x, value) { @@ -273,6 +284,7 @@ setMethod("names<-", #' @param tableName A character vector containing the name of the table #' #' @rdname registerTempTable +#' @name registerTempTable #' @export #' @examples #'\dontrun{ @@ -299,6 +311,7 @@ setMethod("registerTempTable", #' the existing rows in the table. #' #' @rdname insertInto +#' @name insertInto #' @export #' @examples #'\dontrun{ @@ -321,7 +334,8 @@ setMethod("insertInto", #' #' @param x A SparkSQL DataFrame #' -#' @rdname cache-methods +#' @rdname cache +#' @name cache #' @export #' @examples #'\dontrun{ @@ -347,6 +361,7 @@ setMethod("cache", #' #' @param x The DataFrame to persist #' @rdname persist +#' @name persist #' @export #' @examples #'\dontrun{ @@ -372,6 +387,7
spark git commit: [SPARK-10214] [SPARKR] [DOCS] Improve SparkR Column, DataFrame API docs
Repository: spark Updated Branches: refs/heads/master 82268f07a -> d4549fe58 [SPARK-10214] [SPARKR] [DOCS] Improve SparkR Column, DataFrame API docs cc: shivaram ## Summary - Add name tags to each methods in DataFrame.R and column.R - Replace `rdname column` with `rdname {each_func}`. i.e. alias method : `rdname column` => `rdname alias` ## Generated PDF File https://drive.google.com/file/d/0B9biIZIU47lLNHN2aFpnQXlSeGs/view?usp=sharing ## JIRA [[SPARK-10214] Improve SparkR Column, DataFrame API docs - ASF JIRA](https://issues.apache.org/jira/browse/SPARK-10214) Author: Yu ISHIKAWA Closes #8414 from yu-iskw/SPARK-10214. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d4549fe5 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d4549fe5 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d4549fe5 Branch: refs/heads/master Commit: d4549fe58fa0d781e0e891bceff893420cb1d598 Parents: 82268f0 Author: Yu ISHIKAWA Authored: Tue Aug 25 00:28:51 2015 -0700 Committer: Shivaram Venkataraman Committed: Tue Aug 25 00:28:51 2015 -0700 -- R/pkg/R/DataFrame.R | 101 +-- R/pkg/R/column.R| 40 +-- R/pkg/R/generics.R | 2 +- 3 files changed, 109 insertions(+), 34 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d4549fe5/R/pkg/R/DataFrame.R -- diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 8956032..10f3c4e 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -27,9 +27,10 @@ setOldClass("jobj") #' \code{jsonFile}, \code{table} etc. #' @rdname DataFrame #' @seealso jsonFile, table +#' @docType class #' -#' @param env An R environment that stores bookkeeping states of the DataFrame -#' @param sdf A Java object reference to the backing Scala DataFrame +#' @slot env An R environment that stores bookkeeping states of the DataFrame +#' @slot sdf A Java object reference to the backing Scala DataFrame #' @export setClass("DataFrame", slots = list(env = "environment", @@ -61,6 +62,7 @@ dataFrame <- function(sdf, isCached = FALSE) { #' @param x A SparkSQL DataFrame #' #' @rdname printSchema +#' @name printSchema #' @export #' @examples #'\dontrun{ @@ -84,6 +86,7 @@ setMethod("printSchema", #' @param x A SparkSQL DataFrame #' #' @rdname schema +#' @name schema #' @export #' @examples #'\dontrun{ @@ -106,6 +109,7 @@ setMethod("schema", #' @param x A SparkSQL DataFrame #' @param extended Logical. If extended is False, explain() only prints the physical plan. #' @rdname explain +#' @name explain #' @export #' @examples #'\dontrun{ @@ -135,6 +139,7 @@ setMethod("explain", #' @param x A SparkSQL DataFrame #' #' @rdname isLocal +#' @name isLocal #' @export #' @examples #'\dontrun{ @@ -158,6 +163,7 @@ setMethod("isLocal", #' @param numRows The number of rows to print. Defaults to 20. 
#' #' @rdname showDF +#' @name showDF #' @export #' @examples #'\dontrun{ @@ -181,6 +187,7 @@ setMethod("showDF", #' @param x A SparkSQL DataFrame #' #' @rdname show +#' @name show #' @export #' @examples #'\dontrun{ @@ -206,6 +213,7 @@ setMethod("show", "DataFrame", #' @param x A SparkSQL DataFrame #' #' @rdname dtypes +#' @name dtypes #' @export #' @examples #'\dontrun{ @@ -230,6 +238,8 @@ setMethod("dtypes", #' @param x A SparkSQL DataFrame #' #' @rdname columns +#' @name columns +#' @aliases names #' @export #' @examples #'\dontrun{ @@ -248,7 +258,7 @@ setMethod("columns", }) #' @rdname columns -#' @aliases names,DataFrame,function-method +#' @name names setMethod("names", signature(x = "DataFrame"), function(x) { @@ -256,6 +266,7 @@ setMethod("names", }) #' @rdname columns +#' @name names<- setMethod("names<-", signature(x = "DataFrame"), function(x, value) { @@ -273,6 +284,7 @@ setMethod("names<-", #' @param tableName A character vector containing the name of the table #' #' @rdname registerTempTable +#' @name registerTempTable #' @export #' @examples #'\dontrun{ @@ -299,6 +311,7 @@ setMethod("registerTempTable", #' the existing rows in the table. #' #' @rdname insertInto +#' @name insertInto #' @export #' @examples #'\dontrun{ @@ -321,7 +334,8 @@ setMethod("insertInto", #' #' @param x A SparkSQL DataFrame #' -#' @rdname cache-methods +#' @rdname cache +#' @name cache #' @export #' @examples #'\dontrun{ @@ -347,6 +361,7 @@ setMethod("cache", #' #' @param x The DataFrame to persist #' @rdname persist +#' @name persist #' @export #' @examples #'\dontrun{ @@ -372,6 +387,7 @@ setMethod("persist", #' @param x The DataFrame to unpersist #' @param blocking Whether to block until all bloc
spark git commit: [SPARK-9293] [SPARK-9813] Analysis should check that set operations are only performed on tables with equal numbers of columns
Repository: spark Updated Branches: refs/heads/branch-1.5 95a14e9f2 -> b7c4ff144 [SPARK-9293] [SPARK-9813] Analysis should check that set operations are only performed on tables with equal numbers of columns This patch adds an analyzer rule to ensure that set operations (union, intersect, and except) are only applied to tables with the same number of columns. Without this rule, there are scenarios where invalid queries can return incorrect results instead of failing with error messages; SPARK-9813 provides one example of this problem. In other cases, the invalid query can crash at runtime with extremely confusing exceptions. I also performed a bit of cleanup to refactor some of those logical operators' code into a common `SetOperation` base class. Author: Josh Rosen Closes #7631 from JoshRosen/SPARK-9293. (cherry picked from commit 82268f07abfa658869df2354ae72f8d6ddd119e8) Signed-off-by: Michael Armbrust Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b7c4ff14 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b7c4ff14 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b7c4ff14 Branch: refs/heads/branch-1.5 Commit: b7c4ff144e783e80adc1efc2c28965c2e739dd5e Parents: 95a14e9 Author: Josh Rosen Authored: Tue Aug 25 00:04:10 2015 -0700 Committer: Michael Armbrust Committed: Tue Aug 25 00:04:23 2015 -0700 -- .../sql/catalyst/analysis/CheckAnalysis.scala | 6 .../catalyst/analysis/HiveTypeCoercion.scala| 14 +++- .../catalyst/plans/logical/basicOperators.scala | 38 +--- .../catalyst/analysis/AnalysisErrorSuite.scala | 18 ++ .../spark/sql/hive/HiveMetastoreCatalog.scala | 2 +- .../hive/execution/InsertIntoHiveTable.scala| 2 +- 6 files changed, 48 insertions(+), 32 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b7c4ff14/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala index 39f554c..7701fd0 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala @@ -137,6 +137,12 @@ trait CheckAnalysis { } } + case s @ SetOperation(left, right) if left.output.length != right.output.length => +failAnalysis( + s"${s.nodeName} can only be performed on tables with the same number of columns, " + + s"but the left table has ${left.output.length} columns and the right has " + + s"${right.output.length}") + case _ => // Fallbacks to the following checks } http://git-wip-us.apache.org/repos/asf/spark/blob/b7c4ff14/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala index 2cb067f..a1aa2a2 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala @@ -203,6 +203,7 @@ object HiveTypeCoercion { planName: String, left: LogicalPlan, right: LogicalPlan): (LogicalPlan, LogicalPlan) = { + require(left.output.length == right.output.length) val castedTypes = left.output.zip(right.output).map { case (lhs, rhs) 
if lhs.dataType != rhs.dataType => @@ -229,15 +230,10 @@ object HiveTypeCoercion { def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { case p if p.analyzed => p - case u @ Union(left, right) if u.childrenResolved && !u.resolved => -val (newLeft, newRight) = widenOutputTypes(u.nodeName, left, right) -Union(newLeft, newRight) - case e @ Except(left, right) if e.childrenResolved && !e.resolved => -val (newLeft, newRight) = widenOutputTypes(e.nodeName, left, right) -Except(newLeft, newRight) - case i @ Intersect(left, right) if i.childrenResolved && !i.resolved => -val (newLeft, newRight) = widenOutputTypes(i.nodeName, left, right) -Intersect(newLeft, newRight) + case s @ SetOperation(left, right) if s.childrenResolved + && left.output.length == right.output.length && !s.resolved => +val (ne
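From the user's side, the new rule turns a silently wrong query into an explicit error. A sketch for spark-shell on Spark 1.5, where `sqlContext` is predefined (data and column counts are illustrative):

```
val df1 = sqlContext.createDataFrame(Seq((1, "a"), (2, "b")))   // two columns
val df2 = sqlContext.createDataFrame(Seq(Tuple1(3), Tuple1(4))) // one column
try {
  df1.unionAll(df2) // now rejected during analysis
} catch {
  case e: org.apache.spark.sql.AnalysisException =>
    // "Union can only be performed on tables with the same number of columns, ..."
    println(e.getMessage)
}
```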
spark git commit: [SPARK-9293] [SPARK-9813] Analysis should check that set operations are only performed on tables with equal numbers of columns
Repository: spark Updated Branches: refs/heads/master bf03fe68d -> 82268f07a [SPARK-9293] [SPARK-9813] Analysis should check that set operations are only performed on tables with equal numbers of columns This patch adds an analyzer rule to ensure that set operations (union, intersect, and except) are only applied to tables with the same number of columns. Without this rule, there are scenarios where invalid queries can return incorrect results instead of failing with error messages; SPARK-9813 provides one example of this problem. In other cases, the invalid query can crash at runtime with extremely confusing exceptions. I also performed a bit of cleanup to refactor some of those logical operators' code into a common `SetOperation` base class. Author: Josh Rosen Closes #7631 from JoshRosen/SPARK-9293. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/82268f07 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/82268f07 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/82268f07 Branch: refs/heads/master Commit: 82268f07abfa658869df2354ae72f8d6ddd119e8 Parents: bf03fe6 Author: Josh Rosen Authored: Tue Aug 25 00:04:10 2015 -0700 Committer: Michael Armbrust Committed: Tue Aug 25 00:04:10 2015 -0700 -- .../sql/catalyst/analysis/CheckAnalysis.scala | 6 .../catalyst/analysis/HiveTypeCoercion.scala| 14 +++- .../catalyst/plans/logical/basicOperators.scala | 38 +--- .../catalyst/analysis/AnalysisErrorSuite.scala | 18 ++ .../spark/sql/hive/HiveMetastoreCatalog.scala | 2 +- .../hive/execution/InsertIntoHiveTable.scala| 2 +- 6 files changed, 48 insertions(+), 32 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/82268f07/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala index 39f554c..7701fd0 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala @@ -137,6 +137,12 @@ trait CheckAnalysis { } } + case s @ SetOperation(left, right) if left.output.length != right.output.length => +failAnalysis( + s"${s.nodeName} can only be performed on tables with the same number of columns, " + + s"but the left table has ${left.output.length} columns and the right has " + + s"${right.output.length}") + case _ => // Fallbacks to the following checks } http://git-wip-us.apache.org/repos/asf/spark/blob/82268f07/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala index 2cb067f..a1aa2a2 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala @@ -203,6 +203,7 @@ object HiveTypeCoercion { planName: String, left: LogicalPlan, right: LogicalPlan): (LogicalPlan, LogicalPlan) = { + require(left.output.length == right.output.length) val castedTypes = left.output.zip(right.output).map { case (lhs, rhs) if lhs.dataType != rhs.dataType => @@ -229,15 +230,10 @@ object HiveTypeCoercion { def apply(plan: 
LogicalPlan): LogicalPlan = plan resolveOperators { case p if p.analyzed => p - case u @ Union(left, right) if u.childrenResolved && !u.resolved => -val (newLeft, newRight) = widenOutputTypes(u.nodeName, left, right) -Union(newLeft, newRight) - case e @ Except(left, right) if e.childrenResolved && !e.resolved => -val (newLeft, newRight) = widenOutputTypes(e.nodeName, left, right) -Except(newLeft, newRight) - case i @ Intersect(left, right) if i.childrenResolved && !i.resolved => -val (newLeft, newRight) = widenOutputTypes(i.nodeName, left, right) -Intersect(newLeft, newRight) + case s @ SetOperation(left, right) if s.childrenResolved + && left.output.length == right.output.length && !s.resolved => +val (newLeft, newRight) = widenOutputTypes(s.nodeName, left, right) +s.makeCopy(Array(newLeft, newRight))
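The refactoring works because a common base class with a companion extractor lets one match arm cover Union, Intersect and Except alike. A toy, self-contained illustration of that pattern (simplified stand-ins for Catalyst's classes; paste into a Scala REPL):

```
case class Plan(output: Seq[String])

abstract class SetOp(val left: Plan, val right: Plan)
object SetOp {
  // The extractor is what makes `case SetOp(l, r)` match any subclass.
  def unapply(op: SetOp): Option[(Plan, Plan)] = Some((op.left, op.right))
}
case class Union(override val left: Plan, override val right: Plan) extends SetOp(left, right)
case class Intersect(override val left: Plan, override val right: Plan) extends SetOp(left, right)
case class Except(override val left: Plan, override val right: Plan) extends SetOp(left, right)

def checkArity(op: SetOp): Unit = op match {
  case s @ SetOp(l, r) if l.output.length != r.output.length =>
    sys.error(s"${s.getClass.getSimpleName} needs equal column counts, " +
      s"got ${l.output.length} and ${r.output.length}")
  case _ => // arity matches, nothing to report
}

checkArity(Union(Plan(Seq("a", "b")), Plan(Seq("c", "d"))))  // passes
// checkArity(Except(Plan(Seq("a", "b")), Plan(Seq("c"))))   // would throw
```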