spark git commit: [SPARK-15319][SPARKR][DOCS] Fix SparkR doc layout for corr and other DataFrame stats functions
Repository: spark Updated Branches: refs/heads/master 09f4ceaeb -> 843a1eba8 [SPARK-15319][SPARKR][DOCS] Fix SparkR doc layout for corr and other DataFrame stats functions ## What changes were proposed in this pull request? Doc only changes. Please see screenshots. Before: http://spark.apache.org/docs/latest/api/R/statfunctions.html ![image](https://cloud.githubusercontent.com/assets/8969467/15264110/cd458826-1924-11e6-85bd-8ee2e2e1a85f.png) After ![image](https://cloud.githubusercontent.com/assets/8969467/16218452/b9e89f08-3732-11e6-969d-a3a1796e7ad0.png) (please ignore the style differences - this is due to not having the css in my local copy) This is still a bit weird. As discussed in SPARK-15237, I think the better approach is to separate out the DataFrame stats function instead of putting everything on one page. At least now it is clearer which description is on which function. ## How was this patch tested? Build doc Author: Felix Cheung Author: felixcheung Closes #13109 from felixcheung/rstatdoc. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/843a1eba Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/843a1eba Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/843a1eba Branch: refs/heads/master Commit: 843a1eba8ec9d5a7beac0c74b54d24cb3c41b45a Parents: 09f4cea Author: Felix Cheung Authored: Tue Jun 21 00:19:09 2016 -0700 Committer: Shivaram Venkataraman Committed: Tue Jun 21 00:19:09 2016 -0700 -- R/pkg/R/generics.R | 8 R/pkg/R/stats.R| 32 +--- 2 files changed, 17 insertions(+), 23 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/843a1eba/R/pkg/R/generics.R -- diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index ead403b..43395aa 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -434,19 +434,19 @@ setGeneric("coltypes<-", function(x, value) { standardGeneric("coltypes<-") }) #' @export setGeneric("columns", function(x) {standardGeneric("columns") }) -#' @rdname statfunctions +#' @rdname cov #' @export setGeneric("cov", function(x, ...) {standardGeneric("cov") }) -#' @rdname statfunctions +#' @rdname corr #' @export setGeneric("corr", function(x, ...) {standardGeneric("corr") }) -#' @rdname statfunctions +#' @rdname cov #' @export setGeneric("covar_samp", function(col1, col2) {standardGeneric("covar_samp") }) -#' @rdname statfunctions +#' @rdname covar_pop #' @export setGeneric("covar_pop", function(col1, col2) {standardGeneric("covar_pop") }) http://git-wip-us.apache.org/repos/asf/spark/blob/843a1eba/R/pkg/R/stats.R -- diff --git a/R/pkg/R/stats.R b/R/pkg/R/stats.R index e92b9e3..e40b177 100644 --- a/R/pkg/R/stats.R +++ b/R/pkg/R/stats.R @@ -19,9 +19,10 @@ setOldClass("jobj") -#' crosstab -#' -#' Computes a pair-wise frequency table of the given columns. Also known as a contingency +#' @title SparkDataFrame statistic functions + +#' @description +#' crosstab - Computes a pair-wise frequency table of the given columns. Also known as a contingency #' table. The number of distinct values for each column should be less than 1e4. At most 1e6 #' non-zero pair frequencies will be returned. #' @@ -49,8 +50,6 @@ setMethod("crosstab", collect(dataFrame(sct)) }) -#' cov -#' #' Calculate the sample covariance of two numerical columns of a SparkDataFrame. #' #' @param x A SparkDataFrame @@ -58,7 +57,7 @@ setMethod("crosstab", #' @param col2 the name of the second column #' @return the covariance of the two columns. 
#' -#' @rdname statfunctions +#' @rdname cov #' @name cov #' @export #' @examples @@ -75,8 +74,6 @@ setMethod("cov", callJMethod(statFunctions, "cov", col1, col2) }) -#' corr -#' #' Calculates the correlation of two columns of a SparkDataFrame. #' Currently only supports the Pearson Correlation Coefficient. #' For Spearman Correlation, consider using RDD methods found in MLlib's Statistics. @@ -88,7 +85,7 @@ setMethod("cov", #' only "pearson" is allowed now. #' @return The Pearson Correlation Coefficient as a Double. #' -#' @rdname statfunctions +#' @rdname corr #' @name corr #' @export #' @examples @@ -106,9 +103,8 @@ setMethod("corr", callJMethod(statFunctions, "corr", col1, col2, method) }) -#' freqItems -#' -#' Finding frequent items for columns, possibly with false positives. +#' @description +#' freqItems - Finding frequent items for columns, possibly with false positives. #' Using the frequent element count algorithm described in #' \url{http://dx.doi.org/10.1145/762471.762473}, proposed by Karp, Sche
spark git commit: [SPARK-15319][SPARKR][DOCS] Fix SparkR doc layout for corr and other DataFrame stats functions
Repository: spark Updated Branches: refs/heads/branch-2.0 38f3b76bd -> 34feea336 [SPARK-15319][SPARKR][DOCS] Fix SparkR doc layout for corr and other DataFrame stats functions ## What changes were proposed in this pull request? Doc only changes. Please see screenshots. Before: http://spark.apache.org/docs/latest/api/R/statfunctions.html ![image](https://cloud.githubusercontent.com/assets/8969467/15264110/cd458826-1924-11e6-85bd-8ee2e2e1a85f.png) After ![image](https://cloud.githubusercontent.com/assets/8969467/16218452/b9e89f08-3732-11e6-969d-a3a1796e7ad0.png) (please ignore the style differences - this is due to not having the css in my local copy) This is still a bit weird. As discussed in SPARK-15237, I think the better approach is to separate out the DataFrame stats function instead of putting everything on one page. At least now it is clearer which description is on which function. ## How was this patch tested? Build doc Author: Felix Cheung Author: felixcheung Closes #13109 from felixcheung/rstatdoc. (cherry picked from commit 843a1eba8ec9d5a7beac0c74b54d24cb3c41b45a) Signed-off-by: Shivaram Venkataraman Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/34feea33 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/34feea33 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/34feea33 Branch: refs/heads/branch-2.0 Commit: 34feea336886b241135e6c60677000c2ca6b52b4 Parents: 38f3b76 Author: Felix Cheung Authored: Tue Jun 21 00:19:09 2016 -0700 Committer: Shivaram Venkataraman Committed: Tue Jun 21 00:19:18 2016 -0700 -- R/pkg/R/generics.R | 8 R/pkg/R/stats.R| 32 +--- 2 files changed, 17 insertions(+), 23 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/34feea33/R/pkg/R/generics.R -- diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index ead403b..43395aa 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -434,19 +434,19 @@ setGeneric("coltypes<-", function(x, value) { standardGeneric("coltypes<-") }) #' @export setGeneric("columns", function(x) {standardGeneric("columns") }) -#' @rdname statfunctions +#' @rdname cov #' @export setGeneric("cov", function(x, ...) {standardGeneric("cov") }) -#' @rdname statfunctions +#' @rdname corr #' @export setGeneric("corr", function(x, ...) {standardGeneric("corr") }) -#' @rdname statfunctions +#' @rdname cov #' @export setGeneric("covar_samp", function(col1, col2) {standardGeneric("covar_samp") }) -#' @rdname statfunctions +#' @rdname covar_pop #' @export setGeneric("covar_pop", function(col1, col2) {standardGeneric("covar_pop") }) http://git-wip-us.apache.org/repos/asf/spark/blob/34feea33/R/pkg/R/stats.R -- diff --git a/R/pkg/R/stats.R b/R/pkg/R/stats.R index e92b9e3..e40b177 100644 --- a/R/pkg/R/stats.R +++ b/R/pkg/R/stats.R @@ -19,9 +19,10 @@ setOldClass("jobj") -#' crosstab -#' -#' Computes a pair-wise frequency table of the given columns. Also known as a contingency +#' @title SparkDataFrame statistic functions + +#' @description +#' crosstab - Computes a pair-wise frequency table of the given columns. Also known as a contingency #' table. The number of distinct values for each column should be less than 1e4. At most 1e6 #' non-zero pair frequencies will be returned. #' @@ -49,8 +50,6 @@ setMethod("crosstab", collect(dataFrame(sct)) }) -#' cov -#' #' Calculate the sample covariance of two numerical columns of a SparkDataFrame. 
#' #' @param x A SparkDataFrame @@ -58,7 +57,7 @@ setMethod("crosstab", #' @param col2 the name of the second column #' @return the covariance of the two columns. #' -#' @rdname statfunctions +#' @rdname cov #' @name cov #' @export #' @examples @@ -75,8 +74,6 @@ setMethod("cov", callJMethod(statFunctions, "cov", col1, col2) }) -#' corr -#' #' Calculates the correlation of two columns of a SparkDataFrame. #' Currently only supports the Pearson Correlation Coefficient. #' For Spearman Correlation, consider using RDD methods found in MLlib's Statistics. @@ -88,7 +85,7 @@ setMethod("cov", #' only "pearson" is allowed now. #' @return The Pearson Correlation Coefficient as a Double. #' -#' @rdname statfunctions +#' @rdname corr #' @name corr #' @export #' @examples @@ -106,9 +103,8 @@ setMethod("corr", callJMethod(statFunctions, "corr", col1, col2, method) }) -#' freqItems -#' -#' Finding frequent items for columns, possibly with false positives. +#' @description +#' freqItems - Finding frequent items for columns, possibly with false positives. #' Using the fre
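The two commits above reorganize the SparkR help pages for the DataFrame statistic functions (crosstab, cov, corr, freqItems). As a quick illustration of what those functions compute, here is a minimal PySpark sketch of the equivalent `DataFrame.stat` calls; the data and column names are made up for illustration.

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("stat-functions-example").getOrCreate()

# Illustrative data: one id column and two numerical columns.
df = spark.createDataFrame(
    [(1, 10.0, 2.0), (2, 20.0, 3.5), (3, 30.0, 5.0), (4, 40.0, 7.0)],
    ["id", "x", "y"])

# Sample covariance and Pearson correlation of two numerical columns.
print(df.stat.cov("x", "y"))
print(df.stat.corr("x", "y", method="pearson"))

# Pair-wise frequency (contingency) table and frequent items.
df.stat.crosstab("id", "x").show()
df.stat.freqItems(["x", "y"], support=0.5).show()

spark.stop()
```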
spark git commit: Revert "[SPARK-16086] [SQL] fix Python UDF without arguments (for 1.6)"
Repository: spark Updated Branches: refs/heads/master 843a1eba8 -> ce49bfc25 Revert "[SPARK-16086] [SQL] fix Python UDF without arguments (for 1.6)" This reverts commit a46553cbacf0e4012df89fe55385dec5beaa680a. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ce49bfc2 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ce49bfc2 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ce49bfc2 Branch: refs/heads/master Commit: ce49bfc2550ba8f5a33235c7fc3b88201d63c276 Parents: 843a1eb Author: Xiangrui Meng Authored: Tue Jun 21 00:32:51 2016 -0700 Committer: Xiangrui Meng Committed: Tue Jun 21 00:32:51 2016 -0700 -- python/pyspark/sql/tests.py | 5 - python/pyspark/sql/types.py | 9 ++--- 2 files changed, 6 insertions(+), 8 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/ce49bfc2/python/pyspark/sql/tests.py -- diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index ecd1a05..c631ad8 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -318,11 +318,6 @@ class SQLTests(ReusedPySparkTestCase): [row] = self.spark.sql("SELECT double(add(1, 2)), add(double(2), 1)").collect() self.assertEqual(tuple(row), (6, 5)) -def test_udf_without_arguments(self): -self.sqlCtx.registerFunction("foo", lambda: "bar") -[row] = self.sqlCtx.sql("SELECT foo()").collect() -self.assertEqual(row[0], "bar") - def test_udf_with_array_type(self): d = [Row(l=list(range(3)), d={"key": list(range(5))})] rdd = self.sc.parallelize(d) http://git-wip-us.apache.org/repos/asf/spark/blob/ce49bfc2/python/pyspark/sql/types.py -- diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py index f0b56be..bb2b954 100644 --- a/python/pyspark/sql/types.py +++ b/python/pyspark/sql/types.py @@ -1401,7 +1401,11 @@ class Row(tuple): if args and kwargs: raise ValueError("Can not use both args " "and kwargs to create Row") -if kwargs: +if args: +# create row class or objects +return tuple.__new__(self, args) + +elif kwargs: # create row objects names = sorted(kwargs.keys()) row = tuple.__new__(self, [kwargs[n] for n in names]) @@ -1409,8 +1413,7 @@ class Row(tuple): return row else: -# create row class or objects -return tuple.__new__(self, args) +raise ValueError("No args or kwargs") def asDict(self, recursive=False): """ - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: Revert "[SPARK-16086] [SQL] fix Python UDF without arguments (for 1.6)"
Repository: spark Updated Branches: refs/heads/branch-2.0 34feea336 -> 37d05ec9e Revert "[SPARK-16086] [SQL] fix Python UDF without arguments (for 1.6)" This reverts commit 087bd2799366f4914d248e9b1f0fb921adbbdb43. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/37d05ec9 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/37d05ec9 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/37d05ec9 Branch: refs/heads/branch-2.0 Commit: 37d05ec9e96c0da786ee26b5c25216bf98f239c0 Parents: 34feea3 Author: Xiangrui Meng Authored: Tue Jun 21 00:33:38 2016 -0700 Committer: Xiangrui Meng Committed: Tue Jun 21 00:33:38 2016 -0700 -- python/pyspark/sql/tests.py | 5 - python/pyspark/sql/types.py | 9 ++--- 2 files changed, 6 insertions(+), 8 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/37d05ec9/python/pyspark/sql/tests.py -- diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index ecd1a05..c631ad8 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -318,11 +318,6 @@ class SQLTests(ReusedPySparkTestCase): [row] = self.spark.sql("SELECT double(add(1, 2)), add(double(2), 1)").collect() self.assertEqual(tuple(row), (6, 5)) -def test_udf_without_arguments(self): -self.sqlCtx.registerFunction("foo", lambda: "bar") -[row] = self.sqlCtx.sql("SELECT foo()").collect() -self.assertEqual(row[0], "bar") - def test_udf_with_array_type(self): d = [Row(l=list(range(3)), d={"key": list(range(5))})] rdd = self.sc.parallelize(d) http://git-wip-us.apache.org/repos/asf/spark/blob/37d05ec9/python/pyspark/sql/types.py -- diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py index f0b56be..bb2b954 100644 --- a/python/pyspark/sql/types.py +++ b/python/pyspark/sql/types.py @@ -1401,7 +1401,11 @@ class Row(tuple): if args and kwargs: raise ValueError("Can not use both args " "and kwargs to create Row") -if kwargs: +if args: +# create row class or objects +return tuple.__new__(self, args) + +elif kwargs: # create row objects names = sorted(kwargs.keys()) row = tuple.__new__(self, [kwargs[n] for n in names]) @@ -1409,8 +1413,7 @@ class Row(tuple): return row else: -# create row class or objects -return tuple.__new__(self, args) +raise ValueError("No args or kwargs") def asDict(self, recursive=False): """ - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
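The two reverts above restore the earlier `Row.__new__` behavior: positional arguments build a row class or a plain row, keyword arguments build a named row, and `Row()` with neither raises `ValueError` (the no-argument case is fixed again in SPARK-16086 further down). A small sketch of that behavior:

```python
from pyspark.sql import Row

# Positional arguments define a row class (a reusable set of field names).
Person = Row("name", "age")
alice = Person("Alice", 11)          # Row(name='Alice', age=11)

# Keyword arguments build a named row directly (fields sorted by name).
bob = Row(name="Bob", age=12)        # Row(age=12, name='Bob')

# Under the reverted code above, Row() with no args and no kwargs raises
# ValueError("No args or kwargs"); SPARK-16086 later allows an empty Row().
try:
    Row()
except ValueError as e:
    print(e)
```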
spark git commit: [SPARK-10258][DOC][ML] Add @Since annotations to ml.feature
Repository: spark Updated Branches: refs/heads/branch-2.0 37d05ec9e -> 14e5decc5 [SPARK-10258][DOC][ML] Add @Since annotations to ml.feature This PR adds missing `Since` annotations to `ml.feature` package. Closes #8505. ## How was this patch tested? Existing tests. Author: Nick Pentreath Closes #13641 from MLnick/add-since-annotations. (cherry picked from commit 37494a18e8d6e22113338523d6498e00ac9725ea) Signed-off-by: Xiangrui Meng Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/14e5decc Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/14e5decc Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/14e5decc Branch: refs/heads/branch-2.0 Commit: 14e5decc5f8977e253cde0135d57204a7c0ebb7f Parents: 37d05ec Author: Nick Pentreath Authored: Tue Jun 21 00:39:47 2016 -0700 Committer: Xiangrui Meng Committed: Tue Jun 21 00:39:54 2016 -0700 -- .../org/apache/spark/ml/feature/Binarizer.scala | 11 ++- .../apache/spark/ml/feature/Bucketizer.scala| 12 ++- .../apache/spark/ml/feature/ChiSqSelector.scala | 19 +-- .../spark/ml/feature/CountVectorizer.scala | 24 -- .../scala/org/apache/spark/ml/feature/DCT.scala | 7 - .../spark/ml/feature/ElementwiseProduct.scala | 7 - .../org/apache/spark/ml/feature/HashingTF.scala | 14 - .../scala/org/apache/spark/ml/feature/IDF.scala | 20 +--- .../apache/spark/ml/feature/Interaction.scala | 4 +-- .../apache/spark/ml/feature/MaxAbsScaler.scala | 26 ++- .../apache/spark/ml/feature/MinMaxScaler.scala | 23 +++--- .../org/apache/spark/ml/feature/NGram.scala | 7 - .../apache/spark/ml/feature/Normalizer.scala| 7 - .../apache/spark/ml/feature/OneHotEncoder.scala | 10 +- .../scala/org/apache/spark/ml/feature/PCA.scala | 23 +++--- .../spark/ml/feature/PolynomialExpansion.scala | 8 - .../spark/ml/feature/QuantileDiscretizer.scala | 10 +- .../org/apache/spark/ml/feature/RFormula.scala | 18 +-- .../spark/ml/feature/SQLTransformer.scala | 2 +- .../spark/ml/feature/StandardScaler.scala | 24 +++--- .../spark/ml/feature/StopWordsRemover.scala | 14 - .../apache/spark/ml/feature/StringIndexer.scala | 33 +--- .../org/apache/spark/ml/feature/Tokenizer.scala | 22 +++-- .../spark/ml/feature/VectorAssembler.scala | 8 - .../apache/spark/ml/feature/VectorIndexer.scala | 24 +++--- .../apache/spark/ml/feature/VectorSlicer.scala | 14 - .../org/apache/spark/ml/feature/Word2Vec.scala | 29 +++-- python/pyspark/ml/feature.py| 10 +++--- 28 files changed, 362 insertions(+), 68 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/14e5decc/mllib/src/main/scala/org/apache/spark/ml/feature/Binarizer.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Binarizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Binarizer.scala index 318c8b8..fa9634f 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Binarizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Binarizer.scala @@ -35,9 +35,11 @@ import org.apache.spark.sql.types._ * Binarize a column of continuous features given a threshold. 
*/ @Experimental -final class Binarizer(override val uid: String) +@Since("1.4.0") +final class Binarizer @Since("1.4.0") (@Since("1.4.0") override val uid: String) extends Transformer with HasInputCol with HasOutputCol with DefaultParamsWritable { + @Since("1.4.0") def this() = this(Identifiable.randomUID("binarizer")) /** @@ -47,21 +49,26 @@ final class Binarizer(override val uid: String) * Default: 0.0 * @group param */ + @Since("1.4.0") val threshold: DoubleParam = new DoubleParam(this, "threshold", "threshold used to binarize continuous features") /** @group getParam */ + @Since("1.4.0") def getThreshold: Double = $(threshold) /** @group setParam */ + @Since("1.4.0") def setThreshold(value: Double): this.type = set(threshold, value) setDefault(threshold -> 0.0) /** @group setParam */ + @Since("1.4.0") def setInputCol(value: String): this.type = set(inputCol, value) /** @group setParam */ + @Since("1.4.0") def setOutputCol(value: String): this.type = set(outputCol, value) @Since("2.0.0") @@ -96,6 +103,7 @@ final class Binarizer(override val uid: String) } } + @Since("1.4.0") override def transformSchema(schema: StructType): StructType = { val inputType = schema($(inputCol)).dataType val outputColName = $(outputCol) @@ -115,6 +123,7 @@ final class Binarizer(override val uid: String) St
spark git commit: [SPARK-10258][DOC][ML] Add @Since annotations to ml.feature
Repository: spark Updated Branches: refs/heads/master ce49bfc25 -> 37494a18e [SPARK-10258][DOC][ML] Add @Since annotations to ml.feature This PR adds missing `Since` annotations to `ml.feature` package. Closes #8505. ## How was this patch tested? Existing tests. Author: Nick Pentreath Closes #13641 from MLnick/add-since-annotations. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/37494a18 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/37494a18 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/37494a18 Branch: refs/heads/master Commit: 37494a18e8d6e22113338523d6498e00ac9725ea Parents: ce49bfc Author: Nick Pentreath Authored: Tue Jun 21 00:39:47 2016 -0700 Committer: Xiangrui Meng Committed: Tue Jun 21 00:39:47 2016 -0700 -- .../org/apache/spark/ml/feature/Binarizer.scala | 11 ++- .../apache/spark/ml/feature/Bucketizer.scala| 12 ++- .../apache/spark/ml/feature/ChiSqSelector.scala | 19 +-- .../spark/ml/feature/CountVectorizer.scala | 24 -- .../scala/org/apache/spark/ml/feature/DCT.scala | 7 - .../spark/ml/feature/ElementwiseProduct.scala | 7 - .../org/apache/spark/ml/feature/HashingTF.scala | 14 - .../scala/org/apache/spark/ml/feature/IDF.scala | 20 +--- .../apache/spark/ml/feature/Interaction.scala | 4 +-- .../apache/spark/ml/feature/MaxAbsScaler.scala | 26 ++- .../apache/spark/ml/feature/MinMaxScaler.scala | 23 +++--- .../org/apache/spark/ml/feature/NGram.scala | 7 - .../apache/spark/ml/feature/Normalizer.scala| 7 - .../apache/spark/ml/feature/OneHotEncoder.scala | 10 +- .../scala/org/apache/spark/ml/feature/PCA.scala | 23 +++--- .../spark/ml/feature/PolynomialExpansion.scala | 8 - .../spark/ml/feature/QuantileDiscretizer.scala | 10 +- .../org/apache/spark/ml/feature/RFormula.scala | 18 +-- .../spark/ml/feature/SQLTransformer.scala | 2 +- .../spark/ml/feature/StandardScaler.scala | 24 +++--- .../spark/ml/feature/StopWordsRemover.scala | 14 - .../apache/spark/ml/feature/StringIndexer.scala | 33 +--- .../org/apache/spark/ml/feature/Tokenizer.scala | 22 +++-- .../spark/ml/feature/VectorAssembler.scala | 8 - .../apache/spark/ml/feature/VectorIndexer.scala | 24 +++--- .../apache/spark/ml/feature/VectorSlicer.scala | 14 - .../org/apache/spark/ml/feature/Word2Vec.scala | 29 +++-- python/pyspark/ml/feature.py| 10 +++--- 28 files changed, 362 insertions(+), 68 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/37494a18/mllib/src/main/scala/org/apache/spark/ml/feature/Binarizer.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Binarizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Binarizer.scala index 318c8b8..fa9634f 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Binarizer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Binarizer.scala @@ -35,9 +35,11 @@ import org.apache.spark.sql.types._ * Binarize a column of continuous features given a threshold. 
*/ @Experimental -final class Binarizer(override val uid: String) +@Since("1.4.0") +final class Binarizer @Since("1.4.0") (@Since("1.4.0") override val uid: String) extends Transformer with HasInputCol with HasOutputCol with DefaultParamsWritable { + @Since("1.4.0") def this() = this(Identifiable.randomUID("binarizer")) /** @@ -47,21 +49,26 @@ final class Binarizer(override val uid: String) * Default: 0.0 * @group param */ + @Since("1.4.0") val threshold: DoubleParam = new DoubleParam(this, "threshold", "threshold used to binarize continuous features") /** @group getParam */ + @Since("1.4.0") def getThreshold: Double = $(threshold) /** @group setParam */ + @Since("1.4.0") def setThreshold(value: Double): this.type = set(threshold, value) setDefault(threshold -> 0.0) /** @group setParam */ + @Since("1.4.0") def setInputCol(value: String): this.type = set(inputCol, value) /** @group setParam */ + @Since("1.4.0") def setOutputCol(value: String): this.type = set(outputCol, value) @Since("2.0.0") @@ -96,6 +103,7 @@ final class Binarizer(override val uid: String) } } + @Since("1.4.0") override def transformSchema(schema: StructType): StructType = { val inputType = schema($(inputCol)).dataType val outputColName = $(outputCol) @@ -115,6 +123,7 @@ final class Binarizer(override val uid: String) StructType(schema.fields :+ outCol) } + @Since("1.4.1") override def copy(extra: ParamMap): Binarizer
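The commit above (and its branch-2.0 cherry-pick) only adds Scala `@Since` annotations so the generated API docs show which Spark version introduced each `ml.feature` member. PySpark carries the same information through a `since` decorator that appends a `.. versionadded::` note to docstrings; the sketch below imitates that pattern with a hypothetical decorator and class, and is not Spark's own code.

```python
# Illustrative only: a toy version-annotation decorator in the spirit of the
# Scala @Since annotation and PySpark's `since` helper; not Spark's own code.
def since(version):
    """Append a '.. versionadded::' note to the decorated function's docstring."""
    def decorator(func):
        func.__doc__ = (func.__doc__ or "") + "\n\n.. versionadded:: %s\n" % version
        return func
    return decorator


class ToyBinarizer(object):
    """Stand-in for an ml.feature transformer; names here are hypothetical."""

    @since("1.4.0")
    def setThreshold(self, value):
        """Set the threshold used to binarize continuous features."""
        self.threshold = value
        return self


print(ToyBinarizer.setThreshold.__doc__)
```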
spark git commit: [SPARK-16045][ML][DOC] Spark 2.0 ML.feature: doc update for stopwords and binarizer
Repository: spark Updated Branches: refs/heads/master 37494a18e -> a58f40239 [SPARK-16045][ML][DOC] Spark 2.0 ML.feature: doc update for stopwords and binarizer ## What changes were proposed in this pull request? jira: https://issues.apache.org/jira/browse/SPARK-16045 2.0 Audit: Update document for StopWordsRemover and Binarizer. ## How was this patch tested? manual review for doc Author: Yuhao Yang Author: Yuhao Yang Closes #13375 from hhbyyh/stopdoc. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a58f4023 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a58f4023 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a58f4023 Branch: refs/heads/master Commit: a58f40239444d42adbc480ddde02cbb02a79bbe4 Parents: 37494a1 Author: Yuhao Yang Authored: Tue Jun 21 00:47:36 2016 -0700 Committer: Xiangrui Meng Committed: Tue Jun 21 00:47:36 2016 -0700 -- docs/ml-features.md | 16 ++-- 1 file changed, 10 insertions(+), 6 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/a58f4023/docs/ml-features.md -- diff --git a/docs/ml-features.md b/docs/ml-features.md index 3db24a3..3cb2644 100644 --- a/docs/ml-features.md +++ b/docs/ml-features.md @@ -251,11 +251,12 @@ frequently and don't carry as much meaning. `StopWordsRemover` takes as input a sequence of strings (e.g. the output of a [Tokenizer](ml-features.html#tokenizer)) and drops all the stop words from the input sequences. The list of stopwords is specified by -the `stopWords` parameter. We provide [a list of stop -words](http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words) by -default, accessible by calling `getStopWords` on a newly instantiated -`StopWordsRemover` instance. A boolean parameter `caseSensitive` indicates -if the matches should be case sensitive (false by default). +the `stopWords` parameter. Default stop words for some languages are accessible +by calling `StopWordsRemover.loadDefaultStopWords(language)`, for which available +options are "danish", "dutch", "english", "finnish", "french", "german", "hungarian", +"italian", "norwegian", "portuguese", "russian", "spanish", "swedish" and "turkish". +A boolean parameter `caseSensitive` indicates if the matches should be case sensitive +(false by default). **Examples** @@ -346,7 +347,10 @@ for more details on the API. Binarization is the process of thresholding numerical features to binary (0/1) features. -`Binarizer` takes the common parameters `inputCol` and `outputCol`, as well as the `threshold` for binarization. Feature values greater than the threshold are binarized to 1.0; values equal to or less than the threshold are binarized to 0.0. +`Binarizer` takes the common parameters `inputCol` and `outputCol`, as well as the `threshold` +for binarization. Feature values greater than the threshold are binarized to 1.0; values equal +to or less than the threshold are binarized to 0.0. Both Vector and Double types are supported +for `inputCol`. - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-16045][ML][DOC] Spark 2.0 ML.feature: doc update for stopwords and binarizer
Repository: spark Updated Branches: refs/heads/branch-2.0 14e5decc5 -> 0499ed961 [SPARK-16045][ML][DOC] Spark 2.0 ML.feature: doc update for stopwords and binarizer ## What changes were proposed in this pull request? jira: https://issues.apache.org/jira/browse/SPARK-16045 2.0 Audit: Update document for StopWordsRemover and Binarizer. ## How was this patch tested? manual review for doc Author: Yuhao Yang Author: Yuhao Yang Closes #13375 from hhbyyh/stopdoc. (cherry picked from commit a58f40239444d42adbc480ddde02cbb02a79bbe4) Signed-off-by: Xiangrui Meng Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0499ed96 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0499ed96 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0499ed96 Branch: refs/heads/branch-2.0 Commit: 0499ed961838686acccefc08a42efa523f1648dd Parents: 14e5dec Author: Yuhao Yang Authored: Tue Jun 21 00:47:36 2016 -0700 Committer: Xiangrui Meng Committed: Tue Jun 21 00:47:44 2016 -0700 -- docs/ml-features.md | 16 ++-- 1 file changed, 10 insertions(+), 6 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/0499ed96/docs/ml-features.md -- diff --git a/docs/ml-features.md b/docs/ml-features.md index 3db24a3..3cb2644 100644 --- a/docs/ml-features.md +++ b/docs/ml-features.md @@ -251,11 +251,12 @@ frequently and don't carry as much meaning. `StopWordsRemover` takes as input a sequence of strings (e.g. the output of a [Tokenizer](ml-features.html#tokenizer)) and drops all the stop words from the input sequences. The list of stopwords is specified by -the `stopWords` parameter. We provide [a list of stop -words](http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words) by -default, accessible by calling `getStopWords` on a newly instantiated -`StopWordsRemover` instance. A boolean parameter `caseSensitive` indicates -if the matches should be case sensitive (false by default). +the `stopWords` parameter. Default stop words for some languages are accessible +by calling `StopWordsRemover.loadDefaultStopWords(language)`, for which available +options are "danish", "dutch", "english", "finnish", "french", "german", "hungarian", +"italian", "norwegian", "portuguese", "russian", "spanish", "swedish" and "turkish". +A boolean parameter `caseSensitive` indicates if the matches should be case sensitive +(false by default). **Examples** @@ -346,7 +347,10 @@ for more details on the API. Binarization is the process of thresholding numerical features to binary (0/1) features. -`Binarizer` takes the common parameters `inputCol` and `outputCol`, as well as the `threshold` for binarization. Feature values greater than the threshold are binarized to 1.0; values equal to or less than the threshold are binarized to 0.0. +`Binarizer` takes the common parameters `inputCol` and `outputCol`, as well as the `threshold` +for binarization. Feature values greater than the threshold are binarized to 1.0; values equal +to or less than the threshold are binarized to 0.0. Both Vector and Double types are supported +for `inputCol`. - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
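The documentation change above points readers at `StopWordsRemover.loadDefaultStopWords(language)` and notes that `Binarizer` maps values greater than `threshold` to 1.0 and the rest to 0.0. A minimal PySpark sketch of both; the input data and column names are illustrative.

```python
from pyspark.sql import SparkSession
from pyspark.ml.feature import StopWordsRemover, Binarizer

spark = SparkSession.builder.appName("stopwords-binarizer-example").getOrCreate()

# Default stop words are available per language, e.g. "english" or "french".
english_stop_words = StopWordsRemover.loadDefaultStopWords("english")

remover = StopWordsRemover(inputCol="raw", outputCol="filtered",
                           stopWords=english_stop_words)
tokens = spark.createDataFrame([(["I", "saw", "the", "red", "balloon"],)], ["raw"])
remover.transform(tokens).show(truncate=False)

# Values greater than the threshold become 1.0; values <= threshold become 0.0.
binarizer = Binarizer(threshold=0.5, inputCol="feature", outputCol="binarized")
numbers = spark.createDataFrame([(0.1,), (0.8,), (0.5,)], ["feature"])
binarizer.transform(numbers).show()

spark.stop()
```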
spark git commit: [SPARK-16084][SQL] Minor comments update for "DESCRIBE" table
Repository: spark Updated Branches: refs/heads/master a58f40239 -> f3a768b7b [SPARK-16084][SQL] Minor comments update for "DESCRIBE" table ## What changes were proposed in this pull request? 1. FORMATTED is actually supported, but partition is not supported; 2. Remove parenthesis as it is not necessary just like anywhere else. ## How was this patch tested? Minor issue. I do not think it needs a test case! Author: bomeng Closes #13791 from bomeng/SPARK-16084. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f3a768b7 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f3a768b7 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f3a768b7 Branch: refs/heads/master Commit: f3a768b7b96f00f33d2fe4e6c0bf4acf373ad4f4 Parents: a58f402 Author: bomeng Authored: Tue Jun 21 08:51:43 2016 +0100 Committer: Sean Owen Committed: Tue Jun 21 08:51:43 2016 +0100 -- .../scala/org/apache/spark/sql/execution/SparkSqlParser.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/f3a768b7/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala index 154c25a..2ae8380 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala @@ -279,15 +279,15 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder { * Create a [[DescribeTableCommand]] logical plan. */ override def visitDescribeTable(ctx: DescribeTableContext): LogicalPlan = withOrigin(ctx) { -// FORMATTED and columns are not supported. Return null and let the parser decide what to do -// with this (create an exception or pass it on to a different system). +// Describe partition and column are not supported yet. Return null and let the parser decide +// what to do with this (create an exception or pass it on to a different system). if (ctx.describeColName != null || ctx.partitionSpec != null) { null } else { DescribeTableCommand( visitTableIdentifier(ctx.tableIdentifier), ctx.EXTENDED != null, -ctx.FORMATTED() != null) +ctx.FORMATTED != null) } } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-16084][SQL] Minor comments update for "DESCRIBE" table
Repository: spark Updated Branches: refs/heads/branch-2.0 0499ed961 -> 34a8e23c7 [SPARK-16084][SQL] Minor comments update for "DESCRIBE" table ## What changes were proposed in this pull request? 1. FORMATTED is actually supported, but partition is not supported; 2. Remove parenthesis as it is not necessary just like anywhere else. ## How was this patch tested? Minor issue. I do not think it needs a test case! Author: bomeng Closes #13791 from bomeng/SPARK-16084. (cherry picked from commit f3a768b7b96f00f33d2fe4e6c0bf4acf373ad4f4) Signed-off-by: Sean Owen Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/34a8e23c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/34a8e23c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/34a8e23c Branch: refs/heads/branch-2.0 Commit: 34a8e23c739532cd2cb059d9d4e785368d6d0a98 Parents: 0499ed9 Author: bomeng Authored: Tue Jun 21 08:51:43 2016 +0100 Committer: Sean Owen Committed: Tue Jun 21 08:51:57 2016 +0100 -- .../scala/org/apache/spark/sql/execution/SparkSqlParser.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/34a8e23c/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala index 154c25a..2ae8380 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala @@ -279,15 +279,15 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder { * Create a [[DescribeTableCommand]] logical plan. */ override def visitDescribeTable(ctx: DescribeTableContext): LogicalPlan = withOrigin(ctx) { -// FORMATTED and columns are not supported. Return null and let the parser decide what to do -// with this (create an exception or pass it on to a different system). +// Describe partition and column are not supported yet. Return null and let the parser decide +// what to do with this (create an exception or pass it on to a different system). if (ctx.describeColName != null || ctx.partitionSpec != null) { null } else { DescribeTableCommand( visitTableIdentifier(ctx.tableIdentifier), ctx.EXTENDED != null, -ctx.FORMATTED() != null) +ctx.FORMATTED != null) } } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
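The patch above is comment-only, clarifying that the parser handles `DESCRIBE`, `DESCRIBE EXTENDED` and `DESCRIBE FORMATTED` but not yet the partition or column forms. A short sketch of issuing the supported forms through `SparkSession.sql`, with a throw-away table created purely for illustration:

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("describe-example").getOrCreate()

# A throw-away table so the DESCRIBE variants below have something to inspect.
spark.sql("CREATE TABLE people (name STRING, age INT) USING parquet")

# The basic, EXTENDED and FORMATTED forms all go through DescribeTableCommand;
# the partition and column forms are the ones the comment marks as unsupported.
spark.sql("DESCRIBE people").show(truncate=False)
spark.sql("DESCRIBE EXTENDED people").show(truncate=False)
spark.sql("DESCRIBE FORMATTED people").show(truncate=False)

spark.sql("DROP TABLE people")
spark.stop()
```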
spark git commit: [SPARK-15177][.1][R] make SparkR model params and default values consistent with MLlib
Repository: spark Updated Branches: refs/heads/master f3a768b7b -> 4f83ca105 [SPARK-15177][.1][R] make SparkR model params and default values consistent with MLlib ## What changes were proposed in this pull request? This PR is a subset of #13023 by yanboliang to make SparkR model param names and default values consistent with MLlib. I tried to avoid other changes from #13023 to keep this PR minimal. I will send a follow-up PR to improve the documentation. Main changes: * `spark.glm`: epsilon -> tol, maxit -> maxIter * `spark.kmeans`: default k -> 2, default maxIter -> 20, default initMode -> "k-means||" * `spark.naiveBayes`: laplace -> smoothing, default 1.0 ## How was this patch tested? Existing unit tests. Author: Xiangrui Meng Closes #13801 from mengxr/SPARK-15177.1. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4f83ca10 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4f83ca10 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4f83ca10 Branch: refs/heads/master Commit: 4f83ca1059a3b580fca3f006974ff5ac4d5212a1 Parents: f3a768b Author: Xiangrui Meng Authored: Tue Jun 21 08:31:15 2016 -0700 Committer: Xiangrui Meng Committed: Tue Jun 21 08:31:15 2016 -0700 -- R/pkg/R/mllib.R | 74 ++-- R/pkg/inst/tests/testthat/test_mllib.R | 4 +- .../r/GeneralizedLinearRegressionWrapper.scala | 8 +-- .../apache/spark/ml/r/NaiveBayesWrapper.scala | 4 +- 4 files changed, 44 insertions(+), 46 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4f83ca10/R/pkg/R/mllib.R -- diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R index 74dba8f..b83b3b3 100644 --- a/R/pkg/R/mllib.R +++ b/R/pkg/R/mllib.R @@ -64,8 +64,8 @@ setClass("KMeansModel", representation(jobj = "jobj")) #' This can be a character string naming a family function, a family function or #' the result of a call to a family function. Refer R family at #' \url{https://stat.ethz.ch/R-manual/R-devel/library/stats/html/family.html}. -#' @param epsilon Positive convergence tolerance of iterations. -#' @param maxit Integer giving the maximal number of IRLS iterations. +#' @param tol Positive convergence tolerance of iterations. +#' @param maxIter Integer giving the maximal number of IRLS iterations. 
#' @return a fitted generalized linear model #' @rdname spark.glm #' @export @@ -74,32 +74,30 @@ setClass("KMeansModel", representation(jobj = "jobj")) #' sparkR.session() #' data(iris) #' df <- createDataFrame(iris) -#' model <- spark.glm(df, Sepal_Length ~ Sepal_Width, family="gaussian") +#' model <- spark.glm(df, Sepal_Length ~ Sepal_Width, family = "gaussian") #' summary(model) #' } #' @note spark.glm since 2.0.0 -setMethod( -"spark.glm", -signature(data = "SparkDataFrame", formula = "formula"), -function(data, formula, family = gaussian, epsilon = 1e-06, maxit = 25) { -if (is.character(family)) { -family <- get(family, mode = "function", envir = parent.frame()) -} -if (is.function(family)) { -family <- family() -} -if (is.null(family$family)) { -print(family) -stop("'family' not recognized") -} +setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"), + function(data, formula, family = gaussian, tol = 1e-6, maxIter = 25) { +if (is.character(family)) { + family <- get(family, mode = "function", envir = parent.frame()) +} +if (is.function(family)) { + family <- family() +} +if (is.null(family$family)) { + print(family) + stop("'family' not recognized") +} -formula <- paste(deparse(formula), collapse = "") +formula <- paste(deparse(formula), collapse = "") -jobj <- callJStatic("org.apache.spark.ml.r.GeneralizedLinearRegressionWrapper", -"fit", formula, data@sdf, family$family, family$link, -epsilon, as.integer(maxit)) -return(new("GeneralizedLinearRegressionModel", jobj = jobj)) -}) +jobj <- callJStatic("org.apache.spark.ml.r.GeneralizedLinearRegressionWrapper", +"fit", formula, data@sdf, family$family, family$link, +tol, as.integer(maxIter)) +return(new("GeneralizedLinearRegressionModel", jobj = jobj)) + }) #' Fits a generalized linear model (R-compliant). #' @@ -122,13 +120,13 @@ setMethod( #' sparkR.session() #' data(iris) #' df <- createDataFrame(iris) -#' model <- glm(Sepal_Length ~ Sepal_Width, df, family="gaussi
spark git commit: [SPARK-15177][.1][R] make SparkR model params and default values consistent with MLlib
Repository: spark Updated Branches: refs/heads/branch-2.0 34a8e23c7 -> 282be71dd [SPARK-15177][.1][R] make SparkR model params and default values consistent with MLlib ## What changes were proposed in this pull request? This PR is a subset of #13023 by yanboliang to make SparkR model param names and default values consistent with MLlib. I tried to avoid other changes from #13023 to keep this PR minimal. I will send a follow-up PR to improve the documentation. Main changes: * `spark.glm`: epsilon -> tol, maxit -> maxIter * `spark.kmeans`: default k -> 2, default maxIter -> 20, default initMode -> "k-means||" * `spark.naiveBayes`: laplace -> smoothing, default 1.0 ## How was this patch tested? Existing unit tests. Author: Xiangrui Meng Closes #13801 from mengxr/SPARK-15177.1. (cherry picked from commit 4f83ca1059a3b580fca3f006974ff5ac4d5212a1) Signed-off-by: Xiangrui Meng Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/282be71d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/282be71d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/282be71d Branch: refs/heads/branch-2.0 Commit: 282be71dd69da87e7f3885b803fc9a4795bbc3fb Parents: 34a8e23 Author: Xiangrui Meng Authored: Tue Jun 21 08:31:15 2016 -0700 Committer: Xiangrui Meng Committed: Tue Jun 21 08:31:22 2016 -0700 -- R/pkg/R/mllib.R | 74 ++-- R/pkg/inst/tests/testthat/test_mllib.R | 4 +- .../r/GeneralizedLinearRegressionWrapper.scala | 8 +-- .../apache/spark/ml/r/NaiveBayesWrapper.scala | 4 +- 4 files changed, 44 insertions(+), 46 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/282be71d/R/pkg/R/mllib.R -- diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R index 74dba8f..b83b3b3 100644 --- a/R/pkg/R/mllib.R +++ b/R/pkg/R/mllib.R @@ -64,8 +64,8 @@ setClass("KMeansModel", representation(jobj = "jobj")) #' This can be a character string naming a family function, a family function or #' the result of a call to a family function. Refer R family at #' \url{https://stat.ethz.ch/R-manual/R-devel/library/stats/html/family.html}. -#' @param epsilon Positive convergence tolerance of iterations. -#' @param maxit Integer giving the maximal number of IRLS iterations. +#' @param tol Positive convergence tolerance of iterations. +#' @param maxIter Integer giving the maximal number of IRLS iterations. 
#' @return a fitted generalized linear model #' @rdname spark.glm #' @export @@ -74,32 +74,30 @@ setClass("KMeansModel", representation(jobj = "jobj")) #' sparkR.session() #' data(iris) #' df <- createDataFrame(iris) -#' model <- spark.glm(df, Sepal_Length ~ Sepal_Width, family="gaussian") +#' model <- spark.glm(df, Sepal_Length ~ Sepal_Width, family = "gaussian") #' summary(model) #' } #' @note spark.glm since 2.0.0 -setMethod( -"spark.glm", -signature(data = "SparkDataFrame", formula = "formula"), -function(data, formula, family = gaussian, epsilon = 1e-06, maxit = 25) { -if (is.character(family)) { -family <- get(family, mode = "function", envir = parent.frame()) -} -if (is.function(family)) { -family <- family() -} -if (is.null(family$family)) { -print(family) -stop("'family' not recognized") -} +setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"), + function(data, formula, family = gaussian, tol = 1e-6, maxIter = 25) { +if (is.character(family)) { + family <- get(family, mode = "function", envir = parent.frame()) +} +if (is.function(family)) { + family <- family() +} +if (is.null(family$family)) { + print(family) + stop("'family' not recognized") +} -formula <- paste(deparse(formula), collapse = "") +formula <- paste(deparse(formula), collapse = "") -jobj <- callJStatic("org.apache.spark.ml.r.GeneralizedLinearRegressionWrapper", -"fit", formula, data@sdf, family$family, family$link, -epsilon, as.integer(maxit)) -return(new("GeneralizedLinearRegressionModel", jobj = jobj)) -}) +jobj <- callJStatic("org.apache.spark.ml.r.GeneralizedLinearRegressionWrapper", +"fit", formula, data@sdf, family$family, family$link, +tol, as.integer(maxIter)) +return(new("GeneralizedLinearRegressionModel", jobj = jobj)) + }) #' Fits a generalized linear model (R-compliant). #' @@ -122,13 +120,13 @@ setMethod( #' sparkR.session() #
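The SparkR change above renames `spark.glm`, `spark.kmeans` and `spark.naiveBayes` arguments and defaults to match MLlib. The PySpark estimators below use those same parameter names (`tol`, `maxIter`, `k`, `initMode`, `smoothing`); the explicit values simply restate the defaults mentioned in the commit and are shown only for illustration.

```python
from pyspark.ml.regression import GeneralizedLinearRegression
from pyspark.ml.clustering import KMeans
from pyspark.ml.classification import NaiveBayes

# Parameter names and defaults the SparkR wrappers are being aligned with.
glr = GeneralizedLinearRegression(family="gaussian", tol=1e-6, maxIter=25)
kmeans = KMeans(k=2, maxIter=20, initMode="k-means||")
nb = NaiveBayes(smoothing=1.0)

for estimator in (glr, kmeans, nb):
    print(estimator.explainParams())
```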
spark git commit: [SPARK-13792][SQL] Addendum: Fix Python API
Repository: spark Updated Branches: refs/heads/master 4f83ca105 -> 93338807a [SPARK-13792][SQL] Addendum: Fix Python API ## What changes were proposed in this pull request? This is a follow-up to https://github.com/apache/spark/pull/13795 to properly set CSV options in Python API. As part of this, I also make the Python option setting for both CSV and JSON more robust against positional errors. ## How was this patch tested? N/A Author: Reynold Xin Closes #13800 from rxin/SPARK-13792-2. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/93338807 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/93338807 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/93338807 Branch: refs/heads/master Commit: 93338807aafdb2db9fb036ceadee1467cd367cdd Parents: 4f83ca1 Author: Reynold Xin Authored: Tue Jun 21 10:47:51 2016 -0700 Committer: Reynold Xin Committed: Tue Jun 21 10:47:51 2016 -0700 -- python/pyspark/sql/readwriter.py | 54 +-- 1 file changed, 33 insertions(+), 21 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/93338807/python/pyspark/sql/readwriter.py -- diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py index 89506ca..ccbf895 100644 --- a/python/pyspark/sql/readwriter.py +++ b/python/pyspark/sql/readwriter.py @@ -77,7 +77,7 @@ class ReaderUtils(object): def _set_csv_opts(self, schema, sep, encoding, quote, escape, comment, header, inferSchema, ignoreLeadingWhiteSpace, ignoreTrailingWhiteSpace, nullValue, nanValue, positiveInf, negativeInf, - dateFormat, maxColumns, maxCharsPerColumn, mode): + dateFormat, maxColumns, maxCharsPerColumn, maxMalformedLogPerPartition, mode): """ Set options based on the CSV optional parameters """ @@ -115,6 +115,8 @@ class ReaderUtils(object): self.option("maxColumns", maxColumns) if maxCharsPerColumn is not None: self.option("maxCharsPerColumn", maxCharsPerColumn) +if maxMalformedLogPerPartition is not None: +self.option("maxMalformedLogPerPartition", maxMalformedLogPerPartition) if mode is not None: self.option("mode", mode) @@ -268,10 +270,12 @@ class DataFrameReader(ReaderUtils): [('age', 'bigint'), ('name', 'string')] """ -self._set_json_opts(schema, primitivesAsString, prefersDecimal, -allowComments, allowUnquotedFieldNames, allowSingleQuotes, -allowNumericLeadingZero, allowBackslashEscapingAnyCharacter, -mode, columnNameOfCorruptRecord) +self._set_json_opts( +schema=schema, primitivesAsString=primitivesAsString, prefersDecimal=prefersDecimal, +allowComments=allowComments, allowUnquotedFieldNames=allowUnquotedFieldNames, +allowSingleQuotes=allowSingleQuotes, allowNumericLeadingZero=allowNumericLeadingZero, + allowBackslashEscapingAnyCharacter=allowBackslashEscapingAnyCharacter, +mode=mode, columnNameOfCorruptRecord=columnNameOfCorruptRecord) if isinstance(path, basestring): path = [path] if type(path) == list: @@ -343,7 +347,8 @@ class DataFrameReader(ReaderUtils): def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=None, comment=None, header=None, inferSchema=None, ignoreLeadingWhiteSpace=None, ignoreTrailingWhiteSpace=None, nullValue=None, nanValue=None, positiveInf=None, -negativeInf=None, dateFormat=None, maxColumns=None, maxCharsPerColumn=None, mode=None): +negativeInf=None, dateFormat=None, maxColumns=None, maxCharsPerColumn=None, +maxMalformedLogPerPartition=None, mode=None): """Loads a CSV file and returns the result as a :class:`DataFrame`. 
This function will go through the input once to determine the input schema if @@ -408,11 +413,13 @@ class DataFrameReader(ReaderUtils): >>> df.dtypes [('_c0', 'string'), ('_c1', 'string')] """ - -self._set_csv_opts(schema, sep, encoding, quote, escape, - comment, header, inferSchema, ignoreLeadingWhiteSpace, - ignoreTrailingWhiteSpace, nullValue, nanValue, positiveInf, negativeInf, - dateFormat, maxColumns, maxCharsPerColumn, mode) +self._set_csv_opts( +schema=schema, sep=sep, encoding=encoding, quote=quote, escape=escape, comment=comment, +header=header, inferSchema=inferSchema, igno
spark git commit: [SPARK-13792][SQL] Addendum: Fix Python API
Repository: spark Updated Branches: refs/heads/branch-2.0 282be71dd -> 943239bf4 [SPARK-13792][SQL] Addendum: Fix Python API ## What changes were proposed in this pull request? This is a follow-up to https://github.com/apache/spark/pull/13795 to properly set CSV options in Python API. As part of this, I also make the Python option setting for both CSV and JSON more robust against positional errors. ## How was this patch tested? N/A Author: Reynold Xin Closes #13800 from rxin/SPARK-13792-2. (cherry picked from commit 93338807aafdb2db9fb036ceadee1467cd367cdd) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/943239bf Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/943239bf Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/943239bf Branch: refs/heads/branch-2.0 Commit: 943239bf43c5e59c71db218627ee4dc4308a0680 Parents: 282be71 Author: Reynold Xin Authored: Tue Jun 21 10:47:51 2016 -0700 Committer: Reynold Xin Committed: Tue Jun 21 10:47:57 2016 -0700 -- python/pyspark/sql/readwriter.py | 54 +-- 1 file changed, 33 insertions(+), 21 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/943239bf/python/pyspark/sql/readwriter.py -- diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py index 89506ca..ccbf895 100644 --- a/python/pyspark/sql/readwriter.py +++ b/python/pyspark/sql/readwriter.py @@ -77,7 +77,7 @@ class ReaderUtils(object): def _set_csv_opts(self, schema, sep, encoding, quote, escape, comment, header, inferSchema, ignoreLeadingWhiteSpace, ignoreTrailingWhiteSpace, nullValue, nanValue, positiveInf, negativeInf, - dateFormat, maxColumns, maxCharsPerColumn, mode): + dateFormat, maxColumns, maxCharsPerColumn, maxMalformedLogPerPartition, mode): """ Set options based on the CSV optional parameters """ @@ -115,6 +115,8 @@ class ReaderUtils(object): self.option("maxColumns", maxColumns) if maxCharsPerColumn is not None: self.option("maxCharsPerColumn", maxCharsPerColumn) +if maxMalformedLogPerPartition is not None: +self.option("maxMalformedLogPerPartition", maxMalformedLogPerPartition) if mode is not None: self.option("mode", mode) @@ -268,10 +270,12 @@ class DataFrameReader(ReaderUtils): [('age', 'bigint'), ('name', 'string')] """ -self._set_json_opts(schema, primitivesAsString, prefersDecimal, -allowComments, allowUnquotedFieldNames, allowSingleQuotes, -allowNumericLeadingZero, allowBackslashEscapingAnyCharacter, -mode, columnNameOfCorruptRecord) +self._set_json_opts( +schema=schema, primitivesAsString=primitivesAsString, prefersDecimal=prefersDecimal, +allowComments=allowComments, allowUnquotedFieldNames=allowUnquotedFieldNames, +allowSingleQuotes=allowSingleQuotes, allowNumericLeadingZero=allowNumericLeadingZero, + allowBackslashEscapingAnyCharacter=allowBackslashEscapingAnyCharacter, +mode=mode, columnNameOfCorruptRecord=columnNameOfCorruptRecord) if isinstance(path, basestring): path = [path] if type(path) == list: @@ -343,7 +347,8 @@ class DataFrameReader(ReaderUtils): def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=None, comment=None, header=None, inferSchema=None, ignoreLeadingWhiteSpace=None, ignoreTrailingWhiteSpace=None, nullValue=None, nanValue=None, positiveInf=None, -negativeInf=None, dateFormat=None, maxColumns=None, maxCharsPerColumn=None, mode=None): +negativeInf=None, dateFormat=None, maxColumns=None, maxCharsPerColumn=None, +maxMalformedLogPerPartition=None, mode=None): """Loads a CSV file and 
returns the result as a :class:`DataFrame`. This function will go through the input once to determine the input schema if @@ -408,11 +413,13 @@ class DataFrameReader(ReaderUtils): >>> df.dtypes [('_c0', 'string'), ('_c1', 'string')] """ - -self._set_csv_opts(schema, sep, encoding, quote, escape, - comment, header, inferSchema, ignoreLeadingWhiteSpace, - ignoreTrailingWhiteSpace, nullValue, nanValue, positiveInf, negativeInf, - dateFormat, maxColumns, maxCharsPerColumn, mode) +self._set_csv_opts( +schema=schema, sep=sep, encoding=encoding
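The fix above makes the internal Python reader helpers pass every CSV/JSON option by keyword so a positional slip cannot bind a value to the wrong option. Calling the public reader the same way is the safe pattern; the file path and option values below are placeholders.

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("reader-options-example").getOrCreate()

# Passing reader options by keyword avoids the positional mix-ups addressed
# above; the path and option values here are illustrative placeholders.
df = spark.read.csv(
    "data/people.csv",
    header=True,
    inferSchema=True,
    sep=",",
    nullValue="NA",
    dateFormat="yyyy-MM-dd",
    mode="PERMISSIVE")

df.printSchema()
spark.stop()
```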
spark git commit: [SPARK-16080][YARN] Set correct link name for conf archive in executors.
Repository: spark Updated Branches: refs/heads/branch-2.0 943239bf4 -> 052779a0c [SPARK-16080][YARN] Set correct link name for conf archive in executors. This makes sure the files are in the executor's classpath as they're expected to be. Also update the unit test to make sure the files are there as expected. Author: Marcelo Vanzin Closes #13792 from vanzin/SPARK-16080. (cherry picked from commit bcb0258ae62f23f71a067c1304232f272d7374aa) Signed-off-by: Tom Graves Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/052779a0 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/052779a0 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/052779a0 Branch: refs/heads/branch-2.0 Commit: 052779a0cf8676683b3b83203e440d2266eef790 Parents: 943239b Author: Marcelo Vanzin Authored: Tue Jun 21 12:48:06 2016 -0500 Committer: Tom Graves Committed: Tue Jun 21 12:49:00 2016 -0500 -- .../apache/spark/deploy/yarn/ApplicationMaster.scala | 14 ++ .../apache/spark/deploy/yarn/YarnClusterSuite.scala | 8 2 files changed, 18 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/052779a0/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala -- diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala index 4df90d7..847d1de 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala @@ -160,11 +160,17 @@ private[spark] class ApplicationMaster( } // Distribute the conf archive to executors. -sparkConf.get(CACHED_CONF_ARCHIVE).foreach { uri => - val fs = FileSystem.get(new URI(uri), yarnConf) +sparkConf.get(CACHED_CONF_ARCHIVE).foreach { path => + val uri = new URI(path) + val fs = FileSystem.get(uri, yarnConf) val status = fs.getFileStatus(new Path(uri)) - setupDistributedCache(uri, LocalResourceType.ARCHIVE, status.getModificationTime().toString, -status.getLen.toString, LocalResourceVisibility.PRIVATE.name()) + // SPARK-16080: Make sure to use the correct name for the destination when distributing the + // conf archive to executors. + val destUri = new URI(uri.getScheme(), uri.getRawSchemeSpecificPart(), +Client.LOCALIZED_CONF_DIR) + setupDistributedCache(destUri.toString(), LocalResourceType.ARCHIVE, +status.getModificationTime().toString, status.getLen.toString, +LocalResourceVisibility.PRIVATE.name()) } // Clean up the configuration so it doesn't show up in the Web UI (since it's really noisy). http://git-wip-us.apache.org/repos/asf/spark/blob/052779a0/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala -- diff --git a/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala b/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala index c465604..4ce33e0 100644 --- a/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala +++ b/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala @@ -292,6 +292,14 @@ private object YarnClusterDriver extends Logging with Matchers { sc.stop() } +// Verify that the config archive is correctly placed in the classpath of all containers. 
+val confFile = "/" + Client.SPARK_CONF_FILE +assert(getClass().getResource(confFile) != null) +val configFromExecutors = sc.parallelize(1 to 4, 4) + .map { _ => Option(getClass().getResource(confFile)).map(_.toString).orNull } + .collect() +assert(configFromExecutors.find(_ == null) === None) + // verify log urls are present val listeners = sc.listenerBus.findListenersByClass[SaveExecutorInfo] assert(listeners.size === 1) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-16080][YARN] Set correct link name for conf archive in executors.
Repository: spark Updated Branches: refs/heads/master 93338807a -> bcb0258ae [SPARK-16080][YARN] Set correct link name for conf archive in executors. This makes sure the files are in the executor's classpath as they're expected to be. Also update the unit test to make sure the files are there as expected. Author: Marcelo Vanzin Closes #13792 from vanzin/SPARK-16080. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/bcb0258a Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/bcb0258a Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/bcb0258a Branch: refs/heads/master Commit: bcb0258ae62f23f71a067c1304232f272d7374aa Parents: 9333880 Author: Marcelo Vanzin Authored: Tue Jun 21 12:48:06 2016 -0500 Committer: Tom Graves Committed: Tue Jun 21 12:48:06 2016 -0500 -- .../apache/spark/deploy/yarn/ApplicationMaster.scala | 14 ++ .../apache/spark/deploy/yarn/YarnClusterSuite.scala | 8 2 files changed, 18 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/bcb0258a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala -- diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala index 4df90d7..847d1de 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala @@ -160,11 +160,17 @@ private[spark] class ApplicationMaster( } // Distribute the conf archive to executors. -sparkConf.get(CACHED_CONF_ARCHIVE).foreach { uri => - val fs = FileSystem.get(new URI(uri), yarnConf) +sparkConf.get(CACHED_CONF_ARCHIVE).foreach { path => + val uri = new URI(path) + val fs = FileSystem.get(uri, yarnConf) val status = fs.getFileStatus(new Path(uri)) - setupDistributedCache(uri, LocalResourceType.ARCHIVE, status.getModificationTime().toString, -status.getLen.toString, LocalResourceVisibility.PRIVATE.name()) + // SPARK-16080: Make sure to use the correct name for the destination when distributing the + // conf archive to executors. + val destUri = new URI(uri.getScheme(), uri.getRawSchemeSpecificPart(), +Client.LOCALIZED_CONF_DIR) + setupDistributedCache(destUri.toString(), LocalResourceType.ARCHIVE, +status.getModificationTime().toString, status.getLen.toString, +LocalResourceVisibility.PRIVATE.name()) } // Clean up the configuration so it doesn't show up in the Web UI (since it's really noisy). http://git-wip-us.apache.org/repos/asf/spark/blob/bcb0258a/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala -- diff --git a/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala b/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala index c465604..4ce33e0 100644 --- a/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala +++ b/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala @@ -292,6 +292,14 @@ private object YarnClusterDriver extends Logging with Matchers { sc.stop() } +// Verify that the config archive is correctly placed in the classpath of all containers. 
+val confFile = "/" + Client.SPARK_CONF_FILE +assert(getClass().getResource(confFile) != null) +val configFromExecutors = sc.parallelize(1 to 4, 4) + .map { _ => Option(getClass().getResource(confFile)).map(_.toString).orNull } + .collect() +assert(configFromExecutors.find(_ == null) === None) + // verify log urls are present val listeners = sc.listenerBus.findListenersByClass[SaveExecutorInfo] assert(listeners.size === 1) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
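A note on why the one-line URI change fixes the classpath: Spark's YARN code uses the fragment of a cache URI, when one is present, as the link name under which the resource is localized in the container, so rebuilding the URI with Client.LOCALIZED_CONF_DIR as the fragment makes the conf archive appear under the directory name the executor classpath expects rather than under its staging file name. A rough sketch of that URI manipulation in Python (the staging path and the `__spark_conf__` link name are illustrative assumptions, not values taken from this diff):

```python
from urllib.parse import urlparse, urlunparse

def with_link_name(cache_uri, link_name):
    """Rebuild a distributed-cache URI so its fragment (used as the link name) is link_name."""
    parts = urlparse(cache_uri)
    return urlunparse(parts._replace(fragment=link_name))

# Hypothetical staging location; "__spark_conf__" stands in for Client.LOCALIZED_CONF_DIR.
archive = "hdfs://nn:8020/user/alice/.sparkStaging/application_1/__spark_conf__.zip"
print(with_link_name(archive, "__spark_conf__"))
# hdfs://nn:8020/user/alice/.sparkStaging/application_1/__spark_conf__.zip#__spark_conf__
```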
spark git commit: [SPARK-16086] [SQL] [PYSPARK] create Row without any fields
Repository: spark Updated Branches: refs/heads/branch-2.0 052779a0c -> 703a526e7 [SPARK-16086] [SQL] [PYSPARK] create Row without any fields ## What changes were proposed in this pull request? This PR allows us to create a Row without any fields. ## How was this patch tested? Added a test for empty row and udf without arguments. Author: Davies Liu Closes #13812 from davies/no_argus. (cherry picked from commit 2d6919bea9fc213b5af530afab7793b63c6c8b51) Signed-off-by: Davies Liu Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/703a526e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/703a526e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/703a526e Branch: refs/heads/branch-2.0 Commit: 703a526e79aa30f5a5bab5477a5ed02b8ecc29ea Parents: 052779a Author: Davies Liu Authored: Tue Jun 21 10:53:33 2016 -0700 Committer: Davies Liu Committed: Tue Jun 21 10:53:42 2016 -0700 -- python/pyspark/sql/tests.py | 9 + python/pyspark/sql/types.py | 9 +++-- 2 files changed, 12 insertions(+), 6 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/703a526e/python/pyspark/sql/tests.py -- diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index c631ad8..388ac91 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -177,6 +177,10 @@ class DataTypeTests(unittest.TestCase): dt = DateType() self.assertEqual(dt.fromInternal(0), datetime.date(1970, 1, 1)) +def test_empty_row(self): +row = Row() +self.assertEqual(len(row), 0) + class SQLTests(ReusedPySparkTestCase): @@ -318,6 +322,11 @@ class SQLTests(ReusedPySparkTestCase): [row] = self.spark.sql("SELECT double(add(1, 2)), add(double(2), 1)").collect() self.assertEqual(tuple(row), (6, 5)) +def test_udf_without_arguments(self): +self.spark.catalog.registerFunction("foo", lambda: "bar") +[row] = self.spark.sql("SELECT foo()").collect() +self.assertEqual(row[0], "bar") + def test_udf_with_array_type(self): d = [Row(l=list(range(3)), d={"key": list(range(5))})] rdd = self.sc.parallelize(d) http://git-wip-us.apache.org/repos/asf/spark/blob/703a526e/python/pyspark/sql/types.py -- diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py index bb2b954..f0b56be 100644 --- a/python/pyspark/sql/types.py +++ b/python/pyspark/sql/types.py @@ -1401,11 +1401,7 @@ class Row(tuple): if args and kwargs: raise ValueError("Can not use both args " "and kwargs to create Row") -if args: -# create row class or objects -return tuple.__new__(self, args) - -elif kwargs: +if kwargs: # create row objects names = sorted(kwargs.keys()) row = tuple.__new__(self, [kwargs[n] for n in names]) @@ -1413,7 +1409,8 @@ class Row(tuple): return row else: -raise ValueError("No args or kwargs") +# create row class or objects +return tuple.__new__(self, args) def asDict(self, recursive=False): """ - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-16086] [SQL] [PYSPARK] create Row without any fields
Repository: spark Updated Branches: refs/heads/master bcb0258ae -> 2d6919bea [SPARK-16086] [SQL] [PYSPARK] create Row without any fields ## What changes were proposed in this pull request? This PR allows us to create a Row without any fields. ## How was this patch tested? Added a test for empty row and udf without arguments. Author: Davies Liu Closes #13812 from davies/no_argus. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2d6919be Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2d6919be Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2d6919be Branch: refs/heads/master Commit: 2d6919bea9fc213b5af530afab7793b63c6c8b51 Parents: bcb0258 Author: Davies Liu Authored: Tue Jun 21 10:53:33 2016 -0700 Committer: Davies Liu Committed: Tue Jun 21 10:53:33 2016 -0700 -- python/pyspark/sql/tests.py | 9 + python/pyspark/sql/types.py | 9 +++-- 2 files changed, 12 insertions(+), 6 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/2d6919be/python/pyspark/sql/tests.py -- diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index c631ad8..388ac91 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -177,6 +177,10 @@ class DataTypeTests(unittest.TestCase): dt = DateType() self.assertEqual(dt.fromInternal(0), datetime.date(1970, 1, 1)) +def test_empty_row(self): +row = Row() +self.assertEqual(len(row), 0) + class SQLTests(ReusedPySparkTestCase): @@ -318,6 +322,11 @@ class SQLTests(ReusedPySparkTestCase): [row] = self.spark.sql("SELECT double(add(1, 2)), add(double(2), 1)").collect() self.assertEqual(tuple(row), (6, 5)) +def test_udf_without_arguments(self): +self.spark.catalog.registerFunction("foo", lambda: "bar") +[row] = self.spark.sql("SELECT foo()").collect() +self.assertEqual(row[0], "bar") + def test_udf_with_array_type(self): d = [Row(l=list(range(3)), d={"key": list(range(5))})] rdd = self.sc.parallelize(d) http://git-wip-us.apache.org/repos/asf/spark/blob/2d6919be/python/pyspark/sql/types.py -- diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py index bb2b954..f0b56be 100644 --- a/python/pyspark/sql/types.py +++ b/python/pyspark/sql/types.py @@ -1401,11 +1401,7 @@ class Row(tuple): if args and kwargs: raise ValueError("Can not use both args " "and kwargs to create Row") -if args: -# create row class or objects -return tuple.__new__(self, args) - -elif kwargs: +if kwargs: # create row objects names = sorted(kwargs.keys()) row = tuple.__new__(self, [kwargs[n] for n in names]) @@ -1413,7 +1409,8 @@ class Row(tuple): return row else: -raise ValueError("No args or kwargs") +# create row class or objects +return tuple.__new__(self, args) def asDict(self, recursive=False): """ - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
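A quick consolidated usage sketch of what this change enables (assumes an active SparkSession named `spark`):

```python
from pyspark.sql import Row

# An argument-less Row is now a valid, empty row instead of raising "No args or kwargs".
empty = Row()
assert len(empty) == 0

# Zero-argument UDFs work as well; registerFunction returns strings by default.
spark.catalog.registerFunction("foo", lambda: "bar")
spark.sql("SELECT foo()").show()
```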
spark git commit: [SPARK-16109][SPARKR][DOC] R more doc fixes
Repository: spark Updated Branches: refs/heads/master 2d6919bea -> 57746295e [SPARK-16109][SPARKR][DOC] R more doc fixes ## What changes were proposed in this pull request? Found these issues while reviewing for SPARK-16090 ## How was this patch tested? roxygen2 doc gen, checked output html Author: Felix Cheung Closes #13803 from felixcheung/rdocrd. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/57746295 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/57746295 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/57746295 Branch: refs/heads/master Commit: 57746295e6fb705f8393a00ab1cc570ddb7da44e Parents: 2d6919b Author: Felix Cheung Authored: Tue Jun 21 11:01:42 2016 -0700 Committer: Shivaram Venkataraman Committed: Tue Jun 21 11:01:42 2016 -0700 -- R/pkg/R/DataFrame.R | 7 +-- R/pkg/R/functions.R | 4 +++- R/pkg/R/generics.R | 8 R/pkg/R/schema.R| 7 +-- R/pkg/R/stats.R | 37 +++-- 5 files changed, 40 insertions(+), 23 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/57746295/R/pkg/R/DataFrame.R -- diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index a8ade1a..ed0bb85 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -59,6 +59,7 @@ setMethod("initialize", "SparkDataFrame", function(.Object, sdf, isCached) { #' @export #' @param sdf A Java object reference to the backing Scala DataFrame #' @param isCached TRUE if the SparkDataFrame is cached +#' @noRd dataFrame <- function(sdf, isCached = FALSE) { new("SparkDataFrame", sdf, isCached) } @@ -119,7 +120,7 @@ setMethod("schema", #' Print the logical and physical Catalyst plans to the console for debugging. #' #' @param x A SparkDataFrame -#' @param extended Logical. If extended is False, explain() only prints the physical plan. +#' @param extended Logical. If extended is FALSE, explain() only prints the physical plan. #' @family SparkDataFrame functions #' @rdname explain #' @name explain @@ -175,6 +176,8 @@ setMethod("isLocal", #' #' @param x A SparkDataFrame #' @param numRows The number of rows to print. Defaults to 20. +#' @param truncate Whether truncate long strings. If true, strings more than 20 characters will be +#' truncated and all cells will be aligned right #' #' @family SparkDataFrame functions #' @rdname showDF @@ -1854,7 +1857,7 @@ setMethod("withColumnRenamed", select(x, cols) }) -#' @param newColPair A named pair of the form new_column_name = existing_column +#' @param ... A named pair of the form new_column_name = existing_column #' @rdname rename #' @name rename #' @export http://git-wip-us.apache.org/repos/asf/spark/blob/57746295/R/pkg/R/functions.R -- diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index 6e0009f..09e5afa 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -1777,7 +1777,7 @@ setMethod("months_between", signature(y = "Column"), #' nanvl #' #' Returns col1 if it is not NaN, or col2 if col1 is NaN. -#' hhBoth inputs should be floating point columns (DoubleType or FloatType). +#' Both inputs should be floating point columns (DoubleType or FloatType). #' #' @rdname nanvl #' @name nanvl @@ -2008,6 +2008,8 @@ setMethod("from_utc_timestamp", signature(y = "Column", x = "character"), #' NOTE: The position is not zero based, but 1 based index, returns 0 if substr #' could not be found in str. 
#' +#' @param y column to check +#' @param x substring to check #' @family string_funcs #' @rdname instr #' @name instr http://git-wip-us.apache.org/repos/asf/spark/blob/57746295/R/pkg/R/generics.R -- diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index 43395aa..7b08a8e 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -59,15 +59,15 @@ setGeneric("count", function(x) { standardGeneric("count") }) # @export setGeneric("countByValue", function(x) { standardGeneric("countByValue") }) -# @rdname statfunctions +# @rdname crosstab # @export setGeneric("crosstab", function(x, col1, col2) { standardGeneric("crosstab") }) -# @rdname statfunctions +# @rdname freqItems # @export setGeneric("freqItems", function(x, cols, support = 0.01) { standardGeneric("freqItems") }) -# @rdname statfunctions +# @rdname approxQuantile # @export setGeneric("approxQuantile", function(x, col, probabilities, relativeError) { @@ -575,7 +575,7 @@ setGeneric("sample", setGeneric("sample_frac", function(x, withReplacement, fraction, seed) { standardGeneric("sample_frac") }) -#' @rdname statfunctions +#' @rdn
spark git commit: [SPARK-16109][SPARKR][DOC] R more doc fixes
Repository: spark Updated Branches: refs/heads/branch-2.0 703a526e7 -> 867baaada [SPARK-16109][SPARKR][DOC] R more doc fixes ## What changes were proposed in this pull request? Found these issues while reviewing for SPARK-16090 ## How was this patch tested? roxygen2 doc gen, checked output html Author: Felix Cheung Closes #13803 from felixcheung/rdocrd. (cherry picked from commit 57746295e6fb705f8393a00ab1cc570ddb7da44e) Signed-off-by: Shivaram Venkataraman Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/867baaad Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/867baaad Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/867baaad Branch: refs/heads/branch-2.0 Commit: 867baaadad48a378b36933df0635a09cddc4c8de Parents: 703a526 Author: Felix Cheung Authored: Tue Jun 21 11:01:42 2016 -0700 Committer: Shivaram Venkataraman Committed: Tue Jun 21 11:01:56 2016 -0700 -- R/pkg/R/DataFrame.R | 7 +-- R/pkg/R/functions.R | 4 +++- R/pkg/R/generics.R | 8 R/pkg/R/schema.R| 7 +-- R/pkg/R/stats.R | 37 +++-- 5 files changed, 40 insertions(+), 23 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/867baaad/R/pkg/R/DataFrame.R -- diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index a8ade1a..ed0bb85 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -59,6 +59,7 @@ setMethod("initialize", "SparkDataFrame", function(.Object, sdf, isCached) { #' @export #' @param sdf A Java object reference to the backing Scala DataFrame #' @param isCached TRUE if the SparkDataFrame is cached +#' @noRd dataFrame <- function(sdf, isCached = FALSE) { new("SparkDataFrame", sdf, isCached) } @@ -119,7 +120,7 @@ setMethod("schema", #' Print the logical and physical Catalyst plans to the console for debugging. #' #' @param x A SparkDataFrame -#' @param extended Logical. If extended is False, explain() only prints the physical plan. +#' @param extended Logical. If extended is FALSE, explain() only prints the physical plan. #' @family SparkDataFrame functions #' @rdname explain #' @name explain @@ -175,6 +176,8 @@ setMethod("isLocal", #' #' @param x A SparkDataFrame #' @param numRows The number of rows to print. Defaults to 20. +#' @param truncate Whether truncate long strings. If true, strings more than 20 characters will be +#' truncated and all cells will be aligned right #' #' @family SparkDataFrame functions #' @rdname showDF @@ -1854,7 +1857,7 @@ setMethod("withColumnRenamed", select(x, cols) }) -#' @param newColPair A named pair of the form new_column_name = existing_column +#' @param ... A named pair of the form new_column_name = existing_column #' @rdname rename #' @name rename #' @export http://git-wip-us.apache.org/repos/asf/spark/blob/867baaad/R/pkg/R/functions.R -- diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index 6e0009f..09e5afa 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -1777,7 +1777,7 @@ setMethod("months_between", signature(y = "Column"), #' nanvl #' #' Returns col1 if it is not NaN, or col2 if col1 is NaN. -#' hhBoth inputs should be floating point columns (DoubleType or FloatType). +#' Both inputs should be floating point columns (DoubleType or FloatType). #' #' @rdname nanvl #' @name nanvl @@ -2008,6 +2008,8 @@ setMethod("from_utc_timestamp", signature(y = "Column", x = "character"), #' NOTE: The position is not zero based, but 1 based index, returns 0 if substr #' could not be found in str. 
#' +#' @param y column to check +#' @param x substring to check #' @family string_funcs #' @rdname instr #' @name instr http://git-wip-us.apache.org/repos/asf/spark/blob/867baaad/R/pkg/R/generics.R -- diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index 43395aa..7b08a8e 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -59,15 +59,15 @@ setGeneric("count", function(x) { standardGeneric("count") }) # @export setGeneric("countByValue", function(x) { standardGeneric("countByValue") }) -# @rdname statfunctions +# @rdname crosstab # @export setGeneric("crosstab", function(x, col1, col2) { standardGeneric("crosstab") }) -# @rdname statfunctions +# @rdname freqItems # @export setGeneric("freqItems", function(x, cols, support = 0.01) { standardGeneric("freqItems") }) -# @rdname statfunctions +# @rdname approxQuantile # @export setGeneric("approxQuantile", function(x, col, probabilities, relativeError) { @@ -575,7 +575,7 @@ setGeneric("sample", setGeneric("sample_frac", fu
spark git commit: [SPARK-15741][PYSPARK][ML] Pyspark cleanup of set default seed to None
Repository: spark Updated Branches: refs/heads/branch-2.0 867baaada -> f805b989b [SPARK-15741][PYSPARK][ML] Pyspark cleanup of set default seed to None ## What changes were proposed in this pull request? Several places set the seed Param default value to None which will translate to a zero value on the Scala side. This is unnecessary because a default fixed value already exists and if a test depends on a zero valued seed, then it should explicitly set it to zero instead of relying on this translation. These cases can be safely removed except for the ALS doc test, which has been changed to set the seed value to zero. ## How was this patch tested? Ran PySpark tests locally Author: Bryan Cutler Closes #13672 from BryanCutler/pyspark-cleanup-setDefault-seed-SPARK-15741. (cherry picked from commit b76e3553760b3c68bebc2c71b0851598718e6f87) Signed-off-by: Xiangrui Meng Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f805b989 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f805b989 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f805b989 Branch: refs/heads/branch-2.0 Commit: f805b989b380981f5515334f9554648f6bf632af Parents: 867baaa Author: Bryan Cutler Authored: Tue Jun 21 11:43:25 2016 -0700 Committer: Xiangrui Meng Committed: Tue Jun 21 11:43:33 2016 -0700 -- python/pyspark/ml/classification.py | 4 ++-- python/pyspark/ml/feature.py| 2 +- python/pyspark/ml/recommendation.py | 4 ++-- python/pyspark/ml/regression.py | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/f805b989/python/pyspark/ml/classification.py -- diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index a3cd917..e86c27e 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -685,7 +685,7 @@ class RandomForestClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPred self._java_obj = self._new_java_obj( "org.apache.spark.ml.classification.RandomForestClassifier", self.uid) self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, - maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, seed=None, + maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity="gini", numTrees=20, featureSubsetStrategy="auto") kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -825,7 +825,7 @@ class GBTClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol "org.apache.spark.ml.classification.GBTClassifier", self.uid) self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, - lossType="logistic", maxIter=20, stepSize=0.1, seed=None) + lossType="logistic", maxIter=20, stepSize=0.1) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) http://git-wip-us.apache.org/repos/asf/spark/blob/f805b989/python/pyspark/ml/feature.py -- diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 1e9ec0f..94f 100755 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -2260,7 +2260,7 @@ class Word2Vec(JavaEstimator, HasStepSize, HasMaxIter, HasSeed, HasInputCol, Has super(Word2Vec, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.Word2Vec", self.uid) self._setDefault(vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1, - seed=None, windowSize=5, maxSentenceLength=1000) + windowSize=5, 
maxSentenceLength=1000) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) http://git-wip-us.apache.org/repos/asf/spark/blob/f805b989/python/pyspark/ml/recommendation.py -- diff --git a/python/pyspark/ml/recommendation.py b/python/pyspark/ml/recommendation.py index 0a70967..e28d38b 100644 --- a/python/pyspark/ml/recommendation.py +++ b/python/pyspark/ml/recommendation.py @@ -68,7 +68,7 @@ class ALS(JavaEstimator, HasCheckpointInterval, HasMaxIter, HasPredictionCol, Ha >>> df = spark.createDataFrame( ... [(0, 0, 4.0), (0, 1, 2.0), (1, 1, 3.0), (1, 2, 4.0), (2, 1, 1.0), (2, 2, 5.0)], ... ["user", "item", "rating"]) ->>> als = ALS(rank=10, maxIter=5) +>>> als = ALS(
spark git commit: [SPARK-15741][PYSPARK][ML] Pyspark cleanup of set default seed to None
Repository: spark Updated Branches: refs/heads/master 57746295e -> b76e35537 [SPARK-15741][PYSPARK][ML] Pyspark cleanup of set default seed to None ## What changes were proposed in this pull request? Several places set the seed Param default value to None which will translate to a zero value on the Scala side. This is unnecessary because a default fixed value already exists and if a test depends on a zero valued seed, then it should explicitly set it to zero instead of relying on this translation. These cases can be safely removed except for the ALS doc test, which has been changed to set the seed value to zero. ## How was this patch tested? Ran PySpark tests locally Author: Bryan Cutler Closes #13672 from BryanCutler/pyspark-cleanup-setDefault-seed-SPARK-15741. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b76e3553 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b76e3553 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b76e3553 Branch: refs/heads/master Commit: b76e3553760b3c68bebc2c71b0851598718e6f87 Parents: 5774629 Author: Bryan Cutler Authored: Tue Jun 21 11:43:25 2016 -0700 Committer: Xiangrui Meng Committed: Tue Jun 21 11:43:25 2016 -0700 -- python/pyspark/ml/classification.py | 4 ++-- python/pyspark/ml/feature.py| 2 +- python/pyspark/ml/recommendation.py | 4 ++-- python/pyspark/ml/regression.py | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b76e3553/python/pyspark/ml/classification.py -- diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index a3cd917..e86c27e 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -685,7 +685,7 @@ class RandomForestClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPred self._java_obj = self._new_java_obj( "org.apache.spark.ml.classification.RandomForestClassifier", self.uid) self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, - maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, seed=None, + maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity="gini", numTrees=20, featureSubsetStrategy="auto") kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -825,7 +825,7 @@ class GBTClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol "org.apache.spark.ml.classification.GBTClassifier", self.uid) self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, - lossType="logistic", maxIter=20, stepSize=0.1, seed=None) + lossType="logistic", maxIter=20, stepSize=0.1) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) http://git-wip-us.apache.org/repos/asf/spark/blob/b76e3553/python/pyspark/ml/feature.py -- diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 1e9ec0f..94f 100755 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -2260,7 +2260,7 @@ class Word2Vec(JavaEstimator, HasStepSize, HasMaxIter, HasSeed, HasInputCol, Has super(Word2Vec, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.Word2Vec", self.uid) self._setDefault(vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1, - seed=None, windowSize=5, maxSentenceLength=1000) + windowSize=5, maxSentenceLength=1000) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) 
http://git-wip-us.apache.org/repos/asf/spark/blob/b76e3553/python/pyspark/ml/recommendation.py -- diff --git a/python/pyspark/ml/recommendation.py b/python/pyspark/ml/recommendation.py index 0a70967..e28d38b 100644 --- a/python/pyspark/ml/recommendation.py +++ b/python/pyspark/ml/recommendation.py @@ -68,7 +68,7 @@ class ALS(JavaEstimator, HasCheckpointInterval, HasMaxIter, HasPredictionCol, Ha >>> df = spark.createDataFrame( ... [(0, 0, 4.0), (0, 1, 2.0), (1, 1, 3.0), (1, 2, 4.0), (2, 1, 1.0), (2, 2, 5.0)], ... ["user", "item", "rating"]) ->>> als = ALS(rank=10, maxIter=5) +>>> als = ALS(rank=10, maxIter=5, seed=0) >>> model = als.fit(df) >>> model.rank 10 @@ -142,7 +142,7 @@ cla
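The rationale, restated: the shared HasSeed param already carries a fixed per-class default on the Python side (the shared-params helper derives it from the class name), so forcing seed=None in _setDefault only translated to a literal 0 in the JVM. Code that genuinely needs a particular seed should set it explicitly, as the ALS doctest now does. A small usage sketch (assumes a running SparkSession; values are illustrative):

```python
from pyspark.ml.recommendation import ALS

als_default = ALS(rank=10, maxIter=5)          # rely on the deterministic per-class default seed
als_pinned = ALS(rank=10, maxIter=5, seed=0)   # pin the seed when a test depends on it
print(als_pinned.getSeed())                    # 0
```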
spark git commit: [SPARK-16037][SQL] Follow-up: add DataFrameWriter.insertInto() test cases for by position resolution
Repository: spark Updated Branches: refs/heads/master b76e35537 -> f4a3d45e3 [SPARK-16037][SQL] Follow-up: add DataFrameWriter.insertInto() test cases for by position resolution ## What changes were proposed in this pull request? This PR migrates some test cases introduced in #12313 as a follow-up of #13754 and #13766. These test cases cover `DataFrameWriter.insertInto()`, while the former two only cover SQL `INSERT` statements. Note that the `testPartitionedTable` utility method tests both Hive SerDe tables and data source tables. ## How was this patch tested? N/A Author: Cheng Lian Closes #13810 from liancheng/spark-16037-follow-up-tests. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f4a3d45e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f4a3d45e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f4a3d45e Branch: refs/heads/master Commit: f4a3d45e38f18278bbdb7cc32486ded50f76d54b Parents: b76e355 Author: Cheng Lian Authored: Tue Jun 21 11:58:33 2016 -0700 Committer: Yin Huai Committed: Tue Jun 21 11:58:33 2016 -0700 -- .../sql/hive/InsertIntoHiveTableSuite.scala | 48 1 file changed, 48 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/f4a3d45e/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertIntoHiveTableSuite.scala -- diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertIntoHiveTableSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertIntoHiveTableSuite.scala index 4643251..d9ce1c3 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertIntoHiveTableSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertIntoHiveTableSuite.scala @@ -469,4 +469,52 @@ class InsertIntoHiveTableSuite extends QueryTest with TestHiveSingleton with Bef ) } } + + testPartitionedTable("insertInto() should match columns by position and ignore column names") { +tableName => + withSQLConf("hive.exec.dynamic.partition.mode" -> "nonstrict") { +// Columns `df.c` and `df.d` are resolved by position, and thus mapped to partition columns +// `b` and `c` of the target table. +val df = Seq((1, 2, 3, 4)).toDF("a", "b", "c", "d") +df.write.insertInto(tableName) + +checkAnswer( + sql(s"SELECT a, b, c, d FROM $tableName"), + Row(1, 3, 4, 2) +) + } + } + + testPartitionedTable("insertInto() should match unnamed columns by position") { +tableName => + withSQLConf("hive.exec.dynamic.partition.mode" -> "nonstrict") { +// Columns `c + 1` and `d + 1` are resolved by position, and thus mapped to partition +// columns `b` and `c` of the target table. +val df = Seq((1, 2, 3, 4)).toDF("a", "b", "c", "d") +df.select('a + 1, 'b + 1, 'c + 1, 'd + 1).write.insertInto(tableName) + +checkAnswer( + sql(s"SELECT a, b, c, d FROM $tableName"), + Row(2, 4, 5, 3) +) + } + } + + testPartitionedTable("insertInto() should reject missing columns") { +tableName => + sql("CREATE TABLE t (a INT, b INT)") + + intercept[AnalysisException] { +spark.table("t").write.insertInto(tableName) + } + } + + testPartitionedTable("insertInto() should reject extra columns") { +tableName => + sql("CREATE TABLE t (a INT, b INT, c INT, d INT, e INT)") + + intercept[AnalysisException] { +spark.table("t").write.insertInto(tableName) + } + } } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-16037][SQL] Follow-up: add DataFrameWriter.insertInto() test cases for by position resolution
Repository: spark Updated Branches: refs/heads/branch-2.0 f805b989b -> 0d7e1d11d [SPARK-16037][SQL] Follow-up: add DataFrameWriter.insertInto() test cases for by position resolution ## What changes were proposed in this pull request? This PR migrates some test cases introduced in #12313 as a follow-up of #13754 and #13766. These test cases cover `DataFrameWriter.insertInto()`, while the former two only cover SQL `INSERT` statements. Note that the `testPartitionedTable` utility method tests both Hive SerDe tables and data source tables. ## How was this patch tested? N/A Author: Cheng Lian Closes #13810 from liancheng/spark-16037-follow-up-tests. (cherry picked from commit f4a3d45e38f18278bbdb7cc32486ded50f76d54b) Signed-off-by: Yin Huai Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0d7e1d11 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0d7e1d11 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0d7e1d11 Branch: refs/heads/branch-2.0 Commit: 0d7e1d11d2ea2b7005208951518fdf882fc36ec2 Parents: f805b98 Author: Cheng Lian Authored: Tue Jun 21 11:58:33 2016 -0700 Committer: Yin Huai Committed: Tue Jun 21 11:58:54 2016 -0700 -- .../sql/hive/InsertIntoHiveTableSuite.scala | 48 1 file changed, 48 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/0d7e1d11/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertIntoHiveTableSuite.scala -- diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertIntoHiveTableSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertIntoHiveTableSuite.scala index 4643251..d9ce1c3 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertIntoHiveTableSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertIntoHiveTableSuite.scala @@ -469,4 +469,52 @@ class InsertIntoHiveTableSuite extends QueryTest with TestHiveSingleton with Bef ) } } + + testPartitionedTable("insertInto() should match columns by position and ignore column names") { +tableName => + withSQLConf("hive.exec.dynamic.partition.mode" -> "nonstrict") { +// Columns `df.c` and `df.d` are resolved by position, and thus mapped to partition columns +// `b` and `c` of the target table. +val df = Seq((1, 2, 3, 4)).toDF("a", "b", "c", "d") +df.write.insertInto(tableName) + +checkAnswer( + sql(s"SELECT a, b, c, d FROM $tableName"), + Row(1, 3, 4, 2) +) + } + } + + testPartitionedTable("insertInto() should match unnamed columns by position") { +tableName => + withSQLConf("hive.exec.dynamic.partition.mode" -> "nonstrict") { +// Columns `c + 1` and `d + 1` are resolved by position, and thus mapped to partition +// columns `b` and `c` of the target table. +val df = Seq((1, 2, 3, 4)).toDF("a", "b", "c", "d") +df.select('a + 1, 'b + 1, 'c + 1, 'd + 1).write.insertInto(tableName) + +checkAnswer( + sql(s"SELECT a, b, c, d FROM $tableName"), + Row(2, 4, 5, 3) +) + } + } + + testPartitionedTable("insertInto() should reject missing columns") { +tableName => + sql("CREATE TABLE t (a INT, b INT)") + + intercept[AnalysisException] { +spark.table("t").write.insertInto(tableName) + } + } + + testPartitionedTable("insertInto() should reject extra columns") { +tableName => + sql("CREATE TABLE t (a INT, b INT, c INT, d INT, e INT)") + + intercept[AnalysisException] { +spark.table("t").write.insertInto(tableName) + } + } } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
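The positional contract these tests exercise also holds through the Python writer API: insertInto() ignores the DataFrame's column names and maps columns to the target table strictly by position, rejecting frames with too few or too many columns. A hedged PySpark sketch (assumes a Hive-enabled session, as in the suite; the table name is made up):

```python
spark.sql("CREATE TABLE target (a INT, b INT, c INT, d INT)")

# Column names are deliberately reversed to show they are ignored.
df = spark.createDataFrame([(1, 2, 3, 4)], ["d", "c", "b", "a"])
df.write.insertInto("target")        # lands as a=1, b=2, c=3, d=4
spark.table("target").show()
```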
spark git commit: [SPARK-16002][SQL] Sleep when no new data arrives to avoid 100% CPU usage
Repository: spark Updated Branches: refs/heads/master f4a3d45e3 -> c399c7f0e [SPARK-16002][SQL] Sleep when no new data arrives to avoid 100% CPU usage ## What changes were proposed in this pull request? Add a configuration to allow people to set a minimum polling delay when no new data arrives (default is 10ms). This PR also cleans up some INFO logs. ## How was this patch tested? Existing unit tests. Author: Shixiong Zhu Closes #13718 from zsxwing/SPARK-16002. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c399c7f0 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c399c7f0 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c399c7f0 Branch: refs/heads/master Commit: c399c7f0e485dcfc6cbc343bc246b8adc3f0648c Parents: f4a3d45 Author: Shixiong Zhu Authored: Tue Jun 21 12:42:49 2016 -0700 Committer: Yin Huai Committed: Tue Jun 21 12:42:49 2016 -0700 -- .../scala/org/apache/spark/util/ManualClock.scala | 18 +++--- .../datasources/ListingFileCatalog.scala | 2 +- .../datasources/fileSourceInterfaces.scala| 2 +- .../execution/streaming/FileStreamSource.scala| 8 +++- .../sql/execution/streaming/StreamExecution.scala | 5 + .../org/apache/spark/sql/internal/SQLConf.scala | 9 - .../apache/spark/sql/streaming/StreamTest.scala | 5 + 7 files changed, 42 insertions(+), 7 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/c399c7f0/core/src/main/scala/org/apache/spark/util/ManualClock.scala -- diff --git a/core/src/main/scala/org/apache/spark/util/ManualClock.scala b/core/src/main/scala/org/apache/spark/util/ManualClock.scala index e7a65d7..91a9587 100644 --- a/core/src/main/scala/org/apache/spark/util/ManualClock.scala +++ b/core/src/main/scala/org/apache/spark/util/ManualClock.scala @@ -26,6 +26,8 @@ package org.apache.spark.util */ private[spark] class ManualClock(private var time: Long) extends Clock { + private var _isWaiting = false + /** * @return `ManualClock` with initial time 0 */ @@ -57,9 +59,19 @@ private[spark] class ManualClock(private var time: Long) extends Clock { * @return current time reported by the clock when waiting finishes */ def waitTillTime(targetTime: Long): Long = synchronized { -while (time < targetTime) { - wait(10) +_isWaiting = true +try { + while (time < targetTime) { +wait(10) + } + getTimeMillis() +} finally { + _isWaiting = false } -getTimeMillis() } + + /** + * Returns whether there is any thread being blocked in `waitTillTime`. 
+ */ + def isWaiting: Boolean = synchronized { _isWaiting } } http://git-wip-us.apache.org/repos/asf/spark/blob/c399c7f0/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalog.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalog.scala index d96cf1b..f713fde 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalog.scala @@ -82,7 +82,7 @@ class ListingFileCatalog( val pathFilter = FileInputFormat.getInputPathFilter(jobConf) val statuses: Seq[FileStatus] = paths.flatMap { path => val fs = path.getFileSystem(hadoopConf) -logInfo(s"Listing $path on driver") +logTrace(s"Listing $path on driver") Try { HadoopFsRelation.listLeafFiles(fs, fs.getFileStatus(path), pathFilter) }.getOrElse(Array.empty[FileStatus]) http://git-wip-us.apache.org/repos/asf/spark/blob/c399c7f0/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/fileSourceInterfaces.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/fileSourceInterfaces.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/fileSourceInterfaces.scala index 4ac555b..521eb7f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/fileSourceInterfaces.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/fileSourceInterfaces.scala @@ -389,7 +389,7 @@ private[sql] object HadoopFsRelation extends Logging { // tasks/jobs may leave partial/corrupted data files there. Files and directories whose name // start with "." are also ignored. def listLeafFiles(fs: FileSystem, status: FileStatus, filter: PathFi
spark git commit: [SPARK-16002][SQL] Sleep when no new data arrives to avoid 100% CPU usage
Repository: spark Updated Branches: refs/heads/branch-2.0 0d7e1d11d -> afa14b71b [SPARK-16002][SQL] Sleep when no new data arrives to avoid 100% CPU usage ## What changes were proposed in this pull request? Add a configuration to allow people to set a minimum polling delay when no new data arrives (default is 10ms). This PR also cleans up some INFO logs. ## How was this patch tested? Existing unit tests. Author: Shixiong Zhu Closes #13718 from zsxwing/SPARK-16002. (cherry picked from commit c399c7f0e485dcfc6cbc343bc246b8adc3f0648c) Signed-off-by: Yin Huai Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/afa14b71 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/afa14b71 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/afa14b71 Branch: refs/heads/branch-2.0 Commit: afa14b71b28d788c53816bd2616ccff0c3967f40 Parents: 0d7e1d1 Author: Shixiong Zhu Authored: Tue Jun 21 12:42:49 2016 -0700 Committer: Yin Huai Committed: Tue Jun 21 12:43:11 2016 -0700 -- .../scala/org/apache/spark/util/ManualClock.scala | 18 +++--- .../datasources/ListingFileCatalog.scala | 2 +- .../datasources/fileSourceInterfaces.scala| 2 +- .../execution/streaming/FileStreamSource.scala| 8 +++- .../sql/execution/streaming/StreamExecution.scala | 5 + .../org/apache/spark/sql/internal/SQLConf.scala | 9 - .../apache/spark/sql/streaming/StreamTest.scala | 5 + 7 files changed, 42 insertions(+), 7 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/afa14b71/core/src/main/scala/org/apache/spark/util/ManualClock.scala -- diff --git a/core/src/main/scala/org/apache/spark/util/ManualClock.scala b/core/src/main/scala/org/apache/spark/util/ManualClock.scala index e7a65d7..91a9587 100644 --- a/core/src/main/scala/org/apache/spark/util/ManualClock.scala +++ b/core/src/main/scala/org/apache/spark/util/ManualClock.scala @@ -26,6 +26,8 @@ package org.apache.spark.util */ private[spark] class ManualClock(private var time: Long) extends Clock { + private var _isWaiting = false + /** * @return `ManualClock` with initial time 0 */ @@ -57,9 +59,19 @@ private[spark] class ManualClock(private var time: Long) extends Clock { * @return current time reported by the clock when waiting finishes */ def waitTillTime(targetTime: Long): Long = synchronized { -while (time < targetTime) { - wait(10) +_isWaiting = true +try { + while (time < targetTime) { +wait(10) + } + getTimeMillis() +} finally { + _isWaiting = false } -getTimeMillis() } + + /** + * Returns whether there is any thread being blocked in `waitTillTime`. 
+ */ + def isWaiting: Boolean = synchronized { _isWaiting } } http://git-wip-us.apache.org/repos/asf/spark/blob/afa14b71/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalog.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalog.scala index d96cf1b..f713fde 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalog.scala @@ -82,7 +82,7 @@ class ListingFileCatalog( val pathFilter = FileInputFormat.getInputPathFilter(jobConf) val statuses: Seq[FileStatus] = paths.flatMap { path => val fs = path.getFileSystem(hadoopConf) -logInfo(s"Listing $path on driver") +logTrace(s"Listing $path on driver") Try { HadoopFsRelation.listLeafFiles(fs, fs.getFileStatus(path), pathFilter) }.getOrElse(Array.empty[FileStatus]) http://git-wip-us.apache.org/repos/asf/spark/blob/afa14b71/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/fileSourceInterfaces.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/fileSourceInterfaces.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/fileSourceInterfaces.scala index bfb34c0..5689cf6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/fileSourceInterfaces.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/fileSourceInterfaces.scala @@ -388,7 +388,7 @@ private[sql] object HadoopFsRelation extends Logging { // tasks/jobs may leave partial/corrupted data files there. Files and directories whose name // s
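In practice the new knob is set like any other SQL conf; based on this commit the key is spark.sql.streaming.pollingDelay with a 10ms default, though the exact name should be treated as an assumption here since the SQLConf hunk is truncated above:

```python
# Back off a little longer between empty micro-batches on an otherwise idle stream.
spark.conf.set("spark.sql.streaming.pollingDelay", "100ms")   # key name assumed, see note above
```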
spark git commit: [MINOR][MLLIB] move setCheckpointInterval to non-expert setters
Repository: spark Updated Branches: refs/heads/branch-2.0 afa14b71b -> 591bf7909 [MINOR][MLLIB] move setCheckpointInterval to non-expert setters ## What changes were proposed in this pull request? The `checkpointInterval` is a non-expert param. This PR moves its setter to non-expert group. Author: Xiangrui Meng Closes #13813 from mengxr/checkpoint-non-expert. (cherry picked from commit 918c91954fb46400ce2c5ab066d2ec0ae48dda4a) Signed-off-by: Xiangrui Meng Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/591bf790 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/591bf790 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/591bf790 Branch: refs/heads/branch-2.0 Commit: 591bf79093933429d4a5d0b0797961f9eb9566eb Parents: afa14b7 Author: Xiangrui Meng Authored: Tue Jun 21 13:35:06 2016 -0700 Committer: Xiangrui Meng Committed: Tue Jun 21 13:35:13 2016 -0700 -- mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/591bf790/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala index d7559f8..57c7e44 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala @@ -151,7 +151,7 @@ private[ml] trait DecisionTreeParams extends PredictorParams * [[org.apache.spark.SparkContext]]. * Must be >= 1. * (default = 10) - * @group expertSetParam + * @group setParam */ def setCheckpointInterval(value: Int): this.type = set(checkpointInterval, value) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [MINOR][MLLIB] move setCheckpointInterval to non-expert setters
Repository: spark Updated Branches: refs/heads/master c399c7f0e -> 918c91954 [MINOR][MLLIB] move setCheckpointInterval to non-expert setters ## What changes were proposed in this pull request? The `checkpointInterval` is a non-expert param. This PR moves its setter to non-expert group. Author: Xiangrui Meng Closes #13813 from mengxr/checkpoint-non-expert. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/918c9195 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/918c9195 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/918c9195 Branch: refs/heads/master Commit: 918c91954fb46400ce2c5ab066d2ec0ae48dda4a Parents: c399c7f Author: Xiangrui Meng Authored: Tue Jun 21 13:35:06 2016 -0700 Committer: Xiangrui Meng Committed: Tue Jun 21 13:35:06 2016 -0700 -- mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/918c9195/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala index d7559f8..57c7e44 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala @@ -151,7 +151,7 @@ private[ml] trait DecisionTreeParams extends PredictorParams * [[org.apache.spark.SparkContext]]. * Must be >= 1. * (default = 10) - * @group expertSetParam + * @group setParam */ def setCheckpointInterval(value: Int): this.type = set(checkpointInterval, value) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
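checkpointInterval is an ordinary knob for long iterative training runs (it bounds lineage growth of the intermediate RDDs once a checkpoint directory is set), which is why it no longer belongs in the expert group. Shown here through the Python API with illustrative values:

```python
from pyspark.ml.classification import GBTClassifier

sc.setCheckpointDir("/tmp/spark-checkpoints")           # illustrative path; sc is the SparkContext
gbt = GBTClassifier(maxIter=50, checkpointInterval=10)  # checkpoint every 10 iterations
```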
spark git commit: [SPARK-16096][SPARKR] add union and deprecate unionAll
Repository: spark Updated Branches: refs/heads/master 918c91954 -> dbfdae4e4 [SPARK-16096][SPARKR] add union and deprecate unionAll ## What changes were proposed in this pull request? add union and deprecate unionAll, separate roxygen2 doc for rbind (since their usage and parameter lists are quite different) `explode` is also deprecated - but seems like replacement is a combination of calls; not sure if we should deprecate it in SparkR, yet. ## How was this patch tested? unit tests, manual checks for r doc Author: Felix Cheung Closes #13805 from felixcheung/runion. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/dbfdae4e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/dbfdae4e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/dbfdae4e Branch: refs/heads/master Commit: dbfdae4e41a900de01b48639d6554d32edbb2e0b Parents: 918c919 Author: Felix Cheung Authored: Tue Jun 21 13:36:50 2016 -0700 Committer: Shivaram Venkataraman Committed: Tue Jun 21 13:36:50 2016 -0700 -- R/pkg/NAMESPACE | 1 + R/pkg/R/DataFrame.R | 43 -- R/pkg/R/generics.R| 6 +++- R/pkg/inst/tests/testthat/test_context.R | 2 +- R/pkg/inst/tests/testthat/test_sparkSQL.R | 8 +++-- 5 files changed, 47 insertions(+), 13 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/dbfdae4e/R/pkg/NAMESPACE -- diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index ea42888..2272d8b 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -107,6 +107,7 @@ exportMethods("arrange", "summary", "take", "transform", + "union", "unionAll", "unique", "unpersist", http://git-wip-us.apache.org/repos/asf/spark/blob/dbfdae4e/R/pkg/R/DataFrame.R -- diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index ed0bb85..725cbf2 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -2251,7 +2251,7 @@ generateAliasesForIntersectedCols <- function (x, intersectedColNames, suffix) { cols } -#' rbind +#' Return a new SparkDataFrame containing the union of rows #' #' Return a new SparkDataFrame containing the union of rows in this SparkDataFrame #' and another SparkDataFrame. This is equivalent to `UNION ALL` in SQL. @@ -2261,39 +2261,64 @@ generateAliasesForIntersectedCols <- function (x, intersectedColNames, suffix) { #' @param y A SparkDataFrame #' @return A SparkDataFrame containing the result of the union. #' @family SparkDataFrame functions -#' @rdname rbind -#' @name unionAll +#' @rdname union +#' @name union +#' @seealso \link{rbind} #' @export #' @examples #'\dontrun{ #' sparkR.session() #' df1 <- read.json(path) #' df2 <- read.json(path2) -#' unioned <- unionAll(df, df2) +#' unioned <- union(df, df2) +#' unions <- rbind(df, df2, df3, df4) #' } +#' @note union since 2.0.0 +setMethod("union", + signature(x = "SparkDataFrame", y = "SparkDataFrame"), + function(x, y) { +unioned <- callJMethod(x@sdf, "union", y@sdf) +dataFrame(unioned) + }) + +#' unionAll is deprecated - use union instead +#' @rdname union +#' @name unionAll +#' @export #' @note unionAll since 1.4.0 setMethod("unionAll", signature(x = "SparkDataFrame", y = "SparkDataFrame"), function(x, y) { -unioned <- callJMethod(x@sdf, "unionAll", y@sdf) -dataFrame(unioned) +.Deprecated("union") +union(x, y) }) #' Union two or more SparkDataFrames #' -#' Returns a new SparkDataFrame containing rows of all parameters. +#' Union two or more SparkDataFrames. This is equivalent to `UNION ALL` in SQL. +#' Note that this does not remove duplicate rows across the two SparkDataFrames. 
#' +#' @param x A SparkDataFrame +#' @param ... Additional SparkDataFrame +#' @return A SparkDataFrame containing the result of the union. +#' @family SparkDataFrame functions #' @rdname rbind #' @name rbind +#' @seealso \link{union} #' @export +#' @examples +#'\dontrun{ +#' sparkR.session() +#' unions <- rbind(df, df2, df3, df4) +#' } #' @note rbind since 1.5.0 setMethod("rbind", signature(... = "SparkDataFrame"), function(x, ..., deparse.level = 1) { if (nargs() == 3) { - unionAll(x, ...) + union(x, ...) } else { - unionAll(x, Recall(..., deparse.level = 1)) + union(x, Recall(..., deparse.level = 1)) } }) http://git-wip-us.apache.org/repos/asf/spark/blob/dbfdae4e/R/pkg/R/
spark git commit: [SPARK-16096][SPARKR] add union and deprecate unionAll
Repository: spark Updated Branches: refs/heads/branch-2.0 591bf7909 -> aeda9a153 [SPARK-16096][SPARKR] add union and deprecate unionAll ## What changes were proposed in this pull request? add union and deprecate unionAll, separate roxygen2 doc for rbind (since their usage and parameter lists are quite different) `explode` is also deprecated - but seems like replacement is a combination of calls; not sure if we should deprecate it in SparkR, yet. ## How was this patch tested? unit tests, manual checks for r doc Author: Felix Cheung Closes #13805 from felixcheung/runion. (cherry picked from commit dbfdae4e41a900de01b48639d6554d32edbb2e0b) Signed-off-by: Shivaram Venkataraman Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/aeda9a15 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/aeda9a15 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/aeda9a15 Branch: refs/heads/branch-2.0 Commit: aeda9a153c117921e95cf204daab0df3202f1d95 Parents: 591bf79 Author: Felix Cheung Authored: Tue Jun 21 13:36:50 2016 -0700 Committer: Shivaram Venkataraman Committed: Tue Jun 21 13:36:58 2016 -0700 -- R/pkg/NAMESPACE | 1 + R/pkg/R/DataFrame.R | 43 -- R/pkg/R/generics.R| 6 +++- R/pkg/inst/tests/testthat/test_context.R | 2 +- R/pkg/inst/tests/testthat/test_sparkSQL.R | 8 +++-- 5 files changed, 47 insertions(+), 13 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/aeda9a15/R/pkg/NAMESPACE -- diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index ea42888..2272d8b 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -107,6 +107,7 @@ exportMethods("arrange", "summary", "take", "transform", + "union", "unionAll", "unique", "unpersist", http://git-wip-us.apache.org/repos/asf/spark/blob/aeda9a15/R/pkg/R/DataFrame.R -- diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index ed0bb85..725cbf2 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -2251,7 +2251,7 @@ generateAliasesForIntersectedCols <- function (x, intersectedColNames, suffix) { cols } -#' rbind +#' Return a new SparkDataFrame containing the union of rows #' #' Return a new SparkDataFrame containing the union of rows in this SparkDataFrame #' and another SparkDataFrame. This is equivalent to `UNION ALL` in SQL. @@ -2261,39 +2261,64 @@ generateAliasesForIntersectedCols <- function (x, intersectedColNames, suffix) { #' @param y A SparkDataFrame #' @return A SparkDataFrame containing the result of the union. #' @family SparkDataFrame functions -#' @rdname rbind -#' @name unionAll +#' @rdname union +#' @name union +#' @seealso \link{rbind} #' @export #' @examples #'\dontrun{ #' sparkR.session() #' df1 <- read.json(path) #' df2 <- read.json(path2) -#' unioned <- unionAll(df, df2) +#' unioned <- union(df, df2) +#' unions <- rbind(df, df2, df3, df4) #' } +#' @note union since 2.0.0 +setMethod("union", + signature(x = "SparkDataFrame", y = "SparkDataFrame"), + function(x, y) { +unioned <- callJMethod(x@sdf, "union", y@sdf) +dataFrame(unioned) + }) + +#' unionAll is deprecated - use union instead +#' @rdname union +#' @name unionAll +#' @export #' @note unionAll since 1.4.0 setMethod("unionAll", signature(x = "SparkDataFrame", y = "SparkDataFrame"), function(x, y) { -unioned <- callJMethod(x@sdf, "unionAll", y@sdf) -dataFrame(unioned) +.Deprecated("union") +union(x, y) }) #' Union two or more SparkDataFrames #' -#' Returns a new SparkDataFrame containing rows of all parameters. +#' Union two or more SparkDataFrames. This is equivalent to `UNION ALL` in SQL. 
+#' Note that this does not remove duplicate rows across the two SparkDataFrames. #' +#' @param x A SparkDataFrame +#' @param ... Additional SparkDataFrame +#' @return A SparkDataFrame containing the result of the union. +#' @family SparkDataFrame functions #' @rdname rbind #' @name rbind +#' @seealso \link{union} #' @export +#' @examples +#'\dontrun{ +#' sparkR.session() +#' unions <- rbind(df, df2, df3, df4) +#' } #' @note rbind since 1.5.0 setMethod("rbind", signature(... = "SparkDataFrame"), function(x, ..., deparse.level = 1) { if (nargs() == 3) { - unionAll(x, ...) + union(x, ...) } else { - unionAll(x, Recall(..., deparse.level = 1)) + union(x, Recall(..., dep
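This mirrors the Scala and Python APIs, where unionAll was likewise deprecated in favor of union in 2.0; in every language binding the operation keeps UNION ALL semantics, combining rows by column position and retaining duplicates, so an explicit distinct() is still needed for set-style union. A PySpark illustration of the semantics (the SparkR change itself only touches the R API):

```python
df1 = spark.createDataFrame([(1, "a")], ["id", "v"])
df2 = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "v"])

unioned = df1.union(df2)            # the duplicate (1, "a") row is kept
print(unioned.count())              # 3
print(unioned.distinct().count())   # 2
```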
spark git commit: [SPARK-15606][CORE] Use non-blocking removeExecutor call to avoid deadlocks
Repository: spark Updated Branches: refs/heads/branch-1.6 abe36c53d -> d98fb19c1 [SPARK-15606][CORE] Use non-blocking removeExecutor call to avoid deadlocks ## What changes were proposed in this pull request? Set minimum number of dispatcher threads to 3 to avoid deadlocks on machines with only 2 cores ## How was this patch tested? Spark test builds Author: Pete Robbins Closes #13355 from robbinspg/SPARK-13906. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d98fb19c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d98fb19c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d98fb19c Branch: refs/heads/branch-1.6 Commit: d98fb19c18f0122f335e5d810a2f8ff752b98d86 Parents: abe36c5 Author: Pete Robbins Authored: Thu Jun 2 10:14:51 2016 -0700 Committer: Shixiong Zhu Committed: Tue Jun 21 14:21:51 2016 -0700 -- .../scheduler/cluster/CoarseGrainedSchedulerBackend.scala| 2 +- .../scala/org/apache/spark/storage/BlockManagerMaster.scala | 8 2 files changed, 9 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d98fb19c/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala -- diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala index 87f2dbf..75b1d29 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala @@ -273,7 +273,7 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp // manager to reregister itself. If that happens, the block manager master will know // about the executor, but the scheduler will not. Therefore, we should remove the // executor from the block manager when we hit this case. - scheduler.sc.env.blockManager.master.removeExecutor(executorId) + scheduler.sc.env.blockManager.master.removeExecutorAsync(executorId) logInfo(s"Asked to remove non-existent executor $executorId") } } http://git-wip-us.apache.org/repos/asf/spark/blob/d98fb19c/core/src/main/scala/org/apache/spark/storage/BlockManagerMaster.scala -- diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManagerMaster.scala b/core/src/main/scala/org/apache/spark/storage/BlockManagerMaster.scala index 440c4c1..10e1d9e 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockManagerMaster.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockManagerMaster.scala @@ -41,6 +41,14 @@ class BlockManagerMaster( logInfo("Removed " + execId + " successfully in removeExecutor") } + /** Request removal of a dead executor from the driver endpoint. + * This is only called on the driver side. Non-blocking + */ + def removeExecutorAsync(execId: String) { +driverEndpoint.ask[Boolean](RemoveExecutor(execId)) +logInfo("Removal of executor " + execId + " requested") + } + /** Register the BlockManager's id with the driver. */ def registerBlockManager( blockManagerId: BlockManagerId, maxMemSize: Long, slaveEndpoint: RpcEndpointRef): Unit = { - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
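The deadlock class being avoided: a handler that blocks waiting for a reply from the same small dispatcher pool it is running on can starve the thread that would produce that reply. This is not Spark's RPC code, just a minimal Python analogy (assuming a nearly exhausted pool) of why the fire-and-forget removeExecutorAsync variant is safe where the blocking call was not:

```python
from concurrent.futures import ThreadPoolExecutor

dispatcher = ThreadPoolExecutor(max_workers=1)   # stand-in for a tiny dispatcher pool

def remove_executor_blocking():
    reply = dispatcher.submit(lambda: True)      # needs a free dispatcher thread...
    return reply.result()                        # ...while holding the only one: deadlock

def remove_executor_async():
    dispatcher.submit(lambda: True)              # request removal, do not wait for the reply
    return "removal requested"

print(dispatcher.submit(remove_executor_async).result())   # completes
# dispatcher.submit(remove_executor_blocking).result() would hang forever here.
```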
spark git commit: [SPARK-16117][MLLIB] hide LibSVMFileFormat and move its doc to LibSVMDataSource
Repository: spark Updated Branches: refs/heads/branch-2.0 aeda9a153 -> 8d5a62d5f [SPARK-16117][MLLIB] hide LibSVMFileFormat and move its doc to LibSVMDataSource ## What changes were proposed in this pull request? LibSVMFileFormat implements data source for LIBSVM format. However, users do not really need to call its APIs to use it. So we should hide it in the public API docs. The main issue is that we still need to put the documentation and example code somewhere. The proposal it to have a dummy class to hold the documentation, as a workaround to https://issues.scala-lang.org/browse/SI-8124. ## How was this patch tested? Manually checked the generated API doc and tested loading LIBSVM data. Author: Xiangrui Meng Closes #13819 from mengxr/SPARK-16117. (cherry picked from commit f4e8c31adf45af05751e0d77aefb5cacc58375ee) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8d5a62d5 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8d5a62d5 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8d5a62d5 Branch: refs/heads/branch-2.0 Commit: 8d5a62d5f8ee9a2f4643075efdbee9822e85d141 Parents: aeda9a1 Author: Xiangrui Meng Authored: Tue Jun 21 15:46:14 2016 -0700 Committer: Reynold Xin Committed: Tue Jun 21 15:46:19 2016 -0700 -- .../ml/source/libsvm/LibSVMDataSource.scala | 56 .../spark/ml/source/libsvm/LibSVMRelation.scala | 41 ++ 2 files changed, 59 insertions(+), 38 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/8d5a62d5/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMDataSource.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMDataSource.scala b/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMDataSource.scala new file mode 100644 index 000..73d8130 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMDataSource.scala @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.source.libsvm + +import org.apache.spark.ml.linalg.Vector +import org.apache.spark.sql.{DataFrame, DataFrameReader} + +/** + * `libsvm` package implements Spark SQL data source API for loading LIBSVM data as [[DataFrame]]. + * The loaded [[DataFrame]] has two columns: `label` containing labels stored as doubles and + * `features` containing feature vectors stored as [[Vector]]s. 
+ * + * To use LIBSVM data source, you need to set "libsvm" as the format in [[DataFrameReader]] and + * optionally specify options, for example: + * {{{ + * // Scala + * val df = spark.read.format("libsvm") + * .option("numFeatures", "780") + * .load("data/mllib/sample_libsvm_data.txt") + * + * // Java + * Dataset<Row> df = spark.read().format("libsvm") + * .option("numFeatures", "780") + * .load("data/mllib/sample_libsvm_data.txt"); + * }}} + * + * LIBSVM data source supports the following options: + * - "numFeatures": number of features. + *If unspecified or nonpositive, the number of features will be determined automatically at the + *cost of one additional pass. + *This is also useful when the dataset is already split into multiple files and you want to load + *them separately, because some features may not be present in certain files, which leads to + *inconsistent feature dimensions. + * - "vectorType": feature vector type, "sparse" (default) or "dense". + * + * Note that this class is public for documentation purposes. Please don't use this class directly. + * Rather, use the data source API as illustrated above. + * + * @see [[https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/ LIBSVM datasets]] + */ +class LibSVMDataSource private() {} http://git-wip-us.apache.org/repos/asf/spark/blob/8d5a62d5/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala -- diff
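Since the class above exists mainly to carry usage documentation, a slightly fuller loading sketch may help. It assumes a local SparkSession and the sample LIBSVM file bundled with a Spark checkout; both are assumptions, not part of this patch:

    import org.apache.spark.sql.SparkSession

    object LibSVMLoadExample {
      def main(args: Array[String]): Unit = {
        val spark = SparkSession.builder().master("local[*]").appName("libsvm-load").getOrCreate()

        val df = spark.read.format("libsvm")
          .option("numFeatures", "780")              // optional; avoids the extra pass that infers the dimension
          .load("data/mllib/sample_libsvm_data.txt") // path assumed relative to a Spark source tree

        df.printSchema()                             // label: double, features: vector
        df.show(5, truncate = false)
        spark.stop()
      }
    }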
spark git commit: [SPARK-16117][MLLIB] hide LibSVMFileFormat and move its doc to LibSVMDataSource
Repository: spark Updated Branches: refs/heads/master dbfdae4e4 -> f4e8c31ad [SPARK-16117][MLLIB] hide LibSVMFileFormat and move its doc to LibSVMDataSource ## What changes were proposed in this pull request? LibSVMFileFormat implements data source for LIBSVM format. However, users do not really need to call its APIs to use it. So we should hide it in the public API docs. The main issue is that we still need to put the documentation and example code somewhere. The proposal it to have a dummy class to hold the documentation, as a workaround to https://issues.scala-lang.org/browse/SI-8124. ## How was this patch tested? Manually checked the generated API doc and tested loading LIBSVM data. Author: Xiangrui Meng Closes #13819 from mengxr/SPARK-16117. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f4e8c31a Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f4e8c31a Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f4e8c31a Branch: refs/heads/master Commit: f4e8c31adf45af05751e0d77aefb5cacc58375ee Parents: dbfdae4 Author: Xiangrui Meng Authored: Tue Jun 21 15:46:14 2016 -0700 Committer: Reynold Xin Committed: Tue Jun 21 15:46:14 2016 -0700 -- .../ml/source/libsvm/LibSVMDataSource.scala | 56 .../spark/ml/source/libsvm/LibSVMRelation.scala | 41 ++ 2 files changed, 59 insertions(+), 38 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/f4e8c31a/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMDataSource.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMDataSource.scala b/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMDataSource.scala new file mode 100644 index 000..73d8130 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMDataSource.scala @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.source.libsvm + +import org.apache.spark.ml.linalg.Vector +import org.apache.spark.sql.{DataFrame, DataFrameReader} + +/** + * `libsvm` package implements Spark SQL data source API for loading LIBSVM data as [[DataFrame]]. + * The loaded [[DataFrame]] has two columns: `label` containing labels stored as doubles and + * `features` containing feature vectors stored as [[Vector]]s. 
+ * + * To use LIBSVM data source, you need to set "libsvm" as the format in [[DataFrameReader]] and + * optionally specify options, for example: + * {{{ + * // Scala + * val df = spark.read.format("libsvm") + * .option("numFeatures", "780") + * .load("data/mllib/sample_libsvm_data.txt") + * + * // Java + * Dataset<Row> df = spark.read().format("libsvm") + * .option("numFeatures", "780") + * .load("data/mllib/sample_libsvm_data.txt"); + * }}} + * + * LIBSVM data source supports the following options: + * - "numFeatures": number of features. + *If unspecified or nonpositive, the number of features will be determined automatically at the + *cost of one additional pass. + *This is also useful when the dataset is already split into multiple files and you want to load + *them separately, because some features may not be present in certain files, which leads to + *inconsistent feature dimensions. + * - "vectorType": feature vector type, "sparse" (default) or "dense". + * + * Note that this class is public for documentation purposes. Please don't use this class directly. + * Rather, use the data source API as illustrated above. + * + * @see [[https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/ LIBSVM datasets]] + */ +class LibSVMDataSource private() {} http://git-wip-us.apache.org/repos/asf/spark/blob/f4e8c31a/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala b/mllib/src/main/sc
spark git commit: [SPARK-16118][MLLIB] add getDropLast to OneHotEncoder
Repository: spark Updated Branches: refs/heads/master f4e8c31ad -> 9493b079a [SPARK-16118][MLLIB] add getDropLast to OneHotEncoder ## What changes were proposed in this pull request? We forgot the getter of `dropLast` in `OneHotEncoder` ## How was this patch tested? unit test Author: Xiangrui Meng Closes #13821 from mengxr/SPARK-16118. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9493b079 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9493b079 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9493b079 Branch: refs/heads/master Commit: 9493b079a0050f0a6f4936c17622b96fb185b67f Parents: f4e8c31 Author: Xiangrui Meng Authored: Tue Jun 21 15:52:31 2016 -0700 Committer: Xiangrui Meng Committed: Tue Jun 21 15:52:31 2016 -0700 -- .../main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala | 4 .../scala/org/apache/spark/ml/feature/OneHotEncoderSuite.scala | 4 +++- 2 files changed, 7 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/9493b079/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala index 4fafc1e..01828ed 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala @@ -59,6 +59,10 @@ class OneHotEncoder @Since("1.4.0") (@Since("1.4.0") override val uid: String) e new BooleanParam(this, "dropLast", "whether to drop the last category") setDefault(dropLast -> true) + /** @group getParam */ + @Since("2.0.0") + def getDropLast: Boolean = $(dropLast) + /** @group setParam */ @Since("1.4.0") def setDropLast(value: Boolean): this.type = set(dropLast, value) http://git-wip-us.apache.org/repos/asf/spark/blob/9493b079/mllib/src/test/scala/org/apache/spark/ml/feature/OneHotEncoderSuite.scala -- diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/OneHotEncoderSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/OneHotEncoderSuite.scala index 7841b4f..d41eeec 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/OneHotEncoderSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/OneHotEncoderSuite.scala @@ -49,7 +49,9 @@ class OneHotEncoderSuite val encoder = new OneHotEncoder() .setInputCol("labelIndex") .setOutputCol("labelVec") - .setDropLast(false) +assert(encoder.getDropLast === true) +encoder.setDropLast(false) +assert(encoder.getDropLast === false) val encoded = encoder.transform(transformed) val output = encoded.select("id", "labelVec").rdd.map { r => - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
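A short usage sketch of the new getter; the data and column names are invented, and a SparkSession named spark (as in spark-shell) is assumed:

    import org.apache.spark.ml.feature.{OneHotEncoder, StringIndexer}

    val df = spark.createDataFrame(Seq(
      (0, "a"), (1, "b"), (2, "c"), (3, "a")
    )).toDF("id", "category")

    val indexed = new StringIndexer()
      .setInputCol("category").setOutputCol("categoryIndex")
      .fit(df).transform(df)

    val encoder = new OneHotEncoder()
      .setInputCol("categoryIndex").setOutputCol("categoryVec")

    println(encoder.getDropLast)   // true by default
    encoder.setDropLast(false)
    println(encoder.getDropLast)   // false
    encoder.transform(indexed).show(truncate = false)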
spark git commit: [SPARK-16118][MLLIB] add getDropLast to OneHotEncoder
Repository: spark Updated Branches: refs/heads/branch-2.0 8d5a62d5f -> f2413736d [SPARK-16118][MLLIB] add getDropLast to OneHotEncoder ## What changes were proposed in this pull request? We forgot the getter of `dropLast` in `OneHotEncoder` ## How was this patch tested? unit test Author: Xiangrui Meng Closes #13821 from mengxr/SPARK-16118. (cherry picked from commit 9493b079a0050f0a6f4936c17622b96fb185b67f) Signed-off-by: Xiangrui Meng Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f2413736 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f2413736 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f2413736 Branch: refs/heads/branch-2.0 Commit: f2413736d915e8b39dea6e5935a4956b669b30ce Parents: 8d5a62d Author: Xiangrui Meng Authored: Tue Jun 21 15:52:31 2016 -0700 Committer: Xiangrui Meng Committed: Tue Jun 21 15:52:38 2016 -0700 -- .../main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala | 4 .../scala/org/apache/spark/ml/feature/OneHotEncoderSuite.scala | 4 +++- 2 files changed, 7 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/f2413736/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala index 4fafc1e..01828ed 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/OneHotEncoder.scala @@ -59,6 +59,10 @@ class OneHotEncoder @Since("1.4.0") (@Since("1.4.0") override val uid: String) e new BooleanParam(this, "dropLast", "whether to drop the last category") setDefault(dropLast -> true) + /** @group getParam */ + @Since("2.0.0") + def getDropLast: Boolean = $(dropLast) + /** @group setParam */ @Since("1.4.0") def setDropLast(value: Boolean): this.type = set(dropLast, value) http://git-wip-us.apache.org/repos/asf/spark/blob/f2413736/mllib/src/test/scala/org/apache/spark/ml/feature/OneHotEncoderSuite.scala -- diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/OneHotEncoderSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/OneHotEncoderSuite.scala index 7841b4f..d41eeec 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/OneHotEncoderSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/OneHotEncoderSuite.scala @@ -49,7 +49,9 @@ class OneHotEncoderSuite val encoder = new OneHotEncoder() .setInputCol("labelIndex") .setOutputCol("labelVec") - .setDropLast(false) +assert(encoder.getDropLast === true) +encoder.setDropLast(false) +assert(encoder.getDropLast === false) val encoded = encoder.transform(transformed) val output = encoded.select("id", "labelVec").rdd.map { r => - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: Update branch-2.0 for 2.0.0 release.
Repository: spark Updated Branches: refs/heads/branch-2.0 f2413736d -> fe41f68fd Update branch-2.0 for 2.0.0 release. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/fe41f68f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/fe41f68f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/fe41f68f Branch: refs/heads/branch-2.0 Commit: fe41f68fdcf914b9de79b3cdd372e7dc891b9a52 Parents: f241373 Author: Reynold Xin Authored: Tue Jun 21 15:53:18 2016 -0700 Committer: Reynold Xin Committed: Tue Jun 21 15:53:18 2016 -0700 -- docs/_config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/fe41f68f/docs/_config.yml -- diff --git a/docs/_config.yml b/docs/_config.yml index c0a3be7..3951cad 100644 --- a/docs/_config.yml +++ b/docs/_config.yml @@ -14,7 +14,7 @@ include: # These allow the documentation to be updated with newer releases # of Spark, Scala, and Mesos. -SPARK_VERSION: 2.0.0-preview +SPARK_VERSION: 2.0.0 SPARK_VERSION_SHORT: 2.0.0 SCALA_BINARY_VERSION: "2.11" SCALA_VERSION: "2.11.7" - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] Git Push Summary
Repository: spark Updated Tags: refs/tags/v2.0.0-rc1 [created] 0c66ca41a - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[1/2] spark git commit: Preparing Spark release v2.0.0-rc1
Repository: spark Updated Branches: refs/heads/branch-2.0 fe41f68fd -> 5a4fce456 Preparing Spark release v2.0.0-rc1 Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0c66ca41 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0c66ca41 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0c66ca41 Branch: refs/heads/branch-2.0 Commit: 0c66ca41afade6db73c9aeddd5aed6e5dcea90df Parents: fe41f68 Author: Patrick Wendell Authored: Tue Jun 21 15:59:31 2016 -0700 Committer: Patrick Wendell Committed: Tue Jun 21 15:59:31 2016 -0700 -- assembly/pom.xml | 2 +- common/network-common/pom.xml | 2 +- common/network-shuffle/pom.xml| 2 +- common/network-yarn/pom.xml | 2 +- common/sketch/pom.xml | 2 +- common/tags/pom.xml | 2 +- common/unsafe/pom.xml | 2 +- core/pom.xml | 2 +- examples/pom.xml | 2 +- external/docker-integration-tests/pom.xml | 2 +- external/flume-assembly/pom.xml | 2 +- external/flume-sink/pom.xml | 2 +- external/flume/pom.xml| 2 +- external/java8-tests/pom.xml | 2 +- external/kafka-0-8-assembly/pom.xml | 2 +- external/kafka-0-8/pom.xml| 2 +- external/kinesis-asl-assembly/pom.xml | 2 +- external/kinesis-asl/pom.xml | 2 +- external/spark-ganglia-lgpl/pom.xml | 2 +- graphx/pom.xml| 2 +- launcher/pom.xml | 2 +- mllib-local/pom.xml | 2 +- mllib/pom.xml | 2 +- pom.xml | 2 +- repl/pom.xml | 2 +- sql/catalyst/pom.xml | 2 +- sql/core/pom.xml | 2 +- sql/hive-thriftserver/pom.xml | 2 +- sql/hive/pom.xml | 2 +- streaming/pom.xml | 2 +- tools/pom.xml | 2 +- yarn/pom.xml | 2 +- 32 files changed, 32 insertions(+), 32 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/0c66ca41/assembly/pom.xml -- diff --git a/assembly/pom.xml b/assembly/pom.xml index 75ac926..5f546bb 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 -2.0.0-SNAPSHOT +2.0.0 ../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/0c66ca41/common/network-common/pom.xml -- diff --git a/common/network-common/pom.xml b/common/network-common/pom.xml index 5444ae6..2eaa810 100644 --- a/common/network-common/pom.xml +++ b/common/network-common/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 -2.0.0-SNAPSHOT +2.0.0 ../../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/0c66ca41/common/network-shuffle/pom.xml -- diff --git a/common/network-shuffle/pom.xml b/common/network-shuffle/pom.xml index e736436..f068d9d 100644 --- a/common/network-shuffle/pom.xml +++ b/common/network-shuffle/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 -2.0.0-SNAPSHOT +2.0.0 ../../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/0c66ca41/common/network-yarn/pom.xml -- diff --git a/common/network-yarn/pom.xml b/common/network-yarn/pom.xml index 3b7ffe8..fd22188 100644 --- a/common/network-yarn/pom.xml +++ b/common/network-yarn/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 -2.0.0-SNAPSHOT +2.0.0 ../../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/0c66ca41/common/sketch/pom.xml -- diff --git a/common/sketch/pom.xml b/common/sketch/pom.xml index 0bd..a17aba5 100644 --- a/common/sketch/pom.xml +++ b/common/sketch/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 -2.0.0-SNAPSHOT +2.0.0 ../../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/0c66ca41/common/tags/pom.xml -- diff --git a/common/tags/pom.xml b/common/tags/pom.xml index 14e94ec..0bd8846 100644 --- a/common/tags/pom.xml +++ b/common/tags/pom.xml @@ -22,7 +22,7 @@ org.apache.spark 
spark-parent_2.11 -2.0.0-SNAPSHOT +2.0.0 ../../
[2/2] spark git commit: Preparing development version 2.0.1-SNAPSHOT
Preparing development version 2.0.1-SNAPSHOT Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5a4fce45 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5a4fce45 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5a4fce45 Branch: refs/heads/branch-2.0 Commit: 5a4fce456f8d94f27e847e00840b5a640f11486d Parents: 0c66ca4 Author: Patrick Wendell Authored: Tue Jun 21 15:59:37 2016 -0700 Committer: Patrick Wendell Committed: Tue Jun 21 15:59:37 2016 -0700 -- assembly/pom.xml | 2 +- common/network-common/pom.xml | 2 +- common/network-shuffle/pom.xml| 2 +- common/network-yarn/pom.xml | 2 +- common/sketch/pom.xml | 2 +- common/tags/pom.xml | 2 +- common/unsafe/pom.xml | 2 +- core/pom.xml | 2 +- examples/pom.xml | 2 +- external/docker-integration-tests/pom.xml | 2 +- external/flume-assembly/pom.xml | 2 +- external/flume-sink/pom.xml | 2 +- external/flume/pom.xml| 2 +- external/java8-tests/pom.xml | 2 +- external/kafka-0-8-assembly/pom.xml | 2 +- external/kafka-0-8/pom.xml| 2 +- external/kinesis-asl-assembly/pom.xml | 2 +- external/kinesis-asl/pom.xml | 2 +- external/spark-ganglia-lgpl/pom.xml | 2 +- graphx/pom.xml| 2 +- launcher/pom.xml | 2 +- mllib-local/pom.xml | 2 +- mllib/pom.xml | 2 +- pom.xml | 2 +- repl/pom.xml | 2 +- sql/catalyst/pom.xml | 2 +- sql/core/pom.xml | 2 +- sql/hive-thriftserver/pom.xml | 2 +- sql/hive/pom.xml | 2 +- streaming/pom.xml | 2 +- tools/pom.xml | 2 +- yarn/pom.xml | 2 +- 32 files changed, 32 insertions(+), 32 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/5a4fce45/assembly/pom.xml -- diff --git a/assembly/pom.xml b/assembly/pom.xml index 5f546bb..507ddc7 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 -2.0.0 +2.0.1-SNAPSHOT ../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/5a4fce45/common/network-common/pom.xml -- diff --git a/common/network-common/pom.xml b/common/network-common/pom.xml index 2eaa810..bc3b0fe 100644 --- a/common/network-common/pom.xml +++ b/common/network-common/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 -2.0.0 +2.0.1-SNAPSHOT ../../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/5a4fce45/common/network-shuffle/pom.xml -- diff --git a/common/network-shuffle/pom.xml b/common/network-shuffle/pom.xml index f068d9d..2fb5835 100644 --- a/common/network-shuffle/pom.xml +++ b/common/network-shuffle/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 -2.0.0 +2.0.1-SNAPSHOT ../../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/5a4fce45/common/network-yarn/pom.xml -- diff --git a/common/network-yarn/pom.xml b/common/network-yarn/pom.xml index fd22188..07d9f1c 100644 --- a/common/network-yarn/pom.xml +++ b/common/network-yarn/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 -2.0.0 +2.0.1-SNAPSHOT ../../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/5a4fce45/common/sketch/pom.xml -- diff --git a/common/sketch/pom.xml b/common/sketch/pom.xml index a17aba5..5e02efd 100644 --- a/common/sketch/pom.xml +++ b/common/sketch/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 -2.0.0 +2.0.1-SNAPSHOT ../../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/5a4fce45/common/tags/pom.xml -- diff --git a/common/tags/pom.xml b/common/tags/pom.xml index 0bd8846..e7fc6a2 100644 --- a/common/tags/pom.xml +++ b/common/tags/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 -2.0.0 +2.0.1-SNAPSHOT ../../pom.xml 
http://git-wip-us.apache.org/repos/asf/spark/blob/5a4fce45/c
spark git commit: [SQL][DOC] SQL programming guide add deprecated methods in 2.0.0
Repository: spark Updated Branches: refs/heads/master 9493b079a -> 79aa1d82c [SQL][DOC] SQL programming guide add deprecated methods in 2.0.0 ## What changes were proposed in this pull request? Doc changes ## How was this patch tested? manual liancheng Author: Felix Cheung Closes #13827 from felixcheung/sqldocdeprecate. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/79aa1d82 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/79aa1d82 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/79aa1d82 Branch: refs/heads/master Commit: 79aa1d82ca56eb847cbf4ff81de0564b339988f6 Parents: 9493b07 Author: Felix Cheung Authored: Wed Jun 22 10:37:13 2016 +0800 Committer: Cheng Lian Committed: Wed Jun 22 10:37:13 2016 +0800 -- docs/sql-programming-guide.md | 6 +- 1 file changed, 5 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/79aa1d82/docs/sql-programming-guide.md -- diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index ddf8f70..4b52c94 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -2143,7 +2143,7 @@ options. ## Upgrading From Spark SQL 1.6 to 2.0 - `SparkSession` is now the new entry point of Spark that replaces the old `SQLContext` and - `HiveContext`. Note that the old SQLContext and HiveContext are kept for backward compatibility. + `HiveContext`. Note that the old SQLContext and HiveContext are kept for backward compatibility. A new `catalog` interface is accessible from `SparkSession` - existing API on databases and tables access such as `listTables`, `createExternalTable`, `dropTempView`, `cacheTable` are moved here. - Dataset API and DataFrame API are unified. In Scala, `DataFrame` becomes a type alias for `Dataset[Row]`, while Java API users must replace `DataFrame` with `Dataset`. Both the typed @@ -2153,6 +2153,10 @@ options. APIs. Instead, `DataFrame` remains the primary programing abstraction, which is analogous to the single-node data frame notion in these languages. + - Dataset and DataFrame API `unionAll` has been deprecated and replaced by `union` + - Dataset and DataFrame API `explode` has been deprecated, alternatively, use `functions.explode()` with `select` or `flatMap` + - Dataset and DataFrame API `registerTempTable` has been deprecated and replaced by `createOrReplaceTempView` + ## Upgrading From Spark SQL 1.5 to 1.6 - From Spark 1.6, by default the Thrift server runs in multi-session mode. Which means each JDBC/ODBC - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
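A hedged migration sketch for the three deprecations listed above; the data, column names, and temp view name are invented for illustration:

    import org.apache.spark.sql.SparkSession
    import org.apache.spark.sql.functions.{col, explode}

    object MigrationSketch {
      def main(args: Array[String]): Unit = {
        val spark = SparkSession.builder().master("local[*]").appName("migration-sketch").getOrCreate()
        import spark.implicits._

        val df1 = Seq((1, Seq("scala", "java")), (2, Seq("python"))).toDF("id", "langs")
        val df2 = Seq((3, Seq("r"))).toDF("id", "langs")

        val unioned = df1.union(df2)              // replaces df1.unionAll(df2)
        df1.createOrReplaceTempView("people")     // replaces df1.registerTempTable("people")
        val flattened = df1.select(col("id"), explode(col("langs")).as("lang")) // replaces DataFrame.explode

        spark.sql("SELECT COUNT(*) FROM people").show()
        unioned.show()
        flattened.show()
      }
    }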
spark git commit: [SQL][DOC] SQL programming guide add deprecated methods in 2.0.0
Repository: spark Updated Branches: refs/heads/branch-2.0 5a4fce456 -> 77d8226df [SQL][DOC] SQL programming guide add deprecated methods in 2.0.0 ## What changes were proposed in this pull request? Doc changes ## How was this patch tested? manual liancheng Author: Felix Cheung Closes #13827 from felixcheung/sqldocdeprecate. (cherry picked from commit 79aa1d82ca56eb847cbf4ff81de0564b339988f6) Signed-off-by: Cheng Lian Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/77d8226d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/77d8226d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/77d8226d Branch: refs/heads/branch-2.0 Commit: 77d8226dfc93fc5f7cde3cc601984fc1a1a54be5 Parents: 5a4fce4 Author: Felix Cheung Authored: Wed Jun 22 10:37:13 2016 +0800 Committer: Cheng Lian Committed: Wed Jun 22 10:41:15 2016 +0800 -- docs/sql-programming-guide.md | 6 +- 1 file changed, 5 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/77d8226d/docs/sql-programming-guide.md -- diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index ddf8f70..4b52c94 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -2143,7 +2143,7 @@ options. ## Upgrading From Spark SQL 1.6 to 2.0 - `SparkSession` is now the new entry point of Spark that replaces the old `SQLContext` and - `HiveContext`. Note that the old SQLContext and HiveContext are kept for backward compatibility. + `HiveContext`. Note that the old SQLContext and HiveContext are kept for backward compatibility. A new `catalog` interface is accessible from `SparkSession` - existing API on databases and tables access such as `listTables`, `createExternalTable`, `dropTempView`, `cacheTable` are moved here. - Dataset API and DataFrame API are unified. In Scala, `DataFrame` becomes a type alias for `Dataset[Row]`, while Java API users must replace `DataFrame` with `Dataset`. Both the typed @@ -2153,6 +2153,10 @@ options. APIs. Instead, `DataFrame` remains the primary programing abstraction, which is analogous to the single-node data frame notion in these languages. + - Dataset and DataFrame API `unionAll` has been deprecated and replaced by `union` + - Dataset and DataFrame API `explode` has been deprecated, alternatively, use `functions.explode()` with `select` or `flatMap` + - Dataset and DataFrame API `registerTempTable` has been deprecated and replaced by `createOrReplaceTempView` + ## Upgrading From Spark SQL 1.5 to 1.6 - From Spark 1.6, by default the Thrift server runs in multi-session mode. Which means each JDBC/ODBC - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [MINOR][MLLIB] deprecate setLabelCol in ChiSqSelectorModel
Repository: spark Updated Branches: refs/heads/master 79aa1d82c -> d77c4e6e2 [MINOR][MLLIB] deprecate setLabelCol in ChiSqSelectorModel ## What changes were proposed in this pull request? Deprecate `labelCol`, which is not used by ChiSqSelectorModel. Author: Xiangrui Meng Closes #13823 from mengxr/deprecate-setLabelCol-in-ChiSqSelectorModel. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d77c4e6e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d77c4e6e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d77c4e6e Branch: refs/heads/master Commit: d77c4e6e2eef24f4276c38b3add8c29bb885f4db Parents: 79aa1d8 Author: Xiangrui Meng Authored: Tue Jun 21 20:53:38 2016 -0700 Committer: Joseph K. Bradley Committed: Tue Jun 21 20:53:38 2016 -0700 -- .../src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala | 1 + 1 file changed, 1 insertion(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d77c4e6e/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala index 1c32926..33723287 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala @@ -142,6 +142,7 @@ final class ChiSqSelectorModel private[ml] ( /** @group setParam */ @Since("1.6.0") + @deprecated("labelCol is not used by ChiSqSelectorModel.", "2.0.0") def setLabelCol(value: String): this.type = set(labelCol, value) @Since("2.0.0") - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
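For context, a minimal sketch of the Scala deprecation mechanism applied above; the class and method names below are stand-ins, not the MLlib code:

    class SelectorModelSketch {
      @deprecated("labelCol is not used by this model.", "2.0.0")
      def setLabelCol(value: String): this.type = this
    }

    object DeprecationDemo {
      def main(args: Array[String]): Unit = {
        // Compiles and runs as before, but scalac now reports a deprecation warning for this call site.
        new SelectorModelSketch().setLabelCol("label")
      }
    }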
spark git commit: [MINOR][MLLIB] deprecate setLabelCol in ChiSqSelectorModel
Repository: spark Updated Branches: refs/heads/branch-2.0 77d8226df -> f3a2ebe0b [MINOR][MLLIB] deprecate setLabelCol in ChiSqSelectorModel ## What changes were proposed in this pull request? Deprecate `labelCol`, which is not used by ChiSqSelectorModel. Author: Xiangrui Meng Closes #13823 from mengxr/deprecate-setLabelCol-in-ChiSqSelectorModel. (cherry picked from commit d77c4e6e2eef24f4276c38b3add8c29bb885f4db) Signed-off-by: Joseph K. Bradley Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f3a2ebe0 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f3a2ebe0 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f3a2ebe0 Branch: refs/heads/branch-2.0 Commit: f3a2ebe0bcac6d6d3b370ce66b5458e55b7e884d Parents: 77d8226 Author: Xiangrui Meng Authored: Tue Jun 21 20:53:38 2016 -0700 Committer: Joseph K. Bradley Committed: Tue Jun 21 20:53:48 2016 -0700 -- .../src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala | 1 + 1 file changed, 1 insertion(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/f3a2ebe0/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala index 1c32926..33723287 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala @@ -142,6 +142,7 @@ final class ChiSqSelectorModel private[ml] ( /** @group setParam */ @Since("1.6.0") + @deprecated("labelCol is not used by ChiSqSelectorModel.", "2.0.0") def setLabelCol(value: String): this.type = set(labelCol, value) @Since("2.0.0") - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-16104] [SQL] Do not create CSV writer object for every flush when writing
Repository: spark Updated Branches: refs/heads/master d77c4e6e2 -> 7580f3041 [SPARK-16104] [SQL] Do not creaate CSV writer object for every flush when writing ## What changes were proposed in this pull request? This PR let `CsvWriter` object is not created for each time but able to be reused. This way was taken after from JSON data source. Original `CsvWriter` was being created for each row but it was enhanced in https://github.com/apache/spark/pull/13229. However, it still creates `CsvWriter` object for each `flush()` in `LineCsvWriter`. It seems it does not have to close the object and re-create this for every flush. It follows the original logic as it is but `CsvWriter` is reused by reseting `CharArrayWriter`. ## How was this patch tested? Existing tests should cover this. Author: hyukjinkwon Closes #13809 from HyukjinKwon/write-perf. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7580f304 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7580f304 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7580f304 Branch: refs/heads/master Commit: 7580f3041a1a3757a0b14b9d8afeb720f261fff6 Parents: d77c4e6 Author: hyukjinkwon Authored: Tue Jun 21 21:58:38 2016 -0700 Committer: Davies Liu Committed: Tue Jun 21 21:58:38 2016 -0700 -- .../execution/datasources/csv/CSVParser.scala | 20 +--- .../execution/datasources/csv/CSVRelation.scala | 1 + 2 files changed, 10 insertions(+), 11 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/7580f304/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVParser.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVParser.scala index b06f123..2103262 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVParser.scala @@ -17,8 +17,7 @@ package org.apache.spark.sql.execution.datasources.csv -import java.io.{ByteArrayOutputStream, OutputStreamWriter, StringReader} -import java.nio.charset.StandardCharsets +import java.io.{CharArrayWriter, StringReader} import com.univocity.parsers.csv._ @@ -77,10 +76,8 @@ private[sql] class LineCsvWriter(params: CSVOptions, headers: Seq[String]) exten writerSettings.setHeaders(headers: _*) writerSettings.setQuoteEscapingEnabled(params.escapeQuotes) - private var buffer = new ByteArrayOutputStream() - private var writer = new CsvWriter( -new OutputStreamWriter(buffer, StandardCharsets.UTF_8), -writerSettings) + private val buffer = new CharArrayWriter() + private val writer = new CsvWriter(buffer, writerSettings) def writeRow(row: Seq[String], includeHeader: Boolean): Unit = { if (includeHeader) { @@ -90,14 +87,15 @@ private[sql] class LineCsvWriter(params: CSVOptions, headers: Seq[String]) exten } def flush(): String = { -writer.close() +writer.flush() val lines = buffer.toString.stripLineEnd -buffer = new ByteArrayOutputStream() -writer = new CsvWriter( - new OutputStreamWriter(buffer, StandardCharsets.UTF_8), - writerSettings) +buffer.reset() lines } + + def close(): Unit = { +writer.close() + } } /** http://git-wip-us.apache.org/repos/asf/spark/blob/7580f304/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVRelation.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVRelation.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVRelation.scala index 083ac33..e8c0134 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVRelation.scala @@ -223,6 +223,7 @@ private[sql] class CsvOutputWriter( override def close(): Unit = { flush() +csvWriter.close() recordWriter.close(context) } } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
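The essence of the change is keeping one writer alive and resetting its in-memory buffer on each flush rather than rebuilding the writer every time. A self-contained Scala sketch of that reuse pattern, using a plain CharArrayWriter and a hand-rolled line writer instead of the univocity classes touched by the patch:

    import java.io.CharArrayWriter

    class ReusableLineWriter {
      private val buffer = new CharArrayWriter()

      def writeRow(fields: Seq[String]): Unit = buffer.write(fields.mkString(",") + "\n")

      def flush(): String = {
        val lines = buffer.toString.stripLineEnd
        buffer.reset() // reuse the same buffer instead of allocating a new writer per flush
        lines
      }
    }

    object ReusableLineWriterDemo {
      def main(args: Array[String]): Unit = {
        val writer = new ReusableLineWriter
        writer.writeRow(Seq("1", "a"))
        writer.writeRow(Seq("2", "b"))
        println(writer.flush()) // prints "1,a" and "2,b" on two lines
      }
    }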
spark git commit: [SPARK-15644][MLLIB][SQL] Replace SQLContext with SparkSession in MLlib
Repository: spark Updated Branches: refs/heads/branch-2.0 f3a2ebe0b -> e7a489c7f [SPARK-15644][MLLIB][SQL] Replace SQLContext with SparkSession in MLlib What changes were proposed in this pull request? This PR is to use the latest `SparkSession` to replace the existing `SQLContext` in `MLlib`. `SQLContext` is removed from `MLlib`. Also fix a test case issue in `BroadcastJoinSuite`. BTW, `SQLContext` is not being used in the `MLlib` test suites. How was this patch tested? Existing test cases. Author: gatorsmile Author: xiaoli Author: Xiao Li Closes #13380 from gatorsmile/sqlContextML. (cherry picked from commit 0e3ce75332dd536c0db8467d456ad46e4bf228f4) Signed-off-by: Joseph K. Bradley Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e7a489c7 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e7a489c7 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e7a489c7 Branch: refs/heads/branch-2.0 Commit: e7a489c7fef895fd2ca651f1c67b5495862b3e3e Parents: f3a2ebe Author: gatorsmile Authored: Tue Jun 21 23:12:08 2016 -0700 Committer: Joseph K. Bradley Committed: Tue Jun 21 23:12:28 2016 -0700 -- .../classification/DecisionTreeClassifier.scala | 4 +- .../spark/ml/classification/GBTClassifier.scala | 4 +- .../ml/classification/LogisticRegression.scala | 4 +- .../MultilayerPerceptronClassifier.scala| 4 +- .../spark/ml/classification/NaiveBayes.scala| 4 +- .../classification/RandomForestClassifier.scala | 4 +- .../spark/ml/clustering/GaussianMixture.scala | 4 +- .../org/apache/spark/ml/clustering/KMeans.scala | 10 ++--- .../apache/spark/ml/feature/ChiSqSelector.scala | 4 +- .../spark/ml/feature/CountVectorizer.scala | 4 +- .../scala/org/apache/spark/ml/feature/IDF.scala | 4 +- .../apache/spark/ml/feature/MaxAbsScaler.scala | 4 +- .../apache/spark/ml/feature/MinMaxScaler.scala | 4 +- .../scala/org/apache/spark/ml/feature/PCA.scala | 6 +-- .../org/apache/spark/ml/feature/RFormula.scala | 12 +++--- .../spark/ml/feature/StandardScaler.scala | 4 +- .../apache/spark/ml/feature/StringIndexer.scala | 4 +- .../apache/spark/ml/feature/VectorIndexer.scala | 4 +- .../org/apache/spark/ml/feature/Word2Vec.scala | 4 +- .../apache/spark/ml/recommendation/ALS.scala| 4 +- .../ml/regression/AFTSurvivalRegression.scala | 4 +- .../ml/regression/DecisionTreeRegressor.scala | 4 +- .../spark/ml/regression/GBTRegressor.scala | 4 +- .../GeneralizedLinearRegression.scala | 4 +- .../ml/regression/IsotonicRegression.scala | 4 +- .../spark/ml/regression/LinearRegression.scala | 4 +- .../ml/regression/RandomForestRegressor.scala | 4 +- .../org/apache/spark/ml/tree/treeModels.scala | 12 +++--- .../org/apache/spark/ml/util/ReadWrite.scala| 41 ++-- .../ml/util/JavaDefaultReadWriteSuite.java | 2 +- .../org/apache/spark/sql/SparkSession.scala | 2 +- 31 files changed, 100 insertions(+), 81 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e7a489c7/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala index 881dcef..c65d3d5 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala @@ -243,7 +243,7 @@ object DecisionTreeClassificationModel extends MLReadable[DecisionTreeClassifica 
DefaultParamsWriter.saveMetadata(instance, path, sc, Some(extraMetadata)) val (nodeData, _) = NodeData.build(instance.rootNode, 0) val dataPath = new Path(path, "data").toString - sqlContext.createDataFrame(nodeData).write.parquet(dataPath) + sparkSession.createDataFrame(nodeData).write.parquet(dataPath) } } @@ -258,7 +258,7 @@ object DecisionTreeClassificationModel extends MLReadable[DecisionTreeClassifica val metadata = DefaultParamsReader.loadMetadata(path, sc, className) val numFeatures = (metadata.metadata \ "numFeatures").extract[Int] val numClasses = (metadata.metadata \ "numClasses").extract[Int] - val root = loadTreeNodes(path, metadata, sqlContext) + val root = loadTreeNodes(path, metadata, sparkSession) val model = new DecisionTreeClassificationModel(metadata.uid, root, numFeatures, numClasses) DefaultParamsReader.getAndSetParams(model, metadata) model http://git-wip-us.apache.org/repos/asf/spark/blob/e7a489c7/mllib/src/main/scala/org/apache/spark/ml/clas
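The hunks above all make the same substitution, so a hedged sketch of the before/after shape may be useful; the case class, output path, and session setup are illustrative, not taken from the patch:

    import org.apache.spark.sql.SparkSession

    case class NodeData(id: Int, prediction: Double)

    object SaveWithSparkSession {
      def main(args: Array[String]): Unit = {
        val spark = SparkSession.builder().master("local[*]").appName("session-sketch").getOrCreate()
        val nodeData = Seq(NodeData(0, 1.0), NodeData(1, 0.0))

        // Before this patch (via the older entry point):
        //   sqlContext.createDataFrame(nodeData).write.parquet(dataPath)
        // After, the same call goes through the unified SparkSession:
        spark.createDataFrame(nodeData).write.parquet("/tmp/model-data-sketch")

        spark.stop()
      }
    }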
spark git commit: [SPARK-15644][MLLIB][SQL] Replace SQLContext with SparkSession in MLlib
Repository: spark Updated Branches: refs/heads/master 7580f3041 -> 0e3ce7533 [SPARK-15644][MLLIB][SQL] Replace SQLContext with SparkSession in MLlib What changes were proposed in this pull request? This PR is to use the latest `SparkSession` to replace the existing `SQLContext` in `MLlib`. `SQLContext` is removed from `MLlib`. Also fix a test case issue in `BroadcastJoinSuite`. BTW, `SQLContext` is not being used in the `MLlib` test suites. How was this patch tested? Existing test cases. Author: gatorsmile Author: xiaoli Author: Xiao Li Closes #13380 from gatorsmile/sqlContextML. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0e3ce753 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0e3ce753 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0e3ce753 Branch: refs/heads/master Commit: 0e3ce75332dd536c0db8467d456ad46e4bf228f4 Parents: 7580f30 Author: gatorsmile Authored: Tue Jun 21 23:12:08 2016 -0700 Committer: Joseph K. Bradley Committed: Tue Jun 21 23:12:08 2016 -0700 -- .../classification/DecisionTreeClassifier.scala | 4 +- .../spark/ml/classification/GBTClassifier.scala | 4 +- .../ml/classification/LogisticRegression.scala | 4 +- .../MultilayerPerceptronClassifier.scala| 4 +- .../spark/ml/classification/NaiveBayes.scala| 4 +- .../classification/RandomForestClassifier.scala | 4 +- .../spark/ml/clustering/GaussianMixture.scala | 4 +- .../org/apache/spark/ml/clustering/KMeans.scala | 10 ++--- .../apache/spark/ml/feature/ChiSqSelector.scala | 4 +- .../spark/ml/feature/CountVectorizer.scala | 4 +- .../scala/org/apache/spark/ml/feature/IDF.scala | 4 +- .../apache/spark/ml/feature/MaxAbsScaler.scala | 4 +- .../apache/spark/ml/feature/MinMaxScaler.scala | 4 +- .../scala/org/apache/spark/ml/feature/PCA.scala | 6 +-- .../org/apache/spark/ml/feature/RFormula.scala | 12 +++--- .../spark/ml/feature/StandardScaler.scala | 4 +- .../apache/spark/ml/feature/StringIndexer.scala | 4 +- .../apache/spark/ml/feature/VectorIndexer.scala | 4 +- .../org/apache/spark/ml/feature/Word2Vec.scala | 4 +- .../apache/spark/ml/recommendation/ALS.scala| 4 +- .../ml/regression/AFTSurvivalRegression.scala | 4 +- .../ml/regression/DecisionTreeRegressor.scala | 4 +- .../spark/ml/regression/GBTRegressor.scala | 4 +- .../GeneralizedLinearRegression.scala | 4 +- .../ml/regression/IsotonicRegression.scala | 4 +- .../spark/ml/regression/LinearRegression.scala | 4 +- .../ml/regression/RandomForestRegressor.scala | 4 +- .../org/apache/spark/ml/tree/treeModels.scala | 12 +++--- .../org/apache/spark/ml/util/ReadWrite.scala| 41 ++-- .../ml/util/JavaDefaultReadWriteSuite.java | 2 +- .../org/apache/spark/sql/SparkSession.scala | 2 +- 31 files changed, 100 insertions(+), 81 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/0e3ce753/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala index 881dcef..c65d3d5 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala @@ -243,7 +243,7 @@ object DecisionTreeClassificationModel extends MLReadable[DecisionTreeClassifica DefaultParamsWriter.saveMetadata(instance, path, sc, Some(extraMetadata)) val (nodeData, _) = NodeData.build(instance.rootNode, 0) val 
dataPath = new Path(path, "data").toString - sqlContext.createDataFrame(nodeData).write.parquet(dataPath) + sparkSession.createDataFrame(nodeData).write.parquet(dataPath) } } @@ -258,7 +258,7 @@ object DecisionTreeClassificationModel extends MLReadable[DecisionTreeClassifica val metadata = DefaultParamsReader.loadMetadata(path, sc, className) val numFeatures = (metadata.metadata \ "numFeatures").extract[Int] val numClasses = (metadata.metadata \ "numClasses").extract[Int] - val root = loadTreeNodes(path, metadata, sqlContext) + val root = loadTreeNodes(path, metadata, sparkSession) val model = new DecisionTreeClassificationModel(metadata.uid, root, numFeatures, numClasses) DefaultParamsReader.getAndSetParams(model, metadata) model http://git-wip-us.apache.org/repos/asf/spark/blob/0e3ce753/mllib/src/main/scala/org/apache/spark/ml/classification/GBTClassifier.scala -- diff --git