spark git commit: Closes #12407 Closes #12408 Closes #12401
Repository: spark Updated Branches: refs/heads/master b9613239d -> a9324a06e Closes #12407 Closes #12408 Closes #12401 Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a9324a06 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a9324a06 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a9324a06 Branch: refs/heads/master Commit: a9324a06ef0e3646410dc9b3d4f21d66b9064303 Parents: b961323 Author: Reynold Xin Authored: Thu Apr 14 22:04:26 2016 -0700 Committer: Reynold Xin Committed: Thu Apr 14 22:04:26 2016 -0700 -- -- - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-14374][ML][PYSPARK] PySpark ml GBTClassifier, Regressor support export/import
Repository: spark Updated Branches: refs/heads/master 297ba3f1b -> b9613239d [SPARK-14374][ML][PYSPARK] PySpark ml GBTClassifier, Regressor support export/import ## What changes were proposed in this pull request? PySpark ml GBTClassifier, Regressor support export/import. ## How was this patch tested? Doc test. cc jkbradley Author: Yanbo LiangCloses #12383 from yanboliang/spark-14374. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b9613239 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b9613239 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b9613239 Branch: refs/heads/master Commit: b9613239d303bc0f451233852c1eb1219a69875e Parents: 297ba3f Author: Yanbo Liang Authored: Thu Apr 14 21:36:03 2016 -0700 Committer: Xiangrui Meng Committed: Thu Apr 14 21:36:03 2016 -0700 -- python/pyspark/ml/classification.py | 17 +++-- python/pyspark/ml/regression.py | 17 +++-- 2 files changed, 30 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b9613239/python/pyspark/ml/classification.py -- diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index 922f806..6ef119a 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -739,7 +739,8 @@ class RandomForestClassificationModel(TreeEnsembleModels, JavaMLWritable, JavaML @inherit_doc class GBTClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, HasMaxIter, -GBTParams, HasCheckpointInterval, HasStepSize, HasSeed): +GBTParams, HasCheckpointInterval, HasStepSize, HasSeed, JavaMLWritable, +JavaMLReadable): """ `http://en.wikipedia.org/wiki/Gradient_boosting Gradient-Boosted Trees (GBTs)` learning algorithm for classification. 
@@ -767,6 +768,18 @@ class GBTClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol >>> test1 = sqlContext.createDataFrame([(Vectors.sparse(1, [0], [1.0]),)], ["features"]) >>> model.transform(test1).head().prediction 1.0 +>>> gbtc_path = temp_path + "gbtc" +>>> gbt.save(gbtc_path) +>>> gbt2 = GBTClassifier.load(gbtc_path) +>>> gbt2.getMaxDepth() +2 +>>> model_path = temp_path + "gbtc_model" +>>> model.save(model_path) +>>> model2 = GBTClassificationModel.load(model_path) +>>> model.featureImportances == model2.featureImportances +True +>>> model.treeWeights == model2.treeWeights +True .. versionadded:: 1.4.0 """ @@ -831,7 +844,7 @@ class GBTClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol return self.getOrDefault(self.lossType) -class GBTClassificationModel(TreeEnsembleModels): +class GBTClassificationModel(TreeEnsembleModels, JavaMLWritable, JavaMLReadable): """ Model fitted by GBTClassifier. http://git-wip-us.apache.org/repos/asf/spark/blob/b9613239/python/pyspark/ml/regression.py -- diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index c064fe5..3c78525 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -902,7 +902,8 @@ class RandomForestRegressionModel(TreeEnsembleModels, JavaMLWritable, JavaMLRead @inherit_doc class GBTRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, HasMaxIter, - GBTParams, HasCheckpointInterval, HasStepSize, HasSeed): + GBTParams, HasCheckpointInterval, HasStepSize, HasSeed, JavaMLWritable, + JavaMLReadable): """ `http://en.wikipedia.org/wiki/Gradient_boosting Gradient-Boosted Trees (GBTs)` learning algorithm for regression. 
@@ -925,6 +926,18 @@ class GBTRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, >>> test1 = sqlContext.createDataFrame([(Vectors.sparse(1, [0], [1.0]),)], ["features"]) >>> model.transform(test1).head().prediction 1.0 +>>> gbtr_path = temp_path + "gbtr" +>>> gbt.save(gbtr_path) +>>> gbt2 = GBTRegressor.load(gbtr_path) +>>> gbt2.getMaxDepth() +2 +>>> model_path = temp_path + "gbtr_model" +>>> model.save(model_path) +>>> model2 = GBTRegressionModel.load(model_path) +>>> model.featureImportances == model2.featureImportances +True +>>> model.treeWeights == model2.treeWeights +True .. versionadded:: 1.4.0 """ @@ -989,7 +1002,7 @@ class GBTRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, return
spark git commit: [SPARK-14275][SQL] Reimplement TypedAggregateExpression to DeclarativeAggregate
Repository: spark Updated Branches: refs/heads/master b5c60bcdc -> 297ba3f1b [SPARK-14275][SQL] Reimplement TypedAggregateExpression to DeclarativeAggregate ## What changes were proposed in this pull request? `ExpressionEncoder` is just a container for serialization and deserialization expressions; we can use these expressions to build `TypedAggregateExpression` directly, so that it can fit in `DeclarativeAggregate`, which is more efficient. One trick is, for each buffer serializer expression, it will reference the result object of serialization and function call. To avoid re-calculating this result object, we can serialize the buffer object to a single struct field, so that we can use a special `Expression` to only evaluate result object once. ## How was this patch tested? existing tests Author: Wenchen Fan Closes #12067 from cloud-fan/typed_udaf. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/297ba3f1 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/297ba3f1 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/297ba3f1 Branch: refs/heads/master Commit: 297ba3f1b49cc37d9891a529142c553e0a5e2d62 Parents: b5c60bc Author: Wenchen Fan Authored: Fri Apr 15 12:10:00 2016 +0800 Committer: Wenchen Fan Committed: Fri Apr 15 12:10:00 2016 +0800 -- .../sql/catalyst/expressions/Expression.scala | 2 +- .../expressions/ReferenceToExpressions.scala| 77 .../sql/catalyst/expressions/literals.scala | 3 +- .../org/apache/spark/sql/types/ObjectType.scala | 2 + .../scala/org/apache/spark/sql/Column.scala | 16 +- .../scala/org/apache/spark/sql/Dataset.scala| 4 +- .../spark/sql/KeyValueGroupedDataset.scala | 3 +- .../aggregate/TypedAggregateExpression.scala| 192 ++- .../spark/sql/expressions/Aggregator.scala | 6 +- .../org/apache/spark/sql/DatasetBenchmark.scala | 112 --- .../scala/org/apache/spark/sql/QueryTest.scala | 2 + .../sql/execution/WholeStageCodegenSuite.scala | 14 ++ 12 files changed, 303 
insertions(+), 130 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/297ba3f1/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala index a24a5db..718bb4b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala @@ -185,7 +185,7 @@ abstract class Expression extends TreeNode[Expression] { * Returns a user-facing string representation of this expression's name. * This should usually match the name of the function in SQL. */ - def prettyName: String = getClass.getSimpleName.toLowerCase + def prettyName: String = nodeName.toLowerCase private def flatArguments = productIterator.flatMap { case t: Traversable[_] => t http://git-wip-us.apache.org/repos/asf/spark/blob/297ba3f1/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ReferenceToExpressions.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ReferenceToExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ReferenceToExpressions.scala new file mode 100644 index 000..22645c9 --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ReferenceToExpressions.scala @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.expressions + +import org.apache.spark.sql.catalyst.InternalRow +import
spark git commit: [SPARK-14447][SQL] Speed up TungstenAggregate w/ keys using VectorizedHashMap
Repository: spark Updated Branches: refs/heads/master ff9ae61a3 -> b5c60bcdc [SPARK-14447][SQL] Speed up TungstenAggregate w/ keys using VectorizedHashMap ## What changes were proposed in this pull request? This patch speeds up group-by aggregates by around 3-5x by leveraging an in-memory `AggregateHashMap` (please see https://github.com/apache/spark/pull/12161), an append-only aggregate hash map that can act as a 'cache' for extremely fast key-value lookups while evaluating aggregates (and fall back to the `BytesToBytesMap` if a given key isn't found). Architecturally, it is backed by a power-of-2-sized array for index lookups and a columnar batch that stores the key-value pairs. The index lookups in the array rely on linear probing (with a small number of maximum tries) and use an inexpensive hash function which makes it really efficient for a majority of lookups. However, using linear probing and an inexpensive hash function also makes it less robust as compared to the `BytesToBytesMap` (especially for a large number of keys or even for certain distributions of keys) and requires us to fall back on the latter for correctness. ## How was this patch tested? Java HotSpot(TM) 64-Bit Server VM 1.8.0_73-b02 on Mac OS X 10.11.4 Intel(R) Core(TM) i7-4960HQ CPU 2.60GHz Aggregate w keys: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative --- codegen = F 2124 / 2204 9.9 101.3 1.0X codegen = T hashmap = F 1198 / 1364 17.5 57.1 1.8X codegen = T hashmap = T 369 / 600 56.8 17.6 5.8X Author: Sameer Agarwal Closes #12345 from sameeragarwal/tungsten-aggregate-integration. 
Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b5c60bcd Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b5c60bcd Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b5c60bcd Branch: refs/heads/master Commit: b5c60bcdca3bcace607b204a6c196a5386e8a896 Parents: ff9ae61 Author: Sameer Agarwal Authored: Thu Apr 14 20:57:03 2016 -0700 Committer: Yin Huai Committed: Thu Apr 14 20:57:03 2016 -0700 -- .../spark/sql/execution/WholeStageCodegen.scala | 1 + .../aggregate/ColumnarAggMapCodeGenerator.scala | 193 --- .../execution/aggregate/TungstenAggregate.scala | 227 - .../aggregate/TungstenAggregationIterator.scala | 8 +- .../aggregate/VectorizedHashMapGenerator.scala | 241 +++ .../org/apache/spark/sql/internal/SQLConf.scala | 9 + .../execution/BenchmarkWholeStageCodegen.scala | 34 ++- .../hive/execution/AggregationQuerySuite.scala | 47 ++-- 8 files changed, 479 insertions(+), 281 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b5c60bcd/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegen.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegen.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegen.scala index 447dbe7..29acc38 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegen.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/WholeStageCodegen.scala @@ -126,6 +126,7 @@ trait CodegenSupport extends SparkPlan { // outputVars will be used to generate the code for UnsafeRow, so we should copy them outputVars.map(_.copy()) } + val rowVar = if (row != null) { ExprCode("", "false", row) } else { http://git-wip-us.apache.org/repos/asf/spark/blob/b5c60bcd/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/ColumnarAggMapCodeGenerator.scala -- diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/ColumnarAggMapCodeGenerator.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/ColumnarAggMapCodeGenerator.scala deleted file mode 100644 index e415dd8..000 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/ColumnarAggMapCodeGenerator.scala +++ /dev/null @@ -1,193 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file
spark git commit: [SPARK-14601][DOC] Minor doc/usage changes related to removal of Spark assembly
Repository: spark Updated Branches: refs/heads/master c80586d9e -> ff9ae61a3 [SPARK-14601][DOC] Minor doc/usage changes related to removal of Spark assembly ## What changes were proposed in this pull request? Removing references to assembly jar in documentation. Adding an additional (previously undocumented) usage of spark-submit to run examples. ## How was this patch tested? Ran spark-submit usage to ensure formatting was fine. Ran examples using SparkSubmit. Author: Mark GroverCloses #12365 from markgrover/spark-14601. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ff9ae61a Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ff9ae61a Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ff9ae61a Branch: refs/heads/master Commit: ff9ae61a3b7bbbfc2aac93a99c05a9e1ea9c08bc Parents: c80586d Author: Mark Grover Authored: Thu Apr 14 18:51:43 2016 -0700 Committer: Reynold Xin Committed: Thu Apr 14 18:51:43 2016 -0700 -- bin/spark-class2.cmd | 2 +- core/src/main/scala/org/apache/spark/deploy/PythonRunner.scala | 2 +- .../scala/org/apache/spark/deploy/SparkSubmitArguments.scala | 3 ++- docs/building-spark.md | 2 +- docs/sql-programming-guide.md| 4 ++-- .../sql/execution/datasources/parquet/ParquetRelation.scala | 2 +- .../src/main/scala/org/apache/spark/sql/hive/HiveContext.scala | 2 +- 7 files changed, 9 insertions(+), 8 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/ff9ae61a/bin/spark-class2.cmd -- diff --git a/bin/spark-class2.cmd b/bin/spark-class2.cmd index 579efff..db68021 100644 --- a/bin/spark-class2.cmd +++ b/bin/spark-class2.cmd @@ -36,7 +36,7 @@ if exist "%SPARK_HOME%\RELEASE" ( ) if not exist "%SPARK_JARS_DIR%"\ ( - echo Failed to find Spark assembly JAR. + echo Failed to find Spark jars directory. echo You need to build Spark before running this program. 
exit /b 1 ) http://git-wip-us.apache.org/repos/asf/spark/blob/ff9ae61a/core/src/main/scala/org/apache/spark/deploy/PythonRunner.scala -- diff --git a/core/src/main/scala/org/apache/spark/deploy/PythonRunner.scala b/core/src/main/scala/org/apache/spark/deploy/PythonRunner.scala index c0a9e3f..6227a30 100644 --- a/core/src/main/scala/org/apache/spark/deploy/PythonRunner.scala +++ b/core/src/main/scala/org/apache/spark/deploy/PythonRunner.scala @@ -62,7 +62,7 @@ object PythonRunner { // ready to serve connections. thread.join() -// Build up a PYTHONPATH that includes the Spark assembly JAR (where this class is), the +// Build up a PYTHONPATH that includes the Spark assembly (where this class is), the // python directories in SPARK_HOME (if set), and any files in the pyFiles argument val pathElements = new ArrayBuffer[String] pathElements ++= formattedPyFiles http://git-wip-us.apache.org/repos/asf/spark/blob/ff9ae61a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala -- diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala index ec6d484..78da1b7 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala @@ -478,7 +478,8 @@ private[deploy] class SparkSubmitArguments(args: Seq[String], env: Map[String, S val command = sys.env.get("_SPARK_CMD_USAGE").getOrElse( """Usage: spark-submit [options] [app arguments] |Usage: spark-submit --kill [submission ID] --master [spark://...] -|Usage: spark-submit --status [submission ID] --master [spark://...]""".stripMargin) +|Usage: spark-submit --status [submission ID] --master [spark://...] 
+|Usage: spark-submit run-example [options] example-class [example args]""".stripMargin) outStream.println(command) val mem_mb = Utils.DEFAULT_DRIVER_MEM_MB http://git-wip-us.apache.org/repos/asf/spark/blob/ff9ae61a/docs/building-spark.md -- diff --git a/docs/building-spark.md b/docs/building-spark.md index 4066160..fec442a 100644 --- a/docs/building-spark.md +++ b/docs/building-spark.md @@ -192,7 +192,7 @@ If you have JDK 8 installed but it is not the system default, you can set JAVA_H # Packaging without Hadoop Dependencies for YARN -The assembly jar produced by `mvn package`
spark git commit: [SPARK-12869] Implemented an improved version of the toIndexedRowMatrix
Repository: spark Updated Branches: refs/heads/master 01dd1f5c0 -> c80586d9e [SPARK-12869] Implemented an improved version of the toIndexedRowMatrix Hi guys, I've implemented an improved version of the `toIndexedRowMatrix` function on the `BlockMatrix`. I needed this for a project, but would like to share it with the rest of the community. In the case of dense matrices, it can increase performance up to 19 times: https://github.com/Fokko/BlockMatrixToIndexedRowMatrix If there are any questions or suggestions, please let me know. Keep up the good work! Cheers. Author: Fokko DriesprongAuthor: Fokko Driesprong Closes #10839 from Fokko/master. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c80586d9 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c80586d9 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c80586d9 Branch: refs/heads/master Commit: c80586d9e820d19fc328b3e4c6f1c1439f5583a7 Parents: 01dd1f5 Author: Fokko Driesprong Authored: Thu Apr 14 17:32:20 2016 -0700 Committer: Xiangrui Meng Committed: Thu Apr 14 17:32:20 2016 -0700 -- .../mllib/linalg/distributed/BlockMatrix.scala | 34 .../linalg/distributed/BlockMatrixSuite.scala | 31 -- 2 files changed, 57 insertions(+), 8 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/c80586d9/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala index 89c332a..580d7a9 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala @@ -19,12 +19,12 @@ package org.apache.spark.mllib.linalg.distributed import scala.collection.mutable.ArrayBuffer -import breeze.linalg.{DenseMatrix => BDM, Matrix => BM} +import 
breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV, Matrix => BM, SparseVector => BSV, Vector => BV} import org.apache.spark.{Partitioner, SparkException} import org.apache.spark.annotation.Since import org.apache.spark.internal.Logging -import org.apache.spark.mllib.linalg.{DenseMatrix, Matrices, Matrix, SparseMatrix} +import org.apache.spark.mllib.linalg._ import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel @@ -264,13 +264,35 @@ class BlockMatrix @Since("1.3.0") ( new CoordinateMatrix(entryRDD, numRows(), numCols()) } + /** Converts to IndexedRowMatrix. The number of columns must be within the integer range. */ @Since("1.3.0") def toIndexedRowMatrix(): IndexedRowMatrix = { -require(numCols() < Int.MaxValue, "The number of columns must be within the integer range. " + - s"numCols: ${numCols()}") -// TODO: This implementation may be optimized -toCoordinateMatrix().toIndexedRowMatrix() +val cols = numCols().toInt + +require(cols < Int.MaxValue, s"The number of columns should be less than Int.MaxValue ($cols).") + +val rows = blocks.flatMap { case ((blockRowIdx, blockColIdx), mat) => + mat.rowIter.zipWithIndex.map { +case (vector, rowIdx) => + blockRowIdx * rowsPerBlock + rowIdx -> (blockColIdx, vector.toBreeze) + } +}.groupByKey().map { case (rowIdx, vectors) => + val numberNonZeroPerRow = vectors.map(_._2.activeSize).sum.toDouble / cols.toDouble + + val wholeVector = if (numberNonZeroPerRow <= 0.1) { // Sparse at 1/10th nnz +BSV.zeros[Double](cols) + } else { +BDV.zeros[Double](cols) + } + + vectors.foreach { case (blockColIdx: Int, vec: BV[Double]) => +val offset = colsPerBlock * blockColIdx +wholeVector(offset until offset + colsPerBlock) := vec + } + new IndexedRow(rowIdx, Vectors.fromBreeze(wholeVector)) +} +new IndexedRowMatrix(rows) } /** Collect the distributed matrix on the driver as a `DenseMatrix`. 
*/ http://git-wip-us.apache.org/repos/asf/spark/blob/c80586d9/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrixSuite.scala -- diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrixSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrixSuite.scala index f737d2c..f37eaf2 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrixSuite.scala +++
spark git commit: [SPARK-14565][ML] RandomForest should use parseInt and parseDouble for feature subset size instead of regexes
Repository: spark Updated Branches: refs/heads/master d7e124edf -> 01dd1f5c0 [SPARK-14565][ML] RandomForest should use parseInt and parseDouble for feature subset size instead of regexes ## What changes were proposed in this pull request? This fix tries to change RandomForest's supported strategies from using regexes to using parseInt and parseDouble, for the purpose of robustness and maintainability. ## How was this patch tested? Existing tests passed. Author: Yong TangCloses #12360 from yongtang/SPARK-14565. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/01dd1f5c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/01dd1f5c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/01dd1f5c Branch: refs/heads/master Commit: 01dd1f5c07f5c9ba91389c1556f911b028475cd3 Parents: d7e124e Author: Yong Tang Authored: Thu Apr 14 17:23:16 2016 -0700 Committer: Xiangrui Meng Committed: Thu Apr 14 17:23:16 2016 -0700 -- .../spark/ml/tree/impl/DecisionTreeMetadata.scala | 17 + .../org/apache/spark/ml/tree/treeParams.scala | 11 ++- .../org/apache/spark/mllib/tree/RandomForest.scala | 6 -- .../spark/ml/tree/impl/RandomForestSuite.scala | 4 ++-- 4 files changed, 25 insertions(+), 13 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/01dd1f5c/mllib/src/main/scala/org/apache/spark/ml/tree/impl/DecisionTreeMetadata.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/impl/DecisionTreeMetadata.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/impl/DecisionTreeMetadata.scala index c7cde15..5f7c40f 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tree/impl/DecisionTreeMetadata.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tree/impl/DecisionTreeMetadata.scala @@ -18,8 +18,10 @@ package org.apache.spark.ml.tree.impl import scala.collection.mutable +import scala.util.Try import org.apache.spark.internal.Logging +import org.apache.spark.ml.tree.RandomForestParams import 
org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.configuration.Algo._ import org.apache.spark.mllib.tree.configuration.QuantileStrategy._ @@ -184,15 +186,22 @@ private[spark] object DecisionTreeMetadata extends Logging { case _ => featureSubsetStrategy } -val isIntRegex = "^([1-9]\\d*)$".r -val isFractionRegex = "^(0?\\.\\d*[1-9]\\d*|1\\.0+)$".r val numFeaturesPerNode: Int = _featureSubsetStrategy match { case "all" => numFeatures case "sqrt" => math.sqrt(numFeatures).ceil.toInt case "log2" => math.max(1, (math.log(numFeatures) / math.log(2)).ceil.toInt) case "onethird" => (numFeatures / 3.0).ceil.toInt - case isIntRegex(number) => if (BigInt(number) > numFeatures) numFeatures else number.toInt - case isFractionRegex(fraction) => (fraction.toDouble * numFeatures).ceil.toInt + case _ => +Try(_featureSubsetStrategy.toInt).filter(_ > 0).toOption match { + case Some(value) => math.min(value, numFeatures) + case None => +Try(_featureSubsetStrategy.toDouble).filter(_ > 0).filter(_ <= 1.0).toOption match { + case Some(value) => math.ceil(value * numFeatures).toInt + case _ => throw new IllegalArgumentException(s"Supported values:" + +s" ${RandomForestParams.supportedFeatureSubsetStrategies.mkString(", ")}," + +s" (0.0-1.0], [1-n].") +} +} } new DecisionTreeMetadata(numFeatures, numExamples, numClasses, numBins.max, http://git-wip-us.apache.org/repos/asf/spark/blob/01dd1f5c/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala index b678391..d7559f8 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala @@ -17,6 +17,8 @@ package org.apache.spark.ml.tree +import scala.util.Try + import org.apache.spark.ml.PredictorParams import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ @@ 
-346,10 +348,12 @@ private[ml] trait HasFeatureSubsetStrategy extends Params { */ final val featureSubsetStrategy: Param[String] = new Param[String](this, "featureSubsetStrategy", "The number of features to consider for splits at each tree node." + - s" Supported options:
spark git commit: [SPARK-14545][SQL] Improve `LikeSimplification` by adding `a%b` rule
Repository: spark Updated Branches: refs/heads/master bc748b7b8 -> d7e124edf [SPARK-14545][SQL] Improve `LikeSimplification` by adding `a%b` rule ## What changes were proposed in this pull request? Current `LikeSimplification` handles the following four rules. - 'a%' => expr.StartsWith("a") - '%b' => expr.EndsWith("b") - '%a%' => expr.Contains("a") - 'a' => EqualTo("a") This PR adds the following rule. - 'a%b' => expr.Length() >= 2 && expr.StartsWith("a") && expr.EndsWith("b") Here, 2 is statically calculated from "a".size + "b".size. **Before** ``` scala> sql("select a from (select explode(array('abc','adc')) a) T where a like 'a%c'").explain() == Physical Plan == WholeStageCodegen : +- Filter a#5 LIKE a%c : +- INPUT +- Generate explode([abc,adc]), false, false, [a#5] +- Scan OneRowRelation[] ``` **After** ``` scala> sql("select a from (select explode(array('abc','adc')) a) T where a like 'a%c'").explain() == Physical Plan == WholeStageCodegen : +- Filter ((length(a#5) >= 2) && (StartsWith(a#5, a) && EndsWith(a#5, c))) : +- INPUT +- Generate explode([abc,adc]), false, false, [a#5] +- Scan OneRowRelation[] ``` ## How was this patch tested? Pass the Jenkins tests (including new testcase). Author: Dongjoon Hyun Closes #12312 from dongjoon-hyun/SPARK-14545. 
Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d7e124ed Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d7e124ed Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d7e124ed Branch: refs/heads/master Commit: d7e124edfe2578ecdf8e816a4dda3ce430a09172 Parents: bc748b7 Author: Dongjoon Hyun Authored: Thu Apr 14 13:34:29 2016 -0700 Committer: Reynold Xin Committed: Thu Apr 14 13:34:29 2016 -0700 -- .../sql/catalyst/optimizer/Optimizer.scala | 28 .../optimizer/LikeSimplificationSuite.scala | 14 ++ 2 files changed, 31 insertions(+), 11 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d7e124ed/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index aeb1842..f5172b2 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -517,22 +517,28 @@ object LikeSimplification extends Rule[LogicalPlan] { // Cases like "something\%" are not optimized, but this does not affect correctness. 
private val startsWith = "([^_%]+)%".r private val endsWith = "%([^_%]+)".r + private val startsAndEndsWith = "([^_%]+)%([^_%]+)".r private val contains = "%([^_%]+)%".r private val equalTo = "([^_%]*)".r def apply(plan: LogicalPlan): LogicalPlan = plan transformAllExpressions { -case Like(l, Literal(utf, StringType)) => - utf.toString match { -case startsWith(pattern) if !pattern.endsWith("\\") => - StartsWith(l, Literal(pattern)) -case endsWith(pattern) => - EndsWith(l, Literal(pattern)) -case contains(pattern) if !pattern.endsWith("\\") => - Contains(l, Literal(pattern)) -case equalTo(pattern) => - EqualTo(l, Literal(pattern)) +case Like(input, Literal(pattern, StringType)) => + pattern.toString match { +case startsWith(prefix) if !prefix.endsWith("\\") => + StartsWith(input, Literal(prefix)) +case endsWith(postfix) => + EndsWith(input, Literal(postfix)) +// 'a%a' pattern is basically same with 'a%' && '%a'. +// However, the additional `Length` condition is required to prevent 'a' match 'a%a'. +case startsAndEndsWith(prefix, postfix) if !prefix.endsWith("\\") => + And(GreaterThanOrEqual(Length(input), Literal(prefix.size + postfix.size)), +And(StartsWith(input, Literal(prefix)), EndsWith(input, Literal(postfix +case contains(infix) if !infix.endsWith("\\") => + Contains(input, Literal(infix)) +case equalTo(str) => + EqualTo(input, Literal(str)) case _ => - Like(l, Literal.create(utf, StringType)) + Like(input, Literal.create(pattern, StringType)) } } } http://git-wip-us.apache.org/repos/asf/spark/blob/d7e124ed/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/LikeSimplificationSuite.scala -- diff --git
spark git commit: [SPARK-14238][ML][MLLIB][PYSPARK] Add binary toggle Param to PySpark HashingTF in ML & MLlib
Repository: spark Updated Branches: refs/heads/master bf65c87f7 -> bc748b7b8 [SPARK-14238][ML][MLLIB][PYSPARK] Add binary toggle Param to PySpark HashingTF in ML & MLlib ## What changes were proposed in this pull request? This fix tries to add binary toggle Param to PySpark HashingTF in ML & MLlib. If this toggle is set, then all non-zero counts will be set to 1. Note: This fix (SPARK-14238) is extended from SPARK-13963 where Scala implementation was done. ## How was this patch tested? This fix adds two tests to cover the code changes. One for HashingTF in PySpark's ML and one for HashingTF in PySpark's MLLib. Author: Yong TangCloses #12079 from yongtang/SPARK-14238. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/bc748b7b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/bc748b7b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/bc748b7b Branch: refs/heads/master Commit: bc748b7b8f3b5aee28aff9ea078c216ca137a5b7 Parents: bf65c87 Author: Yong Tang Authored: Thu Apr 14 21:53:32 2016 +0200 Committer: Nick Pentreath Committed: Thu Apr 14 21:53:32 2016 +0200 -- python/pyspark/ml/feature.py| 24 ++-- python/pyspark/ml/tests.py | 19 +++ python/pyspark/mllib/feature.py | 13 - python/pyspark/mllib/tests.py | 16 4 files changed, 69 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/bc748b7b/python/pyspark/ml/feature.py -- diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 0b0c573..809a513 100644 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -536,14 +536,19 @@ class HashingTF(JavaTransformer, HasInputCol, HasOutputCol, HasNumFeatures, Java .. versionadded:: 1.3.0 """ +binary = Param(Params._dummy(), "binary", "If True, all non zero counts are set to 1. " + + "This is useful for discrete probabilistic models that model binary events " + + "rather than integer counts. 
Default False.", + typeConverter=TypeConverters.toBoolean) + @keyword_only -def __init__(self, numFeatures=1 << 18, inputCol=None, outputCol=None): +def __init__(self, numFeatures=1 << 18, binary=False, inputCol=None, outputCol=None): """ __init__(self, numFeatures=1 << 18, inputCol=None, outputCol=None) """ super(HashingTF, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.HashingTF", self.uid) -self._setDefault(numFeatures=1 << 18) +self._setDefault(numFeatures=1 << 18, binary=False) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -557,6 +562,21 @@ class HashingTF(JavaTransformer, HasInputCol, HasOutputCol, HasNumFeatures, Java kwargs = self.setParams._input_kwargs return self._set(**kwargs) +@since("2.0.0") +def setBinary(self, value): +""" +Sets the value of :py:attr:`binary`. +""" +self._paramMap[self.binary] = value +return self + +@since("2.0.0") +def getBinary(self): +""" +Gets the value of binary or its default value. +""" +return self.getOrDefault(self.binary) + @inherit_doc class IDF(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable): http://git-wip-us.apache.org/repos/asf/spark/blob/bc748b7b/python/pyspark/ml/tests.py -- diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py index 0b0ad23..86c0254 100644 --- a/python/pyspark/ml/tests.py +++ b/python/pyspark/ml/tests.py @@ -847,6 +847,25 @@ class TrainingSummaryTest(PySparkTestCase): self.assertAlmostEqual(sameSummary.areaUnderROC, s.areaUnderROC) +class HashingTFTest(PySparkTestCase): + +def test_apply_binary_term_freqs(self): +sqlContext = SQLContext(self.sc) + +df = sqlContext.createDataFrame([(0, ["a", "a", "b", "c", "c", "c"])], ["id", "words"]) +n = 100 +hashingTF = HashingTF() + hashingTF.setInputCol("words").setOutputCol("features").setNumFeatures(n).setBinary(True) +output = hashingTF.transform(df) +features = output.select("features").first().features.toArray() +expected = Vectors.sparse(n, {(ord("a") % n): 
1.0, + (ord("b") % n): 1.0, + (ord("c") % n): 1.0}).toArray() +for i in range(0, n): +self.assertAlmostEqual(features[i],
spark git commit: [SPARK-14618][ML][DOC] Updated RegressionEvaluator.metricName param doc
Repository: spark Updated Branches: refs/heads/branch-1.5 cb7a90ad5 -> 6043fa8df [SPARK-14618][ML][DOC] Updated RegressionEvaluator.metricName param doc ## What changes were proposed in this pull request? In Spark 1.4, we negated some metrics from RegressionEvaluator since CrossValidator always maximized metrics. This was fixed in 1.5, but the docs were not updated. This PR updates the docs. ## How was this patch tested? no tests Author: Joseph K. Bradley Closes #12377 from jkbradley/regeval-doc. (cherry picked from commit bf65c87f706019d235d7093637341668a13b1be1) Signed-off-by: Joseph K. Bradley Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6043fa8d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6043fa8d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6043fa8d Branch: refs/heads/branch-1.5 Commit: 6043fa8df2832ef7972e281462b3fd36147d34a2 Parents: cb7a90a Author: Joseph K. Bradley Authored: Thu Apr 14 12:44:59 2016 -0700 Committer: Joseph K. Bradley Committed: Thu Apr 14 12:45:27 2016 -0700 -- .../apache/spark/ml/evaluation/RegressionEvaluator.scala| 9 + 1 file changed, 5 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/6043fa8d/mllib/src/main/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.scala index d21c88a..fba8d4a 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.scala @@ -36,11 +36,12 @@ final class RegressionEvaluator(override val uid: String) def this() = this(Identifiable.randomUID("regEval")) /** - * param for metric name in evaluation (supports `"rmse"` (default), `"mse"`, `"r2"`, and `"mae"`) + * Param for metric name in evaluation. 
Supports: + * - `"rmse"` (default): root mean squared error + * - `"mse"`: mean squared error + * - `"r2"`: R^2^ metric + * - `"mae"`: mean absolute error * - * Because we will maximize evaluation value (ref: `CrossValidator`), - * when we evaluate a metric that is needed to minimize (e.g., `"rmse"`, `"mse"`, `"mae"`), - * we take and output the negative of this metric. * @group param */ val metricName: Param[String] = { - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-14618][ML][DOC] Updated RegressionEvaluator.metricName param doc
Repository: spark Updated Branches: refs/heads/branch-1.6 413d0600e -> 93c9a63ea [SPARK-14618][ML][DOC] Updated RegressionEvaluator.metricName param doc ## What changes were proposed in this pull request? In Spark 1.4, we negated some metrics from RegressionEvaluator since CrossValidator always maximized metrics. This was fixed in 1.5, but the docs were not updated. This PR updates the docs. ## How was this patch tested? no tests Author: Joseph K. Bradley Closes #12377 from jkbradley/regeval-doc. (cherry picked from commit bf65c87f706019d235d7093637341668a13b1be1) Signed-off-by: Joseph K. Bradley Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/93c9a63e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/93c9a63e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/93c9a63e Branch: refs/heads/branch-1.6 Commit: 93c9a63ea6b31f1c0208e73f3ce59f255c0460f5 Parents: 413d060 Author: Joseph K. Bradley Authored: Thu Apr 14 12:44:59 2016 -0700 Committer: Joseph K. 
Bradley Committed: Thu Apr 14 12:45:11 2016 -0700 -- .../apache/spark/ml/evaluation/RegressionEvaluator.scala| 9 + 1 file changed, 5 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/93c9a63e/mllib/src/main/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.scala index b6b25ec..807ab33 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.scala @@ -39,11 +39,12 @@ final class RegressionEvaluator @Since("1.4.0") (@Since("1.4.0") override val ui def this() = this(Identifiable.randomUID("regEval")) /** - * param for metric name in evaluation (supports `"rmse"` (default), `"mse"`, `"r2"`, and `"mae"`) + * Param for metric name in evaluation. Supports: + * - `"rmse"` (default): root mean squared error + * - `"mse"`: mean squared error + * - `"r2"`: R^2^ metric + * - `"mae"`: mean absolute error * - * Because we will maximize evaluation value (ref: `CrossValidator`), - * when we evaluate a metric that is needed to minimize (e.g., `"rmse"`, `"mse"`, `"mae"`), - * we take and output the negative of this metric. * @group param */ @Since("1.4.0") - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-14618][ML][DOC] Updated RegressionEvaluator.metricName param doc
Repository: spark Updated Branches: refs/heads/master c5172f820 -> bf65c87f7 [SPARK-14618][ML][DOC] Updated RegressionEvaluator.metricName param doc ## What changes were proposed in this pull request? In Spark 1.4, we negated some metrics from RegressionEvaluator since CrossValidator always maximized metrics. This was fixed in 1.5, but the docs were not updated. This PR updates the docs. ## How was this patch tested? no tests Author: Joseph K. Bradley Closes #12377 from jkbradley/regeval-doc. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/bf65c87f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/bf65c87f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/bf65c87f Branch: refs/heads/master Commit: bf65c87f706019d235d7093637341668a13b1be1 Parents: c5172f8 Author: Joseph K. Bradley Authored: Thu Apr 14 12:44:59 2016 -0700 Committer: Joseph K. Bradley Committed: Thu Apr 14 12:44:59 2016 -0700 -- .../apache/spark/ml/evaluation/RegressionEvaluator.scala| 9 + 1 file changed, 5 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/bf65c87f/mllib/src/main/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.scala index 4134e2d..ed04b67 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.scala @@ -39,11 +39,12 @@ final class RegressionEvaluator @Since("1.4.0") (@Since("1.4.0") override val ui def this() = this(Identifiable.randomUID("regEval")) /** - * param for metric name in evaluation (supports `"rmse"` (default), `"mse"`, `"r2"`, and `"mae"`) + * Param for metric name in evaluation. 
Supports: + * - `"rmse"` (default): root mean squared error + * - `"mse"`: mean squared error + * - `"r2"`: R^2^ metric + * - `"mae"`: mean absolute error * - * Because we will maximize evaluation value (ref: `CrossValidator`), - * when we evaluate a metric that is needed to minimize (e.g., `"rmse"`, `"mse"`, `"mae"`), - * we take and output the negative of this metric. * @group param */ @Since("1.4.0") - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-13967][PYSPARK][ML] Added binary Param to Python CountVectorizer
Repository: spark Updated Branches: refs/heads/master 28efdd3fd -> c5172f820 [SPARK-13967][PYSPARK][ML] Added binary Param to Python CountVectorizer Added binary toggle param to CountVectorizer feature transformer in PySpark. Created a unit test for using CountVectorizer with the binary toggle on. Author: Bryan CutlerCloses #12308 from BryanCutler/binary-param-python-CountVectorizer-SPARK-13967. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c5172f82 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c5172f82 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c5172f82 Branch: refs/heads/master Commit: c5172f8205beabe58c0b5392c0d83f9fb9c27f18 Parents: 28efdd3 Author: Bryan Cutler Authored: Thu Apr 14 20:47:31 2016 +0200 Committer: Nick Pentreath Committed: Thu Apr 14 20:47:31 2016 +0200 -- python/pyspark/ml/feature.py | 34 +- python/pyspark/ml/tests.py | 16 2 files changed, 45 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/c5172f82/python/pyspark/ml/feature.py -- diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 86b5328..0b0c573 100644 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -256,24 +256,33 @@ class CountVectorizer(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, vocabSize = Param( Params._dummy(), "vocabSize", "max size of the vocabulary. Default 1 << 18.", typeConverter=TypeConverters.toInt) +binary = Param( +Params._dummy(), "binary", "Binary toggle to control the output vector values." + +" If True, all nonzero counts (after minTF filter applied) are set to 1. This is useful" + +" for discrete probabilistic models that model binary events rather than integer counts." 
+ +" Default False", typeConverter=TypeConverters.toBoolean) @keyword_only -def __init__(self, minTF=1.0, minDF=1.0, vocabSize=1 << 18, inputCol=None, outputCol=None): +def __init__(self, minTF=1.0, minDF=1.0, vocabSize=1 << 18, binary=False, inputCol=None, + outputCol=None): """ -__init__(self, minTF=1.0, minDF=1.0, vocabSize=1 << 18, inputCol=None, outputCol=None) +__init__(self, minTF=1.0, minDF=1.0, vocabSize=1 << 18, binary=False, inputCol=None,\ + outputCol=None) """ super(CountVectorizer, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.CountVectorizer", self.uid) -self._setDefault(minTF=1.0, minDF=1.0, vocabSize=1 << 18) +self._setDefault(minTF=1.0, minDF=1.0, vocabSize=1 << 18, binary=False) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @keyword_only @since("1.6.0") -def setParams(self, minTF=1.0, minDF=1.0, vocabSize=1 << 18, inputCol=None, outputCol=None): +def setParams(self, minTF=1.0, minDF=1.0, vocabSize=1 << 18, binary=False, inputCol=None, + outputCol=None): """ -setParams(self, minTF=1.0, minDF=1.0, vocabSize=1 << 18, inputCol=None, outputCol=None) +setParams(self, minTF=1.0, minDF=1.0, vocabSize=1 << 18, binary=False, inputCol=None,\ + outputCol=None) Set the params for the CountVectorizer """ kwargs = self.setParams._input_kwargs @@ -324,6 +333,21 @@ class CountVectorizer(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, """ return self.getOrDefault(self.vocabSize) +@since("2.0.0") +def setBinary(self, value): +""" +Sets the value of :py:attr:`binary`. +""" +self._paramMap[self.binary] = value +return self + +@since("2.0.0") +def getBinary(self): +""" +Gets the value of binary or its default value. 
+""" +return self.getOrDefault(self.binary) + def _create_model(self, java_model): return CountVectorizerModel(java_model) http://git-wip-us.apache.org/repos/asf/spark/blob/c5172f82/python/pyspark/ml/tests.py -- diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py index bcbeacb..0b0ad23 100644 --- a/python/pyspark/ml/tests.py +++ b/python/pyspark/ml/tests.py @@ -406,6 +406,22 @@ class FeatureTests(PySparkTestCase): transformedDF = stopWordRemover.transform(dataset) self.assertEqual(transformedDF.head().output, ["a"]) +def test_count_vectorizer_with_binary(self): +
spark git commit: [SPARK-14592][SQL] Native support for CREATE TABLE LIKE DDL command
Repository: spark Updated Branches: refs/heads/master c971aee40 -> 28efdd3fd [SPARK-14592][SQL] Native support for CREATE TABLE LIKE DDL command ## What changes were proposed in this pull request? JIRA: https://issues.apache.org/jira/browse/SPARK-14592 This patch adds native support for DDL command `CREATE TABLE LIKE`. The SQL syntax is like: CREATE TABLE table_name LIKE existing_table CREATE TABLE IF NOT EXISTS table_name LIKE existing_table ## How was this patch tested? `HiveDDLCommandSuite`. `HiveQuerySuite` already tests `CREATE TABLE LIKE`. Author: Liang-Chi HsiehThis patch had conflicts when merged, resolved by Committer: Andrew Or Closes #12362 from viirya/create-table-like. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/28efdd3f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/28efdd3f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/28efdd3f Branch: refs/heads/master Commit: 28efdd3fd789fa2ebed5be03b36ca0f682e37669 Parents: c971aee Author: Liang-Chi Hsieh Authored: Thu Apr 14 11:08:08 2016 -0700 Committer: Andrew Or Committed: Thu Apr 14 11:08:08 2016 -0700 -- .../apache/spark/sql/catalyst/parser/SqlBase.g4 | 7 ++-- .../spark/sql/execution/command/tables.scala| 40 +++- .../hive/execution/HiveCompatibilitySuite.scala | 4 +- .../sql/hive/execution/HiveSqlParser.scala | 13 ++- .../spark/sql/hive/HiveDDLCommandSuite.scala| 24 +++- 5 files changed, 79 insertions(+), 9 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/28efdd3f/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 -- diff --git a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 index a937ad1..9cf2dd2 100644 --- a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 +++ 
b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 @@ -55,6 +55,8 @@ statement rowFormat? createFileFormat? locationSpec? (TBLPROPERTIES tablePropertyList)? (AS? query)? #createTable +| CREATE TABLE (IF NOT EXISTS)? target=tableIdentifier +LIKE source=tableIdentifier #createTableLike | ANALYZE TABLE tableIdentifier partitionSpec? COMPUTE STATISTICS (identifier | FOR COLUMNS identifierSeq?)? #analyze | ALTER (TABLE | VIEW) from=tableIdentifier @@ -136,10 +138,7 @@ statement ; hiveNativeCommands -: createTableHeader LIKE tableIdentifier -rowFormat? createFileFormat? locationSpec? -(TBLPROPERTIES tablePropertyList)? -| DELETE FROM tableIdentifier (WHERE booleanExpression)? +: DELETE FROM tableIdentifier (WHERE booleanExpression)? | TRUNCATE TABLE tableIdentifier partitionSpec? (COLUMNS identifierList)? | SHOW COLUMNS (FROM | IN) tableIdentifier ((FROM|IN) identifier)? http://git-wip-us.apache.org/repos/asf/spark/blob/28efdd3f/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala index e315598..0b41985 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala @@ -17,9 +17,45 @@ package org.apache.spark.sql.execution.command -import org.apache.spark.sql.{Row, SQLContext} +import org.apache.spark.sql.{AnalysisException, Row, SQLContext} import org.apache.spark.sql.catalyst.TableIdentifier -import org.apache.spark.sql.catalyst.catalog.CatalogTable +import org.apache.spark.sql.catalyst.catalog.{CatalogTable, CatalogTableType} + +/** + * A command to create a table with the same definition of the given existing table. 
+ * + * The syntax of using this command in SQL is: + * {{{ + * CREATE TABLE [IF NOT EXISTS] [db_name.]table_name + * LIKE [other_db_name.]existing_table_name + * }}} + */ +case class CreateTableLike( +targetTable: TableIdentifier, +sourceTable: TableIdentifier, +ifNotExists: Boolean) extends RunnableCommand { + + override def run(sqlContext: SQLContext): Seq[Row] = { +val catalog = sqlContext.sessionState.catalog +if (!catalog.tableExists(sourceTable)) { + throw new AnalysisException( +
spark git commit: [SPARK-14499][SQL][TEST] Drop Partition Does Not Delete Data of External Tables
Repository: spark Updated Branches: refs/heads/master 1d04c86fc -> c971aee40 [SPARK-14499][SQL][TEST] Drop Partition Does Not Delete Data of External Tables What changes were proposed in this pull request? This PR is to add a test to ensure drop partitions of an external table will not delete data. cc yhuai andrewor14 How was this patch tested? N/A Author: gatorsmileThis patch had conflicts when merged, resolved by Committer: Andrew Or Closes #12350 from gatorsmile/testDropPartition. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c971aee4 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c971aee4 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c971aee4 Branch: refs/heads/master Commit: c971aee40d806ed02d3d6a5cc478b63654052e54 Parents: 1d04c86 Author: gatorsmile Authored: Thu Apr 14 11:03:19 2016 -0700 Committer: Andrew Or Committed: Thu Apr 14 11:03:19 2016 -0700 -- .../spark/sql/hive/execution/HiveDDLSuite.scala | 67 1 file changed, 67 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/c971aee4/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala -- diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala index 249dcdf..206d911 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql.hive.execution +import java.io.File + import org.apache.hadoop.fs.Path import org.apache.spark.sql.{AnalysisException, QueryTest, SaveMode} @@ -126,6 +128,71 @@ class HiveDDLSuite extends QueryTest with SQLTestUtils with TestHiveSingleton { } } + test("add/drop partitions - external table") { +val catalog = hiveContext.sessionState.catalog +withTempDir { tmpDir => + val basePath = 
tmpDir.getCanonicalPath + val partitionPath_1stCol_part1 = new File(basePath + "/ds=2008-04-08") + val partitionPath_1stCol_part2 = new File(basePath + "/ds=2008-04-09") + val partitionPath_part1 = new File(basePath + "/ds=2008-04-08/hr=11") + val partitionPath_part2 = new File(basePath + "/ds=2008-04-09/hr=11") + val partitionPath_part3 = new File(basePath + "/ds=2008-04-08/hr=12") + val partitionPath_part4 = new File(basePath + "/ds=2008-04-09/hr=12") + val dirSet = +tmpDir :: partitionPath_1stCol_part1 :: partitionPath_1stCol_part2 :: + partitionPath_part1 :: partitionPath_part2 :: partitionPath_part3 :: + partitionPath_part4 :: Nil + + val externalTab = "extTable_with_partitions" + withTable(externalTab) { +assert(tmpDir.listFiles.isEmpty) +sql( + s""" + |CREATE EXTERNAL TABLE $externalTab (key INT, value STRING) + |PARTITIONED BY (ds STRING, hr STRING) + |LOCATION '$basePath' + """.stripMargin) + +// Before data insertion, all the directory are empty +assert(dirSet.forall(dir => dir.listFiles == null || dir.listFiles.isEmpty)) + +for (ds <- Seq("2008-04-08", "2008-04-09"); hr <- Seq("11", "12")) { + sql( +s""" + |INSERT OVERWRITE TABLE $externalTab + |partition (ds='$ds',hr='$hr') + |SELECT 1, 'a' + """.stripMargin) +} + +val hiveTable = catalog.getTableMetadata(TableIdentifier(externalTab, Some("default"))) +assert(hiveTable.tableType == CatalogTableType.EXTERNAL_TABLE) +// After data insertion, all the directory are not empty +assert(dirSet.forall(dir => dir.listFiles.nonEmpty)) + +sql( + s""" + |ALTER TABLE $externalTab DROP PARTITION (ds='2008-04-08'), + |PARTITION (ds='2008-04-09', hr='12') + """.stripMargin) + assert(catalog.listPartitions(TableIdentifier(externalTab)).map(_.spec).toSet == + Set(Map("ds" -> "2008-04-09", "hr" -> "11"))) +// drop partition will not delete the data of external table +assert(dirSet.forall(dir => dir.listFiles.nonEmpty)) + +sql(s"ALTER TABLE $externalTab ADD PARTITION (ds='2008-04-08', hr='12')") + 
assert(catalog.listPartitions(TableIdentifier(externalTab)).map(_.spec).toSet == + Set(Map("ds" -> "2008-04-08", "hr" -> "12"), Map("ds" -> "2008-04-09", "hr" -> "11"))) +// add partition will not delete the data +
spark git commit: [SPARK-14558][CORE] In ClosureCleaner, clean the outer pointer if it's a REPL line object
Repository: spark Updated Branches: refs/heads/master a46f98d3f -> 1d04c86fc [SPARK-14558][CORE] In ClosureCleaner, clean the outer pointer if it's a REPL line object ## What changes were proposed in this pull request? When we clean a closure, if its outermost parent is not a closure, we won't clone and clean it as cloning user's objects is dangerous. However, if it's a REPL line object, which may carry a lot of unnecessary references (like hadoop conf, spark conf, etc.), we should clean it as it's not a user object. This PR improves the check for user's objects to exclude REPL line object. ## How was this patch tested? existing tests. Author: Wenchen Fan Closes #12327 from cloud-fan/closure. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1d04c86f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1d04c86f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1d04c86f Branch: refs/heads/master Commit: 1d04c86fc575470e15f6667076377cea102552d7 Parents: a46f98d Author: Wenchen Fan Authored: Thu Apr 14 10:58:06 2016 -0700 Committer: Andrew Or Committed: Thu Apr 14 10:58:06 2016 -0700 -- .../org/apache/spark/util/ClosureCleaner.scala | 53 +--- .../scala/org/apache/spark/repl/ReplSuite.scala | 27 ++ 2 files changed, 50 insertions(+), 30 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/1d04c86f/core/src/main/scala/org/apache/spark/util/ClosureCleaner.scala -- diff --git a/core/src/main/scala/org/apache/spark/util/ClosureCleaner.scala b/core/src/main/scala/org/apache/spark/util/ClosureCleaner.scala index 2f6924f..489688c 100644 --- a/core/src/main/scala/org/apache/spark/util/ClosureCleaner.scala +++ b/core/src/main/scala/org/apache/spark/util/ClosureCleaner.scala @@ -19,7 +19,8 @@ package org.apache.spark.util import java.io.{ByteArrayInputStream, ByteArrayOutputStream} -import scala.collection.mutable.{Map, Set} +import scala.collection.mutable.{Map, Set, Stack} +import 
scala.language.existentials import org.apache.xbean.asm5.{ClassReader, ClassVisitor, MethodVisitor, Type} import org.apache.xbean.asm5.Opcodes._ @@ -77,35 +78,19 @@ private[spark] object ClosureCleaner extends Logging { */ private def getInnerClosureClasses(obj: AnyRef): List[Class[_]] = { val seen = Set[Class[_]](obj.getClass) -var stack = List[Class[_]](obj.getClass) +val stack = Stack[Class[_]](obj.getClass) while (!stack.isEmpty) { - val cr = getClassReader(stack.head) - stack = stack.tail + val cr = getClassReader(stack.pop()) val set = Set[Class[_]]() cr.accept(new InnerClosureFinder(set), 0) for (cls <- set -- seen) { seen += cls -stack = cls :: stack +stack.push(cls) } } (seen - obj.getClass).toList } - private def createNullValue(cls: Class[_]): AnyRef = { -if (cls.isPrimitive) { - cls match { -case java.lang.Boolean.TYPE => new java.lang.Boolean(false) -case java.lang.Character.TYPE => new java.lang.Character('\u') -case java.lang.Void.TYPE => - // This should not happen because `Foo(void x) {}` does not compile. - throw new IllegalStateException("Unexpected void parameter in constructor") -case _ => new java.lang.Byte(0: Byte) - } -} else { - null -} - } - /** * Clean the given closure in place. * @@ -233,16 +218,24 @@ private[spark] object ClosureCleaner extends Logging { // Note that all outer objects but the outermost one (first one in this list) must be closures var outerPairs: List[(Class[_], AnyRef)] = (outerClasses zip outerObjects).reverse var parent: AnyRef = null -if (outerPairs.size > 0 && !isClosure(outerPairs.head._1)) { - // The closure is ultimately nested inside a class; keep the object of that - // class without cloning it since we don't want to clone the user's objects. - // Note that we still need to keep around the outermost object itself because - // we need it to clone its child closure later (see below). - logDebug(s" + outermost object is not a closure, so do not clone it: ${outerPairs.head}") - parent = outerPairs.head._2 // e.g. 
SparkContext - outerPairs = outerPairs.tail -} else if (outerPairs.size > 0) { - logDebug(s" + outermost object is a closure, so we just keep it: ${outerPairs.head}") +if (outerPairs.size > 0) { + val (outermostClass, outermostObject) = outerPairs.head + if (isClosure(outermostClass)) { +logDebug(s" + outermost object is a closure, so we clone it: ${outerPairs.head}") +
spark git commit: [SPARK-14617] Remove deprecated APIs in TaskMetrics
Repository: spark Updated Branches: refs/heads/master dac40b68d -> a46f98d3f [SPARK-14617] Remove deprecated APIs in TaskMetrics ## What changes were proposed in this pull request? This patch removes some of the deprecated APIs in TaskMetrics. This is part of my bigger effort to simplify accumulators and task metrics. ## How was this patch tested? N/A - only removals Author: Reynold XinCloses #12375 from rxin/SPARK-14617. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a46f98d3 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a46f98d3 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a46f98d3 Branch: refs/heads/master Commit: a46f98d3f4ba6a79f4ef789806fec80a7d4f342d Parents: dac40b6 Author: Reynold Xin Authored: Thu Apr 14 10:56:13 2016 -0700 Committer: Andrew Or Committed: Thu Apr 14 10:56:13 2016 -0700 -- .../apache/spark/executor/InputMetrics.scala| 32 ++-- .../apache/spark/executor/OutputMetrics.scala | 30 -- .../spark/executor/ShuffleReadMetrics.scala | 21 + .../org/apache/spark/executor/TaskMetrics.scala | 27 + .../scala/org/apache/spark/rdd/HadoopRDD.scala | 4 +-- .../org/apache/spark/rdd/NewHadoopRDD.scala | 4 +-- .../main/scala/org/apache/spark/rdd/RDD.scala | 4 +-- .../org/apache/spark/util/JsonProtocol.scala| 4 +-- .../spark/executor/TaskMetricsSuite.scala | 4 +-- .../apache/spark/util/JsonProtocolSuite.scala | 2 +- project/MimaExcludes.scala | 5 ++- 11 files changed, 40 insertions(+), 97 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/a46f98d3/core/src/main/scala/org/apache/spark/executor/InputMetrics.scala -- diff --git a/core/src/main/scala/org/apache/spark/executor/InputMetrics.scala b/core/src/main/scala/org/apache/spark/executor/InputMetrics.scala index 6d30d3c..83e11c5 100644 --- a/core/src/main/scala/org/apache/spark/executor/InputMetrics.scala +++ b/core/src/main/scala/org/apache/spark/executor/InputMetrics.scala @@ -81,35 +81,9 @@ class 
InputMetrics private ( */ def readMethod: DataReadMethod.Value = DataReadMethod.withName(_readMethod.localValue) - // Once incBytesRead & intRecordsRead is ready to be removed from the public API - // we can remove the internal versions and make the previous public API private. - // This has been done to suppress warnings when building. - @deprecated("incrementing input metrics is for internal use only", "2.0.0") - def incBytesRead(v: Long): Unit = _bytesRead.add(v) - private[spark] def incBytesReadInternal(v: Long): Unit = _bytesRead.add(v) - @deprecated("incrementing input metrics is for internal use only", "2.0.0") - def incRecordsRead(v: Long): Unit = _recordsRead.add(v) - private[spark] def incRecordsReadInternal(v: Long): Unit = _recordsRead.add(v) + private[spark] def incBytesRead(v: Long): Unit = _bytesRead.add(v) + private[spark] def incRecordsRead(v: Long): Unit = _recordsRead.add(v) private[spark] def setBytesRead(v: Long): Unit = _bytesRead.setValue(v) - private[spark] def setReadMethod(v: DataReadMethod.Value): Unit = -_readMethod.setValue(v.toString) + private[spark] def setReadMethod(v: DataReadMethod.Value): Unit = _readMethod.setValue(v.toString) } - -/** - * Deprecated methods to preserve case class matching behavior before Spark 2.0. 
- */ -object InputMetrics { - - @deprecated("matching on InputMetrics will not be supported in the future", "2.0.0") - def apply(readMethod: DataReadMethod.Value): InputMetrics = { -val im = new InputMetrics -im.setReadMethod(readMethod) -im - } - - @deprecated("matching on InputMetrics will not be supported in the future", "2.0.0") - def unapply(input: InputMetrics): Option[DataReadMethod.Value] = { -Some(input.readMethod) - } -} http://git-wip-us.apache.org/repos/asf/spark/blob/a46f98d3/core/src/main/scala/org/apache/spark/executor/OutputMetrics.scala -- diff --git a/core/src/main/scala/org/apache/spark/executor/OutputMetrics.scala b/core/src/main/scala/org/apache/spark/executor/OutputMetrics.scala index 0b37d55..93f9538 100644 --- a/core/src/main/scala/org/apache/spark/executor/OutputMetrics.scala +++ b/core/src/main/scala/org/apache/spark/executor/OutputMetrics.scala @@ -52,18 +52,6 @@ class OutputMetrics private ( } /** - * Create a new [[OutputMetrics]] that is not associated with any particular task. - * - * This is only used for preserving matching behavior on [[OutputMetrics]], which used to be - * a case class before Spark 2.0. Once we
spark git commit: [SPARK-14619] Track internal accumulators (metrics) by stage attempt
Repository: spark Updated Branches: refs/heads/master 9fa43a33b -> dac40b68d [SPARK-14619] Track internal accumulators (metrics) by stage attempt ## What changes were proposed in this pull request? When there are multiple attempts for a stage, we currently only reset internal accumulator values if all the tasks are resubmitted. It would make more sense to reset the accumulator values for each stage attempt. This will allow us to eventually get rid of the internal flag in the Accumulator class. This is part of my bigger effort to simplify accumulators and task metrics. ## How was this patch tested? Covered by existing tests. Author: Reynold Xin Closes #12378 from rxin/SPARK-14619. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/dac40b68 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/dac40b68 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/dac40b68 Branch: refs/heads/master Commit: dac40b68dc52d5ab855dfde63f0872064aa3d999 Parents: 9fa43a3 Author: Reynold Xin Authored: Thu Apr 14 10:54:57 2016 -0700 Committer: Andrew Or Committed: Thu Apr 14 10:54:57 2016 -0700 -- .../org/apache/spark/InternalAccumulator.scala | 2 +- .../apache/spark/scheduler/DAGScheduler.scala| 11 ++- .../scala/org/apache/spark/scheduler/Stage.scala | 19 ++- .../org/apache/spark/scheduler/StageInfo.scala | 10 +- .../scala/org/apache/spark/ui/jobs/JobPage.scala | 2 +- .../org/apache/spark/util/JsonProtocol.scala | 6 -- .../spark/ExecutorAllocationManagerSuite.scala | 4 ++-- .../scala/org/apache/spark/ShuffleSuite.scala| 6 +++--- .../spark/scheduler/DAGSchedulerSuite.scala | 2 +- .../sql/execution/UnsafeRowSerializerSuite.scala | 2 +- 10 files changed, 26 insertions(+), 38 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/dac40b68/core/src/main/scala/org/apache/spark/InternalAccumulator.scala -- diff --git a/core/src/main/scala/org/apache/spark/InternalAccumulator.scala 
b/core/src/main/scala/org/apache/spark/InternalAccumulator.scala index 7aa9057..0dd4ec6 100644 --- a/core/src/main/scala/org/apache/spark/InternalAccumulator.scala +++ b/core/src/main/scala/org/apache/spark/InternalAccumulator.scala @@ -187,7 +187,7 @@ private[spark] object InternalAccumulator { * add to the same set of accumulators. We do this to report the distribution of accumulator * values across all tasks within each stage. */ - def create(sc: SparkContext): Seq[Accumulator[_]] = { + def createAll(sc: SparkContext): Seq[Accumulator[_]] = { val accums = createAll() accums.foreach { accum => Accumulators.register(accum) http://git-wip-us.apache.org/repos/asf/spark/blob/dac40b68/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala -- diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala index 4609b24..c27aad2 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala @@ -950,13 +950,6 @@ class DAGScheduler( // First figure out the indexes of partition ids to compute. val partitionsToCompute: Seq[Int] = stage.findMissingPartitions() -// Create internal accumulators if the stage has no accumulators initialized. -// Reset internal accumulators only if this stage is not partially submitted -// Otherwise, we may override existing accumulator values from some tasks -if (stage.internalAccumulators.isEmpty || stage.numPartitions == partitionsToCompute.size) { - stage.resetInternalAccumulators() -} - // Use the scheduling pool, job group, description, etc. 
from an ActiveJob associated // with this Stage val properties = jobIdToActiveJob(jobId).properties @@ -1036,7 +1029,7 @@ class DAGScheduler( val locs = taskIdToLocations(id) val part = stage.rdd.partitions(id) new ShuffleMapTask(stage.id, stage.latestInfo.attemptId, - taskBinary, part, locs, stage.internalAccumulators, properties) + taskBinary, part, locs, stage.latestInfo.internalAccumulators, properties) } case stage: ResultStage => @@ -1046,7 +1039,7 @@ class DAGScheduler( val part = stage.rdd.partitions(p) val locs = taskIdToLocations(id) new ResultTask(stage.id, stage.latestInfo.attemptId, - taskBinary, part, locs, id, properties, stage.internalAccumulators) +
spark git commit: [SPARK-14612][ML] Consolidate the version of dependencies in mllib and mllib-local into one place
Repository: spark Updated Branches: refs/heads/master 3e27940a1 -> 9fa43a33b [SPARK-14612][ML] Consolidate the version of dependencies in mllib and mllib-local into one place ## What changes were proposed in this pull request? Move json4s, breeze dependency declaration into parent ## How was this patch tested? Should be no functional change, but Jenkins tests will test that. Author: Sean Owen Closes #12390 from srowen/SPARK-14612. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9fa43a33 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9fa43a33 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9fa43a33 Branch: refs/heads/master Commit: 9fa43a33b91c3a9b6be39bf3e00febf61a4b5b59 Parents: 3e27940 Author: Sean Owen Authored: Thu Apr 14 10:48:17 2016 -0700 Committer: DB Tsai Committed: Thu Apr 14 10:48:17 2016 -0700 -- core/pom.xml| 1 - mllib-local/pom.xml | 13 - mllib/pom.xml | 13 - pom.xml | 22 ++ 4 files changed, 22 insertions(+), 27 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/9fa43a33/core/pom.xml -- diff --git a/core/pom.xml b/core/pom.xml index 4c7e3a3..7349ad3 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -192,7 +192,6 @@ org.json4s json4s-jackson_${scala.binary.version} - 3.2.10 com.sun.jersey http://git-wip-us.apache.org/repos/asf/spark/blob/9fa43a33/mllib-local/pom.xml -- diff --git a/mllib-local/pom.xml b/mllib-local/pom.xml index c56561f..68f15dd 100644 --- a/mllib-local/pom.xml +++ b/mllib-local/pom.xml @@ -38,19 +38,6 @@ org.scalanlp breeze_${scala.binary.version} - 0.11.2 - - - - junit - junit - - - org.apache.commons - commons-math3 - - org.apache.commons http://git-wip-us.apache.org/repos/asf/spark/blob/9fa43a33/mllib/pom.xml -- diff --git a/mllib/pom.xml b/mllib/pom.xml index e56eafc..24d8274 100644 --- a/mllib/pom.xml +++ b/mllib/pom.xml @@ -77,19 +77,6 @@ org.scalanlp breeze_${scala.binary.version} - 0.11.2 - - - - junit - junit - - - 
org.apache.commons - commons-math3 - - org.apache.commons http://git-wip-us.apache.org/repos/asf/spark/blob/9fa43a33/pom.xml -- diff --git a/pom.xml b/pom.xml index 4585c8b..a772d51 100644 --- a/pom.xml +++ b/pom.xml @@ -585,6 +585,28 @@ ${hadoop.deps.scope} +org.scalanlp +breeze_${scala.binary.version} +0.11.2 + + + +junit +junit + + +org.apache.commons +commons-math3 + + + + +org.json4s +json4s-jackson_${scala.binary.version} +3.2.10 + + com.sun.jersey jersey-json ${jersey.version} - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-14630][BUILD][CORE][SQL][STREAMING] Code style: public abstract methods should have explicit return types
Repository: spark Updated Branches: refs/heads/master de2ad5285 -> 3e27940a1 [SPARK-14630][BUILD][CORE][SQL][STREAMING] Code style: public abstract methods should have explicit return types ## What changes were proposed in this pull request? Currently many public abstract methods (in abstract classes as well as traits) don't declare return types explicitly, such as in [o.a.s.streaming.dstream.InputDStream](https://github.com/apache/spark/blob/master/streaming/src/main/scala/org/apache/spark/streaming/dstream/InputDStream.scala#L110): ```scala def start() // should be: def start(): Unit def stop() // should be: def stop(): Unit ``` These methods exist in core, sql, streaming; this PR fixes them. ## How was this patch tested? N/A ## Which piece of scala style rule led to the changes? the rule was added separately in https://github.com/apache/spark/pull/12396 Author: Liwei Lin Closes #12389 from lw-lin/public-abstract-methods. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3e27940a Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3e27940a Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3e27940a Branch: refs/heads/master Commit: 3e27940a19e7bab448f1af11d2065ecd1ec66197 Parents: de2ad52 Author: Liwei Lin Authored: Thu Apr 14 10:14:38 2016 -0700 Committer: Reynold Xin Committed: Thu Apr 14 10:14:38 2016 -0700 -- core/src/main/scala/org/apache/spark/ContextCleaner.scala | 10 +- core/src/main/scala/org/apache/spark/FutureAction.scala | 4 ++-- .../apache/spark/deploy/client/AppClientListener.scala| 3 ++- .../apache/spark/deploy/master/LeaderElectionAgent.scala | 4 ++-- .../apache/spark/deploy/master/PersistenceEngine.scala| 4 ++-- .../org/apache/spark/deploy/worker/DriverRunner.scala | 2 +- .../scala/org/apache/spark/executor/ExecutorBackend.scala | 2 +- .../org/apache/spark/network/BlockTransferService.scala | 2 +- .../scala/org/apache/spark/scheduler/JobListener.scala| 4 ++-- 
.../org/apache/spark/scheduler/SchedulableBuilder.scala | 4 ++-- .../scala/org/apache/spark/scheduler/TaskScheduler.scala | 2 +- .../org/apache/spark/serializer/KryoSerializer.scala | 2 +- .../apache/spark/shuffle/FileShuffleBlockResolver.scala | 2 +- core/src/main/scala/org/apache/spark/ui/WebUI.scala | 2 +- .../org/apache/spark/util/logging/RollingPolicy.scala | 4 ++-- .../scala/org/apache/spark/util/random/Pseudorandom.scala | 2 +- .../sql/catalyst/expressions/SpecificMutableRow.scala | 2 +- .../org/apache/spark/sql/catalyst/expressions/rows.scala | 2 +- .../spark/sql/execution/columnar/ColumnAccessor.scala | 2 +- .../spark/sql/execution/columnar/ColumnBuilder.scala | 4 ++-- .../spark/sql/execution/streaming/state/StateStore.scala | 2 +- .../apache/spark/sql/util/ContinuousQueryListener.scala | 6 +++--- .../org/apache/spark/streaming/dstream/InputDStream.scala | 4 ++-- .../apache/spark/streaming/receiver/BlockGenerator.scala | 8 .../spark/streaming/receiver/ReceivedBlockHandler.scala | 2 +- .../org/apache/spark/streaming/receiver/Receiver.scala| 4 ++-- .../spark/streaming/receiver/ReceiverSupervisor.scala | 10 +- 27 files changed, 50 insertions(+), 49 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/3e27940a/core/src/main/scala/org/apache/spark/ContextCleaner.scala -- diff --git a/core/src/main/scala/org/apache/spark/ContextCleaner.scala b/core/src/main/scala/org/apache/spark/ContextCleaner.scala index 8fc657c..76692cc 100644 --- a/core/src/main/scala/org/apache/spark/ContextCleaner.scala +++ b/core/src/main/scala/org/apache/spark/ContextCleaner.scala @@ -278,9 +278,9 @@ private object ContextCleaner { * Listener class used for testing when any item has been cleaned by the Cleaner class. 
*/ private[spark] trait CleanerListener { - def rddCleaned(rddId: Int) - def shuffleCleaned(shuffleId: Int) - def broadcastCleaned(broadcastId: Long) - def accumCleaned(accId: Long) - def checkpointCleaned(rddId: Long) + def rddCleaned(rddId: Int): Unit + def shuffleCleaned(shuffleId: Int): Unit + def broadcastCleaned(broadcastId: Long): Unit + def accumCleaned(accId: Long): Unit + def checkpointCleaned(rddId: Long): Unit } http://git-wip-us.apache.org/repos/asf/spark/blob/3e27940a/core/src/main/scala/org/apache/spark/FutureAction.scala -- diff --git a/core/src/main/scala/org/apache/spark/FutureAction.scala b/core/src/main/scala/org/apache/spark/FutureAction.scala index ce11772..339266a 100644 ---
spark git commit: [SPARK-14625] TaskUIData and ExecutorUIData shouldn't be case classes
Repository: spark Updated Branches: refs/heads/master 0d22092cd -> de2ad5285 [SPARK-14625] TaskUIData and ExecutorUIData shouldn't be case classes ## What changes were proposed in this pull request? I was trying to understand the accumulator and metrics update source code and these two classes don't really need to be case classes. It would also be more consistent with other UI classes if they are not case classes. This is part of my bigger effort to simplify accumulators and task metrics. ## How was this patch tested? This is a straightforward refactoring without behavior change. Author: Reynold Xin Closes #12386 from rxin/SPARK-14625. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/de2ad528 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/de2ad528 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/de2ad528 Branch: refs/heads/master Commit: de2ad52855aee3c60bbc4642afb180d6fe62173b Parents: 0d22092 Author: Reynold Xin Authored: Thu Apr 14 10:12:29 2016 -0700 Committer: Reynold Xin Committed: Thu Apr 14 10:12:29 2016 -0700 -- .../spark/status/api/v1/AllStagesResource.scala | 4 +- .../org/apache/spark/ui/exec/ExecutorsTab.scala | 2 +- .../spark/ui/jobs/JobProgressListener.scala | 8 +- .../org/apache/spark/ui/jobs/StagePage.scala| 85 ++-- .../scala/org/apache/spark/ui/jobs/UIData.scala | 6 +- .../ui/jobs/JobProgressListenerSuite.scala | 10 +-- 6 files changed, 58 insertions(+), 57 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/de2ad528/core/src/main/scala/org/apache/spark/status/api/v1/AllStagesResource.scala -- diff --git a/core/src/main/scala/org/apache/spark/status/api/v1/AllStagesResource.scala b/core/src/main/scala/org/apache/spark/status/api/v1/AllStagesResource.scala index 9c92a50..f8d6e9f 100644 --- a/core/src/main/scala/org/apache/spark/status/api/v1/AllStagesResource.scala +++ b/core/src/main/scala/org/apache/spark/status/api/v1/AllStagesResource.scala @@ 
-147,7 +147,7 @@ private[v1] object AllStagesResource { speculative = uiData.taskInfo.speculative, accumulatorUpdates = uiData.taskInfo.accumulables.map { convertAccumulableInfo }, errorMessage = uiData.errorMessage, - taskMetrics = uiData.taskMetrics.map { convertUiTaskMetrics } + taskMetrics = uiData.metrics.map { convertUiTaskMetrics } ) } @@ -155,7 +155,7 @@ private[v1] object AllStagesResource { allTaskData: Iterable[TaskUIData], quantiles: Array[Double]): TaskMetricDistributions = { -val rawMetrics = allTaskData.flatMap{_.taskMetrics}.toSeq +val rawMetrics = allTaskData.flatMap{_.metrics}.toSeq def metricQuantiles(f: InternalTaskMetrics => Double): IndexedSeq[Double] = Distribution(rawMetrics.map { d => f(d) }).get.getQuantiles(quantiles) http://git-wip-us.apache.org/repos/asf/spark/blob/de2ad528/core/src/main/scala/org/apache/spark/ui/exec/ExecutorsTab.scala -- diff --git a/core/src/main/scala/org/apache/spark/ui/exec/ExecutorsTab.scala b/core/src/main/scala/org/apache/spark/ui/exec/ExecutorsTab.scala index 788f35e..3fd0efd 100644 --- a/core/src/main/scala/org/apache/spark/ui/exec/ExecutorsTab.scala +++ b/core/src/main/scala/org/apache/spark/ui/exec/ExecutorsTab.scala @@ -70,7 +70,7 @@ class ExecutorsListener(storageStatusListener: StorageStatusListener, conf: Spar executorToLogUrls(eid) = executorAdded.executorInfo.logUrlMap executorToTotalCores(eid) = executorAdded.executorInfo.totalCores executorToTasksMax(eid) = executorToTotalCores(eid) / conf.getInt("spark.task.cpus", 1) -executorIdToData(eid) = ExecutorUIData(executorAdded.time) +executorIdToData(eid) = new ExecutorUIData(executorAdded.time) } override def onExecutorRemoved( http://git-wip-us.apache.org/repos/asf/spark/blob/de2ad528/core/src/main/scala/org/apache/spark/ui/jobs/JobProgressListener.scala -- diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/JobProgressListener.scala b/core/src/main/scala/org/apache/spark/ui/jobs/JobProgressListener.scala index ed3ab66..13f5f84 100644 --- 
a/core/src/main/scala/org/apache/spark/ui/jobs/JobProgressListener.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/JobProgressListener.scala @@ -396,13 +396,13 @@ class JobProgressListener(conf: SparkConf) extends SparkListener with Logging { None } taskMetrics.foreach { m => -val oldMetrics = stageData.taskData.get(info.taskId).flatMap(_.taskMetrics) +val oldMetrics =
spark git commit: [SPARK-14125][SQL] Native DDL Support: Alter View
Repository: spark Updated Branches: refs/heads/master f83ba454a -> 0d22092cd [SPARK-14125][SQL] Native DDL Support: Alter View What changes were proposed in this pull request? This PR is to provide a native DDL support for the following three Alter View commands: Based on the Hive DDL document: https://cwiki.apache.org/confluence/display/Hive/LanguageManual+DDL # 1. ALTER VIEW RENAME **Syntax:** ```SQL ALTER VIEW view_name RENAME TO new_view_name ``` - to change the name of a view to a different name - not allowed to rename a view's name by ALTER TABLE # 2. ALTER VIEW SET TBLPROPERTIES **Syntax:** ```SQL ALTER VIEW view_name SET TBLPROPERTIES ('comment' = new_comment); ``` - to add metadata to a view - not allowed to set views' properties by ALTER TABLE - ignore it if trying to set a view's existing property key when the value is the same - overwrite the value if trying to set a view's existing key to a different value # 3. ALTER VIEW UNSET TBLPROPERTIES **Syntax:** ```SQL ALTER VIEW view_name UNSET TBLPROPERTIES [IF EXISTS] ('comment', 'key') ``` - to remove metadata from a view - not allowed to unset views' properties by ALTER TABLE - issue an exception if trying to unset a view's non-existent key How was this patch tested? Added test cases to verify if it works properly. Author: gatorsmile Author: xiaoli Author: Xiao Li Closes #12324 from gatorsmile/alterView. 
Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0d22092c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0d22092c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0d22092c Branch: refs/heads/master Commit: 0d22092cd9c8876a7f226add578ff1c025012fe9 Parents: f83ba45 Author: gatorsmile Authored: Thu Apr 14 08:34:11 2016 -0700 Committer: Yin Huai Committed: Thu Apr 14 08:34:11 2016 -0700 -- .../spark/sql/execution/SparkSqlParser.scala| 9 +- .../spark/sql/execution/command/ddl.scala | 29 - .../spark/sql/execution/command/tables.scala| 4 +- .../sql/execution/command/DDLCommandSuite.scala | 18 +-- .../spark/sql/hive/execution/HiveDDLSuite.scala | 112 +++ 5 files changed, 157 insertions(+), 15 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/0d22092c/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala index af92cec..8ed6ed2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala @@ -393,7 +393,8 @@ class SparkSqlAstBuilder extends AstBuilder { override def visitRenameTable(ctx: RenameTableContext): LogicalPlan = withOrigin(ctx) { AlterTableRename( visitTableIdentifier(ctx.from), - visitTableIdentifier(ctx.to)) + visitTableIdentifier(ctx.to), + ctx.VIEW != null) } /** @@ -409,7 +410,8 @@ class SparkSqlAstBuilder extends AstBuilder { ctx: SetTablePropertiesContext): LogicalPlan = withOrigin(ctx) { AlterTableSetProperties( visitTableIdentifier(ctx.tableIdentifier), - visitTablePropertyList(ctx.tablePropertyList)) + visitTablePropertyList(ctx.tablePropertyList), + ctx.VIEW != null) } /** @@ -426,7 +428,8 @@ class SparkSqlAstBuilder extends AstBuilder { 
AlterTableUnsetProperties( visitTableIdentifier(ctx.tableIdentifier), visitTablePropertyList(ctx.tablePropertyList).keys.toSeq, - ctx.EXISTS != null) + ctx.EXISTS != null, + ctx.VIEW != null) } /** http://git-wip-us.apache.org/repos/asf/spark/blob/0d22092c/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala index 234099a..fc37a14 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala @@ -23,7 +23,7 @@ import org.apache.spark.internal.Logging import org.apache.spark.sql.{AnalysisException, Row, SQLContext} import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.catalog.{CatalogDatabase, CatalogTable} -import org.apache.spark.sql.catalyst.catalog.{CatalogTablePartition,
spark git commit: [SPARK-14572][DOC] Update config docs to allow -Xms in extraJavaOptions
Repository: spark Updated Branches: refs/heads/master 3cf3db17b -> f83ba454a [SPARK-14572][DOC] Update config docs to allow -Xms in extraJavaOptions ## What changes were proposed in this pull request? The configuration docs are updated to reflect the changes introduced with [SPARK-12384](https://issues.apache.org/jira/browse/SPARK-12384). This allows the user to specify initial heap memory settings through the extraJavaOptions for executor, driver and am. ## How was this patch tested? The changes are tested in [SPARK-12384](https://issues.apache.org/jira/browse/SPARK-12384). This is just documenting the changes made. Author: Dhruve Ashar Closes #12333 from dhruve/doc/SPARK-14572. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f83ba454 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f83ba454 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f83ba454 Branch: refs/heads/master Commit: f83ba454a507bec0cc389d9a382cd71add7f17c1 Parents: 3cf3db1 Author: Dhruve Ashar Authored: Thu Apr 14 10:29:14 2016 -0500 Committer: Tom Graves Committed: Thu Apr 14 10:29:14 2016 -0500 -- docs/configuration.md | 11 +++ docs/running-on-yarn.md | 4 +++- 2 files changed, 10 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/f83ba454/docs/configuration.md -- diff --git a/docs/configuration.md b/docs/configuration.md index 937852f..16d5be6 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -225,11 +225,14 @@ Apart from these, the following properties are also available, and may be useful (none) A string of extra JVM options to pass to the driver. For instance, GC settings or other logging. +Note that it is illegal to set maximum heap size (-Xmx) settings with this option. Maximum heap +size settings can be set with spark.driver.memory in the cluster mode and through +the --driver-memory command line option in the client mode. 
Note: In client mode, this config must not be set through the SparkConf directly in your application, because the driver JVM has already started at that point. Instead, please set this through the --driver-java-options command line option or in -your default properties file. +your default properties file. @@ -269,9 +272,9 @@ Apart from these, the following properties are also available, and may be useful (none) A string of extra JVM options to pass to executors. For instance, GC settings or other logging. -Note that it is illegal to set Spark properties or heap size settings with this option. Spark -properties should be set using a SparkConf object or the spark-defaults.conf file used with the -spark-submit script. Heap size settings can be set with spark.executor.memory. +Note that it is illegal to set Spark properties or maximum heap size (-Xmx) settings with this +option. Spark properties should be set using a SparkConf object or the spark-defaults.conf file +used with the spark-submit script. Maximum heap size settings can be set with spark.executor.memory. http://git-wip-us.apache.org/repos/asf/spark/blob/f83ba454/docs/running-on-yarn.md -- diff --git a/docs/running-on-yarn.md b/docs/running-on-yarn.md index ddc75a7..09701ab 100644 --- a/docs/running-on-yarn.md +++ b/docs/running-on-yarn.md @@ -342,7 +342,9 @@ If you need a reference to the proper location to put log files in the YARN so t (none) A string of extra JVM options to pass to the YARN Application Master in client mode. - In cluster mode, use spark.driver.extraJavaOptions instead. + In cluster mode, use spark.driver.extraJavaOptions instead. Note that it is illegal + to set maximum heap size (-Xmx) settings with this option. Maximum heap size settings can be set + with spark.yarn.am.memory - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [MINOR][SQL] Remove extra anonymous closure within functional transformations
Repository: spark Updated Branches: refs/heads/master 478af2f45 -> 6fc3dc883 [MINOR][SQL] Remove extra anonymous closure within functional transformations ## What changes were proposed in this pull request? This PR removes extra anonymous closure within functional transformations. For example, ```scala .map(item => { ... }) ``` which can be written more simply as below: ```scala .map { item => ... } ``` ## How was this patch tested? Related unit tests and `sbt scalastyle`. Author: hyukjinkwon Closes #12382 from HyukjinKwon/minor-extra-closers. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6fc3dc88 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6fc3dc88 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6fc3dc88 Branch: refs/heads/master Commit: 6fc3dc8839eaed673c64ec87af6dfe24f8cebe0c Parents: 478af2f Author: hyukjinkwon Authored: Thu Apr 14 09:43:41 2016 +0100 Committer: Sean Owen Committed: Thu Apr 14 09:43:41 2016 +0100 -- .../scala/org/apache/spark/SparkContext.scala | 4 +-- .../org/apache/spark/deploy/master/Master.scala | 4 +-- .../scala/org/apache/spark/rdd/BlockRDD.scala | 4 +-- .../scala/org/apache/spark/rdd/HadoopRDD.scala | 4 +-- .../spark/rdd/ParallelCollectionRDD.scala | 4 +-- .../spark/rdd/PartitionerAwareUnionRDD.scala| 4 +-- .../cluster/mesos/MesosSchedulerUtils.scala | 4 +-- .../spark/shuffle/BlockStoreShuffleReader.scala | 4 +-- .../apache/spark/ui/jobs/ExecutorTable.scala| 4 +-- .../streaming/RecoverableNetworkWordCount.scala | 4 +-- .../streaming/SqlNetworkWordCount.scala | 4 +-- .../flume/sink/SparkAvroCallbackHandler.scala | 4 +-- .../spark/streaming/flume/sink/SparkSink.scala | 12 - .../flume/sink/TransactionProcessor.scala | 12 - .../flume/FlumePollingInputDStream.scala| 4 +-- .../streaming/flume/PollingFlumeTestUtils.scala | 4 +-- .../expressions/codegen/CodeGenerator.scala | 4 +-- .../sql/catalyst/optimizer/Optimizer.scala | 4 +-- 
.../spark/sql/execution/basicOperators.scala| 4 +-- .../execution/datasources/jdbc/JdbcUtils.scala | 4 +-- .../apache/spark/sql/hive/HiveInspectors.scala | 27 .../org/apache/spark/streaming/Checkpoint.scala | 8 +++--- .../spark/streaming/dstream/StateDStream.scala | 4 +-- .../streaming/scheduler/ReceiverTracker.scala | 4 +-- 24 files changed, 67 insertions(+), 72 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/6fc3dc88/core/src/main/scala/org/apache/spark/SparkContext.scala -- diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index 966198d..e41088f 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -723,7 +723,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli (safeEnd - safeStart) / step + 1 } } -parallelize(0 until numSlices, numSlices).mapPartitionsWithIndex((i, _) => { +parallelize(0 until numSlices, numSlices).mapPartitionsWithIndex { (i, _) => val partitionStart = (i * numElements) / numSlices * step + start val partitionEnd = (((i + 1) * numElements) / numSlices) * step + start def getSafeMargin(bi: BigInt): Long = @@ -762,7 +762,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli ret } } -}) +} } /** Distribute a local Scala collection to form an RDD. 
http://git-wip-us.apache.org/repos/asf/spark/blob/6fc3dc88/core/src/main/scala/org/apache/spark/deploy/master/Master.scala -- diff --git a/core/src/main/scala/org/apache/spark/deploy/master/Master.scala b/core/src/main/scala/org/apache/spark/deploy/master/Master.scala index 9bd3fc1..b443e8f 100644 --- a/core/src/main/scala/org/apache/spark/deploy/master/Master.scala +++ b/core/src/main/scala/org/apache/spark/deploy/master/Master.scala @@ -843,10 +843,10 @@ private[deploy] class Master( addressToApp -= app.driver.address if (completedApps.size >= RETAINED_APPLICATIONS) { val toRemove = math.max(RETAINED_APPLICATIONS / 10, 1) -completedApps.take(toRemove).foreach( a => { +completedApps.take(toRemove).foreach { a => Option(appIdToUI.remove(a.id)).foreach { ui => webUi.detachSparkUI(ui) } applicationMetricsSystem.removeSource(a.appSource) -}) +} completedApps.trimStart(toRemove) }
spark git commit: [SPARK-14573][PYSPARK][BUILD] Fix PyDoc Makefile & highlighting issues
Repository: spark Updated Branches: refs/heads/master b4819404a -> 478af2f45 [SPARK-14573][PYSPARK][BUILD] Fix PyDoc Makefile & highlighting issues ## What changes were proposed in this pull request? The PyDoc Makefile used "=" rather than "?=" for setting env variables so it overwrote the user values. This ignored the environment variables we set for linting allowing warnings through. This PR also fixes the warnings that had been introduced. ## How was this patch tested? manual local export & make Author: Holden Karau Closes #12336 from holdenk/SPARK-14573-fix-pydoc-makefile. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/478af2f4 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/478af2f4 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/478af2f4 Branch: refs/heads/master Commit: 478af2f45595913c9b8f560d13e8d88447486f99 Parents: b481940 Author: Holden Karau Authored: Thu Apr 14 09:42:15 2016 +0100 Committer: Sean Owen Committed: Thu Apr 14 09:42:15 2016 +0100 -- python/docs/Makefile| 8 python/pyspark/ml/regression.py | 2 +- python/pyspark/sql/context.py | 2 +- python/pyspark/sql/dataframe.py | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/478af2f4/python/docs/Makefile -- diff --git a/python/docs/Makefile b/python/docs/Makefile index 9030097..905e021 100644 --- a/python/docs/Makefile +++ b/python/docs/Makefile @@ -2,10 +2,10 @@ # # You can set these variables from the command line. 
-SPHINXOPTS= -SPHINXBUILD = sphinx-build -PAPER = -BUILDDIR = _build +SPHINXOPTS?= +SPHINXBUILD ?= sphinx-build +PAPER ?= +BUILDDIR ?= _build export PYTHONPATH=$(realpath ..):$(realpath ../lib/py4j-0.9.2-src.zip) http://git-wip-us.apache.org/repos/asf/spark/blob/478af2f4/python/pyspark/ml/regression.py -- diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index 316d7e3..c064fe5 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -28,7 +28,7 @@ from pyspark.sql import DataFrame __all__ = ['AFTSurvivalRegression', 'AFTSurvivalRegressionModel', 'DecisionTreeRegressor', 'DecisionTreeRegressionModel', 'GBTRegressor', 'GBTRegressionModel', - 'GeneralizedLinearRegression', 'GeneralizedLinearRegressionModel' + 'GeneralizedLinearRegression', 'GeneralizedLinearRegressionModel', 'IsotonicRegression', 'IsotonicRegressionModel', 'LinearRegression', 'LinearRegressionModel', 'LinearRegressionSummary', 'LinearRegressionTrainingSummary', http://git-wip-us.apache.org/repos/asf/spark/blob/478af2f4/python/pyspark/sql/context.py -- diff --git a/python/pyspark/sql/context.py b/python/pyspark/sql/context.py index 4008332..11dfcfe1 100644 --- a/python/pyspark/sql/context.py +++ b/python/pyspark/sql/context.py @@ -405,7 +405,7 @@ class SQLContext(object): >>> sqlContext.createDataFrame(rdd, "boolean").collect() # doctest: +IGNORE_EXCEPTION_DETAIL Traceback (most recent call last): ... -Py4JJavaError:... +Py4JJavaError: ... 
""" if isinstance(data, DataFrame): raise TypeError("data is already a DataFrame") http://git-wip-us.apache.org/repos/asf/spark/blob/478af2f4/python/pyspark/sql/dataframe.py -- diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index d473d6b..b4fa836 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -60,7 +60,7 @@ class DataFrame(object): people = sqlContext.read.parquet("...") department = sqlContext.read.parquet("...") -people.filter(people.age > 30).join(department, people.deptId == department.id)) \ +people.filter(people.age > 30).join(department, people.deptId == department.id)\ .groupBy(department.name, "gender").agg({"salary": "avg", "age": "max"}) .. note:: Experimental - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-14596][SQL] Remove not used SqlNewHadoopRDD and some more unused imports
Repository: spark Updated Branches: refs/heads/master 62b7f306f -> b4819404a [SPARK-14596][SQL] Remove not used SqlNewHadoopRDD and some more unused imports ## What changes were proposed in this pull request? Old `HadoopFsRelation` API includes `buildInternalScan()` which uses `SqlNewHadoopRDD` in `ParquetRelation`. Because now the old API is removed, `SqlNewHadoopRDD` is not used anymore. So, this PR removes `SqlNewHadoopRDD` and several unused imports. This was discussed in https://github.com/apache/spark/pull/12326. ## How was this patch tested? Several related existing unit tests and `sbt scalastyle`. Author: hyukjinkwon Closes #12354 from HyukjinKwon/SPARK-14596. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b4819404 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b4819404 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b4819404 Branch: refs/heads/master Commit: b4819404a65f9b97c1f8deb1fcb8419969831574 Parents: 62b7f30 Author: hyukjinkwon Authored: Thu Apr 14 15:43:44 2016 +0800 Committer: Wenchen Fan Committed: Thu Apr 14 15:43:44 2016 +0800 -- .../scala/org/apache/spark/rdd/HadoopRDD.scala | 8 +- .../apache/spark/rdd/InputFileNameHolder.scala | 41 +++ .../apache/spark/rdd/SqlNewHadoopRDDState.scala | 41 --- project/MimaExcludes.scala | 5 - .../catalyst/expressions/InputFileName.scala | 8 +- .../sql/execution/datasources/FileScanRDD.scala | 11 +- .../execution/datasources/SqlNewHadoopRDD.scala | 282 --- .../org/apache/spark/sql/internal/SQLConf.scala | 1 - .../datasources/FileSourceStrategySuite.scala | 5 +- 9 files changed, 54 insertions(+), 348 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b4819404/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala -- diff --git a/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala b/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala index 08db96e..ac5ba9e 100644 --- 
a/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala @@ -213,15 +213,13 @@ class HadoopRDD[K, V]( logInfo("Input split: " + split.inputSplit) val jobConf = getJobConf() - // TODO: there is a lot of duplicate code between this and NewHadoopRDD and SqlNewHadoopRDD - val inputMetrics = context.taskMetrics().registerInputMetrics(DataReadMethod.Hadoop) val existingBytesRead = inputMetrics.bytesRead // Sets the thread local variable for the file's name split.inputSplit.value match { -case fs: FileSplit => SqlNewHadoopRDDState.setInputFileName(fs.getPath.toString) -case _ => SqlNewHadoopRDDState.unsetInputFileName() +case fs: FileSplit => InputFileNameHolder.setInputFileName(fs.getPath.toString) +case _ => InputFileNameHolder.unsetInputFileName() } // Find a function that will return the FileSystem bytes read by this thread. Do this before @@ -271,7 +269,7 @@ class HadoopRDD[K, V]( override def close() { if (reader != null) { - SqlNewHadoopRDDState.unsetInputFileName() + InputFileNameHolder.unsetInputFileName() // Close the reader and release it. Note: it's very important that we don't close the // reader more than once, since that exposes us to MAPREDUCE-5918 when running against // Hadoop 1.x and older Hadoop 2.x releases. That bug can lead to non-deterministic http://git-wip-us.apache.org/repos/asf/spark/blob/b4819404/core/src/main/scala/org/apache/spark/rdd/InputFileNameHolder.scala -- diff --git a/core/src/main/scala/org/apache/spark/rdd/InputFileNameHolder.scala b/core/src/main/scala/org/apache/spark/rdd/InputFileNameHolder.scala new file mode 100644 index 000..108e9d2 --- /dev/null +++ b/core/src/main/scala/org/apache/spark/rdd/InputFileNameHolder.scala @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR