spark git commit: [DOC] add config option spark.ui.enabled into document
Repository: spark Updated Branches: refs/heads/master 2a105134e -> 91f2735a1 [DOC] add config option spark.ui.enabled into document ## What changes were proposed in this pull request? The configuration doc was missing the config option `spark.ui.enabled` (default value is `true`). This option is important because in many cases we want to turn the UI off, so this patch documents it. ## How was this patch tested? N/A Author: WeichenXu. Closes #14604 from WeichenXu123/add_doc_param_spark_ui_enabled. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/91f2735a Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/91f2735a Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/91f2735a Branch: refs/heads/master Commit: 91f2735a180f0af1f15303fd0a32633dfd1c1fe0 Parents: 2a10513 Author: WeichenXu Authored: Fri Aug 12 20:10:09 2016 +0100 Committer: Sean Owen Committed: Fri Aug 12 20:10:09 2016 +0100 -- docs/configuration.md | 7 +++ 1 file changed, 7 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/91f2735a/docs/configuration.md -- diff --git a/docs/configuration.md b/docs/configuration.md index ae75318..96e8c6d 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -590,6 +590,13 @@ Apart from these, the following properties are also available, and may be useful + spark.ui.enabled + true + +Whether to run the web UI for the Spark application. + + + spark.ui.killEnabled true - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
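A minimal usage sketch for the option documented above (not part of the patch; master and app name are illustrative): disabling the web UI for an application, as is common in tests or dense batch workloads.

```scala
import org.apache.spark.{SparkConf, SparkContext}

// Turn off the web UI for this application; the documented default is "true".
val conf = new SparkConf()
  .setMaster("local[2]")
  .setAppName("no-ui-example")
  .set("spark.ui.enabled", "false")

val sc = new SparkContext(conf)
```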
spark git commit: [SPARK-17003][BUILD][BRANCH-1.6] release-build.sh is missing hive-thriftserver for scala 2.11
Repository: spark Updated Branches: refs/heads/branch-1.6 b3ecff640 -> 909231d7a [SPARK-17003][BUILD][BRANCH-1.6] release-build.sh is missing hive-thriftserver for scala 2.11 ## What changes were proposed in this pull request? hive-thriftserver works with Scala 2.11 (https://issues.apache.org/jira/browse/SPARK-8013). So, let's publish scala 2.11 artifacts with the flag of `-Phive-thfitserver`. I am also fixing the doc. Author: Yin HuaiCloses #14586 from yhuai/SPARK-16453-branch-1.6. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/909231d7 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/909231d7 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/909231d7 Branch: refs/heads/branch-1.6 Commit: 909231d7aec591af2fcf0ffaf0612a8c034bcd7a Parents: b3ecff6 Author: Yin Huai Authored: Fri Aug 12 10:29:05 2016 -0700 Committer: Yin Huai Committed: Fri Aug 12 10:29:05 2016 -0700 -- dev/create-release/release-build.sh | 10 -- docs/building-spark.md | 2 -- python/pyspark/sql/functions.py | 2 +- 3 files changed, 5 insertions(+), 9 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/909231d7/dev/create-release/release-build.sh -- diff --git a/dev/create-release/release-build.sh b/dev/create-release/release-build.sh index 2c3af6a..840fb20 100755 --- a/dev/create-release/release-build.sh +++ b/dev/create-release/release-build.sh @@ -80,7 +80,7 @@ NEXUS_PROFILE=d63f592e7eac0 # Profile for Spark staging uploads BASE_DIR=$(pwd) MVN="build/mvn --force" -PUBLISH_PROFILES="-Pyarn -Phive -Phadoop-2.2" +PUBLISH_PROFILES="-Pyarn -Phive -Phive-thriftserver -Phadoop-2.2" PUBLISH_PROFILES="$PUBLISH_PROFILES -Pspark-ganglia-lgpl -Pkinesis-asl" rm -rf spark @@ -187,7 +187,7 @@ if [[ "$1" == "package" ]]; then # We increment the Zinc port each time to avoid OOM's and other craziness if multiple builds # share the same Zinc server. 
make_binary_release "hadoop1" "-Psparkr -Phadoop-1 -Phive -Phive-thriftserver" "3030" & - make_binary_release "hadoop1-scala2.11" "-Psparkr -Phadoop-1 -Phive -Dscala-2.11" "3031" & + make_binary_release "hadoop1-scala2.11" "-Psparkr -Phadoop-1 -Phive -Phive-thriftserver -Dscala-2.11" "3031" & make_binary_release "cdh4" "-Psparkr -Phadoop-1 -Phive -Phive-thriftserver -Dhadoop.version=2.0.0-mr1-cdh4.2.0" "3032" & make_binary_release "hadoop2.3" "-Psparkr -Phadoop-2.3 -Phive -Phive-thriftserver -Pyarn" "3033" & make_binary_release "hadoop2.4" "-Psparkr -Phadoop-2.4 -Phive -Phive-thriftserver -Pyarn" "3034" & @@ -256,8 +256,7 @@ if [[ "$1" == "publish-snapshot" ]]; then # Generate random point for Zinc export ZINC_PORT=$(python -S -c "import random; print random.randrange(3030,4030)") - $MVN -DzincPort=$ZINC_PORT --settings $tmp_settings -DskipTests $PUBLISH_PROFILES \ --Phive-thriftserver deploy + $MVN -DzincPort=$ZINC_PORT --settings $tmp_settings -DskipTests $PUBLISH_PROFILES deploy ./dev/change-scala-version.sh 2.11 $MVN -DzincPort=$ZINC_PORT -Dscala-2.11 --settings $tmp_settings \ -DskipTests $PUBLISH_PROFILES clean deploy @@ -293,8 +292,7 @@ if [[ "$1" == "publish-release" ]]; then # Generate random point for Zinc export ZINC_PORT=$(python -S -c "import random; print random.randrange(3030,4030)") - $MVN -DzincPort=$ZINC_PORT -Dmaven.repo.local=$tmp_repo -DskipTests $PUBLISH_PROFILES \ --Phive-thriftserver clean install + $MVN -DzincPort=$ZINC_PORT -Dmaven.repo.local=$tmp_repo -DskipTests $PUBLISH_PROFILES clean install ./dev/change-scala-version.sh 2.11 http://git-wip-us.apache.org/repos/asf/spark/blob/909231d7/docs/building-spark.md -- diff --git a/docs/building-spark.md b/docs/building-spark.md index 5f694dc..4348b38 100644 --- a/docs/building-spark.md +++ b/docs/building-spark.md @@ -129,8 +129,6 @@ To produce a Spark package compiled with Scala 2.11, use the `-Dscala-2.11` prop ./dev/change-scala-version.sh 2.11 mvn -Pyarn -Phadoop-2.4 -Dscala-2.11 -DskipTests clean package -Spark does not yet support its JDBC component for Scala 2.11. - # Spark Tests in Maven Tests are run by default via the [ScalaTest Maven plugin](http://www.scalatest.org/user_guide/using_the_scalatest_maven_plugin). http://git-wip-us.apache.org/repos/asf/spark/blob/909231d7/python/pyspark/sql/functions.py -- diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index 1152954..6912cc9 100644 --- a/python/pyspark/sql/functions.py +++
spark git commit: [SPARK-16771][SQL] WITH clause should not fall into infinite loop.
Repository: spark Updated Branches: refs/heads/master bbae20ade -> 2a105134e [SPARK-16771][SQL] WITH clause should not fall into infinite loop. ## What changes were proposed in this pull request? This PR changes the CTE resolving rule to use only **forward-declared** tables in order to prevent infinite loops. More specifically, new logic is like the following. * Resolve CTEs in `WITH` clauses first before replacing the main SQL body. * When resolving CTEs, only forward-declared CTEs or base tables are referenced. - Self-referencing is not allowed any more. - Cross-referencing is not allowed any more. **Reported Error Scenarios** ```scala scala> sql("WITH t AS (SELECT 1 FROM t) SELECT * FROM t") java.lang.StackOverflowError ... scala> sql("WITH t1 AS (SELECT * FROM t2), t2 AS (SELECT 2 FROM t1) SELECT * FROM t1, t2") java.lang.StackOverflowError ... ``` Note that `t`, `t1`, and `t2` are not declared in database. Spark falls into infinite loops before resolving table names. ## How was this patch tested? Pass the Jenkins tests with new two testcases. Author: Dongjoon HyunCloses #14397 from dongjoon-hyun/SPARK-16771-TREENODE. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2a105134 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2a105134 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2a105134 Branch: refs/heads/master Commit: 2a105134e9a3efd46b761fab5e563ddebb26575d Parents: bbae20a Author: Dongjoon Hyun Authored: Fri Aug 12 19:07:34 2016 +0200 Committer: Herman van Hovell Committed: Fri Aug 12 19:07:34 2016 +0200 -- .../spark/sql/catalyst/analysis/Analyzer.scala | 24 - .../spark/sql/catalyst/parser/AstBuilder.scala | 2 +- .../plans/logical/basicLogicalOperators.scala | 7 ++- .../sql/catalyst/parser/PlanParserSuite.scala | 2 +- .../src/test/resources/sql-tests/inputs/cte.sql | 14 + .../resources/sql-tests/results/cte.sql.out | 57 6 files changed, 88 insertions(+), 18 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/2a105134/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 14a2a32..a2e276e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -125,22 +125,22 @@ class Analyzer( object CTESubstitution extends Rule[LogicalPlan] { // TODO allow subquery to define CTE def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { - case With(child, relations) => substituteCTE(child, relations) + case With(child, relations) => +substituteCTE(child, relations.foldLeft(Seq.empty[(String, LogicalPlan)]) { + case (resolved, (name, relation)) => +resolved :+ name -> ResolveRelations(substituteCTE(relation, resolved)) +}) case other => other } -def substituteCTE(plan: LogicalPlan, cteRelations: Map[String, LogicalPlan]): LogicalPlan = { - plan transform { -// In hive, if there is same table name in database and CTE definition, -// hive will use the table in database, not the CTE one. 
-// Taking into account the reasonableness and the implementation complexity, -// here use the CTE definition first, check table name only and ignore database name -// see https://github.com/apache/spark/pull/4929#discussion_r27186638 for more info +def substituteCTE(plan: LogicalPlan, cteRelations: Seq[(String, LogicalPlan)]): LogicalPlan = { + plan transformDown { case u : UnresolvedRelation => - val substituted = cteRelations.get(u.tableIdentifier.table).map { relation => -val withAlias = u.alias.map(SubqueryAlias(_, relation)) -withAlias.getOrElse(relation) - } + val substituted = cteRelations.find(x => resolver(x._1, u.tableIdentifier.table)) +.map(_._2).map { relation => + val withAlias = u.alias.map(SubqueryAlias(_, relation)) + withAlias.getOrElse(relation) +} substituted.getOrElse(u) case other => // This cannot be done in ResolveSubquery because ResolveSubquery does not know the CTE. http://git-wip-us.apache.org/repos/asf/spark/blob/2a105134/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
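A small sketch of the behavior the new rule gives (assuming a SparkSession named `spark` on a build that includes this patch): a CTE may reference only CTEs declared before it, so forward-declared chains still resolve, while the self- and cross-referencing queries reported above now fail analysis instead of overflowing the stack.

```scala
// Allowed: t2 refers to the previously declared t1.
spark.sql("""
  WITH t1 AS (SELECT 1 AS id),
       t2 AS (SELECT id + 1 AS id FROM t1)
  SELECT * FROM t2
""").show()

// No longer loops: with no base table named t, this now fails analysis
// (table not found) rather than throwing java.lang.StackOverflowError.
// spark.sql("WITH t AS (SELECT 1 FROM t) SELECT * FROM t")
```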
spark git commit: [SPARK-17013][SQL] Parse negative numeric literals
Repository: spark Updated Branches: refs/heads/master abff92bfd -> 00e103a6e [SPARK-17013][SQL] Parse negative numeric literals ## What changes were proposed in this pull request? This patch updates the SQL parser to parse negative numeric literals as numeric literals, instead of unary minus of positive literals. This allows the parser to parse the minimal value for each data type, e.g. "-32768S". ## How was this patch tested? Updated test cases. Author: petermaxleeCloses #14608 from petermaxlee/SPARK-17013. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/00e103a6 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/00e103a6 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/00e103a6 Branch: refs/heads/master Commit: 00e103a6edd1a1f001a94d41dd1f7acc40a1e30f Parents: abff92b Author: petermaxlee Authored: Thu Aug 11 23:56:55 2016 -0700 Committer: Reynold Xin Committed: Thu Aug 11 23:56:55 2016 -0700 -- .../apache/spark/sql/catalyst/parser/SqlBase.g4 | 14 +++ .../sql/catalyst/expressions/arithmetic.scala | 4 +- .../sql-tests/results/arithmetic.sql.out| 26 ++-- .../sql-tests/results/literals.sql.out | 44 ++-- .../catalyst/ExpressionSQLBuilderSuite.scala| 4 +- 5 files changed, 37 insertions(+), 55 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/00e103a6/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 -- diff --git a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 index ba65f2a..6122bcd 100644 --- a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 +++ b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 @@ -625,13 +625,13 @@ quotedIdentifier ; number -: DECIMAL_VALUE#decimalLiteral -| SCIENTIFIC_DECIMAL_VALUE #scientificDecimalLiteral -| INTEGER_VALUE#integerLiteral -| BIGINT_LITERAL #bigIntLiteral -| SMALLINT_LITERAL #smallIntLiteral -| TINYINT_LITERAL #tinyIntLiteral -| DOUBLE_LITERAL #doubleLiteral +: MINUS? DECIMAL_VALUE#decimalLiteral +| MINUS? SCIENTIFIC_DECIMAL_VALUE #scientificDecimalLiteral +| MINUS? INTEGER_VALUE#integerLiteral +| MINUS? BIGINT_LITERAL #bigIntLiteral +| MINUS? SMALLINT_LITERAL #smallIntLiteral +| MINUS? TINYINT_LITERAL #tinyIntLiteral +| MINUS? 
DOUBLE_LITERAL #doubleLiteral ; nonReserved http://git-wip-us.apache.org/repos/asf/spark/blob/00e103a6/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala index 4aebef9..13e539a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala @@ -58,7 +58,7 @@ case class UnaryMinus(child: Expression) extends UnaryExpression } } - override def sql: String = s"(-${child.sql})" + override def sql: String = s"(- ${child.sql})" } @ExpressionDescription( @@ -76,7 +76,7 @@ case class UnaryPositive(child: Expression) protected override def nullSafeEval(input: Any): Any = input - override def sql: String = s"(+${child.sql})" + override def sql: String = s"(+ ${child.sql})" } /** http://git-wip-us.apache.org/repos/asf/spark/blob/00e103a6/sql/core/src/test/resources/sql-tests/results/arithmetic.sql.out -- diff --git a/sql/core/src/test/resources/sql-tests/results/arithmetic.sql.out b/sql/core/src/test/resources/sql-tests/results/arithmetic.sql.out index 50ea254..f2b40a0 100644 --- a/sql/core/src/test/resources/sql-tests/results/arithmetic.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/arithmetic.sql.out @@ -5,7 +5,7 @@ -- !query 0 select -100 -- !query 0 schema -struct<(-100):int> +struct<-100:int> -- !query 0 output -100 @@ -21,7 +21,7 @@ struct<230:int> -- !query 2 select -5.2 -- !query 2 schema -struct<(-5.2):decimal(2,1)> +struct<-5.2:decimal(2,1)> -- !query 2 output -5.2 @@ -37,7 +37,7 @@ struct<6.8:double> -- !query 4 select -key, +key from testdata where key = 2 -- !query 4
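As an illustration (assuming a SparkSession named `spark` on a build with this patch), the minimal value of each integral type can now be written directly as a literal; previously the positive part overflowed before the unary minus was applied.

```scala
spark.sql(
  "SELECT -128Y AS min_byte, -32768S AS min_short, -9223372036854775808L AS min_long"
).show()
```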
spark git commit: [SPARK-17013][SQL] Parse negative numeric literals
Repository: spark Updated Branches: refs/heads/branch-2.0 b4047fc21 -> bde94cd71 [SPARK-17013][SQL] Parse negative numeric literals ## What changes were proposed in this pull request? This patch updates the SQL parser to parse negative numeric literals as numeric literals, instead of unary minus of positive literals. This allows the parser to parse the minimal value for each data type, e.g. "-32768S". ## How was this patch tested? Updated test cases. Author: petermaxleeCloses #14608 from petermaxlee/SPARK-17013. (cherry picked from commit 00e103a6edd1a1f001a94d41dd1f7acc40a1e30f) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/bde94cd7 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/bde94cd7 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/bde94cd7 Branch: refs/heads/branch-2.0 Commit: bde94cd71086fd348f3ba96de628d6df3f87dba5 Parents: b4047fc Author: petermaxlee Authored: Thu Aug 11 23:56:55 2016 -0700 Committer: Reynold Xin Committed: Thu Aug 11 23:57:01 2016 -0700 -- .../apache/spark/sql/catalyst/parser/SqlBase.g4 | 14 +++ .../sql/catalyst/expressions/arithmetic.scala | 4 +- .../sql-tests/results/arithmetic.sql.out| 26 ++-- .../sql-tests/results/literals.sql.out | 44 ++-- .../catalyst/ExpressionSQLBuilderSuite.scala| 4 +- 5 files changed, 37 insertions(+), 55 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/bde94cd7/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 -- diff --git a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 index 279a1ce..aca7282 100644 --- a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 +++ b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 @@ -618,13 +618,13 @@ quotedIdentifier ; number -: DECIMAL_VALUE#decimalLiteral -| SCIENTIFIC_DECIMAL_VALUE #scientificDecimalLiteral -| INTEGER_VALUE#integerLiteral -| BIGINT_LITERAL #bigIntLiteral -| SMALLINT_LITERAL #smallIntLiteral -| TINYINT_LITERAL #tinyIntLiteral -| DOUBLE_LITERAL #doubleLiteral +: MINUS? DECIMAL_VALUE#decimalLiteral +| MINUS? SCIENTIFIC_DECIMAL_VALUE #scientificDecimalLiteral +| MINUS? INTEGER_VALUE#integerLiteral +| MINUS? BIGINT_LITERAL #bigIntLiteral +| MINUS? SMALLINT_LITERAL #smallIntLiteral +| MINUS? TINYINT_LITERAL #tinyIntLiteral +| MINUS? 
DOUBLE_LITERAL #doubleLiteral ; nonReserved http://git-wip-us.apache.org/repos/asf/spark/blob/bde94cd7/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala index 7ff8795..fa459aa 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/arithmetic.scala @@ -57,7 +57,7 @@ case class UnaryMinus(child: Expression) extends UnaryExpression } } - override def sql: String = s"(-${child.sql})" + override def sql: String = s"(- ${child.sql})" } @ExpressionDescription( @@ -75,7 +75,7 @@ case class UnaryPositive(child: Expression) protected override def nullSafeEval(input: Any): Any = input - override def sql: String = s"(+${child.sql})" + override def sql: String = s"(+ ${child.sql})" } /** http://git-wip-us.apache.org/repos/asf/spark/blob/bde94cd7/sql/core/src/test/resources/sql-tests/results/arithmetic.sql.out -- diff --git a/sql/core/src/test/resources/sql-tests/results/arithmetic.sql.out b/sql/core/src/test/resources/sql-tests/results/arithmetic.sql.out index 50ea254..f2b40a0 100644 --- a/sql/core/src/test/resources/sql-tests/results/arithmetic.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/arithmetic.sql.out @@ -5,7 +5,7 @@ -- !query 0 select -100 -- !query 0 schema -struct<(-100):int> +struct<-100:int> -- !query 0 output -100 @@ -21,7 +21,7 @@ struct<230:int> -- !query 2 select -5.2 -- !query 2 schema -struct<(-5.2):decimal(2,1)> +struct<-5.2:decimal(2,1)> -- !query 2
spark git commit: [SPARK-16975][SQL] Column-partition path starting '_' should be handled correctly
Repository: spark Updated Branches: refs/heads/branch-2.0 0fb01496c -> b4047fc21 [SPARK-16975][SQL] Column-partition path starting '_' should be handled correctly Currently, Spark ignores path names starting with underscore `_` and `.`. This causes read-failures for the column-partitioned file data sources whose partition column names starts from '_', e.g. `_col`. **Before** ```scala scala> spark.range(10).withColumn("_locality_code", $"id").write.partitionBy("_locality_code").save("/tmp/parquet") scala> spark.read.parquet("/tmp/parquet") org.apache.spark.sql.AnalysisException: Unable to infer schema for ParquetFormat at /tmp/parquet20. It must be specified manually; ``` **After** ```scala scala> spark.range(10).withColumn("_locality_code", $"id").write.partitionBy("_locality_code").save("/tmp/parquet") scala> spark.read.parquet("/tmp/parquet") res2: org.apache.spark.sql.DataFrame = [id: bigint, _locality_code: int] ``` Pass the Jenkins with a new test case. Author: Dongjoon HyunCloses #14585 from dongjoon-hyun/SPARK-16975-PARQUET. (cherry picked from commit abff92bfdc7d4c9d2308794f0350561fe0ceb4dd) Signed-off-by: Cheng Lian Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b4047fc2 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b4047fc2 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b4047fc2 Branch: refs/heads/branch-2.0 Commit: b4047fc21cefcf6a43c1ee88af330a042f02bebc Parents: 0fb0149 Author: Dongjoon Hyun Authored: Fri Aug 12 14:40:12 2016 +0800 Committer: Cheng Lian Committed: Fri Aug 12 14:52:50 2016 +0800 -- .../datasources/PartitioningAwareFileCatalog.scala | 2 +- .../sql/execution/datasources/fileSourceInterfaces.scala| 2 +- .../sql/execution/datasources/json/JsonFileFormat.scala | 2 +- .../execution/datasources/parquet/ParquetFileFormat.scala | 3 ++- .../src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala | 9 + 5 files changed, 14 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b4047fc2/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala index 811e96c..cef9d4d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala @@ -204,6 +204,6 @@ abstract class PartitioningAwareFileCatalog( private def isDataPath(path: Path): Boolean = { val name = path.getName -!(name.startsWith("_") || name.startsWith(".")) +!((name.startsWith("_") && !name.contains("=")) || name.startsWith(".")) } } http://git-wip-us.apache.org/repos/asf/spark/blob/b4047fc2/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/fileSourceInterfaces.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/fileSourceInterfaces.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/fileSourceInterfaces.scala index 0b5a19f..438fccb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/fileSourceInterfaces.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/fileSourceInterfaces.scala @@ -363,7 +363,7 @@ private[sql] object HadoopFsRelation extends Logging { 
// We filter everything that starts with _ and ., except _common_metadata and _metadata // because Parquet needs to find those metadata files from leaf files returned by this method. // We should refactor this logic to not mix metadata files with data files. -(pathName.startsWith("_") || pathName.startsWith(".")) && +((pathName.startsWith("_") && !pathName.contains("=")) || pathName.startsWith(".")) && !pathName.startsWith("_common_metadata") && !pathName.startsWith("_metadata") } http://git-wip-us.apache.org/repos/asf/spark/blob/b4047fc2/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonFileFormat.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonFileFormat.scala index 86aef1f..c58de3a 100644 ---
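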
spark git commit: [SPARK-16868][WEB UI] Fix executor be both dead and alive on executor ui.
Repository: spark Updated Branches: refs/heads/master 1c9a386c6 -> 4ec5c360c [SPARK-16868][WEB UI] Fix executor be both dead and alive on executor ui. ## What changes were proposed in this pull request? Under heavy load, an executor can register with the driver's block manager twice (because of heartbeats), so the executor shows up as both dead and alive on the executors page, as the picture below shows: ![image](https://cloud.githubusercontent.com/assets/7404824/17467245/c1359094-5d4e-11e6-843a-f6d6347e1bf6.png) ## How was this patch tested? NA Details in: [SPARK-16868](https://issues.apache.org/jira/browse/SPARK-16868) Author: huangzhaowei. Closes #14530 from SaintBacchus/SPARK-16868. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4ec5c360 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4ec5c360 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4ec5c360 Branch: refs/heads/master Commit: 4ec5c360ce2045a9bdecb3c5277ba519bf0f44ae Parents: 1c9a386 Author: huangzhaowei Authored: Thu Aug 11 14:56:03 2016 -0700 Committer: Marcelo Vanzin Committed: Thu Aug 11 14:56:03 2016 -0700 -- .../scala/org/apache/spark/storage/StorageStatusListener.scala | 4 1 file changed, 4 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4ec5c360/core/src/main/scala/org/apache/spark/storage/StorageStatusListener.scala -- diff --git a/core/src/main/scala/org/apache/spark/storage/StorageStatusListener.scala b/core/src/main/scala/org/apache/spark/storage/StorageStatusListener.scala index 3008520..798658a 100644 --- a/core/src/main/scala/org/apache/spark/storage/StorageStatusListener.scala +++ b/core/src/main/scala/org/apache/spark/storage/StorageStatusListener.scala @@ -77,6 +77,10 @@ class StorageStatusListener(conf: SparkConf) extends SparkListener { val maxMem = blockManagerAdded.maxMem val storageStatus = new StorageStatus(blockManagerId, maxMem) executorIdToStorageStatus(executorId) = storageStatus + + // Try to remove the dead storage status if same executor register the block manager twice. + deadExecutorStorageStatus.zipWithIndex.find(_._1.blockManagerId.executorId == executorId) +.foreach(toRemoveExecutor => deadExecutorStorageStatus.remove(toRemoveExecutor._2)) } } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
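A self-contained sketch of the dedup logic added here (simplified: the real listener matches on `blockManagerId.executorId` inside `StorageStatusListener`):

```scala
import scala.collection.mutable

case class StorageStatus(executorId: String)

val deadExecutorStorageStatus = mutable.ArrayBuffer(StorageStatus("1"), StorageStatus("2"))

// When an executor's block manager registers again (e.g. after missed heartbeats),
// drop any stale entry for the same executor id from the dead list so the executor
// is not shown as both dead and alive.
def onBlockManagerAdded(executorId: String): Unit = {
  deadExecutorStorageStatus.zipWithIndex
    .find(_._1.executorId == executorId)
    .foreach { case (_, idx) => deadExecutorStorageStatus.remove(idx) }
}

onBlockManagerAdded("1") // dead list now contains only executor "2"
```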
spark git commit: [SPARK-16598][SQL][TEST] Added a test case for verifying the table identifier parsing
Repository: spark Updated Branches: refs/heads/master f4482225c -> 79e2caa13 [SPARK-16598][SQL][TEST] Added a test case for verifying the table identifier parsing What changes were proposed in this pull request? So far, the test cases of `TableIdentifierParserSuite` do not cover the quoted cases. We should add one for avoiding regression. How was this patch tested? N/A Author: gatorsmileCloses #14244 from gatorsmile/quotedIdentifiers. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/79e2caa1 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/79e2caa1 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/79e2caa1 Branch: refs/heads/master Commit: 79e2caa1328843457841d71642b60be919ebb1e0 Parents: f448222 Author: gatorsmile Authored: Fri Aug 12 10:02:00 2016 +0100 Committer: Sean Owen Committed: Fri Aug 12 10:02:00 2016 +0100 -- .../sql/catalyst/parser/TableIdentifierParserSuite.scala | 8 1 file changed, 8 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/79e2caa1/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/TableIdentifierParserSuite.scala -- diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/TableIdentifierParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/TableIdentifierParserSuite.scala index 8bbf87e..dadb8a8 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/TableIdentifierParserSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/TableIdentifierParserSuite.scala @@ -68,6 +68,14 @@ class TableIdentifierParserSuite extends SparkFunSuite { } } + test("quoted identifiers") { +assert(TableIdentifier("z", Some("x.y")) === parseTableIdentifier("`x.y`.z")) +assert(TableIdentifier("y.z", Some("x")) === parseTableIdentifier("x.`y.z`")) +assert(TableIdentifier("z", Some("`x.y`")) === parseTableIdentifier("```x.y```.z")) +assert(TableIdentifier("`y.z`", Some("x")) === parseTableIdentifier("x.```y.z```")) +assert(TableIdentifier("x.y.z", None) === parseTableIdentifier("`x.y.z`")) + } + test("table identifier - strict keywords") { // SQL Keywords. hiveStrictNonReservedKeyword.foreach { keyword => - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
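The parsing behavior the new assertions pin down, shown as a standalone sketch (assuming the catalyst parser API of this branch): backticks decide how a dotted name splits into database and table.

```scala
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.parser.CatalystSqlParser.parseTableIdentifier

assert(parseTableIdentifier("`x.y`.z") == TableIdentifier("z", Some("x.y")))  // db "x.y", table "z"
assert(parseTableIdentifier("x.`y.z`") == TableIdentifier("y.z", Some("x")))  // db "x",   table "y.z"
assert(parseTableIdentifier("`x.y.z`") == TableIdentifier("x.y.z", None))     // single quoted table name
```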
spark git commit: [MINOR][DOC] Fix style in examples across documentation
Repository: spark Updated Branches: refs/heads/master 993923c8f -> f4482225c [MINOR][DOC] Fix style in examples across documentation ## What changes were proposed in this pull request? This PR fixes the documentation as below: - Python has 4 spaces and Java and Scala has 2 spaces (See https://cwiki.apache.org/confluence/display/SPARK/Spark+Code+Style+Guide). - Avoid excessive parentheses and curly braces for anonymous functions. (See https://github.com/databricks/scala-style-guide#anonymous) ## How was this patch tested? N/A Author: hyukjinkwonCloses #14593 from HyukjinKwon/minor-documentation. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f4482225 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f4482225 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f4482225 Branch: refs/heads/master Commit: f4482225c405b9cfe078deac74e4c28e2dcc97c3 Parents: 993923c Author: hyukjinkwon Authored: Fri Aug 12 10:00:58 2016 +0100 Committer: Sean Owen Committed: Fri Aug 12 10:00:58 2016 +0100 -- docs/graphx-programming-guide.md| 8 +++--- docs/programming-guide.md | 4 +-- docs/spark-standalone.md| 6 ++-- docs/streaming-custom-receivers.md | 48 docs/streaming-programming-guide.md | 28 +-- 5 files changed, 47 insertions(+), 47 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/f4482225/docs/graphx-programming-guide.md -- diff --git a/docs/graphx-programming-guide.md b/docs/graphx-programming-guide.md index bf4b968..6f738f0 100644 --- a/docs/graphx-programming-guide.md +++ b/docs/graphx-programming-guide.md @@ -421,15 +421,15 @@ val graph = Graph(users, relationships, defaultUser) // Notice that there is a user 0 (for which we have no information) connected to users // 4 (peter) and 5 (franklin). graph.triplets.map( -triplet => triplet.srcAttr._1 + " is the " + triplet.attr + " of " + triplet.dstAttr._1 - ).collect.foreach(println(_)) + triplet => triplet.srcAttr._1 + " is the " + triplet.attr + " of " + triplet.dstAttr._1 +).collect.foreach(println(_)) // Remove missing vertices as well as the edges to connected to them val validGraph = graph.subgraph(vpred = (id, attr) => attr._2 != "Missing") // The valid subgraph will disconnect users 4 and 5 by removing user 0 validGraph.vertices.collect.foreach(println(_)) validGraph.triplets.map( -triplet => triplet.srcAttr._1 + " is the " + triplet.attr + " of " + triplet.dstAttr._1 - ).collect.foreach(println(_)) + triplet => triplet.srcAttr._1 + " is the " + triplet.attr + " of " + triplet.dstAttr._1 +).collect.foreach(println(_)) {% endhighlight %} > Note in the above example only the vertex predicate is provided. The > `subgraph` operator defaults http://git-wip-us.apache.org/repos/asf/spark/blob/f4482225/docs/programming-guide.md -- diff --git a/docs/programming-guide.md b/docs/programming-guide.md index f828329..40287d7 100644 --- a/docs/programming-guide.md +++ b/docs/programming-guide.md @@ -1516,8 +1516,8 @@ data.map(x -> { accum.add(x); return f(x); }); {% highlight python %} accum = sc.accumulator(0) def g(x): - accum.add(x) - return f(x) +accum.add(x) +return f(x) data.map(g) # Here, accum is still 0 because no actions have caused the `map` to be computed. {% endhighlight %} http://git-wip-us.apache.org/repos/asf/spark/blob/f4482225/docs/spark-standalone.md -- diff --git a/docs/spark-standalone.md b/docs/spark-standalone.md index 5ae63fe..1097f1f 100644 --- a/docs/spark-standalone.md +++ b/docs/spark-standalone.md @@ -298,9 +298,9 @@ application at a time. 
You can cap the number of cores by setting `spark.cores.m {% highlight scala %} val conf = new SparkConf() - .setMaster(...) - .setAppName(...) - .set("spark.cores.max", "10") + .setMaster(...) + .setAppName(...) + .set("spark.cores.max", "10") val sc = new SparkContext(conf) {% endhighlight %} http://git-wip-us.apache.org/repos/asf/spark/blob/f4482225/docs/streaming-custom-receivers.md -- diff --git a/docs/streaming-custom-receivers.md b/docs/streaming-custom-receivers.md index 479140f..fae5901 100644 --- a/docs/streaming-custom-receivers.md +++ b/docs/streaming-custom-receivers.md @@ -59,8 +59,8 @@ class CustomReceiver(host: String, port: Int) } def onStop() { - // There is nothing much to do as the thread calling receive() - // is
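For reference, a tiny sketch of the anonymous-function guideline being applied (the `sc` RDD is illustrative): drop redundant parentheses and braces around one-line lambdas.

```scala
val rdd = sc.parallelize(1 to 10)

// discouraged: the extra parentheses and braces add nothing
rdd.map({ x => x + 1 })

// preferred
rdd.map(x => x + 1)
rdd.map(_ + 1)
```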
spark git commit: [SPARK-16985] Change dataFormat from yyyyMMddHHmm to yyyyMMddHHmmss
Repository: spark Updated Branches: refs/heads/master 00e103a6e -> 993923c8f [SPARK-16985] Change dataFormat from MMddHHmm to MMddHHmmss ## What changes were proposed in this pull request? In our cluster, sometimes the sql output maybe overrided. When I submit some sql, all insert into the same table, and the sql will cost less one minute, here is the detail, 1 sql1, 11:03 insert into table. 2 sql2, 11:04:11 insert into table. 3 sql3, 11:04:48 insert into table. 4 sql4, 11:05 insert into table. 5 sql5, 11:06 insert into table. The sql3's output file will override the sql2's output file. here is the log: ``` 16/05/04 11:04:11 INFO hive.SparkHiveHadoopWriter: XXfinalPath=hdfs://tl-sng-gdt-nn-tdw.tencent-distribute.com:54310/tmp/assorz/tdw-tdwadmin/20160504/04559505496526517_-1_1204544348/1/_tmp.p_20160428/attempt_201605041104_0001_m_00_1 16/05/04 11:04:48 INFO hive.SparkHiveHadoopWriter: XXfinalPath=hdfs://tl-sng-gdt-nn-tdw.tencent-distribute.com:54310/tmp/assorz/tdw-tdwadmin/20160504/04559505496526517_-1_212180468/1/_tmp.p_20160428/attempt_201605041104_0001_m_00_1 ``` The reason is the output file use SimpleDateFormat("MMddHHmm"), if two sql insert into the same table in the same minute, the output will be overrite. I think we should change dateFormat to "MMddHHmmss", in our cluster, we can't finished a sql in one second. ## How was this patch tested? (Please explain how this patch was tested. E.g. unit tests, integration tests, manual tests) (If this patch involves UI changes, please attach a screenshot; otherwise, remove this) Author: hongshenCloses #14574 from shenh062326/SPARK-16985. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/993923c8 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/993923c8 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/993923c8 Branch: refs/heads/master Commit: 993923c8f5ca719daf905285738b7fdcaf944d8c Parents: 00e103a Author: hongshen Authored: Fri Aug 12 09:58:02 2016 +0100 Committer: Sean Owen Committed: Fri Aug 12 09:58:02 2016 +0100 -- core/src/main/scala/org/apache/spark/SparkHadoopWriter.scala| 4 ++-- core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala| 2 +- core/src/main/scala/org/apache/spark/rdd/NewHadoopRDD.scala | 2 +- core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/993923c8/core/src/main/scala/org/apache/spark/SparkHadoopWriter.scala -- diff --git a/core/src/main/scala/org/apache/spark/SparkHadoopWriter.scala b/core/src/main/scala/org/apache/spark/SparkHadoopWriter.scala index 17daac1..6550d70 100644 --- a/core/src/main/scala/org/apache/spark/SparkHadoopWriter.scala +++ b/core/src/main/scala/org/apache/spark/SparkHadoopWriter.scala @@ -67,7 +67,7 @@ class SparkHadoopWriter(jobConf: JobConf) extends Logging with Serializable { def setup(jobid: Int, splitid: Int, attemptid: Int) { setIDs(jobid, splitid, attemptid) -HadoopRDD.addLocalConfiguration(new SimpleDateFormat("MMddHHmm").format(now), +HadoopRDD.addLocalConfiguration(new SimpleDateFormat("MMddHHmmss").format(now), jobid, splitID, attemptID, conf.value) } @@ -162,7 +162,7 @@ class SparkHadoopWriter(jobConf: JobConf) extends Logging with Serializable { private[spark] object SparkHadoopWriter { def createJobID(time: Date, id: Int): JobID = { -val formatter = new SimpleDateFormat("MMddHHmm") +val formatter = new SimpleDateFormat("MMddHHmmss") val jobtrackerID = formatter.format(time) new 
JobID(jobtrackerID, id) } http://git-wip-us.apache.org/repos/asf/spark/blob/993923c8/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala -- diff --git a/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala b/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala index 99afe02..fd3a14b 100644 --- a/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala @@ -241,7 +241,7 @@ class HadoopRDD[K, V]( var reader: RecordReader[K, V] = null val inputFormat = getInputFormat(jobConf) - HadoopRDD.addLocalConfiguration(new SimpleDateFormat("MMddHHmm").format(createTime), + HadoopRDD.addLocalConfiguration(new SimpleDateFormat("MMddHHmmss").format(createTime), context.stageId, theSplit.index, context.attemptNumber, jobConf) reader = inputFormat.getRecordReader(split.inputSplit.value, jobConf, Reporter.NULL)
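A standalone sketch of why the one-minute resolution collides (illustrative values taken from the log above): any two jobs started within the same minute share the same job-tracker id, while the second-resolution format keeps them distinct as long as no two jobs start within the same second, which this patch assumes.

```scala
import java.text.SimpleDateFormat
import java.util.Date

val now = new Date()
val byMinute = new SimpleDateFormat("yyyyMMddHHmm").format(now)   // e.g. "201605041104" for both jobs
val bySecond = new SimpleDateFormat("yyyyMMddHHmmss").format(now) // e.g. "20160504110411" vs "20160504110448"
```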
spark git commit: [SPARK-16975][SQL] Column-partition path starting '_' should be handled correctly
Repository: spark Updated Branches: refs/heads/master ccc6dc0f4 -> abff92bfd [SPARK-16975][SQL] Column-partition path starting '_' should be handled correctly ## What changes were proposed in this pull request? Currently, Spark ignores path names starting with underscore `_` and `.`. This causes read-failures for the column-partitioned file data sources whose partition column names starts from '_', e.g. `_col`. **Before** ```scala scala> spark.range(10).withColumn("_locality_code", $"id").write.partitionBy("_locality_code").save("/tmp/parquet") scala> spark.read.parquet("/tmp/parquet") org.apache.spark.sql.AnalysisException: Unable to infer schema for ParquetFormat at /tmp/parquet20. It must be specified manually; ``` **After** ```scala scala> spark.range(10).withColumn("_locality_code", $"id").write.partitionBy("_locality_code").save("/tmp/parquet") scala> spark.read.parquet("/tmp/parquet") res2: org.apache.spark.sql.DataFrame = [id: bigint, _locality_code: int] ``` ## How was this patch tested? Pass the Jenkins with a new test case. Author: Dongjoon HyunCloses #14585 from dongjoon-hyun/SPARK-16975-PARQUET. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/abff92bf Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/abff92bf Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/abff92bf Branch: refs/heads/master Commit: abff92bfdc7d4c9d2308794f0350561fe0ceb4dd Parents: ccc6dc0 Author: Dongjoon Hyun Authored: Fri Aug 12 14:40:12 2016 +0800 Committer: Cheng Lian Committed: Fri Aug 12 14:40:12 2016 +0800 -- .../datasources/PartitioningAwareFileCatalog.scala | 2 +- .../sql/execution/datasources/fileSourceInterfaces.scala| 2 +- .../sql/execution/datasources/json/JsonFileFormat.scala | 2 +- .../execution/datasources/parquet/ParquetFileFormat.scala | 3 ++- .../src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala | 9 + 5 files changed, 14 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/abff92bf/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala index 811e96c..cef9d4d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileCatalog.scala @@ -204,6 +204,6 @@ abstract class PartitioningAwareFileCatalog( private def isDataPath(path: Path): Boolean = { val name = path.getName -!(name.startsWith("_") || name.startsWith(".")) +!((name.startsWith("_") && !name.contains("=")) || name.startsWith(".")) } } http://git-wip-us.apache.org/repos/asf/spark/blob/abff92bf/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/fileSourceInterfaces.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/fileSourceInterfaces.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/fileSourceInterfaces.scala index f068779..e03a232 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/fileSourceInterfaces.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/fileSourceInterfaces.scala @@ -364,7 +364,7 @@ object HadoopFsRelation extends Logging { // We filter everything that starts 
with _ and ., except _common_metadata and _metadata // because Parquet needs to find those metadata files from leaf files returned by this method. // We should refactor this logic to not mix metadata files with data files. -(pathName.startsWith("_") || pathName.startsWith(".")) && +((pathName.startsWith("_") && !pathName.contains("=")) || pathName.startsWith(".")) && !pathName.startsWith("_common_metadata") && !pathName.startsWith("_metadata") } http://git-wip-us.apache.org/repos/asf/spark/blob/abff92bf/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonFileFormat.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonFileFormat.scala index 19681be..27910e2 100644 ---
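A simplified, string-based sketch of the revised predicate (the real code works on Hadoop `Path`s and, in `HadoopFsRelation`, also whitelists `_common_metadata` and `_metadata`): a leading underscore no longer hides a directory whose name contains `=`, i.e. a partition directory.

```scala
def isDataPath(name: String): Boolean =
  !((name.startsWith("_") && !name.contains("=")) || name.startsWith("."))

isDataPath("_locality_code=3") // true  -> partition directory is now read
isDataPath("_metadata")        // false -> metadata files stay hidden
isDataPath(".part-0.crc")      // false -> dot files stay hidden
```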
spark git commit: [MINOR][ML] Rename TreeEnsembleModels to TreeEnsembleModel for PySpark
Repository: spark Updated Branches: refs/heads/master ac84fb64d -> ccc6dc0f4 [MINOR][ML] Rename TreeEnsembleModels to TreeEnsembleModel for PySpark ## What changes were proposed in this pull request? Fix the typo of ```TreeEnsembleModels``` for PySpark, it should ```TreeEnsembleModel``` which will be consistent with Scala. What's more, it represents a tree ensemble model, so ```TreeEnsembleModel``` should be more reasonable. This should not be used public, so it will not involve breaking change. ## How was this patch tested? No new tests, should pass existing ones. Author: Yanbo LiangCloses #14454 from yanboliang/TreeEnsembleModel. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ccc6dc0f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ccc6dc0f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ccc6dc0f Branch: refs/heads/master Commit: ccc6dc0f4b62837c73fca0e3c8b9c14be798b062 Parents: ac84fb6 Author: Yanbo Liang Authored: Thu Aug 11 22:39:19 2016 -0700 Committer: Yanbo Liang Committed: Thu Aug 11 22:39:19 2016 -0700 -- python/pyspark/ml/classification.py | 6 +++--- python/pyspark/ml/regression.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/ccc6dc0f/python/pyspark/ml/classification.py -- diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index 9a3c7b1..6468007 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -22,7 +22,7 @@ from pyspark import since, keyword_only from pyspark.ml import Estimator, Model from pyspark.ml.param.shared import * from pyspark.ml.regression import DecisionTreeModel, DecisionTreeRegressionModel, \ -RandomForestParams, TreeEnsembleModels, TreeEnsembleParams +RandomForestParams, TreeEnsembleModel, TreeEnsembleParams from pyspark.ml.util import * from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaParams from pyspark.ml.wrapper import JavaWrapper @@ -722,7 +722,7 @@ class RandomForestClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPred return RandomForestClassificationModel(java_model) -class RandomForestClassificationModel(TreeEnsembleModels, JavaMLWritable, JavaMLReadable): +class RandomForestClassificationModel(TreeEnsembleModel, JavaMLWritable, JavaMLReadable): """ Model fitted by RandomForestClassifier. @@ -873,7 +873,7 @@ class GBTClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol return self.getOrDefault(self.lossType) -class GBTClassificationModel(TreeEnsembleModels, JavaMLWritable, JavaMLReadable): +class GBTClassificationModel(TreeEnsembleModel, JavaMLWritable, JavaMLReadable): """ Model fitted by GBTClassifier. 
http://git-wip-us.apache.org/repos/asf/spark/blob/ccc6dc0f/python/pyspark/ml/regression.py -- diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index d88dc75..1ae2bd4 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -749,7 +749,7 @@ class DecisionTreeModel(JavaModel): @inherit_doc -class TreeEnsembleModels(JavaModel): +class TreeEnsembleModel(JavaModel): """ (private abstraction) @@ -909,7 +909,7 @@ class RandomForestRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredi return RandomForestRegressionModel(java_model) -class RandomForestRegressionModel(TreeEnsembleModels, JavaMLWritable, JavaMLReadable): +class RandomForestRegressionModel(TreeEnsembleModel, JavaMLWritable, JavaMLReadable): """ Model fitted by :class:`RandomForestRegressor`. @@ -1047,7 +1047,7 @@ class GBTRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, return self.getOrDefault(self.lossType) -class GBTRegressionModel(TreeEnsembleModels, JavaMLWritable, JavaMLReadable): +class GBTRegressionModel(TreeEnsembleModel, JavaMLWritable, JavaMLReadable): """ Model fitted by :class:`GBTRegressor`. - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-13081][PYSPARK][SPARK_SUBMIT] Allow set pythonExec of driver and executor through conf…
Repository: spark Updated Branches: refs/heads/master ea0bf91b4 -> 7a9e25c38 [SPARK-13081][PYSPARK][SPARK_SUBMIT] Allow set pythonExec of driver and executor through conf⦠Before this PR, user have to export environment variable to specify the python of driver & executor which is not so convenient for users. This PR is trying to allow user to specify python through configuration "--pyspark-driver-python" & "--pyspark-executor-python" Manually test in local & yarn mode for pyspark-shell and pyspark batch mode. Author: Jeff ZhangCloses #13146 from zjffdu/SPARK-13081. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7a9e25c3 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7a9e25c3 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7a9e25c3 Branch: refs/heads/master Commit: 7a9e25c38380e6c62080d62ad38a4830e44fe753 Parents: ea0bf91 Author: Jeff Zhang Authored: Thu Aug 11 20:08:25 2016 -0700 Committer: Marcelo Vanzin Committed: Thu Aug 11 20:08:39 2016 -0700 -- .../org/apache/spark/deploy/PythonRunner.scala | 14 ++--- .../apache/spark/internal/config/package.scala | 8 .../spark/launcher/SparkLauncherSuite.java | 8 .../scala/org/apache/spark/SparkConfSuite.scala | 2 ++ .../apache/spark/deploy/SparkSubmitSuite.scala | 5 + docs/configuration.md | 21 ++-- .../apache/spark/launcher/SparkLauncher.java| 4 .../launcher/SparkSubmitCommandBuilder.java | 18 ++--- 8 files changed, 72 insertions(+), 8 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/7a9e25c3/core/src/main/scala/org/apache/spark/deploy/PythonRunner.scala -- diff --git a/core/src/main/scala/org/apache/spark/deploy/PythonRunner.scala b/core/src/main/scala/org/apache/spark/deploy/PythonRunner.scala index 6227a30..0b1cec2 100644 --- a/core/src/main/scala/org/apache/spark/deploy/PythonRunner.scala +++ b/core/src/main/scala/org/apache/spark/deploy/PythonRunner.scala @@ -24,8 +24,9 @@ import scala.collection.mutable.ArrayBuffer import scala.collection.JavaConverters._ import scala.util.Try -import org.apache.spark.SparkUserAppException +import org.apache.spark.{SparkConf, SparkUserAppException} import org.apache.spark.api.python.PythonUtils +import org.apache.spark.internal.config._ import org.apache.spark.util.{RedirectThread, Utils} /** @@ -37,8 +38,12 @@ object PythonRunner { val pythonFile = args(0) val pyFiles = args(1) val otherArgs = args.slice(2, args.length) -val pythonExec = - sys.env.getOrElse("PYSPARK_DRIVER_PYTHON", sys.env.getOrElse("PYSPARK_PYTHON", "python")) +val sparkConf = new SparkConf() +val pythonExec = sparkConf.get(PYSPARK_DRIVER_PYTHON) + .orElse(sparkConf.get(PYSPARK_PYTHON)) + .orElse(sys.env.get("PYSPARK_DRIVER_PYTHON")) + .orElse(sys.env.get("PYSPARK_PYTHON")) + .getOrElse("python") // Format python file paths before adding them to the PYTHONPATH val formattedPythonFile = formatPath(pythonFile) @@ -77,6 +82,9 @@ object PythonRunner { // This is equivalent to setting the -u flag; we use it because ipython doesn't support -u: env.put("PYTHONUNBUFFERED", "YES") // value is needed to be set to a non-empty string env.put("PYSPARK_GATEWAY_PORT", "" + gatewayServer.getListeningPort) +// pass conf spark.pyspark.python to python process, the only way to pass info to +// python process is through environment variable. 
+sparkConf.get(PYSPARK_PYTHON).foreach(env.put("PYSPARK_PYTHON", _)) builder.redirectErrorStream(true) // Ugly but needed for stdout and stderr to synchronize try { val process = builder.start() http://git-wip-us.apache.org/repos/asf/spark/blob/7a9e25c3/core/src/main/scala/org/apache/spark/internal/config/package.scala -- diff --git a/core/src/main/scala/org/apache/spark/internal/config/package.scala b/core/src/main/scala/org/apache/spark/internal/config/package.scala index e646d99..be3dac4 100644 --- a/core/src/main/scala/org/apache/spark/internal/config/package.scala +++ b/core/src/main/scala/org/apache/spark/internal/config/package.scala @@ -106,4 +106,12 @@ package object config { private[spark] val METRICS_NAMESPACE = ConfigBuilder("spark.metrics.namespace") .stringConf .createOptional + + private[spark] val PYSPARK_DRIVER_PYTHON = ConfigBuilder("spark.pyspark.driver.python") +.stringConf +.createOptional + + private[spark] val PYSPARK_PYTHON = ConfigBuilder("spark.pyspark.python") +.stringConf +
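A sketch of setting the new keys programmatically (the interpreter paths are illustrative; the keys can equally be passed with `--conf` to spark-submit). Per the `PythonRunner` change above, the driver interpreter resolves as `spark.pyspark.driver.python`, then `spark.pyspark.python`, then the `PYSPARK_DRIVER_PYTHON`/`PYSPARK_PYTHON` environment variables, then plain `python`; `spark.pyspark.python` is also forwarded to executor Python workers through `PYSPARK_PYTHON`.

```scala
import org.apache.spark.SparkConf

val conf = new SparkConf()
  .set("spark.pyspark.python", "/usr/bin/python3")           // executors (and driver fallback)
  .set("spark.pyspark.driver.python", "/usr/bin/python3.5")  // driver only
```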
spark git commit: [SPARK-17022][YARN] Handle potential deadlock in driver handling messages
Repository: spark Updated Branches: refs/heads/branch-2.0 bc683f037 -> 0fb01496c [SPARK-17022][YARN] Handle potential deadlock in driver handling messages ## What changes were proposed in this pull request? We directly send RequestExecutors to AM instead of transfer it to yarnShedulerBackend first, to avoid potential deadlock. ## How was this patch tested? manual tests Author: WangTaoTheTonicCloses #14605 from WangTaoTheTonic/lock. (cherry picked from commit ea0bf91b4a2ca3ef472906e50e31fd6268b6f53e) Signed-off-by: Marcelo Vanzin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0fb01496 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0fb01496 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0fb01496 Branch: refs/heads/branch-2.0 Commit: 0fb01496c09defa1436dbb7f5e1cbc5461617a31 Parents: bc683f0 Author: WangTaoTheTonic Authored: Thu Aug 11 15:09:23 2016 -0700 Committer: Marcelo Vanzin Committed: Thu Aug 11 15:09:32 2016 -0700 -- .../scheduler/cluster/YarnSchedulerBackend.scala | 18 +++--- 1 file changed, 15 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/0fb01496/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnSchedulerBackend.scala -- diff --git a/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnSchedulerBackend.scala b/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnSchedulerBackend.scala index 6b3c831..ea63ff5 100644 --- a/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnSchedulerBackend.scala +++ b/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnSchedulerBackend.scala @@ -125,8 +125,20 @@ private[spark] abstract class YarnSchedulerBackend( * This includes executors already pending or running. */ override def doRequestTotalExecutors(requestedTotal: Int): Boolean = { -yarnSchedulerEndpointRef.askWithRetry[Boolean]( - RequestExecutors(requestedTotal, localityAwareTasks, hostToLocalTaskCount)) +val r = RequestExecutors(requestedTotal, localityAwareTasks, hostToLocalTaskCount) +yarnSchedulerEndpoint.amEndpoint match { + case Some(am) => +try { + am.askWithRetry[Boolean](r) +} catch { + case NonFatal(e) => +logError(s"Sending $r to AM was unsuccessful", e) +return false +} + case None => +logWarning("Attempted to request executors before the AM has registered!") +return false +} } /** @@ -209,7 +221,7 @@ private[spark] abstract class YarnSchedulerBackend( */ private class YarnSchedulerEndpoint(override val rpcEnv: RpcEnv) extends ThreadSafeRpcEndpoint with Logging { -private var amEndpoint: Option[RpcEndpointRef] = None +var amEndpoint: Option[RpcEndpointRef] = None private val askAmThreadPool = ThreadUtils.newDaemonCachedThreadPool("yarn-scheduler-ask-am-thread-pool") - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
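A toy, self-contained sketch of the pattern adopted here (the trait and names are illustrative, not Spark's real RPC types): ask the AM endpoint directly and turn failures into `false`, rather than relaying the request through the backend's own endpoint, which can deadlock while the driver's message loop is busy.

```scala
import scala.util.control.NonFatal

trait EndpointRef { def askWithRetry[T](msg: Any): T }

case class RequestExecutors(requestedTotal: Int)

def requestTotalExecutors(amEndpoint: Option[EndpointRef], requestedTotal: Int): Boolean =
  amEndpoint match {
    case Some(am) =>
      try am.askWithRetry[Boolean](RequestExecutors(requestedTotal))
      catch {
        case NonFatal(e) =>
          println(s"Sending RequestExecutors($requestedTotal) to AM was unsuccessful: $e")
          false
      }
    case None =>
      println("Attempted to request executors before the AM has registered!")
      false
  }
```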