svn commit: r25258 - in /dev/spark/2.3.1-SNAPSHOT-2018_02_23_22_01-578607b-docs: ./ _site/ _site/api/ _site/api/R/ _site/api/java/ _site/api/java/lib/ _site/api/java/org/ _site/api/java/org/apache/ _s

2018-02-23 Thread pwendell
Author: pwendell
Date: Sat Feb 24 06:17:09 2018
New Revision: 25258

Log:
Apache Spark 2.3.1-SNAPSHOT-2018_02_23_22_01-578607b docs


[This commit notification would consist of 1443 parts,
which exceeds the limit of 50, so it has been shortened to this summary.]

-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



svn commit: r25257 - in /dev/spark/2.4.0-SNAPSHOT-2018_02_23_20_01-1a198ce-docs: ./ _site/ _site/api/ _site/api/R/ _site/api/java/ _site/api/java/lib/ _site/api/java/org/ _site/api/java/org/apache/ _s

2018-02-23 Thread pwendell
Author: pwendell
Date: Sat Feb 24 04:15:10 2018
New Revision: 25257

Log:
Apache Spark 2.4.0-SNAPSHOT-2018_02_23_20_01-1a198ce docs


[This commit notification would consist of 1444 parts,
which exceeds the limit of 50, so it has been shortened to this summary.]

-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



spark git commit: [SPARK-23475][UI][BACKPORT-2.3] Show also skipped stages

2018-02-23 Thread vanzin
Repository: spark
Updated Branches:
  refs/heads/branch-2.3 285b841ff -> 578607b30


[SPARK-23475][UI][BACKPORT-2.3] Show also skipped stages

## What changes were proposed in this pull request?

SPARK-20648 introduced the `SKIPPED` status for stages. Previously, skipped stages
were shown on the UI as `PENDING`; after that change, they are not shown on the UI
at all.

This PR introduces a new section so that `SKIPPED` stages are also shown, in a
proper table.
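
For context, a stage ends up `SKIPPED` when its shuffle output is still available from an
earlier job, so Spark does not need to re-run it. A minimal spark-shell sketch (illustrative
only, not part of this patch) that produces such a stage:

```
// Running the same shuffle-based action twice: the second job reuses the
// existing shuffle output, so its map stage shows up as SKIPPED in the UI.
val counts = sc.parallelize(1 to 1000).map(x => (x % 10, x)).reduceByKey(_ + _)
counts.count()  // first job: both stages execute
counts.count()  // second job: the shuffle map stage is skipped
```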

Manual backport to branch-2.3.

## How was this patch tested?

Added a unit test.

Author: Marco Gaido 

Closes #20662 from mgaido91/SPARK-23475_2.3.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/578607b3
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/578607b3
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/578607b3

Branch: refs/heads/branch-2.3
Commit: 578607b30a65d101380acbe9c9740dc267a5d55c
Parents: 285b841
Author: Marco Gaido 
Authored: Fri Feb 23 18:27:33 2018 -0800
Committer: Marcelo Vanzin 
Committed: Fri Feb 23 18:27:33 2018 -0800

--
 .../apache/spark/ui/jobs/AllStagesPage.scala| 27 
 .../org/apache/spark/ui/UISeleniumSuite.scala   | 17 
 2 files changed, 44 insertions(+)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/578607b3/core/src/main/scala/org/apache/spark/ui/jobs/AllStagesPage.scala
--
diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/AllStagesPage.scala b/core/src/main/scala/org/apache/spark/ui/jobs/AllStagesPage.scala
index b1e3434..9325b90 100644
--- a/core/src/main/scala/org/apache/spark/ui/jobs/AllStagesPage.scala
+++ b/core/src/main/scala/org/apache/spark/ui/jobs/AllStagesPage.scala
@@ -36,6 +36,7 @@ private[ui] class AllStagesPage(parent: StagesTab) extends WebUIPage("") {
 
 val activeStages = allStages.filter(_.status == StageStatus.ACTIVE)
 val pendingStages = allStages.filter(_.status == StageStatus.PENDING)
+val skippedStages = allStages.filter(_.status == StageStatus.SKIPPED)
 val completedStages = allStages.filter(_.status == StageStatus.COMPLETE)
 val failedStages = allStages.filter(_.status == StageStatus.FAILED).reverse
 
@@ -51,6 +52,9 @@ private[ui] class AllStagesPage(parent: StagesTab) extends WebUIPage("") {
 val completedStagesTable =
   new StageTableBase(parent.store, request, completedStages, "completed", "completedStage",
 parent.basePath, subPath, parent.isFairScheduler, false, false)
+val skippedStagesTable =
+  new StageTableBase(parent.store, request, skippedStages, "skipped", "skippedStage",
+parent.basePath, subPath, parent.isFairScheduler, false, false)
 val failedStagesTable =
   new StageTableBase(parent.store, request, failedStages, "failed", "failedStage",
 parent.basePath, subPath, parent.isFairScheduler, false, true)
@@ -66,6 +70,7 @@ private[ui] class AllStagesPage(parent: StagesTab) extends WebUIPage("") {
 val shouldShowActiveStages = activeStages.nonEmpty
 val shouldShowPendingStages = pendingStages.nonEmpty
 val shouldShowCompletedStages = completedStages.nonEmpty
+val shouldShowSkippedStages = skippedStages.nonEmpty
 val shouldShowFailedStages = failedStages.nonEmpty
 
 val appSummary = parent.store.appSummary()
@@ -103,6 +108,14 @@ private[ui] class AllStagesPage(parent: StagesTab) extends WebUIPage("") {
 }
   }
   {
+if (shouldShowSkippedStages) {
+  
+Skipped Stages:
+{skippedStages.size}
+  
+}
+  }
+  {
 if (shouldShowFailedStages) {
   
 Failed Stages:
@@ -133,6 +146,20 @@ private[ui] class AllStagesPage(parent: StagesTab) extends WebUIPage("") {
   content ++= Completed Stages 
({completedStageNumStr}) ++
   completedStagesTable.toNodeSeq
 }
+if (shouldShowSkippedStages) {
+  content ++=
+
+  
+
+Skipped Stages ({skippedStages.size})
+  
+ ++
+
+  {skippedStagesTable.toNodeSeq}
+
+}
 if (shouldShowFailedStages) {
   content ++= Failed Stages ({numFailedStages}) ++
   failedStagesTable.toNodeSeq

http://git-wip-us.apache.org/repos/asf/spark/blob/578607b3/core/src/test/scala/org/apache/spark/ui/UISeleniumSuite.scala
--
diff --git a/core/src/test/scala/org/apache/spark/ui/UISeleniumSuite.scala b/core/src/test/scala/org/apache/spark/ui/UISeleniumSuite.scala
index 3265467..0f20eea 100644
--- 

spark git commit: [SPARK-23459][SQL] Improve the error message when unknown column is specified in partition columns

2018-02-23 Thread lixiao
Repository: spark
Updated Branches:
  refs/heads/master 855ce13d0 -> 1a198ce8f


[SPARK-23459][SQL] Improve the error message when unknown column is specified 
in partition columns

## What changes were proposed in this pull request?

This PR avoids printing internal schema information when an unknown column is
specified in the partition columns; instead, it prints the column names of the
schema in a more readable format.

The following is an example.

Source code
```
test("save with an unknown partition column") {
  withTempDir { dir =>
val path = dir.getCanonicalPath
  Seq(1L -> "a").toDF("i", "j").write
.format("parquet")
.partitionBy("unknownColumn")
.save(path)
  }
```
Output without this PR
```
Partition column unknownColumn not found in schema StructType(StructField(i,LongType,false), StructField(j,StringType,true));
```

Output with this PR
```
Partition column `unknownColumn` not found in schema struct<i:bigint,j:string>;
```
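
The shorter message comes from using `StructType.catalogString` instead of the default
`toString`. A small sketch (assuming a Spark shell, so only the types import is needed)
showing the two renderings of the schema above:

```
import org.apache.spark.sql.types._

val schema = StructType(Seq(
  StructField("i", LongType, nullable = false),
  StructField("j", StringType)))

// Default rendering, used before this PR:
//   StructType(StructField(i,LongType,false), StructField(j,StringType,true))
println(schema)

// Compact catalog form, used by the new error message:
//   struct<i:bigint,j:string>
println(schema.catalogString)
```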

## How was this patch tested?

Manually tested

Author: Kazuaki Ishizaki 

Closes #20653 from kiszk/SPARK-23459.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1a198ce8
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1a198ce8
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1a198ce8

Branch: refs/heads/master
Commit: 1a198ce8f580bcf35b9cbfab403fc40f821046a1
Parents: 855ce13
Author: Kazuaki Ishizaki 
Authored: Fri Feb 23 16:30:32 2018 -0800
Committer: gatorsmile 
Committed: Fri Feb 23 16:30:32 2018 -0800

--
 .../execution/datasources/PartitioningUtils.scala   |  3 ++-
 .../apache/spark/sql/sources/SaveLoadSuite.scala| 16 
 2 files changed, 18 insertions(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/1a198ce8/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala
--
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala
index 379acb6..f9a2480 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningUtils.scala
@@ -486,7 +486,8 @@ object PartitioningUtils {
 val equality = columnNameEquality(caseSensitive)
 StructType(partitionColumns.map { col =>
   schema.find(f => equality(f.name, col)).getOrElse {
-throw new AnalysisException(s"Partition column $col not found in schema $schema")
+val schemaCatalog = schema.catalogString
+throw new AnalysisException(s"Partition column `$col` not found in schema $schemaCatalog")
   }
 }).asNullable
   }

http://git-wip-us.apache.org/repos/asf/spark/blob/1a198ce8/sql/core/src/test/scala/org/apache/spark/sql/sources/SaveLoadSuite.scala
--
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/SaveLoadSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/SaveLoadSuite.scala
index 773d34d..12779b4 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/sources/SaveLoadSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/SaveLoadSuite.scala
@@ -126,4 +126,20 @@ class SaveLoadSuite extends DataSourceTest with SharedSQLContext with BeforeAndA
 
 checkLoad(df2, "jsonTable2")
   }
+
+  test("SPARK-23459: Improve error message when specified unknown column in 
partition columns") {
+withTempDir { dir =>
+  val path = dir.getCanonicalPath
+  val unknown = "unknownColumn"
+  val df = Seq(1L -> "a").toDF("i", "j")
+  val schemaCatalog = df.schema.catalogString
+  val e = intercept[AnalysisException] {
+df.write
+  .format("parquet")
+  .partitionBy(unknown)
+  .save(path)
+  }.getMessage
+  assert(e.contains(s"Partition column `$unknown` not found in schema $schemaCatalog"))
+}
+  }
 }


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



svn commit: r25256 - in /dev/spark/2.4.0-SNAPSHOT-2018_02_23_16_01-855ce13-docs: ./ _site/ _site/api/ _site/api/R/ _site/api/java/ _site/api/java/lib/ _site/api/java/org/ _site/api/java/org/apache/ _s

2018-02-23 Thread pwendell
Author: pwendell
Date: Sat Feb 24 00:15:07 2018
New Revision: 25256

Log:
Apache Spark 2.4.0-SNAPSHOT-2018_02_23_16_01-855ce13 docs


[This commit notification would consist of 1444 parts,
which exceeds the limit of 50, so it has been shortened to this summary.]

-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



spark git commit: [SPARK-23408][SS] Synchronize successive AddData actions in Streaming*JoinSuite

2018-02-23 Thread tdas
Repository: spark
Updated Branches:
  refs/heads/master 049f243c5 -> 855ce13d0


[SPARK-23408][SS] Synchronize successive AddData actions in Streaming*JoinSuite

**The best way to review this PR is to ignore whitespace/indent changes. Use 
this link - https://github.com/apache/spark/pull/20650/files?w=1**

## What changes were proposed in this pull request?

The stream-stream join tests add data to multiple sources and expect it all to 
show up in the next batch. But there's a race condition; the new batch might 
trigger when only one of the AddData actions has been reached.

A prior attempt by jose-torres in #20646 tried to synchronize on all the memory
sources together whenever consecutive AddData actions were found. However, that
approach carries the risk of deadlock, as well as unintended modification of the
stress tests (see the above PR for a detailed explanation). Instead, this PR
attempts the following.

- A new action called `StreamProgressBlockedActions` that allows multiple 
actions to be executed while the streaming query is blocked from making 
progress. This allows data to be added to multiple sources that are made 
visible simultaneously in the next batch.
- An alias of `StreamProgressBlockedActions` called `MultiAddData` is 
explicitly used in the `Streaming*JoinSuites` to add data to two memory sources 
simultaneously.

This should avoid unintentional modification of the stress tests (or any other 
test for that matter) while making sure that the flaky tests are deterministic.
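
As an illustration of the intended usage, here is a hedged sketch (not code from this patch;
the streams, query, and expected answer are made up, and it assumes a suite extending
`StreamTest` with `SharedSQLContext` so the `MemoryStream` implicits are in scope):

```
// Two memory sources feeding one query. With plain AddData a micro-batch could
// trigger between the two adds; MultiAddData holds the progress lock so both
// additions become visible in the same batch.
val input1 = MemoryStream[Int]
val input2 = MemoryStream[Int]
val unioned = input1.toDS().union(input2.toDS())

testStream(unioned)(
  // Flaky pattern this PR replaces:
  //   AddData(input1, 1, 2, 3),
  //   AddData(input2, 4, 5, 6),
  MultiAddData(input1, 1, 2, 3)(input2, 4, 5, 6),
  CheckAnswer(1, 2, 3, 4, 5, 6)
)
```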

## How was this patch tested?
Modified test cases in `Streaming*JoinSuites` where there are consecutive 
`AddData` actions.

Author: Tathagata Das 

Closes #20650 from tdas/SPARK-23408.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/855ce13d
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/855ce13d
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/855ce13d

Branch: refs/heads/master
Commit: 855ce13d045569b7b16fdc7eee9c981f4ff3a545
Parents: 049f243
Author: Tathagata Das 
Authored: Fri Feb 23 12:40:58 2018 -0800
Committer: Tathagata Das 
Committed: Fri Feb 23 12:40:58 2018 -0800

--
 .../streaming/MicroBatchExecution.scala |  10 +
 .../apache/spark/sql/streaming/StreamTest.scala | 472 ++-
 .../sql/streaming/StreamingJoinSuite.scala  |  54 +--
 3 files changed, 284 insertions(+), 252 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/855ce13d/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala
--
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala
index 8465501..6bd0397 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/MicroBatchExecution.scala
@@ -504,6 +504,16 @@ class MicroBatchExecution(
 }
   }
 
+  /** Execute a function while locking the stream from making any progress */
+  private[sql] def withProgressLocked(f: => Unit): Unit = {
+awaitProgressLock.lock()
+try {
+  f
+} finally {
+  awaitProgressLock.unlock()
+}
+  }
+
   private def toJava(scalaOption: Option[OffsetV2]): Optional[OffsetV2] = {
 Optional.ofNullable(scalaOption.orNull)
   }

http://git-wip-us.apache.org/repos/asf/spark/blob/855ce13d/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala
--
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala
index 159dd0e..08f722e 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamTest.scala
@@ -102,6 +102,19 @@ trait StreamTest extends QueryTest with SharedSQLContext with TimeLimits with Be
   AddDataMemory(source, data)
   }
 
+  /**
+   * Adds data to multiple memory streams such that all the data will be made visible in the
+   * same batch. This is applicable only to MicroBatchExecution, as this coordination cannot be
+   * performed at the driver in ContinuousExecutions.
+   */
+  object MultiAddData {
+def apply[A]
+  (source1: MemoryStream[A], data1: A*)(source2: MemoryStream[A], data2: A*): StreamAction = {
+  val actions = Seq(AddDataMemory(source1, data1), AddDataMemory(source2, data2))
+  

svn commit: r25233 - in /dev/spark/2.4.0-SNAPSHOT-2018_02_23_00_01-049f243-docs: ./ _site/ _site/api/ _site/api/R/ _site/api/java/ _site/api/java/lib/ _site/api/java/org/ _site/api/java/org/apache/ _s

2018-02-23 Thread pwendell
Author: pwendell
Date: Fri Feb 23 08:15:57 2018
New Revision: 25233

Log:
Apache Spark 2.4.0-SNAPSHOT-2018_02_23_00_01-049f243 docs


[This commit notification would consist of 1444 parts,
which exceeds the limit of 50, so it has been shortened to this summary.]

-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org