[spark] branch master updated: [SPARK-39810][SQL] Catalog.tableExists should handle nested namespace

2022-07-19 Thread wenchen
This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new 0cc96f76d8a [SPARK-39810][SQL] Catalog.tableExists should handle 
nested namespace
0cc96f76d8a is described below

commit 0cc96f76d8a4858aee09e1fa32658da3ae76d384
Author: Rui Wang 
AuthorDate: Wed Jul 20 11:18:34 2022 +0800

[SPARK-39810][SQL] Catalog.tableExists should handle nested namespace

### What changes were proposed in this pull request?

Let CatalogImpl.tableExists reuse the CatalogImpl.getTable code.

### Why are the changes needed?

Currently `tableExists` assumes the input has exactly 3 name parts, which is a wrong 
assumption (the namespace could be an array of name parts). The `getTable` implementation 
is correct, so we can reuse the `getTable` code: if `getTable` succeeds, then 
`tableExists` can return true.
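
A minimal usage sketch (the catalog and table names are illustrative and mirror the new test added below):

```scala
// Assumes a v2 catalog named "testcat" is already configured in the session.
spark.sql("CREATE TABLE testcat.my_db2.my_db3.my_table2 (i INT)")

// Before this change the 4-part name was mis-split under the 3-name-part assumption,
// so the check could give the wrong answer; now it simply delegates to getTable.
spark.catalog.tableExists("testcat.my_db2.my_db3.my_table2")   // true
spark.catalog.tableExists("testcat.my_db2.my_db3.missing")     // false
```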

### Does this PR introduce _any_ user-facing change?

No

### How was this patch tested?

UT

Closes #37220 from amaliujia/SPARK-39810.

Authored-by: Rui Wang 
Signed-off-by: Wenchen Fan 
---
 .../scala/org/apache/spark/sql/internal/CatalogImpl.scala| 12 
 .../scala/org/apache/spark/sql/internal/CatalogSuite.scala   | 12 
 2 files changed, 16 insertions(+), 8 deletions(-)

diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala
index 8ca11f620a5..e11b349777e 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/CatalogImpl.scala
@@ -28,7 +28,7 @@ import org.apache.spark.sql.catalyst.catalog._
 import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
 import org.apache.spark.sql.catalyst.plans.logical.{CreateTable, 
LocalRelation, RecoverPartitions, ShowFunctions, ShowNamespaces, ShowTables, 
SubqueryAlias, TableSpec, View}
 import org.apache.spark.sql.catalyst.util.CharVarcharUtils
-import org.apache.spark.sql.connector.catalog.{CatalogManager, CatalogPlugin, 
Identifier, SupportsNamespaces, TableCatalog}
+import org.apache.spark.sql.connector.catalog.{CatalogManager, CatalogPlugin, 
SupportsNamespaces, TableCatalog}
 import 
org.apache.spark.sql.connector.catalog.CatalogV2Implicits.{CatalogHelper, 
IdentifierHelper, MultipartIdentifierHelper, TransformHelper}
 import org.apache.spark.sql.errors.QueryCompilationErrors
 import org.apache.spark.sql.execution.datasources.{DataSource, LogicalRelation}
@@ -493,14 +493,10 @@ class CatalogImpl(sparkSession: SparkSession) extends 
Catalog {
*/
   override def tableExists(tableName: String): Boolean = {
 try {
-  val tableIdent = 
sparkSession.sessionState.sqlParser.parseTableIdentifier(tableName)
-  tableExists(tableIdent.database.orNull, tableIdent.table)
+  getTable(tableName)
+  true
 } catch {
-  case e: org.apache.spark.sql.catalyst.parser.ParseException =>
-val ident = 
sparkSession.sessionState.sqlParser.parseMultipartIdentifier(tableName)
-val catalog =
-  
sparkSession.sessionState.catalogManager.catalog(ident(0)).asTableCatalog
-catalog.tableExists(Identifier.of(Array(ident(1)), ident(2)))
+  case e: AnalysisException => false
 }
   }
 
diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/internal/CatalogSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/internal/CatalogSuite.scala
index f3133026836..0de48325d98 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/internal/CatalogSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/internal/CatalogSuite.scala
@@ -122,6 +122,7 @@ class CatalogSuite extends SharedSparkSession with 
AnalysisTest with BeforeAndAf
   override def afterEach(): Unit = {
 try {
   sessionCatalog.reset()
+  spark.sessionState.catalogManager.reset()
 } finally {
   super.afterEach()
 }
@@ -769,6 +770,17 @@ class CatalogSuite extends SharedSparkSession with 
AnalysisTest with BeforeAndAf
 assert(spark.catalog.tableExists(Array(catalogName, dbName, 
tableName).mkString(".")))
   }
 
+  test("SPARK-39810: Catalog.tableExists should handle nested namespace") {
+val tableSchema = new StructType().add("i", "int")
+val catalogName = "testcat"
+val dbName = "my_db2.my_db3"
+val tableName = "my_table2"
+assert(!spark.catalog.tableExists(Array(catalogName, dbName, 
tableName).mkString(".")))
+createTable(tableName, dbName, catalogName, 
classOf[FakeV2Provider].getName, tableSchema,
+  Map.empty[String, String], "")
+assert(spark.catalog.tableExists(Array(catalogName, dbName, 
tableName).mkString(".")))
+  }
+
   test("three layer namespace compatibility - database exists") {
 val catalogName = "testcat"
 val dbName = 

[spark] branch master updated: [SPARK-39148][SQL] DS V2 aggregate push down can work with OFFSET or LIMIT

2022-07-19 Thread wenchen
This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new aec79534abf [SPARK-39148][SQL] DS V2 aggregate push down can work with 
OFFSET or LIMIT
aec79534abf is described below

commit aec79534abf819e7981babc73d13450ea8e49b08
Author: Wenchen Fan 
AuthorDate: Wed Jul 20 11:13:08 2022 +0800

[SPARK-39148][SQL] DS V2 aggregate push down can work with OFFSET or LIMIT

### What changes were proposed in this pull request?

This PR refactors the v2 agg pushdown code. The main change is that we no longer build 
the `Scan` immediately when pushing the aggregate. We did so before because we wanted to 
know the data schema with the aggregate pushed, so that we could add casts when rewriting 
the query plan after pushdown. The problem is that building the `Scan` that early means 
we can't push down any more operators, while it's common to see LIMIT/OFFSET after an 
aggregate.

The idea of the refactor is that we don't need to know the data schema with the aggregate 
pushed. We just state an expectation (the data type should be the same as that of the 
Spark agg functions), use it to define the output of `ScanBuilderHolder`, and then 
rewrite the query plan. Later on, when we build the `Scan` and replace `ScanBuilderHolder` 
with `DataSourceV2ScanRelation`, we check the actual data schema and add a `Project` to 
do the type cast if necessary.
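
A rough sketch of the query shape this enables (the connection options and table/column names are assumptions for illustration, not part of this PR):

```scala
import org.apache.spark.sql.functions.sum

// Assumed H2 in-memory database exposing an "employee" table; pushDownAggregate and
// pushDownLimit are the standard DS v2 JDBC push-down switches.
val df = spark.read.format("jdbc")
  .option("url", "jdbc:h2:mem:testdb;DB_CLOSE_DELAY=-1")
  .option("dbtable", "employee")
  .option("pushDownAggregate", "true")
  .option("pushDownLimit", "true")
  .load()

// With this refactor the Scan is only built after all push-down rules have run, so both
// the aggregate and the trailing LIMIT can end up in the pushed scan.
df.groupBy("dept").agg(sum("salary")).limit(1).explain()
```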

### Why are the changes needed?

Support pushing down LIMIT/OFFSET after an aggregate.

### Does this PR introduce _any_ user-facing change?

no

### How was this patch tested?

updated tests

Closes #37195 from cloud-fan/agg.

Lead-authored-by: Wenchen Fan 
Co-authored-by: Wenchen Fan 
Signed-off-by: Wenchen Fan 
---
 .../datasources/v2/V2ScanRelationPushDown.scala| 419 +++--
 .../org/apache/spark/sql/jdbc/JDBCV2Suite.scala|  38 +-
 2 files changed, 254 insertions(+), 203 deletions(-)

diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V2ScanRelationPushDown.scala
 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V2ScanRelationPushDown.scala
index 8951c37e127..f1e0e6d80c5 100644
--- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V2ScanRelationPushDown.scala
+++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V2ScanRelationPushDown.scala
@@ -26,12 +26,12 @@ import org.apache.spark.sql.catalyst.planning.ScanOperation
 import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, Filter, 
LeafNode, Limit, LimitAndOffset, LocalLimit, LogicalPlan, Offset, 
OffsetAndLimit, Project, Sample, Sort}
 import org.apache.spark.sql.catalyst.rules.Rule
 import org.apache.spark.sql.connector.expressions.{SortOrder => V2SortOrder}
-import org.apache.spark.sql.connector.expressions.aggregate.{Aggregation, Avg, 
Count, GeneralAggregateFunc, Sum, UserDefinedAggregateFunc}
+import org.apache.spark.sql.connector.expressions.aggregate.{Aggregation, Avg, 
Count, CountStar, Max, Min, Sum}
 import org.apache.spark.sql.connector.expressions.filter.Predicate
 import org.apache.spark.sql.connector.read.{Scan, ScanBuilder, 
SupportsPushDownAggregates, SupportsPushDownFilters, V1Scan}
 import org.apache.spark.sql.execution.datasources.DataSourceStrategy
 import org.apache.spark.sql.sources
-import org.apache.spark.sql.types.{DataType, IntegerType, LongType, StructType}
+import org.apache.spark.sql.types.{DataType, DecimalType, IntegerType, 
StructType}
 import org.apache.spark.sql.util.SchemaUtils._
 
 object V2ScanRelationPushDown extends Rule[LogicalPlan] with PredicateHelper 
with AliasHelper {
@@ -44,6 +44,7 @@ object V2ScanRelationPushDown extends Rule[LogicalPlan] with 
PredicateHelper wit
   pushDownFilters,
   pushDownAggregates,
   pushDownLimitAndOffset,
+  buildScanWithPushedAggregate,
   pruneColumns)
 
 pushdownRules.foldLeft(plan) { (newPlan, pushDownRule) =>
@@ -92,189 +93,201 @@ object V2ScanRelationPushDown extends Rule[LogicalPlan] 
with PredicateHelper wit
 
   def pushDownAggregates(plan: LogicalPlan): LogicalPlan = plan.transform {
 // update the scan builder with agg pushdown and return a new plan with 
agg pushed
-case aggNode @ Aggregate(groupingExpressions, resultExpressions, child) =>
-  child match {
-case ScanOperation(project, filters, sHolder: ScanBuilderHolder)
-  if filters.isEmpty && CollapseProject.canCollapseExpressions(
-resultExpressions, project, alwaysInline = true) =>
-  sHolder.builder match {
-case r: SupportsPushDownAggregates =>
-  val aliasMap = getAliasMap(project)
-  val actualResultExprs = 
resultExpressions.map(replaceAliasButKeepName(_, aliasMap))
-  val actualGroupExprs = 

[spark] branch master updated: [SPARK-39818][SQL] Fix bug in ARRAY, STRUCT, MAP types with DEFAULT values with NULL field(s)

2022-07-19 Thread gengliang
This is an automated email from the ASF dual-hosted git repository.

gengliang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new 46a8f1f0df8 [SPARK-39818][SQL] Fix bug in ARRAY, STRUCT, MAP types 
with DEFAULT values with NULL field(s)
46a8f1f0df8 is described below

commit 46a8f1f0df83deeda6309f8377988a66ba059c10
Author: Daniel Tenedorio 
AuthorDate: Tue Jul 19 18:53:23 2022 -0700

[SPARK-39818][SQL] Fix bug in ARRAY, STRUCT, MAP types with DEFAULT values 
with NULL field(s)

### What changes were proposed in this pull request?

Fix bug in SQL string generation for literal values of ARRAY, STRUCT, MAP 
types with DEFAULT values with NULL field(s). Specifically, prevent Scala 
`MatchError`s from getting raised when attempting to call the `sql` method of 
the `Literal` expression for such values when one of their fields is `null`.
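
The failure mode is easiest to see on the `sql` method directly; the calls below mirror the new tests added in this PR:

```scala
import org.apache.spark.sql.catalyst.expressions.Literal
import org.apache.spark.sql.types._

// Before the fix, the null field made the `case (value: Any, ...)` patterns in
// Literal.sql throw a MatchError, since a typed pattern never matches null.
Literal.default(StructType(Seq(StructField("col", NullType)))).sql  // "NAMED_STRUCT(col, NULL)"
Literal.create(Map(42L -> null), MapType(LongType, NullType)).sql   // "MAP(42L, NULL)"
```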

### Why are the changes needed?

This fixes a bug by preventing exceptions from being inappropriately raised.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

This PR adds new unit test coverage.

Closes #37229 from dtenedor/fix-bug-sql.

Authored-by: Daniel Tenedorio 
Signed-off-by: Gengliang Wang 
---
 .../apache/spark/sql/catalyst/expressions/literals.scala  | 14 +++---
 .../catalyst/analysis/ExpressionTypeCheckingSuite.scala   | 15 +++
 2 files changed, 22 insertions(+), 7 deletions(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala
index 03678773ccc..a8c877a29de 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala
@@ -494,20 +494,20 @@ case class Literal (value: Any, dataType: DataType) 
extends LeafExpression {
 case (row: GenericInternalRow, structType: StructType) =>
   val structNames: Array[String] = structType.fields.map(_.name)
   val structValues: Array[String] =
-row.values.zip(structType.fields.map(_.dataType)).map {
-  case (value: Any, fieldType: DataType) =>
-Literal(value, fieldType).sql
+row.values.zip(structType.fields.map(_.dataType)).map { kv =>
+  Literal(kv._1, kv._2).sql
 }
   val structFields: Array[String] =
-structNames.zip(structValues).map { kv => s"${kv._1}, ${kv._2}" }
+structNames.zip(structValues).map {
+  kv => s"${kv._1}, ${kv._2}"
+}
   s"NAMED_STRUCT(${structFields.mkString(", ")})"
 case (data: ArrayBasedMapData, mapType: MapType) =>
   val keyData = data.keyArray.asInstanceOf[GenericArrayData]
   val valueData = data.valueArray.asInstanceOf[GenericArrayData]
   val keysAndValues: Array[String] =
-keyData.array.zip(valueData.array).map {
-  case (key: Any, value: Any) =>
-s"${Literal(key, mapType.keyType).sql}, ${Literal(value, 
mapType.valueType).sql}"
+keyData.array.zip(valueData.array).map { kv =>
+  s"${Literal(kv._1, mapType.keyType).sql}, ${Literal(kv._2, 
mapType.valueType).sql}"
 }
   s"MAP(${keysAndValues.mkString(", ")})"
 case _ => value.toString
diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ExpressionTypeCheckingSuite.scala
 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ExpressionTypeCheckingSuite.scala
index 9225a7dcbe4..0969089bd92 100644
--- 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ExpressionTypeCheckingSuite.scala
+++ 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/ExpressionTypeCheckingSuite.scala
@@ -240,4 +240,19 @@ class ExpressionTypeCheckingSuite extends SparkFunSuite 
with SQLHelper {
 "does not support ordering")
 }
   }
+
+  test("check types for SQL string generation") {
+assert(Literal.create(Array(1, 2, 3), ArrayType(IntegerType)).sql ==
+  "ARRAY(1, 2, 3)")
+assert(Literal.create(Array(1, 2, null), ArrayType(IntegerType)).sql ==
+  "ARRAY(1, 2, CAST(NULL AS INT))")
+assert(Literal.default(StructType(Seq(StructField("col", 
StringType.sql ==
+  "NAMED_STRUCT(col, '')")
+assert(Literal.default(StructType(Seq(StructField("col", NullType.sql 
==
+  "NAMED_STRUCT(col, NULL)")
+assert(Literal.create(Map(42L -> true), MapType(LongType, 
BooleanType)).sql ==
+  "MAP(42L, true)")
+assert(Literal.create(Map(42L -> null), MapType(LongType, NullType)).sql ==
+  "MAP(42L, NULL)")
+  }
 }



[spark] branch master updated (70ec696bce7 -> d21624513fa)

2022-07-19 Thread dongjoon
This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


from 70ec696bce7 [SPARK-39798][SQL] Replcace `toSeq.toArray` with 
`.toArray[Any]` in constructor of `GenericArrayData`
 add d21624513fa [SPARK-39700][SQL][DOCS] Update two-parameter 
`listColumns/getTable/getFunction/tableExists/functionExists` functions docs to 
mention limitation

No new revisions were added by this update.

Summary of changes:
 .../main/scala/org/apache/spark/sql/catalog/Catalog.scala | 15 +++
 1 file changed, 15 insertions(+)





[GitHub] [spark-website] dongjoon-hyun commented on pull request #400: [SPARK-39512] Document docker image release steps

2022-07-19 Thread GitBox


dongjoon-hyun commented on PR #400:
URL: https://github.com/apache/spark-website/pull/400#issuecomment-1189305437

Thank you, @holdenk and all. According to this guideline, Apache Spark 3.2.2 images are published as well.

https://user-images.githubusercontent.com/9700541/179801611-3e2ab69f-820a-4ea1-8800-e9253a26709e.png
   
   





[spark] branch master updated: [SPARK-39798][SQL] Replcace `toSeq.toArray` with `.toArray[Any]` in constructor of `GenericArrayData`

2022-07-19 Thread srowen
This is an automated email from the ASF dual-hosted git repository.

srowen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new 70ec696bce7 [SPARK-39798][SQL] Replcace `toSeq.toArray` with 
`.toArray[Any]` in constructor of `GenericArrayData`
70ec696bce7 is described below

commit 70ec696bce7012b25ed6d8acec5e2f3b3e127f11
Author: yangjie01 
AuthorDate: Tue Jul 19 08:41:50 2022 -0500

[SPARK-39798][SQL] Replcace `toSeq.toArray` with `.toArray[Any]` in 
constructor of `GenericArrayData`

### What changes were proposed in this pull request?
There are many `Array.toSeq.toArray` calls in the constructors; this PR simplifies them 
to `Array.toArray[Any]`.
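
A minimal standalone sketch of the shape of the change (not the actual `GenericArrayData` source):

```scala
// Both turn a primitive array into an Array[Any], but the first goes through an
// intermediate Seq (an extra copy on Scala 2.13), while the second boxes directly.
def viaSeq(primitiveArray: Array[Int]): Array[Any] = primitiveArray.toSeq.toArray
def direct(primitiveArray: Array[Int]): Array[Any] = primitiveArray.toArray[Any]
```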

### Why are the changes needed?

- For Scala 2.12, this is just a code simplification: `toSeq` returns `thisCollection` and can be omitted directly
- For Scala 2.13, removing `toSeq` saves an unnecessary memory copy

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?

- Pass GitHub Actions
- Manual test:

```
mvn clean install -DskipTests -pl sql/core -am -Pscala-2.13
mvn clean test -pl sql/catalyst -Pscala-2.13
mvn clean test -pl sql/core -Pscala-2.13 
-Dtest.exclude.tags=org.apache.spark.tags.ExtendedLevelDBTest
```

```
Run completed in 8 minutes, 59 seconds.
Total number of tests run: 6594
Suites: completed 286, aborted 0
Tests: succeeded 6594, failed 0, canceled 0, ignored 5, pending 0
All tests passed.

Run completed in 1 hour, 51 minutes, 29 seconds.
Total number of tests run: 11895
Suites: completed 526, aborted 0
Tests: succeeded 11895, failed 0, canceled 11, ignored 33, pending 0
All tests passed.
```

- Run `GenericArrayDataBenchmark` using GA with Scala 2.13:

**Before**

```
OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure
Intel(R) Xeon(R) Platinum 8272CL CPU  2.60GHz
constructor:                      Best Time(ms)   Avg Time(ms)   Stdev(ms)   Rate(M/s)   Per Row(ns)   Relative
----------------------------------------------------------------------------------------------------------------
arrayOfAny                                    3               3           0      2992.7           0.3       1.0X
arrayOfAnyAsObject                          215             215           0        46.6          21.5       0.0X
arrayOfAnyAsSeq                             232             235           2        43.1          23.2       0.0X
arrayOfInt                                  514             515           1        19.5          51.4       0.0X
arrayOfIntAsObject                          724             725           1        13.8          72.4       0.0X
```
**After**

```
OpenJDK 64-Bit Server VM 1.8.0_332-b09 on Linux 5.13.0-1031-azure
Intel(R) Xeon(R) Platinum 8272CL CPU  2.60GHz
constructor:                      Best Time(ms)   Avg Time(ms)   Stdev(ms)   Rate(M/s)   Per Row(ns)   Relative
----------------------------------------------------------------------------------------------------------------
arrayOfAny                                    3               3           0      2992.8           0.3       1.0X
arrayOfAnyAsObject                          215             215           0        46.6          21.5       0.0X
arrayOfAnyAsSeq                             233             237           2        42.8          23.3       0.0X
arrayOfInt                                  416             416           1        24.1          41.6       0.0X
arrayOfIntAsObject                          737             737           0        13.6          73.7       0.0X
```

The `arrayOfInt` case seems to improve when using Scala 2.13.

Closes #37208 from LuciferYang/GenericArrayData-toArray.

Authored-by: yangjie01 
Signed-off-by: Sean Owen 
---
 .../spark/sql/catalyst/util/GenericArrayData.scala | 18 +-
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala
index e46d730afb4..e566e659db2 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala
@@ -28,23 +28,23 @@ class GenericArrayData(val array: Array[Any]) extends 
ArrayData {
   // Specified this as`scala.collection.Seq` because seqOrArray can be
   // `mutable.ArraySeq` in Scala 2.13
   def this(seq: 

[spark] branch master updated: [SPARK-37287][SQL] Pull out dynamic partition and bucket sort from FileFormatWriter

2022-07-19 Thread wenchen
This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new 256227477b0 [SPARK-37287][SQL] Pull out dynamic partition and bucket 
sort from FileFormatWriter
256227477b0 is described below

commit 256227477b027ea8160c4c266c3e8bc7d302a7e2
Author: allisonwang-db 
AuthorDate: Tue Jul 19 21:31:37 2022 +0800

[SPARK-37287][SQL] Pull out dynamic partition and bucket sort from 
FileFormatWriter

### What changes were proposed in this pull request?
`FileFormatWriter.write` is used by all V1 write commands, including data source and Hive 
tables. Depending on the dynamic partition, bucket, and sort columns in the V1 write 
command, `FileFormatWriter` can add a physical sort on top of the query plan, which is 
not directly visible from the plan.

This PR (based on https://github.com/apache/spark/pull/34568) intends to 
pull out the physical sort added by `FileFormatWriter` into logical planning. 
It adds a new logical rule `V1Writes` to add logical Sort operators based on 
the required ordering of a V1 write command. This behavior can be controlled by 
the new config **spark.sql.optimizer.plannedWrite.enabled** (default: true).
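
A small sketch of what the new config governs (the output path and columns are made up for illustration):

```scala
// Enabled by default; set explicitly here for clarity. With it on, the Sort required for
// the dynamic partition column shows up as a logical operator in the write command's
// query plan instead of being inserted physically by FileFormatWriter.
spark.conf.set("spark.sql.optimizer.plannedWrite.enabled", "true")

spark.range(100)
  .selectExpr("id", "id % 10 AS part")
  .write
  .partitionBy("part")
  .mode("overwrite")
  .parquet("/tmp/v1_writes_demo")
```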

### Why are the changes needed?

Improve observability of V1 write, and unify the logic of V1 and V2 write 
commands.

### Does this PR introduce _any_ user-facing change?

No

### How was this patch tested?

New unit tests.

Closes #37099 from allisonwang-db/spark-37287-v1-writes.

Authored-by: allisonwang-db 
Signed-off-by: Wenchen Fan 
---
 .../org/apache/spark/sql/internal/SQLConf.scala|  10 +
 .../spark/sql/execution/SparkOptimizer.scala   |   4 +-
 .../adaptive/InsertAdaptiveSparkPlan.scala |   4 +-
 .../execution/command/createDataSourceTables.scala |  15 +-
 .../sql/execution/datasources/DataSource.scala |  39 ++--
 .../execution/datasources/FileFormatWriter.scala   |  51 ++---
 .../InsertIntoHadoopFsRelationCommand.scala|   7 +-
 .../spark/sql/execution/datasources/V1Writes.scala | 137 +
 .../adaptive/AdaptiveQueryExecSuite.scala  |  37 ++--
 .../datasources/V1WriteCommandSuite.scala  | 217 +
 .../execution/CreateHiveTableAsSelectCommand.scala |  20 +-
 .../sql/hive/execution/InsertIntoHiveTable.scala   |  69 ++-
 .../spark/sql/hive/execution/SaveAsHiveFile.scala  |   8 +-
 .../sql/hive/execution/V1WritesHiveUtils.scala | 108 ++
 .../command/V1WriteHiveCommandSuite.scala  | 103 ++
 15 files changed, 703 insertions(+), 126 deletions(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index 1b7857ead59..631c89d798f 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -412,6 +412,14 @@ object SQLConf {
   .longConf
   .createWithDefault(67108864L)
 
+  val PLANNED_WRITE_ENABLED = 
buildConf("spark.sql.optimizer.plannedWrite.enabled")
+.internal()
+.doc("When set to true, Spark optimizer will add logical sort operators to 
V1 write commands " +
+  "if needed so that `FileFormatWriter` does not need to insert physical 
sorts.")
+.version("3.4.0")
+.booleanConf
+.createWithDefault(true)
+
   val COMPRESS_CACHED = 
buildConf("spark.sql.inMemoryColumnarStorage.compressed")
 .doc("When set to true Spark SQL will automatically select a compression 
codec for each " +
   "column based on statistics of the data.")
@@ -4617,6 +4625,8 @@ class SQLConf extends Serializable with Logging {
 
   def maxConcurrentOutputFileWriters: Int = 
getConf(SQLConf.MAX_CONCURRENT_OUTPUT_FILE_WRITERS)
 
+  def plannedWriteEnabled: Boolean = getConf(SQLConf.PLANNED_WRITE_ENABLED)
+
   def inferDictAsStruct: Boolean = getConf(SQLConf.INFER_NESTED_DICT_AS_STRUCT)
 
   def legacyInferArrayTypeFromFirstElement: Boolean = getConf(
diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala
index bffa1d1dae7..72bdab409a9 100644
--- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala
+++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala
@@ -23,8 +23,7 @@ import org.apache.spark.sql.catalyst.optimizer._
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.spark.sql.catalyst.rules.Rule
 import org.apache.spark.sql.connector.catalog.CatalogManager
-import org.apache.spark.sql.execution.datasources.PruneFileSourcePartitions
-import org.apache.spark.sql.execution.datasources.SchemaPruning
+import 

[spark] branch master updated (46071a9caa2 -> 2ef82ade3cc)

2022-07-19 Thread gurwls223
This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


from 46071a9caa2 [SPARK-39792][SQL] Add DecimalDivideWithOverflowCheck for 
decimal average
 add 2ef82ade3cc [MINOR][PYTHON][DOC] Setting toggle-prompt button in docs 
to user-select:none

No new revisions were added by this update.

Summary of changes:
 python/docs/source/_static/copybutton.js | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)





[spark] branch master updated: [SPARK-39792][SQL] Add DecimalDivideWithOverflowCheck for decimal average

2022-07-19 Thread gengliang
This is an automated email from the ASF dual-hosted git repository.

gengliang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new 46071a9caa2 [SPARK-39792][SQL] Add DecimalDivideWithOverflowCheck for 
decimal average
46071a9caa2 is described below

commit 46071a9caa26b991bdd5bb0a3505a0ba76d16d0e
Author: ulysses-you 
AuthorDate: Tue Jul 19 02:02:52 2022 -0700

[SPARK-39792][SQL] Add DecimalDivideWithOverflowCheck for decimal average

### What changes were proposed in this pull request?

Add a new expression `DecimalDivideWithOverflowCheck` to replace the 
previous CheckOverflowInSum + Divide + Cast.

### Why are the changes needed?

If the result data type is decimal, Average will first calculate the result using the 
default precision and scale of divide, then cast to the result data type. We should 
calculate at the result data type directly so that we can avoid the precision loss. It 
also saves one unnecessary cast.

And for the overflow check, we should check whether the result of the division overflows, 
instead of the dividend.
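
A toy illustration of the code path in question (the data is arbitrary; the point is only that the average of a decimal column is itself evaluated as a decimal):

```scala
import org.apache.spark.sql.functions.avg

// avg over a decimal column: with this change the division is evaluated directly at the
// decimal result type (with the overflow check applied to the division result) instead
// of dividing at a default precision/scale and casting afterwards.
val df = spark.range(1, 4).selectExpr("CAST(id AS DECIMAL(38, 18)) AS d")
df.agg(avg("d")).printSchema()
```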

### Does this PR introduce _any_ user-facing change?

Yes, a small bug fix

### How was this patch tested?

Add a test and fix existing tests

Closes #37207 from ulysses-you/average-decimal.

Authored-by: ulysses-you 
Signed-off-by: Gengliang Wang 
---
 .../catalyst/expressions/aggregate/Average.scala   | 11 ++-
 .../catalyst/expressions/decimalExpressions.scala  | 78 +-
 .../apache/spark/sql/DataFrameAggregateSuite.scala |  4 ++
 .../org/apache/spark/sql/jdbc/JDBCV2Suite.scala|  2 +-
 .../sql/hive/execution/AggregationQuerySuite.scala |  2 +-
 5 files changed, 91 insertions(+), 6 deletions(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Average.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Average.scala
index 864ec7055f3..e64f76bdb0a 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Average.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Average.scala
@@ -88,9 +88,14 @@ abstract class AverageBase
   // We can't directly use `/` as it throws an exception under ansi mode.
   protected def getEvaluateExpression(queryContext: String) = child.dataType 
match {
 case _: DecimalType =>
-  Divide(
-CheckOverflowInSum(sum, sumDataType.asInstanceOf[DecimalType], 
!useAnsiAdd, queryContext),
-count.cast(DecimalType.LongDecimal), failOnError = 
false).cast(resultType)
+  If(EqualTo(count, Literal(0L)),
+Literal(null, resultType),
+DecimalDivideWithOverflowCheck(
+  sum,
+  count.cast(DecimalType.LongDecimal),
+  resultType.asInstanceOf[DecimalType],
+  queryContext,
+  !useAnsiAdd))
 case _: YearMonthIntervalType =>
   If(EqualTo(count, Literal(0L)),
 Literal(null, YearMonthIntervalType()), DivideYMInterval(sum, count))
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/decimalExpressions.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/decimalExpressions.scala
index 7335763c253..2dd60a9d9ca 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/decimalExpressions.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/decimalExpressions.scala
@@ -18,7 +18,7 @@
 package org.apache.spark.sql.catalyst.expressions
 
 import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, 
ExprCode}
+import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, 
CodeGenerator, ExprCode}
 import org.apache.spark.sql.catalyst.expressions.codegen.Block._
 import org.apache.spark.sql.catalyst.util.TypeUtils
 import org.apache.spark.sql.errors.QueryExecutionErrors
@@ -247,3 +247,79 @@ case class DecimalAddNoOverflowCheck(
   newLeft: Expression, newRight: Expression): DecimalAddNoOverflowCheck =
 copy(left = newLeft, right = newRight)
 }
+
+/**
+ * A divide expression for decimal values which is only used internally by Avg.
+ *
+ * It will fail when nullOnOverflow is false follows:
+ *   - left (sum in avg) is null due to over the max precision 38,
+ * the right (count in avg) should never be null
+ *   - the result of divide is overflow
+ */
+case class DecimalDivideWithOverflowCheck(
+left: Expression,
+right: Expression,
+override val dataType: DecimalType,
+avgQueryContext: String,
+nullOnOverflow: Boolean)
+  extends BinaryExpression with ExpectsInputTypes with SupportQueryContext {
+  override def nullable: Boolean = nullOnOverflow
+  override def inputTypes: 

[spark] branch branch-3.2 updated: [MINOR][PYTHON][DOCS] Fix broken Example section in col/column functions

2022-07-19 Thread gurwls223
This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch branch-3.2
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.2 by this push:
 new 777401984ad [MINOR][PYTHON][DOCS] Fix broken Example section in 
col/column functions
777401984ad is described below

commit 777401984adf3ffce2055cb583cf025e083f8047
Author: Hyukjin Kwon 
AuthorDate: Tue Jul 19 17:52:32 2022 +0900

[MINOR][PYTHON][DOCS] Fix broken Example section in col/column functions

This PR fixes a bug in the documentation. Trailing `'` breaks Example 
section in Python reference documentation. This PR removes it.

To render the documentation as intended in NumPy documentation style.

Yes, the documentation is updated.

**Before**

https://user-images.githubusercontent.com/6477701/179661216-715dec96-bff2-474f-ab48-41577bf4c15c.png

**After**

https://user-images.githubusercontent.com/6477701/179661245-72d15184-aeed-43c2-b9c9-5f3cab1ae28d.png

Manually built the documentation and tested.

Closes #37223 from HyukjinKwon/minor-doc-fx.

Authored-by: Hyukjin Kwon 
Signed-off-by: Hyukjin Kwon 
(cherry picked from commit 2bdb5bfa48d1fc44358c49f7e379c2afc4a1a32f)
Signed-off-by: Hyukjin Kwon 
---
 python/pyspark/sql/functions.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index 2d254dc9e54..1aeafd85ade 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -103,7 +103,8 @@ def lit(col):
 @since(1.3)
 def col(col):
 """
-Returns a :class:`~pyspark.sql.Column` based on the given column name.'
+Returns a :class:`~pyspark.sql.Column` based on the given column name.
+
 Examples
 
 >>> col('x')





[spark] branch branch-3.3 updated: [MINOR][PYTHON][DOCS] Fix broken Example section in col/column functions

2022-07-19 Thread gurwls223
This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch branch-3.3
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.3 by this push:
 new dcaa6e0eb6d [MINOR][PYTHON][DOCS] Fix broken Example section in 
col/column functions
dcaa6e0eb6d is described below

commit dcaa6e0eb6d5b4c90df64b5396ec0d31e7c9f99a
Author: Hyukjin Kwon 
AuthorDate: Tue Jul 19 17:52:32 2022 +0900

[MINOR][PYTHON][DOCS] Fix broken Example section in col/column functions

This PR fixes a bug in the documentation. Trailing `'` breaks Example 
section in Python reference documentation. This PR removes it.

To render the documentation as intended in NumPy documentation style.

Yes, the documentation is updated.

**Before**

https://user-images.githubusercontent.com/6477701/179661216-715dec96-bff2-474f-ab48-41577bf4c15c.png

**After**

https://user-images.githubusercontent.com/6477701/179661245-72d15184-aeed-43c2-b9c9-5f3cab1ae28d.png

Manually built the documentation and tested.

Closes #37223 from HyukjinKwon/minor-doc-fx.

Authored-by: Hyukjin Kwon 
Signed-off-by: Hyukjin Kwon 
(cherry picked from commit 2bdb5bfa48d1fc44358c49f7e379c2afc4a1a32f)
Signed-off-by: Hyukjin Kwon 
---
 python/pyspark/sql/functions.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index 019f64b5171..ed3b0789b47 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -140,7 +140,8 @@ def lit(col: Any) -> Column:
 @since(1.3)
 def col(col: str) -> Column:
 """
-Returns a :class:`~pyspark.sql.Column` based on the given column name.'
+Returns a :class:`~pyspark.sql.Column` based on the given column name.
+
 Examples
 
 >>> col('x')





[spark] branch master updated: [MINOR][PYTHON][DOCS] Fix broken Example section in col/column functions

2022-07-19 Thread gurwls223
This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new 2bdb5bfa48d [MINOR][PYTHON][DOCS] Fix broken Example section in 
col/column functions
2bdb5bfa48d is described below

commit 2bdb5bfa48d1fc44358c49f7e379c2afc4a1a32f
Author: Hyukjin Kwon 
AuthorDate: Tue Jul 19 17:52:32 2022 +0900

[MINOR][PYTHON][DOCS] Fix broken Example section in col/column functions

### What changes were proposed in this pull request?

This PR fixes a bug in the documentation. Trailing `'` breaks Example 
section in Python reference documentation. This PR removes it.

### Why are the changes needed?

To render the documentation as intended in NumPy documentation style.

### Does this PR introduce _any_ user-facing change?

Yes, the documentation is updated.

**Before**

https://user-images.githubusercontent.com/6477701/179661216-715dec96-bff2-474f-ab48-41577bf4c15c.png

**After**

https://user-images.githubusercontent.com/6477701/179661245-72d15184-aeed-43c2-b9c9-5f3cab1ae28d.png

### How was this patch tested?

Manually built the documentation and tested.

Closes #37223 from HyukjinKwon/minor-doc-fx.

Authored-by: Hyukjin Kwon 
Signed-off-by: Hyukjin Kwon 
---
 python/pyspark/sql/functions.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index db99dbfc400..2997be08872 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -140,7 +140,7 @@ def lit(col: Any) -> Column:
 @since(1.3)
 def col(col: str) -> Column:
 """
-Returns a :class:`~pyspark.sql.Column` based on the given column name.'
+Returns a :class:`~pyspark.sql.Column` based on the given column name.
 
 Examples
 





[spark] branch master updated: [SPARK-39759][SQL] Implement listIndexes in JDBC (H2 dialect)

2022-07-19 Thread huaxingao
This is an automated email from the ASF dual-hosted git repository.

huaxingao pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
 new 6278becfbed [SPARK-39759][SQL] Implement listIndexes in JDBC (H2 
dialect)
6278becfbed is described below

commit 6278becfbed412bad3d00f2b7989fd19a3a0ff07
Author: panbingkun 
AuthorDate: Mon Jul 18 23:34:28 2022 -0700

[SPARK-39759][SQL] Implement listIndexes in JDBC (H2 dialect)

### What changes were proposed in this pull request?
Implementing listIndexes in DS V2 JDBC for H2 dialect.

### Why are the changes needed?
This is a subtask of the V2 Index 
support(https://issues.apache.org/jira/browse/SPARK-36525).
**It can better test the index interface locally.**
> This PR implements listIndexes in H2 dialect.

### Does this PR introduce _any_ user-facing change?
Yes, listIndexes in DS V2 JDBC for H2

### How was this patch tested?
Updated an existing UT.

Closes #37172 from panbingkun/h2dialect_listindex_dev.

Authored-by: panbingkun 
Signed-off-by: huaxingao 
---
 .../sql/execution/datasources/jdbc/JdbcUtils.scala |  4 +-
 .../execution/datasources/v2/jdbc/JDBCTable.scala  |  2 +-
 .../org/apache/spark/sql/jdbc/H2Dialect.scala  | 66 +-
 .../org/apache/spark/sql/jdbc/JdbcDialects.scala   |  2 +-
 .../org/apache/spark/sql/jdbc/MySQLDialect.scala   |  4 +-
 .../org/apache/spark/sql/jdbc/JDBCV2Suite.scala|  9 +++
 6 files changed, 78 insertions(+), 9 deletions(-)

diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala
 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala
index 4401ee4564e..60ecd2ff60b 100644
--- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala
+++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala
@@ -1072,10 +1072,10 @@ object JdbcUtils extends Logging with SQLConfHelper {
*/
   def listIndexes(
   conn: Connection,
-  tableName: String,
+  tableIdent: Identifier,
   options: JDBCOptions): Array[TableIndex] = {
 val dialect = JdbcDialects.get(options.url)
-dialect.listIndexes(conn, tableName, options)
+dialect.listIndexes(conn, tableIdent, options)
   }
 
   private def executeStatement(conn: Connection, options: JDBCOptions, sql: 
String): Unit = {
diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/jdbc/JDBCTable.scala
 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/jdbc/JDBCTable.scala
index be8e1c68b7c..0a184116a0f 100644
--- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/jdbc/JDBCTable.scala
+++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/jdbc/JDBCTable.scala
@@ -83,7 +83,7 @@ case class JDBCTable(ident: Identifier, schema: StructType, 
jdbcOptions: JDBCOpt
 
   override def listIndexes(): Array[TableIndex] = {
 JdbcUtils.withConnection(jdbcOptions) { conn =>
-  JdbcUtils.listIndexes(conn, name, jdbcOptions)
+  JdbcUtils.listIndexes(conn, ident, jdbcOptions)
 }
   }
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/H2Dialect.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/H2Dialect.scala
index d41929225a8..4200ba91fb1 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/H2Dialect.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/H2Dialect.scala
@@ -25,12 +25,14 @@ import java.util.concurrent.ConcurrentHashMap
 import scala.collection.JavaConverters._
 import scala.util.control.NonFatal
 
+import org.apache.commons.lang3.StringUtils
+
 import org.apache.spark.sql.AnalysisException
 import org.apache.spark.sql.catalyst.analysis.{IndexAlreadyExistsException, 
NoSuchIndexException, NoSuchNamespaceException, NoSuchTableException, 
TableAlreadyExistsException}
 import org.apache.spark.sql.connector.catalog.Identifier
 import org.apache.spark.sql.connector.catalog.functions.UnboundFunction
-import org.apache.spark.sql.connector.expressions.Expression
-import org.apache.spark.sql.connector.expressions.NamedReference
+import org.apache.spark.sql.connector.catalog.index.TableIndex
+import org.apache.spark.sql.connector.expressions.{Expression, FieldReference, 
NamedReference}
 import org.apache.spark.sql.execution.datasources.jdbc.{JDBCOptions, JdbcUtils}
 import org.apache.spark.sql.types.{BooleanType, ByteType, DataType, 
DecimalType, ShortType, StringType}
 
@@ -110,6 +112,64 @@ private[sql] object H2Dialect extends JdbcDialect {
 JdbcUtils.checkIfIndexExists(conn, sql, options)
   }
 
+  // See
+  // 
https://www.h2database.com/html/systemtables.html?#information_schema_indexes
+  // 
https://www.h2database.com/html/systemtables.html?#information_schema_index_columns