This is an automated email from the ASF dual-hosted git repository.

huaxingao pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 86b8757c2c4 [SPARK-34960][SQL][DOCS][FOLLOWUP] Improve doc for DSv2 aggregate push down
86b8757c2c4 is described below

commit 86b8757c2c4bab6a0f7a700cf2c690cdd7f31eba
Author: Cheng Su <chen...@fb.com>
AuthorDate: Fri Apr 22 10:13:40 2022 -0700

    [SPARK-34960][SQL][DOCS][FOLLOWUP] Improve doc for DSv2 aggregate push down
    
    ### What changes were proposed in this pull request?
    
    This is a follow-up, per a comment in https://issues.apache.org/jira/browse/SPARK-34960,
    to improve the documentation for data source v2 aggregate push down for Parquet and ORC.
    
    * Unify the SQL config docs between Parquet and ORC, and add a note that if
      statistics are missing from any file footer, an exception will be thrown.
    * Add the same exception note to the Parquet and ORC methods that aggregate
      from footer statistics.
    
    Although a future Spark release may improve this behavior to fall back to
    aggregating from the file's actual data when statistics are missing, we should
    clearly document the current behavior now.
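
    As an illustration, here is a minimal sketch of how the documented configs are
    used (the SparkSession setup, path, view, and column names below are
    hypothetical; only the config keys and the supported aggregates/types come from
    the docs changed here):

    ```scala
    import org.apache.spark.sql.SparkSession

    // Both configs default to false; enabling them lets Spark answer MIN/MAX/COUNT
    // over supported types (boolean, integer, float and date for MIN/MAX; any type
    // for COUNT) from Parquet/ORC file footer statistics instead of scanning rows.
    val spark = SparkSession.builder()
      .appName("agg-pushdown-example")
      .master("local[*]") // local master only so this sketch runs standalone
      .config("spark.sql.parquet.aggregatePushdown", "true")
      .config("spark.sql.orc.aggregatePushdown", "true")
      .getOrCreate()

    // Hypothetical Parquet data registered as a temp view.
    spark.read.parquet("/tmp/example_data").createOrReplaceTempView("t")

    // If every file footer carries statistics, this aggregate can be served from
    // the footers; if statistics are missing from any footer, the query throws an
    // exception rather than falling back to the data (the behavior documented here).
    spark.sql("SELECT MIN(id), MAX(id), COUNT(*) FROM t").show()
    ```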
    
    ### Why are the changes needed?
    
    Gives users and developers a better idea of when aggregate push down will throw
    an exception, and documents the current behavior more clearly.
    
    ### Does this PR introduce _any_ user-facing change?
    
    Yes, a documentation change to the SQL config descriptions.
    
    ### How was this patch tested?
    
    Existing tests, as this is a documentation-only change.
    
    Closes #36311 from c21/agg-doc.
    
    Authored-by: Cheng Su <chen...@fb.com>
    Signed-off-by: huaxingao <huaxin_...@apple.com>
---
 .../src/main/scala/org/apache/spark/sql/internal/SQLConf.scala | 10 ++++++----
 .../apache/spark/sql/execution/datasources/orc/OrcUtils.scala  |  2 ++
 .../spark/sql/execution/datasources/parquet/ParquetUtils.scala |  4 +++-
 3 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index 50d09d046bc..6d3f283fa73 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -974,9 +974,10 @@ object SQLConf {
       .createWithDefault(10)
 
   val PARQUET_AGGREGATE_PUSHDOWN_ENABLED = buildConf("spark.sql.parquet.aggregatePushdown")
-    .doc("If true, MAX/MIN/COUNT without filter and group by will be pushed" +
-      " down to Parquet for optimization. MAX/MIN/COUNT for complex types and timestamp" +
-      " can't be pushed down")
+    .doc("If true, aggregates will be pushed down to Parquet for optimization. Support MIN, MAX " +
+      "and COUNT as aggregate expression. For MIN/MAX, support boolean, integer, float and date " +
+      "type. For COUNT, support all data types. If statistics is missing from any Parquet file " +
+      "footer, exception would be thrown.")
     .version("3.3.0")
     .booleanConf
     .createWithDefault(false)
@@ -1110,7 +1111,8 @@ object SQLConf {
   val ORC_AGGREGATE_PUSHDOWN_ENABLED = buildConf("spark.sql.orc.aggregatePushdown")
     .doc("If true, aggregates will be pushed down to ORC for optimization. Support MIN, MAX and " +
       "COUNT as aggregate expression. For MIN/MAX, support boolean, integer, float and date " +
-      "type. For COUNT, support all data types.")
+      "type. For COUNT, support all data types. If statistics is missing from any ORC file " +
+      "footer, exception would be thrown.")
     .version("3.3.0")
     .booleanConf
     .createWithDefault(false)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala
index 79abdfe4690..f07573beae6 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcUtils.scala
@@ -407,6 +407,8 @@ object OrcUtils extends Logging {
    * (Max/Min/Count) result using the statistics information from ORC file footer, and then
    * construct an InternalRow from these aggregate results.
    *
+   * NOTE: if statistics is missing from ORC file footer, exception would be thrown.
+   *
    * @return Aggregate results in the format of InternalRow
    */
   def createAggInternalRowFromFooter(
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetUtils.scala
index 9f2e6580ecb..5a291e6a2e5 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetUtils.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetUtils.scala
@@ -216,7 +216,9 @@ object ParquetUtils {
    * When the partial aggregates (Max/Min/Count) are pushed down to Parquet, we don't need to
    * createRowBaseReader to read data from Parquet and aggregate at Spark layer. Instead we want
    * to get the partial aggregates (Max/Min/Count) result using the statistics information
-   * from Parquet footer file, and then construct an InternalRow from these aggregate results.
+   * from Parquet file footer, and then construct an InternalRow from these aggregate results.
+   *
+   * NOTE: if statistics is missing from Parquet file footer, exception would be thrown.
    *
    * @return Aggregate results in the format of InternalRow
    */


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org
