This is an automated email from the ASF dual-hosted git repository. agrove pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/datafusion-comet.git
The following commit(s) were added to refs/heads/main by this push: new d33f903e1 fix: support read Struct by user schema (#1860) d33f903e1 is described below commit d33f903e11206f06d1e59420fc55c758bd01613a Author: Oleks V <comph...@users.noreply.github.com> AuthorDate: Wed Jun 11 17:36:00 2025 -0700 fix: support read Struct by user schema (#1860) --- native/Cargo.lock | 631 ++++++--------------- native/core/src/execution/planner.rs | 140 +++++ native/core/src/parquet/parquet_support.rs | 19 +- .../org/apache/comet/CometExpressionSuite.scala | 30 +- .../apache/comet/exec/CometNativeReaderSuite.scala | 47 ++ .../scala/org/apache/spark/sql/CometTestBase.scala | 5 +- 6 files changed, 405 insertions(+), 467 deletions(-) diff --git a/native/Cargo.lock b/native/Cargo.lock index 6fea0b84e..71662f97f 100644 --- a/native/Cargo.lock +++ b/native/Cargo.lock @@ -1265,7 +1265,7 @@ dependencies = [ [[package]] name = "datafusion" version = "48.0.0" -source = "git+https://github.com/apache/datafusion?rev=48.0.0-rc2#85f6621a6b1680b40d483a56b10ff3495861ece3" +source = "git+https://github.com/apache/datafusion?rev=48.0.0-rc3#33a32d4382bee7e3c705d0f55d05c24a115a2f98" dependencies = [ "arrow", "arrow-ipc", @@ -1273,29 +1273,29 @@ dependencies = [ "async-trait", "bytes", "chrono", - "datafusion-catalog 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", + "datafusion-catalog", "datafusion-catalog-listing", - "datafusion-common 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-common-runtime 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-datasource 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-datasource", "datafusion-datasource-csv", "datafusion-datasource-json", "datafusion-datasource-parquet", - "datafusion-execution 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-expr 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-expr-common 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-functions 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", + "datafusion-execution", + "datafusion-expr", + "datafusion-expr-common", + "datafusion-functions", "datafusion-functions-aggregate", "datafusion-functions-nested", "datafusion-functions-table", "datafusion-functions-window", "datafusion-optimizer", - "datafusion-physical-expr 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-physical-expr-common 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", + "datafusion-physical-expr", + "datafusion-physical-expr-common", "datafusion-physical-optimizer", - "datafusion-physical-plan 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-session 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-sql 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", + "datafusion-physical-plan", + "datafusion-session", + "datafusion-sql", "futures", "itertools 0.14.0", "log", @@ -1311,31 +1311,6 @@ dependencies = [ "uuid", ] -[[package]] -name = "datafusion-catalog" -version = "48.0.0" -source = "git+https://github.com/apache/datafusion?rev=48.0.0-rc2#85f6621a6b1680b40d483a56b10ff3495861ece3" -dependencies = [ - "arrow", - "async-trait", - "dashmap", - "datafusion-common 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-common-runtime 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-datasource 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-execution 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-expr 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-physical-expr 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-physical-plan 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-session 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-sql 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "futures", - "itertools 0.14.0", - "log", - "object_store", - "parking_lot", - "tokio", -] - [[package]] name = "datafusion-catalog" version = "48.0.0" @@ -1344,15 +1319,15 @@ dependencies = [ "arrow", "async-trait", "dashmap", - "datafusion-common 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc3)", - "datafusion-common-runtime 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc3)", - "datafusion-datasource 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc3)", - "datafusion-execution 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc3)", - "datafusion-expr 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc3)", - "datafusion-physical-expr 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc3)", - "datafusion-physical-plan 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc3)", - "datafusion-session 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc3)", - "datafusion-sql 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc3)", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr", + "datafusion-physical-plan", + "datafusion-session", + "datafusion-sql", "futures", "itertools 0.14.0", "log", @@ -1364,19 +1339,19 @@ dependencies = [ [[package]] name = "datafusion-catalog-listing" version = "48.0.0" -source = "git+https://github.com/apache/datafusion?rev=48.0.0-rc2#85f6621a6b1680b40d483a56b10ff3495861ece3" +source = "git+https://github.com/apache/datafusion?rev=48.0.0-rc3#33a32d4382bee7e3c705d0f55d05c24a115a2f98" dependencies = [ "arrow", "async-trait", - "datafusion-catalog 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-common 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-datasource 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-execution 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-expr 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-physical-expr 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-physical-expr-common 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-physical-plan 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-session 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", + "datafusion-catalog", + "datafusion-common", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "datafusion-session", "futures", "log", "object_store", @@ -1472,28 +1447,6 @@ dependencies = [ "twox-hash 2.1.0", ] -[[package]] -name = "datafusion-common" -version = "48.0.0" -source = "git+https://github.com/apache/datafusion?rev=48.0.0-rc2#85f6621a6b1680b40d483a56b10ff3495861ece3" -dependencies = [ - "ahash", - "arrow", - "arrow-ipc", - "base64", - "half", - "hashbrown 0.14.5", - "indexmap", - "libc", - "log", - "object_store", - "parquet", - "paste", - "sqlparser", - "tokio", - "web-time", -] - [[package]] name = "datafusion-common" version = "48.0.0" @@ -1509,22 +1462,13 @@ dependencies = [ "libc", "log", "object_store", + "parquet", "paste", "sqlparser", "tokio", "web-time", ] -[[package]] -name = "datafusion-common-runtime" -version = "48.0.0" -source = "git+https://github.com/apache/datafusion?rev=48.0.0-rc2#85f6621a6b1680b40d483a56b10ff3495861ece3" -dependencies = [ - "futures", - "log", - "tokio", -] - [[package]] name = "datafusion-common-runtime" version = "48.0.0" @@ -1538,20 +1482,20 @@ dependencies = [ [[package]] name = "datafusion-datasource" version = "48.0.0" -source = "git+https://github.com/apache/datafusion?rev=48.0.0-rc2#85f6621a6b1680b40d483a56b10ff3495861ece3" +source = "git+https://github.com/apache/datafusion?rev=48.0.0-rc3#33a32d4382bee7e3c705d0f55d05c24a115a2f98" dependencies = [ "arrow", "async-trait", "bytes", "chrono", - "datafusion-common 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-common-runtime 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-execution 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-expr 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-physical-expr 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-physical-expr-common 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-physical-plan 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-session 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "datafusion-session", "futures", "glob", "itertools 0.14.0", @@ -1564,51 +1508,24 @@ dependencies = [ "url", ] -[[package]] -name = "datafusion-datasource" -version = "48.0.0" -source = "git+https://github.com/apache/datafusion?rev=48.0.0-rc3#33a32d4382bee7e3c705d0f55d05c24a115a2f98" -dependencies = [ - "arrow", - "async-trait", - "bytes", - "chrono", - "datafusion-common 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc3)", - "datafusion-common-runtime 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc3)", - "datafusion-execution 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc3)", - "datafusion-expr 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc3)", - "datafusion-physical-expr 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc3)", - "datafusion-physical-expr-common 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc3)", - "datafusion-physical-plan 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc3)", - "datafusion-session 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc3)", - "futures", - "glob", - "itertools 0.14.0", - "log", - "object_store", - "rand 0.9.1", - "tokio", - "url", -] - [[package]] name = "datafusion-datasource-csv" version = "48.0.0" -source = "git+https://github.com/apache/datafusion?rev=48.0.0-rc2#85f6621a6b1680b40d483a56b10ff3495861ece3" +source = "git+https://github.com/apache/datafusion?rev=48.0.0-rc3#33a32d4382bee7e3c705d0f55d05c24a115a2f98" dependencies = [ "arrow", "async-trait", "bytes", - "datafusion-catalog 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-common 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-common-runtime 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-datasource 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-execution 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-expr 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-physical-expr 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-physical-expr-common 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-physical-plan 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-session 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", + "datafusion-catalog", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "datafusion-session", "futures", "object_store", "regex", @@ -1618,21 +1535,21 @@ dependencies = [ [[package]] name = "datafusion-datasource-json" version = "48.0.0" -source = "git+https://github.com/apache/datafusion?rev=48.0.0-rc2#85f6621a6b1680b40d483a56b10ff3495861ece3" +source = "git+https://github.com/apache/datafusion?rev=48.0.0-rc3#33a32d4382bee7e3c705d0f55d05c24a115a2f98" dependencies = [ "arrow", "async-trait", "bytes", - "datafusion-catalog 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-common 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-common-runtime 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-datasource 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-execution 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-expr 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-physical-expr 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-physical-expr-common 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-physical-plan 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-session 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", + "datafusion-catalog", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "datafusion-session", "futures", "object_store", "serde_json", @@ -1642,23 +1559,23 @@ dependencies = [ [[package]] name = "datafusion-datasource-parquet" version = "48.0.0" -source = "git+https://github.com/apache/datafusion?rev=48.0.0-rc2#85f6621a6b1680b40d483a56b10ff3495861ece3" +source = "git+https://github.com/apache/datafusion?rev=48.0.0-rc3#33a32d4382bee7e3c705d0f55d05c24a115a2f98" dependencies = [ "arrow", "async-trait", "bytes", - "datafusion-catalog 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-common 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-common-runtime 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-datasource 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-execution 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-expr 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", + "datafusion-catalog", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", "datafusion-functions-aggregate", - "datafusion-physical-expr 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-physical-expr-common 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", + "datafusion-physical-expr", + "datafusion-physical-expr-common", "datafusion-physical-optimizer", - "datafusion-physical-plan 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-session 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", + "datafusion-physical-plan", + "datafusion-session", "futures", "itertools 0.14.0", "log", @@ -1669,34 +1586,11 @@ dependencies = [ "tokio", ] -[[package]] -name = "datafusion-doc" -version = "48.0.0" -source = "git+https://github.com/apache/datafusion?rev=48.0.0-rc2#85f6621a6b1680b40d483a56b10ff3495861ece3" - [[package]] name = "datafusion-doc" version = "48.0.0" source = "git+https://github.com/apache/datafusion?rev=48.0.0-rc3#33a32d4382bee7e3c705d0f55d05c24a115a2f98" -[[package]] -name = "datafusion-execution" -version = "48.0.0" -source = "git+https://github.com/apache/datafusion?rev=48.0.0-rc2#85f6621a6b1680b40d483a56b10ff3495861ece3" -dependencies = [ - "arrow", - "dashmap", - "datafusion-common 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-expr 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "futures", - "log", - "object_store", - "parking_lot", - "rand 0.9.1", - "tempfile", - "url", -] - [[package]] name = "datafusion-execution" version = "48.0.0" @@ -1704,8 +1598,8 @@ source = "git+https://github.com/apache/datafusion?rev=48.0.0-rc3#33a32d4382bee7 dependencies = [ "arrow", "dashmap", - "datafusion-common 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc3)", - "datafusion-expr 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc3)", + "datafusion-common", + "datafusion-expr", "futures", "log", "object_store", @@ -1715,25 +1609,6 @@ dependencies = [ "url", ] -[[package]] -name = "datafusion-expr" -version = "48.0.0" -source = "git+https://github.com/apache/datafusion?rev=48.0.0-rc2#85f6621a6b1680b40d483a56b10ff3495861ece3" -dependencies = [ - "arrow", - "chrono", - "datafusion-common 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-doc 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-expr-common 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-functions-aggregate-common 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-functions-window-common 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-physical-expr-common 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "indexmap", - "paste", - "serde_json", - "sqlparser", -] - [[package]] name = "datafusion-expr" version = "48.0.0" @@ -1741,37 +1616,25 @@ source = "git+https://github.com/apache/datafusion?rev=48.0.0-rc3#33a32d4382bee7 dependencies = [ "arrow", "chrono", - "datafusion-common 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc3)", - "datafusion-doc 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc3)", - "datafusion-expr-common 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc3)", - "datafusion-functions-aggregate-common 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc3)", - "datafusion-functions-window-common 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc3)", - "datafusion-physical-expr-common 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc3)", + "datafusion-common", + "datafusion-doc", + "datafusion-expr-common", + "datafusion-functions-aggregate-common", + "datafusion-functions-window-common", + "datafusion-physical-expr-common", "indexmap", "paste", "serde_json", "sqlparser", ] -[[package]] -name = "datafusion-expr-common" -version = "48.0.0" -source = "git+https://github.com/apache/datafusion?rev=48.0.0-rc2#85f6621a6b1680b40d483a56b10ff3495861ece3" -dependencies = [ - "arrow", - "datafusion-common 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "indexmap", - "itertools 0.14.0", - "paste", -] - [[package]] name = "datafusion-expr-common" version = "48.0.0" source = "git+https://github.com/apache/datafusion?rev=48.0.0-rc3#33a32d4382bee7e3c705d0f55d05c24a115a2f98" dependencies = [ "arrow", - "datafusion-common 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc3)", + "datafusion-common", "indexmap", "itertools 0.14.0", "paste", @@ -1780,7 +1643,7 @@ dependencies = [ [[package]] name = "datafusion-functions" version = "48.0.0" -source = "git+https://github.com/apache/datafusion?rev=48.0.0-rc2#85f6621a6b1680b40d483a56b10ff3495861ece3" +source = "git+https://github.com/apache/datafusion?rev=48.0.0-rc3#33a32d4382bee7e3c705d0f55d05c24a115a2f98" dependencies = [ "arrow", "arrow-buffer", @@ -1788,12 +1651,12 @@ dependencies = [ "blake2", "blake3", "chrono", - "datafusion-common 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-doc 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-execution 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-expr 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-expr-common 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-macros 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", + "datafusion-common", + "datafusion-doc", + "datafusion-execution", + "datafusion-expr", + "datafusion-expr-common", + "datafusion-macros", "hex", "itertools 0.14.0", "log", @@ -1805,62 +1668,26 @@ dependencies = [ "uuid", ] -[[package]] -name = "datafusion-functions" -version = "48.0.0" -source = "git+https://github.com/apache/datafusion?rev=48.0.0-rc3#33a32d4382bee7e3c705d0f55d05c24a115a2f98" -dependencies = [ - "arrow", - "arrow-buffer", - "base64", - "chrono", - "datafusion-common 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc3)", - "datafusion-doc 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc3)", - "datafusion-execution 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc3)", - "datafusion-expr 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc3)", - "datafusion-expr-common 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc3)", - "datafusion-macros 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc3)", - "hex", - "itertools 0.14.0", - "log", - "rand 0.9.1", - "regex", - "unicode-segmentation", - "uuid", -] - [[package]] name = "datafusion-functions-aggregate" version = "48.0.0" -source = "git+https://github.com/apache/datafusion?rev=48.0.0-rc2#85f6621a6b1680b40d483a56b10ff3495861ece3" +source = "git+https://github.com/apache/datafusion?rev=48.0.0-rc3#33a32d4382bee7e3c705d0f55d05c24a115a2f98" dependencies = [ "ahash", "arrow", - "datafusion-common 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-doc 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-execution 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-expr 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-functions-aggregate-common 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-macros 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-physical-expr 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-physical-expr-common 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", + "datafusion-common", + "datafusion-doc", + "datafusion-execution", + "datafusion-expr", + "datafusion-functions-aggregate-common", + "datafusion-macros", + "datafusion-physical-expr", + "datafusion-physical-expr-common", "half", "log", "paste", ] -[[package]] -name = "datafusion-functions-aggregate-common" -version = "48.0.0" -source = "git+https://github.com/apache/datafusion?rev=48.0.0-rc2#85f6621a6b1680b40d483a56b10ff3495861ece3" -dependencies = [ - "ahash", - "arrow", - "datafusion-common 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-expr-common 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-physical-expr-common 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", -] - [[package]] name = "datafusion-functions-aggregate-common" version = "48.0.0" @@ -1868,26 +1695,26 @@ source = "git+https://github.com/apache/datafusion?rev=48.0.0-rc3#33a32d4382bee7 dependencies = [ "ahash", "arrow", - "datafusion-common 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc3)", - "datafusion-expr-common 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc3)", - "datafusion-physical-expr-common 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc3)", + "datafusion-common", + "datafusion-expr-common", + "datafusion-physical-expr-common", ] [[package]] name = "datafusion-functions-nested" version = "48.0.0" -source = "git+https://github.com/apache/datafusion?rev=48.0.0-rc2#85f6621a6b1680b40d483a56b10ff3495861ece3" +source = "git+https://github.com/apache/datafusion?rev=48.0.0-rc3#33a32d4382bee7e3c705d0f55d05c24a115a2f98" dependencies = [ "arrow", "arrow-ord", - "datafusion-common 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-doc 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-execution 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-expr 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-functions 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", + "datafusion-common", + "datafusion-doc", + "datafusion-execution", + "datafusion-expr", + "datafusion-functions", "datafusion-functions-aggregate", - "datafusion-macros 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-physical-expr-common 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", + "datafusion-macros", + "datafusion-physical-expr-common", "itertools 0.14.0", "log", "paste", @@ -1896,14 +1723,14 @@ dependencies = [ [[package]] name = "datafusion-functions-table" version = "48.0.0" -source = "git+https://github.com/apache/datafusion?rev=48.0.0-rc2#85f6621a6b1680b40d483a56b10ff3495861ece3" +source = "git+https://github.com/apache/datafusion?rev=48.0.0-rc3#33a32d4382bee7e3c705d0f55d05c24a115a2f98" dependencies = [ "arrow", "async-trait", - "datafusion-catalog 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-common 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-expr 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-physical-plan 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", + "datafusion-catalog", + "datafusion-common", + "datafusion-expr", + "datafusion-physical-plan", "parking_lot", "paste", ] @@ -1911,46 +1738,27 @@ dependencies = [ [[package]] name = "datafusion-functions-window" version = "48.0.0" -source = "git+https://github.com/apache/datafusion?rev=48.0.0-rc2#85f6621a6b1680b40d483a56b10ff3495861ece3" +source = "git+https://github.com/apache/datafusion?rev=48.0.0-rc3#33a32d4382bee7e3c705d0f55d05c24a115a2f98" dependencies = [ "arrow", - "datafusion-common 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-doc 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-expr 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-functions-window-common 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-macros 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-physical-expr 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-physical-expr-common 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", + "datafusion-common", + "datafusion-doc", + "datafusion-expr", + "datafusion-functions-window-common", + "datafusion-macros", + "datafusion-physical-expr", + "datafusion-physical-expr-common", "log", "paste", ] -[[package]] -name = "datafusion-functions-window-common" -version = "48.0.0" -source = "git+https://github.com/apache/datafusion?rev=48.0.0-rc2#85f6621a6b1680b40d483a56b10ff3495861ece3" -dependencies = [ - "datafusion-common 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-physical-expr-common 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", -] - [[package]] name = "datafusion-functions-window-common" version = "48.0.0" source = "git+https://github.com/apache/datafusion?rev=48.0.0-rc3#33a32d4382bee7e3c705d0f55d05c24a115a2f98" dependencies = [ - "datafusion-common 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc3)", - "datafusion-physical-expr-common 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc3)", -] - -[[package]] -name = "datafusion-macros" -version = "48.0.0" -source = "git+https://github.com/apache/datafusion?rev=48.0.0-rc2#85f6621a6b1680b40d483a56b10ff3495861ece3" -dependencies = [ - "datafusion-expr 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "quote", - "syn 2.0.101", + "datafusion-common", + "datafusion-physical-expr-common", ] [[package]] @@ -1958,7 +1766,7 @@ name = "datafusion-macros" version = "48.0.0" source = "git+https://github.com/apache/datafusion?rev=48.0.0-rc3#33a32d4382bee7e3c705d0f55d05c24a115a2f98" dependencies = [ - "datafusion-expr 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc3)", + "datafusion-expr", "quote", "syn 2.0.101", ] @@ -1966,13 +1774,13 @@ dependencies = [ [[package]] name = "datafusion-optimizer" version = "48.0.0" -source = "git+https://github.com/apache/datafusion?rev=48.0.0-rc2#85f6621a6b1680b40d483a56b10ff3495861ece3" +source = "git+https://github.com/apache/datafusion?rev=48.0.0-rc3#33a32d4382bee7e3c705d0f55d05c24a115a2f98" dependencies = [ "arrow", "chrono", - "datafusion-common 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-expr 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-physical-expr 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", + "datafusion-common", + "datafusion-expr", + "datafusion-physical-expr", "indexmap", "itertools 0.14.0", "log", @@ -1980,27 +1788,6 @@ dependencies = [ "regex-syntax", ] -[[package]] -name = "datafusion-physical-expr" -version = "48.0.0" -source = "git+https://github.com/apache/datafusion?rev=48.0.0-rc2#85f6621a6b1680b40d483a56b10ff3495861ece3" -dependencies = [ - "ahash", - "arrow", - "datafusion-common 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-expr 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-expr-common 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-functions-aggregate-common 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-physical-expr-common 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "half", - "hashbrown 0.14.5", - "indexmap", - "itertools 0.14.0", - "log", - "paste", - "petgraph 0.8.1", -] - [[package]] name = "datafusion-physical-expr" version = "48.0.0" @@ -2008,11 +1795,11 @@ source = "git+https://github.com/apache/datafusion?rev=48.0.0-rc3#33a32d4382bee7 dependencies = [ "ahash", "arrow", - "datafusion-common 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc3)", - "datafusion-expr 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc3)", - "datafusion-expr-common 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc3)", - "datafusion-functions-aggregate-common 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc3)", - "datafusion-physical-expr-common 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc3)", + "datafusion-common", + "datafusion-expr", + "datafusion-expr-common", + "datafusion-functions-aggregate-common", + "datafusion-physical-expr-common", "half", "hashbrown 0.14.5", "indexmap", @@ -2022,19 +1809,6 @@ dependencies = [ "petgraph 0.8.1", ] -[[package]] -name = "datafusion-physical-expr-common" -version = "48.0.0" -source = "git+https://github.com/apache/datafusion?rev=48.0.0-rc2#85f6621a6b1680b40d483a56b10ff3495861ece3" -dependencies = [ - "ahash", - "arrow", - "datafusion-common 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-expr-common 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "hashbrown 0.14.5", - "itertools 0.14.0", -] - [[package]] name = "datafusion-physical-expr-common" version = "48.0.0" @@ -2042,8 +1816,8 @@ source = "git+https://github.com/apache/datafusion?rev=48.0.0-rc3#33a32d4382bee7 dependencies = [ "ahash", "arrow", - "datafusion-common 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc3)", - "datafusion-expr-common 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc3)", + "datafusion-common", + "datafusion-expr-common", "hashbrown 0.14.5", "itertools 0.14.0", ] @@ -2051,47 +1825,18 @@ dependencies = [ [[package]] name = "datafusion-physical-optimizer" version = "48.0.0" -source = "git+https://github.com/apache/datafusion?rev=48.0.0-rc2#85f6621a6b1680b40d483a56b10ff3495861ece3" -dependencies = [ - "arrow", - "datafusion-common 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-execution 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-expr 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-expr-common 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-physical-expr 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-physical-expr-common 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-physical-plan 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "itertools 0.14.0", - "log", -] - -[[package]] -name = "datafusion-physical-plan" -version = "48.0.0" -source = "git+https://github.com/apache/datafusion?rev=48.0.0-rc2#85f6621a6b1680b40d483a56b10ff3495861ece3" +source = "git+https://github.com/apache/datafusion?rev=48.0.0-rc3#33a32d4382bee7e3c705d0f55d05c24a115a2f98" dependencies = [ - "ahash", "arrow", - "arrow-ord", - "arrow-schema", - "async-trait", - "chrono", - "datafusion-common 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-common-runtime 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-execution 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-expr 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-functions-window-common 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-physical-expr 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-physical-expr-common 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "futures", - "half", - "hashbrown 0.14.5", - "indexmap", + "datafusion-common", + "datafusion-execution", + "datafusion-expr", + "datafusion-expr-common", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", "itertools 0.14.0", "log", - "parking_lot", - "pin-project-lite", - "tokio", ] [[package]] @@ -2105,13 +1850,13 @@ dependencies = [ "arrow-schema", "async-trait", "chrono", - "datafusion-common 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc3)", - "datafusion-common-runtime 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc3)", - "datafusion-execution 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc3)", - "datafusion-expr 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc3)", - "datafusion-functions-window-common 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc3)", - "datafusion-physical-expr 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc3)", - "datafusion-physical-expr-common 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc3)", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-execution", + "datafusion-expr", + "datafusion-functions-window-common", + "datafusion-physical-expr", + "datafusion-physical-expr-common", "futures", "half", "hashbrown 0.14.5", @@ -2123,29 +1868,6 @@ dependencies = [ "tokio", ] -[[package]] -name = "datafusion-session" -version = "48.0.0" -source = "git+https://github.com/apache/datafusion?rev=48.0.0-rc2#85f6621a6b1680b40d483a56b10ff3495861ece3" -dependencies = [ - "arrow", - "async-trait", - "dashmap", - "datafusion-common 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-common-runtime 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-execution 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-expr 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-physical-expr 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-physical-plan 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-sql 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "futures", - "itertools 0.14.0", - "log", - "object_store", - "parking_lot", - "tokio", -] - [[package]] name = "datafusion-session" version = "48.0.0" @@ -2154,13 +1876,13 @@ dependencies = [ "arrow", "async-trait", "dashmap", - "datafusion-common 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc3)", - "datafusion-common-runtime 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc3)", - "datafusion-execution 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc3)", - "datafusion-expr 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc3)", - "datafusion-physical-expr 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc3)", - "datafusion-physical-plan 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc3)", - "datafusion-sql 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc3)", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr", + "datafusion-physical-plan", + "datafusion-sql", "futures", "itertools 0.14.0", "log", @@ -2175,28 +1897,13 @@ version = "48.0.0" source = "git+https://github.com/apache/datafusion?rev=48.0.0-rc3#33a32d4382bee7e3c705d0f55d05c24a115a2f98" dependencies = [ "arrow", - "datafusion-catalog 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc3)", - "datafusion-common 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc3)", - "datafusion-execution 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc3)", - "datafusion-expr 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc3)", - "datafusion-functions 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc3)", - "datafusion-macros 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc3)", - "log", -] - -[[package]] -name = "datafusion-sql" -version = "48.0.0" -source = "git+https://github.com/apache/datafusion?rev=48.0.0-rc2#85f6621a6b1680b40d483a56b10ff3495861ece3" -dependencies = [ - "arrow", - "bigdecimal", - "datafusion-common 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "datafusion-expr 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc2)", - "indexmap", + "datafusion-catalog", + "datafusion-common", + "datafusion-execution", + "datafusion-expr", + "datafusion-functions", + "datafusion-macros", "log", - "regex", - "sqlparser", ] [[package]] @@ -2206,8 +1913,8 @@ source = "git+https://github.com/apache/datafusion?rev=48.0.0-rc3#33a32d4382bee7 dependencies = [ "arrow", "bigdecimal", - "datafusion-common 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc3)", - "datafusion-expr 48.0.0 (git+https://github.com/apache/datafusion?rev=48.0.0-rc3)", + "datafusion-common", + "datafusion-expr", "indexmap", "log", "regex", diff --git a/native/core/src/execution/planner.rs b/native/core/src/execution/planner.rs index 9994b4371..0eb3d5cc2 100644 --- a/native/core/src/execution/planner.rs +++ b/native/core/src/execution/planner.rs @@ -3329,4 +3329,144 @@ mod tests { Ok(()) } + + #[tokio::test] + async fn test_nested_types_extract_missing_struct_names() -> Result<(), DataFusionError> { + let session_ctx = SessionContext::new(); + + // generate test data in the temp folder + let test_data = "select named_struct('a', 1, 'b', 'abc') c0"; + let tmp_dir = TempDir::new()?; + let test_path = tmp_dir.path().to_str().unwrap().to_string(); + + let plan = session_ctx + .sql(test_data) + .await? + .create_physical_plan() + .await?; + + // Write a parquet file into temp folder + session_ctx + .write_parquet(plan, test_path.clone(), None) + .await?; + + // Register all parquet with temp data as file groups + let mut file_groups: Vec<FileGroup> = vec![]; + for entry in std::fs::read_dir(&test_path)? { + let entry = entry?; + let path = entry.path(); + + if path.extension().and_then(|ext| ext.to_str()) == Some("parquet") { + if let Some(path_str) = path.to_str() { + file_groups.push(FileGroup::new(vec![PartitionedFile::from_path( + path_str.into(), + )?])); + } + } + } + + let source = ParquetSource::default().with_schema_adapter_factory(Arc::new( + SparkSchemaAdapterFactory::new( + SparkParquetOptions::new(EvalMode::Ansi, "", false), + None, + ), + ))?; + + let object_store_url = ObjectStoreUrl::local_filesystem(); + + // Define schema Comet reads with + let required_schema = Schema::new(Fields::from(vec![Field::new( + "c0", + DataType::Struct(Fields::from(vec![ + Field::new("c", DataType::Int64, true), + Field::new("d", DataType::Utf8, true), + ])), + true, + )])); + + let file_scan_config = FileScanConfigBuilder::new( + object_store_url.clone(), + required_schema.into(), + Arc::clone(&source), + ) + .with_file_groups(file_groups.clone()) + .build(); + + // Run native read + let scan = Arc::new(DataSourceExec::new(Arc::new(file_scan_config.clone()))); + let stream = scan.execute(0, session_ctx.task_ctx())?; + let result: Vec<_> = stream.collect().await; + + let actual = result.first().unwrap().as_ref().unwrap(); + + let expected = ["+----+", "| c0 |", "+----+", "| |", "+----+"]; + assert_batches_eq!(expected, &[actual.clone()]); + + // Define schema Comet reads with + let required_schema = Schema::new(Fields::from(vec![Field::new( + "c0", + DataType::Struct(Fields::from(vec![Field::new("a", DataType::Int64, true)])), + true, + )])); + + let file_scan_config = FileScanConfigBuilder::new( + object_store_url.clone(), + required_schema.into(), + Arc::clone(&source), + ) + .with_file_groups(file_groups.clone()) + .build(); + + // Run native read + let scan = Arc::new(DataSourceExec::new(Arc::new(file_scan_config.clone()))); + let stream = scan.execute(0, session_ctx.task_ctx())?; + let result: Vec<_> = stream.collect().await; + + let actual = result.first().unwrap().as_ref().unwrap(); + + let expected = [ + "+--------+", + "| c0 |", + "+--------+", + "| {a: 1} |", + "+--------+", + ]; + assert_batches_eq!(expected, &[actual.clone()]); + + // Define schema Comet reads with + let required_schema = Schema::new(Fields::from(vec![Field::new( + "c0", + DataType::Struct(Fields::from(vec![ + Field::new("a", DataType::Int64, true), + Field::new("x", DataType::Int64, true), + ])), + true, + )])); + + let file_scan_config = FileScanConfigBuilder::new( + object_store_url.clone(), + required_schema.into(), + Arc::clone(&source), + ) + .with_file_groups(file_groups.clone()) + .build(); + + // Run native read + let scan = Arc::new(DataSourceExec::new(Arc::new(file_scan_config.clone()))); + let stream = scan.execute(0, session_ctx.task_ctx())?; + let result: Vec<_> = stream.collect().await; + + let actual = result.first().unwrap().as_ref().unwrap(); + + let expected = [ + "+-------------+", + "| c0 |", + "+-------------+", + "| {a: 1, x: } |", + "+-------------+", + ]; + assert_batches_eq!(expected, &[actual.clone()]); + + Ok(()) + } } diff --git a/native/core/src/parquet/parquet_support.rs b/native/core/src/parquet/parquet_support.rs index c77d60135..3430592c0 100644 --- a/native/core/src/parquet/parquet_support.rs +++ b/native/core/src/parquet/parquet_support.rs @@ -17,6 +17,7 @@ use crate::execution::operators::ExecutionError; use arrow::array::{ListArray, MapArray}; +use arrow::buffer::NullBuffer; use arrow::compute::can_cast_types; use arrow::datatypes::{FieldRef, Fields}; use arrow::{ @@ -211,6 +212,8 @@ fn cast_struct_to_struct( ) -> DataFusionResult<ArrayRef> { match (from_type, to_type) { (DataType::Struct(from_fields), DataType::Struct(to_fields)) => { + // if dest and target schemas has any column in common + let mut field_overlap = false; // TODO some of this logic may be specific to converting Parquet to Spark let mut field_name_to_index_map = HashMap::new(); for (i, field) in from_fields.iter().enumerate() { @@ -239,14 +242,24 @@ fn cast_struct_to_struct( parquet_options, )?; cast_fields.push(cast_field); + field_overlap = true; } else { cast_fields.push(new_null_array(to_fields[i].data_type(), array.len())); } } + + // If target schema doesn't contain any of the existing fields + // mark such a column in array as NULL + let nulls = if field_overlap { + array.nulls().cloned() + } else { + Some(NullBuffer::new_null(array.len())) + }; + Ok(Arc::new(StructArray::new( to_fields.clone(), cast_fields, - array.nulls().cloned(), + nulls, ))) } _ => unreachable!(), @@ -271,12 +284,12 @@ fn cast_map_values( ))?; let key_array = cast_array( - Arc::<dyn Array>::clone(from.keys()), + Arc::clone(from.keys()), key_field.data_type(), parquet_options, )?; let value_array = cast_array( - Arc::<dyn Array>::clone(from.values()), + Arc::clone(from.values()), value_field.data_type(), parquet_options, )?; diff --git a/spark/src/test/scala/org/apache/comet/CometExpressionSuite.scala b/spark/src/test/scala/org/apache/comet/CometExpressionSuite.scala index 60c289097..5a0c51eef 100644 --- a/spark/src/test/scala/org/apache/comet/CometExpressionSuite.scala +++ b/spark/src/test/scala/org/apache/comet/CometExpressionSuite.scala @@ -36,7 +36,7 @@ import org.apache.spark.sql.expressions.Window import org.apache.spark.sql.functions._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.internal.SQLConf.SESSION_LOCAL_TIMEZONE -import org.apache.spark.sql.types.{Decimal, DecimalType} +import org.apache.spark.sql.types.{Decimal, DecimalType, IntegerType, StringType, StructType} import org.apache.comet.CometSparkSessionExtensions.isSpark40Plus import org.apache.comet.testing.{DataGenOptions, ParquetGenerator} @@ -2728,4 +2728,32 @@ class CometExpressionSuite extends CometTestBase with AdaptiveSparkPlanHelper { assert(cometWindowExecs.isEmpty) } + test("vectorized reader: missing all struct fields") { + Seq(true, false).foreach { offheapEnabled => + withSQLConf( + SQLConf.USE_V1_SOURCE_LIST.key -> "parquet", + CometConf.COMET_EXEC_ENABLED.key -> "true", + CometConf.COMET_ENABLED.key -> "true", + CometConf.COMET_EXPLAIN_FALLBACK_ENABLED.key -> "false", + CometConf.COMET_NATIVE_SCAN_IMPL.key -> "native_datafusion", + SQLConf.PARQUET_VECTORIZED_READER_NESTED_COLUMN_ENABLED.key -> "true", + SQLConf.COLUMN_VECTOR_OFFHEAP_ENABLED.key -> offheapEnabled.toString) { + val data = Seq(Tuple1((1, "a")), Tuple1((2, null)), Tuple1(null)) + + val readSchema = new StructType().add( + "_1", + new StructType() + .add("_3", IntegerType, nullable = false) + .add("_4", StringType, nullable = false), + nullable = false) + + withParquetFile(data) { file => + checkAnswer( + spark.read.schema(readSchema).parquet(file), + Row(null) :: Row(null) :: Row(null) :: Nil) + } + } + } + } + } diff --git a/spark/src/test/scala/org/apache/comet/exec/CometNativeReaderSuite.scala b/spark/src/test/scala/org/apache/comet/exec/CometNativeReaderSuite.scala index 6d31d305d..5dbcdfee1 100644 --- a/spark/src/test/scala/org/apache/comet/exec/CometNativeReaderSuite.scala +++ b/spark/src/test/scala/org/apache/comet/exec/CometNativeReaderSuite.scala @@ -25,6 +25,7 @@ import org.scalatest.Tag import org.apache.spark.sql.CometTestBase import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.types.{IntegerType, StringType, StructType} import org.apache.comet.CometConf @@ -347,4 +348,50 @@ class CometNativeReaderSuite extends CometTestBase with AdaptiveSparkPlanHelper |""".stripMargin, "select map_values(c0).y from tbl") } + + test("native reader - select struct field with user defined schema") { + // extract existing A column + var readSchema = new StructType().add( + "c0", + new StructType() + .add("a", IntegerType, nullable = true), + nullable = true) + + testSingleLineQuery( + """ + | select named_struct('a', 0, 'b', 'xyz') c0 + |""".stripMargin, + "select * from tbl", + readSchema = Some(readSchema)) + + // extract existing A column, nonexisting X + readSchema = new StructType().add( + "c0", + new StructType() + .add("a", IntegerType, nullable = true) + .add("x", StringType, nullable = true), + nullable = true) + + testSingleLineQuery( + """ + | select named_struct('a', 0, 'b', 'xyz') c0 + |""".stripMargin, + "select * from tbl", + readSchema = Some(readSchema)) + + // extract nonexisting X, Y columns + readSchema = new StructType().add( + "c0", + new StructType() + .add("y", IntegerType, nullable = true) + .add("x", StringType, nullable = true), + nullable = true) + + testSingleLineQuery( + """ + | select named_struct('a', 0, 'b', 'xyz') c0 + |""".stripMargin, + "select * from tbl", + readSchema = Some(readSchema)) + } } diff --git a/spark/src/test/scala/org/apache/spark/sql/CometTestBase.scala b/spark/src/test/scala/org/apache/spark/sql/CometTestBase.scala index 2a7983f0b..fa3a28438 100644 --- a/spark/src/test/scala/org/apache/spark/sql/CometTestBase.scala +++ b/spark/src/test/scala/org/apache/spark/sql/CometTestBase.scala @@ -861,6 +861,7 @@ abstract class CometTestBase testName: String = "test", tableName: String = "tbl", sqlConf: Seq[(String, String)] = Seq.empty, + readSchema: Option[StructType] = None, debugCometDF: DataFrame => Unit = _ => (), checkCometOperator: Boolean = true): Unit = { @@ -876,7 +877,9 @@ abstract class CometTestBase } spark.createDataFrame(data, schema).repartition(1).write.parquet(path) - readParquetFile(path, Some(schema)) { df => df.createOrReplaceTempView(tableName) } + readParquetFile(path, readSchema.orElse(Some(schema))) { df => + df.createOrReplaceTempView(tableName) + } withSQLConf(sqlConf: _*) { val cometDF = sql(testQuery) --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@datafusion.apache.org For additional commands, e-mail: commits-h...@datafusion.apache.org