This is an automated email from the ASF dual-hosted git repository.

adriangb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
     new fa1f8c192d Upgrade arrow/parquet to 56.0.0 (#16690)
fa1f8c192d is described below

commit fa1f8c192dd531d3f2fca61885eafa3e9002f0dd
Author: Andrew Lamb <and...@nerdnetworks.org>
AuthorDate: Mon Aug 4 14:02:31 2025 -0400

    Upgrade arrow/parquet to 56.0.0 (#16690)
---
 Cargo.lock                                          | 166 ++---
 Cargo.toml                                          |  14 +-
 datafusion-examples/Cargo.toml                      |   2 +-
 datafusion/common/Cargo.toml                        |   2 +-
 datafusion/common/src/config.rs                     |  18 +-
 .../common/src/file_options/parquet_writer.rs       |  25 +-
 datafusion/common/src/scalar/mod.rs                 |  19 +-
 datafusion/common/src/types/native.rs               |   5 +-
 datafusion/core/tests/fuzz_cases/pruning.rs         |  11 +-
 datafusion/core/tests/parquet/mod.rs                |   4 +-
 datafusion/core/tests/parquet/row_group_pruning.rs  |  14 +-
 .../datasource-avro/src/avro_to_arrow/schema.rs     |   2 +
 datafusion/expr/src/utils.rs                        |   2 +
 .../proto-common/proto/datafusion_common.proto      |  40 +-
 datafusion/proto-common/src/from_proto/mod.rs       |  31 +-
 datafusion/proto-common/src/generated/pbjson.rs     | 728 +++++++++++++++++----
 datafusion/proto-common/src/generated/prost.rs      |  66 +-
 datafusion/proto-common/src/to_proto/mod.rs         |  18 +-
 .../proto/src/generated/datafusion_proto_common.rs  |  66 +-
 datafusion/proto/src/logical_plan/file_formats.rs   |  14 -
 datafusion/sql/src/unparser/expr.rs                 |   6 +
 datafusion/sqllogictest/test_files/copy.slt         |   1 -
 .../sqllogictest/test_files/information_schema.slt  |   6 +-
 .../test_files/listing_table_statistics.slt         |   2 +-
 .../sqllogictest/test_files/parquet_statistics.slt  |  16 +-
 .../sqllogictest/test_files/repartition_scan.slt    |   8 +-
 .../substrait/src/logical_plan/consumer/utils.rs    |   2 +
 docs/source/library-user-guide/upgrading.md         |   6 +
 docs/source/user-guide/configs.md                   |   3 +-
 29 files changed, 903 insertions(+), 394 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 5e8159cc82..286076d81b 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -246,9 +246,9 @@ checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50"
 
 [[package]]
 name = "arrow"
-version = "55.2.0"
+version = "56.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f3f15b4c6b148206ff3a2b35002e08929c2462467b62b9c02036d9c34f9ef994"
+checksum = "fd798aea3553913a5986813e9c6ad31a2d2b04e931fe8ea4a37155eb541cebb5"
 dependencies = [
  "arrow-arith",
  "arrow-array",
@@ -270,9 +270,9 @@ dependencies = [
 
 [[package]]
 name = "arrow-arith"
-version = "55.2.0"
+version = "56.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "30feb679425110209ae35c3fbf82404a39a4c0436bb3ec36164d8bffed2a4ce4"
+checksum = "508dafb53e5804a238cab7fd97a59ddcbfab20cc4d9814b1ab5465b9fa147f2e"
 dependencies = [
  "arrow-array",
  "arrow-buffer",
@@ -284,9 +284,9 @@ dependencies = [
 
 [[package]]
 name = "arrow-array"
-version = "55.2.0"
+version = "56.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "70732f04d285d49054a48b72c54f791bb3424abae92d27aafdf776c98af161c8"
+checksum = "e2730bc045d62bb2e53ef8395b7d4242f5c8102f41ceac15e8395b9ac3d08461"
 dependencies = [
  "ahash 0.8.12",
  "arrow-buffer",
@@ -301,9 +301,9 @@ dependencies = [
 
 [[package]]
 name = "arrow-buffer"
-version = "55.2.0"
+version = "56.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "169b1d5d6cb390dd92ce582b06b23815c7953e9dfaaea75556e89d890d19993d"
+checksum = "54295b93beb702ee9a6f6fbced08ad7f4d76ec1c297952d4b83cf68755421d1d"
 dependencies = [
  "bytes",
  "half",
@@ -312,9 +312,9 @@ dependencies = [
 
 [[package]]
 name = "arrow-cast"
-version = 
"55.2.0" +version = "56.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e4f12eccc3e1c05a766cafb31f6a60a46c2f8efec9b74c6e0648766d30686af8" +checksum = "67e8bcb7dc971d779a7280593a1bf0c2743533b8028909073e804552e85e75b5" dependencies = [ "arrow-array", "arrow-buffer", @@ -333,9 +333,9 @@ dependencies = [ [[package]] name = "arrow-csv" -version = "55.2.0" +version = "56.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "012c9fef3f4a11573b2c74aec53712ff9fdae4a95f4ce452d1bbf088ee00f06b" +checksum = "673fd2b5fb57a1754fdbfac425efd7cf54c947ac9950c1cce86b14e248f1c458" dependencies = [ "arrow-array", "arrow-cast", @@ -348,9 +348,9 @@ dependencies = [ [[package]] name = "arrow-data" -version = "55.2.0" +version = "56.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8de1ce212d803199684b658fc4ba55fb2d7e87b213de5af415308d2fee3619c2" +checksum = "97c22fe3da840039c69e9f61f81e78092ea36d57037b4900151f063615a2f6b4" dependencies = [ "arrow-buffer", "arrow-schema", @@ -360,9 +360,9 @@ dependencies = [ [[package]] name = "arrow-flight" -version = "55.2.0" +version = "56.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5cb3e1d2b441e6d1d5988e3f7c4523c9466b18ef77d7c525d92d36d4cad49fbe" +checksum = "6808d235786b721e49e228c44dd94242f2e8b46b7e95b233b0733c46e758bfee" dependencies = [ "arrow-arith", "arrow-array", @@ -387,9 +387,9 @@ dependencies = [ [[package]] name = "arrow-ipc" -version = "55.2.0" +version = "56.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d9ea5967e8b2af39aff5d9de2197df16e305f47f404781d3230b2dc672da5d92" +checksum = "778de14c5a69aedb27359e3dd06dd5f9c481d5f6ee9fbae912dba332fd64636b" dependencies = [ "arrow-array", "arrow-buffer", @@ -402,9 +402,9 @@ dependencies = [ [[package]] name = "arrow-json" -version = "55.2.0" +version = "56.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5709d974c4ea5be96d900c01576c7c0b99705f4a3eec343648cb1ca863988a9c" +checksum = "3860db334fe7b19fcf81f6b56f8d9d95053f3839ffe443d56b5436f7a29a1794" dependencies = [ "arrow-array", "arrow-buffer", @@ -424,9 +424,9 @@ dependencies = [ [[package]] name = "arrow-ord" -version = "55.2.0" +version = "56.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6506e3a059e3be23023f587f79c82ef0bcf6d293587e3272d20f2d30b969b5a7" +checksum = "425fa0b42a39d3ff55160832e7c25553e7f012c3f187def3d70313e7a29ba5d9" dependencies = [ "arrow-array", "arrow-buffer", @@ -437,9 +437,9 @@ dependencies = [ [[package]] name = "arrow-pyarrow" -version = "55.2.0" +version = "56.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0e55ecf16b9b61d433f6e63c72fc6afcf2597d7db96583de88ebb887d1822268" +checksum = "d944d8ae9b77230124e6570865b570416c33a5809f32c4136c679bbe774e45c9" dependencies = [ "arrow-array", "arrow-data", @@ -449,9 +449,9 @@ dependencies = [ [[package]] name = "arrow-row" -version = "55.2.0" +version = "56.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "52bf7393166beaf79b4bed9bfdf19e97472af32ce5b6b48169d321518a08cae2" +checksum = "df9c9423c9e71abd1b08a7f788fcd203ba2698ac8e72a1f236f1faa1a06a7414" dependencies = [ "arrow-array", "arrow-buffer", @@ -462,9 +462,9 @@ dependencies = [ [[package]] name = "arrow-schema" -version = "55.2.0" +version = "56.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"af7686986a3bf2254c9fb130c623cdcb2f8e1f15763e7c71c310f0834da3d292" +checksum = "85fa1babc4a45fdc64a92175ef51ff00eba5ebbc0007962fecf8022ac1c6ce28" dependencies = [ "bitflags 2.9.1", "serde", @@ -473,9 +473,9 @@ dependencies = [ [[package]] name = "arrow-select" -version = "55.2.0" +version = "56.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd2b45757d6a2373faa3352d02ff5b54b098f5e21dccebc45a21806bc34501e5" +checksum = "d8854d15f1cf5005b4b358abeb60adea17091ff5bdd094dca5d3f73787d81170" dependencies = [ "ahash 0.8.12", "arrow-array", @@ -487,9 +487,9 @@ dependencies = [ [[package]] name = "arrow-string" -version = "55.2.0" +version = "56.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0377d532850babb4d927a06294314b316e23311503ed580ec6ce6a0158f49d40" +checksum = "2c477e8b89e1213d5927a2a84a72c384a9bf4dd0dbf15f9fd66d821aafd9e95e" dependencies = [ "arrow-array", "arrow-buffer", @@ -567,28 +567,6 @@ dependencies = [ "syn 2.0.104", ] -[[package]] -name = "async-stream" -version = "0.3.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b5a71a6f37880a80d1d7f19efd781e4b5de42c88f0722cc13bcb6cc2cfe8476" -dependencies = [ - "async-stream-impl", - "futures-core", - "pin-project-lite", -] - -[[package]] -name = "async-stream-impl" -version = "0.3.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7c24de15d275a1ecfd47a380fb4d5ec9bfe0933f309ed5e705b775596a3574d" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.104", -] - [[package]] name = "async-trait" version = "0.1.88" @@ -849,7 +827,7 @@ dependencies = [ "rustls-native-certs", "rustls-pki-types", "tokio", - "tower 0.5.2", + "tower", "tracing", ] @@ -970,11 +948,10 @@ dependencies = [ [[package]] name = "axum" -version = "0.7.9" +version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "edca88bc138befd0323b20752846e6587272d3b03b0343c8ea28a6f819e6e71f" +checksum = "021e862c184ae977658b36c4500f7feac3221ca5da43e3f25bd04ab6c79a29b5" dependencies = [ - "async-trait", "axum-core", "bytes", "futures-util", @@ -990,20 +967,19 @@ dependencies = [ "rustversion", "serde", "sync_wrapper", - "tower 0.5.2", + "tower", "tower-layer", "tower-service", ] [[package]] name = "axum-core" -version = "0.4.5" +version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09f2bd6146b97ae3359fa0cc6d6b376d9539582c7b4220f041a33ec24c226199" +checksum = "68464cd0412f486726fb3373129ef5d2993f90c34bc2bc1c1e9943b2f4fc7ca6" dependencies = [ - "async-trait", "bytes", - "futures-util", + "futures-core", "http 1.3.1", "http-body 1.0.1", "http-body-util", @@ -4152,9 +4128,9 @@ dependencies = [ [[package]] name = "matchit" -version = "0.7.3" +version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0e7465ac9959cc2b1404e8e2367b43684a6d13790fe23056cc8c6c5a6b7bcb94" +checksum = "47e1ffaa40ddd1f3ed91f717a33c8c0ee23fff369e3aa8772b9605cc1d22f4c3" [[package]] name = "md-5" @@ -4522,9 +4498,9 @@ dependencies = [ [[package]] name = "parquet" -version = "55.2.0" +version = "56.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b17da4150748086bd43352bc77372efa9b6e3dbd06a04831d2a98c041c225cfa" +checksum = "c7288a07ed5d25939a90f9cb1ca5afa6855faa08ec7700613511ae64bdb0620c" dependencies = [ "ahash 0.8.12", "arrow-array", @@ -5021,11 +4997,10 @@ dependencies = [ [[package]] name = "pyo3" -version = "0.24.2" +version = "0.25.1" 
source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5203598f366b11a02b13aa20cab591229ff0a89fd121a308a5df751d5fc9219" +checksum = "8970a78afe0628a3e3430376fc5fd76b6b45c4d43360ffd6cdd40bdde72b682a" dependencies = [ - "cfg-if", "indoc", "libc", "memoffset", @@ -5039,9 +5014,9 @@ dependencies = [ [[package]] name = "pyo3-build-config" -version = "0.24.2" +version = "0.25.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "99636d423fa2ca130fa5acde3059308006d46f98caac629418e53f7ebb1e9999" +checksum = "458eb0c55e7ece017adeba38f2248ff3ac615e53660d7c71a238d7d2a01c7598" dependencies = [ "once_cell", "target-lexicon", @@ -5049,9 +5024,9 @@ dependencies = [ [[package]] name = "pyo3-ffi" -version = "0.24.2" +version = "0.25.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "78f9cf92ba9c409279bc3305b5409d90db2d2c22392d443a87df3a1adad59e33" +checksum = "7114fe5457c61b276ab77c5055f206295b812608083644a5c5b2640c3102565c" dependencies = [ "libc", "pyo3-build-config", @@ -5059,9 +5034,9 @@ dependencies = [ [[package]] name = "pyo3-macros" -version = "0.24.2" +version = "0.25.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b999cb1a6ce21f9a6b147dcf1be9ffedf02e0043aec74dc390f3007047cecd9" +checksum = "a8725c0a622b374d6cb051d11a0983786448f7785336139c3c94f5aa6bef7e50" dependencies = [ "proc-macro2", "pyo3-macros-backend", @@ -5071,9 +5046,9 @@ dependencies = [ [[package]] name = "pyo3-macros-backend" -version = "0.24.2" +version = "0.25.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "822ece1c7e1012745607d5cf0bcb2874769f0f7cb34c4cde03b9358eb9ef911a" +checksum = "4109984c22491085343c05b0dbc54ddc405c3cf7b4374fc533f5c3313a572ccc" dependencies = [ "heck 0.5.0", "proc-macro2", @@ -5443,7 +5418,7 @@ dependencies = [ "tokio", "tokio-rustls", "tokio-util", - "tower 0.5.2", + "tower", "tower-http", "tower-service", "url", @@ -6664,11 +6639,10 @@ dependencies = [ [[package]] name = "tonic" -version = "0.12.3" +version = "0.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "877c5b330756d856ffcc4553ab34a5684481ade925ecc54bcd1bf02b1d0d4d52" +checksum = "7e581ba15a835f4d9ea06c55ab1bd4dce26fc53752c69a04aac00703bfb49ba9" dependencies = [ - "async-stream", "async-trait", "axum", "base64 0.22.1", @@ -6686,27 +6660,7 @@ dependencies = [ "socket2 0.5.10", "tokio", "tokio-stream", - "tower 0.4.13", - "tower-layer", - "tower-service", - "tracing", -] - -[[package]] -name = "tower" -version = "0.4.13" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8fa9be0de6cf49e536ce1851f987bd21a43b771b09473c3549a6c853db37c1c" -dependencies = [ - "futures-core", - "futures-util", - "indexmap 1.9.3", - "pin-project", - "pin-project-lite", - "rand 0.8.5", - "slab", - "tokio", - "tokio-util", + "tower", "tower-layer", "tower-service", "tracing", @@ -6720,11 +6674,15 @@ checksum = "d039ad9159c98b70ecfd540b2573b97f7f52c3e8d9f8ad57a24b916a536975f9" dependencies = [ "futures-core", "futures-util", + "indexmap 2.10.0", "pin-project-lite", + "slab", "sync_wrapper", "tokio", + "tokio-util", "tower-layer", "tower-service", + "tracing", ] [[package]] @@ -6740,7 +6698,7 @@ dependencies = [ "http-body 1.0.1", "iri-string", "pin-project-lite", - "tower 0.5.2", + "tower", "tower-layer", "tower-service", ] diff --git a/Cargo.toml b/Cargo.toml index f4f8e9d875..1335361708 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -89,19 +89,19 @@ ahash = { version = 
"0.8", default-features = false, features = [ "runtime-rng", ] } apache-avro = { version = "0.17", default-features = false } -arrow = { version = "55.2.0", features = [ +arrow = { version = "56.0.0", features = [ "prettyprint", "chrono-tz", ] } -arrow-buffer = { version = "55.2.0", default-features = false } -arrow-flight = { version = "55.2.0", features = [ +arrow-buffer = { version = "56.0.0", default-features = false } +arrow-flight = { version = "56.0.0", features = [ "flight-sql-experimental", ] } -arrow-ipc = { version = "55.2.0", default-features = false, features = [ +arrow-ipc = { version = "56.0.0", default-features = false, features = [ "lz4", ] } -arrow-ord = { version = "55.2.0", default-features = false } -arrow-schema = { version = "55.2.0", default-features = false } +arrow-ord = { version = "56.0.0", default-features = false } +arrow-schema = { version = "56.0.0", default-features = false } async-trait = "0.1.88" bigdecimal = "0.4.8" bytes = "1.10" @@ -155,7 +155,7 @@ itertools = "0.14" log = "^0.4" object_store = { version = "0.12.3", default-features = false } parking_lot = "0.12" -parquet = { version = "55.2.0", default-features = false, features = [ +parquet = { version = "56.0.0", default-features = false, features = [ "arrow", "async", "object_store", diff --git a/datafusion-examples/Cargo.toml b/datafusion-examples/Cargo.toml index 324d9f61b5..c46c24d0c9 100644 --- a/datafusion-examples/Cargo.toml +++ b/datafusion-examples/Cargo.toml @@ -78,7 +78,7 @@ serde_json = { workspace = true } tempfile = { workspace = true } test-utils = { path = "../test-utils" } tokio = { workspace = true, features = ["rt-multi-thread", "parking_lot"] } -tonic = "0.12.1" +tonic = "0.13.1" tracing = { version = "0.1" } tracing-subscriber = { version = "0.3" } url = { workspace = true } diff --git a/datafusion/common/Cargo.toml b/datafusion/common/Cargo.toml index 83e539e31d..d8c5b321c0 100644 --- a/datafusion/common/Cargo.toml +++ b/datafusion/common/Cargo.toml @@ -71,7 +71,7 @@ log = { workspace = true } object_store = { workspace = true, optional = true } parquet = { workspace = true, optional = true, default-features = true } paste = "1.0.15" -pyo3 = { version = "0.24.2", optional = true } +pyo3 = { version = "0.25", optional = true } recursive = { workspace = true, optional = true } sqlparser = { workspace = true } tokio = { workspace = true } diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index 3a9b71e1bf..a2c5d08d52 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -607,13 +607,6 @@ config_namespace! { /// default parquet writer setting pub statistics_enabled: Option<String>, transform = str::to_lowercase, default = Some("page".into()) - /// (writing) Sets max statistics size for any column. If NULL, uses - /// default parquet writer setting - /// max_statistics_size is deprecated, currently it is not being used - // TODO: remove once deprecated - #[deprecated(since = "45.0.0", note = "Setting does not do anything")] - pub max_statistics_size: Option<usize>, default = Some(4096) - /// (writing) Target maximum number of rows in each row group (defaults to 1M /// rows). Writing larger row groups requires more memory to write, but /// can get better compression and be faster to read. @@ -625,9 +618,9 @@ config_namespace! { /// (writing) Sets column index truncate length pub column_index_truncate_length: Option<usize>, default = Some(64) - /// (writing) Sets statictics truncate length. 
If NULL, uses + /// (writing) Sets statistics truncate length. If NULL, uses /// default parquet writer setting - pub statistics_truncate_length: Option<usize>, default = None + pub statistics_truncate_length: Option<usize>, default = Some(64) /// (writing) Sets best effort maximum number of rows in data page pub data_page_row_count_limit: usize, default = 20_000 @@ -2064,13 +2057,6 @@ config_namespace_with_hashmap! { /// Sets bloom filter number of distinct values. If NULL, uses /// default parquet options pub bloom_filter_ndv: Option<u64>, default = None - - /// Sets max statistics size for the column path. If NULL, uses - /// default parquet options - /// max_statistics_size is deprecated, currently it is not being used - // TODO: remove once deprecated - #[deprecated(since = "45.0.0", note = "Setting does not do anything")] - pub max_statistics_size: Option<usize>, default = None } } diff --git a/datafusion/common/src/file_options/parquet_writer.rs b/datafusion/common/src/file_options/parquet_writer.rs index 9ea2b6af82..f7ae9b9d92 100644 --- a/datafusion/common/src/file_options/parquet_writer.rs +++ b/datafusion/common/src/file_options/parquet_writer.rs @@ -36,7 +36,7 @@ use parquet::{ metadata::KeyValue, properties::{ EnabledStatistics, WriterProperties, WriterPropertiesBuilder, WriterVersion, - DEFAULT_MAX_STATISTICS_SIZE, DEFAULT_STATISTICS_ENABLED, + DEFAULT_STATISTICS_ENABLED, }, }, schema::types::ColumnPath, @@ -161,16 +161,6 @@ impl TryFrom<&TableParquetOptions> for WriterPropertiesBuilder { builder = builder.set_column_bloom_filter_ndv(path.clone(), bloom_filter_ndv); } - - // max_statistics_size is deprecated, currently it is not being used - // TODO: remove once deprecated - #[allow(deprecated)] - if let Some(max_statistics_size) = options.max_statistics_size { - builder = { - #[allow(deprecated)] - builder.set_column_max_statistics_size(path, max_statistics_size) - } - } } Ok(builder) @@ -219,7 +209,6 @@ impl ParquetOptions { dictionary_enabled, dictionary_page_size_limit, statistics_enabled, - max_statistics_size, max_row_group_size, created_by, column_index_truncate_length, @@ -266,13 +255,6 @@ impl ParquetOptions { .set_data_page_row_count_limit(*data_page_row_count_limit) .set_bloom_filter_enabled(*bloom_filter_on_write); - builder = { - #[allow(deprecated)] - builder.set_max_statistics_size( - max_statistics_size.unwrap_or(DEFAULT_MAX_STATISTICS_SIZE), - ) - }; - if let Some(bloom_filter_fpp) = bloom_filter_fpp { builder = builder.set_bloom_filter_fpp(*bloom_filter_fpp); }; @@ -465,12 +447,10 @@ mod tests { fn column_options_with_non_defaults( src_col_defaults: &ParquetOptions, ) -> ParquetColumnOptions { - #[allow(deprecated)] // max_statistics_size ParquetColumnOptions { compression: Some("zstd(22)".into()), dictionary_enabled: src_col_defaults.dictionary_enabled.map(|v| !v), statistics_enabled: Some("none".into()), - max_statistics_size: Some(72), encoding: Some("RLE".into()), bloom_filter_enabled: Some(true), bloom_filter_fpp: Some(0.72), @@ -495,7 +475,6 @@ mod tests { dictionary_enabled: Some(!defaults.dictionary_enabled.unwrap_or(false)), dictionary_page_size_limit: 42, statistics_enabled: Some("chunk".into()), - max_statistics_size: Some(42), max_row_group_size: 42, created_by: "wordy".into(), column_index_truncate_length: Some(42), @@ -554,7 +533,6 @@ mod tests { ), bloom_filter_fpp: bloom_filter_default_props.map(|p| p.fpp), bloom_filter_ndv: bloom_filter_default_props.map(|p| p.ndv), - max_statistics_size: Some(props.max_statistics_size(&col)), } } @@ -611,7 
+589,6 @@ mod tests { compression: default_col_props.compression, dictionary_enabled: default_col_props.dictionary_enabled, statistics_enabled: default_col_props.statistics_enabled, - max_statistics_size: default_col_props.max_statistics_size, bloom_filter_on_write: default_col_props .bloom_filter_enabled .unwrap_or_default(), diff --git a/datafusion/common/src/scalar/mod.rs b/datafusion/common/src/scalar/mod.rs index 1ced4ab825..795c62602d 100644 --- a/datafusion/common/src/scalar/mod.rs +++ b/datafusion/common/src/scalar/mod.rs @@ -891,11 +891,10 @@ fn dict_from_values<K: ArrowDictionaryKeyType>( .map(|index| { if values_array.is_valid(index) { let native_index = K::Native::from_usize(index).ok_or_else(|| { - DataFusionError::Internal(format!( - "Can not create index of type {} from value {}", - K::DATA_TYPE, - index - )) + _internal_datafusion_err!( + "Can not create index of type {} from value {index}", + K::DATA_TYPE + ) })?; Ok(Some(native_index)) } else { @@ -2192,6 +2191,16 @@ impl ScalarValue { } let array: ArrayRef = match &data_type { + DataType::Decimal32(_precision, _scale) => { + return _not_impl_err!( + "Decimal32 not supported in ScalarValue::iter_to_array" + ); + } + DataType::Decimal64(_precision, _scale) => { + return _not_impl_err!( + "Decimal64 not supported in ScalarValue::iter_to_array" + ); + } DataType::Decimal128(precision, scale) => { let decimal_array = ScalarValue::iter_to_decimal_array(scalars, *precision, *scale)?; diff --git a/datafusion/common/src/types/native.rs b/datafusion/common/src/types/native.rs index 39c79b4b99..76629e555b 100644 --- a/datafusion/common/src/types/native.rs +++ b/datafusion/common/src/types/native.rs @@ -407,7 +407,10 @@ impl From<DataType> for NativeType { DataType::Union(union_fields, _) => { Union(LogicalUnionFields::from(&union_fields)) } - DataType::Decimal128(p, s) | DataType::Decimal256(p, s) => Decimal(p, s), + DataType::Decimal32(p, s) + | DataType::Decimal64(p, s) + | DataType::Decimal128(p, s) + | DataType::Decimal256(p, s) => Decimal(p, s), DataType::Map(field, _) => Map(Arc::new(field.as_ref().into())), DataType::Dictionary(_, data_type) => data_type.as_ref().clone().into(), DataType::RunEndEncoded(_, field) => field.data_type().clone().into(), diff --git a/datafusion/core/tests/fuzz_cases/pruning.rs b/datafusion/core/tests/fuzz_cases/pruning.rs index 6e624d458b..c6e30c0722 100644 --- a/datafusion/core/tests/fuzz_cases/pruning.rs +++ b/datafusion/core/tests/fuzz_cases/pruning.rs @@ -319,14 +319,9 @@ async fn write_parquet_file( row_groups: Vec<Vec<String>>, ) -> Bytes { let mut buf = BytesMut::new().writer(); - let mut props = WriterProperties::builder(); - if let Some(truncation_length) = truncation_length { - props = { - #[allow(deprecated)] - props.set_max_statistics_size(truncation_length) - } - } - props = props.set_statistics_enabled(EnabledStatistics::Chunk); // row group level + let props = WriterProperties::builder() + .set_statistics_enabled(EnabledStatistics::Chunk) // row group level + .set_statistics_truncate_length(truncation_length); let props = props.build(); { let mut writer = diff --git a/datafusion/core/tests/parquet/mod.rs b/datafusion/core/tests/parquet/mod.rs index 4f9dde08a6..82444e8b6e 100644 --- a/datafusion/core/tests/parquet/mod.rs +++ b/datafusion/core/tests/parquet/mod.rs @@ -109,11 +109,11 @@ struct ContextWithParquet { /// The output of running one of the test cases struct TestOutput { - /// The input string + /// The input query SQL sql: String, /// Execution metrics for the Parquet Scan 
parquet_metrics: MetricsSet, - /// number of rows in results + /// number of actual rows in results result_rows: usize, /// the contents of the input, as a string pretty_input: String, diff --git a/datafusion/core/tests/parquet/row_group_pruning.rs b/datafusion/core/tests/parquet/row_group_pruning.rs index 8613cd481b..44409166d3 100644 --- a/datafusion/core/tests/parquet/row_group_pruning.rs +++ b/datafusion/core/tests/parquet/row_group_pruning.rs @@ -34,7 +34,7 @@ struct RowGroupPruningTest { expected_files_pruned_by_statistics: Option<usize>, expected_row_group_matched_by_bloom_filter: Option<usize>, expected_row_group_pruned_by_bloom_filter: Option<usize>, - expected_results: usize, + expected_rows: usize, } impl RowGroupPruningTest { // Start building the test configuration @@ -48,7 +48,7 @@ impl RowGroupPruningTest { expected_files_pruned_by_statistics: None, expected_row_group_matched_by_bloom_filter: None, expected_row_group_pruned_by_bloom_filter: None, - expected_results: 0, + expected_rows: 0, } } @@ -99,9 +99,9 @@ impl RowGroupPruningTest { self } - // Set the expected rows for the test + /// Set the number of expected rows from the output of this test fn with_expected_rows(mut self, rows: usize) -> Self { - self.expected_results = rows; + self.expected_rows = rows; self } @@ -145,8 +145,10 @@ impl RowGroupPruningTest { ); assert_eq!( output.result_rows, - self.expected_results, - "mismatched expected rows: {}", + self.expected_rows, + "Expected {} rows, got {}: {}", + output.result_rows, + self.expected_rows, output.description(), ); } diff --git a/datafusion/datasource-avro/src/avro_to_arrow/schema.rs b/datafusion/datasource-avro/src/avro_to_arrow/schema.rs index f53d38e51d..cc87d3c1ce 100644 --- a/datafusion/datasource-avro/src/avro_to_arrow/schema.rs +++ b/datafusion/datasource-avro/src/avro_to_arrow/schema.rs @@ -235,6 +235,8 @@ fn default_field_name(dt: &DataType) -> &str { | DataType::LargeListView(_) => { unimplemented!("View support not implemented") } + DataType::Decimal32(_, _) => "decimal", + DataType::Decimal64(_, _) => "decimal", DataType::Decimal128(_, _) => "decimal", DataType::Decimal256(_, _) => "decimal", } diff --git a/datafusion/expr/src/utils.rs b/datafusion/expr/src/utils.rs index e554152328..a635e708fb 100644 --- a/datafusion/expr/src/utils.rs +++ b/datafusion/expr/src/utils.rs @@ -815,6 +815,8 @@ pub fn can_hash(data_type: &DataType) -> bool { DataType::Float16 => true, DataType::Float32 => true, DataType::Float64 => true, + DataType::Decimal32(_, _) => true, + DataType::Decimal64(_, _) => true, DataType::Decimal128(_, _) => true, DataType::Decimal256(_, _) => true, DataType::Timestamp(_, _) => true, diff --git a/datafusion/proto-common/proto/datafusion_common.proto b/datafusion/proto-common/proto/datafusion_common.proto index ffdc29e429..5fb537160b 100644 --- a/datafusion/proto-common/proto/datafusion_common.proto +++ b/datafusion/proto-common/proto/datafusion_common.proto @@ -136,7 +136,19 @@ enum IntervalUnit{ MonthDayNano = 2; } -message Decimal{ +message Decimal32Type { + reserved 1, 2; + uint32 precision = 3; + int32 scale = 4; +} + +message Decimal64Type { + reserved 1, 2; + uint32 precision = 3; + int32 scale = 4; +} + +message Decimal128Type { reserved 1, 2; uint32 precision = 3; int32 scale = 4; @@ -286,6 +298,8 @@ message ScalarValue{ ScalarNestedValue struct_value = 32; ScalarNestedValue map_value = 41; + Decimal32 decimal32_value = 43; + Decimal64 decimal64_value = 44; Decimal128 decimal128_value = 20; Decimal256 decimal256_value = 39; @@ 
-310,6 +324,18 @@ message ScalarValue{ } } +message Decimal32{ + bytes value = 1; + int64 p = 2; + int64 s = 3; +} + +message Decimal64{ + bytes value = 1; + int64 p = 2; + int64 s = 3; +} + message Decimal128{ bytes value = 1; int64 p = 2; @@ -352,7 +378,9 @@ message ArrowType{ TimeUnit TIME32 = 21 ; TimeUnit TIME64 = 22 ; IntervalUnit INTERVAL = 23 ; - Decimal DECIMAL = 24 ; + Decimal32Type DECIMAL32 = 40; + Decimal64Type DECIMAL64 = 41; + Decimal128Type DECIMAL128 = 24; Decimal256Type DECIMAL256 = 36; List LIST = 25; List LARGE_LIST = 26; @@ -480,9 +508,7 @@ message ParquetColumnOptions { uint64 bloom_filter_ndv = 7; } - oneof max_statistics_size_opt { - uint32 max_statistics_size = 8; - } + reserved 8; // used to be uint32 max_statistics_size = 8; } message ParquetOptions { @@ -522,9 +548,7 @@ message ParquetOptions { string statistics_enabled = 13; } - oneof max_statistics_size_opt { - uint64 max_statistics_size = 14; - } + reserved 14; // used to be uint32 max_statistics_size = 20; oneof column_index_truncate_length_opt { uint64 column_index_truncate_length = 17; diff --git a/datafusion/proto-common/src/from_proto/mod.rs b/datafusion/proto-common/src/from_proto/mod.rs index 98df86a21f..94024abfed 100644 --- a/datafusion/proto-common/src/from_proto/mod.rs +++ b/datafusion/proto-common/src/from_proto/mod.rs @@ -37,6 +37,7 @@ use datafusion_common::{ TableParquetOptions, }, file_options::{csv_writer::CsvWriterOptions, json_writer::JsonWriterOptions}, + not_impl_err, parsers::CompressionTypeVariant, plan_datafusion_err, stats::Precision, @@ -257,7 +258,15 @@ impl TryFrom<&protobuf::arrow_type::ArrowTypeEnum> for DataType { arrow_type::ArrowTypeEnum::Interval(interval_unit) => { DataType::Interval(parse_i32_to_interval_unit(interval_unit)?) } - arrow_type::ArrowTypeEnum::Decimal(protobuf::Decimal { + arrow_type::ArrowTypeEnum::Decimal32(protobuf::Decimal32Type { + precision, + scale, + }) => DataType::Decimal32(*precision as u8, *scale as i8), + arrow_type::ArrowTypeEnum::Decimal64(protobuf::Decimal64Type { + precision, + scale, + }) => DataType::Decimal64(*precision as u8, *scale as i8), + arrow_type::ArrowTypeEnum::Decimal128(protobuf::Decimal128Type { precision, scale, }) => DataType::Decimal128(*precision as u8, *scale as i8), @@ -469,6 +478,14 @@ impl TryFrom<&protobuf::ScalarValue> for ScalarValue { let null_type: DataType = v.try_into()?; null_type.try_into().map_err(Error::DataFusionError)? 
} + Value::Decimal32Value(_val) => { + return not_impl_err!("Decimal32 protobuf deserialization") + .map_err(Error::DataFusionError) + } + Value::Decimal64Value(_val) => { + return not_impl_err!("Decimal64 protobuf deserialization") + .map_err(Error::DataFusionError) + } Value::Decimal128Value(val) => { let array = vec_to_array(val.value.clone()); Self::Decimal128( @@ -938,12 +955,6 @@ impl TryFrom<&protobuf::ParquetOptions> for ParquetOptions { protobuf::parquet_options::StatisticsEnabledOpt::StatisticsEnabled(v) => Some(v), }) .unwrap_or(None), - max_statistics_size: value - .max_statistics_size_opt.as_ref() - .map(|opt| match opt { - protobuf::parquet_options::MaxStatisticsSizeOpt::MaxStatisticsSize(v) => Some(*v as usize), - }) - .unwrap_or(None), max_row_group_size: value.max_row_group_size as usize, created_by: value.created_by.clone(), column_index_truncate_length: value @@ -1010,12 +1021,6 @@ impl TryFrom<&protobuf::ParquetColumnOptions> for ParquetColumnOptions { protobuf::parquet_column_options::StatisticsEnabledOpt::StatisticsEnabled(v) => Some(v), }) .unwrap_or(None), - max_statistics_size: value - .max_statistics_size_opt - .map(|opt| match opt { - protobuf::parquet_column_options::MaxStatisticsSizeOpt::MaxStatisticsSize(v) => Some(v as usize), - }) - .unwrap_or(None), encoding: value .encoding_opt.clone() .map(|opt| match opt { diff --git a/datafusion/proto-common/src/generated/pbjson.rs b/datafusion/proto-common/src/generated/pbjson.rs index 89e85b0dc8..c92869655f 100644 --- a/datafusion/proto-common/src/generated/pbjson.rs +++ b/datafusion/proto-common/src/generated/pbjson.rs @@ -243,8 +243,14 @@ impl serde::Serialize for ArrowType { .map_err(|_| serde::ser::Error::custom(format!("Invalid variant {}", *v)))?; struct_ser.serialize_field("INTERVAL", &v)?; } - arrow_type::ArrowTypeEnum::Decimal(v) => { - struct_ser.serialize_field("DECIMAL", v)?; + arrow_type::ArrowTypeEnum::Decimal32(v) => { + struct_ser.serialize_field("DECIMAL32", v)?; + } + arrow_type::ArrowTypeEnum::Decimal64(v) => { + struct_ser.serialize_field("DECIMAL64", v)?; + } + arrow_type::ArrowTypeEnum::Decimal128(v) => { + struct_ser.serialize_field("DECIMAL128", v)?; } arrow_type::ArrowTypeEnum::Decimal256(v) => { struct_ser.serialize_field("DECIMAL256", v)?; @@ -314,7 +320,9 @@ impl<'de> serde::Deserialize<'de> for ArrowType { "TIME32", "TIME64", "INTERVAL", - "DECIMAL", + "DECIMAL32", + "DECIMAL64", + "DECIMAL128", "DECIMAL256", "LIST", "LARGE_LIST", @@ -356,7 +364,9 @@ impl<'de> serde::Deserialize<'de> for ArrowType { Time32, Time64, Interval, - Decimal, + Decimal32, + Decimal64, + Decimal128, Decimal256, List, LargeList, @@ -413,7 +423,9 @@ impl<'de> serde::Deserialize<'de> for ArrowType { "TIME32" => Ok(GeneratedField::Time32), "TIME64" => Ok(GeneratedField::Time64), "INTERVAL" => Ok(GeneratedField::Interval), - "DECIMAL" => Ok(GeneratedField::Decimal), + "DECIMAL32" => Ok(GeneratedField::Decimal32), + "DECIMAL64" => Ok(GeneratedField::Decimal64), + "DECIMAL128" => Ok(GeneratedField::Decimal128), "DECIMAL256" => Ok(GeneratedField::Decimal256), "LIST" => Ok(GeneratedField::List), "LARGELIST" | "LARGE_LIST" => Ok(GeneratedField::LargeList), @@ -628,11 +640,25 @@ impl<'de> serde::Deserialize<'de> for ArrowType { } arrow_type_enum__ = map_.next_value::<::std::option::Option<IntervalUnit>>()?.map(|x| arrow_type::ArrowTypeEnum::Interval(x as i32)); } - GeneratedField::Decimal => { + GeneratedField::Decimal32 => { + if arrow_type_enum__.is_some() { + return Err(serde::de::Error::duplicate_field("DECIMAL32")); + } 
+ arrow_type_enum__ = map_.next_value::<::std::option::Option<_>>()?.map(arrow_type::ArrowTypeEnum::Decimal32) +; + } + GeneratedField::Decimal64 => { if arrow_type_enum__.is_some() { - return Err(serde::de::Error::duplicate_field("DECIMAL")); + return Err(serde::de::Error::duplicate_field("DECIMAL64")); } - arrow_type_enum__ = map_.next_value::<::std::option::Option<_>>()?.map(arrow_type::ArrowTypeEnum::Decimal) + arrow_type_enum__ = map_.next_value::<::std::option::Option<_>>()?.map(arrow_type::ArrowTypeEnum::Decimal64) +; + } + GeneratedField::Decimal128 => { + if arrow_type_enum__.is_some() { + return Err(serde::de::Error::duplicate_field("DECIMAL128")); + } + arrow_type_enum__ = map_.next_value::<::std::option::Option<_>>()?.map(arrow_type::ArrowTypeEnum::Decimal128) ; } GeneratedField::Decimal256 => { @@ -2222,45 +2248,431 @@ impl<'de> serde::Deserialize<'de> for CsvWriterOptions { } null_value__ = Some(map_.next_value()?); } - GeneratedField::Quote => { - if quote__.is_some() { - return Err(serde::de::Error::duplicate_field("quote")); + GeneratedField::Quote => { + if quote__.is_some() { + return Err(serde::de::Error::duplicate_field("quote")); + } + quote__ = Some(map_.next_value()?); + } + GeneratedField::Escape => { + if escape__.is_some() { + return Err(serde::de::Error::duplicate_field("escape")); + } + escape__ = Some(map_.next_value()?); + } + GeneratedField::DoubleQuote => { + if double_quote__.is_some() { + return Err(serde::de::Error::duplicate_field("doubleQuote")); + } + double_quote__ = Some(map_.next_value()?); + } + } + } + Ok(CsvWriterOptions { + compression: compression__.unwrap_or_default(), + delimiter: delimiter__.unwrap_or_default(), + has_header: has_header__.unwrap_or_default(), + date_format: date_format__.unwrap_or_default(), + datetime_format: datetime_format__.unwrap_or_default(), + timestamp_format: timestamp_format__.unwrap_or_default(), + time_format: time_format__.unwrap_or_default(), + null_value: null_value__.unwrap_or_default(), + quote: quote__.unwrap_or_default(), + escape: escape__.unwrap_or_default(), + double_quote: double_quote__.unwrap_or_default(), + }) + } + } + deserializer.deserialize_struct("datafusion_common.CsvWriterOptions", FIELDS, GeneratedVisitor) + } +} +impl serde::Serialize for Decimal128 { + #[allow(deprecated)] + fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error> + where + S: serde::Serializer, + { + use serde::ser::SerializeStruct; + let mut len = 0; + if !self.value.is_empty() { + len += 1; + } + if self.p != 0 { + len += 1; + } + if self.s != 0 { + len += 1; + } + let mut struct_ser = serializer.serialize_struct("datafusion_common.Decimal128", len)?; + if !self.value.is_empty() { + #[allow(clippy::needless_borrow)] + #[allow(clippy::needless_borrows_for_generic_args)] + struct_ser.serialize_field("value", pbjson::private::base64::encode(&self.value).as_str())?; + } + if self.p != 0 { + #[allow(clippy::needless_borrow)] + #[allow(clippy::needless_borrows_for_generic_args)] + struct_ser.serialize_field("p", ToString::to_string(&self.p).as_str())?; + } + if self.s != 0 { + #[allow(clippy::needless_borrow)] + #[allow(clippy::needless_borrows_for_generic_args)] + struct_ser.serialize_field("s", ToString::to_string(&self.s).as_str())?; + } + struct_ser.end() + } +} +impl<'de> serde::Deserialize<'de> for Decimal128 { + #[allow(deprecated)] + fn deserialize<D>(deserializer: D) -> std::result::Result<Self, D::Error> + where + D: serde::Deserializer<'de>, + { + const FIELDS: &[&str] = &[ + "value", + "p", + 
"s", + ]; + + #[allow(clippy::enum_variant_names)] + enum GeneratedField { + Value, + P, + S, + } + impl<'de> serde::Deserialize<'de> for GeneratedField { + fn deserialize<D>(deserializer: D) -> std::result::Result<GeneratedField, D::Error> + where + D: serde::Deserializer<'de>, + { + struct GeneratedVisitor; + + impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { + type Value = GeneratedField; + + fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(formatter, "expected one of: {:?}", &FIELDS) + } + + #[allow(unused_variables)] + fn visit_str<E>(self, value: &str) -> std::result::Result<GeneratedField, E> + where + E: serde::de::Error, + { + match value { + "value" => Ok(GeneratedField::Value), + "p" => Ok(GeneratedField::P), + "s" => Ok(GeneratedField::S), + _ => Err(serde::de::Error::unknown_field(value, FIELDS)), + } + } + } + deserializer.deserialize_identifier(GeneratedVisitor) + } + } + struct GeneratedVisitor; + impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { + type Value = Decimal128; + + fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + formatter.write_str("struct datafusion_common.Decimal128") + } + + fn visit_map<V>(self, mut map_: V) -> std::result::Result<Decimal128, V::Error> + where + V: serde::de::MapAccess<'de>, + { + let mut value__ = None; + let mut p__ = None; + let mut s__ = None; + while let Some(k) = map_.next_key()? { + match k { + GeneratedField::Value => { + if value__.is_some() { + return Err(serde::de::Error::duplicate_field("value")); + } + value__ = + Some(map_.next_value::<::pbjson::private::BytesDeserialize<_>>()?.0) + ; + } + GeneratedField::P => { + if p__.is_some() { + return Err(serde::de::Error::duplicate_field("p")); + } + p__ = + Some(map_.next_value::<::pbjson::private::NumberDeserialize<_>>()?.0) + ; + } + GeneratedField::S => { + if s__.is_some() { + return Err(serde::de::Error::duplicate_field("s")); + } + s__ = + Some(map_.next_value::<::pbjson::private::NumberDeserialize<_>>()?.0) + ; + } + } + } + Ok(Decimal128 { + value: value__.unwrap_or_default(), + p: p__.unwrap_or_default(), + s: s__.unwrap_or_default(), + }) + } + } + deserializer.deserialize_struct("datafusion_common.Decimal128", FIELDS, GeneratedVisitor) + } +} +impl serde::Serialize for Decimal128Type { + #[allow(deprecated)] + fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error> + where + S: serde::Serializer, + { + use serde::ser::SerializeStruct; + let mut len = 0; + if self.precision != 0 { + len += 1; + } + if self.scale != 0 { + len += 1; + } + let mut struct_ser = serializer.serialize_struct("datafusion_common.Decimal128Type", len)?; + if self.precision != 0 { + struct_ser.serialize_field("precision", &self.precision)?; + } + if self.scale != 0 { + struct_ser.serialize_field("scale", &self.scale)?; + } + struct_ser.end() + } +} +impl<'de> serde::Deserialize<'de> for Decimal128Type { + #[allow(deprecated)] + fn deserialize<D>(deserializer: D) -> std::result::Result<Self, D::Error> + where + D: serde::Deserializer<'de>, + { + const FIELDS: &[&str] = &[ + "precision", + "scale", + ]; + + #[allow(clippy::enum_variant_names)] + enum GeneratedField { + Precision, + Scale, + } + impl<'de> serde::Deserialize<'de> for GeneratedField { + fn deserialize<D>(deserializer: D) -> std::result::Result<GeneratedField, D::Error> + where + D: serde::Deserializer<'de>, + { + struct GeneratedVisitor; + + impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { + type Value = 
GeneratedField; + + fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(formatter, "expected one of: {:?}", &FIELDS) + } + + #[allow(unused_variables)] + fn visit_str<E>(self, value: &str) -> std::result::Result<GeneratedField, E> + where + E: serde::de::Error, + { + match value { + "precision" => Ok(GeneratedField::Precision), + "scale" => Ok(GeneratedField::Scale), + _ => Err(serde::de::Error::unknown_field(value, FIELDS)), + } + } + } + deserializer.deserialize_identifier(GeneratedVisitor) + } + } + struct GeneratedVisitor; + impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { + type Value = Decimal128Type; + + fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + formatter.write_str("struct datafusion_common.Decimal128Type") + } + + fn visit_map<V>(self, mut map_: V) -> std::result::Result<Decimal128Type, V::Error> + where + V: serde::de::MapAccess<'de>, + { + let mut precision__ = None; + let mut scale__ = None; + while let Some(k) = map_.next_key()? { + match k { + GeneratedField::Precision => { + if precision__.is_some() { + return Err(serde::de::Error::duplicate_field("precision")); + } + precision__ = + Some(map_.next_value::<::pbjson::private::NumberDeserialize<_>>()?.0) + ; + } + GeneratedField::Scale => { + if scale__.is_some() { + return Err(serde::de::Error::duplicate_field("scale")); + } + scale__ = + Some(map_.next_value::<::pbjson::private::NumberDeserialize<_>>()?.0) + ; + } + } + } + Ok(Decimal128Type { + precision: precision__.unwrap_or_default(), + scale: scale__.unwrap_or_default(), + }) + } + } + deserializer.deserialize_struct("datafusion_common.Decimal128Type", FIELDS, GeneratedVisitor) + } +} +impl serde::Serialize for Decimal256 { + #[allow(deprecated)] + fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error> + where + S: serde::Serializer, + { + use serde::ser::SerializeStruct; + let mut len = 0; + if !self.value.is_empty() { + len += 1; + } + if self.p != 0 { + len += 1; + } + if self.s != 0 { + len += 1; + } + let mut struct_ser = serializer.serialize_struct("datafusion_common.Decimal256", len)?; + if !self.value.is_empty() { + #[allow(clippy::needless_borrow)] + #[allow(clippy::needless_borrows_for_generic_args)] + struct_ser.serialize_field("value", pbjson::private::base64::encode(&self.value).as_str())?; + } + if self.p != 0 { + #[allow(clippy::needless_borrow)] + #[allow(clippy::needless_borrows_for_generic_args)] + struct_ser.serialize_field("p", ToString::to_string(&self.p).as_str())?; + } + if self.s != 0 { + #[allow(clippy::needless_borrow)] + #[allow(clippy::needless_borrows_for_generic_args)] + struct_ser.serialize_field("s", ToString::to_string(&self.s).as_str())?; + } + struct_ser.end() + } +} +impl<'de> serde::Deserialize<'de> for Decimal256 { + #[allow(deprecated)] + fn deserialize<D>(deserializer: D) -> std::result::Result<Self, D::Error> + where + D: serde::Deserializer<'de>, + { + const FIELDS: &[&str] = &[ + "value", + "p", + "s", + ]; + + #[allow(clippy::enum_variant_names)] + enum GeneratedField { + Value, + P, + S, + } + impl<'de> serde::Deserialize<'de> for GeneratedField { + fn deserialize<D>(deserializer: D) -> std::result::Result<GeneratedField, D::Error> + where + D: serde::Deserializer<'de>, + { + struct GeneratedVisitor; + + impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { + type Value = GeneratedField; + + fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(formatter, "expected one of: 
{:?}", &FIELDS) + } + + #[allow(unused_variables)] + fn visit_str<E>(self, value: &str) -> std::result::Result<GeneratedField, E> + where + E: serde::de::Error, + { + match value { + "value" => Ok(GeneratedField::Value), + "p" => Ok(GeneratedField::P), + "s" => Ok(GeneratedField::S), + _ => Err(serde::de::Error::unknown_field(value, FIELDS)), + } + } + } + deserializer.deserialize_identifier(GeneratedVisitor) + } + } + struct GeneratedVisitor; + impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { + type Value = Decimal256; + + fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + formatter.write_str("struct datafusion_common.Decimal256") + } + + fn visit_map<V>(self, mut map_: V) -> std::result::Result<Decimal256, V::Error> + where + V: serde::de::MapAccess<'de>, + { + let mut value__ = None; + let mut p__ = None; + let mut s__ = None; + while let Some(k) = map_.next_key()? { + match k { + GeneratedField::Value => { + if value__.is_some() { + return Err(serde::de::Error::duplicate_field("value")); } - quote__ = Some(map_.next_value()?); + value__ = + Some(map_.next_value::<::pbjson::private::BytesDeserialize<_>>()?.0) + ; } - GeneratedField::Escape => { - if escape__.is_some() { - return Err(serde::de::Error::duplicate_field("escape")); + GeneratedField::P => { + if p__.is_some() { + return Err(serde::de::Error::duplicate_field("p")); } - escape__ = Some(map_.next_value()?); + p__ = + Some(map_.next_value::<::pbjson::private::NumberDeserialize<_>>()?.0) + ; } - GeneratedField::DoubleQuote => { - if double_quote__.is_some() { - return Err(serde::de::Error::duplicate_field("doubleQuote")); + GeneratedField::S => { + if s__.is_some() { + return Err(serde::de::Error::duplicate_field("s")); } - double_quote__ = Some(map_.next_value()?); + s__ = + Some(map_.next_value::<::pbjson::private::NumberDeserialize<_>>()?.0) + ; } } } - Ok(CsvWriterOptions { - compression: compression__.unwrap_or_default(), - delimiter: delimiter__.unwrap_or_default(), - has_header: has_header__.unwrap_or_default(), - date_format: date_format__.unwrap_or_default(), - datetime_format: datetime_format__.unwrap_or_default(), - timestamp_format: timestamp_format__.unwrap_or_default(), - time_format: time_format__.unwrap_or_default(), - null_value: null_value__.unwrap_or_default(), - quote: quote__.unwrap_or_default(), - escape: escape__.unwrap_or_default(), - double_quote: double_quote__.unwrap_or_default(), + Ok(Decimal256 { + value: value__.unwrap_or_default(), + p: p__.unwrap_or_default(), + s: s__.unwrap_or_default(), }) } } - deserializer.deserialize_struct("datafusion_common.CsvWriterOptions", FIELDS, GeneratedVisitor) + deserializer.deserialize_struct("datafusion_common.Decimal256", FIELDS, GeneratedVisitor) } } -impl serde::Serialize for Decimal { +impl serde::Serialize for Decimal256Type { #[allow(deprecated)] fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error> where @@ -2274,7 +2686,7 @@ impl serde::Serialize for Decimal { if self.scale != 0 { len += 1; } - let mut struct_ser = serializer.serialize_struct("datafusion_common.Decimal", len)?; + let mut struct_ser = serializer.serialize_struct("datafusion_common.Decimal256Type", len)?; if self.precision != 0 { struct_ser.serialize_field("precision", &self.precision)?; } @@ -2284,7 +2696,7 @@ impl serde::Serialize for Decimal { struct_ser.end() } } -impl<'de> serde::Deserialize<'de> for Decimal { +impl<'de> serde::Deserialize<'de> for Decimal256Type { #[allow(deprecated)] fn deserialize<D>(deserializer: D) 
-> std::result::Result<Self, D::Error> where @@ -2331,13 +2743,13 @@ impl<'de> serde::Deserialize<'de> for Decimal { } struct GeneratedVisitor; impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { - type Value = Decimal; + type Value = Decimal256Type; fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - formatter.write_str("struct datafusion_common.Decimal") + formatter.write_str("struct datafusion_common.Decimal256Type") } - fn visit_map<V>(self, mut map_: V) -> std::result::Result<Decimal, V::Error> + fn visit_map<V>(self, mut map_: V) -> std::result::Result<Decimal256Type, V::Error> where V: serde::de::MapAccess<'de>, { @@ -2363,16 +2775,16 @@ impl<'de> serde::Deserialize<'de> for Decimal { } } } - Ok(Decimal { + Ok(Decimal256Type { precision: precision__.unwrap_or_default(), scale: scale__.unwrap_or_default(), }) } } - deserializer.deserialize_struct("datafusion_common.Decimal", FIELDS, GeneratedVisitor) + deserializer.deserialize_struct("datafusion_common.Decimal256Type", FIELDS, GeneratedVisitor) } } -impl serde::Serialize for Decimal128 { +impl serde::Serialize for Decimal32 { #[allow(deprecated)] fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error> where @@ -2389,7 +2801,7 @@ impl serde::Serialize for Decimal128 { if self.s != 0 { len += 1; } - let mut struct_ser = serializer.serialize_struct("datafusion_common.Decimal128", len)?; + let mut struct_ser = serializer.serialize_struct("datafusion_common.Decimal32", len)?; if !self.value.is_empty() { #[allow(clippy::needless_borrow)] #[allow(clippy::needless_borrows_for_generic_args)] @@ -2408,7 +2820,7 @@ impl serde::Serialize for Decimal128 { struct_ser.end() } } -impl<'de> serde::Deserialize<'de> for Decimal128 { +impl<'de> serde::Deserialize<'de> for Decimal32 { #[allow(deprecated)] fn deserialize<D>(deserializer: D) -> std::result::Result<Self, D::Error> where @@ -2458,13 +2870,13 @@ impl<'de> serde::Deserialize<'de> for Decimal128 { } struct GeneratedVisitor; impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { - type Value = Decimal128; + type Value = Decimal32; fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - formatter.write_str("struct datafusion_common.Decimal128") + formatter.write_str("struct datafusion_common.Decimal32") } - fn visit_map<V>(self, mut map_: V) -> std::result::Result<Decimal128, V::Error> + fn visit_map<V>(self, mut map_: V) -> std::result::Result<Decimal32, V::Error> where V: serde::de::MapAccess<'de>, { @@ -2499,17 +2911,129 @@ impl<'de> serde::Deserialize<'de> for Decimal128 { } } } - Ok(Decimal128 { + Ok(Decimal32 { value: value__.unwrap_or_default(), p: p__.unwrap_or_default(), s: s__.unwrap_or_default(), }) } } - deserializer.deserialize_struct("datafusion_common.Decimal128", FIELDS, GeneratedVisitor) + deserializer.deserialize_struct("datafusion_common.Decimal32", FIELDS, GeneratedVisitor) } } -impl serde::Serialize for Decimal256 { +impl serde::Serialize for Decimal32Type { + #[allow(deprecated)] + fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error> + where + S: serde::Serializer, + { + use serde::ser::SerializeStruct; + let mut len = 0; + if self.precision != 0 { + len += 1; + } + if self.scale != 0 { + len += 1; + } + let mut struct_ser = serializer.serialize_struct("datafusion_common.Decimal32Type", len)?; + if self.precision != 0 { + struct_ser.serialize_field("precision", &self.precision)?; + } + if self.scale != 0 { + struct_ser.serialize_field("scale", &self.scale)?; + 
} + struct_ser.end() + } +} +impl<'de> serde::Deserialize<'de> for Decimal32Type { + #[allow(deprecated)] + fn deserialize<D>(deserializer: D) -> std::result::Result<Self, D::Error> + where + D: serde::Deserializer<'de>, + { + const FIELDS: &[&str] = &[ + "precision", + "scale", + ]; + + #[allow(clippy::enum_variant_names)] + enum GeneratedField { + Precision, + Scale, + } + impl<'de> serde::Deserialize<'de> for GeneratedField { + fn deserialize<D>(deserializer: D) -> std::result::Result<GeneratedField, D::Error> + where + D: serde::Deserializer<'de>, + { + struct GeneratedVisitor; + + impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { + type Value = GeneratedField; + + fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(formatter, "expected one of: {:?}", &FIELDS) + } + + #[allow(unused_variables)] + fn visit_str<E>(self, value: &str) -> std::result::Result<GeneratedField, E> + where + E: serde::de::Error, + { + match value { + "precision" => Ok(GeneratedField::Precision), + "scale" => Ok(GeneratedField::Scale), + _ => Err(serde::de::Error::unknown_field(value, FIELDS)), + } + } + } + deserializer.deserialize_identifier(GeneratedVisitor) + } + } + struct GeneratedVisitor; + impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { + type Value = Decimal32Type; + + fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + formatter.write_str("struct datafusion_common.Decimal32Type") + } + + fn visit_map<V>(self, mut map_: V) -> std::result::Result<Decimal32Type, V::Error> + where + V: serde::de::MapAccess<'de>, + { + let mut precision__ = None; + let mut scale__ = None; + while let Some(k) = map_.next_key()? { + match k { + GeneratedField::Precision => { + if precision__.is_some() { + return Err(serde::de::Error::duplicate_field("precision")); + } + precision__ = + Some(map_.next_value::<::pbjson::private::NumberDeserialize<_>>()?.0) + ; + } + GeneratedField::Scale => { + if scale__.is_some() { + return Err(serde::de::Error::duplicate_field("scale")); + } + scale__ = + Some(map_.next_value::<::pbjson::private::NumberDeserialize<_>>()?.0) + ; + } + } + } + Ok(Decimal32Type { + precision: precision__.unwrap_or_default(), + scale: scale__.unwrap_or_default(), + }) + } + } + deserializer.deserialize_struct("datafusion_common.Decimal32Type", FIELDS, GeneratedVisitor) + } +} +impl serde::Serialize for Decimal64 { #[allow(deprecated)] fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error> where @@ -2526,7 +3050,7 @@ impl serde::Serialize for Decimal256 { if self.s != 0 { len += 1; } - let mut struct_ser = serializer.serialize_struct("datafusion_common.Decimal256", len)?; + let mut struct_ser = serializer.serialize_struct("datafusion_common.Decimal64", len)?; if !self.value.is_empty() { #[allow(clippy::needless_borrow)] #[allow(clippy::needless_borrows_for_generic_args)] @@ -2545,7 +3069,7 @@ impl serde::Serialize for Decimal256 { struct_ser.end() } } -impl<'de> serde::Deserialize<'de> for Decimal256 { +impl<'de> serde::Deserialize<'de> for Decimal64 { #[allow(deprecated)] fn deserialize<D>(deserializer: D) -> std::result::Result<Self, D::Error> where @@ -2595,13 +3119,13 @@ impl<'de> serde::Deserialize<'de> for Decimal256 { } struct GeneratedVisitor; impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { - type Value = Decimal256; + type Value = Decimal64; fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - formatter.write_str("struct datafusion_common.Decimal256") + 
formatter.write_str("struct datafusion_common.Decimal64") } - fn visit_map<V>(self, mut map_: V) -> std::result::Result<Decimal256, V::Error> + fn visit_map<V>(self, mut map_: V) -> std::result::Result<Decimal64, V::Error> where V: serde::de::MapAccess<'de>, { @@ -2636,17 +3160,17 @@ impl<'de> serde::Deserialize<'de> for Decimal256 { } } } - Ok(Decimal256 { + Ok(Decimal64 { value: value__.unwrap_or_default(), p: p__.unwrap_or_default(), s: s__.unwrap_or_default(), }) } } - deserializer.deserialize_struct("datafusion_common.Decimal256", FIELDS, GeneratedVisitor) + deserializer.deserialize_struct("datafusion_common.Decimal64", FIELDS, GeneratedVisitor) } } -impl serde::Serialize for Decimal256Type { +impl serde::Serialize for Decimal64Type { #[allow(deprecated)] fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error> where @@ -2660,7 +3184,7 @@ impl serde::Serialize for Decimal256Type { if self.scale != 0 { len += 1; } - let mut struct_ser = serializer.serialize_struct("datafusion_common.Decimal256Type", len)?; + let mut struct_ser = serializer.serialize_struct("datafusion_common.Decimal64Type", len)?; if self.precision != 0 { struct_ser.serialize_field("precision", &self.precision)?; } @@ -2670,7 +3194,7 @@ impl serde::Serialize for Decimal256Type { struct_ser.end() } } -impl<'de> serde::Deserialize<'de> for Decimal256Type { +impl<'de> serde::Deserialize<'de> for Decimal64Type { #[allow(deprecated)] fn deserialize<D>(deserializer: D) -> std::result::Result<Self, D::Error> where @@ -2717,13 +3241,13 @@ impl<'de> serde::Deserialize<'de> for Decimal256Type { } struct GeneratedVisitor; impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { - type Value = Decimal256Type; + type Value = Decimal64Type; fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - formatter.write_str("struct datafusion_common.Decimal256Type") + formatter.write_str("struct datafusion_common.Decimal64Type") } - fn visit_map<V>(self, mut map_: V) -> std::result::Result<Decimal256Type, V::Error> + fn visit_map<V>(self, mut map_: V) -> std::result::Result<Decimal64Type, V::Error> where V: serde::de::MapAccess<'de>, { @@ -2749,13 +3273,13 @@ impl<'de> serde::Deserialize<'de> for Decimal256Type { } } } - Ok(Decimal256Type { + Ok(Decimal64Type { precision: precision__.unwrap_or_default(), scale: scale__.unwrap_or_default(), }) } } - deserializer.deserialize_struct("datafusion_common.Decimal256Type", FIELDS, GeneratedVisitor) + deserializer.deserialize_struct("datafusion_common.Decimal64Type", FIELDS, GeneratedVisitor) } } impl serde::Serialize for DfField { @@ -4589,9 +5113,6 @@ impl serde::Serialize for ParquetColumnOptions { if self.bloom_filter_ndv_opt.is_some() { len += 1; } - if self.max_statistics_size_opt.is_some() { - len += 1; - } let mut struct_ser = serializer.serialize_struct("datafusion_common.ParquetColumnOptions", len)?; if let Some(v) = self.bloom_filter_enabled_opt.as_ref() { match v { @@ -4644,13 +5165,6 @@ impl serde::Serialize for ParquetColumnOptions { } } } - if let Some(v) = self.max_statistics_size_opt.as_ref() { - match v { - parquet_column_options::MaxStatisticsSizeOpt::MaxStatisticsSize(v) => { - struct_ser.serialize_field("maxStatisticsSize", v)?; - } - } - } struct_ser.end() } } @@ -4673,8 +5187,6 @@ impl<'de> serde::Deserialize<'de> for ParquetColumnOptions { "bloomFilterFpp", "bloom_filter_ndv", "bloomFilterNdv", - "max_statistics_size", - "maxStatisticsSize", ]; #[allow(clippy::enum_variant_names)] @@ -4686,7 +5198,6 @@ impl<'de> 
serde::Deserialize<'de> for ParquetColumnOptions { StatisticsEnabled, BloomFilterFpp, BloomFilterNdv, - MaxStatisticsSize, } impl<'de> serde::Deserialize<'de> for GeneratedField { fn deserialize<D>(deserializer: D) -> std::result::Result<GeneratedField, D::Error> @@ -4715,7 +5226,6 @@ impl<'de> serde::Deserialize<'de> for ParquetColumnOptions { "statisticsEnabled" | "statistics_enabled" => Ok(GeneratedField::StatisticsEnabled), "bloomFilterFpp" | "bloom_filter_fpp" => Ok(GeneratedField::BloomFilterFpp), "bloomFilterNdv" | "bloom_filter_ndv" => Ok(GeneratedField::BloomFilterNdv), - "maxStatisticsSize" | "max_statistics_size" => Ok(GeneratedField::MaxStatisticsSize), _ => Err(serde::de::Error::unknown_field(value, FIELDS)), } } @@ -4742,7 +5252,6 @@ impl<'de> serde::Deserialize<'de> for ParquetColumnOptions { let mut statistics_enabled_opt__ = None; let mut bloom_filter_fpp_opt__ = None; let mut bloom_filter_ndv_opt__ = None; - let mut max_statistics_size_opt__ = None; while let Some(k) = map_.next_key()? { match k { GeneratedField::BloomFilterEnabled => { @@ -4787,12 +5296,6 @@ impl<'de> serde::Deserialize<'de> for ParquetColumnOptions { } bloom_filter_ndv_opt__ = map_.next_value::<::std::option::Option<::pbjson::private::NumberDeserialize<_>>>()?.map(|x| parquet_column_options::BloomFilterNdvOpt::BloomFilterNdv(x.0)); } - GeneratedField::MaxStatisticsSize => { - if max_statistics_size_opt__.is_some() { - return Err(serde::de::Error::duplicate_field("maxStatisticsSize")); - } - max_statistics_size_opt__ = map_.next_value::<::std::option::Option<::pbjson::private::NumberDeserialize<_>>>()?.map(|x| parquet_column_options::MaxStatisticsSizeOpt::MaxStatisticsSize(x.0)); - } } } Ok(ParquetColumnOptions { @@ -4803,7 +5306,6 @@ impl<'de> serde::Deserialize<'de> for ParquetColumnOptions { statistics_enabled_opt: statistics_enabled_opt__, bloom_filter_fpp_opt: bloom_filter_fpp_opt__, bloom_filter_ndv_opt: bloom_filter_ndv_opt__, - max_statistics_size_opt: max_statistics_size_opt__, }) } } @@ -5093,9 +5595,6 @@ impl serde::Serialize for ParquetOptions { if self.statistics_enabled_opt.is_some() { len += 1; } - if self.max_statistics_size_opt.is_some() { - len += 1; - } if self.column_index_truncate_length_opt.is_some() { len += 1; } @@ -5222,15 +5721,6 @@ impl serde::Serialize for ParquetOptions { } } } - if let Some(v) = self.max_statistics_size_opt.as_ref() { - match v { - parquet_options::MaxStatisticsSizeOpt::MaxStatisticsSize(v) => { - #[allow(clippy::needless_borrow)] - #[allow(clippy::needless_borrows_for_generic_args)] - struct_ser.serialize_field("maxStatisticsSize", ToString::to_string(&v).as_str())?; - } - } - } if let Some(v) = self.column_index_truncate_length_opt.as_ref() { match v { parquet_options::ColumnIndexTruncateLengthOpt::ColumnIndexTruncateLength(v) => { @@ -5337,8 +5827,6 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions { "dictionaryEnabled", "statistics_enabled", "statisticsEnabled", - "max_statistics_size", - "maxStatisticsSize", "column_index_truncate_length", "columnIndexTruncateLength", "statistics_truncate_length", @@ -5379,7 +5867,6 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions { Compression, DictionaryEnabled, StatisticsEnabled, - MaxStatisticsSize, ColumnIndexTruncateLength, StatisticsTruncateLength, Encoding, @@ -5432,7 +5919,6 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions { "compression" => Ok(GeneratedField::Compression), "dictionaryEnabled" | "dictionary_enabled" => Ok(GeneratedField::DictionaryEnabled), "statisticsEnabled" | 
"statistics_enabled" => Ok(GeneratedField::StatisticsEnabled), - "maxStatisticsSize" | "max_statistics_size" => Ok(GeneratedField::MaxStatisticsSize), "columnIndexTruncateLength" | "column_index_truncate_length" => Ok(GeneratedField::ColumnIndexTruncateLength), "statisticsTruncateLength" | "statistics_truncate_length" => Ok(GeneratedField::StatisticsTruncateLength), "encoding" => Ok(GeneratedField::Encoding), @@ -5483,7 +5969,6 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions { let mut compression_opt__ = None; let mut dictionary_enabled_opt__ = None; let mut statistics_enabled_opt__ = None; - let mut max_statistics_size_opt__ = None; let mut column_index_truncate_length_opt__ = None; let mut statistics_truncate_length_opt__ = None; let mut encoding_opt__ = None; @@ -5656,12 +6141,6 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions { } statistics_enabled_opt__ = map_.next_value::<::std::option::Option<_>>()?.map(parquet_options::StatisticsEnabledOpt::StatisticsEnabled); } - GeneratedField::MaxStatisticsSize => { - if max_statistics_size_opt__.is_some() { - return Err(serde::de::Error::duplicate_field("maxStatisticsSize")); - } - max_statistics_size_opt__ = map_.next_value::<::std::option::Option<::pbjson::private::NumberDeserialize<_>>>()?.map(|x| parquet_options::MaxStatisticsSizeOpt::MaxStatisticsSize(x.0)); - } GeneratedField::ColumnIndexTruncateLength => { if column_index_truncate_length_opt__.is_some() { return Err(serde::de::Error::duplicate_field("columnIndexTruncateLength")); @@ -5726,7 +6205,6 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions { compression_opt: compression_opt__, dictionary_enabled_opt: dictionary_enabled_opt__, statistics_enabled_opt: statistics_enabled_opt__, - max_statistics_size_opt: max_statistics_size_opt__, column_index_truncate_length_opt: column_index_truncate_length_opt__, statistics_truncate_length_opt: statistics_truncate_length_opt__, encoding_opt: encoding_opt__, @@ -6977,6 +7455,12 @@ impl serde::Serialize for ScalarValue { scalar_value::Value::MapValue(v) => { struct_ser.serialize_field("mapValue", v)?; } + scalar_value::Value::Decimal32Value(v) => { + struct_ser.serialize_field("decimal32Value", v)?; + } + scalar_value::Value::Decimal64Value(v) => { + struct_ser.serialize_field("decimal64Value", v)?; + } scalar_value::Value::Decimal128Value(v) => { struct_ser.serialize_field("decimal128Value", v)?; } @@ -7103,6 +7587,10 @@ impl<'de> serde::Deserialize<'de> for ScalarValue { "structValue", "map_value", "mapValue", + "decimal32_value", + "decimal32Value", + "decimal64_value", + "decimal64Value", "decimal128_value", "decimal128Value", "decimal256_value", @@ -7165,6 +7653,8 @@ impl<'de> serde::Deserialize<'de> for ScalarValue { FixedSizeListValue, StructValue, MapValue, + Decimal32Value, + Decimal64Value, Decimal128Value, Decimal256Value, Date64Value, @@ -7226,6 +7716,8 @@ impl<'de> serde::Deserialize<'de> for ScalarValue { "fixedSizeListValue" | "fixed_size_list_value" => Ok(GeneratedField::FixedSizeListValue), "structValue" | "struct_value" => Ok(GeneratedField::StructValue), "mapValue" | "map_value" => Ok(GeneratedField::MapValue), + "decimal32Value" | "decimal32_value" => Ok(GeneratedField::Decimal32Value), + "decimal64Value" | "decimal64_value" => Ok(GeneratedField::Decimal64Value), "decimal128Value" | "decimal128_value" => Ok(GeneratedField::Decimal128Value), "decimal256Value" | "decimal256_value" => Ok(GeneratedField::Decimal256Value), "date64Value" | "date_64_value" => Ok(GeneratedField::Date64Value), @@ -7403,6 +7895,20 @@ 
impl<'de> serde::Deserialize<'de> for ScalarValue { return Err(serde::de::Error::duplicate_field("mapValue")); } value__ = map_.next_value::<::std::option::Option<_>>()?.map(scalar_value::Value::MapValue) +; + } + GeneratedField::Decimal32Value => { + if value__.is_some() { + return Err(serde::de::Error::duplicate_field("decimal32Value")); + } + value__ = map_.next_value::<::std::option::Option<_>>()?.map(scalar_value::Value::Decimal32Value) +; + } + GeneratedField::Decimal64Value => { + if value__.is_some() { + return Err(serde::de::Error::duplicate_field("decimal64Value")); + } + value__ = map_.next_value::<::std::option::Option<_>>()?.map(scalar_value::Value::Decimal64Value) ; } GeneratedField::Decimal128Value => { diff --git a/datafusion/proto-common/src/generated/prost.rs b/datafusion/proto-common/src/generated/prost.rs index 6ed32d7de0..da8d006f62 100644 --- a/datafusion/proto-common/src/generated/prost.rs +++ b/datafusion/proto-common/src/generated/prost.rs @@ -117,7 +117,21 @@ pub struct Timestamp { pub timezone: ::prost::alloc::string::String, } #[derive(Clone, Copy, PartialEq, ::prost::Message)] -pub struct Decimal { +pub struct Decimal32Type { + #[prost(uint32, tag = "3")] + pub precision: u32, + #[prost(int32, tag = "4")] + pub scale: i32, +} +#[derive(Clone, Copy, PartialEq, ::prost::Message)] +pub struct Decimal64Type { + #[prost(uint32, tag = "3")] + pub precision: u32, + #[prost(int32, tag = "4")] + pub scale: i32, +} +#[derive(Clone, Copy, PartialEq, ::prost::Message)] +pub struct Decimal128Type { #[prost(uint32, tag = "3")] pub precision: u32, #[prost(int32, tag = "4")] @@ -297,7 +311,7 @@ pub struct ScalarFixedSizeBinary { pub struct ScalarValue { #[prost( oneof = "scalar_value::Value", - tags = "33, 1, 2, 3, 23, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 32, 41, 20, 39, 21, 24, 35, 36, 37, 38, 26, 27, 28, 29, 22, 30, 25, 31, 34, 42" + tags = "33, 1, 2, 3, 23, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 32, 41, 43, 44, 20, 39, 21, 24, 35, 36, 37, 38, 26, 27, 28, 29, 22, 30, 25, 31, 34, 42" )] pub value: ::core::option::Option<scalar_value::Value>, } @@ -352,6 +366,10 @@ pub mod scalar_value { StructValue(super::ScalarNestedValue), #[prost(message, tag = "41")] MapValue(super::ScalarNestedValue), + #[prost(message, tag = "43")] + Decimal32Value(super::Decimal32), + #[prost(message, tag = "44")] + Decimal64Value(super::Decimal64), #[prost(message, tag = "20")] Decimal128Value(super::Decimal128), #[prost(message, tag = "39")] @@ -391,6 +409,24 @@ pub mod scalar_value { } } #[derive(Clone, PartialEq, ::prost::Message)] +pub struct Decimal32 { + #[prost(bytes = "vec", tag = "1")] + pub value: ::prost::alloc::vec::Vec<u8>, + #[prost(int64, tag = "2")] + pub p: i64, + #[prost(int64, tag = "3")] + pub s: i64, +} +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct Decimal64 { + #[prost(bytes = "vec", tag = "1")] + pub value: ::prost::alloc::vec::Vec<u8>, + #[prost(int64, tag = "2")] + pub p: i64, + #[prost(int64, tag = "3")] + pub s: i64, +} +#[derive(Clone, PartialEq, ::prost::Message)] pub struct Decimal128 { #[prost(bytes = "vec", tag = "1")] pub value: ::prost::alloc::vec::Vec<u8>, @@ -413,7 +449,7 @@ pub struct Decimal256 { pub struct ArrowType { #[prost( oneof = "arrow_type::ArrowTypeEnum", - tags = "1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 35, 32, 15, 34, 16, 31, 17, 18, 19, 20, 21, 22, 23, 24, 36, 25, 26, 27, 28, 29, 30, 33" + tags = "1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 35, 32, 15, 34, 16, 31, 17, 18, 19, 20, 21, 22, 23, 40, 
41, 24, 36, 25, 26, 27, 28, 29, 30, 33" )] pub arrow_type_enum: ::core::option::Option<arrow_type::ArrowTypeEnum>, } @@ -480,8 +516,12 @@ pub mod arrow_type { Time64(i32), #[prost(enumeration = "super::IntervalUnit", tag = "23")] Interval(i32), + #[prost(message, tag = "40")] + Decimal32(super::Decimal32Type), + #[prost(message, tag = "41")] + Decimal64(super::Decimal64Type), #[prost(message, tag = "24")] - Decimal(super::Decimal), + Decimal128(super::Decimal128Type), #[prost(message, tag = "36")] Decimal256(super::Decimal256Type), #[prost(message, tag = "25")] @@ -662,10 +702,6 @@ pub struct ParquetColumnOptions { pub bloom_filter_ndv_opt: ::core::option::Option< parquet_column_options::BloomFilterNdvOpt, >, - #[prost(oneof = "parquet_column_options::MaxStatisticsSizeOpt", tags = "8")] - pub max_statistics_size_opt: ::core::option::Option< - parquet_column_options::MaxStatisticsSizeOpt, - >, } /// Nested message and enum types in `ParquetColumnOptions`. pub mod parquet_column_options { @@ -704,11 +740,6 @@ pub mod parquet_column_options { #[prost(uint64, tag = "7")] BloomFilterNdv(u64), } - #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] - pub enum MaxStatisticsSizeOpt { - #[prost(uint32, tag = "8")] - MaxStatisticsSize(u32), - } } #[derive(Clone, PartialEq, ::prost::Message)] pub struct ParquetOptions { @@ -789,10 +820,6 @@ pub struct ParquetOptions { pub statistics_enabled_opt: ::core::option::Option< parquet_options::StatisticsEnabledOpt, >, - #[prost(oneof = "parquet_options::MaxStatisticsSizeOpt", tags = "14")] - pub max_statistics_size_opt: ::core::option::Option< - parquet_options::MaxStatisticsSizeOpt, - >, #[prost(oneof = "parquet_options::ColumnIndexTruncateLengthOpt", tags = "17")] pub column_index_truncate_length_opt: ::core::option::Option< parquet_options::ColumnIndexTruncateLengthOpt, @@ -833,11 +860,6 @@ pub mod parquet_options { StatisticsEnabled(::prost::alloc::string::String), } #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] - pub enum MaxStatisticsSizeOpt { - #[prost(uint64, tag = "14")] - MaxStatisticsSize(u64), - } - #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] pub enum ColumnIndexTruncateLengthOpt { #[prost(uint64, tag = "17")] ColumnIndexTruncateLength(u64), diff --git a/datafusion/proto-common/src/to_proto/mod.rs b/datafusion/proto-common/src/to_proto/mod.rs index 0bd6f09bb3..0f0204b3a6 100644 --- a/datafusion/proto-common/src/to_proto/mod.rs +++ b/datafusion/proto-common/src/to_proto/mod.rs @@ -189,7 +189,15 @@ impl TryFrom<&DataType> for protobuf::arrow_type::ArrowTypeEnum { value: Some(Box::new(value_type.as_ref().try_into()?)), })) } - DataType::Decimal128(precision, scale) => Self::Decimal(protobuf::Decimal { + DataType::Decimal32(precision, scale) => Self::Decimal32(protobuf::Decimal32Type { + precision: *precision as u32, + scale: *scale as i32, + }), + DataType::Decimal64(precision, scale) => Self::Decimal64(protobuf::Decimal64Type { + precision: *precision as u32, + scale: *scale as i32, + }), + DataType::Decimal128(precision, scale) => Self::Decimal128(protobuf::Decimal128Type { precision: *precision as u32, scale: *scale as i32, }), @@ -817,8 +825,6 @@ impl TryFrom<&ParquetOptions> for protobuf::ParquetOptions { dictionary_enabled_opt: value.dictionary_enabled.map(protobuf::parquet_options::DictionaryEnabledOpt::DictionaryEnabled), dictionary_page_size_limit: value.dictionary_page_size_limit as u64, statistics_enabled_opt: value.statistics_enabled.clone().map(protobuf::parquet_options::StatisticsEnabledOpt::StatisticsEnabled), - 
#[allow(deprecated)] - max_statistics_size_opt: value.max_statistics_size.map(|v| protobuf::parquet_options::MaxStatisticsSizeOpt::MaxStatisticsSize(v as u64)), max_row_group_size: value.max_row_group_size as u64, created_by: value.created_by.clone(), column_index_truncate_length_opt: value.column_index_truncate_length.map(|v| protobuf::parquet_options::ColumnIndexTruncateLengthOpt::ColumnIndexTruncateLength(v as u64)), @@ -859,12 +865,6 @@ impl TryFrom<&ParquetColumnOptions> for protobuf::ParquetColumnOptions { .statistics_enabled .clone() .map(protobuf::parquet_column_options::StatisticsEnabledOpt::StatisticsEnabled), - #[allow(deprecated)] - max_statistics_size_opt: value.max_statistics_size.map(|v| { - protobuf::parquet_column_options::MaxStatisticsSizeOpt::MaxStatisticsSize( - v as u32, - ) - }), encoding_opt: value .encoding .clone() diff --git a/datafusion/proto/src/generated/datafusion_proto_common.rs b/datafusion/proto/src/generated/datafusion_proto_common.rs index 6ed32d7de0..da8d006f62 100644 --- a/datafusion/proto/src/generated/datafusion_proto_common.rs +++ b/datafusion/proto/src/generated/datafusion_proto_common.rs @@ -117,7 +117,21 @@ pub struct Timestamp { pub timezone: ::prost::alloc::string::String, } #[derive(Clone, Copy, PartialEq, ::prost::Message)] -pub struct Decimal { +pub struct Decimal32Type { + #[prost(uint32, tag = "3")] + pub precision: u32, + #[prost(int32, tag = "4")] + pub scale: i32, +} +#[derive(Clone, Copy, PartialEq, ::prost::Message)] +pub struct Decimal64Type { + #[prost(uint32, tag = "3")] + pub precision: u32, + #[prost(int32, tag = "4")] + pub scale: i32, +} +#[derive(Clone, Copy, PartialEq, ::prost::Message)] +pub struct Decimal128Type { #[prost(uint32, tag = "3")] pub precision: u32, #[prost(int32, tag = "4")] @@ -297,7 +311,7 @@ pub struct ScalarFixedSizeBinary { pub struct ScalarValue { #[prost( oneof = "scalar_value::Value", - tags = "33, 1, 2, 3, 23, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 32, 41, 20, 39, 21, 24, 35, 36, 37, 38, 26, 27, 28, 29, 22, 30, 25, 31, 34, 42" + tags = "33, 1, 2, 3, 23, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 32, 41, 43, 44, 20, 39, 21, 24, 35, 36, 37, 38, 26, 27, 28, 29, 22, 30, 25, 31, 34, 42" )] pub value: ::core::option::Option<scalar_value::Value>, } @@ -352,6 +366,10 @@ pub mod scalar_value { StructValue(super::ScalarNestedValue), #[prost(message, tag = "41")] MapValue(super::ScalarNestedValue), + #[prost(message, tag = "43")] + Decimal32Value(super::Decimal32), + #[prost(message, tag = "44")] + Decimal64Value(super::Decimal64), #[prost(message, tag = "20")] Decimal128Value(super::Decimal128), #[prost(message, tag = "39")] @@ -391,6 +409,24 @@ pub mod scalar_value { } } #[derive(Clone, PartialEq, ::prost::Message)] +pub struct Decimal32 { + #[prost(bytes = "vec", tag = "1")] + pub value: ::prost::alloc::vec::Vec<u8>, + #[prost(int64, tag = "2")] + pub p: i64, + #[prost(int64, tag = "3")] + pub s: i64, +} +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct Decimal64 { + #[prost(bytes = "vec", tag = "1")] + pub value: ::prost::alloc::vec::Vec<u8>, + #[prost(int64, tag = "2")] + pub p: i64, + #[prost(int64, tag = "3")] + pub s: i64, +} +#[derive(Clone, PartialEq, ::prost::Message)] pub struct Decimal128 { #[prost(bytes = "vec", tag = "1")] pub value: ::prost::alloc::vec::Vec<u8>, @@ -413,7 +449,7 @@ pub struct Decimal256 { pub struct ArrowType { #[prost( oneof = "arrow_type::ArrowTypeEnum", - tags = "1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 35, 32, 15, 34, 16, 31, 17, 18, 19, 
20, 21, 22, 23, 24, 36, 25, 26, 27, 28, 29, 30, 33" + tags = "1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 35, 32, 15, 34, 16, 31, 17, 18, 19, 20, 21, 22, 23, 40, 41, 24, 36, 25, 26, 27, 28, 29, 30, 33" )] pub arrow_type_enum: ::core::option::Option<arrow_type::ArrowTypeEnum>, } @@ -480,8 +516,12 @@ pub mod arrow_type { Time64(i32), #[prost(enumeration = "super::IntervalUnit", tag = "23")] Interval(i32), + #[prost(message, tag = "40")] + Decimal32(super::Decimal32Type), + #[prost(message, tag = "41")] + Decimal64(super::Decimal64Type), #[prost(message, tag = "24")] - Decimal(super::Decimal), + Decimal128(super::Decimal128Type), #[prost(message, tag = "36")] Decimal256(super::Decimal256Type), #[prost(message, tag = "25")] @@ -662,10 +702,6 @@ pub struct ParquetColumnOptions { pub bloom_filter_ndv_opt: ::core::option::Option< parquet_column_options::BloomFilterNdvOpt, >, - #[prost(oneof = "parquet_column_options::MaxStatisticsSizeOpt", tags = "8")] - pub max_statistics_size_opt: ::core::option::Option< - parquet_column_options::MaxStatisticsSizeOpt, - >, } /// Nested message and enum types in `ParquetColumnOptions`. pub mod parquet_column_options { @@ -704,11 +740,6 @@ pub mod parquet_column_options { #[prost(uint64, tag = "7")] BloomFilterNdv(u64), } - #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] - pub enum MaxStatisticsSizeOpt { - #[prost(uint32, tag = "8")] - MaxStatisticsSize(u32), - } } #[derive(Clone, PartialEq, ::prost::Message)] pub struct ParquetOptions { @@ -789,10 +820,6 @@ pub struct ParquetOptions { pub statistics_enabled_opt: ::core::option::Option< parquet_options::StatisticsEnabledOpt, >, - #[prost(oneof = "parquet_options::MaxStatisticsSizeOpt", tags = "14")] - pub max_statistics_size_opt: ::core::option::Option< - parquet_options::MaxStatisticsSizeOpt, - >, #[prost(oneof = "parquet_options::ColumnIndexTruncateLengthOpt", tags = "17")] pub column_index_truncate_length_opt: ::core::option::Option< parquet_options::ColumnIndexTruncateLengthOpt, @@ -833,11 +860,6 @@ pub mod parquet_options { StatisticsEnabled(::prost::alloc::string::String), } #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] - pub enum MaxStatisticsSizeOpt { - #[prost(uint64, tag = "14")] - MaxStatisticsSize(u64), - } - #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] pub enum ColumnIndexTruncateLengthOpt { #[prost(uint64, tag = "17")] ColumnIndexTruncateLength(u64), diff --git a/datafusion/proto/src/logical_plan/file_formats.rs b/datafusion/proto/src/logical_plan/file_formats.rs index 1e0d76bc67..139d615e98 100644 --- a/datafusion/proto/src/logical_plan/file_formats.rs +++ b/datafusion/proto/src/logical_plan/file_formats.rs @@ -382,9 +382,6 @@ impl TableParquetOptionsProto { statistics_enabled_opt: global_options.global.statistics_enabled.map(|enabled| { parquet_options::StatisticsEnabledOpt::StatisticsEnabled(enabled) }), - max_statistics_size_opt: global_options.global.max_statistics_size.map(|size| { - parquet_options::MaxStatisticsSizeOpt::MaxStatisticsSize(size as u64) - }), max_row_group_size: global_options.global.max_row_group_size as u64, created_by: global_options.global.created_by.clone(), column_index_truncate_length_opt: global_options.global.column_index_truncate_length.map(|length| { @@ -441,9 +438,6 @@ impl TableParquetOptionsProto { bloom_filter_ndv_opt: options.bloom_filter_ndv.map(|ndv| { parquet_column_options::BloomFilterNdvOpt::BloomFilterNdv(ndv) }), - max_statistics_size_opt: options.max_statistics_size.map(|size| { - 
parquet_column_options::MaxStatisticsSizeOpt::MaxStatisticsSize(size as u32) - }), }) } }).collect(), @@ -482,9 +476,6 @@ impl From<&ParquetOptionsProto> for ParquetOptions { statistics_enabled: proto.statistics_enabled_opt.as_ref().map(|opt| match opt { parquet_options::StatisticsEnabledOpt::StatisticsEnabled(statistics) => statistics.clone(), }), - max_statistics_size: proto.max_statistics_size_opt.as_ref().map(|opt| match opt { - parquet_options::MaxStatisticsSizeOpt::MaxStatisticsSize(size) => *size as usize, - }), max_row_group_size: proto.max_row_group_size as usize, created_by: proto.created_by.clone(), column_index_truncate_length: proto.column_index_truncate_length_opt.as_ref().map(|opt| match opt { @@ -544,11 +535,6 @@ impl From<ParquetColumnOptionsProto> for ParquetColumnOptions { bloom_filter_ndv: proto .bloom_filter_ndv_opt .map(|parquet_column_options::BloomFilterNdvOpt::BloomFilterNdv(v)| v), - max_statistics_size: proto.max_statistics_size_opt.map( - |parquet_column_options::MaxStatisticsSizeOpt::MaxStatisticsSize(v)| { - v as usize - }, - ), } } } diff --git a/datafusion/sql/src/unparser/expr.rs b/datafusion/sql/src/unparser/expr.rs index 4c0dc31661..56a1fb6b68 100644 --- a/datafusion/sql/src/unparser/expr.rs +++ b/datafusion/sql/src/unparser/expr.rs @@ -1718,6 +1718,12 @@ impl Unparser<'_> { not_impl_err!("Unsupported DataType: conversion: {data_type:?}") } DataType::Dictionary(_, val) => self.arrow_dtype_to_ast_dtype(val), + DataType::Decimal32(_precision, _scale) => { + not_impl_err!("Unsupported DataType: conversion: {data_type:?}") + } + DataType::Decimal64(_precision, _scale) => { + not_impl_err!("Unsupported DataType: conversion: {data_type:?}") + } DataType::Decimal128(precision, scale) | DataType::Decimal256(precision, scale) => { let mut new_precision = *precision as u64; diff --git a/datafusion/sqllogictest/test_files/copy.slt b/datafusion/sqllogictest/test_files/copy.slt index 5eeb05e814..096cde86f2 100644 --- a/datafusion/sqllogictest/test_files/copy.slt +++ b/datafusion/sqllogictest/test_files/copy.slt @@ -332,7 +332,6 @@ OPTIONS ( 'format.dictionary_enabled' false, 'format.statistics_enabled' page, 'format.statistics_enabled::col2' none, -'format.max_statistics_size' 123, 'format.bloom_filter_fpp' 0.001, 'format.bloom_filter_ndv' 100, 'format.metadata::key' 'value' diff --git a/datafusion/sqllogictest/test_files/information_schema.slt b/datafusion/sqllogictest/test_files/information_schema.slt index de68352d0d..2cdfff4400 100644 --- a/datafusion/sqllogictest/test_files/information_schema.slt +++ b/datafusion/sqllogictest/test_files/information_schema.slt @@ -243,7 +243,6 @@ datafusion.execution.parquet.dictionary_page_size_limit 1048576 datafusion.execution.parquet.enable_page_index true datafusion.execution.parquet.encoding NULL datafusion.execution.parquet.max_row_group_size 1048576 -datafusion.execution.parquet.max_statistics_size 4096 datafusion.execution.parquet.maximum_buffered_record_batches_per_stream 2 datafusion.execution.parquet.maximum_parallel_row_group_writers 1 datafusion.execution.parquet.metadata_size_hint NULL @@ -254,7 +253,7 @@ datafusion.execution.parquet.schema_force_view_types true datafusion.execution.parquet.skip_arrow_metadata false datafusion.execution.parquet.skip_metadata true datafusion.execution.parquet.statistics_enabled page -datafusion.execution.parquet.statistics_truncate_length NULL +datafusion.execution.parquet.statistics_truncate_length 64 datafusion.execution.parquet.write_batch_size 1024 
datafusion.execution.parquet.writer_version 1.0 datafusion.execution.planning_concurrency 13 @@ -358,7 +357,6 @@ datafusion.execution.parquet.dictionary_page_size_limit 1048576 (writing) Sets b datafusion.execution.parquet.enable_page_index true (reading) If true, reads the Parquet data page level metadata (the Page Index), if present, to reduce the I/O and number of rows decoded. datafusion.execution.parquet.encoding NULL (writing) Sets default encoding for any column. Valid values are: plain, plain_dictionary, rle, bit_packed, delta_binary_packed, delta_length_byte_array, delta_byte_array, rle_dictionary, and byte_stream_split. These values are not case sensitive. If NULL, uses default parquet writer setting datafusion.execution.parquet.max_row_group_size 1048576 (writing) Target maximum number of rows in each row group (defaults to 1M rows). Writing larger row groups requires more memory to write, but can get better compression and be faster to read. -datafusion.execution.parquet.max_statistics_size 4096 (writing) Sets max statistics size for any column. If NULL, uses default parquet writer setting max_statistics_size is deprecated, currently it is not being used datafusion.execution.parquet.maximum_buffered_record_batches_per_stream 2 (writing) By default parallel parquet writer is tuned for minimum memory usage in a streaming execution plan. You may see a performance benefit when writing large parquet files by increasing maximum_parallel_row_group_writers and maximum_buffered_record_batches_per_stream if your system has idle cores and can tolerate additional memory usage. Boosting these values is likely worthwhile when writing out already in-me [...] datafusion.execution.parquet.maximum_parallel_row_group_writers 1 (writing) By default parallel parquet writer is tuned for minimum memory usage in a streaming execution plan. You may see a performance benefit when writing large parquet files by increasing maximum_parallel_row_group_writers and maximum_buffered_record_batches_per_stream if your system has idle cores and can tolerate additional memory usage. Boosting these values is likely worthwhile when writing out already in-memory dat [...] datafusion.execution.parquet.metadata_size_hint NULL (reading) If specified, the parquet reader will try and fetch the last `size_hint` bytes of the parquet file optimistically. If not specified, two reads are required: One read to fetch the 8-byte parquet footer and another to fetch the metadata length encoded in the footer @@ -369,7 +367,7 @@ datafusion.execution.parquet.schema_force_view_types true (reading) If true, par datafusion.execution.parquet.skip_arrow_metadata false (writing) Skip encoding the embedded arrow metadata in the KV_meta This is analogous to the `ArrowWriterOptions::with_skip_arrow_metadata`. Refer to <https://docs.rs/parquet/53.3.0/parquet/arrow/arrow_writer/struct.ArrowWriterOptions.html#method.with_skip_arrow_metadata> datafusion.execution.parquet.skip_metadata true (reading) If true, the parquet reader skip the optional embedded metadata that may be in the file Schema. This setting can help avoid schema conflicts when querying multiple parquet files with schemas containing compatible types but different metadata datafusion.execution.parquet.statistics_enabled page (writing) Sets if statistics are enabled for any column Valid values are: "none", "chunk", and "page" These values are not case sensitive. 
If NULL, uses default parquet writer setting -datafusion.execution.parquet.statistics_truncate_length NULL (writing) Sets statictics truncate length. If NULL, uses default parquet writer setting +datafusion.execution.parquet.statistics_truncate_length 64 (writing) Sets statistics truncate length. If NULL, uses default parquet writer setting datafusion.execution.parquet.write_batch_size 1024 (writing) Sets write_batch_size in bytes datafusion.execution.parquet.writer_version 1.0 (writing) Sets parquet writer version valid values are "1.0" and "2.0" datafusion.execution.planning_concurrency 13 Fan-out during initial physical planning. This is mostly use to plan `UNION` children in parallel. Defaults to the number of CPU cores on the system diff --git a/datafusion/sqllogictest/test_files/listing_table_statistics.slt b/datafusion/sqllogictest/test_files/listing_table_statistics.slt index 890d1f2e92..7203767345 100644 --- a/datafusion/sqllogictest/test_files/listing_table_statistics.slt +++ b/datafusion/sqllogictest/test_files/listing_table_statistics.slt @@ -35,7 +35,7 @@ query TT explain format indent select * from t; ---- logical_plan TableScan: t projection=[int_col, str_col] -physical_plan DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/table/1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/table/2.parquet]]}, projection=[int_col, str_col], file_type=parquet, statistics=[Rows=Exact(4), Bytes=Exact(288), [(Col[0]: Min=Exact(Int64(-1)) Max=Exact(Int64(3)) Null=Exact(0)),(Col[1]: Min=Exact(Utf8View("a")) Max=Exact(Utf8View("d")) Null=Exact(0))]] +physical_plan DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/table/1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/table/2.parquet]]}, projection=[int_col, str_col], file_type=parquet, statistics=[Rows=Exact(4), Bytes=Exact(212), [(Col[0]: Min=Exact(Int64(-1)) Max=Exact(Int64(3)) Null=Exact(0)),(Col[1]: Min=Exact(Utf8View("a")) Max=Exact(Utf8View("d")) Null=Exact(0))]] statement ok drop table t; diff --git a/datafusion/sqllogictest/test_files/parquet_statistics.slt b/datafusion/sqllogictest/test_files/parquet_statistics.slt index efbe69bd85..6c0c96c74c 100644 --- a/datafusion/sqllogictest/test_files/parquet_statistics.slt +++ b/datafusion/sqllogictest/test_files/parquet_statistics.slt @@ -59,11 +59,11 @@ query TT EXPLAIN SELECT * FROM test_table WHERE column1 = 1; ---- physical_plan -01)CoalesceBatchesExec: target_batch_size=8192, statistics=[Rows=Inexact(2), Bytes=Inexact(44), [(Col[0]: Min=Exact(Int64(1)) Max=Exact(Int64(1)) Null=Inexact(0))]] -02)--FilterExec: column1@0 = 1, statistics=[Rows=Inexact(2), Bytes=Inexact(44), [(Col[0]: Min=Exact(Int64(1)) Max=Exact(Int64(1)) Null=Inexact(0))]] -03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2, statistics=[Rows=Inexact(5), Bytes=Inexact(173), [(Col[0]: Min=Inexact(Int64(1)) Max=Inexact(Int64(4)) Null=Inexact(0))]] +01)CoalesceBatchesExec: target_batch_size=8192, statistics=[Rows=Inexact(2), Bytes=Inexact(31), [(Col[0]: Min=Exact(Int64(1)) Max=Exact(Int64(1)) Null=Inexact(0))]] +02)--FilterExec: column1@0 = 1, statistics=[Rows=Inexact(2), Bytes=Inexact(31), [(Col[0]: Min=Exact(Int64(1)) Max=Exact(Int64(1)) Null=Inexact(0))]] +03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2, statistics=[Rows=Inexact(5), Bytes=Inexact(121), [(Col[0]: Min=Inexact(Int64(1)) Max=Inexact(Int64(4)) Null=Inexact(0))]] 
04)------DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_statistics/test_table/0.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_statistics/test_table/1.parquet]]}, projection=[column1], file_type=parquet, predicate=column1@0 = 1, pruning_predicate=column1_null_count@2 != row_count@3 AND column1_min@0 <= 1 AND 1 <= column1_max@1, required_guarantees=[column1 in (1)] -05), statistics=[Rows=Inexact(5), Bytes=Inexact(173), [(Col[0]: Min=Inexact(Int64(1)) Max=Inexact(Int64(4)) Null=Inexact(0))]] +05), statistics=[Rows=Inexact(5), Bytes=Inexact(121), [(Col[0]: Min=Inexact(Int64(1)) Max=Inexact(Int64(4)) Null=Inexact(0))]] # cleanup statement ok @@ -86,11 +86,11 @@ query TT EXPLAIN SELECT * FROM test_table WHERE column1 = 1; ---- physical_plan -01)CoalesceBatchesExec: target_batch_size=8192, statistics=[Rows=Inexact(2), Bytes=Inexact(44), [(Col[0]: Min=Exact(Int64(1)) Max=Exact(Int64(1)) Null=Inexact(0))]] -02)--FilterExec: column1@0 = 1, statistics=[Rows=Inexact(2), Bytes=Inexact(44), [(Col[0]: Min=Exact(Int64(1)) Max=Exact(Int64(1)) Null=Inexact(0))]] -03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2, statistics=[Rows=Inexact(5), Bytes=Inexact(173), [(Col[0]: Min=Inexact(Int64(1)) Max=Inexact(Int64(4)) Null=Inexact(0))]] +01)CoalesceBatchesExec: target_batch_size=8192, statistics=[Rows=Inexact(2), Bytes=Inexact(31), [(Col[0]: Min=Exact(Int64(1)) Max=Exact(Int64(1)) Null=Inexact(0))]] +02)--FilterExec: column1@0 = 1, statistics=[Rows=Inexact(2), Bytes=Inexact(31), [(Col[0]: Min=Exact(Int64(1)) Max=Exact(Int64(1)) Null=Inexact(0))]] +03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2, statistics=[Rows=Inexact(5), Bytes=Inexact(121), [(Col[0]: Min=Inexact(Int64(1)) Max=Inexact(Int64(4)) Null=Inexact(0))]] 04)------DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_statistics/test_table/0.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_statistics/test_table/1.parquet]]}, projection=[column1], file_type=parquet, predicate=column1@0 = 1, pruning_predicate=column1_null_count@2 != row_count@3 AND column1_min@0 <= 1 AND 1 <= column1_max@1, required_guarantees=[column1 in (1)] -05), statistics=[Rows=Inexact(5), Bytes=Inexact(173), [(Col[0]: Min=Inexact(Int64(1)) Max=Inexact(Int64(4)) Null=Inexact(0))]] +05), statistics=[Rows=Inexact(5), Bytes=Inexact(121), [(Col[0]: Min=Inexact(Int64(1)) Max=Inexact(Int64(4)) Null=Inexact(0))]] # cleanup statement ok diff --git a/datafusion/sqllogictest/test_files/repartition_scan.slt b/datafusion/sqllogictest/test_files/repartition_scan.slt index 0b851f9178..c536c8165c 100644 --- a/datafusion/sqllogictest/test_files/repartition_scan.slt +++ b/datafusion/sqllogictest/test_files/repartition_scan.slt @@ -61,7 +61,7 @@ logical_plan physical_plan 01)CoalesceBatchesExec: target_batch_size=8192 02)--FilterExec: column1@0 != 42 -03)----DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..141], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:141..282], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:282..423], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:423..563]]}, p [...] 
+03)----DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..135], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:135..270], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:270..405], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:405..537]]}, p [...] # disable round robin repartitioning statement ok @@ -77,7 +77,7 @@ logical_plan physical_plan 01)CoalesceBatchesExec: target_batch_size=8192 02)--FilterExec: column1@0 != 42 -03)----DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..141], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:141..282], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:282..423], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:423..563]]}, p [...] +03)----DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..135], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:135..270], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:270..405], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:405..537]]}, p [...] # enable round robin repartitioning again statement ok @@ -102,7 +102,7 @@ physical_plan 02)--SortExec: expr=[column1@0 ASC NULLS LAST], preserve_partitioning=[true] 03)----CoalesceBatchesExec: target_batch_size=8192 04)------FilterExec: column1@0 != 42 -05)--------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:0..280], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:280..554, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..6], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:6..286], [WORKSP [...] +05)--------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:0..266], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:266..526, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..6], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:6..272], [WORKSP [...] 
## Read the files as though they are ordered @@ -138,7 +138,7 @@ physical_plan 01)SortPreservingMergeExec: [column1@0 ASC NULLS LAST] 02)--CoalesceBatchesExec: target_batch_size=8192 03)----FilterExec: column1@0 != 42 -04)------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:0..277], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..281], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:281..563], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:277..554]]}, p [...] +04)------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:0..263], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..268], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:268..537], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:263..526]]}, p [...] # Cleanup statement ok diff --git a/datafusion/substrait/src/logical_plan/consumer/utils.rs b/datafusion/substrait/src/logical_plan/consumer/utils.rs index 67215e8e34..f7eedcb7a2 100644 --- a/datafusion/substrait/src/logical_plan/consumer/utils.rs +++ b/datafusion/substrait/src/logical_plan/consumer/utils.rs @@ -213,6 +213,8 @@ pub fn rename_data_type( | DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View + | DataType::Decimal32(_, _) + | DataType::Decimal64(_, _) | DataType::Decimal128(_, _) | DataType::Decimal256(_, _) => Ok(data_type.clone()), } diff --git a/docs/source/library-user-guide/upgrading.md b/docs/source/library-user-guide/upgrading.md index 268b156608..0c9e10476b 100644 --- a/docs/source/library-user-guide/upgrading.md +++ b/docs/source/library-user-guide/upgrading.md @@ -24,6 +24,12 @@ **Note:** DataFusion `50.0.0` has not been released yet. The information provided in this section pertains to features and changes that have already been merged to the main branch and are awaiting release in this version. You can see the current [status of the `50.0.0 `release here](https://github.com/apache/datafusion/issues/16799) +### Upgrade to arrow `56.0.0` and parquet `56.0.0` + +This version of DataFusion upgrades the underlying Apache Arrow implementation +to version `56.0.0`. See the [release notes](https://github.com/apache/arrow-rs/releases/tag/56.0.0) +for more details. + ## DataFusion `49.0.0` ### `MSRV` updated to 1.85.1 diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md index da162b741b..7c760e7de4 100644 --- a/docs/source/user-guide/configs.md +++ b/docs/source/user-guide/configs.md @@ -69,11 +69,10 @@ Environment variables are read during `SessionConfig` initialisation so they mus | datafusion.execution.parquet.dictionary_enabled | true | (writing) Sets if dictionary encoding is enabled. If NULL, uses default parquet writer setting [...] | datafusion.execution.parquet.dictionary_page_size_limit | 1048576 | (writing) Sets best effort maximum dictionary page size, in bytes [...] | datafusion.execution.parquet.statistics_enabled | page | (writing) Sets if statistics are enabled for any column Valid values are: "none", "chunk", and "page" These values are not case sensitive. If NULL, uses default parquet writer setting [...] 
-| datafusion.execution.parquet.max_statistics_size | 4096 | (writing) Sets max statistics size for any column. If NULL, uses default parquet writer setting max_statistics_size is deprecated, currently it is not being used [...] | datafusion.execution.parquet.max_row_group_size | 1048576 | (writing) Target maximum number of rows in each row group (defaults to 1M rows). Writing larger row groups requires more memory to write, but can get better compression and be faster to read. [...] | datafusion.execution.parquet.created_by | datafusion version 49.0.0 | (writing) Sets "created by" property [...] | datafusion.execution.parquet.column_index_truncate_length | 64 | (writing) Sets column index truncate length [...] -| datafusion.execution.parquet.statistics_truncate_length | NULL | (writing) Sets statictics truncate length. If NULL, uses default parquet writer setting [...] +| datafusion.execution.parquet.statistics_truncate_length | 64 | (writing) Sets statistics truncate length. If NULL, uses default parquet writer setting [...] | datafusion.execution.parquet.data_page_row_count_limit | 20000 | (writing) Sets best effort maximum number of rows in data page [...] | datafusion.execution.parquet.encoding | NULL | (writing) Sets default encoding for any column. Valid values are: plain, plain_dictionary, rle, bit_packed, delta_binary_packed, delta_length_byte_array, delta_byte_array, rle_dictionary, and byte_stream_split. These values are not case sensitive. If NULL, uses default parquet writer setting [...] | datafusion.execution.parquet.bloom_filter_on_write | false | (writing) Write bloom filters for all columns when creating parquet files [...]
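For context, the `Decimal32Type`/`Decimal64Type` protobuf messages and the matching `ScalarValue` variants regenerated above correspond to the new Arrow `DataType::Decimal32` and `DataType::Decimal64` variants introduced in arrow 56. A minimal sketch, assuming only a dependency on the `arrow` crate at `56.0.0`; the field names and precision/scale values are illustrative:

use arrow::datatypes::{DataType, Field, Schema};

fn main() {
    // Arrow 56.0.0 adds 32- and 64-bit decimals alongside the existing
    // Decimal128/Decimal256; this commit threads them through DataFusion's
    // protobuf definitions and substrait consumer, while the SQL unparser
    // currently rejects them as unsupported.
    let schema = Schema::new(vec![
        Field::new("cents", DataType::Decimal32(9, 2), false),   // max precision 9
        Field::new("micros", DataType::Decimal64(18, 6), false), // max precision 18
        Field::new("wei", DataType::Decimal128(38, 18), false),  // unchanged
    ]);
    println!("{schema:#?}");
}

Decimal32 supports precision up to 9 digits and Decimal64 up to 18, mirroring their 32- and 64-bit integer backing stores, which is why they slot in beside the existing Decimal128/Decimal256 messages rather than replacing them.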