This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 9d8f77df79 Upgrade arrow/parquet to `53.1.0` / fix clippy (#12724)
9d8f77df79 is described below
commit 9d8f77df79ad6515f216ab9a825a04d862bbdd4f
Author: Andrew Lamb <[email protected]>
AuthorDate: Mon Oct 7 08:48:37 2024 -0400
Upgrade arrow/parquet to `53.1.0` / fix clippy (#12724)
* Update to arrow/parquet 53.1.0
* Update some API
* update for changed file sizes
* Use non deprecated APIs
* Use ParquetMetadataReader from @etseidl
* remove upstreamed implementation
* Update CSV schema
* Use upstream is_null and is_not_null kernels
---
Cargo.toml | 18 +-
datafusion-cli/Cargo.lock | 289 ++++++++++-----------
datafusion/core/src/datasource/file_format/csv.rs | 4 +-
.../core/src/datasource/file_format/parquet.rs | 97 +++----
datafusion/functions/src/lib.rs | 3 -
datafusion/functions/src/regex/regexplike.rs | 7 +-
datafusion/functions/src/regexp_common.rs | 123 ---------
datafusion/functions/src/string/contains.rs | 20 +-
datafusion/physical-expr/src/expressions/binary.rs | 8 +-
.../physical-expr/src/expressions/is_not_null.rs | 2 +-
.../physical-expr/src/expressions/is_null.rs | 77 +-----
.../join_disable_repartition_joins.slt.temp | 26 --
.../sqllogictest/test_files/repartition_scan.slt | 8 +-
13 files changed, 221 insertions(+), 461 deletions(-)
diff --git a/Cargo.toml b/Cargo.toml
index b8bf83a5ab..448607257c 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -70,22 +70,22 @@ version = "42.0.0"
ahash = { version = "0.8", default-features = false, features = [
"runtime-rng",
] }
-arrow = { version = "53.0.0", features = [
+arrow = { version = "53.1.0", features = [
"prettyprint",
] }
-arrow-array = { version = "53.0.0", default-features = false, features = [
+arrow-array = { version = "53.1.0", default-features = false, features = [
"chrono-tz",
] }
-arrow-buffer = { version = "53.0.0", default-features = false }
-arrow-flight = { version = "53.0.0", features = [
+arrow-buffer = { version = "53.1.0", default-features = false }
+arrow-flight = { version = "53.1.0", features = [
"flight-sql-experimental",
] }
-arrow-ipc = { version = "53.0.0", default-features = false, features = [
+arrow-ipc = { version = "53.1.0", default-features = false, features = [
"lz4",
] }
-arrow-ord = { version = "53.0.0", default-features = false }
-arrow-schema = { version = "53.0.0", default-features = false }
-arrow-string = { version = "53.0.0", default-features = false }
+arrow-ord = { version = "53.1.0", default-features = false }
+arrow-schema = { version = "53.1.0", default-features = false }
+arrow-string = { version = "53.1.0", default-features = false }
async-trait = "0.1.73"
bigdecimal = "=0.4.1"
bytes = "1.4"
@@ -126,7 +126,7 @@ log = "^0.4"
num_cpus = "1.13.0"
object_store = { version = "0.11.0", default-features = false }
parking_lot = "0.12"
-parquet = { version = "53.0.0", default-features = false, features = [
+parquet = { version = "53.1.0", default-features = false, features = [
"arrow",
"async",
"object_store",
diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock
index 8c77ea8a25..8a6ccacbb3 100644
--- a/datafusion-cli/Cargo.lock
+++ b/datafusion-cli/Cargo.lock
@@ -4,9 +4,9 @@ version = 3
[[package]]
name = "addr2line"
-version = "0.24.1"
+version = "0.24.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f5fb1d8e4442bd405fdfd1dacb42792696b0cf9cb15882e5d097b742a676d375"
+checksum = "dfbe277e56a376000877090da837660b4427aad530e3028d44e0bffe4f89a1c1"
dependencies = [
"gimli",
]
@@ -173,9 +173,9 @@ checksum =
"7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50"
[[package]]
name = "arrow"
-version = "53.0.0"
+version = "53.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "45aef0d9cf9a039bf6cd1acc451b137aca819977b0928dece52bd92811b640ba"
+checksum = "a9ba0d7248932f4e2a12fb37f0a2e3ec82b3bdedbac2a1dce186e036843b8f8c"
dependencies = [
"arrow-arith",
"arrow-array",
@@ -194,9 +194,9 @@ dependencies = [
[[package]]
name = "arrow-arith"
-version = "53.0.0"
+version = "53.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "03675e42d1560790f3524800e41403b40d0da1c793fe9528929fde06d8c7649a"
+checksum = "d60afcdc004841a5c8d8da4f4fa22d64eb19c0c01ef4bcedd77f175a7cf6e38f"
dependencies = [
"arrow-array",
"arrow-buffer",
@@ -209,9 +209,9 @@ dependencies = [
[[package]]
name = "arrow-array"
-version = "53.0.0"
+version = "53.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cd2bf348cf9f02a5975c5962c7fa6dee107a2009a7b41ac5fb1a027e12dc033f"
+checksum = "7f16835e8599dbbb1659fd869d865254c4cf32c6c2bb60b6942ac9fc36bfa5da"
dependencies = [
"ahash",
"arrow-buffer",
@@ -220,15 +220,15 @@ dependencies = [
"chrono",
"chrono-tz",
"half",
- "hashbrown",
+ "hashbrown 0.14.5",
"num",
]
[[package]]
name = "arrow-buffer"
-version = "53.0.0"
+version = "53.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3092e37715f168976012ce52273c3989b5793b0db5f06cbaa246be25e5f0924d"
+checksum = "1a1f34f0faae77da6b142db61deba2cb6d60167592b178be317b341440acba80"
dependencies = [
"bytes",
"half",
@@ -237,9 +237,9 @@ dependencies = [
[[package]]
name = "arrow-cast"
-version = "53.0.0"
+version = "53.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7ce1018bb710d502f9db06af026ed3561552e493e989a79d0d0f5d9cf267a785"
+checksum = "450e4abb5775bca0740bec0bcf1b1a5ae07eff43bd625661c4436d8e8e4540c4"
dependencies = [
"arrow-array",
"arrow-buffer",
@@ -258,9 +258,9 @@ dependencies = [
[[package]]
name = "arrow-csv"
-version = "53.0.0"
+version = "53.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fd178575f45624d045e4ebee714e246a05d9652e41363ee3f57ec18cca97f740"
+checksum = "d3a4e4d63830a341713e35d9a42452fbc6241d5f42fa5cf6a4681b8ad91370c4"
dependencies = [
"arrow-array",
"arrow-buffer",
@@ -277,9 +277,9 @@ dependencies = [
[[package]]
name = "arrow-data"
-version = "53.0.0"
+version = "53.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4e4ac0c4ee79150afe067dc4857154b3ee9c1cd52b5f40d59a77306d0ed18d65"
+checksum = "2b1e618bbf714c7a9e8d97203c806734f012ff71ae3adc8ad1b075689f540634"
dependencies = [
"arrow-buffer",
"arrow-schema",
@@ -289,9 +289,9 @@ dependencies = [
[[package]]
name = "arrow-ipc"
-version = "53.0.0"
+version = "53.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bb307482348a1267f91b0912e962cd53440e5de0f7fb24c5f7b10da70b38c94a"
+checksum = "f98e983549259a2b97049af7edfb8f28b8911682040e99a94e4ceb1196bd65c2"
dependencies = [
"arrow-array",
"arrow-buffer",
@@ -304,9 +304,9 @@ dependencies = [
[[package]]
name = "arrow-json"
-version = "53.0.0"
+version = "53.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d24805ba326758effdd6f2cbdd482fcfab749544f21b134701add25b33f474e6"
+checksum = "b198b9c6fcf086501730efbbcb483317b39330a116125af7bb06467d04b352a3"
dependencies = [
"arrow-array",
"arrow-buffer",
@@ -324,9 +324,9 @@ dependencies = [
[[package]]
name = "arrow-ord"
-version = "53.0.0"
+version = "53.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "644046c479d80ae8ed02a7f1e1399072ea344ca6a7b0e293ab2d5d9ed924aa3b"
+checksum = "2427f37b4459a4b9e533045abe87a5183a5e0995a3fc2c2fd45027ae2cc4ef3f"
dependencies = [
"arrow-array",
"arrow-buffer",
@@ -339,9 +339,9 @@ dependencies = [
[[package]]
name = "arrow-row"
-version = "53.0.0"
+version = "53.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a29791f8eb13b340ce35525b723f5f0df17ecb955599e11f65c2a94ab34e2efb"
+checksum = "15959657d92e2261a7a323517640af87f5afd9fd8a6492e424ebee2203c567f6"
dependencies = [
"ahash",
"arrow-array",
@@ -353,15 +353,15 @@ dependencies = [
[[package]]
name = "arrow-schema"
-version = "53.0.0"
+version = "53.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c85320a3a2facf2b2822b57aa9d6d9d55edb8aee0b6b5d3b8df158e503d10858"
+checksum = "fbf0388a18fd7f7f3fe3de01852d30f54ed5182f9004db700fbe3ba843ed2794"
[[package]]
name = "arrow-select"
-version = "53.0.0"
+version = "53.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9cc7e6b582e23855fd1625ce46e51647aa440c20ea2e71b1d748e0839dd73cba"
+checksum = "b83e5723d307a38bf00ecd2972cd078d1339c7fd3eb044f609958a9a24463f3a"
dependencies = [
"ahash",
"arrow-array",
@@ -373,9 +373,9 @@ dependencies = [
[[package]]
name = "arrow-string"
-version = "53.0.0"
+version = "53.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0775b6567c66e56ded19b87a954b6b1beffbdd784ef95a3a2b03f59570c1d230"
+checksum = "7ab3db7c09dd826e74079661d84ed01ed06547cf75d52c2818ef776d0d852305"
dependencies = [
"arrow-array",
"arrow-buffer",
@@ -406,9 +406,9 @@ dependencies = [
[[package]]
name = "async-compression"
-version = "0.4.12"
+version = "0.4.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fec134f64e2bc57411226dfc4e52dec859ddfc7e711fc5e07b612584f000e4aa"
+checksum = "7e614738943d3f68c628ae3dbce7c3daffb196665f82f8c8ea6b65de73c79429"
dependencies = [
"bzip2",
"flate2",
@@ -456,9 +456,9 @@ checksum =
"ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26"
[[package]]
name = "aws-config"
-version = "1.5.7"
+version = "1.5.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8191fb3091fa0561d1379ef80333c3c7191c6f0435d986e85821bcf7acbd1126"
+checksum = "7198e6f03240fdceba36656d8be440297b6b82270325908c7381f37d826a74f6"
dependencies = [
"aws-credential-types",
"aws-runtime",
@@ -523,9 +523,9 @@ dependencies = [
[[package]]
name = "aws-sdk-sso"
-version = "1.44.0"
+version = "1.45.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0b90cfe6504115e13c41d3ea90286ede5aa14da294f3fe077027a6e83850843c"
+checksum = "e33ae899566f3d395cbf42858e433930682cc9c1889fa89318896082fef45efb"
dependencies = [
"aws-credential-types",
"aws-runtime",
@@ -545,9 +545,9 @@ dependencies = [
[[package]]
name = "aws-sdk-ssooidc"
-version = "1.45.0"
+version = "1.46.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "167c0fad1f212952084137308359e8e4c4724d1c643038ce163f06de9662c1d0"
+checksum = "f39c09e199ebd96b9f860b0fce4b6625f211e064ad7c8693b72ecf7ef03881e0"
dependencies = [
"aws-credential-types",
"aws-runtime",
@@ -567,9 +567,9 @@ dependencies = [
[[package]]
name = "aws-sdk-sts"
-version = "1.44.0"
+version = "1.45.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2cb5f98188ec1435b68097daa2a37d74b9d17c9caa799466338a8d1544e71b9d"
+checksum = "3d95f93a98130389eb6233b9d615249e543f6c24a68ca1f109af9ca5164a8765"
dependencies = [
"aws-credential-types",
"aws-runtime",
@@ -917,9 +917,9 @@ dependencies = [
[[package]]
name = "cc"
-version = "1.1.22"
+version = "1.1.28"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9540e661f81799159abee814118cc139a2004b3a3aa3ea37724a1b66530b90e0"
+checksum = "2e80e3b6a3ab07840e1cae9b0666a63970dc28e8ed5ffbcdacbfc760c281bfc1"
dependencies = [
"jobserver",
"libc",
@@ -953,9 +953,9 @@ dependencies = [
[[package]]
name = "chrono-tz"
-version = "0.9.0"
+version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "93698b29de5e97ad0ae26447b344c482a7284c737d9ddc5f9e52b74a336671bb"
+checksum = "cd6dd8046d00723a59a2f8c5f295c515b9bb9a331ee4f8f3d4dd49e428acd3b6"
dependencies = [
"chrono",
"chrono-tz-build",
@@ -964,20 +964,19 @@ dependencies = [
[[package]]
name = "chrono-tz-build"
-version = "0.3.0"
+version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0c088aee841df9c3041febbb73934cfc39708749bf96dc827e3359cd39ef11b1"
+checksum = "e94fea34d77a245229e7746bd2beb786cd2a896f306ff491fb8cecb3074b10a7"
dependencies = [
"parse-zoneinfo",
- "phf",
"phf_codegen",
]
[[package]]
name = "clap"
-version = "4.5.18"
+version = "4.5.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b0956a43b323ac1afaffc053ed5c4b7c1f1800bacd1683c353aabbb752515dd3"
+checksum = "7be5744db7978a28d9df86a214130d106a89ce49644cbc4e3f0c22c3fba30615"
dependencies = [
"clap_builder",
"clap_derive",
@@ -985,9 +984,9 @@ dependencies = [
[[package]]
name = "clap_builder"
-version = "4.5.18"
+version = "4.5.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4d72166dd41634086d5803a47eb71ae740e61d84709c36f3c34110173db3961b"
+checksum = "a5fbc17d3ef8278f55b282b2a2e75ae6f6c7d4bb70ed3d0382375104bfafdb4b"
dependencies = [
"anstream",
"anstyle",
@@ -1175,7 +1174,7 @@ checksum =
"5041cc499144891f3790297212f32a74fb938e5136a14943f338ef9e0ae276cf"
dependencies = [
"cfg-if",
"crossbeam-utils",
- "hashbrown",
+ "hashbrown 0.14.5",
"lock_api",
"once_cell",
"parking_lot_core",
@@ -1216,7 +1215,7 @@ dependencies = [
"futures",
"glob",
"half",
- "hashbrown",
+ "hashbrown 0.14.5",
"indexmap",
"itertools",
"log",
@@ -1293,7 +1292,7 @@ dependencies = [
"arrow-schema",
"chrono",
"half",
- "hashbrown",
+ "hashbrown 0.14.5",
"instant",
"libc",
"num_cpus",
@@ -1322,7 +1321,7 @@ dependencies = [
"datafusion-common",
"datafusion-expr",
"futures",
- "hashbrown",
+ "hashbrown 0.14.5",
"log",
"object_store",
"parking_lot",
@@ -1375,7 +1374,7 @@ dependencies = [
"datafusion-common",
"datafusion-execution",
"datafusion-expr",
- "hashbrown",
+ "hashbrown 0.14.5",
"hex",
"itertools",
"log",
@@ -1468,7 +1467,7 @@ dependencies = [
"datafusion-common",
"datafusion-expr",
"datafusion-physical-expr",
- "hashbrown",
+ "hashbrown 0.14.5",
"indexmap",
"itertools",
"log",
@@ -1496,7 +1495,7 @@ dependencies = [
"datafusion-functions-aggregate-common",
"datafusion-physical-expr-common",
"half",
- "hashbrown",
+ "hashbrown 0.14.5",
"hex",
"indexmap",
"itertools",
@@ -1514,7 +1513,7 @@ dependencies = [
"arrow",
"datafusion-common",
"datafusion-expr-common",
- "hashbrown",
+ "hashbrown 0.14.5",
"rand",
]
@@ -1553,7 +1552,7 @@ dependencies = [
"datafusion-physical-expr-common",
"futures",
"half",
- "hashbrown",
+ "hashbrown 0.14.5",
"indexmap",
"itertools",
"log",
@@ -1758,9 +1757,9 @@ dependencies = [
[[package]]
name = "futures"
-version = "0.3.30"
+version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "645c6916888f6cb6350d2550b80fb63e734897a8498abe35cfb732b6487804b0"
+checksum = "65bc07b1a8bc7c85c5f2e110c476c7389b4554ba72af57d8445ea63a576b0876"
dependencies = [
"futures-channel",
"futures-core",
@@ -1773,9 +1772,9 @@ dependencies = [
[[package]]
name = "futures-channel"
-version = "0.3.30"
+version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "eac8f7d7865dcb88bd4373ab671c8cf4508703796caa2b1985a9ca867b3fcb78"
+checksum = "2dff15bf788c671c1934e366d07e30c1814a8ef514e1af724a602e8a2fbe1b10"
dependencies = [
"futures-core",
"futures-sink",
@@ -1783,15 +1782,15 @@ dependencies = [
[[package]]
name = "futures-core"
-version = "0.3.30"
+version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dfc6580bb841c5a68e9ef15c77ccc837b40a7504914d52e47b8b0e9bbda25a1d"
+checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e"
[[package]]
name = "futures-executor"
-version = "0.3.30"
+version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a576fc72ae164fca6b9db127eaa9a9dda0d61316034f33a0a0d4eda41f02b01d"
+checksum = "1e28d1d997f585e54aebc3f97d39e72338912123a67330d723fdbb564d646c9f"
dependencies = [
"futures-core",
"futures-task",
@@ -1800,15 +1799,15 @@ dependencies = [
[[package]]
name = "futures-io"
-version = "0.3.30"
+version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a44623e20b9681a318efdd71c299b6b222ed6f231972bfe2f224ebad6311f0c1"
+checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6"
[[package]]
name = "futures-macro"
-version = "0.3.30"
+version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac"
+checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650"
dependencies = [
"proc-macro2",
"quote",
@@ -1817,15 +1816,15 @@ dependencies = [
[[package]]
name = "futures-sink"
-version = "0.3.30"
+version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9fb8e00e87438d937621c1c6269e53f536c14d3fbd6a042bb24879e57d474fb5"
+checksum = "e575fab7d1e0dcb8d0c7bcf9a63ee213816ab51902e6d244a95819acacf1d4f7"
[[package]]
name = "futures-task"
-version = "0.3.30"
+version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "38d84fa142264698cdce1a9f9172cf383a0c82de1bddcf3092901442c4097004"
+checksum = "f90f7dce0722e95104fcb095585910c0977252f286e354b5e3bd38902cd99988"
[[package]]
name = "futures-timer"
@@ -1835,9 +1834,9 @@ checksum =
"f288b0a4f20f9a56b5d1da57e2227c661b7b16168e2f72365f57b63326e29b24"
[[package]]
name = "futures-util"
-version = "0.3.30"
+version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3d6401deb83407ab3da39eba7e33987a73c3df0c82b4bb5813ee871c19c41d48"
+checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81"
dependencies = [
"futures-channel",
"futures-core",
@@ -1874,9 +1873,9 @@ dependencies = [
[[package]]
name = "gimli"
-version = "0.31.0"
+version = "0.31.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "32085ea23f3234fc7846555e85283ba4de91e21016dc0455a16286d87a292d64"
+checksum = "07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f"
[[package]]
name = "glob"
@@ -1943,6 +1942,12 @@ dependencies = [
"allocator-api2",
]
+[[package]]
+name = "hashbrown"
+version = "0.15.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1e087f84d4f86bf4b218b927129862374b72199ae7d8657835f1e89000eea4fb"
+
[[package]]
name = "heck"
version = "0.4.1"
@@ -2043,9 +2048,9 @@ dependencies = [
[[package]]
name = "httparse"
-version = "1.9.4"
+version = "1.9.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0fcc0b4a115bf80b728eb8ea024ad5bd707b615bfed49e0665b6e0f86fd082d9"
+checksum = "7d71d3574edd2771538b901e6549113b4006ece66150fb69c0fb6d9a2adae946"
[[package]]
name = "httpdate"
@@ -2129,7 +2134,7 @@ dependencies = [
"http 1.1.0",
"hyper 1.4.1",
"hyper-util",
- "rustls 0.23.13",
+ "rustls 0.23.14",
"rustls-native-certs 0.8.0",
"rustls-pki-types",
"tokio",
@@ -2191,12 +2196,12 @@ dependencies = [
[[package]]
name = "indexmap"
-version = "2.5.0"
+version = "2.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "68b900aa2f7301e21c36462b170ee99994de34dff39a4a6a528e80e7376d07e5"
+checksum = "707907fe3c25f5424cce2cb7e1cbcafee6bdbe735ca90ef77c29e84591e5b9da"
dependencies = [
"equivalent",
- "hashbrown",
+ "hashbrown 0.15.0",
]
[[package]]
@@ -2219,9 +2224,9 @@ checksum =
"8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02"
[[package]]
name = "ipnet"
-version = "2.10.0"
+version = "2.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "187674a687eed5fe42285b40c6291f9a01517d415fad1c3cbc6a9f778af7fcd4"
+checksum = "ddc24109865250148c2e0f3d25d4f0f479571723792d3802153c60922a4fb708"
[[package]]
name = "is_terminal_polyfill"
@@ -2270,9 +2275,9 @@ checksum =
"bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe"
[[package]]
name = "lexical-core"
-version = "0.8.5"
+version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2cde5de06e8d4c2faabc400238f9ae1c74d5412d03a7bd067645ccbc47070e46"
+checksum = "0431c65b318a590c1de6b8fd6e72798c92291d27762d94c9e6c37ed7a73d8458"
dependencies = [
"lexical-parse-float",
"lexical-parse-integer",
@@ -2283,9 +2288,9 @@ dependencies = [
[[package]]
name = "lexical-parse-float"
-version = "0.8.5"
+version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "683b3a5ebd0130b8fb52ba0bdc718cc56815b6a097e28ae5a6997d0ad17dc05f"
+checksum = "eb17a4bdb9b418051aa59d41d65b1c9be5affab314a872e5ad7f06231fb3b4e0"
dependencies = [
"lexical-parse-integer",
"lexical-util",
@@ -2294,9 +2299,9 @@ dependencies = [
[[package]]
name = "lexical-parse-integer"
-version = "0.8.6"
+version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6d0994485ed0c312f6d965766754ea177d07f9c00c9b82a5ee62ed5b47945ee9"
+checksum = "5df98f4a4ab53bf8b175b363a34c7af608fe31f93cc1fb1bf07130622ca4ef61"
dependencies = [
"lexical-util",
"static_assertions",
@@ -2304,18 +2309,18 @@ dependencies = [
[[package]]
name = "lexical-util"
-version = "0.8.5"
+version = "1.0.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5255b9ff16ff898710eb9eb63cb39248ea8a5bb036bea8085b1a767ff6c4e3fc"
+checksum = "85314db53332e5c192b6bca611fb10c114a80d1b831ddac0af1e9be1b9232ca0"
dependencies = [
"static_assertions",
]
[[package]]
name = "lexical-write-float"
-version = "0.8.5"
+version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "accabaa1c4581f05a3923d1b4cfd124c329352288b7b9da09e766b0668116862"
+checksum = "6e7c3ad4e37db81c1cbe7cf34610340adc09c322871972f74877a712abc6c809"
dependencies = [
"lexical-util",
"lexical-write-integer",
@@ -2324,9 +2329,9 @@ dependencies = [
[[package]]
name = "lexical-write-integer"
-version = "0.8.5"
+version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e1b6f3d1f4422866b68192d62f77bc5c700bee84f3069f2469d7bc8c77852446"
+checksum = "eb89e9f6958b83258afa3deed90b5de9ef68eef090ad5086c791cd2345610162"
dependencies = [
"lexical-util",
"static_assertions",
@@ -2358,7 +2363,7 @@ source =
"registry+https://github.com/rust-lang/crates.io-index"
checksum = "e6e0d73b369f386f1c44abd9c570d5318f55ccde816ff4b562fa452e5182863d"
dependencies = [
"core2",
- "hashbrown",
+ "hashbrown 0.14.5",
"rle-decode-fast",
]
@@ -2601,9 +2606,9 @@ dependencies = [
[[package]]
name = "object"
-version = "0.36.4"
+version = "0.36.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "084f1a5821ac4c651660a94a7153d27ac9d8a53736203f58b31945ded098070a"
+checksum = "aedf0a2d09c573ed1d8d85b30c119153926a2b36dce0ab28322c09a117a4683e"
dependencies = [
"memchr",
]
@@ -2629,7 +2634,7 @@ dependencies = [
"rand",
"reqwest",
"ring",
- "rustls-pemfile 2.1.3",
+ "rustls-pemfile 2.2.0",
"serde",
"serde_json",
"snafu",
@@ -2641,9 +2646,9 @@ dependencies = [
[[package]]
name = "once_cell"
-version = "1.19.0"
+version = "1.20.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92"
+checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775"
[[package]]
name = "openssl-probe"
@@ -2697,9 +2702,9 @@ dependencies = [
[[package]]
name = "parquet"
-version = "53.0.0"
+version = "53.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f0fbf928021131daaa57d334ca8e3904fe9ae22f73c56244fc7db9b04eedc3d8"
+checksum = "310c46a70a3ba90d98fec39fa2da6d9d731e544191da6fb56c9d199484d0dd3e"
dependencies = [
"ahash",
"arrow-array",
@@ -2716,7 +2721,7 @@ dependencies = [
"flate2",
"futures",
"half",
- "hashbrown",
+ "hashbrown 0.14.5",
"lz4_flex",
"num",
"num-bigint",
@@ -2908,7 +2913,7 @@ dependencies = [
"quinn-proto",
"quinn-udp",
"rustc-hash",
- "rustls 0.23.13",
+ "rustls 0.23.14",
"socket2",
"thiserror",
"tokio",
@@ -2925,7 +2930,7 @@ dependencies = [
"rand",
"ring",
"rustc-hash",
- "rustls 0.23.13",
+ "rustls 0.23.14",
"slab",
"thiserror",
"tinyvec",
@@ -2996,9 +3001,9 @@ dependencies = [
[[package]]
name = "redox_syscall"
-version = "0.5.6"
+version = "0.5.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "355ae415ccd3a04315d3f8246e86d67689ea74d88d915576e1589a351062a13b"
+checksum = "9b6dfecf2c74bce2466cabf93f6664d6998a69eb21e39f4207930065b27b771f"
dependencies = [
"bitflags 2.6.0",
]
@@ -3016,9 +3021,9 @@ dependencies = [
[[package]]
name = "regex"
-version = "1.10.6"
+version = "1.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4219d74c6b67a3654a9fbebc4b419e22126d13d2f3c4a07ee0cb61ff79a79619"
+checksum = "38200e5ee88914975b69f657f0801b6f6dccafd44fd9326302a4aaeecfacb1d8"
dependencies = [
"aho-corasick",
"memchr",
@@ -3028,9 +3033,9 @@ dependencies = [
[[package]]
name = "regex-automata"
-version = "0.4.7"
+version = "0.4.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "38caf58cc5ef2fed281f89292ef23f6365465ed9a41b7a7754eb4e26496c92df"
+checksum = "368758f23274712b504848e9d5a6f010445cc8b87a7cdb4d7cbee666c1288da3"
dependencies = [
"aho-corasick",
"memchr",
@@ -3045,9 +3050,9 @@ checksum =
"53a49587ad06b26609c52e423de037e7f57f20d53535d66e08c695f347df952a"
[[package]]
name = "regex-syntax"
-version = "0.8.4"
+version = "0.8.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7a66a03ae7c801facd77a29370b4faec201768915ac14a721ba36f20bc9c209b"
+checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c"
[[package]]
name = "relative-path"
@@ -3057,9 +3062,9 @@ checksum =
"ba39f3699c378cd8970968dcbff9c43159ea4cfbd88d43c00b22f2ef10a435d2"
[[package]]
name = "reqwest"
-version = "0.12.7"
+version = "0.12.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f8f4955649ef5c38cc7f9e8aa41761d48fb9677197daea9984dc54f56aad5e63"
+checksum = "f713147fbe92361e52392c73b8c9e48c04c6625bce969ef54dc901e58e042a7b"
dependencies = [
"base64 0.22.1",
"bytes",
@@ -3080,9 +3085,9 @@ dependencies = [
"percent-encoding",
"pin-project-lite",
"quinn",
- "rustls 0.23.13",
- "rustls-native-certs 0.7.3",
- "rustls-pemfile 2.1.3",
+ "rustls 0.23.14",
+ "rustls-native-certs 0.8.0",
+ "rustls-pemfile 2.2.0",
"rustls-pki-types",
"serde",
"serde_json",
@@ -3199,9 +3204,9 @@ dependencies = [
[[package]]
name = "rustls"
-version = "0.23.13"
+version = "0.23.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f2dabaac7466917e566adb06783a81ca48944c6898a1b08b9374106dd671f4c8"
+checksum = "415d9944693cb90382053259f89fbb077ea730ad7273047ec63b19bc9b160ba8"
dependencies = [
"once_cell",
"ring",
@@ -3223,19 +3228,6 @@ dependencies = [
"security-framework",
]
-[[package]]
-name = "rustls-native-certs"
-version = "0.7.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e5bfb394eeed242e909609f56089eecfe5fda225042e8b171791b9c95f5931e5"
-dependencies = [
- "openssl-probe",
- "rustls-pemfile 2.1.3",
- "rustls-pki-types",
- "schannel",
- "security-framework",
-]
-
[[package]]
name = "rustls-native-certs"
version = "0.8.0"
@@ -3243,7 +3235,7 @@ source =
"registry+https://github.com/rust-lang/crates.io-index"
checksum = "fcaf18a4f2be7326cd874a5fa579fae794320a0f388d365dca7e480e55f83f8a"
dependencies = [
"openssl-probe",
- "rustls-pemfile 2.1.3",
+ "rustls-pemfile 2.2.0",
"rustls-pki-types",
"schannel",
"security-framework",
@@ -3260,11 +3252,10 @@ dependencies = [
[[package]]
name = "rustls-pemfile"
-version = "2.1.3"
+version = "2.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "196fe16b00e106300d3e45ecfcb764fa292a535d7326a29a5875c579c7417425"
+checksum = "dce314e5fee3f39953d46bb63bb8a46d40c2f8fb7cc5a3b6cab2bde9721d6e50"
dependencies = [
- "base64 0.22.1",
"rustls-pki-types",
]
@@ -3340,9 +3331,9 @@ dependencies = [
[[package]]
name = "schannel"
-version = "0.1.24"
+version = "0.1.26"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e9aaafd5a2b6e3d657ff009d82fbd630b6bd54dd4eb06f21693925cdf80f9b8b"
+checksum = "01227be5826fa0690321a2ba6c5cd57a19cf3f6a09e76973b58e61de6ab9d1c1"
dependencies = [
"windows-sys 0.59.0",
]
@@ -3781,7 +3772,7 @@ version = "0.26.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0c7bc40d0e5a97695bb96e27995cd3a08538541b0a846f65bba7a359f36700d4"
dependencies = [
- "rustls 0.23.13",
+ "rustls 0.23.14",
"rustls-pki-types",
"tokio",
]
@@ -3897,9 +3888,9 @@ checksum =
"42ff0bf0c66b8238c6f3b578df37d0b7848e55df8577b3f74f92a69acceeb825"
[[package]]
name = "unicode-bidi"
-version = "0.3.15"
+version = "0.3.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "08f95100a766bf4f8f28f90d77e0a5461bbdb219042e7679bebe79004fed8d75"
+checksum = "5ab17db44d7388991a428b2ee655ce0c212e862eff1768a455c58f9aad6e7893"
[[package]]
name = "unicode-ident"
diff --git a/datafusion/core/src/datasource/file_format/csv.rs
b/datafusion/core/src/datasource/file_format/csv.rs
index e821fa806f..f235c3b628 100644
--- a/datafusion/core/src/datasource/file_format/csv.rs
+++ b/datafusion/core/src/datasource/file_format/csv.rs
@@ -771,7 +771,7 @@ mod tests {
"c7: Int64",
"c8: Int64",
"c9: Int64",
- "c10: Int64",
+ "c10: Utf8",
"c11: Float64",
"c12: Float64",
"c13: Utf8"
@@ -907,7 +907,7 @@ mod tests {
Field::new("c7", DataType::Int64, true),
Field::new("c8", DataType::Int64, true),
Field::new("c9", DataType::Int64, true),
- Field::new("c10", DataType::Int64, true),
+ Field::new("c10", DataType::Utf8, true),
Field::new("c11", DataType::Float64, true),
Field::new("c12", DataType::Float64, true),
Field::new("c13", DataType::Utf8, true),
diff --git a/datafusion/core/src/datasource/file_format/parquet.rs
b/datafusion/core/src/datasource/file_format/parquet.rs
index 98ae0ce14b..8647b5df90 100644
--- a/datafusion/core/src/datasource/file_format/parquet.rs
+++ b/datafusion/core/src/datasource/file_format/parquet.rs
@@ -20,6 +20,7 @@
use std::any::Any;
use std::fmt;
use std::fmt::Debug;
+use std::ops::Range;
use std::sync::Arc;
use super::write::demux::start_demuxer_task;
@@ -47,7 +48,7 @@ use
datafusion_common::file_options::parquet_writer::ParquetWriterOptions;
use datafusion_common::parsers::CompressionTypeVariant;
use datafusion_common::stats::Precision;
use datafusion_common::{
- exec_err, internal_datafusion_err, not_impl_err, DataFusionError, GetExt,
+ internal_datafusion_err, not_impl_err, DataFusionError, GetExt,
DEFAULT_PARQUET_EXTENSION,
};
use datafusion_common_runtime::SpawnedTask;
@@ -60,7 +61,7 @@ use datafusion_physical_expr::PhysicalExpr;
use datafusion_physical_plan::metrics::MetricsSet;
use async_trait::async_trait;
-use bytes::{BufMut, BytesMut};
+use bytes::Bytes;
use hashbrown::HashMap;
use log::debug;
use object_store::buffered::BufWriter;
@@ -71,8 +72,7 @@ use parquet::arrow::arrow_writer::{
use parquet::arrow::{
arrow_to_parquet_schema, parquet_to_arrow_schema, AsyncArrowWriter,
};
-use parquet::file::footer::{decode_footer, decode_metadata};
-use parquet::file::metadata::{ParquetMetaData, RowGroupMetaData};
+use parquet::file::metadata::{ParquetMetaData, ParquetMetaDataReader,
RowGroupMetaData};
use parquet::file::properties::WriterProperties;
use parquet::file::writer::SerializedFileWriter;
use parquet::format::FileMetaData;
@@ -84,10 +84,13 @@ use crate::datasource::physical_plan::parquet::{
can_expr_be_pushed_down_with_schemas, ParquetExecBuilder,
};
use datafusion_physical_expr_common::sort_expr::LexRequirement;
-use futures::{StreamExt, TryStreamExt};
+use futures::future::BoxFuture;
+use futures::{FutureExt, StreamExt, TryStreamExt};
use object_store::path::Path;
use object_store::{ObjectMeta, ObjectStore};
use parquet::arrow::arrow_reader::statistics::StatisticsConverter;
+use parquet::arrow::async_reader::MetadataFetch;
+use parquet::errors::ParquetError;
/// Initial writing buffer size. Note this is just a size hint for efficiency.
It
/// will grow beyond the set value if needed.
@@ -441,6 +444,33 @@ impl FileFormat for ParquetFormat {
}
}
+/// [`MetadataFetch`] adapter for reading bytes from an [`ObjectStore`]
+struct ObjectStoreFetch<'a> {
+ store: &'a dyn ObjectStore,
+ meta: &'a ObjectMeta,
+}
+
+impl<'a> ObjectStoreFetch<'a> {
+ fn new(store: &'a dyn ObjectStore, meta: &'a ObjectMeta) -> Self {
+ Self { store, meta }
+ }
+}
+
+impl<'a> MetadataFetch for ObjectStoreFetch<'a> {
+ fn fetch(
+ &mut self,
+ range: Range<usize>,
+ ) -> BoxFuture<'_, Result<Bytes, ParquetError>> {
+ async {
+ self.store
+ .get_range(&self.meta.location, range)
+ .await
+ .map_err(ParquetError::from)
+ }
+ .boxed()
+ }
+}
+
/// Fetches parquet metadata from ObjectStore for given object
///
/// This component is a subject to **change** in near future and is exposed for low level integrations
@@ -452,57 +482,14 @@ pub async fn fetch_parquet_metadata(
meta: &ObjectMeta,
size_hint: Option<usize>,
) -> Result<ParquetMetaData> {
- if meta.size < 8 {
- return exec_err!("file size of {} is less than footer", meta.size);
- }
-
- // If a size hint is provided, read more than the minimum size
- // to try and avoid a second fetch.
- let footer_start = if let Some(size_hint) = size_hint {
- meta.size.saturating_sub(size_hint)
- } else {
- meta.size - 8
- };
-
- let suffix = store
- .get_range(&meta.location, footer_start..meta.size)
- .await?;
-
- let suffix_len = suffix.len();
-
- let mut footer = [0; 8];
- footer.copy_from_slice(&suffix[suffix_len - 8..suffix_len]);
-
- let length = decode_footer(&footer)?;
+ let file_size = meta.size;
+ let fetch = ObjectStoreFetch::new(store, meta);
- if meta.size < length + 8 {
- return exec_err!(
- "file size of {} is less than footer + metadata {}",
- meta.size,
- length + 8
- );
- }
-
- // Did not fetch the entire file metadata in the initial read, need to make a second request
- if length > suffix_len - 8 {
- let metadata_start = meta.size - length - 8;
- let remaining_metadata = store
- .get_range(&meta.location, metadata_start..footer_start)
- .await?;
-
- let mut metadata = BytesMut::with_capacity(length);
-
- metadata.put(remaining_metadata.as_ref());
- metadata.put(&suffix[..suffix_len - 8]);
-
- Ok(decode_metadata(metadata.as_ref())?)
- } else {
- let metadata_start = meta.size - length - 8;
-
- Ok(decode_metadata(
- &suffix[metadata_start - footer_start..suffix_len - 8],
- )?)
- }
+ ParquetMetaDataReader::new()
+ .with_prefetch_hint(size_hint)
+ .load_and_finish(fetch, file_size)
+ .await
+ .map_err(DataFusionError::from)
}
/// Read and parse the schema of the Parquet file at location `path`
diff --git a/datafusion/functions/src/lib.rs b/datafusion/functions/src/lib.rs
index bb680f3c67..81be555266 100644
--- a/datafusion/functions/src/lib.rs
+++ b/datafusion/functions/src/lib.rs
@@ -92,9 +92,6 @@ pub mod macros;
pub mod string;
make_stub_package!(string, "string_expressions");
-#[cfg(feature = "string_expressions")]
-mod regexp_common;
-
/// Core datafusion expressions
/// Enabled via feature flag `core_expressions`
#[cfg(feature = "core_expressions")]
diff --git a/datafusion/functions/src/regex/regexplike.rs
b/datafusion/functions/src/regex/regexplike.rs
index f647b19691..e245ea9fa7 100644
--- a/datafusion/functions/src/regex/regexplike.rs
+++ b/datafusion/functions/src/regex/regexplike.rs
@@ -16,7 +16,7 @@
// under the License.
//! Regx expressions
-use arrow::array::{Array, ArrayRef, OffsetSizeTrait};
+use arrow::array::{Array, ArrayRef, GenericStringArray, OffsetSizeTrait};
use arrow::compute::kernels::regexp;
use arrow::datatypes::DataType;
use datafusion_common::exec_err;
@@ -206,7 +206,8 @@ pub fn regexp_like<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
2 => {
let values = as_generic_string_array::<T>(&args[0])?;
let regex = as_generic_string_array::<T>(&args[1])?;
- let array = regexp::regexp_is_match_utf8(values, regex, None)
+ let flags: Option<&GenericStringArray<T>> = None;
+ let array = regexp::regexp_is_match(values, regex, flags)
.map_err(|e| arrow_datafusion_err!(e))?;
Ok(Arc::new(array) as ArrayRef)
@@ -220,7 +221,7 @@ pub fn regexp_like<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
return plan_err!("regexp_like() does not support the \"global\" option");
}
- let array = regexp::regexp_is_match_utf8(values, regex, Some(flags))
+ let array = regexp::regexp_is_match(values, regex, Some(flags))
.map_err(|e| arrow_datafusion_err!(e))?;
Ok(Arc::new(array) as ArrayRef)
diff --git a/datafusion/functions/src/regexp_common.rs
b/datafusion/functions/src/regexp_common.rs
deleted file mode 100644
index 748c1a294f..0000000000
--- a/datafusion/functions/src/regexp_common.rs
+++ /dev/null
@@ -1,123 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-//! Common utilities for implementing regex functions
-
-use crate::string::common::StringArrayType;
-
-use arrow::array::{Array, ArrayDataBuilder, BooleanArray};
-use arrow::datatypes::DataType;
-use arrow_buffer::{BooleanBufferBuilder, NullBuffer};
-use datafusion_common::DataFusionError;
-use regex::Regex;
-
-use std::collections::HashMap;
-
-#[cfg(doc)]
-use arrow::array::{LargeStringArray, StringArray, StringViewArray};
-/// Perform SQL `array ~ regex_array` operation on
-/// [`StringArray`] / [`LargeStringArray`] / [`StringViewArray`].
-///
-/// If `regex_array` element has an empty value, the corresponding result
value is always true.
-///
-/// `flags_array` are optional [`StringArray`] / [`LargeStringArray`] /
[`StringViewArray`] flag,
-/// which allow special search modes, such as case-insensitive and multi-line
mode.
-/// See the documentation
[here](https://docs.rs/regex/1.5.4/regex/#grouping-and-flags)
-/// for more information.
-///
-/// It is inspired / copied from `regexp_is_match_utf8` [arrow-rs].
-///
-/// Can remove when <https://github.com/apache/arrow-rs/issues/6370> is
implemented upstream
-///
-/// [arrow-rs]:
https://github.com/apache/arrow-rs/blob/8c956a9f9ab26c14072740cce64c2b99cb039b13/arrow-string/src/regexp.rs#L31-L37
-pub fn regexp_is_match_utf8<'a, S1, S2, S3>(
- array: &'a S1,
- regex_array: &'a S2,
- flags_array: Option<&'a S3>,
-) -> datafusion_common::Result<BooleanArray, DataFusionError>
-where
- &'a S1: StringArrayType<'a>,
- &'a S2: StringArrayType<'a>,
- &'a S3: StringArrayType<'a>,
-{
- if array.len() != regex_array.len() {
- return Err(DataFusionError::Execution(
- "Cannot perform comparison operation on arrays of different length"
- .to_string(),
- ));
- }
-
- let nulls = NullBuffer::union(array.nulls(), regex_array.nulls());
-
- let mut patterns: HashMap<String, Regex> = HashMap::new();
- let mut result = BooleanBufferBuilder::new(array.len());
-
- let complete_pattern = match flags_array {
- Some(flags) => Box::new(regex_array.iter().zip(flags.iter()).map(
- |(pattern, flags)| {
- pattern.map(|pattern| match flags {
- Some(flag) => format!("(?{flag}){pattern}"),
- None => pattern.to_string(),
- })
- },
- )) as Box<dyn Iterator<Item = Option<String>>>,
- None => Box::new(
- regex_array
- .iter()
- .map(|pattern| pattern.map(|pattern| pattern.to_string())),
- ),
- };
-
- array
- .iter()
- .zip(complete_pattern)
- .map(|(value, pattern)| {
- match (value, pattern) {
- (Some(_), Some(pattern)) if pattern == *"" => {
- result.append(true);
- }
- (Some(value), Some(pattern)) => {
- let existing_pattern = patterns.get(&pattern);
- let re = match existing_pattern {
- Some(re) => re,
- None => {
- let re = Regex::new(pattern.as_str()).map_err(|e| {
- DataFusionError::Execution(format!(
- "Regular expression did not compile: {e:?}"
- ))
- })?;
- patterns.entry(pattern).or_insert(re)
- }
- };
- result.append(re.is_match(value));
- }
- _ => result.append(false),
- }
- Ok(())
- })
- .collect::<datafusion_common::Result<Vec<()>, DataFusionError>>()?;
-
- let data = unsafe {
- ArrayDataBuilder::new(DataType::Boolean)
- .len(array.len())
- .buffers(vec![result.into()])
- .nulls(nulls)
- .build_unchecked()
- };
-
- Ok(BooleanArray::from(data))
-}
diff --git a/datafusion/functions/src/string/contains.rs
b/datafusion/functions/src/string/contains.rs
index c319f80661..722451ab53 100644
--- a/datafusion/functions/src/string/contains.rs
+++ b/datafusion/functions/src/string/contains.rs
@@ -15,7 +15,6 @@
// specific language governing permissions and limitations
// under the License.
-use crate::regexp_common::regexp_is_match_utf8;
use crate::utils::make_scalar_function;
use arrow::array::{Array, ArrayRef, AsArray, GenericStringArray,
StringViewArray};
@@ -28,6 +27,7 @@ use datafusion_expr::ScalarUDFImpl;
use datafusion_expr::TypeSignature::Exact;
use datafusion_expr::{ColumnarValue, Signature, Volatility};
+use arrow::compute::regexp_is_match;
use std::any::Any;
use std::sync::Arc;
@@ -92,7 +92,7 @@ pub fn contains(args: &[ArrayRef]) -> Result<ArrayRef,
DataFusionError> {
(Utf8View, Utf8View) => {
let mod_str = args[0].as_string_view();
let match_str = args[1].as_string_view();
- let res = regexp_is_match_utf8::<
+ let res = regexp_is_match::<
StringViewArray,
StringViewArray,
GenericStringArray<i32>,
@@ -103,7 +103,7 @@ pub fn contains(args: &[ArrayRef]) -> Result<ArrayRef,
DataFusionError> {
(Utf8View, Utf8) => {
let mod_str = args[0].as_string_view();
let match_str = args[1].as_string::<i32>();
- let res = regexp_is_match_utf8::<
+ let res = regexp_is_match::<
StringViewArray,
GenericStringArray<i32>,
GenericStringArray<i32>,
@@ -114,7 +114,7 @@ pub fn contains(args: &[ArrayRef]) -> Result<ArrayRef,
DataFusionError> {
(Utf8View, LargeUtf8) => {
let mod_str = args[0].as_string_view();
let match_str = args[1].as_string::<i64>();
- let res = regexp_is_match_utf8::<
+ let res = regexp_is_match::<
StringViewArray,
GenericStringArray<i64>,
GenericStringArray<i32>,
@@ -125,7 +125,7 @@ pub fn contains(args: &[ArrayRef]) -> Result<ArrayRef,
DataFusionError> {
(Utf8, Utf8View) => {
let mod_str = args[0].as_string::<i32>();
let match_str = args[1].as_string_view();
- let res = regexp_is_match_utf8::<
+ let res = regexp_is_match::<
GenericStringArray<i32>,
StringViewArray,
GenericStringArray<i32>,
@@ -136,7 +136,7 @@ pub fn contains(args: &[ArrayRef]) -> Result<ArrayRef,
DataFusionError> {
(Utf8, Utf8) => {
let mod_str = args[0].as_string::<i32>();
let match_str = args[1].as_string::<i32>();
- let res = regexp_is_match_utf8::<
+ let res = regexp_is_match::<
GenericStringArray<i32>,
GenericStringArray<i32>,
GenericStringArray<i32>,
@@ -147,7 +147,7 @@ pub fn contains(args: &[ArrayRef]) -> Result<ArrayRef,
DataFusionError> {
(Utf8, LargeUtf8) => {
let mod_str = args[0].as_string::<i32>();
let match_str = args[1].as_string::<i64>();
- let res = regexp_is_match_utf8::<
+ let res = regexp_is_match::<
GenericStringArray<i32>,
GenericStringArray<i64>,
GenericStringArray<i32>,
@@ -158,7 +158,7 @@ pub fn contains(args: &[ArrayRef]) -> Result<ArrayRef,
DataFusionError> {
(LargeUtf8, Utf8View) => {
let mod_str = args[0].as_string::<i64>();
let match_str = args[1].as_string_view();
- let res = regexp_is_match_utf8::<
+ let res = regexp_is_match::<
GenericStringArray<i64>,
StringViewArray,
GenericStringArray<i32>,
@@ -169,7 +169,7 @@ pub fn contains(args: &[ArrayRef]) -> Result<ArrayRef,
DataFusionError> {
(LargeUtf8, Utf8) => {
let mod_str = args[0].as_string::<i64>();
let match_str = args[1].as_string::<i32>();
- let res = regexp_is_match_utf8::<
+ let res = regexp_is_match::<
GenericStringArray<i64>,
GenericStringArray<i32>,
GenericStringArray<i32>,
@@ -180,7 +180,7 @@ pub fn contains(args: &[ArrayRef]) -> Result<ArrayRef,
DataFusionError> {
(LargeUtf8, LargeUtf8) => {
let mod_str = args[0].as_string::<i64>();
let match_str = args[1].as_string::<i64>();
- let res = regexp_is_match_utf8::<
+ let res = regexp_is_match::<
GenericStringArray<i64>,
GenericStringArray<i64>,
GenericStringArray<i32>,
diff --git a/datafusion/physical-expr/src/expressions/binary.rs
b/datafusion/physical-expr/src/expressions/binary.rs
index 236b24dd40..3d9072c2e1 100644
--- a/datafusion/physical-expr/src/expressions/binary.rs
+++ b/datafusion/physical-expr/src/expressions/binary.rs
@@ -27,9 +27,7 @@ use crate::PhysicalExpr;
use arrow::array::*;
use arrow::compute::kernels::boolean::{and_kleene, not, or_kleene};
use arrow::compute::kernels::cmp::*;
-use arrow::compute::kernels::comparison::{
- regexp_is_match_utf8, regexp_is_match_utf8_scalar,
-};
+use arrow::compute::kernels::comparison::{regexp_is_match, regexp_is_match_scalar};
use arrow::compute::kernels::concat_elements::concat_elements_utf8;
use arrow::compute::{cast, ilike, like, nilike, nlike};
use arrow::datatypes::*;
@@ -179,7 +177,7 @@ macro_rules! compute_utf8_flag_op {
} else {
None
};
- let mut array = paste::expr! {[<$OP _utf8>]}(&ll, &rr, flag.as_ref())?;
+ let mut array = $OP(ll, rr, flag.as_ref())?;
if $NOT {
array = not(&array).unwrap();
}
@@ -216,7 +214,7 @@ macro_rules! compute_utf8_flag_op_scalar {
if let ScalarValue::Utf8(Some(string_value)) |
ScalarValue::LargeUtf8(Some(string_value)) = $RIGHT {
let flag = $FLAG.then_some("i");
let mut array =
- paste::expr! {[<$OP _utf8_scalar>]}(&ll, &string_value, flag)?;
+ paste::expr! {[<$OP _scalar>]}(ll, &string_value, flag)?;
if $NOT {
array = not(&array).unwrap();
}
diff --git a/datafusion/physical-expr/src/expressions/is_not_null.rs
b/datafusion/physical-expr/src/expressions/is_not_null.rs
index 58559352d4..cbab7d0c9d 100644
--- a/datafusion/physical-expr/src/expressions/is_not_null.rs
+++ b/datafusion/physical-expr/src/expressions/is_not_null.rs
@@ -73,7 +73,7 @@ impl PhysicalExpr for IsNotNullExpr {
let arg = self.arg.evaluate(batch)?;
match arg {
ColumnarValue::Array(array) => {
- let is_not_null = super::is_null::compute_is_not_null(array)?;
+ let is_not_null = arrow::compute::is_not_null(&array)?;
Ok(ColumnarValue::Array(Arc::new(is_not_null)))
}
ColumnarValue::Scalar(scalar) => Ok(ColumnarValue::Scalar(
diff --git a/datafusion/physical-expr/src/expressions/is_null.rs
b/datafusion/physical-expr/src/expressions/is_null.rs
index 3cdb49bcab..1c8597d3fd 100644
--- a/datafusion/physical-expr/src/expressions/is_null.rs
+++ b/datafusion/physical-expr/src/expressions/is_null.rs
@@ -20,14 +20,10 @@
use std::hash::{Hash, Hasher};
use std::{any::Any, sync::Arc};
-use arrow::compute;
use arrow::{
datatypes::{DataType, Schema},
record_batch::RecordBatch,
};
-use arrow_array::{Array, ArrayRef, BooleanArray, Int8Array, UnionArray};
-use arrow_buffer::{BooleanBuffer, ScalarBuffer};
-use arrow_ord::cmp;
use crate::physical_expr::down_cast_any_ref;
use crate::PhysicalExpr;
@@ -77,9 +73,9 @@ impl PhysicalExpr for IsNullExpr {
fn evaluate(&self, batch: &RecordBatch) -> Result<ColumnarValue> {
let arg = self.arg.evaluate(batch)?;
match arg {
- ColumnarValue::Array(array) => {
- Ok(ColumnarValue::Array(Arc::new(compute_is_null(array)?)))
- }
+ ColumnarValue::Array(array) => Ok(ColumnarValue::Array(Arc::new(
+ arrow::compute::is_null(&array)?,
+ ))),
ColumnarValue::Scalar(scalar) => Ok(ColumnarValue::Scalar(
ScalarValue::Boolean(Some(scalar.is_null())),
)),
@@ -103,65 +99,6 @@ impl PhysicalExpr for IsNullExpr {
}
}
-/// workaround <https://github.com/apache/arrow-rs/issues/6017>,
-/// this can be replaced with a direct call to `arrow::compute::is_null` once it's fixed.
-pub(crate) fn compute_is_null(array: ArrayRef) -> Result<BooleanArray> {
- if let Some(union_array) = array.as_any().downcast_ref::<UnionArray>() {
- if let Some(offsets) = union_array.offsets() {
- dense_union_is_null(union_array, offsets)
- } else {
- sparse_union_is_null(union_array)
- }
- } else {
- compute::is_null(array.as_ref()).map_err(Into::into)
- }
-}
-
-/// workaround <https://github.com/apache/arrow-rs/issues/6017>,
-/// this can be replaced with a direct call to `arrow::compute::is_not_null` once it's fixed.
-pub(crate) fn compute_is_not_null(array: ArrayRef) -> Result<BooleanArray> {
- if array.as_any().is::<UnionArray>() {
- compute::not(&compute_is_null(array)?).map_err(Into::into)
- } else {
- compute::is_not_null(array.as_ref()).map_err(Into::into)
- }
-}
-
-fn dense_union_is_null(
- union_array: &UnionArray,
- offsets: &ScalarBuffer<i32>,
-) -> Result<BooleanArray> {
- let child_arrays = (0..union_array.type_names().len())
- .map(|type_id| {
- compute::is_null(&union_array.child(type_id as i8)).map_err(Into::into)
- })
- .collect::<Result<Vec<BooleanArray>>>()?;
-
- let buffer: BooleanBuffer = offsets
- .iter()
- .zip(union_array.type_ids())
- .map(|(offset, type_id)| child_arrays[*type_id as usize].value(*offset as usize))
- .collect();
-
- Ok(BooleanArray::new(buffer, None))
-}
-
-fn sparse_union_is_null(union_array: &UnionArray) -> Result<BooleanArray> {
- let type_ids = Int8Array::new(union_array.type_ids().clone(), None);
-
- let mut union_is_null =
- BooleanArray::new(BooleanBuffer::new_unset(union_array.len()), None);
- for type_id in 0..union_array.type_names().len() {
- let type_id = type_id as i8;
- let union_is_child = cmp::eq(&type_ids, &Int8Array::new_scalar(type_id))?;
- let child = union_array.child(type_id);
- let child_array_is_null = compute::is_null(&child)?;
- let child_is_null = compute::and(&union_is_child, &child_array_is_null)?;
- union_is_null = compute::or(&union_is_null, &child_is_null)?;
- }
- Ok(union_is_null)
-}
-
impl PartialEq<dyn Any> for IsNullExpr {
fn eq(&self, other: &dyn Any) -> bool {
down_cast_any_ref(other)
@@ -184,7 +121,7 @@ mod tests {
array::{BooleanArray, StringArray},
datatypes::*,
};
- use arrow_array::{Float64Array, Int32Array};
+ use arrow_array::{Array, Float64Array, Int32Array, UnionArray};
use arrow_buffer::ScalarBuffer;
use datafusion_common::cast::as_boolean_array;
@@ -243,8 +180,7 @@ mod tests {
let array =
UnionArray::try_new(union_fields(), type_ids, None,
children).unwrap();
- let array_ref = Arc::new(array) as ArrayRef;
- let result = compute_is_null(array_ref).unwrap();
+ let result = arrow::compute::is_null(&array).unwrap();
let expected =
&BooleanArray::from(vec![false, true, false, false, true, true,
false]);
@@ -272,8 +208,7 @@ mod tests {
UnionArray::try_new(union_fields(), type_ids, Some(offsets),
children)
.unwrap();
- let array_ref = Arc::new(array) as ArrayRef;
- let result = compute_is_null(array_ref).unwrap();
+ let result = arrow::compute::is_null(&array).unwrap();
let expected = &BooleanArray::from(vec![false, true, false, true,
false, true]);
assert_eq!(expected, &result);
diff --git
a/datafusion/sqllogictest/test_files/join_disable_repartition_joins.slt.temp
b/datafusion/sqllogictest/test_files/join_disable_repartition_joins.slt.temp
deleted file mode 100644
index 00e74a207b..0000000000
--- a/datafusion/sqllogictest/test_files/join_disable_repartition_joins.slt.temp
+++ /dev/null
@@ -1,26 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-
-# http://www.apache.org/licenses/LICENSE-2.0
-
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-##########
-## Join Tests
-##########
-
-# turn off repartition_joins
-statement ok
-set datafusion.optimizer.repartition_joins = false;
-
-include ./join.slt
diff --git a/datafusion/sqllogictest/test_files/repartition_scan.slt
b/datafusion/sqllogictest/test_files/repartition_scan.slt
index 4c86312f9e..858e421062 100644
--- a/datafusion/sqllogictest/test_files/repartition_scan.slt
+++ b/datafusion/sqllogictest/test_files/repartition_scan.slt
@@ -61,7 +61,7 @@ logical_plan
physical_plan
01)CoalesceBatchesExec: target_batch_size=8192
02)--FilterExec: column1@0 != 42
-03)----ParquetExec: file_groups={4 groups:
[[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..87],
[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:87..174],
[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:174..261],
[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:261..347]]},
projec [...]
+03)----ParquetExec: file_groups={4 groups:
[[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..88],
[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:88..176],
[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:176..264],
[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:264..351]]},
projec [...]
# disable round robin repartitioning
statement ok
@@ -77,7 +77,7 @@ logical_plan
physical_plan
01)CoalesceBatchesExec: target_batch_size=8192
02)--FilterExec: column1@0 != 42
-03)----ParquetExec: file_groups={4 groups:
[[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..87],
[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:87..174],
[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:174..261],
[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:261..347]]},
projec [...]
+03)----ParquetExec: file_groups={4 groups:
[[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..88],
[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:88..176],
[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:176..264],
[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:264..351]]},
projec [...]
# enable round robin repartitioning again
statement ok
@@ -102,7 +102,7 @@ physical_plan
02)--SortExec: expr=[column1@0 ASC NULLS LAST], preserve_partitioning=[true]
03)----CoalesceBatchesExec: target_batch_size=8192
04)------FilterExec: column1@0 != 42
-05)--------ParquetExec: file_groups={4 groups:
[[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:0..172],
[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:172..338,
WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..6],
[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:6..178],
[WORKSPACE [...]
+05)--------ParquetExec: file_groups={4 groups:
[[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:0..174],
[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:174..342,
WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..6],
[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:6..180],
[WORKSPACE [...]
## Read the files as though they are ordered
@@ -138,7 +138,7 @@ physical_plan
01)SortPreservingMergeExec: [column1@0 ASC NULLS LAST]
02)--CoalesceBatchesExec: target_batch_size=8192
03)----FilterExec: column1@0 != 42
-04)------ParquetExec: file_groups={4 groups:
[[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:0..169],
[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..173],
[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:173..347],
[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:169..338]]},
proj [...]
+04)------ParquetExec: file_groups={4 groups:
[[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:0..171],
[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..175],
[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:175..351],
[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:171..342]]},
proj [...]
# Cleanup
statement ok
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]