This is an automated email from the ASF dual-hosted git repository.
viirya pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion-comet.git
The following commit(s) were added to refs/heads/main by this push:
new 424733e build: Upgrade arrow-rs to 50.0.0 and DataFusion to 35.0.0 (#65)
424733e is described below
commit 424733e650fd54da29282399d0cfd2932fb2007d
Author: Liang-Chi Hsieh <[email protected]>
AuthorDate: Tue Feb 20 23:09:23 2024 -0800
build: Upgrade arrow-rs to 50.0.0 and DataFusion to 35.0.0 (#65)
---
core/Cargo.lock | 130 ++++++++++++-------------
core/Cargo.toml | 18 ++--
core/benches/parquet_read.rs | 3 +-
core/src/execution/datafusion/planner.rs | 1 -
core/src/execution/operators/copy.rs | 3 +-
core/src/execution/operators/scan.rs | 2 +-
core/src/parquet/util/test_common/page_util.rs | 15 +--
7 files changed, 85 insertions(+), 87 deletions(-)
diff --git a/core/Cargo.lock b/core/Cargo.lock
index 9c40b91..0f262c0 100644
--- a/core/Cargo.lock
+++ b/core/Cargo.lock
@@ -114,11 +114,10 @@ checksum = "96d30a06541fbafbc7f82ed10c06164cfbd2c401138f6addd8404629c4b16711"
[[package]]
name = "arrow"
-version = "49.0.0"
+version = "50.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5bc25126d18a012146a888a0298f2c22e1150327bd2765fc76d710a556b2d614"
+checksum = "aa285343fba4d829d49985bdc541e3789cf6000ed0e84be7c039438df4a4e78c"
dependencies = [
- "ahash",
"arrow-arith",
"arrow-array",
"arrow-buffer",
@@ -136,9 +135,9 @@ dependencies = [
[[package]]
name = "arrow-arith"
-version = "49.0.0"
+version = "50.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "34ccd45e217ffa6e53bbb0080990e77113bdd4e91ddb84e97b77649810bcf1a7"
+checksum = "753abd0a5290c1bcade7c6623a556f7d1659c5f4148b140b5b63ce7bd1a45705"
dependencies = [
"arrow-array",
"arrow-buffer",
@@ -151,9 +150,9 @@ dependencies = [
[[package]]
name = "arrow-array"
-version = "49.0.0"
+version = "50.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6bda9acea48b25123c08340f3a8ac361aa0f74469bb36f5ee9acf923fce23e9d"
+checksum = "d390feeb7f21b78ec997a4081a025baef1e2e0d6069e181939b61864c9779609"
dependencies = [
"ahash",
"arrow-buffer",
@@ -168,9 +167,9 @@ dependencies = [
[[package]]
name = "arrow-buffer"
-version = "49.0.0"
+version = "50.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "01a0fc21915b00fc6c2667b069c1b64bdd920982f426079bc4a7cab86822886c"
+checksum = "69615b061701bcdffbc62756bc7e85c827d5290b472b580c972ebbbf690f5aa4"
dependencies = [
"bytes",
"half 2.1.0",
@@ -179,9 +178,9 @@ dependencies = [
[[package]]
name = "arrow-cast"
-version = "49.0.0"
+version = "50.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5dc0368ed618d509636c1e3cc20db1281148190a78f43519487b2daf07b63b4a"
+checksum = "e448e5dd2f4113bf5b74a1f26531708f5edcacc77335b7066f9398f4bcf4cdef"
dependencies = [
"arrow-array",
"arrow-buffer",
@@ -198,9 +197,9 @@ dependencies = [
[[package]]
name = "arrow-csv"
-version = "49.0.0"
+version = "50.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2e09aa6246a1d6459b3f14baeaa49606cfdbca34435c46320e14054d244987ca"
+checksum = "46af72211f0712612f5b18325530b9ad1bfbdc87290d5fbfd32a7da128983781"
dependencies = [
"arrow-array",
"arrow-buffer",
@@ -217,9 +216,9 @@ dependencies = [
[[package]]
name = "arrow-data"
-version = "49.0.0"
+version = "50.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "907fafe280a3874474678c1858b9ca4cb7fd83fb8034ff5b6d6376205a08c634"
+checksum = "67d644b91a162f3ad3135ce1184d0a31c28b816a581e08f29e8e9277a574c64e"
dependencies = [
"arrow-buffer",
"arrow-schema",
@@ -229,9 +228,9 @@ dependencies = [
[[package]]
name = "arrow-ipc"
-version = "49.0.0"
+version = "50.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "79a43d6808411886b8c7d4f6f7dd477029c1e77ffffffb7923555cc6579639cd"
+checksum = "03dea5e79b48de6c2e04f03f62b0afea7105be7b77d134f6c5414868feefb80d"
dependencies = [
"arrow-array",
"arrow-buffer",
@@ -239,13 +238,14 @@ dependencies = [
"arrow-data",
"arrow-schema",
"flatbuffers",
+ "lz4_flex",
]
[[package]]
name = "arrow-json"
-version = "49.0.0"
+version = "50.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d82565c91fd627922ebfe2810ee4e8346841b6f9361b87505a9acea38b614fee"
+checksum = "8950719280397a47d37ac01492e3506a8a724b3fb81001900b866637a829ee0f"
dependencies = [
"arrow-array",
"arrow-buffer",
@@ -263,9 +263,9 @@ dependencies = [
[[package]]
name = "arrow-ord"
-version = "49.0.0"
+version = "50.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9b23b0e53c0db57c6749997fd343d4c0354c994be7eca67152dd2bdb9a3e1bb4"
+checksum = "1ed9630979034077982d8e74a942b7ac228f33dd93a93b615b4d02ad60c260be"
dependencies = [
"arrow-array",
"arrow-buffer",
@@ -278,9 +278,9 @@ dependencies = [
[[package]]
name = "arrow-row"
-version = "49.0.0"
+version = "50.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "361249898d2d6d4a6eeb7484be6ac74977e48da12a4dd81a708d620cc558117a"
+checksum = "007035e17ae09c4e8993e4cb8b5b96edf0afb927cd38e2dff27189b274d83dcf"
dependencies = [
"ahash",
"arrow-array",
@@ -293,18 +293,18 @@ dependencies = [
[[package]]
name = "arrow-schema"
-version = "49.0.0"
+version = "50.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "09e28a5e781bf1b0f981333684ad13f5901f4cd2f20589eab7cf1797da8fc167"
+checksum = "0ff3e9c01f7cd169379d269f926892d0e622a704960350d09d331be3ec9e0029"
dependencies = [
"bitflags 2.4.1",
]
[[package]]
name = "arrow-select"
-version = "49.0.0"
+version = "50.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4f6208466590960efc1d2a7172bc4ff18a67d6e25c529381d7f96ddaf0dc4036"
+checksum = "1ce20973c1912de6514348e064829e50947e35977bb9d7fb637dc99ea9ffd78c"
dependencies = [
"ahash",
"arrow-array",
@@ -316,9 +316,9 @@ dependencies = [
[[package]]
name = "arrow-string"
-version = "49.0.0"
+version = "50.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a4a48149c63c11c9ff571e50ab8f017d2a7cb71037a882b42f6354ed2da9acc7"
+checksum = "00f3b37f2aeece31a2636d1b037dabb69ef590e03bdc7eb68519b51ec86932a7"
dependencies = [
"arrow-array",
"arrow-buffer",
@@ -833,13 +833,14 @@ dependencies = [
[[package]]
name = "datafusion"
-version = "34.0.0"
+version = "35.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "193fd1e7628278d0641c5122860f9a7fd6a1d77d055838d12f55d15bbe28d4d0"
+checksum = "4328f5467f76d890fe3f924362dbc3a838c6a733f762b32d87f9e0b7bef5fb49"
dependencies = [
"ahash",
"arrow",
"arrow-array",
+ "arrow-ipc",
"arrow-schema",
"async-trait",
"bytes",
@@ -867,16 +868,15 @@ dependencies = [
"sqlparser",
"tempfile",
"tokio",
- "tokio-util",
"url",
"uuid",
]
[[package]]
name = "datafusion-common"
-version = "34.0.0"
+version = "35.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "548bc49c4a489e3de474813831ea556dc9d368f9ed8d867b1493da42e8e9f613"
+checksum = "d29a7752143b446db4a2cccd9a6517293c6b97e8c39e520ca43ccd07135a4f7e"
dependencies = [
"ahash",
"arrow",
@@ -893,9 +893,9 @@ dependencies = [
[[package]]
name = "datafusion-execution"
-version = "34.0.0"
+version = "35.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ecc865657ffcf4da5ff08bdc6436a9a833bc0aa96c3254c8d18ab8a0ad4e437d"
+checksum = "2d447650af16e138c31237f53ddaef6dd4f92f0e2d3f2f35d190e16c214ca496"
dependencies = [
"arrow",
"chrono",
@@ -914,9 +914,9 @@ dependencies = [
[[package]]
name = "datafusion-expr"
-version = "34.0.0"
+version = "35.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "33c473f72d8d81a532e63f6e562ed66dd9209dfd8e433d9712abd42444ee161e"
+checksum = "d8d19598e48a498850fb79f97a9719b1f95e7deb64a7a06f93f313e8fa1d524b"
dependencies = [
"ahash",
"arrow",
@@ -930,9 +930,9 @@ dependencies = [
[[package]]
name = "datafusion-optimizer"
-version = "34.0.0"
+version = "35.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cb6218318001d2f6783b7fffa17592318f65f26609d7aab605a3dd0c7c2e2618"
+checksum = "8b7feb0391f1fc75575acb95b74bfd276903dc37a5409fcebe160bc7ddff2010"
dependencies = [
"arrow",
"async-trait",
@@ -948,9 +948,9 @@ dependencies = [
[[package]]
name = "datafusion-physical-expr"
-version = "34.0.0"
+version = "35.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9e1ca7e35ca22f9dc506c2375b92054b03ccf91afe25c0a90b395a1473a09735"
+checksum = "e911bca609c89a54e8f014777449d8290327414d3e10c57a3e3c2122e38878d0"
dependencies = [
"ahash",
"arrow",
@@ -982,9 +982,9 @@ dependencies = [
[[package]]
name = "datafusion-physical-plan"
-version = "34.0.0"
+version = "35.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ddde97adefcca3a55257c646ffee2a95b6cac66f74d1146a6e3a6dbb37830631"
+checksum = "e96b546b8a02e9c2ab35ac6420d511f12a4701950c1eb2e568c122b4fefb0be3"
dependencies = [
"ahash",
"arrow",
@@ -1013,9 +1013,9 @@ dependencies = [
[[package]]
name = "datafusion-sql"
-version = "34.0.0"
+version = "35.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a60d9d6460a64fddb8663db41da97e6b8b0bf79da42f997ebe81722731eaf0e5"
+checksum = "2d18d36f260bbbd63aafdb55339213a23d540d3419810575850ef0a798a6b768"
dependencies = [
"arrow",
"arrow-schema",
@@ -1709,6 +1709,15 @@ dependencies = [
"libc",
]
+[[package]]
+name = "lz4_flex"
+version = "0.11.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "912b45c753ff5f7f5208307e8ace7d2a2e30d024e26d3509f3dce546c044ce15"
+dependencies = [
+ "twox-hash",
+]
+
[[package]]
name = "md-5"
version = "0.10.6"
@@ -1877,16 +1886,16 @@ dependencies = [
[[package]]
name = "object_store"
-version = "0.8.0"
+version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2524735495ea1268be33d200e1ee97455096a0846295a21548cd2f3541de7050"
+checksum = "d139f545f64630e2e3688fd9f81c470888ab01edeb72d13b4e86c566f1130000"
dependencies = [
"async-trait",
"bytes",
"chrono",
"futures",
"humantime",
- "itertools 0.11.0",
+ "itertools 0.12.0",
"parking_lot",
"percent-encoding",
"snafu",
@@ -1951,13 +1960,14 @@ dependencies = [
[[package]]
name = "parquet"
-version = "49.0.0"
+version = "50.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "af88740a842787da39b3d69ce5fbf6fce97d20211d3b299fee0a0da6430c74d4"
+checksum = "547b92ebf0c1177e3892f44c8f79757ee62e678d564a9834189725f2c5b7a750"
dependencies = [
"ahash",
"bytes",
"chrono",
+ "half 2.1.0",
"hashbrown 0.14.3",
"num",
"num-bigint",
@@ -2506,9 +2516,9 @@ checksum = "1b6b67fb9a61334225b5b790716f609cd58395f895b3fe8b328786812a40bc3b"
[[package]]
name = "sqlparser"
-version = "0.40.0"
+version = "0.41.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7c80afe31cdb649e56c0d9bb5503be9166600d68a852c38dd445636d126858e5"
+checksum = "5cc2c25a6c66789625ef164b4c7d2e548d627902280c13710d33da8222169964"
dependencies = [
"log",
"sqlparser_derive",
@@ -2737,7 +2747,6 @@ dependencies = [
"backtrace",
"bytes",
"num_cpus",
- "parking_lot",
"pin-project-lite",
"tokio-macros",
]
@@ -2764,19 +2773,6 @@ dependencies = [
"tokio",
]
-[[package]]
-name = "tokio-util"
-version = "0.7.10"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5419f34732d9eb6ee4c3578b7989078579b7f039cbbb9ca2c4da015749371e15"
-dependencies = [
- "bytes",
- "futures-core",
- "futures-sink",
- "pin-project-lite",
- "tokio",
-]
-
[[package]]
name = "tracing"
version = "0.1.40"
diff --git a/core/Cargo.toml b/core/Cargo.toml
index b4df34d..14e2717 100644
--- a/core/Cargo.toml
+++ b/core/Cargo.toml
@@ -29,12 +29,12 @@ include = [
[dependencies]
parquet-format = "4.0.0" # This must be kept in sync with that from parquet crate
-arrow = { version = "~49.0.0", features = ["prettyprint", "ffi", "chrono-tz"] }
-arrow-array = { version = "~49.0.0" }
-arrow-data = { version = "~49.0.0" }
-arrow-schema = { version = "~49.0.0" }
-arrow-string = { version = "~49.0.0" }
-parquet = { version = "~49.0.0", default-features = false, features = ["experimental"] }
+arrow = { version = "~50.0.0", features = ["prettyprint", "ffi", "chrono-tz"] }
+arrow-array = { version = "~50.0.0" }
+arrow-data = { version = "~50.0.0" }
+arrow-schema = { version = "~50.0.0" }
+arrow-string = { version = "~50.0.0" }
+parquet = { version = "~50.0.0", default-features = false, features = ["experimental"] }
half = { version = "~2.1", default-features = false }
futures = "0.3.28"
mimalloc = { version = "*", default-features = false, optional = true }
@@ -66,9 +66,9 @@ itertools = "0.11.0"
chrono = { version = "0.4", default-features = false, features = ["clock"] }
chrono-tz = { version = "0.8" }
paste = "1.0.14"
-datafusion-common = { version = "34.0.0" }
-datafusion = { default-features = false, version = "34.0.0", features = ["unicode_expressions"] }
-datafusion-physical-expr = { version = "34.0.0", default-features = false , features = ["unicode_expressions"] }
+datafusion-common = { version = "35.0.0" }
+datafusion = { default-features = false, version = "35.0.0", features = ["unicode_expressions"] }
+datafusion-physical-expr = { version = "35.0.0", default-features = false , features = ["unicode_expressions"] }
unicode-segmentation = "^1.10.1"
once_cell = "1.18.0"
regex = "1.9.6"
diff --git a/core/benches/parquet_read.rs b/core/benches/parquet_read.rs
index 7dcfab7..612d081 100644
--- a/core/benches/parquet_read.rs
+++ b/core/benches/parquet_read.rs
@@ -37,6 +37,7 @@ use comet::parquet::util::test_common::page_util::{
use perf::FlamegraphProfiler;
use rand::{prelude::StdRng, Rng, SeedableRng};
+use zstd::zstd_safe::WriteBuf;
fn bench(c: &mut Criterion) {
let expected_num_values: usize = NUM_PAGES * VALUES_PER_PAGE;
@@ -177,7 +178,7 @@ impl TestColumnReader {
fn load_page(&mut self) {
if let Some(page) = self.pages.get_next_page().unwrap() {
let num_values = page.num_values() as usize;
- let buffer = Buffer::from_slice_ref(page.buffer().data());
+ let buffer = Buffer::from_slice_ref(page.buffer().as_slice());
self.inner.set_page_v1(num_values, buffer, page.encoding());
}
}
diff --git a/core/src/execution/datafusion/planner.rs b/core/src/execution/datafusion/planner.rs
index 0cd4ace..c132724 100644
--- a/core/src/execution/datafusion/planner.rs
+++ b/core/src/execution/datafusion/planner.rs
@@ -684,7 +684,6 @@ impl PhysicalPlanner {
group_by,
agg_exprs?,
vec![None; num_agg], // no filter expressions
- vec![None; num_agg], // no order by expressions
child.clone(),
schema.clone(),
)?,
diff --git a/core/src/execution/operators/copy.rs b/core/src/execution/operators/copy.rs
index 394c1ca..c818d62 100644
--- a/core/src/execution/operators/copy.rs
+++ b/core/src/execution/operators/copy.rs
@@ -141,7 +141,8 @@ impl CopyStream {
.iter()
.map(|v| copy_or_cast_array(v))
.collect::<Result<Vec<ArrayRef>, _>>()?;
- RecordBatch::try_new(self.schema.clone(), vectors).map_err(DataFusionError::ArrowError)
+ RecordBatch::try_new(self.schema.clone(), vectors)
+ .map_err(|err| DataFusionError::ArrowError(err, None))
}
}
diff --git a/core/src/execution/operators/scan.rs b/core/src/execution/operators/scan.rs
index 09afc5f..f80db6c 100644
--- a/core/src/execution/operators/scan.rs
+++ b/core/src/execution/operators/scan.rs
@@ -204,7 +204,7 @@ impl ScanStream {
let options = RecordBatchOptions::new().with_row_count(Some(num_rows));
RecordBatch::try_new_with_options(self.schema.clone(), new_columns, &options)
- .map_err(DataFusionError::ArrowError)
+ .map_err(|err| DataFusionError::ArrowError(err, None))
}
}
diff --git a/core/src/parquet/util/test_common/page_util.rs b/core/src/parquet/util/test_common/page_util.rs
index b366994..efd3f38 100644
--- a/core/src/parquet/util/test_common/page_util.rs
+++ b/core/src/parquet/util/test_common/page_util.rs
@@ -29,16 +29,17 @@ use parquet::{
},
errors::Result,
schema::types::{ColumnDescPtr, SchemaDescPtr},
- util::memory::ByteBufferPtr,
};
use super::random_numbers_range;
+use bytes::Bytes;
+use zstd::zstd_safe::WriteBuf;
pub trait DataPageBuilder {
fn add_rep_levels(&mut self, max_level: i16, rep_levels: &[i16]);
fn add_def_levels(&mut self, max_level: i16, def_levels: &[i16]);
fn add_values<T: DataType>(&mut self, encoding: Encoding, values: &[T::T]);
- fn add_indices(&mut self, indices: ByteBufferPtr);
+ fn add_indices(&mut self, indices: Bytes);
fn consume(self) -> Page;
}
@@ -126,18 +127,18 @@ impl DataPageBuilder for DataPageBuilderImpl {
let encoded_values = encoder
.flush_buffer()
.expect("consume_buffer() should be OK");
- self.buffer.extend_from_slice(encoded_values.data());
+ self.buffer.extend_from_slice(encoded_values.as_slice());
}
- fn add_indices(&mut self, indices: ByteBufferPtr) {
+ fn add_indices(&mut self, indices: Bytes) {
self.encoding = Some(Encoding::RLE_DICTIONARY);
- self.buffer.extend_from_slice(indices.data());
+ self.buffer.extend_from_slice(indices.as_ref());
}
fn consume(self) -> Page {
if self.datapage_v2 {
Page::DataPageV2 {
- buf: ByteBufferPtr::new(self.buffer),
+ buf: Bytes::copy_from_slice(&self.buffer),
num_values: self.num_values,
encoding: self.encoding.unwrap(),
num_nulls: 0, /* set to dummy value - don't need this when
reading
@@ -151,7 +152,7 @@ impl DataPageBuilder for DataPageBuilderImpl {
}
} else {
Page::DataPage {
- buf: ByteBufferPtr::new(self.buffer),
+ buf: Bytes::copy_from_slice(&self.buffer),
num_values: self.num_values,
encoding: self.encoding.unwrap(),
def_level_encoding: Encoding::RLE,