This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch logical-types
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/logical-types by this push:
new 861a8b7bc6 [logical-types branch] Backport native and logical types (#13016)
861a8b7bc6 is described below
commit 861a8b7bc6d8c7fb69d43f4e369933ce6b5a3018
Author: Filippo Rossi <[email protected]>
AuthorDate: Mon Oct 21 22:00:10 2024 +0200
[logical-types branch] Backport native and logical types (#13016)
* Backport native and logical types from #12853
* Fix clippy error on wasmtest (#12844)
---------
Co-authored-by: Jonah Gao <[email protected]>
---
datafusion/common/src/lib.rs | 1 +
datafusion/common/src/types/builtin.rs | 39 ++++
datafusion/common/src/types/logical.rs | 118 ++++++++++++
datafusion/common/src/types/mod.rs | 22 +++
datafusion/common/src/types/native.rs | 328 +++++++++++++++++++++++++++++++++
datafusion/wasmtest/Cargo.toml | 3 +-
datafusion/wasmtest/src/lib.rs | 5 +-
7 files changed, 513 insertions(+), 3 deletions(-)
diff --git a/datafusion/common/src/lib.rs b/datafusion/common/src/lib.rs
index 10541e0191..b8ba1ed4e8 100644
--- a/datafusion/common/src/lib.rs
+++ b/datafusion/common/src/lib.rs
@@ -43,6 +43,7 @@ pub mod scalar;
pub mod stats;
pub mod test_util;
pub mod tree_node;
+pub mod types;
pub mod utils;
/// Reexport arrow crate
diff --git a/datafusion/common/src/types/builtin.rs b/datafusion/common/src/types/builtin.rs
new file mode 100644
index 0000000000..51fca8f21e
--- /dev/null
+++ b/datafusion/common/src/types/builtin.rs
@@ -0,0 +1,39 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use super::{LogicalType, NativeType};
+
+#[derive(Debug)]
+pub struct BuiltinType {
+ native: NativeType,
+}
+
+impl LogicalType for BuiltinType {
+ fn native(&self) -> &NativeType {
+ &self.native
+ }
+
+ fn name(&self) -> Option<&str> {
+ None
+ }
+}
+
+impl From<NativeType> for BuiltinType {
+ fn from(native: NativeType) -> Self {
+ Self { native }
+ }
+}
diff --git a/datafusion/common/src/types/logical.rs b/datafusion/common/src/types/logical.rs
new file mode 100644
index 0000000000..0121e33d9d
--- /dev/null
+++ b/datafusion/common/src/types/logical.rs
@@ -0,0 +1,118 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use core::fmt;
+use std::{cmp::Ordering, hash::Hash, sync::Arc};
+
+use super::NativeType;
+
+/// Signature that uniquely identifies a type among other types.
+#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
+pub enum TypeSignature<'a> {
+ /// Represents a built-in native type.
+ Native(&'a NativeType),
+ /// Represents an arrow-compatible extension type.
+ /// (<https://arrow.apache.org/docs/format/Columnar.html#extension-types>)
+ ///
+ /// The `name` should contain the same value as 'ARROW:extension:name'.
+ Extension {
+ name: &'a str,
+ parameters: &'a [TypeParameter<'a>],
+ },
+}
+
+#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
+pub enum TypeParameter<'a> {
+ Type(TypeSignature<'a>),
+ Number(i128),
+}
+
+/// A reference counted [`LogicalType`].
+pub type LogicalTypeRef = Arc<dyn LogicalType>;
+
+/// Representation of a logical type with its signature and its native backing
+/// type.
+///
+/// The logical type is meant to be used during the DataFusion logical planning
+/// phase in order to reason about logical types without worrying about their
+/// underlying physical implementation.
+///
+/// ### Extension types
+///
+/// [`LogicalType`] is a trait in order to allow the possibility of declaring
+/// extension types:
+///
+/// ```
+/// use datafusion_common::types::{LogicalType, NativeType, TypeSignature};
+///
+/// struct JSON {}
+///
+/// impl LogicalType for JSON {
+/// fn native(&self) -> &NativeType {
+/// &NativeType::Utf8
+/// }
+///
+/// fn signature(&self) -> TypeSignature<'_> {
+/// TypeSignature::Extension {
+/// name: "JSON",
+/// parameters: &[],
+/// }
+/// }
+/// }
+/// ```
+pub trait LogicalType: Sync + Send {
+ fn native(&self) -> &NativeType;
+ fn signature(&self) -> TypeSignature<'_>;
+}
+
+impl fmt::Debug for dyn LogicalType {
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+ f.debug_tuple("LogicalType")
+ .field(&self.signature())
+ .field(&self.native())
+ .finish()
+ }
+}
+
+impl PartialEq for dyn LogicalType {
+ fn eq(&self, other: &Self) -> bool {
+ self.native().eq(other.native()) && self.signature().eq(&other.signature())
+ }
+}
+
+impl Eq for dyn LogicalType {}
+
+impl PartialOrd for dyn LogicalType {
+ fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+ Some(self.cmp(other))
+ }
+}
+
+impl Ord for dyn LogicalType {
+ fn cmp(&self, other: &Self) -> Ordering {
+ self.signature()
+ .cmp(&other.signature())
+ .then(self.native().cmp(other.native()))
+ }
+}
+
+impl Hash for dyn LogicalType {
+ fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
+ self.signature().hash(state);
+ self.native().hash(state);
+ }
+}
diff --git a/datafusion/common/src/types/mod.rs b/datafusion/common/src/types/mod.rs
new file mode 100644
index 0000000000..4e1bcb75cb
--- /dev/null
+++ b/datafusion/common/src/types/mod.rs
@@ -0,0 +1,22 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+mod logical;
+mod native;
+
+pub use logical::*;
+pub use native::*;
diff --git a/datafusion/common/src/types/native.rs b/datafusion/common/src/types/native.rs
new file mode 100644
index 0000000000..1ebf4e19b0
--- /dev/null
+++ b/datafusion/common/src/types/native.rs
@@ -0,0 +1,328 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::{ops::Deref, sync::Arc};
+
+use arrow_schema::{DataType, Field, Fields, IntervalUnit, TimeUnit, UnionFields};
+
+use super::{LogicalType, TypeSignature};
+
+/// A record of a native type, its name and its nullability.
+#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
+pub struct NativeField {
+ name: String,
+ native_type: NativeType,
+ nullable: bool,
+}
+
+impl NativeField {
+ pub fn name(&self) -> &str {
+ &self.name
+ }
+
+ pub fn native_type(&self) -> &NativeType {
+ &self.native_type
+ }
+
+ pub fn nullable(&self) -> bool {
+ self.nullable
+ }
+}
+
+/// A reference counted [`NativeField`].
+pub type NativeFieldRef = Arc<NativeField>;
+
+/// A cheaply cloneable, owned collection of [`NativeFieldRef`].
+#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
+pub struct NativeFields(Arc<[NativeFieldRef]>);
+
+impl Deref for NativeFields {
+ type Target = [NativeFieldRef];
+
+ fn deref(&self) -> &Self::Target {
+ self.0.as_ref()
+ }
+}
+
+/// A cheaply cloneable, owned collection of [`NativeFieldRef`] and their
+/// corresponding type ids.
+#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
+pub struct NativeUnionFields(Arc<[(i8, NativeFieldRef)]>);
+
+impl Deref for NativeUnionFields {
+ type Target = [(i8, NativeFieldRef)];
+
+ fn deref(&self) -> &Self::Target {
+ self.0.as_ref()
+ }
+}
+
+/// Representation of a type that DataFusion can handle natively. It is a subset
+/// of the physical variants in Arrow's native [`DataType`].
+#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
+pub enum NativeType {
+ /// Null type
+ Null,
+ /// A boolean type representing the values `true` and `false`.
+ Boolean,
+ /// A signed 8-bit integer.
+ Int8,
+ /// A signed 16-bit integer.
+ Int16,
+ /// A signed 32-bit integer.
+ Int32,
+ /// A signed 64-bit integer.
+ Int64,
+ /// An unsigned 8-bit integer.
+ UInt8,
+ /// An unsigned 16-bit integer.
+ UInt16,
+ /// An unsigned 32-bit integer.
+ UInt32,
+ /// An unsigned 64-bit integer.
+ UInt64,
+ /// A 16-bit floating point number.
+ Float16,
+ /// A 32-bit floating point number.
+ Float32,
+ /// A 64-bit floating point number.
+ Float64,
+ /// A timestamp with an optional timezone.
+ ///
+ /// Time is measured as a Unix epoch, counting the seconds from
+ /// 00:00:00.000 on 1 January 1970, excluding leap seconds,
+ /// as a signed 64-bit integer.
+ ///
+ /// The time zone is a string indicating the name of a time zone, one of:
+ ///
+ /// * As used in the Olson time zone database (the "tz database" or
+ /// "tzdata"), such as "America/New_York"
+ /// * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30
+ ///
+ /// Timestamps with a non-empty timezone
+ /// ------------------------------------
+ ///
+ /// If a Timestamp column has a non-empty timezone value, its epoch is
+ /// 1970-01-01 00:00:00 (January 1st 1970, midnight) in the *UTC* timezone
+ /// (the Unix epoch), regardless of the Timestamp's own timezone.
+ ///
+ /// Therefore, timestamp values with a non-empty timezone correspond to
+ /// physical points in time together with some additional information about
+ /// how the data was obtained and/or how to display it (the timezone).
+ ///
+ /// For example, the timestamp value 0 with the timezone string "Europe/Paris"
+ /// corresponds to "January 1st 1970, 00h00" in the UTC timezone, but the
+ /// application may prefer to display it as "January 1st 1970, 01h00" in
+ /// the Europe/Paris timezone (which is the same physical point in time).
+ ///
+ /// One consequence is that timestamp values with a non-empty timezone
+ /// can be compared and ordered directly, since they all share the same
+ /// well-known point of reference (the Unix epoch).
+ ///
+ /// Timestamps with an unset / empty timezone
+ /// -----------------------------------------
+ ///
+ /// If a Timestamp column has no timezone value, its epoch is
+ /// 1970-01-01 00:00:00 (January 1st 1970, midnight) in an *unknown* timezone.
+ ///
+ /// Therefore, timestamp values without a timezone cannot be meaningfully
+ /// interpreted as physical points in time, but only as calendar / clock
+ /// indications ("wall clock time") in an unspecified timezone.
+ ///
+ /// For example, the timestamp value 0 with an empty timezone string
+ /// corresponds to "January 1st 1970, 00h00" in an unknown timezone: there
+ /// is not enough information to interpret it as a well-defined physical
+ /// point in time.
+ ///
+ /// One consequence is that timestamp values without a timezone cannot
+ /// be reliably compared or ordered, since they may have different points of
+ /// reference. In particular, it is *not* possible to interpret an unset
+ /// or empty timezone as the same as "UTC".
+ ///
+ /// Conversion between timezones
+ /// ----------------------------
+ ///
+ /// If a Timestamp column has a non-empty timezone, changing the timezone
+ /// to a different non-empty value is a metadata-only operation:
+ /// the timestamp values need not change as their point of reference remains
+ /// the same (the Unix epoch).
+ ///
+ /// However, if a Timestamp column has no timezone value, changing it to a
+ /// non-empty value requires to think about the desired semantics.
+ /// One possibility is to assume that the original timestamp values are
+ /// relative to the epoch of the timezone being set; timestamp values should
+ /// then adjusted to the Unix epoch (for example, changing the timezone from
+ /// empty to "Europe/Paris" would require converting the timestamp values
+ /// from "Europe/Paris" to "UTC", which seems counter-intuitive but is
+ /// nevertheless correct).
+ ///
+ /// ```
+ /// # use arrow_schema::{DataType, TimeUnit};
+ /// DataType::Timestamp(TimeUnit::Second, None);
+ /// DataType::Timestamp(TimeUnit::Second, Some("literal".into()));
+ /// DataType::Timestamp(TimeUnit::Second, Some("string".to_string().into()));
+ /// ```
+ Timestamp(TimeUnit, Option<Arc<str>>),
+ /// A signed date representing the elapsed time since UNIX epoch (1970-01-01)
+ /// in days.
+ Date,
+ /// A signed time representing the elapsed time since midnight in the unit of `TimeUnit`.
+ Time(TimeUnit),
+ /// Measure of elapsed time in either seconds, milliseconds, microseconds or nanoseconds.
+ Duration(TimeUnit),
+ /// A "calendar" interval which models types that don't necessarily
+ /// have a precise duration without the context of a base timestamp (e.g.
+ /// days can differ in length during day light savings time transitions).
+ Interval(IntervalUnit),
+ /// Opaque binary data of variable length.
+ Binary,
+ /// Opaque binary data of fixed size.
+ /// Enum parameter specifies the number of bytes per value.
+ FixedSizeBinary(i32),
+ /// A variable-length string in Unicode with UTF-8 encoding.
+ Utf8,
+ /// A list of some logical data type with variable length.
+ List(NativeFieldRef),
+ /// A list of some logical data type with fixed length.
+ FixedSizeList(NativeFieldRef, i32),
+ /// A nested type that contains a number of sub-fields.
+ Struct(NativeFields),
+ /// A nested type that can represent slots of differing types.
+ Union(NativeUnionFields),
+ /// Decimal value with precision and scale
+ ///
+ /// * precision is the total number of digits
+ /// * scale is the number of digits past the decimal
+ ///
+ /// For example the number 123.45 has precision 5 and scale 2.
+ ///
+ /// In certain situations, scale could be negative number. For
+ /// negative scale, it is the number of padding 0 to the right
+ /// of the digits.
+ ///
+ /// For example the number 12300 could be treated as a decimal
+ /// has precision 3 and scale -2.
+ Decimal(u8, i8),
+ /// A Map is a type that an association between a key and a value.
+ ///
+ /// The key and value types are not constrained, but keys should be
+ /// hashable and unique.
+ ///
+ /// In a field with Map type, the field has a child Struct field, which then
+ /// has two children: key type and the second the value type. The names of the
+ /// child fields may be respectively "entries", "key", and "value", but this is
+ /// not enforced.
+ Map(NativeFieldRef),
+}
+
+impl LogicalType for NativeType {
+ fn native(&self) -> &NativeType {
+ self
+ }
+
+ fn signature(&self) -> TypeSignature<'_> {
+ TypeSignature::Native(self)
+ }
+}
+
+// The following From<DataType>, From<Field>, ... implementations are temporary
+// mapping solutions to provide backwards compatibility while transitioning from
+// the purely physical system to a logical / physical system.
+
+impl From<DataType> for NativeType {
+ fn from(value: DataType) -> Self {
+ use NativeType::*;
+ match value {
+ DataType::Null => Null,
+ DataType::Boolean => Boolean,
+ DataType::Int8 => Int8,
+ DataType::Int16 => Int16,
+ DataType::Int32 => Int32,
+ DataType::Int64 => Int64,
+ DataType::UInt8 => UInt8,
+ DataType::UInt16 => UInt16,
+ DataType::UInt32 => UInt32,
+ DataType::UInt64 => UInt64,
+ DataType::Float16 => Float16,
+ DataType::Float32 => Float32,
+ DataType::Float64 => Float64,
+ DataType::Timestamp(tu, tz) => Timestamp(tu, tz),
+ DataType::Date32 | DataType::Date64 => Date,
+ DataType::Time32(tu) | DataType::Time64(tu) => Time(tu),
+ DataType::Duration(tu) => Duration(tu),
+ DataType::Interval(iu) => Interval(iu),
+ DataType::Binary | DataType::LargeBinary | DataType::BinaryView => Binary,
+ DataType::FixedSizeBinary(size) => FixedSizeBinary(size),
+ DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View => Utf8,
+ DataType::List(field)
+ | DataType::ListView(field)
+ | DataType::LargeList(field)
+ | DataType::LargeListView(field) => List(Arc::new(field.as_ref().into())),
+ DataType::FixedSizeList(field, size) => {
+ FixedSizeList(Arc::new(field.as_ref().into()), size)
+ }
+ DataType::Struct(fields) => Struct(NativeFields::from(&fields)),
+ DataType::Union(union_fields, _) => {
+ Union(NativeUnionFields::from(&union_fields))
+ }
+ DataType::Dictionary(_, data_type) => data_type.as_ref().clone().into(),
+ DataType::Decimal128(p, s) | DataType::Decimal256(p, s) => Decimal(p, s),
+ DataType::Map(field, _) => Map(Arc::new(field.as_ref().into())),
+ DataType::RunEndEncoded(_, field) => field.data_type().clone().into(),
+ }
+ }
+}
+
+impl From<&Field> for NativeField {
+ fn from(value: &Field) -> Self {
+ Self {
+ name: value.name().clone(),
+ native_type: value.data_type().clone().into(),
+ nullable: value.is_nullable(),
+ }
+ }
+}
+
+impl From<&Fields> for NativeFields {
+ fn from(value: &Fields) -> Self {
+ value
+ .iter()
+ .map(|field| Arc::new(NativeField::from(field.as_ref())))
+ .collect()
+ }
+}
+
+impl FromIterator<NativeFieldRef> for NativeFields {
+ fn from_iter<T: IntoIterator<Item = NativeFieldRef>>(iter: T) -> Self {
+ Self(iter.into_iter().collect())
+ }
+}
+
+impl From<&UnionFields> for NativeUnionFields {
+ fn from(value: &UnionFields) -> Self {
+ value
+ .iter()
+ .map(|(i, field)| (i, Arc::new(NativeField::from(field.as_ref()))))
+ .collect()
+ }
+}
+
+impl FromIterator<(i8, NativeFieldRef)> for NativeUnionFields {
+ fn from_iter<T: IntoIterator<Item = (i8, NativeFieldRef)>>(iter: T) -> Self {
+ Self(iter.into_iter().collect())
+ }
+}
diff --git a/datafusion/wasmtest/Cargo.toml b/datafusion/wasmtest/Cargo.toml
index 46e157aecf..2440244d08 100644
--- a/datafusion/wasmtest/Cargo.toml
+++ b/datafusion/wasmtest/Cargo.toml
@@ -60,4 +60,5 @@ wasm-bindgen = "0.2.87"
wasm-bindgen-futures = "0.4.40"
[dev-dependencies]
-wasm-bindgen-test = "0.3"
+tokio = { workspace = true }
+wasm-bindgen-test = "0.3.44"
diff --git a/datafusion/wasmtest/src/lib.rs b/datafusion/wasmtest/src/lib.rs
index 0f24449cbe..085064d16d 100644
--- a/datafusion/wasmtest/src/lib.rs
+++ b/datafusion/wasmtest/src/lib.rs
@@ -87,13 +87,14 @@ mod test {
wasm_bindgen_test::wasm_bindgen_test_configure!(run_in_browser);
- #[wasm_bindgen_test]
+ #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)]
+ #[cfg_attr(not(target_arch = "wasm32"), allow(dead_code))]
fn datafusion_test() {
basic_exprs();
basic_parse();
}
- #[wasm_bindgen_test]
+ #[wasm_bindgen_test(unsupported = tokio::test)]
async fn basic_execute() {
let sql = "SELECT 2 + 2;";
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]