This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch logical-types
in repository https://gitbox.apache.org/repos/asf/datafusion.git


The following commit(s) were added to refs/heads/logical-types by this push:
     new 861a8b7bc6 [logical-types branch] Backport native and logical types (#13016)
861a8b7bc6 is described below

commit 861a8b7bc6d8c7fb69d43f4e369933ce6b5a3018
Author: Filippo Rossi <[email protected]>
AuthorDate: Mon Oct 21 22:00:10 2024 +0200

    [logical-types branch] Backport native and logical types (#13016)
    
    * Backport native and logical types from #12853
    
    * Fix clippy error on wasmtest (#12844)
    
    ---------
    
    Co-authored-by: Jonah Gao <[email protected]>
---
 datafusion/common/src/lib.rs           |   1 +
 datafusion/common/src/types/builtin.rs |  39 ++++
 datafusion/common/src/types/logical.rs | 118 ++++++++++++
 datafusion/common/src/types/mod.rs     |  22 +++
 datafusion/common/src/types/native.rs  | 328 +++++++++++++++++++++++++++++++++
 datafusion/wasmtest/Cargo.toml         |   3 +-
 datafusion/wasmtest/src/lib.rs         |   5 +-
 7 files changed, 513 insertions(+), 3 deletions(-)

diff --git a/datafusion/common/src/lib.rs b/datafusion/common/src/lib.rs
index 10541e0191..b8ba1ed4e8 100644
--- a/datafusion/common/src/lib.rs
+++ b/datafusion/common/src/lib.rs
@@ -43,6 +43,7 @@ pub mod scalar;
 pub mod stats;
 pub mod test_util;
 pub mod tree_node;
+pub mod types;
 pub mod utils;
 
 /// Reexport arrow crate
diff --git a/datafusion/common/src/types/builtin.rs b/datafusion/common/src/types/builtin.rs
new file mode 100644
index 0000000000..51fca8f21e
--- /dev/null
+++ b/datafusion/common/src/types/builtin.rs
@@ -0,0 +1,39 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use super::{LogicalType, NativeType};
+
+#[derive(Debug)]
+pub struct BuiltinType {
+    native: NativeType,
+}
+
+impl LogicalType for BuiltinType {
+    fn native(&self) -> &NativeType {
+        &self.native
+    }
+
+    fn name(&self) -> Option<&str> {
+        None
+    }
+}
+
+impl From<NativeType> for BuiltinType {
+    fn from(native: NativeType) -> Self {
+        Self { native }
+    }
+}
diff --git a/datafusion/common/src/types/logical.rs b/datafusion/common/src/types/logical.rs
new file mode 100644
index 0000000000..0121e33d9d
--- /dev/null
+++ b/datafusion/common/src/types/logical.rs
@@ -0,0 +1,118 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use core::fmt;
+use std::{cmp::Ordering, hash::Hash, sync::Arc};
+
+use super::NativeType;
+
+/// Signature that uniquely identifies a type among other types.
+#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
+pub enum TypeSignature<'a> {
+    /// Represents a built-in native type.
+    Native(&'a NativeType),
+    /// Represents an arrow-compatible extension type.
+    /// (<https://arrow.apache.org/docs/format/Columnar.html#extension-types>)
+    ///
+    /// The `name` should contain the same value as 'ARROW:extension:name'.
+    Extension {
+        name: &'a str,
+        parameters: &'a [TypeParameter<'a>],
+    },
+}
+
+#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
+pub enum TypeParameter<'a> {
+    Type(TypeSignature<'a>),
+    Number(i128),
+}
+
+/// A reference counted [`LogicalType`].
+pub type LogicalTypeRef = Arc<dyn LogicalType>;
+
+/// Representation of a logical type with its signature and its native backing
+/// type.
+///
+/// The logical type is meant to be used during the DataFusion logical planning
+/// phase in order to reason about logical types without worrying about their
+/// underlying physical implementation.
+///
+/// ### Extension types
+///
+/// [`LogicalType`] is a trait in order to allow the possibility of declaring
+/// extension types:
+///
+/// ```
+/// use datafusion_common::types::{LogicalType, NativeType, TypeSignature};
+///
+/// struct JSON {}
+///
+/// impl LogicalType for JSON {
+///     fn native(&self) -> &NativeType {
+///         &NativeType::Utf8
+///     }
+///
+///    fn signature(&self) -> TypeSignature<'_> {
+///        TypeSignature::Extension {
+///            name: "JSON",
+///            parameters: &[],
+///        }
+///    }
+/// }
+/// ```
+pub trait LogicalType: Sync + Send {
+    fn native(&self) -> &NativeType;
+    fn signature(&self) -> TypeSignature<'_>;
+}
+
+impl fmt::Debug for dyn LogicalType {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_tuple("LogicalType")
+            .field(&self.signature())
+            .field(&self.native())
+            .finish()
+    }
+}
+
+impl PartialEq for dyn LogicalType {
+    fn eq(&self, other: &Self) -> bool {
+        self.native().eq(other.native()) && 
self.signature().eq(&other.signature())
+    }
+}
+
+impl Eq for dyn LogicalType {}
+
+impl PartialOrd for dyn LogicalType {
+    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
+impl Ord for dyn LogicalType {
+    fn cmp(&self, other: &Self) -> Ordering {
+        self.signature()
+            .cmp(&other.signature())
+            .then(self.native().cmp(other.native()))
+    }
+}
+
+impl Hash for dyn LogicalType {
+    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
+        self.signature().hash(state);
+        self.native().hash(state);
+    }
+}
diff --git a/datafusion/common/src/types/mod.rs b/datafusion/common/src/types/mod.rs
new file mode 100644
index 0000000000..4e1bcb75cb
--- /dev/null
+++ b/datafusion/common/src/types/mod.rs
@@ -0,0 +1,22 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+mod logical;
+mod native;
+
+pub use logical::*;
+pub use native::*;
diff --git a/datafusion/common/src/types/native.rs b/datafusion/common/src/types/native.rs
new file mode 100644
index 0000000000..1ebf4e19b0
--- /dev/null
+++ b/datafusion/common/src/types/native.rs
@@ -0,0 +1,328 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::{ops::Deref, sync::Arc};
+
+use arrow_schema::{DataType, Field, Fields, IntervalUnit, TimeUnit, 
UnionFields};
+
+use super::{LogicalType, TypeSignature};
+
+/// A record of a native type, its name and its nullability.
+#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
+pub struct NativeField {
+    name: String,
+    native_type: NativeType,
+    nullable: bool,
+}
+
+impl NativeField {
+    pub fn name(&self) -> &str {
+        &self.name
+    }
+
+    pub fn native_type(&self) -> &NativeType {
+        &self.native_type
+    }
+
+    pub fn nullable(&self) -> bool {
+        self.nullable
+    }
+}
+
+/// A reference counted [`NativeField`].
+pub type NativeFieldRef = Arc<NativeField>;
+
+/// A cheaply cloneable, owned collection of [`NativeFieldRef`].
+#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
+pub struct NativeFields(Arc<[NativeFieldRef]>);
+
+impl Deref for NativeFields {
+    type Target = [NativeFieldRef];
+
+    fn deref(&self) -> &Self::Target {
+        self.0.as_ref()
+    }
+}
+
+/// A cheaply cloneable, owned collection of [`NativeFieldRef`] and their
+/// corresponding type ids.
+#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
+pub struct NativeUnionFields(Arc<[(i8, NativeFieldRef)]>);
+
+impl Deref for NativeUnionFields {
+    type Target = [(i8, NativeFieldRef)];
+
+    fn deref(&self) -> &Self::Target {
+        self.0.as_ref()
+    }
+}
+
+/// Representation of a type that DataFusion can handle natively. It is a subset
+/// of the physical variants in Arrow's native [`DataType`].
+#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
+pub enum NativeType {
+    /// Null type
+    Null,
+    /// A boolean type representing the values `true` and `false`.
+    Boolean,
+    /// A signed 8-bit integer.
+    Int8,
+    /// A signed 16-bit integer.
+    Int16,
+    /// A signed 32-bit integer.
+    Int32,
+    /// A signed 64-bit integer.
+    Int64,
+    /// An unsigned 8-bit integer.
+    UInt8,
+    /// An unsigned 16-bit integer.
+    UInt16,
+    /// An unsigned 32-bit integer.
+    UInt32,
+    /// An unsigned 64-bit integer.
+    UInt64,
+    /// A 16-bit floating point number.
+    Float16,
+    /// A 32-bit floating point number.
+    Float32,
+    /// A 64-bit floating point number.
+    Float64,
+    /// A timestamp with an optional timezone.
+    ///
+    /// Time is measured as a Unix epoch, counting the seconds from
+    /// 00:00:00.000 on 1 January 1970, excluding leap seconds,
+    /// as a signed 64-bit integer.
+    ///
+    /// The time zone is a string indicating the name of a time zone, one of:
+    ///
+    /// * As used in the Olson time zone database (the "tz database" or
+    ///   "tzdata"), such as "America/New_York"
+    /// * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30
+    ///
+    /// Timestamps with a non-empty timezone
+    /// ------------------------------------
+    ///
+    /// If a Timestamp column has a non-empty timezone value, its epoch is
+    /// 1970-01-01 00:00:00 (January 1st 1970, midnight) in the *UTC* timezone
+    /// (the Unix epoch), regardless of the Timestamp's own timezone.
+    ///
+    /// Therefore, timestamp values with a non-empty timezone correspond to
+    /// physical points in time together with some additional information about
+    /// how the data was obtained and/or how to display it (the timezone).
+    ///
+    ///   For example, the timestamp value 0 with the timezone string "Europe/Paris"
+    ///   corresponds to "January 1st 1970, 00h00" in the UTC timezone, but the
+    ///   application may prefer to display it as "January 1st 1970, 01h00" in
+    ///   the Europe/Paris timezone (which is the same physical point in time).
+    ///
+    /// One consequence is that timestamp values with a non-empty timezone
+    /// can be compared and ordered directly, since they all share the same
+    /// well-known point of reference (the Unix epoch).
+    ///
+    /// Timestamps with an unset / empty timezone
+    /// -----------------------------------------
+    ///
+    /// If a Timestamp column has no timezone value, its epoch is
+    /// 1970-01-01 00:00:00 (January 1st 1970, midnight) in an *unknown* timezone.
+    ///
+    /// Therefore, timestamp values without a timezone cannot be meaningfully
+    /// interpreted as physical points in time, but only as calendar / clock
+    /// indications ("wall clock time") in an unspecified timezone.
+    ///
+    ///   For example, the timestamp value 0 with an empty timezone string
+    ///   corresponds to "January 1st 1970, 00h00" in an unknown timezone: there
+    ///   is not enough information to interpret it as a well-defined physical
+    ///   point in time.
+    ///
+    /// One consequence is that timestamp values without a timezone cannot
+    /// be reliably compared or ordered, since they may have different points of
+    /// reference.  In particular, it is *not* possible to interpret an unset
+    /// or empty timezone as the same as "UTC".
+    ///
+    /// Conversion between timezones
+    /// ----------------------------
+    ///
+    /// If a Timestamp column has a non-empty timezone, changing the timezone
+    /// to a different non-empty value is a metadata-only operation:
+    /// the timestamp values need not change as their point of reference remains
+    /// the same (the Unix epoch).
+    ///
+    /// However, if a Timestamp column has no timezone value, changing it to a
+    /// non-empty value requires thinking about the desired semantics.
+    /// One possibility is to assume that the original timestamp values are
+    /// relative to the epoch of the timezone being set; timestamp values should
+    /// then be adjusted to the Unix epoch (for example, changing the timezone from
+    /// empty to "Europe/Paris" would require converting the timestamp values
+    /// from "Europe/Paris" to "UTC", which seems counter-intuitive but is
+    /// nevertheless correct).
+    ///
+    /// ```
+    /// # use arrow_schema::{DataType, TimeUnit};
+    /// DataType::Timestamp(TimeUnit::Second, None);
+    /// DataType::Timestamp(TimeUnit::Second, Some("literal".into()));
+    /// DataType::Timestamp(TimeUnit::Second, Some("string".to_string().into()));
+    /// ```
+    Timestamp(TimeUnit, Option<Arc<str>>),
+    /// A signed date representing the elapsed time since UNIX epoch (1970-01-01)
+    /// in days.
+    Date,
+    /// A signed time representing the elapsed time since midnight in the unit of `TimeUnit`.
+    Time(TimeUnit),
+    /// Measure of elapsed time in either seconds, milliseconds, microseconds or nanoseconds.
+    Duration(TimeUnit),
+    /// A "calendar" interval which models types that don't necessarily
+    /// have a precise duration without the context of a base timestamp (e.g.
+    /// days can differ in length during daylight saving time transitions).
+    Interval(IntervalUnit),
+    /// Opaque binary data of variable length.
+    Binary,
+    /// Opaque binary data of fixed size.
+    /// Enum parameter specifies the number of bytes per value.
+    FixedSizeBinary(i32),
+    /// A variable-length string in Unicode with UTF-8 encoding.
+    Utf8,
+    /// A list of some logical data type with variable length.
+    List(NativeFieldRef),
+    /// A list of some logical data type with fixed length.
+    FixedSizeList(NativeFieldRef, i32),
+    /// A nested type that contains a number of sub-fields.
+    Struct(NativeFields),
+    /// A nested type that can represent slots of differing types.
+    Union(NativeUnionFields),
+    /// Decimal value with precision and scale
+    ///
+    /// * precision is the total number of digits
+    /// * scale is the number of digits past the decimal
+    ///
+    /// For example the number 123.45 has precision 5 and scale 2.
+    ///
+    /// In certain situations, scale could be a negative number. For
+    /// a negative scale, it is the number of padding zeros to the right
+    /// of the digits.
+    ///
+    /// For example, the number 12300 could be treated as a decimal
+    /// that has precision 3 and scale -2.
+    Decimal(u8, i8),
+    /// A Map is a type that represents an association between a key and a value.
+    ///
+    /// The key and value types are not constrained, but keys should be
+    /// hashable and unique.
+    ///
+    /// In a field with Map type, the field has a child Struct field, which then
+    /// has two children: the first is the key type and the second the value type. The names of the
+    /// child fields may be respectively "entries", "key", and "value", but this is
+    /// not enforced.
+    Map(NativeFieldRef),
+}
+
+impl LogicalType for NativeType {
+    fn native(&self) -> &NativeType {
+        self
+    }
+
+    fn signature(&self) -> TypeSignature<'_> {
+        TypeSignature::Native(self)
+    }
+}
+
+// The following From<DataType>, From<Field>, ... implementations are temporary
+// mapping solutions to provide backwards compatibility while transitioning from
+// the purely physical system to a logical / physical system.
+
+impl From<DataType> for NativeType {
+    fn from(value: DataType) -> Self {
+        use NativeType::*;
+        match value {
+            DataType::Null => Null,
+            DataType::Boolean => Boolean,
+            DataType::Int8 => Int8,
+            DataType::Int16 => Int16,
+            DataType::Int32 => Int32,
+            DataType::Int64 => Int64,
+            DataType::UInt8 => UInt8,
+            DataType::UInt16 => UInt16,
+            DataType::UInt32 => UInt32,
+            DataType::UInt64 => UInt64,
+            DataType::Float16 => Float16,
+            DataType::Float32 => Float32,
+            DataType::Float64 => Float64,
+            DataType::Timestamp(tu, tz) => Timestamp(tu, tz),
+            DataType::Date32 | DataType::Date64 => Date,
+            DataType::Time32(tu) | DataType::Time64(tu) => Time(tu),
+            DataType::Duration(tu) => Duration(tu),
+            DataType::Interval(iu) => Interval(iu),
+            DataType::Binary | DataType::LargeBinary | DataType::BinaryView => 
Binary,
+            DataType::FixedSizeBinary(size) => FixedSizeBinary(size),
+            DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View => Utf8,
+            DataType::List(field)
+            | DataType::ListView(field)
+            | DataType::LargeList(field)
+            | DataType::LargeListView(field) => 
List(Arc::new(field.as_ref().into())),
+            DataType::FixedSizeList(field, size) => {
+                FixedSizeList(Arc::new(field.as_ref().into()), size)
+            }
+            DataType::Struct(fields) => Struct(NativeFields::from(&fields)),
+            DataType::Union(union_fields, _) => {
+                Union(NativeUnionFields::from(&union_fields))
+            }
+            DataType::Dictionary(_, data_type) => 
data_type.as_ref().clone().into(),
+            DataType::Decimal128(p, s) | DataType::Decimal256(p, s) => 
Decimal(p, s),
+            DataType::Map(field, _) => Map(Arc::new(field.as_ref().into())),
+            DataType::RunEndEncoded(_, field) => 
field.data_type().clone().into(),
+        }
+    }
+}
+
+impl From<&Field> for NativeField {
+    fn from(value: &Field) -> Self {
+        Self {
+            name: value.name().clone(),
+            native_type: value.data_type().clone().into(),
+            nullable: value.is_nullable(),
+        }
+    }
+}
+
+impl From<&Fields> for NativeFields {
+    fn from(value: &Fields) -> Self {
+        value
+            .iter()
+            .map(|field| Arc::new(NativeField::from(field.as_ref())))
+            .collect()
+    }
+}
+
+impl FromIterator<NativeFieldRef> for NativeFields {
+    fn from_iter<T: IntoIterator<Item = NativeFieldRef>>(iter: T) -> Self {
+        Self(iter.into_iter().collect())
+    }
+}
+
+impl From<&UnionFields> for NativeUnionFields {
+    fn from(value: &UnionFields) -> Self {
+        value
+            .iter()
+            .map(|(i, field)| (i, Arc::new(NativeField::from(field.as_ref()))))
+            .collect()
+    }
+}
+
+impl FromIterator<(i8, NativeFieldRef)> for NativeUnionFields {
+    fn from_iter<T: IntoIterator<Item = (i8, NativeFieldRef)>>(iter: T) -> 
Self {
+        Self(iter.into_iter().collect())
+    }
+}
diff --git a/datafusion/wasmtest/Cargo.toml b/datafusion/wasmtest/Cargo.toml
index 46e157aecf..2440244d08 100644
--- a/datafusion/wasmtest/Cargo.toml
+++ b/datafusion/wasmtest/Cargo.toml
@@ -60,4 +60,5 @@ wasm-bindgen = "0.2.87"
 wasm-bindgen-futures = "0.4.40"
 
 [dev-dependencies]
-wasm-bindgen-test = "0.3"
+tokio = { workspace = true }
+wasm-bindgen-test = "0.3.44"
diff --git a/datafusion/wasmtest/src/lib.rs b/datafusion/wasmtest/src/lib.rs
index 0f24449cbe..085064d16d 100644
--- a/datafusion/wasmtest/src/lib.rs
+++ b/datafusion/wasmtest/src/lib.rs
@@ -87,13 +87,14 @@ mod test {
 
     wasm_bindgen_test::wasm_bindgen_test_configure!(run_in_browser);
 
-    #[wasm_bindgen_test]
+    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)]
+    #[cfg_attr(not(target_arch = "wasm32"), allow(dead_code))]
     fn datafusion_test() {
         basic_exprs();
         basic_parse();
     }
 
-    #[wasm_bindgen_test]
+    #[wasm_bindgen_test(unsupported = tokio::test)]
     async fn basic_execute() {
         let sql = "SELECT 2 + 2;";
 


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to