This is an automated email from the ASF dual-hosted git repository.

jayzhan pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new b40a298a3a feat(logical-types): add NativeType and LogicalType (#12853)
b40a298a3a is described below

commit b40a298a3a8e7eb0546c06168ef19b44b28acf42
Author: Filippo Rossi <[email protected]>
AuthorDate: Sun Nov 3 04:23:53 2024 +0100

    feat(logical-types): add NativeType and LogicalType (#12853)
    
    * [logical-types] add NativeType and LogicalType
    
    * Add license header
    
    * Add NativeField and derivates
    
    * Support TypeSignatures
    
    * Fix doc
    
    * Add documentation
    
    * Fix doc tests
    
    * Remove dummy test
    
    * From NativeField to LogicalField
    
    * Add default_cast_for
    
    * Add type order with can_cast_types
    
    * Rename NativeType Utf8 to String
    
    * NativeType from &DataType
    
    * Add builtin types
    
    * From LazyLock to OnceLock
---
 datafusion/common/src/lib.rs           |   1 +
 datafusion/common/src/types/builtin.rs |  49 ++++
 datafusion/common/src/types/field.rs   | 114 ++++++++++
 datafusion/common/src/types/logical.rs | 128 +++++++++++
 datafusion/common/src/types/mod.rs     |  26 +++
 datafusion/common/src/types/native.rs  | 399 +++++++++++++++++++++++++++++++++
 6 files changed, 717 insertions(+)

diff --git a/datafusion/common/src/lib.rs b/datafusion/common/src/lib.rs
index e4575038ab..08431a36e8 100644
--- a/datafusion/common/src/lib.rs
+++ b/datafusion/common/src/lib.rs
@@ -44,6 +44,7 @@ pub mod scalar;
 pub mod stats;
 pub mod test_util;
 pub mod tree_node;
+pub mod types;
 pub mod utils;
 
 /// Reexport arrow crate
diff --git a/datafusion/common/src/types/builtin.rs 
b/datafusion/common/src/types/builtin.rs
new file mode 100644
index 0000000000..c6105d37c3
--- /dev/null
+++ b/datafusion/common/src/types/builtin.rs
@@ -0,0 +1,49 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::types::{LogicalTypeRef, NativeType};
+use std::sync::{Arc, OnceLock};
+
+macro_rules! singleton {
+    ($name:ident, $getter:ident, $ty:ident) => {
+        // TODO: Use LazyLock instead of getter function when MSRV gets bumped
+        static $name: OnceLock<LogicalTypeRef> = OnceLock::new();
+
+        #[doc = "Getter for singleton instance of a logical type representing"]
+        #[doc = concat!("[`NativeType::", stringify!($ty), "`].")]
+        pub fn $getter() -> LogicalTypeRef {
+            Arc::clone($name.get_or_init(|| Arc::new(NativeType::$ty)))
+        }
+    };
+}
+
+singleton!(LOGICAL_NULL, logical_null, Null);
+singleton!(LOGICAL_BOOLEAN, logical_boolean, Boolean);
+singleton!(LOGICAL_INT8, logical_int8, Int8);
+singleton!(LOGICAL_INT16, logical_int16, Int16);
+singleton!(LOGICAL_INT32, logical_int32, Int32);
+singleton!(LOGICAL_INT64, logical_int64, Int64);
+singleton!(LOGICAL_UINT8, logical_uint8, UInt8);
+singleton!(LOGICAL_UINT16, logical_uint16, UInt16);
+singleton!(LOGICAL_UINT32, logical_uint32, UInt32);
+singleton!(LOGICAL_UINT64, logical_uint64, UInt64);
+singleton!(LOGICAL_FLOAT16, logical_float16, Float16);
+singleton!(LOGICAL_FLOAT32, logical_float32, Float32);
+singleton!(LOGICAL_FLOAT64, logical_float64, Float64);
+singleton!(LOGICAL_DATE, logical_date, Date);
+singleton!(LOGICAL_BINARY, logical_binary, Binary);
+singleton!(LOGICAL_STRING, logical_string, String);
diff --git a/datafusion/common/src/types/field.rs 
b/datafusion/common/src/types/field.rs
new file mode 100644
index 0000000000..85c7c15727
--- /dev/null
+++ b/datafusion/common/src/types/field.rs
@@ -0,0 +1,114 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow_schema::{Field, Fields, UnionFields};
+use std::hash::{Hash, Hasher};
+use std::{ops::Deref, sync::Arc};
+
+use super::{LogicalTypeRef, NativeType};
+
+/// A record of a logical type, its name and its nullability.
+#[derive(Debug, Clone, Eq, PartialOrd, Ord)]
+pub struct LogicalField {
+    pub name: String,
+    pub logical_type: LogicalTypeRef,
+    pub nullable: bool,
+}
+
+impl PartialEq for LogicalField {
+    fn eq(&self, other: &Self) -> bool {
+        self.name == other.name
+            && self.logical_type.eq(&other.logical_type)
+            && self.nullable == other.nullable
+    }
+}
+
+impl Hash for LogicalField {
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        self.name.hash(state);
+        self.logical_type.hash(state);
+        self.nullable.hash(state);
+    }
+}
+
+impl From<&Field> for LogicalField {
+    fn from(value: &Field) -> Self {
+        Self {
+            name: value.name().clone(),
+            logical_type: 
Arc::new(NativeType::from(value.data_type().clone())),
+            nullable: value.is_nullable(),
+        }
+    }
+}
+
+/// A reference counted [`LogicalField`].
+pub type LogicalFieldRef = Arc<LogicalField>;
+
+/// A cheaply cloneable, owned collection of [`LogicalFieldRef`].
+#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
+pub struct LogicalFields(Arc<[LogicalFieldRef]>);
+
+impl Deref for LogicalFields {
+    type Target = [LogicalFieldRef];
+
+    fn deref(&self) -> &Self::Target {
+        self.0.as_ref()
+    }
+}
+
+impl From<&Fields> for LogicalFields {
+    fn from(value: &Fields) -> Self {
+        value
+            .iter()
+            .map(|field| Arc::new(LogicalField::from(field.as_ref())))
+            .collect()
+    }
+}
+
+impl FromIterator<LogicalFieldRef> for LogicalFields {
+    fn from_iter<T: IntoIterator<Item = LogicalFieldRef>>(iter: T) -> Self {
+        Self(iter.into_iter().collect())
+    }
+}
+
+/// A cheaply cloneable, owned collection of [`LogicalFieldRef`] and their
+/// corresponding type ids.
+#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
+pub struct LogicalUnionFields(Arc<[(i8, LogicalFieldRef)]>);
+
+impl Deref for LogicalUnionFields {
+    type Target = [(i8, LogicalFieldRef)];
+
+    fn deref(&self) -> &Self::Target {
+        self.0.as_ref()
+    }
+}
+
+impl From<&UnionFields> for LogicalUnionFields {
+    fn from(value: &UnionFields) -> Self {
+        value
+            .iter()
+            .map(|(i, field)| (i, 
Arc::new(LogicalField::from(field.as_ref()))))
+            .collect()
+    }
+}
+
+impl FromIterator<(i8, LogicalFieldRef)> for LogicalUnionFields {
+    fn from_iter<T: IntoIterator<Item = (i8, LogicalFieldRef)>>(iter: T) -> 
Self {
+        Self(iter.into_iter().collect())
+    }
+}
diff --git a/datafusion/common/src/types/logical.rs 
b/datafusion/common/src/types/logical.rs
new file mode 100644
index 0000000000..bde393992a
--- /dev/null
+++ b/datafusion/common/src/types/logical.rs
@@ -0,0 +1,128 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use super::NativeType;
+use crate::error::Result;
+use arrow_schema::DataType;
+use core::fmt;
+use std::{cmp::Ordering, hash::Hash, sync::Arc};
+
+/// Signature that uniquely identifies a type among other types.
+#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
+pub enum TypeSignature<'a> {
+    /// Represents a built-in native type.
+    Native(&'a NativeType),
+    /// Represents an arrow-compatible extension type.
+    /// (<https://arrow.apache.org/docs/format/Columnar.html#extension-types>)
+    ///
+    /// The `name` should contain the same value as 'ARROW:extension:name'.
+    Extension {
+        name: &'a str,
+        parameters: &'a [TypeParameter<'a>],
+    },
+}
+
+#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
+pub enum TypeParameter<'a> {
+    Type(TypeSignature<'a>),
+    Number(i128),
+}
+
+/// A reference counted [`LogicalType`].
+pub type LogicalTypeRef = Arc<dyn LogicalType>;
+
+/// Representation of a logical type with its signature and its native backing
+/// type.
+///
+/// The logical type is meant to be used during the DataFusion logical planning
+/// phase in order to reason about logical types without worrying about their
+/// underlying physical implementation.
+///
+/// ### Extension types
+///
+/// [`LogicalType`] is a trait in order to allow the possibility of declaring
+/// extension types:
+///
+/// ```
+/// use datafusion_common::types::{LogicalType, NativeType, TypeSignature};
+///
+/// struct JSON {}
+///
+/// impl LogicalType for JSON {
+///     fn native(&self) -> &NativeType {
+///         &NativeType::String
+///     }
+///
+///    fn signature(&self) -> TypeSignature<'_> {
+///        TypeSignature::Extension {
+///            name: "JSON",
+///            parameters: &[],
+///        }
+///    }
+/// }
+/// ```
+pub trait LogicalType: Sync + Send {
+    /// Get the native backing type of this logical type.
+    fn native(&self) -> &NativeType;
+    /// Get the unique type signature for this logical type. Logical types 
with identical
+    /// signatures are considered equal.
+    fn signature(&self) -> TypeSignature<'_>;
+
+    /// Get the default physical type to cast `origin` to in order to obtain a 
physical type
+    /// that is logically compatible with this logical type.
+    fn default_cast_for(&self, origin: &DataType) -> Result<DataType> {
+        self.native().default_cast_for(origin)
+    }
+}
+
+impl fmt::Debug for dyn LogicalType {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.debug_tuple("LogicalType")
+            .field(&self.signature())
+            .field(&self.native())
+            .finish()
+    }
+}
+
+impl PartialEq for dyn LogicalType {
+    fn eq(&self, other: &Self) -> bool {
+        self.signature().eq(&other.signature())
+    }
+}
+
+impl Eq for dyn LogicalType {}
+
+impl PartialOrd for dyn LogicalType {
+    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
+impl Ord for dyn LogicalType {
+    /// Orders by signature only: `eq` compares signatures alone, and `Ord`
+    /// requires `cmp` to return `Equal` exactly when `eq` is true.
+    fn cmp(&self, other: &Self) -> Ordering {
+        self.signature().cmp(&other.signature())
+    }
+}
+
+impl Hash for dyn LogicalType {
+    /// Hashes the signature only, mirroring `eq`, so equal values hash alike.
+    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
+        self.signature().hash(state);
+    }
+}
diff --git a/datafusion/common/src/types/mod.rs 
b/datafusion/common/src/types/mod.rs
new file mode 100644
index 0000000000..2f9ce4ce02
--- /dev/null
+++ b/datafusion/common/src/types/mod.rs
@@ -0,0 +1,26 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+mod builtin;
+mod field;
+mod logical;
+mod native;
+
+pub use builtin::*;
+pub use field::*;
+pub use logical::*;
+pub use native::*;
diff --git a/datafusion/common/src/types/native.rs 
b/datafusion/common/src/types/native.rs
new file mode 100644
index 0000000000..bfb546783e
--- /dev/null
+++ b/datafusion/common/src/types/native.rs
@@ -0,0 +1,399 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use super::{
+    LogicalField, LogicalFieldRef, LogicalFields, LogicalType, 
LogicalUnionFields,
+    TypeSignature,
+};
+use crate::error::{Result, _internal_err};
+use arrow::compute::can_cast_types;
+use arrow_schema::{
+    DataType, Field, FieldRef, Fields, IntervalUnit, TimeUnit, UnionFields,
+};
+use std::sync::Arc;
+
+/// Representation of a type that DataFusion can handle natively. It is a 
subset
+/// of the physical variants in Arrow's native [`DataType`].
+#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
+pub enum NativeType {
+    /// Null type
+    Null,
+    /// A boolean type representing the values `true` and `false`.
+    Boolean,
+    /// A signed 8-bit integer.
+    Int8,
+    /// A signed 16-bit integer.
+    Int16,
+    /// A signed 32-bit integer.
+    Int32,
+    /// A signed 64-bit integer.
+    Int64,
+    /// An unsigned 8-bit integer.
+    UInt8,
+    /// An unsigned 16-bit integer.
+    UInt16,
+    /// An unsigned 32-bit integer.
+    UInt32,
+    /// An unsigned 64-bit integer.
+    UInt64,
+    /// A 16-bit floating point number.
+    Float16,
+    /// A 32-bit floating point number.
+    Float32,
+    /// A 64-bit floating point number.
+    Float64,
+    /// A timestamp with an optional timezone.
+    ///
+    /// Time is measured as a Unix epoch, counting the seconds from
+    /// 00:00:00.000 on 1 January 1970, excluding leap seconds,
+    /// as a signed 64-bit integer.
+    ///
+    /// The time zone is a string indicating the name of a time zone, one of:
+    ///
+    /// * As used in the Olson time zone database (the "tz database" or
+    ///   "tzdata"), such as "America/New_York"
+    /// * An absolute time zone offset of the form +XX:XX or -XX:XX, such as 
+07:30
+    ///
+    /// Timestamps with a non-empty timezone
+    /// ------------------------------------
+    ///
+    /// If a Timestamp column has a non-empty timezone value, its epoch is
+    /// 1970-01-01 00:00:00 (January 1st 1970, midnight) in the *UTC* timezone
+    /// (the Unix epoch), regardless of the Timestamp's own timezone.
+    ///
+    /// Therefore, timestamp values with a non-empty timezone correspond to
+    /// physical points in time together with some additional information about
+    /// how the data was obtained and/or how to display it (the timezone).
+    ///
+    ///   For example, the timestamp value 0 with the timezone string 
"Europe/Paris"
+    ///   corresponds to "January 1st 1970, 00h00" in the UTC timezone, but the
+    ///   application may prefer to display it as "January 1st 1970, 01h00" in
+    ///   the Europe/Paris timezone (which is the same physical point in time).
+    ///
+    /// One consequence is that timestamp values with a non-empty timezone
+    /// can be compared and ordered directly, since they all share the same
+    /// well-known point of reference (the Unix epoch).
+    ///
+    /// Timestamps with an unset / empty timezone
+    /// -----------------------------------------
+    ///
+    /// If a Timestamp column has no timezone value, its epoch is
+    /// 1970-01-01 00:00:00 (January 1st 1970, midnight) in an *unknown* 
timezone.
+    ///
+    /// Therefore, timestamp values without a timezone cannot be meaningfully
+    /// interpreted as physical points in time, but only as calendar / clock
+    /// indications ("wall clock time") in an unspecified timezone.
+    ///
+    ///   For example, the timestamp value 0 with an empty timezone string
+    ///   corresponds to "January 1st 1970, 00h00" in an unknown timezone: 
there
+    ///   is not enough information to interpret it as a well-defined physical
+    ///   point in time.
+    ///
+    /// One consequence is that timestamp values without a timezone cannot
+    /// be reliably compared or ordered, since they may have different points 
of
+    /// reference.  In particular, it is *not* possible to interpret an unset
+    /// or empty timezone as the same as "UTC".
+    ///
+    /// Conversion between timezones
+    /// ----------------------------
+    ///
+    /// If a Timestamp column has a non-empty timezone, changing the timezone
+    /// to a different non-empty value is a metadata-only operation:
+    /// the timestamp values need not change as their point of reference 
remains
+    /// the same (the Unix epoch).
+    ///
+    /// However, if a Timestamp column has no timezone value, changing it to a
+    /// non-empty value requires thinking about the desired semantics.
+    /// One possibility is to assume that the original timestamp values are
+    /// relative to the epoch of the timezone being set; timestamp values should
+    /// then be adjusted to the Unix epoch (for example, changing the timezone from
+    /// empty to "Europe/Paris" would require converting the timestamp values
+    /// from "Europe/Paris" to "UTC", which seems counter-intuitive but is
+    /// nevertheless correct).
+    ///
+    /// ```
+    /// # use arrow_schema::{DataType, TimeUnit};
+    /// DataType::Timestamp(TimeUnit::Second, None);
+    /// DataType::Timestamp(TimeUnit::Second, Some("literal".into()));
+    /// DataType::Timestamp(TimeUnit::Second, 
Some("string".to_string().into()));
+    /// ```
+    Timestamp(TimeUnit, Option<Arc<str>>),
+    /// A signed date representing the elapsed time since UNIX epoch 
(1970-01-01)
+    /// in days.
+    Date,
+    /// A signed time representing the elapsed time since midnight in the unit 
of `TimeUnit`.
+    Time(TimeUnit),
+    /// Measure of elapsed time in either seconds, milliseconds, microseconds 
or nanoseconds.
+    Duration(TimeUnit),
+    /// A "calendar" interval which models types that don't necessarily
+    /// have a precise duration without the context of a base timestamp (e.g.
+    /// days can differ in length during day light savings time transitions).
+    Interval(IntervalUnit),
+    /// Opaque binary data of variable length.
+    Binary,
+    /// Opaque binary data of fixed size.
+    /// Enum parameter specifies the number of bytes per value.
+    FixedSizeBinary(i32),
+    /// A variable-length string in Unicode with UTF-8 encoding.
+    String,
+    /// A list of some logical data type with variable length.
+    List(LogicalFieldRef),
+    /// A list of some logical data type with fixed length.
+    FixedSizeList(LogicalFieldRef, i32),
+    /// A nested type that contains a number of sub-fields.
+    Struct(LogicalFields),
+    /// A nested type that can represent slots of differing types.
+    Union(LogicalUnionFields),
+    /// Decimal value with precision and scale
+    ///
+    /// * precision is the total number of digits
+    /// * scale is the number of digits past the decimal
+    ///
+    /// For example the number 123.45 has precision 5 and scale 2.
+    ///
+    /// In certain situations, scale could be negative number. For
+    /// negative scale, it is the number of padding 0 to the right
+    /// of the digits.
+    ///
+    /// For example the number 12300 could be treated as a decimal
+    /// has precision 3 and scale -2.
+    Decimal(u8, i8),
+    /// A Map is a type that represents an association between a key and a value.
+    ///
+    /// The key and value types are not constrained, but keys should be
+    /// hashable and unique.
+    ///
+    /// In a field with Map type, the field has a child Struct field with two children:
+    /// the key type first and the value type second. The names of the child fields may be
+    /// respectively "entries", "key", and "value", but this is not enforced.
+    Map(LogicalFieldRef),
+}
+
+impl LogicalType for NativeType {
+    fn native(&self) -> &NativeType {
+        self
+    }
+
+    fn signature(&self) -> TypeSignature<'_> {
+        TypeSignature::Native(self)
+    }
+
+    fn default_cast_for(&self, origin: &DataType) -> Result<DataType> {
+        use DataType::*;
+
+        fn default_field_cast(to: &LogicalField, from: &Field) -> 
Result<FieldRef> {
+            Ok(Arc::new(Field::new(
+                to.name.clone(),
+                to.logical_type.default_cast_for(from.data_type())?,
+                to.nullable,
+            )))
+        }
+
+        Ok(match (self, origin) {
+            (Self::Null, _) => Null,
+            (Self::Boolean, _) => Boolean,
+            (Self::Int8, _) => Int8,
+            (Self::Int16, _) => Int16,
+            (Self::Int32, _) => Int32,
+            (Self::Int64, _) => Int64,
+            (Self::UInt8, _) => UInt8,
+            (Self::UInt16, _) => UInt16,
+            (Self::UInt32, _) => UInt32,
+            (Self::UInt64, _) => UInt64,
+            (Self::Float16, _) => Float16,
+            (Self::Float32, _) => Float32,
+            (Self::Float64, _) => Float64,
+            (Self::Decimal(p, s), _) if p <= &38 => Decimal128(*p, *s),
+            (Self::Decimal(p, s), _) => Decimal256(*p, *s),
+            (Self::Timestamp(tu, tz), _) => Timestamp(*tu, tz.clone()),
+            (Self::Date, _) => Date32,
+            (Self::Time(tu), _) => match tu {
+                TimeUnit::Second | TimeUnit::Millisecond => Time32(*tu),
+                TimeUnit::Microsecond | TimeUnit::Nanosecond => Time64(*tu),
+            },
+            (Self::Duration(tu), _) => Duration(*tu),
+            (Self::Interval(iu), _) => Interval(*iu),
+            (Self::Binary, LargeUtf8) => LargeBinary,
+            (Self::Binary, Utf8View) => BinaryView,
+            (Self::Binary, data_type) if can_cast_types(data_type, 
&BinaryView) => {
+                BinaryView
+            }
+            (Self::Binary, data_type) if can_cast_types(data_type, 
&LargeBinary) => {
+                LargeBinary
+            }
+            (Self::Binary, data_type) if can_cast_types(data_type, &Binary) => 
Binary,
+            (Self::FixedSizeBinary(size), _) => FixedSizeBinary(*size),
+            (Self::String, LargeBinary) => LargeUtf8,
+            (Self::String, BinaryView) => Utf8View,
+            (Self::String, data_type) if can_cast_types(data_type, &Utf8View) 
=> Utf8View,
+            (Self::String, data_type) if can_cast_types(data_type, &LargeUtf8) 
=> {
+                LargeUtf8
+            }
+            (Self::String, data_type) if can_cast_types(data_type, &Utf8) => 
Utf8,
+            (Self::List(to_field), List(from_field) | 
FixedSizeList(from_field, _)) => {
+                List(default_field_cast(to_field, from_field)?)
+            }
+            (Self::List(to_field), LargeList(from_field)) => {
+                LargeList(default_field_cast(to_field, from_field)?)
+            }
+            (Self::List(to_field), ListView(from_field)) => {
+                ListView(default_field_cast(to_field, from_field)?)
+            }
+            (Self::List(to_field), LargeListView(from_field)) => {
+                LargeListView(default_field_cast(to_field, from_field)?)
+            }
+            // List array where each element is a len 1 list of the origin type
+            (Self::List(field), _) => List(Arc::new(Field::new(
+                field.name.clone(),
+                field.logical_type.default_cast_for(origin)?,
+                field.nullable,
+            ))),
+            (
+                Self::FixedSizeList(to_field, to_size),
+                FixedSizeList(from_field, from_size),
+            ) if from_size == to_size => {
+                FixedSizeList(default_field_cast(to_field, from_field)?, 
*to_size)
+            }
+            (
+                Self::FixedSizeList(to_field, size),
+                List(from_field)
+                | LargeList(from_field)
+                | ListView(from_field)
+                | LargeListView(from_field),
+            ) => FixedSizeList(default_field_cast(to_field, from_field)?, 
*size),
+            // FixedSizeList array where each element is a len 1 list of the 
origin type
+            (Self::FixedSizeList(field, size), _) => FixedSizeList(
+                Arc::new(Field::new(
+                    field.name.clone(),
+                    field.logical_type.default_cast_for(origin)?,
+                    field.nullable,
+                )),
+                *size,
+            ),
+            // From 
https://github.com/apache/arrow-rs/blob/56525efbd5f37b89d1b56aa51709cab9f81bc89e/arrow-cast/src/cast/mod.rs#L189-L196
+            (Self::Struct(to_fields), Struct(from_fields))
+                if from_fields.len() == to_fields.len() =>
+            {
+                Struct(
+                    from_fields
+                        .iter()
+                        .zip(to_fields.iter())
+                        .map(|(from, to)| default_field_cast(to, from))
+                        .collect::<Result<Fields>>()?,
+                )
+            }
+            (Self::Struct(to_fields), Null) => Struct(
+                to_fields
+                    .iter()
+                    .map(|field| {
+                        Ok(Arc::new(Field::new(
+                            field.name.clone(),
+                            field.logical_type.default_cast_for(&Null)?,
+                            field.nullable,
+                        )))
+                    })
+                    .collect::<Result<Fields>>()?,
+            ),
+            (Self::Map(to_field), Map(from_field, sorted)) => {
+                Map(default_field_cast(to_field, from_field)?, *sorted)
+            }
+            (Self::Map(field), Null) => Map(
+                Arc::new(Field::new(
+                    field.name.clone(),
+                    field.logical_type.default_cast_for(&Null)?,
+                    field.nullable,
+                )),
+                false,
+            ),
+            (Self::Union(to_fields), Union(from_fields, mode))
+                if from_fields.len() == to_fields.len() =>
+            {
+                Union(
+                    from_fields
+                        .iter()
+                        .zip(to_fields.iter())
+                        .map(|((_, from), (i, to))| {
+                            Ok((*i, default_field_cast(to, from)?))
+                        })
+                        .collect::<Result<UnionFields>>()?,
+                    *mode,
+                )
+            }
+            _ => {
+                return _internal_err!(
+                "Unavailable default cast for native type {:?} from physical 
type {:?}",
+                self,
+                origin
+            )
+            }
+        })
+    }
+}
+
+// The following From<DataType>, From<Field>, ... implementations are temporary
+// mapping solutions to provide backwards compatibility while transitioning 
from
+// the purely physical system to a logical / physical system.
+
+impl From<DataType> for NativeType {
+    fn from(value: DataType) -> Self {
+        use NativeType::*;
+        match value {
+            DataType::Null => Null,
+            DataType::Boolean => Boolean,
+            DataType::Int8 => Int8,
+            DataType::Int16 => Int16,
+            DataType::Int32 => Int32,
+            DataType::Int64 => Int64,
+            DataType::UInt8 => UInt8,
+            DataType::UInt16 => UInt16,
+            DataType::UInt32 => UInt32,
+            DataType::UInt64 => UInt64,
+            DataType::Float16 => Float16,
+            DataType::Float32 => Float32,
+            DataType::Float64 => Float64,
+            DataType::Timestamp(tu, tz) => Timestamp(tu, tz),
+            DataType::Date32 | DataType::Date64 => Date,
+            DataType::Time32(tu) | DataType::Time64(tu) => Time(tu),
+            DataType::Duration(tu) => Duration(tu),
+            DataType::Interval(iu) => Interval(iu),
+            DataType::Binary | DataType::LargeBinary | DataType::BinaryView => 
Binary,
+            DataType::FixedSizeBinary(size) => FixedSizeBinary(size),
+            DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View => 
String,
+            DataType::List(field)
+            | DataType::ListView(field)
+            | DataType::LargeList(field)
+            | DataType::LargeListView(field) => 
List(Arc::new(field.as_ref().into())),
+            DataType::FixedSizeList(field, size) => {
+                FixedSizeList(Arc::new(field.as_ref().into()), size)
+            }
+            DataType::Struct(fields) => Struct(LogicalFields::from(&fields)),
+            DataType::Union(union_fields, _) => {
+                Union(LogicalUnionFields::from(&union_fields))
+            }
+            DataType::Decimal128(p, s) | DataType::Decimal256(p, s) => 
Decimal(p, s),
+            DataType::Map(field, _) => Map(Arc::new(field.as_ref().into())),
+            DataType::Dictionary(_, data_type) => 
data_type.as_ref().clone().into(),
+            DataType::RunEndEncoded(_, field) => 
field.data_type().clone().into(),
+        }
+    }
+}
+
+impl From<&DataType> for NativeType {
+    fn from(value: &DataType) -> Self {
+        value.clone().into()
+    }
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to