martin-g commented on code in PR #21737:
URL: https://github.com/apache/datafusion/pull/21737#discussion_r3108825521
##########
datafusion/catalog/src/information_schema.rs:
##########
@@ -411,6 +412,127 @@ impl InformationSchemaConfig {
}
}
+/// Resolve a native type `NativeType` to `DataType` for use in the
information schema
+/// Since it is one-to-many, use the most representative type on tie
+fn get_data_type_for_schema(native_type: &NativeType) -> Option<DataType> {
+ match native_type {
+ NativeType::Null => Some(DataType::Null),
+ NativeType::Boolean => Some(DataType::Boolean),
+ NativeType::Int8 => Some(DataType::Int8),
+ NativeType::Int16 => Some(DataType::Int16),
+ NativeType::Int32 => Some(DataType::Int32),
+ NativeType::Int64 => Some(DataType::Int64),
+ NativeType::UInt8 => Some(DataType::UInt8),
+ NativeType::UInt16 => Some(DataType::UInt16),
+ NativeType::UInt32 => Some(DataType::UInt32),
+ NativeType::UInt64 => Some(DataType::UInt64),
+ NativeType::Float16 => Some(DataType::Float16),
+ NativeType::Float32 => Some(DataType::Float32),
+ NativeType::Float64 => Some(DataType::Float64),
+ NativeType::Date => Some(DataType::Date32), // A tie
+ NativeType::Binary => Some(DataType::Binary), // A tie
+ NativeType::String => Some(DataType::Utf8), // A tie
+ NativeType::Decimal(precision, scale) => {
+ Some(DataType::Decimal256(*precision, *scale)) // A tie, use the
widest type
+ }
+ NativeType::Timestamp(time_unit, timezone) => {
+ Some(DataType::Timestamp(*time_unit, timezone.to_owned()))
+ }
+ NativeType::Time(TimeUnit::Second) =>
Some(DataType::Time32(TimeUnit::Second)),
+ NativeType::Time(TimeUnit::Millisecond) => {
+ Some(DataType::Time32(TimeUnit::Millisecond))
+ }
+ NativeType::Time(TimeUnit::Microsecond) => {
+ Some(DataType::Time64(TimeUnit::Microsecond))
+ }
+ NativeType::Time(TimeUnit::Nanosecond) => {
+ Some(DataType::Time64(TimeUnit::Nanosecond))
+ }
+ NativeType::Duration(time_unit) =>
Some(DataType::Duration(*time_unit)),
+ NativeType::Interval(interval_unit) =>
Some(DataType::Interval(*interval_unit)),
+ NativeType::FixedSizeBinary(size) =>
Some(DataType::FixedSizeBinary(*size)),
+ NativeType::FixedSizeList(logical_field, size) =>
get_data_type_for_schema(
+ logical_field.logical_type.native(),
+ )
+ .map(|child_dt| {
+ DataType::FixedSizeList(
+ Arc::new(Field::new(
+ logical_field.name.clone(),
+ child_dt,
+ logical_field.nullable,
+ )),
+ *size,
+ )
+ }),
+ NativeType::List(logical_field) => get_data_type_for_schema(
+ logical_field.logical_type.native(),
+ )
+ .map(|child_dt| {
+ // A tie, use List
+ DataType::List(Arc::new(Field::new(
+ logical_field.name.clone(),
+ child_dt,
+ logical_field.nullable,
+ )))
+ }),
+ NativeType::Struct(logical_fields) => {
+ let fields = logical_fields
+ .iter()
+ .map(|logical_field| {
+ let dt =
+
get_data_type_for_schema(logical_field.logical_type.native())?;
+ Some(Arc::new(Field::new(
+ logical_field.name.clone(),
+ dt,
+ logical_field.nullable,
+ )))
+ })
+ .collect::<Option<Fields>>()?;
+ Some(DataType::Struct(fields))
+ }
+ NativeType::Union(logical_fields) => {
+ let ids = logical_fields.iter().map(|(i, _)|
*i).collect::<Vec<i8>>();
+ let fields: Vec<FieldRef> = logical_fields
+ .iter()
+ .map(|(_, logical_field)| {
+ let dt =
+
get_data_type_for_schema(logical_field.logical_type.native())?;
+ Some(Arc::new(Field::new(
+ logical_field.name.clone(),
+ dt,
+ logical_field.nullable,
Review Comment:
Should the metadata be preserved as well ?
##########
datafusion/expr-common/src/signature.rs:
##########
@@ -595,18 +589,25 @@ impl Display for ArrayFunctionArgument {
}
}
-static NUMERICS: &[DataType] = &[
- DataType::Int8,
- DataType::Int16,
- DataType::Int32,
- DataType::Int64,
- DataType::UInt8,
- DataType::UInt16,
- DataType::UInt32,
- DataType::UInt64,
- DataType::Float16,
- DataType::Float32,
- DataType::Float64,
+/// Constant that is used as a Decimal type for `get_example_types`
+/// Use Decimal256 precision as a reasonable default
+const NATIVE_TYPE_DECIMAL: NativeType =
+ NativeType::Decimal(DECIMAL256_MAX_PRECISION, DECIMAL256_MAX_SCALE);
Review Comment:
precision == scale == 76 => no room for integer digits
```suggestion
NativeType::Decimal(DECIMAL256_MAX_PRECISION, DECIMAL_DEFAULT_SCALE);
```
##########
datafusion/expr-common/src/signature.rs:
##########
@@ -881,36 +882,41 @@ impl TypeSignature {
}
#[deprecated(since = "46.0.0", note = "See get_example_types instead")]
- pub fn get_possible_types(&self) -> Vec<Vec<DataType>> {
+ pub fn get_possible_types(&self) -> Vec<Vec<NativeType>> {
Review Comment:
Hm. A public API break for a deprecated method ... Maybe it is time to just
remove it ?!
##########
datafusion/catalog/src/information_schema.rs:
##########
@@ -411,6 +412,127 @@ impl InformationSchemaConfig {
}
}
+/// Resolve a native type `NativeType` to `DataType` for use in the
information schema
+/// Since it is one-to-many, use the most representative type on tie
+fn get_data_type_for_schema(native_type: &NativeType) -> Option<DataType> {
+ match native_type {
+ NativeType::Null => Some(DataType::Null),
+ NativeType::Boolean => Some(DataType::Boolean),
+ NativeType::Int8 => Some(DataType::Int8),
+ NativeType::Int16 => Some(DataType::Int16),
+ NativeType::Int32 => Some(DataType::Int32),
+ NativeType::Int64 => Some(DataType::Int64),
+ NativeType::UInt8 => Some(DataType::UInt8),
+ NativeType::UInt16 => Some(DataType::UInt16),
+ NativeType::UInt32 => Some(DataType::UInt32),
+ NativeType::UInt64 => Some(DataType::UInt64),
+ NativeType::Float16 => Some(DataType::Float16),
+ NativeType::Float32 => Some(DataType::Float32),
+ NativeType::Float64 => Some(DataType::Float64),
+ NativeType::Date => Some(DataType::Date32), // A tie
+ NativeType::Binary => Some(DataType::Binary), // A tie
+ NativeType::String => Some(DataType::Utf8), // A tie
+ NativeType::Decimal(precision, scale) => {
+ Some(DataType::Decimal256(*precision, *scale)) // A tie, use the
widest type
+ }
+ NativeType::Timestamp(time_unit, timezone) => {
+ Some(DataType::Timestamp(*time_unit, timezone.to_owned()))
+ }
+ NativeType::Time(TimeUnit::Second) =>
Some(DataType::Time32(TimeUnit::Second)),
+ NativeType::Time(TimeUnit::Millisecond) => {
+ Some(DataType::Time32(TimeUnit::Millisecond))
+ }
+ NativeType::Time(TimeUnit::Microsecond) => {
+ Some(DataType::Time64(TimeUnit::Microsecond))
+ }
+ NativeType::Time(TimeUnit::Nanosecond) => {
+ Some(DataType::Time64(TimeUnit::Nanosecond))
+ }
+ NativeType::Duration(time_unit) =>
Some(DataType::Duration(*time_unit)),
+ NativeType::Interval(interval_unit) =>
Some(DataType::Interval(*interval_unit)),
+ NativeType::FixedSizeBinary(size) =>
Some(DataType::FixedSizeBinary(*size)),
+ NativeType::FixedSizeList(logical_field, size) =>
get_data_type_for_schema(
+ logical_field.logical_type.native(),
+ )
+ .map(|child_dt| {
+ DataType::FixedSizeList(
+ Arc::new(Field::new(
+ logical_field.name.clone(),
+ child_dt,
+ logical_field.nullable,
+ )),
+ *size,
+ )
+ }),
+ NativeType::List(logical_field) => get_data_type_for_schema(
+ logical_field.logical_type.native(),
+ )
+ .map(|child_dt| {
+ // A tie, use List
+ DataType::List(Arc::new(Field::new(
+ logical_field.name.clone(),
+ child_dt,
+ logical_field.nullable,
+ )))
+ }),
+ NativeType::Struct(logical_fields) => {
+ let fields = logical_fields
+ .iter()
+ .map(|logical_field| {
+ let dt =
+
get_data_type_for_schema(logical_field.logical_type.native())?;
+ Some(Arc::new(Field::new(
+ logical_field.name.clone(),
+ dt,
+ logical_field.nullable,
+ )))
+ })
+ .collect::<Option<Fields>>()?;
+ Some(DataType::Struct(fields))
+ }
+ NativeType::Union(logical_fields) => {
+ let ids = logical_fields.iter().map(|(i, _)|
*i).collect::<Vec<i8>>();
+ let fields: Vec<FieldRef> = logical_fields
+ .iter()
+ .map(|(_, logical_field)| {
+ let dt =
+
get_data_type_for_schema(logical_field.logical_type.native())?;
+ Some(Arc::new(Field::new(
+ logical_field.name.clone(),
+ dt,
+ logical_field.nullable,
+ )))
+ })
+ .collect::<Option<Vec<FieldRef>>>()?;
+ Some(DataType::Union(
+ UnionFields::try_new(ids, fields).ok()?,
+ UnionMode::Dense,
+ ))
+ }
+ NativeType::Map(logical_field) => get_data_type_for_schema(
+ logical_field.logical_type.native(),
+ )
+ .map(|child_dt| {
+ DataType::Map(
+ Arc::new(Field::new(
+ logical_field.name.clone(),
+ child_dt,
+ logical_field.nullable,
+ )),
+ true,
+ )
+ }),
+ }
+}
+
+pub fn resolve_informational_field(idx: usize, t: &NativeType) ->
Result<FieldRef> {
Review Comment:
```suggestion
fn resolve_informational_field(idx: usize, t: &NativeType) ->
Result<FieldRef> {
```
##########
datafusion/catalog/src/information_schema.rs:
##########
@@ -411,6 +412,127 @@ impl InformationSchemaConfig {
}
}
+/// Resolve a native type `NativeType` to `DataType` for use in the
information schema
+/// Since it is one-to-many, use the most representative type on tie
+fn get_data_type_for_schema(native_type: &NativeType) -> Option<DataType> {
+ match native_type {
+ NativeType::Null => Some(DataType::Null),
+ NativeType::Boolean => Some(DataType::Boolean),
+ NativeType::Int8 => Some(DataType::Int8),
+ NativeType::Int16 => Some(DataType::Int16),
+ NativeType::Int32 => Some(DataType::Int32),
+ NativeType::Int64 => Some(DataType::Int64),
+ NativeType::UInt8 => Some(DataType::UInt8),
+ NativeType::UInt16 => Some(DataType::UInt16),
+ NativeType::UInt32 => Some(DataType::UInt32),
+ NativeType::UInt64 => Some(DataType::UInt64),
+ NativeType::Float16 => Some(DataType::Float16),
+ NativeType::Float32 => Some(DataType::Float32),
+ NativeType::Float64 => Some(DataType::Float64),
+ NativeType::Date => Some(DataType::Date32), // A tie
+ NativeType::Binary => Some(DataType::Binary), // A tie
+ NativeType::String => Some(DataType::Utf8), // A tie
+ NativeType::Decimal(precision, scale) => {
+ Some(DataType::Decimal256(*precision, *scale)) // A tie, use the
widest type
+ }
+ NativeType::Timestamp(time_unit, timezone) => {
+ Some(DataType::Timestamp(*time_unit, timezone.to_owned()))
+ }
+ NativeType::Time(TimeUnit::Second) =>
Some(DataType::Time32(TimeUnit::Second)),
+ NativeType::Time(TimeUnit::Millisecond) => {
+ Some(DataType::Time32(TimeUnit::Millisecond))
+ }
+ NativeType::Time(TimeUnit::Microsecond) => {
+ Some(DataType::Time64(TimeUnit::Microsecond))
+ }
+ NativeType::Time(TimeUnit::Nanosecond) => {
+ Some(DataType::Time64(TimeUnit::Nanosecond))
+ }
+ NativeType::Duration(time_unit) =>
Some(DataType::Duration(*time_unit)),
+ NativeType::Interval(interval_unit) =>
Some(DataType::Interval(*interval_unit)),
+ NativeType::FixedSizeBinary(size) =>
Some(DataType::FixedSizeBinary(*size)),
+ NativeType::FixedSizeList(logical_field, size) =>
get_data_type_for_schema(
+ logical_field.logical_type.native(),
+ )
+ .map(|child_dt| {
+ DataType::FixedSizeList(
+ Arc::new(Field::new(
+ logical_field.name.clone(),
+ child_dt,
+ logical_field.nullable,
+ )),
+ *size,
+ )
+ }),
+ NativeType::List(logical_field) => get_data_type_for_schema(
+ logical_field.logical_type.native(),
+ )
+ .map(|child_dt| {
+ // A tie, use List
+ DataType::List(Arc::new(Field::new(
+ logical_field.name.clone(),
+ child_dt,
+ logical_field.nullable,
+ )))
+ }),
+ NativeType::Struct(logical_fields) => {
+ let fields = logical_fields
+ .iter()
+ .map(|logical_field| {
+ let dt =
+
get_data_type_for_schema(logical_field.logical_type.native())?;
+ Some(Arc::new(Field::new(
+ logical_field.name.clone(),
+ dt,
+ logical_field.nullable,
+ )))
+ })
+ .collect::<Option<Fields>>()?;
+ Some(DataType::Struct(fields))
+ }
+ NativeType::Union(logical_fields) => {
+ let ids = logical_fields.iter().map(|(i, _)|
*i).collect::<Vec<i8>>();
+ let fields: Vec<FieldRef> = logical_fields
+ .iter()
+ .map(|(_, logical_field)| {
+ let dt =
+
get_data_type_for_schema(logical_field.logical_type.native())?;
+ Some(Arc::new(Field::new(
+ logical_field.name.clone(),
+ dt,
+ logical_field.nullable,
+ )))
+ })
+ .collect::<Option<Vec<FieldRef>>>()?;
+ Some(DataType::Union(
+ UnionFields::try_new(ids, fields).ok()?,
+ UnionMode::Dense,
+ ))
+ }
+ NativeType::Map(logical_field) => get_data_type_for_schema(
+ logical_field.logical_type.native(),
+ )
+ .map(|child_dt| {
+ DataType::Map(
+ Arc::new(Field::new(
+ logical_field.name.clone(),
+ child_dt,
+ logical_field.nullable,
+ )),
+ true,
+ )
+ }),
+ }
+}
+
+pub fn resolve_informational_field(idx: usize, t: &NativeType) ->
Result<FieldRef> {
+ if let Some(data_type) = get_data_type_for_schema(t) {
Review Comment:
It would be better `get_data_type_for_schema()` to return a `Result` and
return an Err instead of None when some type is not resolved.
Currently it will use `t` for the error but for nested types (like Struct,
List, Union, ...) the problem might be in a child.
##########
datafusion/expr-common/src/signature.rs:
##########
@@ -595,18 +589,25 @@ impl Display for ArrayFunctionArgument {
}
}
-static NUMERICS: &[DataType] = &[
- DataType::Int8,
- DataType::Int16,
- DataType::Int32,
- DataType::Int64,
- DataType::UInt8,
- DataType::UInt16,
- DataType::UInt32,
- DataType::UInt64,
- DataType::Float16,
- DataType::Float32,
- DataType::Float64,
+/// Constant that is used as a Decimal type for `get_example_types`
+/// Use Decimal256 precision as a reasonable default
Review Comment:
nit: Decimal128 looks more reasonable to me.
76-digit precision is used for special cases.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]