jorisvandenbossche commented on code in PR #36846:
URL: https://github.com/apache/arrow/pull/36846#discussion_r1303024203


##########
cpp/src/arrow/type.h:
##########
@@ -397,14 +397,67 @@ class ARROW_EXPORT Field : public detail::Fingerprintable,
   /// \brief Options that control the behavior of `MergeWith`.
   /// Options are to be added to allow type conversions, including integer
   /// widening, promotion from integer to float, or conversion to or from 
boolean.
-  struct MergeOptions {
+  struct ARROW_EXPORT MergeOptions : public 
util::ToStringOstreamable<MergeOptions> {
     /// If true, a Field of NullType can be unified with a Field of another 
type.
     /// The unified field will be of the other type and become nullable.
     /// Nullability will be promoted to the looser option (nullable if one is 
not
     /// nullable).
     bool promote_nullability = true;
 
+    /// Allow a decimal to be unified with another decimal of the same
+    /// width, adjusting scale and precision as appropriate. May fail
+    /// if the adjustment is not possible.
+    bool promote_decimal = false;
+
+    /// Allow a decimal to be promoted to a float. The float type will
+    /// not itself be promoted (e.g. Decimal128 + Float32 = Float32).
+    bool promote_decimal_to_float = false;
+
+    /// Allow an integer to be promoted to a decimal.
+    ///
+    /// May fail if the decimal has insufficient precision to
+    /// accommodate the integer. (See increase_decimal_precision.)
+    bool promote_integer_to_decimal = false;
+
+    /// Allow an integer of a given bit width to be promoted to a
+    /// float; the result will be a float of an equal or greater bit
+    /// width to both of the inputs.
+    bool promote_integer_to_float = false;
+
+    /// Allow an unsigned integer of a given bit width to be promoted
+    /// to a signed integer of the equal or greater bit width.
+    bool promote_integer_sign = false;

Review Comment:
   Potentially dubious case from reading the above: uint32 + int8 -> ? (I 
assume int64, but only if `promote_numeric_width` is also true, because 
otherwise the int8 needs to stay int8?)



##########
python/pyarrow/table.pxi:
##########
@@ -5042,9 +5043,14 @@ def concat_tables(tables, c_bool promote=False, 
MemoryPool memory_pool=None):
     tables : iterable of pyarrow.Table objects
         Pyarrow tables to concatenate into a single Table.
     promote : bool, default False
-        If True, concatenate tables with null-filling and null type promotion.
+        If True, concatenate tables with null-filling and type promotion.
+        See field_merge_options for the type promotion behavior.

Review Comment:
   From a user experience perspective, I find it a bit strange that I still 
have to specify `promote=True` when I already specified 
field_merge_options="permissive" (because the latter only makes sense with 
promote=True). Essentially the `promote=False` can be seen as a "no promotion" 
field merge option? 
   
   I know this is an existing keyword, but maybe going forward we should try to 
merge `promote` and `field_merge_options` in a single keyword? 
   
   (on the naming side, I think I actually like "promote_options" better than 
"field_merge_options", but that would then deviate from the C++ naming ..)



##########
cpp/src/arrow/type.cc:
##########
@@ -327,6 +315,449 @@ std::shared_ptr<Field> Field::WithNullable(const bool 
nullable) const {
   return std::make_shared<Field>(name_, type_, nullable, metadata_);
 }
 
+Field::MergeOptions Field::MergeOptions::Permissive() {
+  MergeOptions options = Defaults();
+  options.promote_nullability = true;
+  options.promote_decimal = true;
+  options.promote_decimal_to_float = true;
+  options.promote_integer_to_decimal = true;
+  options.promote_integer_to_float = true;
+  options.promote_integer_sign = true;
+  options.promote_numeric_width = true;
+  options.promote_binary = true;
+  options.promote_temporal_unit = true;
+  options.promote_list = true;
+  options.promote_dictionary = true;
+  options.promote_dictionary_ordered = false;
+  return options;
+}
+
+std::string Field::MergeOptions::ToString() const {
+  std::stringstream ss;
+  ss << "MergeOptions{";
+  ss << "promote_nullability=" << (promote_nullability ? "true" : "false");
+  ss << ", promote_decimal=" << (promote_decimal ? "true" : "false");
+  ss << ", promote_decimal_to_float=" << (promote_decimal_to_float ? "true" : 
"false");
+  ss << ", promote_integer_to_decimal="
+     << (promote_integer_to_decimal ? "true" : "false");
+  ss << ", promote_integer_to_float=" << (promote_integer_to_float ? "true" : 
"false");
+  ss << ", promote_integer_sign=" << (promote_integer_sign ? "true" : "false");
+  ss << ", promote_numeric_width=" << (promote_numeric_width ? "true" : 
"false");
+  ss << ", promote_binary=" << (promote_binary ? "true" : "false");
+  ss << ", promote_temporal_unit=" << (promote_temporal_unit ? "true" : 
"false");
+  ss << ", promote_list=" << (promote_list ? "true" : "false");
+  ss << ", promote_dictionary=" << (promote_dictionary ? "true" : "false");
+  ss << ", promote_dictionary_ordered="
+     << (promote_dictionary_ordered ? "true" : "false");
+  ss << '}';
+  return ss.str();
+}
+
+namespace {
+// Utilities for Field::MergeWith
+
+std::shared_ptr<DataType> MakeBinary(const DataType& type) {
+  switch (type.id()) {
+    case Type::BINARY:
+    case Type::STRING:
+      return binary();
+    case Type::LARGE_BINARY:
+    case Type::LARGE_STRING:
+      return large_binary();
+    default:
+      Unreachable("Hit an unknown type");
+  }
+  return nullptr;
+}
+
+Result<std::shared_ptr<DataType>> WidenDecimals(
+    const std::shared_ptr<DataType>& promoted_type,
+    const std::shared_ptr<DataType>& other_type, const Field::MergeOptions& 
options) {
+  const auto& left = checked_cast<const DecimalType&>(*promoted_type);
+  const auto& right = checked_cast<const DecimalType&>(*other_type);
+  if (!options.promote_numeric_width && left.bit_width() != right.bit_width()) 
{
+    return Status::TypeError(
+        "Cannot promote decimal128 to decimal256 without 
promote_numeric_width=true");
+  }
+  const int32_t max_scale = std::max<int32_t>(left.scale(), right.scale());
+  const int32_t common_precision =
+      std::max<int32_t>(left.precision() + max_scale - left.scale(),
+                        right.precision() + max_scale - right.scale());
+  if (left.id() == Type::DECIMAL256 || right.id() == Type::DECIMAL256 ||
+      common_precision > BasicDecimal128::kMaxPrecision) {
+    return DecimalType::Make(Type::DECIMAL256, common_precision, max_scale);
+  }
+  return DecimalType::Make(Type::DECIMAL128, common_precision, max_scale);
+}
+
+Result<std::shared_ptr<DataType>> MergeTypes(std::shared_ptr<DataType> 
promoted_type,
+                                             std::shared_ptr<DataType> 
other_type,
+                                             const Field::MergeOptions& 
options);
+
+// Merge temporal types based on options. Returns nullptr for non-temporal 
types.
+Result<std::shared_ptr<DataType>> MaybeMergeTemporalTypes(
+    const std::shared_ptr<DataType>& promoted_type,
+    const std::shared_ptr<DataType>& other_type, const Field::MergeOptions& 
options) {
+  if (options.promote_temporal_unit) {
+    if (promoted_type->id() == Type::DATE32 && other_type->id() == 
Type::DATE64) {
+      return date64();
+    }
+    if (promoted_type->id() == Type::DATE64 && other_type->id() == 
Type::DATE32) {
+      return date64();
+    }
+
+    if (promoted_type->id() == Type::DURATION && other_type->id() == 
Type::DURATION) {
+      const auto& left = checked_cast<const DurationType&>(*promoted_type);
+      const auto& right = checked_cast<const DurationType&>(*other_type);
+      return duration(std::max(left.unit(), right.unit()));
+    }
+
+    if (is_time(promoted_type->id()) && is_time(other_type->id())) {
+      const auto& left = checked_cast<const TimeType&>(*promoted_type);
+      const auto& right = checked_cast<const TimeType&>(*other_type);
+      const auto unit = std::max(left.unit(), right.unit());
+      if (unit == TimeUnit::MICRO || unit == TimeUnit::NANO) {
+        return time64(unit);
+      }
+      return time32(unit);
+    }
+  }
+
+  if (promoted_type->id() == Type::TIMESTAMP && other_type->id() == 
Type::TIMESTAMP) {
+    const auto& left = checked_cast<const TimestampType&>(*promoted_type);
+    const auto& right = checked_cast<const TimestampType&>(*other_type);
+    if (left.timezone().empty() ^ right.timezone().empty()) {
+      return Status::TypeError(
+          "Cannot merge timestamp with timezone and timestamp without 
timezone");
+    }
+    if (left.timezone() != right.timezone()) {
+      return Status::TypeError("Cannot merge timestamps with differing 
timezones");
+    }
+    if (options.promote_temporal_unit) {
+      return timestamp(std::max(left.unit(), right.unit()), left.timezone());
+    }
+  }
+
+  return nullptr;
+}
+
+// Merge numeric types based on options. Returns nullptr for non-numeric types.
+Result<std::shared_ptr<DataType>> MaybeMergeNumericTypes(
+    std::shared_ptr<DataType> promoted_type, std::shared_ptr<DataType> 
other_type,
+    const Field::MergeOptions& options) {
+  bool promoted = false;
+  if (options.promote_decimal_to_float) {
+    if (is_decimal(promoted_type->id()) && is_floating(other_type->id())) {
+      promoted_type = other_type;
+      promoted = true;
+    } else if (is_floating(promoted_type->id()) && 
is_decimal(other_type->id())) {
+      other_type = promoted_type;
+      promoted = true;
+    }
+  }
+
+  if (options.promote_integer_to_decimal &&
+      ((is_decimal(promoted_type->id()) && is_integer(other_type->id())) ||
+       (is_decimal(other_type->id()) && is_integer(promoted_type->id())))) {
+    if (is_integer(promoted_type->id()) && is_decimal(other_type->id())) {
+      // Other type is always the int
+      promoted_type.swap(other_type);
+    }
+    ARROW_ASSIGN_OR_RAISE(const int32_t precision,
+                          MaxDecimalDigitsForInteger(other_type->id()));
+    ARROW_ASSIGN_OR_RAISE(const auto promoted_decimal,
+                          DecimalType::Make(promoted_type->id(), precision, 
0));
+    ARROW_ASSIGN_OR_RAISE(promoted_type,
+                          WidenDecimals(promoted_type, promoted_decimal, 
options));
+    return promoted_type;
+  }
+
+  if (options.promote_decimal && is_decimal(promoted_type->id()) &&
+      is_decimal(other_type->id())) {
+    ARROW_ASSIGN_OR_RAISE(promoted_type,
+                          WidenDecimals(promoted_type, other_type, options));
+    return promoted_type;
+  }
+
+  if (options.promote_integer_sign && 
((is_unsigned_integer(promoted_type->id()) &&
+                                        is_signed_integer(other_type->id())) ||
+                                       (is_signed_integer(promoted_type->id()) 
&&
+                                        
is_unsigned_integer(other_type->id())))) {
+    if (!options.promote_numeric_width &&
+        bit_width(promoted_type->id()) != bit_width(other_type->id())) {
+      return Status::TypeError(
+          "Cannot widen (un)signed integers without 
promote_numeric_width=true");
+    }
+    const int max_width =
+        std::max<int>(bit_width(promoted_type->id()), 
bit_width(other_type->id()));
+
+    if (max_width >= 64) {
+      promoted_type = int64();
+    } else if (max_width >= 32) {
+      promoted_type = int32();

Review Comment:
   This means that `uint32` gets promoted to `int32`? But should that be 
`int64` instead?



##########
cpp/src/arrow/table.cc:
##########
@@ -502,9 +515,23 @@ Result<std::shared_ptr<Table>> PromoteTableToSchema(const 
std::shared_ptr<Table>
       continue;
     }
 
+#ifdef ARROW_COMPUTE
+    if (!compute::CanCast(*current_field->type(), *field->type())) {
+      return Status::TypeError("Unable to promote field ", field->name(),
+                               ": incompatible types: ", 
field->type()->ToString(),
+                               " vs ", current_field->type()->ToString());
+    }
+    compute::ExecContext ctx(pool);
+    ARROW_ASSIGN_OR_RAISE(auto casted, 
compute::Cast(table->column(field_index),
+                                                     field->type(), *options, 
&ctx));
+    columns.push_back(casted.chunked_array());
+#else
     return Status::Invalid("Unable to promote field ", field->name(),
                            ": incompatible types: ", 
field->type()->ToString(), " vs ",

Review Comment:
   But related question: I thought the core cast kernels were always available, 
even if ARROW_COMPUTE wasn't enabled? 
(https://github.com/apache/arrow/pull/34295)



##########
cpp/src/arrow/type.h:
##########
@@ -397,14 +397,67 @@ class ARROW_EXPORT Field : public detail::Fingerprintable,
   /// \brief Options that control the behavior of `MergeWith`.
   /// Options are to be added to allow type conversions, including integer
   /// widening, promotion from integer to float, or conversion to or from 
boolean.
-  struct MergeOptions {
+  struct ARROW_EXPORT MergeOptions : public 
util::ToStringOstreamable<MergeOptions> {
     /// If true, a Field of NullType can be unified with a Field of another 
type.
     /// The unified field will be of the other type and become nullable.
     /// Nullability will be promoted to the looser option (nullable if one is 
not
     /// nullable).
     bool promote_nullability = true;
 
+    /// Allow a decimal to be unified with another decimal of the same
+    /// width, adjusting scale and precision as appropriate. May fail
+    /// if the adjustment is not possible.
+    bool promote_decimal = false;

Review Comment:
   So this only supports combining decimals of the same width, and thus for 
example not decimal128 and decimal256? 
   
   That sounds a bit strange to me, as we do support casting from decimal128 to 
decimal256.



##########
cpp/src/arrow/type.h:
##########
@@ -397,14 +397,67 @@ class ARROW_EXPORT Field : public detail::Fingerprintable,
   /// \brief Options that control the behavior of `MergeWith`.
   /// Options are to be added to allow type conversions, including integer
   /// widening, promotion from integer to float, or conversion to or from 
boolean.
-  struct MergeOptions {
+  struct ARROW_EXPORT MergeOptions : public 
util::ToStringOstreamable<MergeOptions> {
     /// If true, a Field of NullType can be unified with a Field of another 
type.
     /// The unified field will be of the other type and become nullable.
     /// Nullability will be promoted to the looser option (nullable if one is 
not
     /// nullable).
     bool promote_nullability = true;
 
+    /// Allow a decimal to be unified with another decimal of the same
+    /// width, adjusting scale and precision as appropriate. May fail
+    /// if the adjustment is not possible.
+    bool promote_decimal = false;
+
+    /// Allow a decimal to be promoted to a float. The float type will
+    /// not itself be promoted (e.g. Decimal128 + Float32 = Float32).
+    bool promote_decimal_to_float = false;
+
+    /// Allow an integer to be promoted to a decimal.
+    ///
+    /// May fail if the decimal has insufficient precision to
+    /// accommodate the integer. (See increase_decimal_precision.)
+    bool promote_integer_to_decimal = false;
+
+    /// Allow an integer of a given bit width to be promoted to a
+    /// float; the result will be a float of an equal or greater bit
+    /// width to both of the inputs.
+    bool promote_integer_to_float = false;

Review Comment:
   In addition, I think ideally we also document somewhere _exactly_ how our 
numeric type promotions work (the comment above says "equal or greater" for the 
resulting bitwidth, but what's the exact logic for deciding this?)



##########
cpp/src/arrow/table.cc:
##########
@@ -502,9 +515,23 @@ Result<std::shared_ptr<Table>> PromoteTableToSchema(const 
std::shared_ptr<Table>
       continue;
     }
 
+#ifdef ARROW_COMPUTE
+    if (!compute::CanCast(*current_field->type(), *field->type())) {
+      return Status::TypeError("Unable to promote field ", field->name(),
+                               ": incompatible types: ", 
field->type()->ToString(),
+                               " vs ", current_field->type()->ToString());
+    }
+    compute::ExecContext ctx(pool);
+    ARROW_ASSIGN_OR_RAISE(auto casted, 
compute::Cast(table->column(field_index),
+                                                     field->type(), *options, 
&ctx));
+    columns.push_back(casted.chunked_array());
+#else
     return Status::Invalid("Unable to promote field ", field->name(),
                            ": incompatible types: ", 
field->type()->ToString(), " vs ",

Review Comment:
   The "incompatible types" here is now a potentially confusing term? (I would 
consider int32 and int64 as "compatible") Maybe just use "not equal" instead?



##########
python/pyarrow/types.pxi:
##########
@@ -2014,6 +2014,48 @@ cpdef KeyValueMetadata ensure_metadata(object meta, 
c_bool allow_none=False):
         return KeyValueMetadata(meta)
 
 
+cdef class FieldMergeOptions(_Weakrefable):
+    """
+    Options controlling how to merge the types of two fields.
+
+    By default, types must match exactly, except the null type can be
+    merged with any other type.
+
+    """
+
+    cdef:
+        CField.CMergeOptions c_options
+
+    __slots__ = ()
+
+    def __init__(self, *):
+        self.c_options = CField.CMergeOptions.Defaults()

Review Comment:
   Do we want to expose all fine grained options in Python? (now it's only the 
"permissive" and "default")
   
   While trying this out, I expected I could do something like 
`concat_tables(..., 
field_merge_options=pa.FieldMergeOptions(promote_integer_to_decimal=True)`
   
   If we only allow the two main options, I am not sure it is worth actually 
exposing this class publicly. 
   
   (exposing the fine grained options in python can also be a follow-up)



##########
cpp/src/arrow/type.h:
##########
@@ -397,14 +397,67 @@ class ARROW_EXPORT Field : public detail::Fingerprintable,
   /// \brief Options that control the behavior of `MergeWith`.
   /// Options are to be added to allow type conversions, including integer
   /// widening, promotion from integer to float, or conversion to or from 
boolean.
-  struct MergeOptions {
+  struct ARROW_EXPORT MergeOptions : public 
util::ToStringOstreamable<MergeOptions> {
     /// If true, a Field of NullType can be unified with a Field of another 
type.
     /// The unified field will be of the other type and become nullable.
     /// Nullability will be promoted to the looser option (nullable if one is 
not
     /// nullable).
     bool promote_nullability = true;
 
+    /// Allow a decimal to be unified with another decimal of the same
+    /// width, adjusting scale and precision as appropriate. May fail
+    /// if the adjustment is not possible.
+    bool promote_decimal = false;
+
+    /// Allow a decimal to be promoted to a float. The float type will
+    /// not itself be promoted (e.g. Decimal128 + Float32 = Float32).
+    bool promote_decimal_to_float = false;
+
+    /// Allow an integer to be promoted to a decimal.
+    ///
+    /// May fail if the decimal has insufficient precision to
+    /// accommodate the integer. (See increase_decimal_precision.)
+    bool promote_integer_to_decimal = false;
+
+    /// Allow an integer of a given bit width to be promoted to a
+    /// float; the result will be a float of an equal or greater bit
+    /// width to both of the inputs.
+    bool promote_integer_to_float = false;
+
+    /// Allow an unsigned integer of a given bit width to be promoted
+    /// to a signed integer of the equal or greater bit width.
+    bool promote_integer_sign = false;
+
+    /// Allow an integer, float, or decimal of a given bit width to be
+    /// promoted to an equivalent type of a greater bit width.
+    bool promote_numeric_width = false;
+
+    /// Allow strings to be promoted to binary types. Promotion of fixed size
+    /// binary types to variable sized formats, and binary to large binary,
+    /// and string to large string.
+    bool promote_binary = false;
+
+    /// Second to millisecond, Time32 to Time64, Time32(SECOND) to 
Time32(MILLI), etc
+    bool promote_temporal_unit = false;
+
+    /// Allow promotion from a list to a large-list
+    bool promote_list = false;

Review Comment:
   and also from fixed size list to variable sized list?



##########
python/pyarrow/types.pxi:
##########
@@ -3173,6 +3215,9 @@ def unify_schemas(schemas):
     ----------
     schemas : list of Schema
         Schemas to merge into a single one.
+    options : FieldMergeOptions or string, optional

Review Comment:
   Maybe we should align the keyword name with the other places where we pass 
this? In `concat_tables`, it's called `field_merge_options`, which is quite 
long, so another option is something in between (`merge_options`, although 
that's probably not very descriptive being about the field types, or 
`promote_options`, but then that deviates from the naming in C++)



##########
python/pyarrow/types.pxi:
##########
@@ -3173,6 +3215,9 @@ def unify_schemas(schemas):
     ----------
     schemas : list of Schema
         Schemas to merge into a single one.
+    options : FieldMergeOptions or string, optional
+        Options for merging duplicate fields.

Review Comment:
   ```suggestion
           Options for merging fields with the same name (which type to promote 
to).
   ```
   
   ("duplicate" is a bit confusing in this context I think)



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to