jorisvandenbossche commented on code in PR #36846:
URL: https://github.com/apache/arrow/pull/36846#discussion_r1303020868


##########
cpp/src/arrow/type.h:
##########
@@ -397,14 +397,67 @@ class ARROW_EXPORT Field : public detail::Fingerprintable,
   /// \brief Options that control the behavior of `MergeWith`.
   /// Options are to be added to allow type conversions, including integer
   /// widening, promotion from integer to float, or conversion to or from 
boolean.
-  struct MergeOptions {
+  struct ARROW_EXPORT MergeOptions : public 
util::ToStringOstreamable<MergeOptions> {
     /// If true, a Field of NullType can be unified with a Field of another 
type.
     /// The unified field will be of the other type and become nullable.
     /// Nullability will be promoted to the looser option (nullable if one is 
not
     /// nullable).
     bool promote_nullability = true;
 
+    /// Allow a decimal to be unified with another decimal of the same
+    /// width, adjusting scale and precision as appropriate. May fail
+    /// if the adjustment is not possible.
+    bool promote_decimal = false;
+
+    /// Allow a decimal to be promoted to a float. The float type will
+    /// not itself be promoted (e.g. Decimal128 + Float32 = Float32).
+    bool promote_decimal_to_float = false;
+
+    /// Allow an integer to be promoted to a decimal.
+    ///
+    /// May fail if the decimal has insufficient precision to
+    /// accommodate the integer. (See increase_decimal_precision.)
+    bool promote_integer_to_decimal = false;
+
+    /// Allow an integer of a given bit width to be promoted to a
+    /// float; the result will be a float of an equal or greater bit
+    /// width to both of the inputs.
+    bool promote_integer_to_float = false;

Review Comment:
   I think for some of those cases, it might be useful to given an example 
(like "int64 + float32 -> float64").



##########
cpp/src/arrow/type.cc:
##########
@@ -327,6 +315,449 @@ std::shared_ptr<Field> Field::WithNullable(const bool 
nullable) const {
   return std::make_shared<Field>(name_, type_, nullable, metadata_);
 }
 
+Field::MergeOptions Field::MergeOptions::Permissive() {
+  MergeOptions options = Defaults();
+  options.promote_nullability = true;
+  options.promote_decimal = true;
+  options.promote_decimal_to_float = true;
+  options.promote_integer_to_decimal = true;
+  options.promote_integer_to_float = true;
+  options.promote_integer_sign = true;
+  options.promote_numeric_width = true;
+  options.promote_binary = true;
+  options.promote_temporal_unit = true;
+  options.promote_list = true;
+  options.promote_dictionary = true;
+  options.promote_dictionary_ordered = false;
+  return options;
+}
+
+std::string Field::MergeOptions::ToString() const {
+  std::stringstream ss;
+  ss << "MergeOptions{";
+  ss << "promote_nullability=" << (promote_nullability ? "true" : "false");
+  ss << ", promote_decimal=" << (promote_decimal ? "true" : "false");
+  ss << ", promote_decimal_to_float=" << (promote_decimal_to_float ? "true" : 
"false");
+  ss << ", promote_integer_to_decimal="
+     << (promote_integer_to_decimal ? "true" : "false");
+  ss << ", promote_integer_to_float=" << (promote_integer_to_float ? "true" : 
"false");
+  ss << ", promote_integer_sign=" << (promote_integer_sign ? "true" : "false");
+  ss << ", promote_numeric_width=" << (promote_numeric_width ? "true" : 
"false");
+  ss << ", promote_binary=" << (promote_binary ? "true" : "false");
+  ss << ", promote_temporal_unit=" << (promote_temporal_unit ? "true" : 
"false");
+  ss << ", promote_list=" << (promote_list ? "true" : "false");
+  ss << ", promote_dictionary=" << (promote_dictionary ? "true" : "false");
+  ss << ", promote_dictionary_ordered="
+     << (promote_dictionary_ordered ? "true" : "false");
+  ss << '}';
+  return ss.str();
+}
+
+namespace {
+// Utilities for Field::MergeWith
+
+std::shared_ptr<DataType> MakeBinary(const DataType& type) {
+  switch (type.id()) {
+    case Type::BINARY:
+    case Type::STRING:
+      return binary();
+    case Type::LARGE_BINARY:
+    case Type::LARGE_STRING:
+      return large_binary();
+    default:
+      Unreachable("Hit an unknown type");
+  }
+  return nullptr;
+}
+
+Result<std::shared_ptr<DataType>> WidenDecimals(
+    const std::shared_ptr<DataType>& promoted_type,
+    const std::shared_ptr<DataType>& other_type, const Field::MergeOptions& 
options) {
+  const auto& left = checked_cast<const DecimalType&>(*promoted_type);
+  const auto& right = checked_cast<const DecimalType&>(*other_type);
+  if (!options.promote_numeric_width && left.bit_width() != right.bit_width()) 
{
+    return Status::TypeError(
+        "Cannot promote decimal128 to decimal256 without 
promote_numeric_width=true");
+  }
+  const int32_t max_scale = std::max<int32_t>(left.scale(), right.scale());
+  const int32_t common_precision =
+      std::max<int32_t>(left.precision() + max_scale - left.scale(),
+                        right.precision() + max_scale - right.scale());
+  if (left.id() == Type::DECIMAL256 || right.id() == Type::DECIMAL256 ||
+      common_precision > BasicDecimal128::kMaxPrecision) {
+    return DecimalType::Make(Type::DECIMAL256, common_precision, max_scale);
+  }
+  return DecimalType::Make(Type::DECIMAL128, common_precision, max_scale);
+}
+
+Result<std::shared_ptr<DataType>> MergeTypes(std::shared_ptr<DataType> 
promoted_type,
+                                             std::shared_ptr<DataType> 
other_type,
+                                             const Field::MergeOptions& 
options);
+
+// Merge temporal types based on options. Returns nullptr for non-temporal 
types.
+Result<std::shared_ptr<DataType>> MaybeMergeTemporalTypes(
+    const std::shared_ptr<DataType>& promoted_type,
+    const std::shared_ptr<DataType>& other_type, const Field::MergeOptions& 
options) {
+  if (options.promote_temporal_unit) {
+    if (promoted_type->id() == Type::DATE32 && other_type->id() == 
Type::DATE64) {
+      return date64();
+    }
+    if (promoted_type->id() == Type::DATE64 && other_type->id() == 
Type::DATE32) {
+      return date64();
+    }
+
+    if (promoted_type->id() == Type::DURATION && other_type->id() == 
Type::DURATION) {
+      const auto& left = checked_cast<const DurationType&>(*promoted_type);
+      const auto& right = checked_cast<const DurationType&>(*other_type);
+      return duration(std::max(left.unit(), right.unit()));
+    }
+
+    if (is_time(promoted_type->id()) && is_time(other_type->id())) {
+      const auto& left = checked_cast<const TimeType&>(*promoted_type);
+      const auto& right = checked_cast<const TimeType&>(*other_type);
+      const auto unit = std::max(left.unit(), right.unit());
+      if (unit == TimeUnit::MICRO || unit == TimeUnit::NANO) {
+        return time64(unit);
+      }
+      return time32(unit);
+    }
+  }
+
+  if (promoted_type->id() == Type::TIMESTAMP && other_type->id() == 
Type::TIMESTAMP) {
+    const auto& left = checked_cast<const TimestampType&>(*promoted_type);
+    const auto& right = checked_cast<const TimestampType&>(*other_type);
+    if (left.timezone().empty() ^ right.timezone().empty()) {
+      return Status::TypeError(
+          "Cannot merge timestamp with timezone and timestamp without 
timezone");
+    }
+    if (left.timezone() != right.timezone()) {
+      return Status::TypeError("Cannot merge timestamps with differing 
timezones");
+    }

Review Comment:
   A potential option could be actually allow promoting timezones, and with the 
result using UTC (the actual time points will stay the same since we store UTC 
under the hood, you only loose the information about the specified timezone)
   
   (but this is perfect to discuss for a follow-up issue as well ;))



##########
cpp/src/arrow/type.h:
##########
@@ -397,14 +397,63 @@ class ARROW_EXPORT Field : public detail::Fingerprintable,
   /// \brief Options that control the behavior of `MergeWith`.
   /// Options are to be added to allow type conversions, including integer
   /// widening, promotion from integer to float, or conversion to or from 
boolean.
-  struct MergeOptions {
+  struct ARROW_EXPORT MergeOptions : public 
util::ToStringOstreamable<MergeOptions> {
     /// If true, a Field of NullType can be unified with a Field of another 
type.
     /// The unified field will be of the other type and become nullable.
     /// Nullability will be promoted to the looser option (nullable if one is 
not
     /// nullable).
     bool promote_nullability = true;
 
+    /// Allow a decimal to be unified with another decimal of the same
+    /// width, adjusting scale and precision as appropriate. May fail
+    /// if the adjustment is not possible.
+    bool promote_decimal = false;
+
+    /// Allow a decimal to be promoted to a float. The float type will
+    /// not itself be promoted (e.g. Decimal128 + Float32 = Float32).
+    bool promote_decimal_to_float = false;
+
+    /// Allow an integer to be promoted to a decimal.
+    ///
+    /// May fail if the decimal has insufficient precision to
+    /// accommodate the integer. (See increase_decimal_precision.)
+    bool promote_integer_to_decimal = false;
+
+    /// Allow an integer of a given bit width to be promoted to a
+    /// float; the result will be a float of an equal or greater bit
+    /// width to both of the inputs.
+    bool promote_integer_to_float = false;
+
+    /// Allow an unsigned integer of a given bit width to be promoted
+    /// to a signed integer of the equal or greater bit width.
+    bool promote_integer_sign = false;
+
+    /// Allow an integer, float, or decimal of a given bit width to be
+    /// promoted to an equivalent type of a greater bit width.
+    bool promote_numeric_width = false;
+
+    /// Allow strings to be promoted to binary types.
+    bool promote_binary = false;

Review Comment:
   Don't know if this has come up before, but would it make sense to have a 
separate `promote_string_to_binary`? Logically that seems something different 
from allowing string to become large_string
   
   (there are already many options though ..)



##########
cpp/src/arrow/type.cc:
##########
@@ -327,6 +315,449 @@ std::shared_ptr<Field> Field::WithNullable(const bool 
nullable) const {
   return std::make_shared<Field>(name_, type_, nullable, metadata_);
 }
 
+Field::MergeOptions Field::MergeOptions::Permissive() {
+  MergeOptions options = Defaults();
+  options.promote_nullability = true;
+  options.promote_decimal = true;
+  options.promote_decimal_to_float = true;
+  options.promote_integer_to_decimal = true;
+  options.promote_integer_to_float = true;
+  options.promote_integer_sign = true;
+  options.promote_numeric_width = true;
+  options.promote_binary = true;
+  options.promote_temporal_unit = true;
+  options.promote_list = true;
+  options.promote_dictionary = true;
+  options.promote_dictionary_ordered = false;
+  return options;
+}
+
+std::string Field::MergeOptions::ToString() const {
+  std::stringstream ss;
+  ss << "MergeOptions{";
+  ss << "promote_nullability=" << (promote_nullability ? "true" : "false");
+  ss << ", promote_decimal=" << (promote_decimal ? "true" : "false");
+  ss << ", promote_decimal_to_float=" << (promote_decimal_to_float ? "true" : 
"false");
+  ss << ", promote_integer_to_decimal="
+     << (promote_integer_to_decimal ? "true" : "false");
+  ss << ", promote_integer_to_float=" << (promote_integer_to_float ? "true" : 
"false");
+  ss << ", promote_integer_sign=" << (promote_integer_sign ? "true" : "false");
+  ss << ", promote_numeric_width=" << (promote_numeric_width ? "true" : 
"false");
+  ss << ", promote_binary=" << (promote_binary ? "true" : "false");
+  ss << ", promote_temporal_unit=" << (promote_temporal_unit ? "true" : 
"false");
+  ss << ", promote_list=" << (promote_list ? "true" : "false");
+  ss << ", promote_dictionary=" << (promote_dictionary ? "true" : "false");
+  ss << ", promote_dictionary_ordered="
+     << (promote_dictionary_ordered ? "true" : "false");
+  ss << '}';
+  return ss.str();
+}
+
+namespace {
+// Utilities for Field::MergeWith
+
+std::shared_ptr<DataType> MakeBinary(const DataType& type) {
+  switch (type.id()) {
+    case Type::BINARY:
+    case Type::STRING:
+      return binary();
+    case Type::LARGE_BINARY:
+    case Type::LARGE_STRING:
+      return large_binary();
+    default:
+      Unreachable("Hit an unknown type");
+  }
+  return nullptr;
+}
+
+Result<std::shared_ptr<DataType>> WidenDecimals(
+    const std::shared_ptr<DataType>& promoted_type,
+    const std::shared_ptr<DataType>& other_type, const Field::MergeOptions& 
options) {
+  const auto& left = checked_cast<const DecimalType&>(*promoted_type);
+  const auto& right = checked_cast<const DecimalType&>(*other_type);
+  if (!options.promote_numeric_width && left.bit_width() != right.bit_width()) 
{
+    return Status::TypeError(
+        "Cannot promote decimal128 to decimal256 without 
promote_numeric_width=true");
+  }
+  const int32_t max_scale = std::max<int32_t>(left.scale(), right.scale());
+  const int32_t common_precision =
+      std::max<int32_t>(left.precision() + max_scale - left.scale(),
+                        right.precision() + max_scale - right.scale());
+  if (left.id() == Type::DECIMAL256 || right.id() == Type::DECIMAL256 ||
+      common_precision > BasicDecimal128::kMaxPrecision) {
+    return DecimalType::Make(Type::DECIMAL256, common_precision, max_scale);
+  }
+  return DecimalType::Make(Type::DECIMAL128, common_precision, max_scale);
+}
+
+Result<std::shared_ptr<DataType>> MergeTypes(std::shared_ptr<DataType> 
promoted_type,
+                                             std::shared_ptr<DataType> 
other_type,
+                                             const Field::MergeOptions& 
options);
+
+// Merge temporal types based on options. Returns nullptr for non-temporal 
types.
+Result<std::shared_ptr<DataType>> MaybeMergeTemporalTypes(
+    const std::shared_ptr<DataType>& promoted_type,
+    const std::shared_ptr<DataType>& other_type, const Field::MergeOptions& 
options) {
+  if (options.promote_temporal_unit) {
+    if (promoted_type->id() == Type::DATE32 && other_type->id() == 
Type::DATE64) {
+      return date64();
+    }
+    if (promoted_type->id() == Type::DATE64 && other_type->id() == 
Type::DATE32) {
+      return date64();
+    }
+
+    if (promoted_type->id() == Type::DURATION && other_type->id() == 
Type::DURATION) {
+      const auto& left = checked_cast<const DurationType&>(*promoted_type);
+      const auto& right = checked_cast<const DurationType&>(*other_type);
+      return duration(std::max(left.unit(), right.unit()));
+    }
+
+    if (is_time(promoted_type->id()) && is_time(other_type->id())) {
+      const auto& left = checked_cast<const TimeType&>(*promoted_type);
+      const auto& right = checked_cast<const TimeType&>(*other_type);
+      const auto unit = std::max(left.unit(), right.unit());
+      if (unit == TimeUnit::MICRO || unit == TimeUnit::NANO) {
+        return time64(unit);
+      }
+      return time32(unit);
+    }
+  }
+
+  if (promoted_type->id() == Type::TIMESTAMP && other_type->id() == 
Type::TIMESTAMP) {
+    const auto& left = checked_cast<const TimestampType&>(*promoted_type);
+    const auto& right = checked_cast<const TimestampType&>(*other_type);
+    if (left.timezone().empty() ^ right.timezone().empty()) {
+      return Status::TypeError(
+          "Cannot merge timestamp with timezone and timestamp without 
timezone");
+    }
+    if (left.timezone() != right.timezone()) {
+      return Status::TypeError("Cannot merge timestamps with differing 
timezones");
+    }
+    if (options.promote_temporal_unit) {
+      return timestamp(std::max(left.unit(), right.unit()), left.timezone());
+    }
+  }
+
+  return nullptr;
+}
+
+// Merge numeric types based on options. Returns nullptr for non-numeric types.
+Result<std::shared_ptr<DataType>> MaybeMergeNumericTypes(
+    std::shared_ptr<DataType> promoted_type, std::shared_ptr<DataType> 
other_type,
+    const Field::MergeOptions& options) {
+  bool promoted = false;
+  if (options.promote_decimal_to_float) {
+    if (is_decimal(promoted_type->id()) && is_floating(other_type->id())) {
+      promoted_type = other_type;
+      promoted = true;
+    } else if (is_floating(promoted_type->id()) && 
is_decimal(other_type->id())) {
+      other_type = promoted_type;
+      promoted = true;
+    }
+  }
+
+  if (options.promote_integer_to_decimal &&
+      ((is_decimal(promoted_type->id()) && is_integer(other_type->id())) ||
+       (is_decimal(other_type->id()) && is_integer(promoted_type->id())))) {
+    if (is_integer(promoted_type->id()) && is_decimal(other_type->id())) {
+      // Other type is always the int
+      promoted_type.swap(other_type);
+    }
+    ARROW_ASSIGN_OR_RAISE(const int32_t precision,
+                          MaxDecimalDigitsForInteger(other_type->id()));
+    ARROW_ASSIGN_OR_RAISE(const auto promoted_decimal,
+                          DecimalType::Make(promoted_type->id(), precision, 
0));
+    ARROW_ASSIGN_OR_RAISE(promoted_type,
+                          WidenDecimals(promoted_type, promoted_decimal, 
options));

Review Comment:
   ~How useful is it to widen the decimal to the biggest possible precision 
needed to store the largest integer? Because often you have small integers. And 
you could also keep the existing decimal type, and rely on safe casting to 
catch errors?~
   
   Actually, it seems our casting kernel has the same logic, so nevermind (it 
will error anyway)



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to