[GitHub] [arrow] Fokko commented on a diff in pull request #36846: GH-36845: [C++][Python] Allow type promotion on `pa.concat_tables`

via GitHub Mon, 28 Aug 2023 02:23:49 -0700


Fokko commented on code in PR #36846:
URL: https://github.com/apache/arrow/pull/36846#discussion_r1307176960



##########
cpp/src/arrow/type.cc:
##########
@@ -327,6 +315,449 @@ std::shared_ptr<Field> Field::WithNullable(const bool 
nullable) const {
   return std::make_shared<Field>(name_, type_, nullable, metadata_);
 }
 
+Field::MergeOptions Field::MergeOptions::Permissive() {
+  MergeOptions options = Defaults();
+  options.promote_nullability = true;
+  options.promote_decimal = true;
+  options.promote_decimal_to_float = true;
+  options.promote_integer_to_decimal = true;
+  options.promote_integer_to_float = true;
+  options.promote_integer_sign = true;
+  options.promote_numeric_width = true;
+  options.promote_binary = true;
+  options.promote_temporal_unit = true;
+  options.promote_list = true;
+  options.promote_dictionary = true;
+  options.promote_dictionary_ordered = false;
+  return options;
+}
+
+std::string Field::MergeOptions::ToString() const {
+  std::stringstream ss;
+  ss << "MergeOptions{";
+  ss << "promote_nullability=" << (promote_nullability ? "true" : "false");
+  ss << ", promote_decimal=" << (promote_decimal ? "true" : "false");
+  ss << ", promote_decimal_to_float=" << (promote_decimal_to_float ? "true" : 
"false");
+  ss << ", promote_integer_to_decimal="
+     << (promote_integer_to_decimal ? "true" : "false");
+  ss << ", promote_integer_to_float=" << (promote_integer_to_float ? "true" : 
"false");
+  ss << ", promote_integer_sign=" << (promote_integer_sign ? "true" : "false");
+  ss << ", promote_numeric_width=" << (promote_numeric_width ? "true" : 
"false");
+  ss << ", promote_binary=" << (promote_binary ? "true" : "false");
+  ss << ", promote_temporal_unit=" << (promote_temporal_unit ? "true" : 
"false");
+  ss << ", promote_list=" << (promote_list ? "true" : "false");
+  ss << ", promote_dictionary=" << (promote_dictionary ? "true" : "false");
+  ss << ", promote_dictionary_ordered="
+     << (promote_dictionary_ordered ? "true" : "false");
+  ss << '}';
+  return ss.str();
+}
+
+namespace {
+// Utilities for Field::MergeWith
+
+std::shared_ptr<DataType> MakeBinary(const DataType& type) {
+  switch (type.id()) {
+    case Type::BINARY:
+    case Type::STRING:
+      return binary();
+    case Type::LARGE_BINARY:
+    case Type::LARGE_STRING:
+      return large_binary();
+    default:
+      Unreachable("Hit an unknown type");
+  }
+  return nullptr;
+}
+
+Result<std::shared_ptr<DataType>> WidenDecimals(
+    const std::shared_ptr<DataType>& promoted_type,
+    const std::shared_ptr<DataType>& other_type, const Field::MergeOptions& 
options) {
+  const auto& left = checked_cast<const DecimalType&>(*promoted_type);
+  const auto& right = checked_cast<const DecimalType&>(*other_type);
+  if (!options.promote_numeric_width && left.bit_width() != right.bit_width()) 
{
+    return Status::TypeError(
+        "Cannot promote decimal128 to decimal256 without 
promote_numeric_width=true");
+  }
+  const int32_t max_scale = std::max<int32_t>(left.scale(), right.scale());
+  const int32_t common_precision =
+      std::max<int32_t>(left.precision() + max_scale - left.scale(),
+                        right.precision() + max_scale - right.scale());
+  if (left.id() == Type::DECIMAL256 || right.id() == Type::DECIMAL256 ||
+      common_precision > BasicDecimal128::kMaxPrecision) {
+    return DecimalType::Make(Type::DECIMAL256, common_precision, max_scale);
+  }
+  return DecimalType::Make(Type::DECIMAL128, common_precision, max_scale);
+}
+
+Result<std::shared_ptr<DataType>> MergeTypes(std::shared_ptr<DataType> 
promoted_type,
+                                             std::shared_ptr<DataType> 
other_type,
+                                             const Field::MergeOptions& 
options);
+
+// Merge temporal types based on options. Returns nullptr for non-temporal 
types.
+Result<std::shared_ptr<DataType>> MaybeMergeTemporalTypes(
+    const std::shared_ptr<DataType>& promoted_type,
+    const std::shared_ptr<DataType>& other_type, const Field::MergeOptions& 
options) {
+  if (options.promote_temporal_unit) {
+    if (promoted_type->id() == Type::DATE32 && other_type->id() == 
Type::DATE64) {
+      return date64();
+    }
+    if (promoted_type->id() == Type::DATE64 && other_type->id() == 
Type::DATE32) {
+      return date64();
+    }
+
+    if (promoted_type->id() == Type::DURATION && other_type->id() == 
Type::DURATION) {
+      const auto& left = checked_cast<const DurationType&>(*promoted_type);
+      const auto& right = checked_cast<const DurationType&>(*other_type);
+      return duration(std::max(left.unit(), right.unit()));
+    }
+
+    if (is_time(promoted_type->id()) && is_time(other_type->id())) {
+      const auto& left = checked_cast<const TimeType&>(*promoted_type);
+      const auto& right = checked_cast<const TimeType&>(*other_type);
+      const auto unit = std::max(left.unit(), right.unit());
+      if (unit == TimeUnit::MICRO || unit == TimeUnit::NANO) {
+        return time64(unit);
+      }
+      return time32(unit);
+    }
+  }
+
+  if (promoted_type->id() == Type::TIMESTAMP && other_type->id() == 
Type::TIMESTAMP) {
+    const auto& left = checked_cast<const TimestampType&>(*promoted_type);
+    const auto& right = checked_cast<const TimestampType&>(*other_type);
+    if (left.timezone().empty() ^ right.timezone().empty()) {
+      return Status::TypeError(
+          "Cannot merge timestamp with timezone and timestamp without 
timezone");
+    }
+    if (left.timezone() != right.timezone()) {
+      return Status::TypeError("Cannot merge timestamps with differing 
timezones");
+    }
+    if (options.promote_temporal_unit) {
+      return timestamp(std::max(left.unit(), right.unit()), left.timezone());
+    }
+  }
+
+  return nullptr;
+}
+
+// Merge numeric types based on options. Returns nullptr for non-numeric types.
+Result<std::shared_ptr<DataType>> MaybeMergeNumericTypes(
+    std::shared_ptr<DataType> promoted_type, std::shared_ptr<DataType> 
other_type,
+    const Field::MergeOptions& options) {
+  bool promoted = false;
+  if (options.promote_decimal_to_float) {
+    if (is_decimal(promoted_type->id()) && is_floating(other_type->id())) {
+      promoted_type = other_type;
+      promoted = true;
+    } else if (is_floating(promoted_type->id()) && 
is_decimal(other_type->id())) {
+      other_type = promoted_type;
+      promoted = true;
+    }
+  }
+
+  if (options.promote_integer_to_decimal &&
+      ((is_decimal(promoted_type->id()) && is_integer(other_type->id())) ||
+       (is_decimal(other_type->id()) && is_integer(promoted_type->id())))) {
+    if (is_integer(promoted_type->id()) && is_decimal(other_type->id())) {
+      // Other type is always the int
+      promoted_type.swap(other_type);
+    }
+    ARROW_ASSIGN_OR_RAISE(const int32_t precision,
+                          MaxDecimalDigitsForInteger(other_type->id()));
+    ARROW_ASSIGN_OR_RAISE(const auto promoted_decimal,
+                          DecimalType::Make(promoted_type->id(), precision, 
0));
+    ARROW_ASSIGN_OR_RAISE(promoted_type,
+                          WidenDecimals(promoted_type, promoted_decimal, 
options));
+    return promoted_type;
+  }
+
+  if (options.promote_decimal && is_decimal(promoted_type->id()) &&
+      is_decimal(other_type->id())) {
+    ARROW_ASSIGN_OR_RAISE(promoted_type,
+                          WidenDecimals(promoted_type, other_type, options));
+    return promoted_type;
+  }
+
+  if (options.promote_integer_sign && 
((is_unsigned_integer(promoted_type->id()) &&
+                                        is_signed_integer(other_type->id())) ||
+                                       (is_signed_integer(promoted_type->id()) 
&&
+                                        
is_unsigned_integer(other_type->id())))) {
+    if (!options.promote_numeric_width &&
+        bit_width(promoted_type->id()) != bit_width(other_type->id())) {
+      return Status::TypeError(
+          "Cannot widen (un)signed integers without 
promote_numeric_width=true");
+    }
+    const int max_width =
+        std::max<int>(bit_width(promoted_type->id()), 
bit_width(other_type->id()));
+
+    if (max_width >= 64) {
+      promoted_type = int64();
+    } else if (max_width >= 32) {
+      promoted_type = int32();

Review Comment:
   You're right:
   ```
   >>> import pyarrow as pa
   >>> pa.array([4294967295], type=pa.uint32())
   <pyarrow.lib.UInt32Array object at 0x1542b2f20>
   [
     4294967295
   ]
   >>> pa.array([4294967295], type=pa.int32())
   Traceback (most recent call last):
     File "<stdin>", line 1, in <module>
     File "pyarrow/array.pxi", line 320, in pyarrow.lib.array
     File "pyarrow/array.pxi", line 39, in pyarrow.lib._sequence_to_array
     File "pyarrow/error.pxi", line 144, in 
pyarrow.lib.pyarrow_internal_check_status
     File "pyarrow/error.pxi", line 100, in pyarrow.lib.check_status
   pyarrow.lib.ArrowInvalid: Value 4294967295 too large to fit in C integer type
   ```
   But then again we should also check for negatives:
   ```
   >>> pa.array([-1], type=pa.int32())
   <pyarrow.lib.Int32Array object at 0x1542b3220>
   [
     -1
   ]
   >>> pa.array([-1], type=pa.uint32())
   Traceback (most recent call last):
     File "<stdin>", line 1, in <module>
     File "pyarrow/array.pxi", line 320, in pyarrow.lib.array
     File "pyarrow/array.pxi", line 39, in pyarrow.lib._sequence_to_array
     File "pyarrow/error.pxi", line 144, in 
pyarrow.lib.pyarrow_internal_check_status
   OverflowError: can't convert negative value to unsigned int
   ```
   Otherwise, it will raise an `OverflowError` on runtime



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

[GitHub] [arrow] Fokko commented on a diff in pull request #36846: GH-36845: [C++][Python] Allow type promotion on `pa.concat_tables`

Reply via email to