emkornfield commented on a change in pull request #9582: URL: https://github.com/apache/arrow/pull/9582#discussion_r584862144
########## File path: cpp/src/parquet/statistics.cc ########## @@ -133,56 +136,95 @@ struct CompareHelper<Int96Type, is_signed> { static T Max(int type_length, const T& a, const T& b) { return Compare(0, a, b) ? b : a; } -}; // namespace parquet - -template <typename DType, bool is_signed> -struct ByteLikeCompareHelperBase { - using T = ByteArrayType::c_type; - using PtrType = typename std::conditional<is_signed, int8_t, uint8_t>::type; +}; - static T DefaultMin() { return {}; } - static T DefaultMax() { return {}; } - static T Coalesce(T val, T fallback) { return val; } +template <typename T, bool is_signed> +struct BinaryLikeComparer {}; - static inline bool Compare(int type_length, const T& a, const T& b) { - const auto* aptr = reinterpret_cast<const PtrType*>(a.ptr); - const auto* bptr = reinterpret_cast<const PtrType*>(b.ptr); - return std::lexicographical_compare(aptr, aptr + a.len, bptr, bptr + b.len); +template <typename T> +struct BinaryLikeComparer<T, /*is_signed=*/false> { + static bool Compare(int type_length, const T& a, const T& b) { + int a_length = value_length(type_length, a); + int b_length = value_length(type_length, b); + // Unsigned comparison is used for non-numeric types so straight + // lexiographic comparison makes sense. (a.ptr is always unsigned).... + return std::lexicographical_compare(a.ptr, a.ptr + a_length, b.ptr, b.ptr + b_length); } +}; - static T Min(int type_length, const T& a, const T& b) { - if (a.ptr == nullptr) return b; - if (b.ptr == nullptr) return a; - return Compare(type_length, a, b) ? a : b; - } +template <typename T> +struct BinaryLikeComparer<T, /*is_signed=*/true> { + static bool Compare(int type_length, const T& a, const T& b) { + // Is signed is used for integers encoded as big-endian twos + // complement integers. (e.g. decimals). + int a_length = value_length(type_length, a); + int b_length = value_length(type_length, b); + + // At least of the lengths is zero. + if (a_length == 0 || b_length == 0) { + return a_length == 0 && b_length > 0; + } - static T Max(int type_length, const T& a, const T& b) { - if (a.ptr == nullptr) return b; - if (b.ptr == nullptr) return a; - return Compare(type_length, a, b) ? b : a; + int8_t first_a = *a.ptr; + int8_t first_b = *b.ptr; + // We can short circuit for different signed numbers or + // for equal length bytes arrays that have different first bytes. + if ((0x80 & first_a) != (0x80 & first_b) || + (a_length == b_length && first_a != first_b)) { + return first_a < first_b; + } Review comment: It is not entirely clear to me all potential use cases for this, so this is somewhat defensive. Decimals can be encoded either variable width or fixed width within a parquet file. Simply comparing the first bytes fails when sign extending values. For instance `0xFF80` and `0x80` are equal. This scenario seems like it could arise in two cases: 1. Trying to compare statistics in encoded form between a a fixed width encoded decimal column and a variable length encoded column (likely from two separate files). 2. The specification only says `The minimum number of bytes to store the unscaled value should be used.` since this isn't a "must" I could image some implementations doing something wacky here still be in compliance. The first scenario seems more likely but still potentially rare for this code to hit in practice but like I said I'm being defensive. If this reasoning seems correct to you, I can add a comment to this affect. I can also remove the code but I think that might lead to bugs down the road (I quickly checked the java implementation and it looks like they have a notion of sign extension also. A third option is we could still short-circuit if neither first byte corresponded to a sign extended value (I would need to think about it, but I think this would would work). ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org