[GitHub] [arrow] pitrou commented on a change in pull request #9582: PARQUET-1655: [C++] Fix comparison of Decimal values in statistics

2021-03-04 Thread GitBox


pitrou commented on a change in pull request #9582:
URL: https://github.com/apache/arrow/pull/9582#discussion_r587428039



##
File path: cpp/src/parquet/statistics.cc
##
@@ -133,56 +136,95 @@ struct CompareHelper {
   static T Max(int type_length, const T& a, const T& b) {
 return Compare(0, a, b) ? b : a;
   }
-};  // namespace parquet
-
-template 
-struct ByteLikeCompareHelperBase {
-  using T = ByteArrayType::c_type;
-  using PtrType = typename std::conditional::type;
+};
 
-  static T DefaultMin() { return {}; }
-  static T DefaultMax() { return {}; }
-  static T Coalesce(T val, T fallback) { return val; }
+template 
+struct BinaryLikeComparer {};
 
-  static inline bool Compare(int type_length, const T& a, const T& b) {
-const auto* aptr = reinterpret_cast(a.ptr);
-const auto* bptr = reinterpret_cast(b.ptr);
-return std::lexicographical_compare(aptr, aptr + a.len, bptr, bptr + 
b.len);
+template 
+struct BinaryLikeComparer {
+  static bool Compare(int type_length, const T& a, const T& b) {
+int a_length = value_length(type_length, a);
+int b_length = value_length(type_length, b);
+// Unsigned comparison is used for non-numeric types so straight
+// lexiographic comparison makes sense. (a.ptr is always unsigned)
+return std::lexicographical_compare(a.ptr, a.ptr + a_length, b.ptr, b.ptr 
+ b_length);
   }
+};
 
-  static T Min(int type_length, const T& a, const T& b) {
-if (a.ptr == nullptr) return b;
-if (b.ptr == nullptr) return a;
-return Compare(type_length, a, b) ? a : b;
-  }
+template 
+struct BinaryLikeComparer {
+  static bool Compare(int type_length, const T& a, const T& b) {
+// Is signed is used for integers encoded as big-endian twos
+// complement integers. (e.g. decimals).
+int a_length = value_length(type_length, a);
+int b_length = value_length(type_length, b);
+
+// At least of the lengths is zero.
+if (a_length == 0 || b_length == 0) {
+  return a_length == 0 && b_length > 0;
+}
 
-  static T Max(int type_length, const T& a, const T& b) {
-if (a.ptr == nullptr) return b;
-if (b.ptr == nullptr) return a;
-return Compare(type_length, a, b) ? b : a;
+int8_t first_a = *a.ptr;
+int8_t first_b = *b.ptr;
+// We can short circuit for different signed numbers or
+// for equal length bytes arrays that have different first bytes.
+if ((0x80 & first_a) != (0x80 & first_b) ||
+(a_length == b_length && first_a != first_b)) {
+  return first_a < first_b;
+}
+// When the lengths are unequal and the numbers are of the same
+// sign we need to extend the digits.
+const uint8_t* a_start = a.ptr;
+const uint8_t* b_start = b.ptr;
+if (a_length != b_length) {
+  const uint8_t* lead_start = nullptr;
+  const uint8_t* lead_end = nullptr;
+  if (a_length > b_length) {
+int lead_length = a_length - b_length;
+lead_start = a.ptr;
+lead_end = a.ptr + lead_length;
+a_start += lead_length;
+  } else {
+DCHECK_LT(a_length, b_length);
+int lead_length = b_length - a_length;
+lead_start = b.ptr;
+lead_end = b.ptr + lead_length;
+b_start += lead_length;
+  }
+  // Compare extra bytes to the sign extension of the first
+  // byte of the other number.
+  uint8_t extension = first_a < 0 ? 0xFF : 0;
+  for (; lead_start != lead_end; lead_start++) {
+if (*lead_start < extension) {
+  // The first bytes of the long value are less
+  // then the extended short one.  So if a is the long value
+  // we can return true.
+  return a_length > b_length;
+} else if (*lead_start > extension) {

Review comment:
   Yes, thank you.





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] pitrou commented on a change in pull request #9582: PARQUET-1655: [C++] Fix comparison of Decimal values in statistics

2021-03-04 Thread GitBox


pitrou commented on a change in pull request #9582:
URL: https://github.com/apache/arrow/pull/9582#discussion_r587423364



##
File path: cpp/src/parquet/arrow/arrow_reader_writer_test.cc
##
@@ -2673,6 +2673,36 @@ TEST(ArrowReadWrite, Decimal256) {
   CheckSimpleRoundtrip(table, 2, props_store_schema);
 }
 
+TEST(ArrowReadWrite, DecimalStats) {
+  using ::arrow::Decimal128;
+  using ::arrow::field;
+
+  auto type = ::arrow::decimal128(/*precision=*/8, /*scale=*/0);
+
+  const char* json = R"(["255", "128", null, "0", "1", "-127", "-128", "-129", 
"-255"])";
+  auto array = ::arrow::ArrayFromJSON(type, json);
+  auto table = ::arrow::Table::Make(::arrow::schema({field("root", type)}), 
{array});
+
+  std::shared_ptr buffer;
+  ASSERT_NO_FATAL_FAILURE(WriteTableToBuffer(table, /*row_grop_size=*/100,
+ 
default_arrow_writer_properties(), ));
+
+  std::unique_ptr reader;
+  ASSERT_OK_NO_THROW(OpenFile(std::make_shared(buffer),
+  ::arrow::default_memory_pool(), ));
+
+  std::shared_ptr min, max;
+  ReadSingleColumnFileStatistics(std::move(reader), , );
+
+  std::shared_ptr expected_min, expected_max;
+  ASSERT_OK_AND_ASSIGN(expected_min, array->GetScalar(array->length() - 1));
+  ASSERT_OK_AND_ASSIGN(expected_max, array->GetScalar(0));
+  ASSERT_TRUE(expected_min->Equals(*min))

Review comment:
   I think we have `AssertScalarsEqual`?





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] pitrou commented on a change in pull request #9582: PARQUET-1655: [C++] Fix comparison of Decimal values in statistics

2021-03-02 Thread GitBox


pitrou commented on a change in pull request #9582:
URL: https://github.com/apache/arrow/pull/9582#discussion_r585572255



##
File path: cpp/src/parquet/statistics.cc
##
@@ -133,56 +136,95 @@ struct CompareHelper {
   static T Max(int type_length, const T& a, const T& b) {
 return Compare(0, a, b) ? b : a;
   }
-};  // namespace parquet
-
-template 
-struct ByteLikeCompareHelperBase {
-  using T = ByteArrayType::c_type;
-  using PtrType = typename std::conditional::type;
+};
 
-  static T DefaultMin() { return {}; }
-  static T DefaultMax() { return {}; }
-  static T Coalesce(T val, T fallback) { return val; }
+template 
+struct BinaryLikeComparer {};
 
-  static inline bool Compare(int type_length, const T& a, const T& b) {
-const auto* aptr = reinterpret_cast(a.ptr);
-const auto* bptr = reinterpret_cast(b.ptr);
-return std::lexicographical_compare(aptr, aptr + a.len, bptr, bptr + 
b.len);
+template 
+struct BinaryLikeComparer {
+  static bool Compare(int type_length, const T& a, const T& b) {
+int a_length = value_length(type_length, a);
+int b_length = value_length(type_length, b);
+// Unsigned comparison is used for non-numeric types so straight
+// lexiographic comparison makes sense. (a.ptr is always unsigned)
+return std::lexicographical_compare(a.ptr, a.ptr + a_length, b.ptr, b.ptr 
+ b_length);
   }
+};
 
-  static T Min(int type_length, const T& a, const T& b) {
-if (a.ptr == nullptr) return b;
-if (b.ptr == nullptr) return a;
-return Compare(type_length, a, b) ? a : b;
-  }
+template 
+struct BinaryLikeComparer {
+  static bool Compare(int type_length, const T& a, const T& b) {
+// Is signed is used for integers encoded as big-endian twos
+// complement integers. (e.g. decimals).
+int a_length = value_length(type_length, a);
+int b_length = value_length(type_length, b);
+
+// At least of the lengths is zero.
+if (a_length == 0 || b_length == 0) {
+  return a_length == 0 && b_length > 0;
+}
 
-  static T Max(int type_length, const T& a, const T& b) {
-if (a.ptr == nullptr) return b;
-if (b.ptr == nullptr) return a;
-return Compare(type_length, a, b) ? b : a;
+int8_t first_a = *a.ptr;
+int8_t first_b = *b.ptr;
+// We can short circuit for different signed numbers or
+// for equal length bytes arrays that have different first bytes.
+if ((0x80 & first_a) != (0x80 & first_b) ||
+(a_length == b_length && first_a != first_b)) {
+  return first_a < first_b;
+}
+// When the lengths are unequal and the numbers are of the same
+// sign we need to extend the digits.
+const uint8_t* a_start = a.ptr;
+const uint8_t* b_start = b.ptr;
+if (a_length != b_length) {
+  const uint8_t* lead_start = nullptr;
+  const uint8_t* lead_end = nullptr;
+  if (a_length > b_length) {
+int lead_length = a_length - b_length;
+lead_start = a.ptr;
+lead_end = a.ptr + lead_length;
+a_start += lead_length;
+  } else {
+DCHECK_LT(a_length, b_length);
+int lead_length = b_length - a_length;
+lead_start = b.ptr;
+lead_end = b.ptr + lead_length;
+b_start += lead_length;
+  }
+  // Compare extra bytes to the sign extension of the first
+  // byte of the other number.
+  uint8_t extension = first_a < 0 ? 0xFF : 0;
+  for (; lead_start != lead_end; lead_start++) {
+if (*lead_start < extension) {
+  // The first bytes of the long value are less
+  // then the extended short one.  So if a is the long value
+  // we can return true.
+  return a_length > b_length;
+} else if (*lead_start > extension) {

Review comment:
   Same here: if `extension` is 0xFF then this is never possible?
   
   So perhaps it's possible to rewrite this:
   ```c++
   if (first_a < 0) {
 // Both numbers negative
 for (; lead_start != lead_end; lead_start++) {
   if (*lead_start != 0xff) {
 return a_length > b_length;
   }
 }
   } else {
 // Both numbers positive
 for (; lead_start != lead_end; lead_start++) {
   if (*lead_start != 0) {
 return a_length < b_length;
   }
 }
   }
   ```
   
   Note sure which one is more understandable.

##
File path: cpp/src/parquet/statistics_test.cc
##
@@ -71,19 +71,25 @@ static FLBA FLBAFromString(const std::string& s) {
 TEST(Comparison, SignedByteArray) {
   auto comparator = MakeComparator(Type::BYTE_ARRAY, 
SortOrder::SIGNED);
 
-  std::string s1 = "12345";
-  std::string s2 = "12345678";
-  ByteArray s1ba = ByteArrayFromString(s1);
-  ByteArray s2ba = ByteArrayFromString(s2);
-  ASSERT_TRUE(comparator->Compare(s1ba, s2ba));
+  std::vector byte_values[] = {
+  {0x80, 0x80, 0, 0},   {/*0xFF,*/ 0x80, 0, 0}, {/*0xFF,*/ 0xFF, 
0x01, 0},
+  {/*0xFF,0xFF,*/ 0x80, 0}, {/*0xFF,0xFF,0xFF,*/ 0x80}, {/*0xFF, 0xFF, 
0xFF,*/ 

[GitHub] [arrow] pitrou commented on a change in pull request #9582: PARQUET-1655: [C++] Fix comparison of Decimal values in statistics

2021-03-02 Thread GitBox


pitrou commented on a change in pull request #9582:
URL: https://github.com/apache/arrow/pull/9582#discussion_r585569492



##
File path: cpp/src/parquet/statistics_test.cc
##
@@ -111,21 +117,28 @@ TEST(Comparison, UnsignedByteArray) {
 }
 
 TEST(Comparison, SignedFLBA) {
-  int size = 10;
+  int size = 4;
   auto comparator =
   MakeComparator(Type::FIXED_LEN_BYTE_ARRAY, SortOrder::SIGNED, 
size);
 
-  std::string s1 = "Anti123456";
-  std::string s2 = "Bunkd123456";
-  FLBA s1flba = FLBAFromString(s1);
-  FLBA s2flba = FLBAFromString(s2);
-  ASSERT_TRUE(comparator->Compare(s1flba, s2flba));
+  std::vector byte_values[] = {
+  {0x80, 0, 0, 0},  {0xFF, 0xFF, 0x01, 0},{0xFF, 0xFF, 0x80, 
0},
+  {0xFF, 0xFF, 0xFF, 0x80}, {0xFF, 0xFF, 0xFF, 0xFF}, {0, 0, 0x01, 0x01},
+  {0, 0x01, 0x01, 0},   {0x01, 0x01, 0, 0}};
+  std::vector values_to_compare;
+  for (auto& bytes : byte_values) {
+values_to_compare.emplace_back(FLBA(bytes.data()));
+  }
 
-  s1 = "Bünk123456";
-  s2 = "Bunk123456";
-  s1flba = FLBAFromString(s1);
-  s2flba = FLBAFromString(s2);
-  ASSERT_TRUE(comparator->Compare(s1flba, s2flba));
+  for (size_t x = 0; x < values_to_compare.size(); x++) {
+EXPECT_FALSE(comparator->Compare(values_to_compare[x], 
values_to_compare[x])) << x;
+for (size_t y = x + 1; y < values_to_compare.size(); y++) {
+  EXPECT_TRUE(comparator->Compare(values_to_compare[x], 
values_to_compare[y]))
+  << x << " " << y;
+  EXPECT_FALSE(comparator->Compare(values_to_compare[y], 
values_to_compare[x]))
+  << y << " " << x;
+}
+  }

Review comment:
   I was merely wondering if we're sure that decimal columns trigger the 
code paths under test. But I'll trust you that it's the case :-)





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] pitrou commented on a change in pull request #9582: PARQUET-1655: [C++] Fix comparison of Decimal values in statistics

2021-03-01 Thread GitBox


pitrou commented on a change in pull request #9582:
URL: https://github.com/apache/arrow/pull/9582#discussion_r584992780



##
File path: cpp/src/parquet/statistics.cc
##
@@ -133,56 +136,95 @@ struct CompareHelper {
   static T Max(int type_length, const T& a, const T& b) {
 return Compare(0, a, b) ? b : a;
   }
-};  // namespace parquet
-
-template 
-struct ByteLikeCompareHelperBase {
-  using T = ByteArrayType::c_type;
-  using PtrType = typename std::conditional::type;
+};
 
-  static T DefaultMin() { return {}; }
-  static T DefaultMax() { return {}; }
-  static T Coalesce(T val, T fallback) { return val; }
+template 
+struct BinaryLikeComparer {};
 
-  static inline bool Compare(int type_length, const T& a, const T& b) {
-const auto* aptr = reinterpret_cast(a.ptr);
-const auto* bptr = reinterpret_cast(b.ptr);
-return std::lexicographical_compare(aptr, aptr + a.len, bptr, bptr + 
b.len);
+template 
+struct BinaryLikeComparer {
+  static bool Compare(int type_length, const T& a, const T& b) {
+int a_length = value_length(type_length, a);
+int b_length = value_length(type_length, b);
+// Unsigned comparison is used for non-numeric types so straight
+// lexiographic comparison makes sense. (a.ptr is always unsigned)
+return std::lexicographical_compare(a.ptr, a.ptr + a_length, b.ptr, b.ptr 
+ b_length);
   }
+};
 
-  static T Min(int type_length, const T& a, const T& b) {
-if (a.ptr == nullptr) return b;
-if (b.ptr == nullptr) return a;
-return Compare(type_length, a, b) ? a : b;
-  }
+template 
+struct BinaryLikeComparer {
+  static bool Compare(int type_length, const T& a, const T& b) {
+// Is signed is used for integers encoded as big-endian twos
+// complement integers. (e.g. decimals).
+int a_length = value_length(type_length, a);
+int b_length = value_length(type_length, b);
+
+// At least of the lengths is zero.
+if (a_length == 0 || b_length == 0) {
+  return a_length == 0 && b_length > 0;
+}
 
-  static T Max(int type_length, const T& a, const T& b) {
-if (a.ptr == nullptr) return b;
-if (b.ptr == nullptr) return a;
-return Compare(type_length, a, b) ? b : a;
+int8_t first_a = *a.ptr;
+int8_t first_b = *b.ptr;
+// We can short circuit for different signed numbers or
+// for equal length bytes arrays that have different first bytes.
+if ((0x80 & first_a) != (0x80 & first_b) ||
+(a_length == b_length && first_a != first_b)) {
+  return first_a < first_b;
+}

Review comment:
   Hmm, I see. I'll take a closer look :-)





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] pitrou commented on a change in pull request #9582: PARQUET-1655: [C++] Fix comparison of Decimal values in statistics

2021-03-01 Thread GitBox


pitrou commented on a change in pull request #9582:
URL: https://github.com/apache/arrow/pull/9582#discussion_r584709632



##
File path: cpp/src/parquet/statistics.cc
##
@@ -133,56 +136,95 @@ struct CompareHelper {
   static T Max(int type_length, const T& a, const T& b) {
 return Compare(0, a, b) ? b : a;
   }
-};  // namespace parquet
-
-template 
-struct ByteLikeCompareHelperBase {
-  using T = ByteArrayType::c_type;
-  using PtrType = typename std::conditional::type;
+};
 
-  static T DefaultMin() { return {}; }
-  static T DefaultMax() { return {}; }
-  static T Coalesce(T val, T fallback) { return val; }
+template 
+struct BinaryLikeComparer {};
 
-  static inline bool Compare(int type_length, const T& a, const T& b) {
-const auto* aptr = reinterpret_cast(a.ptr);
-const auto* bptr = reinterpret_cast(b.ptr);
-return std::lexicographical_compare(aptr, aptr + a.len, bptr, bptr + 
b.len);
+template 
+struct BinaryLikeComparer {
+  static bool Compare(int type_length, const T& a, const T& b) {
+int a_length = value_length(type_length, a);
+int b_length = value_length(type_length, b);
+// Unsigned comparison is used for non-numeric types so straight
+// lexiographic comparison makes sense. (a.ptr is always unsigned)
+return std::lexicographical_compare(a.ptr, a.ptr + a_length, b.ptr, b.ptr 
+ b_length);
   }
+};
 
-  static T Min(int type_length, const T& a, const T& b) {
-if (a.ptr == nullptr) return b;
-if (b.ptr == nullptr) return a;
-return Compare(type_length, a, b) ? a : b;
-  }
+template 
+struct BinaryLikeComparer {
+  static bool Compare(int type_length, const T& a, const T& b) {
+// Is signed is used for integers encoded as big-endian twos
+// complement integers. (e.g. decimals).
+int a_length = value_length(type_length, a);
+int b_length = value_length(type_length, b);
+
+// At least of the lengths is zero.
+if (a_length == 0 || b_length == 0) {
+  return a_length == 0 && b_length > 0;
+}
 
-  static T Max(int type_length, const T& a, const T& b) {
-if (a.ptr == nullptr) return b;
-if (b.ptr == nullptr) return a;
-return Compare(type_length, a, b) ? b : a;
+int8_t first_a = *a.ptr;
+int8_t first_b = *b.ptr;
+// We can short circuit for different signed numbers or
+// for equal length bytes arrays that have different first bytes.
+if ((0x80 & first_a) != (0x80 & first_b) ||
+(a_length == b_length && first_a != first_b)) {
+  return first_a < first_b;
+}
+// When the lengths are unequal and the numbers are of the same
+// sign we need to extend the digits.

Review comment:
   I'm not sure why all this is needed. The first bytes are equal. 
Regardless of the sign, wouldn't it be sufficient to write e.g.:
   ```c++
 return std::lexicographical_compare(a.ptr + 1, a.ptr + a_length, b.ptr + 
1, b.ptr + b_length);
   ```
   i.e. only the MSB in a two's complement integer needs special handling, the 
other bytes are ordered naturally.
   

##
File path: cpp/src/parquet/statistics.cc
##
@@ -133,56 +136,95 @@ struct CompareHelper {
   static T Max(int type_length, const T& a, const T& b) {
 return Compare(0, a, b) ? b : a;
   }
-};  // namespace parquet
-
-template 
-struct ByteLikeCompareHelperBase {
-  using T = ByteArrayType::c_type;
-  using PtrType = typename std::conditional::type;
+};
 
-  static T DefaultMin() { return {}; }
-  static T DefaultMax() { return {}; }
-  static T Coalesce(T val, T fallback) { return val; }
+template 
+struct BinaryLikeComparer {};
 
-  static inline bool Compare(int type_length, const T& a, const T& b) {
-const auto* aptr = reinterpret_cast(a.ptr);
-const auto* bptr = reinterpret_cast(b.ptr);
-return std::lexicographical_compare(aptr, aptr + a.len, bptr, bptr + 
b.len);
+template 
+struct BinaryLikeComparer {
+  static bool Compare(int type_length, const T& a, const T& b) {
+int a_length = value_length(type_length, a);
+int b_length = value_length(type_length, b);
+// Unsigned comparison is used for non-numeric types so straight
+// lexiographic comparison makes sense. (a.ptr is always unsigned)
+return std::lexicographical_compare(a.ptr, a.ptr + a_length, b.ptr, b.ptr 
+ b_length);
   }
+};
 
-  static T Min(int type_length, const T& a, const T& b) {
-if (a.ptr == nullptr) return b;
-if (b.ptr == nullptr) return a;
-return Compare(type_length, a, b) ? a : b;
-  }
+template 
+struct BinaryLikeComparer {
+  static bool Compare(int type_length, const T& a, const T& b) {
+// Is signed is used for integers encoded as big-endian twos
+// complement integers. (e.g. decimals).
+int a_length = value_length(type_length, a);
+int b_length = value_length(type_length, b);
+
+// At least of the lengths is zero.
+if (a_length == 0 || b_length == 0) {
+  return a_length == 0 && b_length > 0;
+}
 
-  static T Max(int