Luminarys commented on a change in pull request #8344:
URL: https://github.com/apache/arrow/pull/8344#discussion_r503505320



##########
File path: cpp/src/arrow/util/basic_decimal.cc
##########
@@ -254,69 +252,148 @@ BasicDecimal128& BasicDecimal128::operator>>=(uint32_t 
bits) {
 
 namespace {
 
-// TODO: Remove this guard once it's used by BasicDecimal256
-#ifndef ARROW_USE_NATIVE_INT128
-// This method losslessly multiplies x and y into a 128 bit unsigned integer
-// whose high bits will be stored in hi and low bits in lo.
-void ExtendAndMultiplyUint64(uint64_t x, uint64_t y, uint64_t* hi, uint64_t* 
lo) {
-#ifdef ARROW_USE_NATIVE_INT128
-  const __uint128_t r = static_cast<__uint128_t>(x) * y;
-  *lo = r & kInt64Mask;
-  *hi = r >> 64;
-#else
-  // If we can't use a native fallback, perform multiplication
-  // by splitting up x and y into 32 bit high/low bit components,
+// Multiply two N bit word components into a 2*N bit result, with high bits
+// stored in hi and low bits in lo.
+template <typename Word>
+void ExtendAndMultiplyUint(Word x, Word y, Word* hi, Word* lo) {

Review comment:
       I realized that I wasn't using the native path prior, which is why the 
benchmark was so slow. Updated, new results are 32 ns when __uint128_t is used 
and 65 ns when uint64_t is used, which I think is more reasonable.

##########
File path: cpp/src/arrow/util/basic_decimal.cc
##########
@@ -254,69 +252,148 @@ BasicDecimal128& BasicDecimal128::operator>>=(uint32_t 
bits) {
 
 namespace {
 
-// TODO: Remove this guard once it's used by BasicDecimal256
-#ifndef ARROW_USE_NATIVE_INT128
-// This method losslessly multiplies x and y into a 128 bit unsigned integer
-// whose high bits will be stored in hi and low bits in lo.
-void ExtendAndMultiplyUint64(uint64_t x, uint64_t y, uint64_t* hi, uint64_t* 
lo) {
-#ifdef ARROW_USE_NATIVE_INT128
-  const __uint128_t r = static_cast<__uint128_t>(x) * y;
-  *lo = r & kInt64Mask;
-  *hi = r >> 64;
-#else
-  // If we can't use a native fallback, perform multiplication
-  // by splitting up x and y into 32 bit high/low bit components,
+// Multiply two N bit word components into a 2*N bit result, with high bits
+// stored in hi and low bits in lo.
+template <typename Word>
+void ExtendAndMultiplyUint(Word x, Word y, Word* hi, Word* lo) {
+  // Perform multiplication on two N bit words x and y into a 2*N bit result
+  // by splitting up x and y into N/2 bit high/low bit components,
   // allowing us to represent the multiplication as
-  // x * y = x_lo * y_lo + x_hi * y_lo * 2^32 + y_hi * x_lo * 2^32
-  // + x_hi * y_hi * 2^64.
+  // x * y = x_lo * y_lo + x_hi * y_lo * 2^N/2 + y_hi * x_lo * 2^N/2
+  // + x_hi * y_hi * 2^N
   //
-  // Now, consider the final output as lo_lo || lo_hi || hi_lo || hi_hi.
+  // Now, consider the final output as lo_lo || lo_hi || hi_lo || hi_hi
   // Therefore,
   // lo_lo is (x_lo * y_lo)_lo,
   // lo_hi is ((x_lo * y_lo)_hi + (x_hi * y_lo)_lo + (x_lo * y_hi)_lo)_lo,
   // hi_lo is ((x_hi * y_hi)_lo + (x_hi * y_lo)_hi + (x_lo * y_hi)_hi)_hi,
   // hi_hi is (x_hi * y_hi)_hi
-  const uint64_t x_lo = x & kIntMask;
-  const uint64_t y_lo = y & kIntMask;
-  const uint64_t x_hi = x >> 32;
-  const uint64_t y_hi = y >> 32;
+  constexpr Word kHighBitShift = sizeof(Word) * 4;
+  constexpr Word kLowBitMask = (static_cast<Word>(1) << kHighBitShift) - 1;
 
-  const uint64_t t = x_lo * y_lo;
-  const uint64_t t_lo = t & kIntMask;
-  const uint64_t t_hi = t >> 32;
+  const Word x_lo = x & kLowBitMask;
+  const Word y_lo = y & kLowBitMask;
+  const Word x_hi = x >> kHighBitShift;
+  const Word y_hi = y >> kHighBitShift;
 
-  const uint64_t u = x_hi * y_lo + t_hi;
-  const uint64_t u_lo = u & kIntMask;
-  const uint64_t u_hi = u >> 32;
+  const Word t = x_lo * y_lo;
+  const Word t_lo = t & kLowBitMask;
+  const Word t_hi = t >> kHighBitShift;
 
-  const uint64_t v = x_lo * y_hi + u_lo;
-  const uint64_t v_hi = v >> 32;
+  const Word u = x_hi * y_lo + t_hi;
+  const Word u_lo = u & kLowBitMask;
+  const Word u_hi = u >> kHighBitShift;
+
+  const Word v = x_lo * y_hi + u_lo;
+  const Word v_hi = v >> kHighBitShift;
 
   *hi = x_hi * y_hi + u_hi + v_hi;
-  *lo = (v << 32) | t_lo;
-#endif
+  *lo = (v << kHighBitShift) + t_lo;
+}
+
+// Convenience wrapper type over 128 bit unsigned integers
+#ifdef ARROW_USE_NATIVE_INT128
+struct uint128_t {

Review comment:
       Done.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to