[jira] [Commented] (ARROW-1588) [C++/Format] Harden Decimal Format

ASF GitHub Bot (JIRA) Wed, 25 Oct 2017 06:11:39 -0700

    [ 
https://issues.apache.org/jira/browse/ARROW-1588?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16218591#comment-16218591
 ]


ASF GitHub Bot commented on ARROW-1588:
---------------------------------------

wesm closed pull request #1211: ARROW-1588: [C++/Format] Harden Decimal Format
URL: https://github.com/apache/arrow/pull/1211
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/cpp/src/arrow/util/CMakeLists.txt b/cpp/src/arrow/util/CMakeLists.txt
index 1178c658c..5df5e748f 100644
--- a/cpp/src/arrow/util/CMakeLists.txt
+++ b/cpp/src/arrow/util/CMakeLists.txt
@@ -42,6 +42,7 @@ install(FILES
   rle-encoding.h
   sse-util.h
   stl.h
+  type_traits.h
   visibility.h
   DESTINATION include/arrow/util)
 
diff --git a/cpp/src/arrow/util/bit-util-test.cc b/cpp/src/arrow/util/bit-util-test.cc
index 5a66d7e85..92bdcb5fc 100644
--- a/cpp/src/arrow/util/bit-util-test.cc
+++ b/cpp/src/arrow/util/bit-util-test.cc
@@ -28,7 +28,6 @@
 
 #include "arrow/buffer.h"
 #include "arrow/memory_pool.h"
-#include "arrow/status.h"
 #include "arrow/test-util.h"
 #include "arrow/util/bit-stream-utils.h"
 #include "arrow/util/bit-util.h"
@@ -334,4 +333,36 @@ TEST(BitStreamUtil, ZigZag) {
   TestZigZag(-std::numeric_limits<int32_t>::max());
 }
 
+TEST(BitUtil, RoundTripLittleEndianTest) {
+  uint64_t value = 0xFF;
+
+#if ARROW_LITTLE_ENDIAN
+  uint64_t expected = value;
+#else
+  uint64_t expected = std::numeric_limits<uint64_t>::max() << 56;
+#endif
+
+  uint64_t little_endian_result = BitUtil::ToLittleEndian(value);
+  ASSERT_EQ(expected, little_endian_result);
+
+  uint64_t from_little_endian = BitUtil::FromLittleEndian(little_endian_result);
+  ASSERT_EQ(value, from_little_endian);
+}
+
+TEST(BitUtil, RoundTripBigEndianTest) {
+  uint64_t value = 0xFF;
+
+#if ARROW_LITTLE_ENDIAN
+  uint64_t expected = std::numeric_limits<uint64_t>::max() << 56;
+#else
+  uint64_t expected = value;
+#endif
+
+  uint64_t big_endian_result = BitUtil::ToBigEndian(value);
+  ASSERT_EQ(expected, big_endian_result);
+
+  uint64_t from_big_endian = BitUtil::FromBigEndian(big_endian_result);
+  ASSERT_EQ(value, from_big_endian);
+}
+
 }  // namespace arrow
diff --git a/cpp/src/arrow/util/bit-util.h b/cpp/src/arrow/util/bit-util.h
index 2509de21f..8043f90cc 100644
--- a/cpp/src/arrow/util/bit-util.h
+++ b/cpp/src/arrow/util/bit-util.h
@@ -56,6 +56,7 @@
 #include <vector>
 
 #include "arrow/util/macros.h"
+#include "arrow/util/type_traits.h"
 #include "arrow/util/visibility.h"
 
 #ifdef ARROW_USE_SSE
@@ -305,7 +306,7 @@ static inline uint32_t ByteSwap(uint32_t value) {
   return static_cast<uint32_t>(ARROW_BYTE_SWAP32(value));
 }
 static inline int16_t ByteSwap(int16_t value) {
-  constexpr int16_t m = static_cast<int16_t>(0xff);
+  constexpr auto m = static_cast<int16_t>(0xff);
   return static_cast<int16_t>(((value >> 8) & m) | ((value & m) << 8));
 }
 static inline uint16_t ByteSwap(uint16_t value) {
@@ -331,8 +332,8 @@ static inline void ByteSwap(void* dst, const void* src, int len) {
       break;
   }
 
-  uint8_t* d = reinterpret_cast<uint8_t*>(dst);
-  const uint8_t* s = reinterpret_cast<const uint8_t*>(src);
+  auto d = reinterpret_cast<uint8_t*>(dst);
+  auto s = reinterpret_cast<const uint8_t*>(src);
   for (int i = 0; i < len; ++i) {
     d[i] = s[len - i - 1];
   }
@@ -341,36 +342,57 @@ static inline void ByteSwap(void* dst, const void* src, int len) {
 /// Converts to big endian format (if not already in big endian) from the
 /// machine's native endian format.
 #if ARROW_LITTLE_ENDIAN
-static inline int64_t ToBigEndian(int64_t value) { return ByteSwap(value); }
-static inline uint64_t ToBigEndian(uint64_t value) { return ByteSwap(value); }
-static inline int32_t ToBigEndian(int32_t value) { return ByteSwap(value); }
-static inline uint32_t ToBigEndian(uint32_t value) { return ByteSwap(value); }
-static inline int16_t ToBigEndian(int16_t value) { return ByteSwap(value); }
-static inline uint16_t ToBigEndian(uint16_t value) { return ByteSwap(value); }
+template <typename T,
+          typename =
+              EnableIfIsOneOf<T, int64_t, uint64_t, int32_t, uint32_t, int16_t, uint16_t>>
+static inline T ToBigEndian(T value) {
+  return ByteSwap(value);
+}
+
+template <typename T,
+          typename =
+              EnableIfIsOneOf<T, int64_t, uint64_t, int32_t, uint32_t, int16_t, uint16_t>>
+static inline T ToLittleEndian(T value) {
+  return value;
+}
 #else
-static inline int64_t ToBigEndian(int64_t val) { return val; }
-static inline uint64_t ToBigEndian(uint64_t val) { return val; }
-static inline int32_t ToBigEndian(int32_t val) { return val; }
-static inline uint32_t ToBigEndian(uint32_t val) { return val; }
-static inline int16_t ToBigEndian(int16_t val) { return val; }
-static inline uint16_t ToBigEndian(uint16_t val) { return val; }
+template <typename T,
+          typename =
+              EnableIfIsOneOf<T, int64_t, uint64_t, int32_t, uint32_t, int16_t, uint16_t>>
+static inline T ToBigEndian(T value) {
+  return value;
+}
 #endif
 
 /// Converts from big endian format to the machine's native endian format.
 #if ARROW_LITTLE_ENDIAN
-static inline int64_t FromBigEndian(int64_t value) { return ByteSwap(value); }
-static inline uint64_t FromBigEndian(uint64_t value) { return ByteSwap(value); }
-static inline int32_t FromBigEndian(int32_t value) { return ByteSwap(value); }
-static inline uint32_t FromBigEndian(uint32_t value) { return ByteSwap(value); }
-static inline int16_t FromBigEndian(int16_t value) { return ByteSwap(value); }
-static inline uint16_t FromBigEndian(uint16_t value) { return ByteSwap(value); }
+template <typename T,
+          typename =
+              EnableIfIsOneOf<T, int64_t, uint64_t, int32_t, uint32_t, int16_t, uint16_t>>
+static inline T FromBigEndian(T value) {
+  return ByteSwap(value);
+}
+
+template <typename T,
+          typename =
+              EnableIfIsOneOf<T, int64_t, uint64_t, int32_t, uint32_t, int16_t, uint16_t>>
+static inline T FromLittleEndian(T value) {
+  return value;
+}
 #else
-static inline int64_t FromBigEndian(int64_t val) { return val; }
-static inline uint64_t FromBigEndian(uint64_t val) { return val; }
-static inline int32_t FromBigEndian(int32_t val) { return val; }
-static inline uint32_t FromBigEndian(uint32_t val) { return val; }
-static inline int16_t FromBigEndian(int16_t val) { return val; }
-static inline uint16_t FromBigEndian(uint16_t val) { return val; }
+template <typename T,
+          typename =
+              EnableIfIsOneOf<T, int64_t, uint64_t, int32_t, uint32_t, int16_t, uint16_t>>
+static inline T FromBigEndian(T value) {
+  return value;
+}
+
+template <typename T,
+          typename =
+              EnableIfIsOneOf<T, int64_t, uint64_t, int32_t, uint32_t, int16_t, uint16_t>>
+static inline T FromLittleEndian(T value) {
+  return ByteSwap(value);
+}
 #endif
 
 // Logical right shift for signed integer types
diff --git a/cpp/src/arrow/util/decimal.cc b/cpp/src/arrow/util/decimal.cc
index 49d5c0249..7196b252c 100644
--- a/cpp/src/arrow/util/decimal.cc
+++ b/cpp/src/arrow/util/decimal.cc
@@ -27,6 +27,7 @@
 #pragma intrinsic(_BitScanReverse)
 #endif
 
+#include "arrow/util/bit-util.h"
 #include "arrow/util/decimal.h"
 #include "arrow/util/logging.h"
 
@@ -41,11 +42,13 @@ Decimal128::Decimal128(const std::string& str) : Decimal128() {
 }
 
 Decimal128::Decimal128(const uint8_t* bytes)
-    : Decimal128(reinterpret_cast<const int64_t*>(bytes)[0],
-                 reinterpret_cast<const uint64_t*>(bytes)[1]) {}
+    : Decimal128(BitUtil::FromLittleEndian(reinterpret_cast<const int64_t*>(bytes)[1]),
+                 BitUtil::FromLittleEndian(reinterpret_cast<const uint64_t*>(bytes)[0])) {
+}
 
 std::array<uint8_t, 16> Decimal128::ToBytes() const {
-  const uint64_t raw[] = {static_cast<uint64_t>(high_bits_), low_bits_};
+  const uint64_t raw[] = {BitUtil::ToLittleEndian(low_bits_),
+                          BitUtil::ToLittleEndian(static_cast<uint64_t>(high_bits_))};
   const auto* raw_data = reinterpret_cast<const uint8_t*>(raw);
   std::array<uint8_t, 16> out{{0}};
   std::copy(raw_data, raw_data + out.size(), out.begin());
diff --git a/cpp/src/arrow/util/decimal.h b/cpp/src/arrow/util/decimal.h
index 58496a874..ba252bee4 100644
--- a/cpp/src/arrow/util/decimal.h
+++ b/cpp/src/arrow/util/decimal.h
@@ -53,7 +53,8 @@ class ARROW_EXPORT Decimal128 {
   /// \brief Parse the number from a base 10 string representation.
   explicit Decimal128(const std::string& value);
 
-  /// \brief Create an Decimal128 from an array of bytes
+  /// \brief Create an Decimal128 from an array of bytes. Bytes are assumed to be in
+  /// little endian byte order.
   explicit Decimal128(const uint8_t* bytes);
 
   /// \brief Negate the current value
@@ -104,7 +105,7 @@ class ARROW_EXPORT Decimal128 {
   /// \brief Get the low bits of the two's complement representation of the number.
   uint64_t low_bits() const { return low_bits_; }
 
-  /// \brief Return the raw bytes of the value.
+  /// \brief Return the raw bytes of the value in little-endian byte order.
   std::array<uint8_t, 16> ToBytes() const;
 
   /// \brief Convert the Decimal128 value to a base 10 decimal string with the given
diff --git a/cpp/src/arrow/util/type_traits.h b/cpp/src/arrow/util/type_traits.h
new file mode 100644
index 000000000..c05309af8
--- /dev/null
+++ b/cpp/src/arrow/util/type_traits.h
@@ -0,0 +1,41 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef ARROW_UTIL_TYPE_TRAITS_H
+#define ARROW_UTIL_TYPE_TRAITS_H
+
+#include <type_traits>
+
+namespace arrow {
+
+/// \brief Metafunction to allow checking if a type matches any of another set of types
+template <typename...>
+struct IsOneOf : std::false_type {};  /// Base case: nothing has matched
+
+template <typename T, typename U, typename... Args>
+struct IsOneOf<T, U, Args...> {
+  /// Recursive case: T == U or T matches any other types provided (not including U).
+  static constexpr bool value = std::is_same<T, U>::value || IsOneOf<T, Args...>::value;
+};
+
+/// \brief Shorthand for using IsOneOf + std::enable_if
+template <typename T, typename... Args>
+using EnableIfIsOneOf = typename std::enable_if<IsOneOf<T, Args...>::value, T>::type;
+
+}  // namespace arrow
+
+#endif  // ARROW_UTIL_TYPE_TRAITS_H
diff --git a/format/Layout.md b/format/Layout.md
index 3c21dbc0d..ebf93821a 100644
--- a/format/Layout.md
+++ b/format/Layout.md
@@ -41,9 +41,8 @@ concepts, here is a small glossary to help disambiguate.
   or a fully-specified nested type. When we say slot we mean a relative type
   value, not necessarily any physical storage region.
 * Logical type: A data type that is implemented using some relative (physical)
-  type. For example, a Decimal value stored in 16 bytes could be stored in a
-  primitive array with slot size 16 bytes. Similarly, strings can be stored as
-  `List<1-byte>`.
+  type. For example, Decimal values are stored as 16 bytes in a fixed byte
+  size array. Similarly, strings can be stored as `List<1-byte>`.
 * Parent and child arrays: names to express relationships between physical
   value arrays in a nested type structure. For example, a `List<T>`-type parent
   array has a T-type array as its child (see more on lists below).
diff --git a/format/Metadata.md b/format/Metadata.md
index 80ca08ae1..893b0a474 100644
--- a/format/Metadata.md
+++ b/format/Metadata.md
@@ -391,7 +391,8 @@ logical type, which have no children) and 3 buffers:
 
 ### Decimal
 
-TBD
+Decimals are represented as a 2's complement 128-bit (16 byte) signed integer
+in little-endian byte order.
 
 ### Timestamp
 


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


> [C++/Format] Harden Decimal Format
> ----------------------------------
>
>                 Key: ARROW-1588
>                 URL: https://issues.apache.org/jira/browse/ARROW-1588
>             Project: Apache Arrow
>          Issue Type: Improvement
>          Components: C++, Format
>    Affects Versions: 0.7.0
>            Reporter: Phillip Cloud
>            Assignee: Phillip Cloud
>              Labels: pull-request-available
>             Fix For: 0.8.0
>
>
> We should finalize and harden the decimal format. The remaining issues are 
> officially writing down the choice of making every decimal value 16 bytes and 
> byte order.
> For byte order we'll need to run some benchmarks to compare little endian vs 
> big endian. I plan to work on this over the next week or two.
> [~jacq...@dremio.com] [~wesmckinn] If there are any additional items you'd 
> like to see addressed here please chime in. 



--
This message was sent by Atlassian JIRA
(v6.4.14#64029)

[jira] [Commented] (ARROW-1588) [C++/Format] Harden Decimal Format

Reply via email to