[jira] [Commented] (ARROW-1588) [C++/Format] Harden Decimal Format

2017-10-25 Thread ASF GitHub Bot (JIRA)

[ 
https://issues.apache.org/jira/browse/ARROW-1588?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16218591#comment-16218591
 ] 

ASF GitHub Bot commented on ARROW-1588:
---

wesm closed pull request #1211: ARROW-1588: [C++/Format] Harden Decimal Format
URL: https://github.com/apache/arrow/pull/1211
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/cpp/src/arrow/util/CMakeLists.txt 
b/cpp/src/arrow/util/CMakeLists.txt
index 1178c658c..5df5e748f 100644
--- a/cpp/src/arrow/util/CMakeLists.txt
+++ b/cpp/src/arrow/util/CMakeLists.txt
@@ -42,6 +42,7 @@ install(FILES
   rle-encoding.h
   sse-util.h
   stl.h
+  type_traits.h
   visibility.h
   DESTINATION include/arrow/util)
 
diff --git a/cpp/src/arrow/util/bit-util-test.cc 
b/cpp/src/arrow/util/bit-util-test.cc
index 5a66d7e85..92bdcb5fc 100644
--- a/cpp/src/arrow/util/bit-util-test.cc
+++ b/cpp/src/arrow/util/bit-util-test.cc
@@ -28,7 +28,6 @@
 
 #include "arrow/buffer.h"
 #include "arrow/memory_pool.h"
-#include "arrow/status.h"
 #include "arrow/test-util.h"
 #include "arrow/util/bit-stream-utils.h"
 #include "arrow/util/bit-util.h"
@@ -334,4 +333,36 @@ TEST(BitStreamUtil, ZigZag) {
   TestZigZag(-std::numeric_limits::max());
 }
 
+TEST(BitUtil, RoundTripLittleEndianTest) {
+  uint64_t value = 0xFF;
+
+#if ARROW_LITTLE_ENDIAN
+  uint64_t expected = value;
+#else
+  uint64_t expected = std::numeric_limits::max() << 56;
+#endif
+
+  uint64_t little_endian_result = BitUtil::ToLittleEndian(value);
+  ASSERT_EQ(expected, little_endian_result);
+
+  uint64_t from_little_endian = 
BitUtil::FromLittleEndian(little_endian_result);
+  ASSERT_EQ(value, from_little_endian);
+}
+
+TEST(BitUtil, RoundTripBigEndianTest) {
+  uint64_t value = 0xFF;
+
+#if ARROW_LITTLE_ENDIAN
+  uint64_t expected = std::numeric_limits::max() << 56;
+#else
+  uint64_t expected = value;
+#endif
+
+  uint64_t big_endian_result = BitUtil::ToBigEndian(value);
+  ASSERT_EQ(expected, big_endian_result);
+
+  uint64_t from_big_endian = BitUtil::FromBigEndian(big_endian_result);
+  ASSERT_EQ(value, from_big_endian);
+}
+
 }  // namespace arrow
diff --git a/cpp/src/arrow/util/bit-util.h b/cpp/src/arrow/util/bit-util.h
index 2509de21f..8043f90cc 100644
--- a/cpp/src/arrow/util/bit-util.h
+++ b/cpp/src/arrow/util/bit-util.h
@@ -56,6 +56,7 @@
 #include 
 
 #include "arrow/util/macros.h"
+#include "arrow/util/type_traits.h"
 #include "arrow/util/visibility.h"
 
 #ifdef ARROW_USE_SSE
@@ -305,7 +306,7 @@ static inline uint32_t ByteSwap(uint32_t value) {
   return static_cast(ARROW_BYTE_SWAP32(value));
 }
 static inline int16_t ByteSwap(int16_t value) {
-  constexpr int16_t m = static_cast(0xff);
+  constexpr auto m = static_cast(0xff);
   return static_cast(((value >> 8) & m) | ((value & m) << 8));
 }
 static inline uint16_t ByteSwap(uint16_t value) {
@@ -331,8 +332,8 @@ static inline void ByteSwap(void* dst, const void* src, int 
len) {
   break;
   }
 
-  uint8_t* d = reinterpret_cast(dst);
-  const uint8_t* s = reinterpret_cast(src);
+  auto d = reinterpret_cast(dst);
+  auto s = reinterpret_cast(src);
   for (int i = 0; i < len; ++i) {
 d[i] = s[len - i - 1];
   }
@@ -341,36 +342,57 @@ static inline void ByteSwap(void* dst, const void* src, 
int len) {
 /// Converts to big endian format (if not already in big endian) from the
 /// machine's native endian format.
 #if ARROW_LITTLE_ENDIAN
-static inline int64_t ToBigEndian(int64_t value) { return ByteSwap(value); }
-static inline uint64_t ToBigEndian(uint64_t value) { return ByteSwap(value); }
-static inline int32_t ToBigEndian(int32_t value) { return ByteSwap(value); }
-static inline uint32_t ToBigEndian(uint32_t value) { return ByteSwap(value); }
-static inline int16_t ToBigEndian(int16_t value) { return ByteSwap(value); }
-static inline uint16_t ToBigEndian(uint16_t value) { return ByteSwap(value); }
+template >
+static inline T ToBigEndian(T value) {
+  return ByteSwap(value);
+}
+
+template >
+static inline T ToLittleEndian(T value) {
+  return value;
+}
 #else
-static inline int64_t ToBigEndian(int64_t val) { return val; }
-static inline uint64_t ToBigEndian(uint64_t val) { return val; }
-static inline int32_t ToBigEndian(int32_t val) { return val; }
-static inline uint32_t ToBigEndian(uint32_t val) { return val; }
-static inline int16_t ToBigEndian(int16_t val) { return val; }
-static inline uint16_t ToBigEndian(uint16_t val) { return val; }
+template >
+static inline T ToBigEndian(T value) {
+  return value;
+}
 #endif
 
 /// Converts from big endian format to the machine's native endian format.
 #if ARROW_LITTLE_ENDIAN
-static inline int64_t FromBigEndian(int64_t value) { return ByteSwap(value); }
-static inline uint64_t 

[jira] [Commented] (ARROW-1588) [C++/Format] Harden Decimal Format

2017-10-24 Thread ASF GitHub Bot (JIRA)

[ 
https://issues.apache.org/jira/browse/ARROW-1588?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16217954#comment-16217954
 ] 

ASF GitHub Bot commented on ARROW-1588:
---

jacques-n commented on issue #1211: ARROW-1588: [C++/Format] Harden Decimal 
Format
URL: https://github.com/apache/arrow/pull/1211#issuecomment-339182615
 
 
   LGTM +1


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


> [C++/Format] Harden Decimal Format
> --
>
> Key: ARROW-1588
> URL: https://issues.apache.org/jira/browse/ARROW-1588
> Project: Apache Arrow
>  Issue Type: Improvement
>  Components: C++, Format
>Affects Versions: 0.7.0
>Reporter: Phillip Cloud
>Assignee: Phillip Cloud
>  Labels: pull-request-available
> Fix For: 0.8.0
>
>
> We should finalize and harden the decimal format. The remaining issues are 
> officially writing down the choice of making every decimal value 16 bytes and 
> byte order.
> For byte order we'll need to run some benchmarks to compare little endian vs 
> big endian. I plan to work on this over the next week or two.
> [~jacq...@dremio.com] [~wesmckinn] If there are any additional items you'd 
> like to see addressed here please chime in. 



--
This message was sent by Atlassian JIRA
(v6.4.14#64029)


[jira] [Commented] (ARROW-1588) [C++/Format] Harden Decimal Format

2017-10-24 Thread ASF GitHub Bot (JIRA)

[ 
https://issues.apache.org/jira/browse/ARROW-1588?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16217865#comment-16217865
 ] 

ASF GitHub Bot commented on ARROW-1588:
---

wesm commented on issue #1211: ARROW-1588: [C++/Format] Harden Decimal Format
URL: https://github.com/apache/arrow/pull/1211#issuecomment-339165470
 
 
   I think we should merge this and then address the integration test issue and 
Java fixes in follow up patches. @jacques-n or @siddharthteotia can you please 
review?


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


> [C++/Format] Harden Decimal Format
> --
>
> Key: ARROW-1588
> URL: https://issues.apache.org/jira/browse/ARROW-1588
> Project: Apache Arrow
>  Issue Type: Improvement
>  Components: C++, Format
>Affects Versions: 0.7.0
>Reporter: Phillip Cloud
>Assignee: Phillip Cloud
>  Labels: pull-request-available
> Fix For: 0.8.0
>
>
> We should finalize and harden the decimal format. The remaining issues are 
> officially writing down the choice of making every decimal value 16 bytes and 
> byte order.
> For byte order we'll need to run some benchmarks to compare little endian vs 
> big endian. I plan to work on this over the next week or two.
> [~jacq...@dremio.com] [~wesmckinn] If there are any additional items you'd 
> like to see addressed here please chime in. 



--
This message was sent by Atlassian JIRA
(v6.4.14#64029)


[jira] [Commented] (ARROW-1588) [C++/Format] Harden Decimal Format

2017-10-23 Thread ASF GitHub Bot (JIRA)

[ 
https://issues.apache.org/jira/browse/ARROW-1588?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16215966#comment-16215966
 ] 

ASF GitHub Bot commented on ARROW-1588:
---

wesm commented on issue #1211: ARROW-1588: [C++/Format] Harden Decimal Format
URL: https://github.com/apache/arrow/pull/1211#issuecomment-338815288
 
 
   @jacques-n could you take a look at this? we still need to figure out what's 
going on with the integration tests


This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


> [C++/Format] Harden Decimal Format
> --
>
> Key: ARROW-1588
> URL: https://issues.apache.org/jira/browse/ARROW-1588
> Project: Apache Arrow
>  Issue Type: Improvement
>  Components: C++, Format
>Affects Versions: 0.7.0
>Reporter: Phillip Cloud
>Assignee: Phillip Cloud
>  Labels: pull-request-available
> Fix For: 0.8.0
>
>
> We should finalize and harden the decimal format. The remaining issues are 
> officially writing down the choice of making every decimal value 16 bytes and 
> byte order.
> For byte order we'll need to run some benchmarks to compare little endian vs 
> big endian. I plan to work on this over the next week or two.
> [~jacq...@dremio.com] [~wesmckinn] If there are any additional items you'd 
> like to see addressed here please chime in. 



--
This message was sent by Atlassian JIRA
(v6.4.14#64029)


[jira] [Commented] (ARROW-1588) [C++/Format] Harden Decimal Format

2017-10-20 Thread ASF GitHub Bot (JIRA)

[ 
https://issues.apache.org/jira/browse/ARROW-1588?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16212656#comment-16212656
 ] 

ASF GitHub Bot commented on ARROW-1588:
---

Github user cpcloud commented on the issue:

https://github.com/apache/arrow/pull/1211
  
Yes, I'm not sure why they're passing. I'll take a look and get to the 
bottom of it


> [C++/Format] Harden Decimal Format
> --
>
> Key: ARROW-1588
> URL: https://issues.apache.org/jira/browse/ARROW-1588
> Project: Apache Arrow
>  Issue Type: Improvement
>  Components: C++, Format
>Affects Versions: 0.7.0
>Reporter: Phillip Cloud
>Assignee: Phillip Cloud
>  Labels: pull-request-available
> Fix For: 0.8.0
>
>
> We should finalize and harden the decimal format. The remaining issues are 
> officially writing down the choice of making every decimal value 16 bytes and 
> byte order.
> For byte order we'll need to run some benchmarks to compare little endian vs 
> big endian. I plan to work on this over the next week or two.
> [~jacq...@dremio.com] [~wesmckinn] If there are any additional items you'd 
> like to see addressed here please chime in. 



--
This message was sent by Atlassian JIRA
(v6.4.14#64029)


[jira] [Commented] (ARROW-1588) [C++/Format] Harden Decimal Format

2017-10-19 Thread ASF GitHub Bot (JIRA)

[ 
https://issues.apache.org/jira/browse/ARROW-1588?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16212005#comment-16212005
 ] 

ASF GitHub Bot commented on ARROW-1588:
---

Github user wesm commented on the issue:

https://github.com/apache/arrow/pull/1211
  
I'm surprised the integration tests pass. Shouldn't they not?


> [C++/Format] Harden Decimal Format
> --
>
> Key: ARROW-1588
> URL: https://issues.apache.org/jira/browse/ARROW-1588
> Project: Apache Arrow
>  Issue Type: Improvement
>  Components: C++, Format
>Affects Versions: 0.7.0
>Reporter: Phillip Cloud
>Assignee: Phillip Cloud
>  Labels: pull-request-available
> Fix For: 0.8.0
>
>
> We should finalize and harden the decimal format. The remaining issues are 
> officially writing down the choice of making every decimal value 16 bytes and 
> byte order.
> For byte order we'll need to run some benchmarks to compare little endian vs 
> big endian. I plan to work on this over the next week or two.
> [~jacq...@dremio.com] [~wesmckinn] If there are any additional items you'd 
> like to see addressed here please chime in. 



--
This message was sent by Atlassian JIRA
(v6.4.14#64029)


[jira] [Commented] (ARROW-1588) [C++/Format] Harden Decimal Format

2017-10-18 Thread ASF GitHub Bot (JIRA)

[ 
https://issues.apache.org/jira/browse/ARROW-1588?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16210312#comment-16210312
 ] 

ASF GitHub Bot commented on ARROW-1588:
---

GitHub user cpcloud opened a pull request:

https://github.com/apache/arrow/pull/1211

ARROW-1588: [C++/Format] Harden Decimal Format



You can merge this pull request into a Git repository by running:

$ git pull https://github.com/cpcloud/arrow ARROW-1588

Alternatively you can review and apply these changes as the patch at:

https://github.com/apache/arrow/pull/1211.patch

To close this pull request, make a commit to your master/trunk branch
with (at least) the following in the commit message:

This closes #1211


commit cc2574774e65bcd1d376febf987314c4f2c31581
Author: Phillip Cloud 
Date:   2017-10-18T22:32:48Z

ARROW-1588: [C++/Format] Harden Decimal Format




> [C++/Format] Harden Decimal Format
> --
>
> Key: ARROW-1588
> URL: https://issues.apache.org/jira/browse/ARROW-1588
> Project: Apache Arrow
>  Issue Type: Improvement
>  Components: C++, Format
>Affects Versions: 0.7.0
>Reporter: Phillip Cloud
>Assignee: Phillip Cloud
>  Labels: pull-request-available
> Fix For: 0.8.0
>
>
> We should finalize and harden the decimal format. The remaining issues are 
> officially writing down the choice of making every decimal value 16 bytes and 
> byte order.
> For byte order we'll need to run some benchmarks to compare little endian vs 
> big endian. I plan to work on this over the next week or two.
> [~jacq...@dremio.com] [~wesmckinn] If there are any additional items you'd 
> like to see addressed here please chime in. 



--
This message was sent by Atlassian JIRA
(v6.4.14#64029)


[jira] [Commented] (ARROW-1588) [C++/Format] Harden Decimal Format

2017-10-18 Thread Wes McKinney (JIRA)

[ 
https://issues.apache.org/jira/browse/ARROW-1588?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16210038#comment-16210038
 ] 

Wes McKinney commented on ARROW-1588:
-

If Java is assuming big-endian representation then byte-swapping from 
little-endian, I guess. We also need to document the memory format in the 
format/ documents. See the TBD section for Decimals in 
http://arrow.apache.org/docs/metadata.html

> [C++/Format] Harden Decimal Format
> --
>
> Key: ARROW-1588
> URL: https://issues.apache.org/jira/browse/ARROW-1588
> Project: Apache Arrow
>  Issue Type: Improvement
>  Components: C++, Format
>Affects Versions: 0.7.0
>Reporter: Phillip Cloud
>Assignee: Phillip Cloud
> Fix For: 0.8.0
>
>
> We should finalize and harden the decimal format. The remaining issues are 
> officially writing down the choice of making every decimal value 16 bytes and 
> byte order.
> For byte order we'll need to run some benchmarks to compare little endian vs 
> big endian. I plan to work on this over the next week or two.
> [~jacq...@dremio.com] [~wesmckinn] If there are any additional items you'd 
> like to see addressed here please chime in. 



--
This message was sent by Atlassian JIRA
(v6.4.14#64029)


[jira] [Commented] (ARROW-1588) [C++/Format] Harden Decimal Format

2017-10-18 Thread Phillip Cloud (JIRA)

[ 
https://issues.apache.org/jira/browse/ARROW-1588?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16210022#comment-16210022
 ] 

Phillip Cloud commented on ARROW-1588:
--

I have a patch ready to go on the C++ side, modulo integration tests. I'm in 
the process of running them now. Is there anything that needs to be done on the 
Java side?

> [C++/Format] Harden Decimal Format
> --
>
> Key: ARROW-1588
> URL: https://issues.apache.org/jira/browse/ARROW-1588
> Project: Apache Arrow
>  Issue Type: Improvement
>  Components: C++, Format
>Affects Versions: 0.7.0
>Reporter: Phillip Cloud
>Assignee: Phillip Cloud
> Fix For: 0.8.0
>
>
> We should finalize and harden the decimal format. The remaining issues are 
> officially writing down the choice of making every decimal value 16 bytes and 
> byte order.
> For byte order we'll need to run some benchmarks to compare little endian vs 
> big endian. I plan to work on this over the next week or two.
> [~jacq...@dremio.com] [~wesmckinn] If there are any additional items you'd 
> like to see addressed here please chime in. 



--
This message was sent by Atlassian JIRA
(v6.4.14#64029)


[jira] [Commented] (ARROW-1588) [C++/Format] Harden Decimal Format

2017-10-18 Thread Wes McKinney (JIRA)

[ 
https://issues.apache.org/jira/browse/ARROW-1588?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16210005#comment-16210005
 ] 

Wes McKinney commented on ARROW-1588:
-

+1. What do we need to do to effect this change?

> [C++/Format] Harden Decimal Format
> --
>
> Key: ARROW-1588
> URL: https://issues.apache.org/jira/browse/ARROW-1588
> Project: Apache Arrow
>  Issue Type: Improvement
>  Components: C++, Format
>Affects Versions: 0.7.0
>Reporter: Phillip Cloud
>Assignee: Phillip Cloud
> Fix For: 0.8.0
>
>
> We should finalize and harden the decimal format. The remaining issues are 
> officially writing down the choice of making every decimal value 16 bytes and 
> byte order.
> For byte order we'll need to run some benchmarks to compare little endian vs 
> big endian. I plan to work on this over the next week or two.
> [~jacq...@dremio.com] [~wesmckinn] If there are any additional items you'd 
> like to see addressed here please chime in. 



--
This message was sent by Atlassian JIRA
(v6.4.14#64029)


[jira] [Commented] (ARROW-1588) [C++/Format] Harden Decimal Format

2017-10-18 Thread Jacques Nadeau (JIRA)

[ 
https://issues.apache.org/jira/browse/ARROW-1588?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16209934#comment-16209934
 ] 

Jacques Nadeau commented on ARROW-1588:
---

Java is currently big-endian. See [1] and [2]
[1] 
https://github.com/apache/arrow/blob/master/java/vector/src/main/java/org/apache/arrow/vector/util/DecimalUtility.java#L144
[2] 
https://docs.oracle.com/javase/7/docs/api/java/math/BigInteger.html#BigInteger(byte[])

I agree that we should switch to little endian. Arrow should prioritize the cpu 
friendly format. 

> [C++/Format] Harden Decimal Format
> --
>
> Key: ARROW-1588
> URL: https://issues.apache.org/jira/browse/ARROW-1588
> Project: Apache Arrow
>  Issue Type: Improvement
>  Components: C++, Format
>Affects Versions: 0.7.0
>Reporter: Phillip Cloud
>Assignee: Phillip Cloud
> Fix For: 0.8.0
>
>
> We should finalize and harden the decimal format. The remaining issues are 
> officially writing down the choice of making every decimal value 16 bytes and 
> byte order.
> For byte order we'll need to run some benchmarks to compare little endian vs 
> big endian. I plan to work on this over the next week or two.
> [~jacq...@dremio.com] [~wesmckinn] If there are any additional items you'd 
> like to see addressed here please chime in. 



--
This message was sent by Atlassian JIRA
(v6.4.14#64029)


[jira] [Commented] (ARROW-1588) [C++/Format] Harden Decimal Format

2017-10-18 Thread Phillip Cloud (JIRA)

[ 
https://issues.apache.org/jira/browse/ARROW-1588?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16209706#comment-16209706
 ] 

Phillip Cloud commented on ARROW-1588:
--

Here's the state of the in memory decimal format on the C++ side of things.

Right now, arrow's in memory decimal format is actually a mixture of both big 
and little endian (when the platform is little endian, when the platform is big 
endian it's just big endian).

The {{Decimal128}} class that holds the upper and lower 64 bits of a 2's 
complement 128 bit integer is laid out like this (ignoring methods)

{code}
class Decimal128 {
int64_t upper;
uint64_t lower;
};
{code}

When we convert this to a C array of bytes (with the {{ToBytes()}}) method we 
do the following (paraphrasing):

{code}
const uint64_t big_bytes[] = {upper, lower};
const uint8_t* raw_bytes = reinterpret_cast(big_bytes);
{code}

What this means is that groups of 8 bytes are in big-endian order, and within 
each group of 8 bytes they are platform native.

This was an oversight on my part when writing {{ToBytes()}}, that I will 
rectify. My apologies for not mentioning this earlier.

The good news is that:

# This won't require much effort to fix.
# The work I've already done to make parquet read/write work will only require 
a one or two line change to make this work regardless of whether we choose big 
or little endian.

We still need to make a choice about which byte order to use.

Something to note:

When operating on arrow {{DecimalArray}} bytes, we have to convert each group 
of 16 bytes to {{Decimal128}} before doing any operations like addition, 
multiplication, etc. To keep performance snappy we need to spend as little time 
as possible converting bytes to {{int64_t}}/{{uint64_t}}. If bytes are passed 
to {{Decimal128}} as little endian then we elide a byteswap operation and 
simply convert each 8 byte chunk to the respective types of the upper and lower 
bits. If they're passed as big endian (as in the case of parquet) then we have 
to do some work to convert them to little endian. For this reason, I think we 
should choose little endian byte order. Of course, systems that are big endian 
take a small hit reading arrow decimal arrays.

Is there someone from the Java side that can clarify what the current in-memory 
layout for the Java equivalent of C++'s {{DecimalArray}}? Are bytes in little 
endian order?

cc [~wesmckinn] [~jnadeau]

> [C++/Format] Harden Decimal Format
> --
>
> Key: ARROW-1588
> URL: https://issues.apache.org/jira/browse/ARROW-1588
> Project: Apache Arrow
>  Issue Type: Improvement
>  Components: C++, Format
>Affects Versions: 0.7.0
>Reporter: Phillip Cloud
>Assignee: Phillip Cloud
> Fix For: 0.8.0
>
>
> We should finalize and harden the decimal format. The remaining issues are 
> officially writing down the choice of making every decimal value 16 bytes and 
> byte order.
> For byte order we'll need to run some benchmarks to compare little endian vs 
> big endian. I plan to work on this over the next week or two.
> [~jacq...@dremio.com] [~wesmckinn] If there are any additional items you'd 
> like to see addressed here please chime in. 



--
This message was sent by Atlassian JIRA
(v6.4.14#64029)


[jira] [Commented] (ARROW-1588) [C++/Format] Harden Decimal Format

2017-09-21 Thread Wes McKinney (JIRA)

[ 
https://issues.apache.org/jira/browse/ARROW-1588?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16175062#comment-16175062
 ] 

Wes McKinney commented on ARROW-1588:
-

[~tarmstrong] in the not-too-distant future we will be creating a columnar 
function kernel library in C++ to process arrays of contiguous decimals 
in-memory, so we would want operations like {{arr * 2}], where {{arr}} contains 
multiple decimal values in a buffer to evaluate as fast as possible. For the 
moment the only place where Arrow users are doing analytics on Decimals in in 
Dremio https://github.com/dremio/dremio-oss. 

> [C++/Format] Harden Decimal Format
> --
>
> Key: ARROW-1588
> URL: https://issues.apache.org/jira/browse/ARROW-1588
> Project: Apache Arrow
>  Issue Type: Improvement
>  Components: C++, Format
>Affects Versions: 0.7.0
>Reporter: Phillip Cloud
>Assignee: Phillip Cloud
> Fix For: 0.8.0
>
>
> We should finalize and harden the decimal format. The remaining issues are 
> officially writing down the choice of making every decimal value 16 bytes and 
> byte order.
> For byte order we'll need to run some benchmarks to compare little endian vs 
> big endian. I plan to work on this over the next week or two.
> [~jacq...@dremio.com] [~wesmckinn] If there are any additional items you'd 
> like to see addressed here please chime in. 



--
This message was sent by Atlassian JIRA
(v6.4.14#64029)


[jira] [Commented] (ARROW-1588) [C++/Format] Harden Decimal Format

2017-09-21 Thread Wes McKinney (JIRA)

[ 
https://issues.apache.org/jira/browse/ARROW-1588?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16175056#comment-16175056
 ] 

Wes McKinney commented on ARROW-1588:
-

This seems to support the hypothesis that we should be 16-bytes, little-endian 
in-memory. It would be helpful to get the historical context on why things are 
the way the are in Parquet, though

> [C++/Format] Harden Decimal Format
> --
>
> Key: ARROW-1588
> URL: https://issues.apache.org/jira/browse/ARROW-1588
> Project: Apache Arrow
>  Issue Type: Improvement
>  Components: C++, Format
>Affects Versions: 0.7.0
>Reporter: Phillip Cloud
>Assignee: Phillip Cloud
> Fix For: 0.8.0
>
>
> We should finalize and harden the decimal format. The remaining issues are 
> officially writing down the choice of making every decimal value 16 bytes and 
> byte order.
> For byte order we'll need to run some benchmarks to compare little endian vs 
> big endian. I plan to work on this over the next week or two.
> [~jacq...@dremio.com] [~wesmckinn] If there are any additional items you'd 
> like to see addressed here please chime in. 



--
This message was sent by Atlassian JIRA
(v6.4.14#64029)


[jira] [Commented] (ARROW-1588) [C++/Format] Harden Decimal Format

2017-09-21 Thread Tim Armstrong (JIRA)

[ 
https://issues.apache.org/jira/browse/ARROW-1588?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16175026#comment-16175026
 ] 

Tim Armstrong commented on ARROW-1588:
--

Parquet does encodes some decimals as big-endian values when encoded as 
FIXED_LEN_BYTE_ARRAY but Impala byte-swaps them when reading. I always found 
this curious. This was actually a major source of runtime overhead so we ended 
up with a complicated SIMD byte-swap implementation 
https://github.com/apache/incubator-impala/blob/2e63752858d71cc745534367a686980e060a8180/be/src/util/bit-util.cc#L210

> [C++/Format] Harden Decimal Format
> --
>
> Key: ARROW-1588
> URL: https://issues.apache.org/jira/browse/ARROW-1588
> Project: Apache Arrow
>  Issue Type: Improvement
>  Components: C++, Format
>Affects Versions: 0.7.0
>Reporter: Phillip Cloud
>Assignee: Phillip Cloud
> Fix For: 0.8.0
>
>
> We should finalize and harden the decimal format. The remaining issues are 
> officially writing down the choice of making every decimal value 16 bytes and 
> byte order.
> For byte order we'll need to run some benchmarks to compare little endian vs 
> big endian. I plan to work on this over the next week or two.
> [~jacq...@dremio.com] [~wesmckinn] If there are any additional items you'd 
> like to see addressed here please chime in. 



--
This message was sent by Atlassian JIRA
(v6.4.14#64029)


[jira] [Commented] (ARROW-1588) [C++/Format] Harden Decimal Format

2017-09-21 Thread Tim Armstrong (JIRA)

[ 
https://issues.apache.org/jira/browse/ARROW-1588?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16175019#comment-16175019
 ] 

Tim Armstrong commented on ARROW-1588:
--

Thanks for the CC Wes. Is there some additional context here about the goals? 
Those decisions (16 bytes and big-endian) are not what I'd expect if we were 
optimising for in-memory processing speed on little-endian architectures. 

[~tarasbob] has recently been working on Impala's decimal implementation so may 
have some thoughts too.

> [C++/Format] Harden Decimal Format
> --
>
> Key: ARROW-1588
> URL: https://issues.apache.org/jira/browse/ARROW-1588
> Project: Apache Arrow
>  Issue Type: Improvement
>  Components: C++, Format
>Affects Versions: 0.7.0
>Reporter: Phillip Cloud
>Assignee: Phillip Cloud
> Fix For: 0.8.0
>
>
> We should finalize and harden the decimal format. The remaining issues are 
> officially writing down the choice of making every decimal value 16 bytes and 
> byte order.
> For byte order we'll need to run some benchmarks to compare little endian vs 
> big endian. I plan to work on this over the next week or two.
> [~jacq...@dremio.com] [~wesmckinn] If there are any additional items you'd 
> like to see addressed here please chime in. 



--
This message was sent by Atlassian JIRA
(v6.4.14#64029)


[jira] [Commented] (ARROW-1588) [C++/Format] Harden Decimal Format

2017-09-21 Thread Wes McKinney (JIRA)

[ 
https://issues.apache.org/jira/browse/ARROW-1588?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16174993#comment-16174993
 ] 

Wes McKinney commented on ARROW-1588:
-

cc [~henryr] [~tarmstrong]

> [C++/Format] Harden Decimal Format
> --
>
> Key: ARROW-1588
> URL: https://issues.apache.org/jira/browse/ARROW-1588
> Project: Apache Arrow
>  Issue Type: Improvement
>  Components: C++, Format
>Affects Versions: 0.7.0
>Reporter: Phillip Cloud
>Assignee: Phillip Cloud
> Fix For: 0.8.0
>
>
> We should finalize and harden the decimal format. The remaining issues are 
> officially writing down the choice of making every decimal value 16 bytes and 
> byte order.
> For byte order we'll need to run some benchmarks to compare little endian vs 
> big endian. I plan to work on this over the next week or two.
> [~jacq...@dremio.com] [~wesmckinn] If there are any additional items you'd 
> like to see addressed here please chime in. 



--
This message was sent by Atlassian JIRA
(v6.4.14#64029)


[jira] [Commented] (ARROW-1588) [C++/Format] Harden Decimal Format

2017-09-21 Thread Phillip Cloud (JIRA)

[ 
https://issues.apache.org/jira/browse/ARROW-1588?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel=16174984#comment-16174984
 ] 

Phillip Cloud commented on ARROW-1588:
--

[~nongli], [~jacq...@dremio.com] mentioned that you and he discussed why 
big-endian might be the optimal choice for byte ordering. Do you remember how 
you determined that to be the case?

> [C++/Format] Harden Decimal Format
> --
>
> Key: ARROW-1588
> URL: https://issues.apache.org/jira/browse/ARROW-1588
> Project: Apache Arrow
>  Issue Type: Improvement
>  Components: C++, Format
>Affects Versions: 0.7.0
>Reporter: Phillip Cloud
>Assignee: Phillip Cloud
> Fix For: 0.8.0
>
>
> We should finalize and harden the decimal format. The remaining issues are 
> officially writing down the choice of making every decimal value 16 bytes and 
> byte order.
> For byte order we'll need to run some benchmarks to compare little endian vs 
> big endian. I plan to work on this over the next week or two.
> [~jacq...@dremio.com] [~wesmckinn] If there are any additional items you'd 
> like to see addressed here please chime in. 



--
This message was sent by Atlassian JIRA
(v6.4.14#64029)