Copilot commented on code in PR #48180:
URL: https://github.com/apache/arrow/pull/48180#discussion_r3352057997
##########
cpp/src/arrow/compute/util.cc:
##########
@@ -30,33 +30,33 @@ namespace util {
namespace bit_util {
inline uint64_t SafeLoadUpTo8Bytes(const uint8_t* bytes, int num_bytes) {
- // This will not be correct on big-endian architectures.
-#if !ARROW_LITTLE_ENDIAN
- ARROW_DCHECK(false);
-#endif
ARROW_DCHECK(num_bytes >= 0 && num_bytes <= 8);
if (num_bytes == 8) {
return util::SafeLoad(reinterpret_cast<const uint64_t*>(bytes));
} else {
uint64_t word = 0;
for (int i = 0; i < num_bytes; ++i) {
+#if ARROW_LITTLE_ENDIAN
word |= static_cast<uint64_t>(bytes[i]) << (8 * i);
+#else
+ word |= static_cast<uint64_t>(bytes[i]) << (8 * (num_bytes - 1 - i));
+#endif
Review Comment:
On big-endian, the shift used in SafeLoadUpTo8Bytes for num_bytes < 8 places
the first input byte into the least-significant byte of the returned word.
However, callers treat this helper as if it behaved like a native 64-bit load
with zero padding and then apply ByteSwap; with the current shift, ByteSwap
moves the data into the most-significant bytes and tail processing (e.g. tail <
57) will produce incorrect results.
##########
cpp/src/arrow/compute/util.cc:
##########
@@ -30,33 +30,33 @@ namespace util {
namespace bit_util {
inline uint64_t SafeLoadUpTo8Bytes(const uint8_t* bytes, int num_bytes) {
- // This will not be correct on big-endian architectures.
-#if !ARROW_LITTLE_ENDIAN
- ARROW_DCHECK(false);
-#endif
ARROW_DCHECK(num_bytes >= 0 && num_bytes <= 8);
if (num_bytes == 8) {
return util::SafeLoad(reinterpret_cast<const uint64_t*>(bytes));
} else {
uint64_t word = 0;
for (int i = 0; i < num_bytes; ++i) {
+#if ARROW_LITTLE_ENDIAN
word |= static_cast<uint64_t>(bytes[i]) << (8 * i);
+#else
+ word |= static_cast<uint64_t>(bytes[i]) << (8 * (num_bytes - 1 - i));
+#endif
}
return word;
}
}
inline void SafeStoreUpTo8Bytes(uint8_t* bytes, int num_bytes, uint64_t value)
{
- // This will not be correct on big-endian architectures.
-#if !ARROW_LITTLE_ENDIAN
- ARROW_DCHECK(false);
-#endif
ARROW_DCHECK(num_bytes >= 0 && num_bytes <= 8);
if (num_bytes == 8) {
util::SafeStore(reinterpret_cast<uint64_t*>(bytes), value);
} else {
for (int i = 0; i < num_bytes; ++i) {
+#if ARROW_LITTLE_ENDIAN
bytes[i] = static_cast<uint8_t>(value >> (8 * i));
+#else
+ bytes[i] = static_cast<uint8_t>(value >> (8 * (num_bytes - 1 - i)));
+#endif
Review Comment:
SafeStoreUpTo8Bytes has inconsistent semantics on big-endian: for
num_bytes==8 it delegates to util::SafeStore (native big-endian byte order),
but for num_bytes<8 it currently writes from the least-significant bytes (value
>> (8*(num_bytes-1-i))). This doesn't match the behavior of storing a uint64_t
and taking its first num_bytes bytes, and will break round-trip expectations
with SafeLoadUpTo8Bytes when used to emulate partial native stores/loads.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]