kou commented on code in PR #48205:
URL: https://github.com/apache/arrow/pull/48205#discussion_r2553142118
##########
cpp/src/parquet/column_reader.cc:
##########
@@ -132,7 +134,11 @@ int LevelDecoder::SetData(Encoding::type encoding, int16_t
max_level,
"Number of buffered values too large (corrupt data page?)");
}
num_bytes = static_cast<int32_t>(bit_util::BytesForBits(num_bits));
+#if ARROW_LITTLE_ENDIAN
if (num_bytes < 0 || num_bytes > data_size - 4) {
+#else
+ if (num_bytes < 0 || num_bytes > data_size) {
+#endif
Review Comment:
@pitrou You added `- 4` in #6848. Do you think that we need `- 4` with big
endian too?
##########
cpp/src/parquet/column_writer.h:
##########
@@ -260,13 +261,21 @@ constexpr int64_t kJulianEpochOffsetDays =
INT64_C(2440588);
template <int64_t UnitPerDay, int64_t NanosecondsPerUnit>
inline void ArrowTimestampToImpalaTimestamp(const int64_t time, Int96*
impala_timestamp) {
int64_t julian_days = (time / UnitPerDay) + kJulianEpochOffsetDays;
+#if ARROW_LITTLE_ENDIAN
(*impala_timestamp).value[2] = (uint32_t)julian_days;
+#endif
int64_t last_day_units = time % UnitPerDay;
auto last_day_nanos = last_day_units * NanosecondsPerUnit;
+#if ARROW_LITTLE_ENDIAN
// impala_timestamp will be unaligned every other entry so do memcpy instead
// of assign and reinterpret cast to avoid undefined behavior.
std::memcpy(impala_timestamp, &last_day_nanos, sizeof(int64_t));
+#else
+ (*impala_timestamp).value[0] = static_cast<uint32_t>(last_day_nanos);
+ (*impala_timestamp).value[1] = static_cast<uint32_t>(last_day_nanos >> 32);
Review Comment:
Can we use the following instead of `#if`?
```cpp
auto last_day_nanos = last_day_units * NanosecondsPerUnit;
auto last_day_nanos_little_endian =
::arrow::bit_util::ToLittleEndian(last_day_nanos);
std::memcpy(impala_timestamp, &last_day_nanos_little_endian,
sizeof(int64_t));
```
##########
cpp/src/parquet/column_writer.cc:
##########
@@ -2621,6 +2641,38 @@ struct SerializeFunctor<::parquet::FLBAType,
::arrow::HalfFloatType> {
return FLBA{reinterpret_cast<const uint8_t*>(value_ptr)};
}
};
+#else
+template <>
+struct SerializeFunctor<::parquet::FLBAType, ::arrow::HalfFloatType> {
+ Status Serialize(const ::arrow::HalfFloatArray& array, ArrowWriteContext*,
FLBA* out) {
+ const uint16_t* values = array.raw_values();
+ const int64_t length = array.length();
+
+ // Allocate buffer for little-endian converted values
+ converted_values_.resize(length);
+
+ if (array.null_count() == 0) {
+ for (int64_t i = 0; i < length; ++i) {
+ converted_values_[i] = ::arrow::bit_util::ToLittleEndian(values[i]);
+ out[i] = FLBA{reinterpret_cast<const uint8_t*>(&converted_values_[i])};
+ }
+ } else {
+ for (int64_t i = 0; i < length; ++i) {
+ if (array.IsValid(i)) {
+ converted_values_[i] = ::arrow::bit_util::ToLittleEndian(values[i]);
+ out[i] = FLBA{reinterpret_cast<const
uint8_t*>(&converted_values_[i])};
+ } else {
+ out[i] = FLBA{};
+ }
+ }
+ }
+ return Status::OK();
+ }
+
+ private:
+ std::vector<uint16_t> converted_values_;
+};
+#endif
Review Comment:
Could you share implementation as much as possible something like:
```cpp
template <>
struct SerializeFunctor<::parquet::FLBAType, ::arrow::HalfFloatType> {
Status Serialize(const ::arrow::HalfFloatArray& array, ArrowWriteContext*,
FLBA* out) {
#if ARROW_LITTLE_ENDIAN
return SerializeLittleEndianValues(array.raw_values(), out);
#else
const uint16_t* values = array.raw_values();
const int64_t length = array.length();
converted_values_.resize(length);
for (int64_t i = 0; i < length; ++i) {
// We don't need IsValid() here. Non valid values are just ignored in
SerializeLittleEndianValues().
converted_values_[i] = ::arrow::bit_util::ToLittleEndian(values[i]);
}
return SerializeLittleEndianValues(converted_values_.data(), out);
#endif
}
private:
Status SerializeLittleEndianValues(const uint16_t* values, FLBA* out) {
if (array.null_count() == 0) {
for (int64_t i = 0; i < array.length(); ++i) {
out[i] = ToFLBA(&values[i]);
}
} else {
for (int64_t i = 0; i < array.length(); ++i) {
out[i] = array.IsValid(i) ? ToFLBA(&values[i]) : FLBA{};
}
}
return Status::OK();
}
FLBA ToFLBA(const uint16_t* value_ptr) const {
return FLBA{reinterpret_cast<const uint8_t*>(value_ptr)};
}
#if !ARROW_LITTLE_ENDIAN
std::vector<uint16_t> converted_values_;
#endif
};
```
##########
cpp/src/parquet/column_writer.h:
##########
@@ -260,13 +261,21 @@ constexpr int64_t kJulianEpochOffsetDays =
INT64_C(2440588);
template <int64_t UnitPerDay, int64_t NanosecondsPerUnit>
inline void ArrowTimestampToImpalaTimestamp(const int64_t time, Int96*
impala_timestamp) {
int64_t julian_days = (time / UnitPerDay) + kJulianEpochOffsetDays;
+#if ARROW_LITTLE_ENDIAN
(*impala_timestamp).value[2] = (uint32_t)julian_days;
+#endif
Review Comment:
Do we need this `#if`?
It seems that the below `(*impala_timestamp).value[2] =
static_cast<uint32_t>(julian_days);` does the same thing.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]