mathyingzhou commented on a change in pull request #8648:
URL: https://github.com/apache/arrow/pull/8648#discussion_r567595076
##########
File path: cpp/src/arrow/adapters/orc/adapter_util.cc
##########
@@ -315,13 +342,462 @@ Status AppendBatch(const liborc::Type* type, liborc::ColumnVectorBatch* batch,
     return Status::NotImplemented("Not implemented type kind: ", kind);
   }
 }
+}  // namespace orc
+}  // namespace adapters
+}  // namespace arrow
+
+namespace {
+
+using arrow::internal::checked_cast;
+
+arrow::Status WriteBatch(liborc::ColumnVectorBatch* column_vector_batch,
+                         int64_t* arrow_offset, int64_t* orc_offset,
+                         const int64_t& length, const arrow::Array& parray,
+                         const std::vector<bool>* incoming_mask = NULLPTR);
+
+// incoming_mask is used exclusively by FillStructBatch, because ORC is much
+// stricter than Arrow about consistency: if a struct scalar is null, all of
+// its children must also be set to null, or ORC will not function properly.
+// incoming_mask passes the null status on from a struct to its children.
+//
+// static_cast from int64_t or double to itself shouldn't introduce overhead.
+// Please see
+// https://stackoverflow.com/questions/19106826/
+// can-static-cast-to-same-type-introduce-runtime-overhead
+template <class ArrayType, class BatchType, class TargetType>
+arrow::Status WriteNumericBatch(liborc::ColumnVectorBatch* column_vector_batch,
+                                int64_t* arrow_offset, int64_t* orc_offset,
+                                const int64_t& length, const arrow::Array& array,
+                                const std::vector<bool>* incoming_mask) {
+  const ArrayType& numeric_array(checked_cast<const ArrayType&>(array));
+  auto batch = checked_cast<BatchType*>(column_vector_batch);
+  int64_t arrow_length = array.length();
+  if (!arrow_length) {
+    return arrow::Status::OK();
+  }
+  if (array.null_count() || incoming_mask) {
+    batch->hasNulls = true;
+  }
+  for (; *orc_offset < length && *arrow_offset < arrow_length;
+       (*orc_offset)++, (*arrow_offset)++) {
+    if (array.IsNull(*arrow_offset) ||
+        (incoming_mask && !(*incoming_mask)[*orc_offset])) {
+      batch->notNull[*orc_offset] = false;
+    } else {
+      batch->data[*orc_offset] =
+          static_cast<TargetType>(numeric_array.Value(*arrow_offset));
+      batch->notNull[*orc_offset] = true;
+    }
+  }
+  batch->numElements = *orc_offset;
+  return arrow::Status::OK();
+}
+
+template <class ArrayType>
+arrow::Status WriteTimestampBatch(liborc::ColumnVectorBatch* column_vector_batch,
+                                  int64_t* arrow_offset, int64_t* orc_offset,
+                                  const int64_t& length, const arrow::Array& array,
+                                  const std::vector<bool>* incoming_mask,
+                                  const int64_t& conversion_factor_from_second,
+                                  const int64_t& conversion_factor_to_nano) {
+  const ArrayType& timestamp_array(checked_cast<const ArrayType&>(array));
+  auto batch = checked_cast<liborc::TimestampVectorBatch*>(column_vector_batch);
+  int64_t arrow_length = array.length();
+  if (!arrow_length) {
+    return arrow::Status::OK();
+  }
+  if (array.null_count() || incoming_mask) {
+    batch->hasNulls = true;
+  }
+  for (; *orc_offset < length && *arrow_offset < arrow_length;
+       (*orc_offset)++, (*arrow_offset)++) {
+    if (array.IsNull(*arrow_offset) ||
+        (incoming_mask && !(*incoming_mask)[*orc_offset])) {
+      batch->notNull[*orc_offset] = false;
+    } else {
+      int64_t data = timestamp_array.Value(*arrow_offset);
+      batch->notNull[*orc_offset] = true;
+      batch->data[*orc_offset] =
+          static_cast<int64_t>(std::floor(data / conversion_factor_from_second));
+      batch->nanoseconds[*orc_offset] =
+          (data - conversion_factor_from_second * batch->data[*orc_offset]) *
+          conversion_factor_to_nano;
+    }
+  }
+  batch->numElements = *orc_offset;
+  return arrow::Status::OK();
+}
+
+template <class ArrayType, class OffsetType>
+arrow::Status WriteBinaryBatch(liborc::ColumnVectorBatch* column_vector_batch,
+                               int64_t* arrow_offset, int64_t* orc_offset,
+                               const int64_t& length, const arrow::Array& array,
+                               const std::vector<bool>* incoming_mask) {
+  const ArrayType& binary_array(checked_cast<const ArrayType&>(array));
+  auto batch = checked_cast<liborc::StringVectorBatch*>(column_vector_batch);
+  int64_t arrow_length = array.length();
+  if (!arrow_length) {
+    return arrow::Status::OK();
+  }
+  if (array.null_count() || incoming_mask) {
+    batch->hasNulls = true;
+  }
+  for (; *orc_offset < length && *arrow_offset < arrow_length;
+       (*orc_offset)++, (*arrow_offset)++) {
+    if (array.IsNull(*arrow_offset) ||
+        (incoming_mask && !(*incoming_mask)[*orc_offset])) {
+      batch->notNull[*orc_offset] = false;
+    } else {
+      batch->notNull[*orc_offset] = true;
+      OffsetType data_length = 0;
+      const uint8_t* data = binary_array.GetValue(*arrow_offset, &data_length);
+      if (batch->data[*orc_offset]) delete batch->data[*orc_offset];

Review comment:
       Yes, they are. The batches are reused, which is why it is necessary to overwrite everything.
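
To illustrate the reuse point: the writer allocates one ORC `ColumnVectorBatch` up front and refills the same slots on every pass, so slot `i` may still own the heap buffer written on the previous pass. Below is a minimal sketch of that pattern using a hypothetical simplified `FakeStringBatch` stand-in, not the real `liborc::StringVectorBatch`:

```cpp
#include <cstdint>
#include <cstring>
#include <string>
#include <vector>

// Hypothetical stand-in for an ORC string batch: data[i] is an owned,
// heap-allocated copy of the string value, length[i] its byte length.
struct FakeStringBatch {
  std::vector<char*> data;
  std::vector<int64_t> length;
  explicit FakeStringBatch(size_t capacity)
      : data(capacity, nullptr), length(capacity, 0) {}
  FakeStringBatch(const FakeStringBatch&) = delete;  // owning raw pointers
  ~FakeStringBatch() {
    for (char* p : data) delete[] p;
  }
};

void FillSlot(FakeStringBatch* batch, size_t i, const std::string& value) {
  // Because the batch is reused across passes, slot i may still own the
  // previous pass's buffer; free it before overwriting, mirroring the
  // `delete batch->data[*orc_offset]` in the diff above.
  delete[] batch->data[i];
  batch->data[i] = new char[value.size()];
  std::memcpy(batch->data[i], value.data(), value.size());
  batch->length[i] = static_cast<int64_t>(value.size());
}

int main() {
  FakeStringBatch batch(2);
  FillSlot(&batch, 0, "first pass");
  FillSlot(&batch, 0, "second pass");  // safe: the old buffer is freed first
  return 0;
}
```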
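The `incoming_mask` comment in the diff describes how a null struct must force its children to null. Here is a small standalone model of that masking rule, using plain vectors rather than the actual Arrow/ORC types:

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  // Parent struct validity: row 1 is a null struct scalar.
  std::vector<bool> struct_valid = {true, false, true, true};
  // Child column validity and values.
  std::vector<bool> child_valid = {true, true, false, true};
  std::vector<int64_t> child_values = {10, 20, 30, 40};

  // The struct writer hands its own validity down as incoming_mask.
  const std::vector<bool>& incoming_mask = struct_valid;

  for (size_t i = 0; i < child_values.size(); ++i) {
    // ORC requires children of a null struct to be null as well, so the
    // child's notNull flag is the AND of both validities.
    bool not_null = child_valid[i] && incoming_mask[i];
    std::cout << "row " << i << ": notNull=" << not_null;
    if (not_null) std::cout << " value=" << child_values[i];
    std::cout << "\n";
  }
  return 0;
}
```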
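The arithmetic in `WriteTimestampBatch` splits one integer timestamp into ORC's (seconds, nanoseconds) pair. A worked example, assuming millisecond-resolution input with `conversion_factor_from_second = 1000` and `conversion_factor_to_nano = 1000000` (these factor values are an assumption for illustration, not taken from this diff):

```cpp
#include <cassert>
#include <cmath>
#include <cstdint>

int main() {
  const int64_t data = 1500;                            // 1500 ms = 1.5 s
  const int64_t conversion_factor_from_second = 1000;   // ms per second
  const int64_t conversion_factor_to_nano = 1000000;    // ns per ms

  // The same two expressions as in WriteTimestampBatch above.
  const int64_t seconds =
      static_cast<int64_t>(std::floor(data / conversion_factor_from_second));
  const int64_t nanos =
      (data - conversion_factor_from_second * seconds) * conversion_factor_to_nano;

  assert(seconds == 1);        // whole seconds
  assert(nanos == 500000000);  // the 500 ms remainder expressed in ns
  return 0;
}
```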