mathyingzhou commented on a change in pull request #8648:
URL: https://github.com/apache/arrow/pull/8648#discussion_r567595076
##########
File path: cpp/src/arrow/adapters/orc/adapter_util.cc
##########
@@ -315,13 +342,462 @@ Status AppendBatch(const liborc::Type* type, liborc::ColumnVectorBatch* batch,
     return Status::NotImplemented("Not implemented type kind: ", kind);
   }
 }
+}  // namespace orc
+}  // namespace adapters
+}  // namespace arrow
+
+namespace {
+
+using arrow::internal::checked_cast;
+
+arrow::Status WriteBatch(liborc::ColumnVectorBatch* column_vector_batch,
+                         int64_t* arrow_offset, int64_t* orc_offset,
+                         const int64_t& length, const arrow::Array& parray,
+                         const std::vector<bool>* incoming_mask = NULLPTR);
+
+// incoming_mask is used exclusively by FillStructBatch, because ORC is much
+// stricter than Arrow about consistency: if a struct scalar is null, all of
+// its children must also be set to null, or ORC will not function properly.
+// incoming_mask passes the null status on from a struct to its children.
+//
+// static_cast from int64_t or double to itself shouldn't introduce overhead.
+// Please see
+// https://stackoverflow.com/questions/19106826/
+// can-static-cast-to-same-type-introduce-runtime-overhead
+template <class ArrayType, class BatchType, class TargetType>
+arrow::Status WriteNumericBatch(liborc::ColumnVectorBatch* column_vector_batch,
+                                int64_t* arrow_offset, int64_t* orc_offset,
+                                const int64_t& length, const arrow::Array& array,
+                                const std::vector<bool>* incoming_mask) {
+  const ArrayType& numeric_array(checked_cast<const ArrayType&>(array));
+  auto batch = checked_cast<BatchType*>(column_vector_batch);
+  int64_t arrow_length = array.length();
+  if (!arrow_length) {
+    return arrow::Status::OK();
+  }
+  if (array.null_count() || incoming_mask) {
+    batch->hasNulls = true;
+  }
+  for (; *orc_offset < length && *arrow_offset < arrow_length;
+       (*orc_offset)++, (*arrow_offset)++) {
+    if (array.IsNull(*arrow_offset) ||
+        (incoming_mask && !(*incoming_mask)[*orc_offset])) {
+      batch->notNull[*orc_offset] = false;
+    } else {
+      batch->data[*orc_offset] =
+          static_cast<TargetType>(numeric_array.Value(*arrow_offset));
+      batch->notNull[*orc_offset] = true;
+    }
+  }
+  batch->numElements = *orc_offset;
+  return arrow::Status::OK();
+}
+
+template <class ArrayType>
+arrow::Status WriteTimestampBatch(liborc::ColumnVectorBatch* column_vector_batch,
+                                  int64_t* arrow_offset, int64_t* orc_offset,
+                                  const int64_t& length, const arrow::Array& array,
+                                  const std::vector<bool>* incoming_mask,
+                                  const int64_t& conversion_factor_from_second,
+                                  const int64_t& conversion_factor_to_nano) {
+  const ArrayType& timestamp_array(checked_cast<const ArrayType&>(array));
+  auto batch = checked_cast<liborc::TimestampVectorBatch*>(column_vector_batch);
+  int64_t arrow_length = array.length();
+  if (!arrow_length) {
+    return arrow::Status::OK();
+  }
+  if (array.null_count() || incoming_mask) {
+    batch->hasNulls = true;
+  }
+  for (; *orc_offset < length && *arrow_offset < arrow_length;
+       (*orc_offset)++, (*arrow_offset)++) {
+    if (array.IsNull(*arrow_offset) ||
+        (incoming_mask && !(*incoming_mask)[*orc_offset])) {
+      batch->notNull[*orc_offset] = false;
+    } else {
+      int64_t data = timestamp_array.Value(*arrow_offset);
+      batch->notNull[*orc_offset] = true;
+      batch->data[*orc_offset] =
+          static_cast<int64_t>(std::floor(data / conversion_factor_from_second));
+      batch->nanoseconds[*orc_offset] =
+          (data - conversion_factor_from_second * batch->data[*orc_offset]) *
+          conversion_factor_to_nano;
+    }
+  }
+  batch->numElements = *orc_offset;
+  return arrow::Status::OK();
+}
+
+template <class ArrayType, class OffsetType>
+arrow::Status WriteBinaryBatch(liborc::ColumnVectorBatch* column_vector_batch,
+                               int64_t* arrow_offset, int64_t* orc_offset,
+                               const int64_t& length, const arrow::Array& array,
+                               const std::vector<bool>* incoming_mask) {
+  const ArrayType& binary_array(checked_cast<const ArrayType&>(array));
+  auto batch = checked_cast<liborc::StringVectorBatch*>(column_vector_batch);
+  int64_t arrow_length = array.length();
+  if (!arrow_length) {
+    return arrow::Status::OK();
+  }
+  if (array.null_count() || incoming_mask) {
+    batch->hasNulls = true;
+  }
+  for (; *orc_offset < length && *arrow_offset < arrow_length;
+       (*orc_offset)++, (*arrow_offset)++) {
+    if (array.IsNull(*arrow_offset) ||
+        (incoming_mask && !(*incoming_mask)[*orc_offset])) {
+      batch->notNull[*orc_offset] = false;
+    } else {
+      batch->notNull[*orc_offset] = true;
+      OffsetType data_length = 0;
+      const uint8_t* data = binary_array.GetValue(*arrow_offset, &data_length);
+      if (batch->data[*orc_offset]) delete batch->data[*orc_offset];

Review comment:
       Yes, they are. The batches are reused, which is why it is necessary to overwrite everything.
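
To illustrate the reuse point: the writer allocates one ORC `ColumnVectorBatch` up front and refills the same slots on every pass, so slot `i` may still own the heap buffer written on the previous pass. Below is a minimal sketch of that pattern using a hypothetical simplified `FakeStringBatch` stand-in, not the real `liborc::StringVectorBatch`:

```cpp
#include <cstdint>
#include <cstring>
#include <string>
#include <vector>

// Hypothetical stand-in for an ORC string batch: data[i] is an owned,
// heap-allocated copy of the string value, length[i] its byte length.
struct FakeStringBatch {
  std::vector<char*> data;
  std::vector<int64_t> length;
  explicit FakeStringBatch(size_t capacity)
      : data(capacity, nullptr), length(capacity, 0) {}
  FakeStringBatch(const FakeStringBatch&) = delete;  // owning raw pointers
  ~FakeStringBatch() {
    for (char* p : data) delete[] p;
  }
};

void FillSlot(FakeStringBatch* batch, size_t i, const std::string& value) {
  // Because the batch is reused across passes, slot i may still own the
  // previous pass's buffer; free it before overwriting, mirroring the
  // `delete batch->data[*orc_offset]` in the diff above.
  delete[] batch->data[i];
  batch->data[i] = new char[value.size()];
  std::memcpy(batch->data[i], value.data(), value.size());
  batch->length[i] = static_cast<int64_t>(value.size());
}

int main() {
  FakeStringBatch batch(2);
  FillSlot(&batch, 0, "first pass");
  FillSlot(&batch, 0, "second pass");  // safe: the old buffer is freed first
  return 0;
}
```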
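The `incoming_mask` comment in the diff describes how a null struct must force its children to null. Here is a small standalone model of that masking rule, using plain vectors rather than the actual Arrow/ORC types:

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  // Parent struct validity: row 1 is a null struct scalar.
  std::vector<bool> struct_valid = {true, false, true, true};
  // Child column validity and values.
  std::vector<bool> child_valid = {true, true, false, true};
  std::vector<int64_t> child_values = {10, 20, 30, 40};

  // The struct writer hands its own validity down as incoming_mask.
  const std::vector<bool>& incoming_mask = struct_valid;

  for (size_t i = 0; i < child_values.size(); ++i) {
    // ORC requires children of a null struct to be null as well, so the
    // child's notNull flag is the AND of both validities.
    bool not_null = child_valid[i] && incoming_mask[i];
    std::cout << "row " << i << ": notNull=" << not_null;
    if (not_null) std::cout << " value=" << child_values[i];
    std::cout << "\n";
  }
  return 0;
}
```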
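The arithmetic in `WriteTimestampBatch` splits one integer timestamp into ORC's (seconds, nanoseconds) pair. A worked example, assuming millisecond-resolution input with `conversion_factor_from_second = 1000` and `conversion_factor_to_nano = 1000000` (these factor values are an assumption for illustration, not taken from this diff):

```cpp
#include <cassert>
#include <cmath>
#include <cstdint>

int main() {
  const int64_t data = 1500;                            // 1500 ms = 1.5 s
  const int64_t conversion_factor_from_second = 1000;   // ms per second
  const int64_t conversion_factor_to_nano = 1000000;    // ns per ms

  // The same two expressions as in WriteTimestampBatch above.
  const int64_t seconds =
      static_cast<int64_t>(std::floor(data / conversion_factor_from_second));
  const int64_t nanos =
      (data - conversion_factor_from_second * seconds) * conversion_factor_to_nano;

  assert(seconds == 1);        // whole seconds
  assert(nanos == 500000000);  // the 500 ms remainder expressed in ns
  return 0;
}
```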