emkornfield commented on a change in pull request #8648:
URL: https://github.com/apache/arrow/pull/8648#discussion_r567584931



##########
File path: cpp/src/arrow/adapters/orc/adapter_util.cc
##########
@@ -315,13 +342,462 @@ Status AppendBatch(const liborc::Type* type, 
liborc::ColumnVectorBatch* batch,
       return Status::NotImplemented("Not implemented type kind: ", kind);
   }
 }
+}  // namespace orc
+}  // namespace adapters
+}  // namespace arrow
+
+namespace {
+
+using arrow::internal::checked_cast;
+
+arrow::Status WriteBatch(liborc::ColumnVectorBatch* column_vector_batch,
+                         int64_t* arrow_offset, int64_t* orc_offset,
+                         const int64_t& length, const arrow::Array& parray,
+                         const std::vector<bool>* incoming_mask = NULLPTR);
+
+// incoming_mask is exclusively used by FillStructBatch. The cause is that ORC 
is much
+// stricter than Arrow in terms of consistency. In this case if a struct 
scalar is null
+// all its children must be set to null or ORC is not going to function 
properly. This is
+// why I added incoming_mask to pass on null status from a struct to its 
children.
+//
+// static_cast from int64_t or double to itself shouldn't introduce overhead
+// Pleae see
+// https://stackoverflow.com/questions/19106826/
+// can-static-cast-to-same-type-introduce-runtime-overhead
+template <class ArrayType, class BatchType, class TargetType>
+arrow::Status WriteNumericBatch(liborc::ColumnVectorBatch* column_vector_batch,
+                                int64_t* arrow_offset, int64_t* orc_offset,
+                                const int64_t& length, const arrow::Array& 
array,
+                                const std::vector<bool>* incoming_mask) {
+  const ArrayType& numeric_array(checked_cast<const ArrayType&>(array));
+  auto batch = checked_cast<BatchType*>(column_vector_batch);
+  int64_t arrow_length = array.length();
+  if (!arrow_length) {
+    return arrow::Status::OK();
+  }
+  if (array.null_count() || incoming_mask) {
+    batch->hasNulls = true;
+  }
+  for (; *orc_offset < length && *arrow_offset < arrow_length;
+       (*orc_offset)++, (*arrow_offset)++) {
+    if (array.IsNull(*arrow_offset) ||

Review comment:
       This can be optimized quite a bit.  Anding the two masks together (see 
util/bit*) and then using BitBlockCounter or BitRunReader so that in the common 
cases these can be set in batches.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Reply via email to