emkornfield commented on a change in pull request #8648:
URL: https://github.com/apache/arrow/pull/8648#discussion_r567590991



##########
File path: cpp/src/arrow/adapters/orc/adapter_util.cc
##########
@@ -315,13 +342,462 @@ Status AppendBatch(const liborc::Type* type, 
liborc::ColumnVectorBatch* batch,
       return Status::NotImplemented("Not implemented type kind: ", kind);
   }
 }
+}  // namespace orc
+}  // namespace adapters
+}  // namespace arrow
+
+namespace {
+
+using arrow::internal::checked_cast;
+
+arrow::Status WriteBatch(liborc::ColumnVectorBatch* column_vector_batch,
+                         int64_t* arrow_offset, int64_t* orc_offset,
+                         const int64_t& length, const arrow::Array& parray,
+                         const std::vector<bool>* incoming_mask = NULLPTR);
+
+// incoming_mask is exclusively used by FillStructBatch. The cause is that ORC 
is much
+// stricter than Arrow in terms of consistency. In this case if a struct 
scalar is null
+// all its children must be set to null or ORC is not going to function 
properly. This is
+// why I added incoming_mask to pass on null status from a struct to its 
children.
+//
+// static_cast from int64_t or double to itself shouldn't introduce overhead
+// Pleae see
+// https://stackoverflow.com/questions/19106826/
+// can-static-cast-to-same-type-introduce-runtime-overhead
+template <class ArrayType, class BatchType, class TargetType>
+arrow::Status WriteNumericBatch(liborc::ColumnVectorBatch* column_vector_batch,
+                                int64_t* arrow_offset, int64_t* orc_offset,
+                                const int64_t& length, const arrow::Array& 
array,
+                                const std::vector<bool>* incoming_mask) {
+  const ArrayType& numeric_array(checked_cast<const ArrayType&>(array));
+  auto batch = checked_cast<BatchType*>(column_vector_batch);
+  int64_t arrow_length = array.length();
+  if (!arrow_length) {
+    return arrow::Status::OK();
+  }
+  if (array.null_count() || incoming_mask) {
+    batch->hasNulls = true;
+  }
+  for (; *orc_offset < length && *arrow_offset < arrow_length;
+       (*orc_offset)++, (*arrow_offset)++) {
+    if (array.IsNull(*arrow_offset) ||
+        (incoming_mask && !(*incoming_mask)[*orc_offset])) {
+      batch->notNull[*orc_offset] = false;
+    } else {
+      batch->data[*orc_offset] =
+          static_cast<TargetType>(numeric_array.Value(*arrow_offset));
+      batch->notNull[*orc_offset] = true;
+    }
+  }
+  batch->numElements = *orc_offset;
+  return arrow::Status::OK();
+}
+
+template <class ArrayType>
+arrow::Status WriteTimestampBatch(liborc::ColumnVectorBatch* 
column_vector_batch,
+                                  int64_t* arrow_offset, int64_t* orc_offset,
+                                  const int64_t& length, const arrow::Array& 
array,
+                                  const std::vector<bool>* incoming_mask,
+                                  const int64_t& conversion_factor_from_second,
+                                  const int64_t& conversion_factor_to_nano) {
+  const ArrayType& timestamp_array(checked_cast<const ArrayType&>(array));
+  auto batch = 
checked_cast<liborc::TimestampVectorBatch*>(column_vector_batch);
+  int64_t arrow_length = array.length();
+  if (!arrow_length) {
+    return arrow::Status::OK();
+  }
+  if (array.null_count() || incoming_mask) {
+    batch->hasNulls = true;
+  }
+  for (; *orc_offset < length && *arrow_offset < arrow_length;
+       (*orc_offset)++, (*arrow_offset)++) {
+    if (array.IsNull(*arrow_offset) ||
+        (incoming_mask && !(*incoming_mask)[*orc_offset])) {
+      batch->notNull[*orc_offset] = false;
+    } else {
+      int64_t data = timestamp_array.Value(*arrow_offset);
+      batch->notNull[*orc_offset] = true;
+      batch->data[*orc_offset] =
+          static_cast<int64_t>(std::floor(data / 
conversion_factor_from_second));
+      batch->nanoseconds[*orc_offset] =
+          (data - conversion_factor_from_second * batch->data[*orc_offset]) *
+          conversion_factor_to_nano;
+    }
+  }
+  batch->numElements = *orc_offset;
+  return arrow::Status::OK();
+}
+
+template <class ArrayType, class OffsetType>
+arrow::Status WriteBinaryBatch(liborc::ColumnVectorBatch* column_vector_batch,
+                               int64_t* arrow_offset, int64_t* orc_offset,
+                               const int64_t& length, const arrow::Array& 
array,
+                               const std::vector<bool>* incoming_mask) {
+  const ArrayType& binary_array(checked_cast<const ArrayType&>(array));
+  auto batch = checked_cast<liborc::StringVectorBatch*>(column_vector_batch);
+  int64_t arrow_length = array.length();
+  if (!arrow_length) {
+    return arrow::Status::OK();
+  }
+  if (array.null_count() || incoming_mask) {
+    batch->hasNulls = true;
+  }
+  for (; *orc_offset < length && *arrow_offset < arrow_length;
+       (*orc_offset)++, (*arrow_offset)++) {
+    if (array.IsNull(*arrow_offset) ||
+        (incoming_mask && !(*incoming_mask)[*orc_offset])) {
+      batch->notNull[*orc_offset] = false;
+    } else {
+      batch->notNull[*orc_offset] = true;
+      OffsetType data_length = 0;
+      const uint8_t* data = binary_array.GetValue(*arrow_offset, &data_length);
+      if (batch->data[*orc_offset]) delete batch->data[*orc_offset];
+      batch->data[*orc_offset] = new char[data_length];  // Do not include null

Review comment:
       The ORC library will delete all of these after the fact?  This actually 
looks like a memory leak looking at:
   
https://github.com/apache/orc/blob/0128f817b0ab28fa2d0660737234ac966f0f5c50/c%2B%2B/src/MemoryPool.cc#L137
 and 
https://github.com/apache/orc/blob/0128f817b0ab28fa2d0660737234ac966f0f5c50/c%2B%2B/include/orc/Vector.hh#L122
   
   If this is only temporary.  a zero copy way of doing this would be to assign 
data to the base pointer of the data vector + the offset of the current array 
element.  This would still probably be cheaper even if after writing you had to 
null out all values again




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Reply via email to