emkornfield commented on a change in pull request #8219: URL: https://github.com/apache/arrow/pull/8219#discussion_r492834953
########## File path: cpp/src/parquet/column_writer.cc ########## @@ -1009,12 +1046,33 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter< Status WriteArrow(const int16_t* def_levels, const int16_t* rep_levels, int64_t num_levels, const ::arrow::Array& array, - ArrowWriteContext* ctx) override { + ArrowWriteContext* ctx, bool nested, bool array_nullable) override { + BEGIN_PARQUET_CATCH_EXCEPTIONS + bool leaf_is_not_nullable = !level_info_.HasNullableValues(); + // Leaf nulls are canonical when there is only a single null element and it is at the + // leaf. + bool leaf_nulls_are_canonical = + (level_info_.def_level == level_info_.repeated_ancestor_def_level + 1) && + array_nullable; + bool maybe_parent_nulls = + nested && !(leaf_is_not_nullable || leaf_nulls_are_canonical); + if (maybe_parent_nulls) { + ARROW_ASSIGN_OR_RAISE( + bits_buffer_, + arrow::AllocateResizableBuffer( + BitUtil::BytesForBits(properties_->write_batch_size()), ctx->memory_pool)); + bits_buffer_->ZeroPadding(); + std::static_pointer_cast<ResizableBuffer>(AllocateBuffer(allocator_, 0)); Review comment: This line should be removed. But above, yes, we do allocate a new buffer for each WriteArrow call. I think the lifetime of this object might only span one WriteArrow call. Internally there is a concept of batching, and the allocation should only happen once here for each of those batches. ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org