westonpace commented on code in PR #34327:
URL: https://github.com/apache/arrow/pull/34327#discussion_r1117082245


##########
cpp/src/parquet/column_writer.cc:
##########
@@ -1362,7 +1375,7 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, 
public TypedColumnWriter<
           *out_spaced_values_to_write +=
               def_levels[x] >= level_info_.repeated_ancestor_def_level ? 1 : 0;
         }
-        *null_count = *out_values_to_write - *out_spaced_values_to_write;
+        *null_count = batch_size - *out_values_to_write;

Review Comment:
   Why did this calculation need to change?



##########
cpp/src/parquet/column_writer.cc:
##########
@@ -949,11 +954,17 @@ void ColumnWriterImpl::BuildDataPageV2(int64_t 
definition_levels_rle_size,
   ResetPageStatistics();
 
   int32_t num_values = static_cast<int32_t>(num_buffered_values_);
-  int32_t null_count = static_cast<int32_t>(page_stats.null_count);
+  int32_t null_count = static_cast<int32_t>(num_buffered_nulls_);
   int32_t num_rows = static_cast<int32_t>(num_buffered_rows_);
   int32_t def_levels_byte_length = 
static_cast<int32_t>(definition_levels_rle_size);
   int32_t rep_levels_byte_length = 
static_cast<int32_t>(repetition_levels_rle_size);
 
+  // page_stats.null_count is not set when page_statistics_ is nullptr. It is 
only used
+  // here for safety check.
+  if (page_stats.has_null_count && page_stats.null_count != 
num_buffered_nulls_) {

Review Comment:
   Should this maybe be a `DCHECK` since users shouldn't be able to influence 
these values and wouldn't understand the exception?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to