fatemehp commented on code in PR #17877:
URL: https://github.com/apache/arrow/pull/17877#discussion_r1113542602
##########
cpp/src/parquet/column_reader.cc:
##########
@@ -1803,7 +1805,100 @@ class TypedRecordReader : public TypedColumnReaderImpl<DType>,
CheckNumberDecoded(num_decoded, values_to_read);
}
- // Return number of logical records read
+ // Reads repeated records and returns number of records read. Fills in
+ // values_to_read and null_count.
+ int64_t ReadRepeatedRecords(int64_t num_records, int64_t* values_to_read,
+ int64_t* null_count) {
+ const int64_t start_levels_position = levels_position_;
+ // For repeated fields, DelimitRecords will update both records_read and
+ // values_to_read.
+ int64_t records_read = DelimitRecords(num_records, values_to_read);
+ if (read_dense_for_nullable_) {
+ ReadValuesDense(*values_to_read);
+ // Any value with def_level < max_def_level is considered null.
+ *null_count = levels_position_ - start_levels_position - *values_to_read;
+ } else {
+ ReadSpacedForNullable(start_levels_position, values_to_read, null_count);
+ DCHECK_EQ(*null_count, levels_position_ - start_levels_position - *values_to_read);
+ }
+ return records_read;
+ }
+
+ // Reads optional records and returns number of records read. Fills in
+ // values_to_read and null_count.
+ int64_t ReadOptionalRecords(int64_t num_records, int64_t* values_to_read,
+ int64_t* null_count) {
+ const int64_t start_levels_position = levels_position_;
+ // No repetition levels, so skip the delimiting logic. Each level represents
+ // a null or not-null entry.
+ int64_t records_read =
+ std::min<int64_t>(levels_written_ - levels_position_, num_records);
+ // This is advanced by DelimitRecords, which we skipped
+ levels_position_ += records_read;
+
+ if (read_dense_for_nullable_) {
+ ReadDenseForOptional(start_levels_position, values_to_read, null_count);
+ } else {
+ ReadSpacedForNullable(start_levels_position, values_to_read, null_count);
+ }
+ return records_read;
+ }
+
+ // Reads required records and returns number of records read. Fills in
+ // values_to_read and null_count.
+ // null_count will always be 0. The argument is kept to stay consistent with
+ // the other functions for repeated/optional fields.
+ int64_t ReadRequiredRecords(int64_t num_records, int64_t* values_to_read,
+ int64_t* null_count) {
+ *values_to_read = num_records;
+ *null_count = 0;
+ ReadValuesDense(*values_to_read);
+ return num_records;
+ }
+
+ // Reads dense for optional records. First, it figures out number of values
+ // to read and null count.
+ void ReadDenseForOptional(int64_t start_levels_position, int64_t* values_to_read,
+ int64_t* null_count) {
+ // levels_position_ must already be incremented based on number of records
+ // read.
+ DCHECK_GE(levels_position_, start_levels_position);
+
+ // When reading dense we need to figure out number of values to read.
+ const int16_t* def_levels = this->def_levels();
+ for (int i = start_levels_position; i < levels_position_; ++i) {
+ if (def_levels[i] == this->max_def_level_) {
+ ++(*values_to_read);
+ } else {
+ ++(*null_count);
Review Comment:
I changed the contract so that we do not populate null_count when reading
dense. It turns out that null_count is the number of null values for which we
left space in the values vector, so I clarified the null_count() contract. I
found this through Arrow's internal tests; see the [reconstruct_internal_test.cc
test case
ThreeLevelListOptionalOptional](https://github.com/apache/arrow/blob/45918a90a6ca1cf3fd67c256a7d6a240249e555a/cpp/src/parquet/arrow/reconstruct_internal_test.cc#L805).
There, the empty list was not counted towards the null count in the record
reader.
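
To make that contract concrete, here is a minimal standalone sketch, not the
TypedRecordReader code itself. It assumes a three-level list schema in the
spirit of the ThreeLevelListOptionalOptional test (optional list, optional
element), and the `repeated_ancestor_def_level` threshold used below is an
illustrative assumption, not something taken from this PR. It shows how def
levels split into real values, nulls for which a slot is reserved, and levels
(empty or null lists) that get no slot at all:

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

struct LeafCounts {
  int64_t values_to_read = 0;  // def == max_def_level: a real value is stored
  int64_t null_count = 0;      // a slot is reserved in the values vector, but it is null
  int64_t no_slot = 0;         // empty or null list: no slot in the values vector
};

// Counts how each def level contributes. A level only gets a slot in the
// (spaced) values vector if the record reaches the innermost repeated
// ancestor, i.e. def >= repeated_ancestor_def_level.
LeafCounts CountLevels(const std::vector<int16_t>& def_levels,
                       int16_t max_def_level,
                       int16_t repeated_ancestor_def_level) {
  LeafCounts counts;
  for (int16_t def : def_levels) {
    if (def == max_def_level) {
      ++counts.values_to_read;
    } else if (def >= repeated_ancestor_def_level) {
      ++counts.null_count;  // null element, but space was left for it
    } else {
      ++counts.no_slot;  // empty list (def 1) or null list (def 0): no slot
    }
  }
  return counts;
}

int main() {
  // Records for: a = [7], a = [] (empty list), a = [null], a = null
  std::vector<int16_t> def_levels = {3, 1, 2, 0};
  LeafCounts c = CountLevels(def_levels, /*max_def_level=*/3,
                             /*repeated_ancestor_def_level=*/2);
  // Prints "1 1 2": one real value, one null element with a reserved slot,
  // and two levels (empty list, null list) with no slot at all.
  std::cout << c.values_to_read << " " << c.null_count << " " << c.no_slot << "\n";
  return 0;
}
```

Under this split, the empty list (def level 1 in this schema) never reaches
the repeated ancestor, so no slot is reserved for it and it does not count
towards null_count. When reading dense, no slots are reserved for nulls
either, which is why the revised contract leaves null_count unpopulated on
that path.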