mapleFU commented on code in PR #41362:
URL: https://github.com/apache/arrow/pull/41362#discussion_r1594919172
##########
cpp/src/parquet/column_reader.cc:
##########
@@ -1675,42 +1675,50 @@ class TypedRecordReader : public
TypedColumnReaderImpl<DType>,
//
// \return Number of records delimited
int64_t DelimitRecords(int64_t num_records, int64_t* values_seen) {
+ if (ARROW_PREDICT_FALSE(num_records == 0 || levels_position_ ==
levels_written_)) {
+ *values_seen = 0;
+ return 0;
+ }
int64_t values_to_read = 0;
int64_t records_read = 0;
-
- const int16_t* def_levels = this->def_levels() + levels_position_;
- const int16_t* rep_levels = this->rep_levels() + levels_position_;
-
+ const int16_t* const rep_levels = this->rep_levels();
+ const int16_t* const def_levels = this->def_levels();
ARROW_DCHECK_GT(this->max_rep_level_, 0);
-
- // Count logical records and number of values to read
- while (levels_position_ < levels_written_) {
- const int16_t rep_level = *rep_levels++;
- if (rep_level == 0) {
- // If at_record_start_ is true, we are seeing the start of a record
- // for the second time, such as after repeated calls to
- // DelimitRecords. In this case we must continue until we find
- // another record start or we exhaust the ColumnChunk
- if (!at_record_start_) {
- // We've reached the end of a record; increment the record count.
- ++records_read;
- if (records_read == num_records) {
- // We've found the number of records we were looking for. Set
- // at_record_start_ to true and break
- at_record_start_ = true;
- break;
- }
- }
- }
+ // If at_record_start_ is true, we are seeing the start of a record
+ // for the second time, such as after repeated calls to
+ // DelimitRecords. In this case we must continue until we find
+ // another record start or we exhaust the ColumnChunk
+ if (at_record_start_) {
+ ARROW_DCHECK_EQ(0, rep_levels[levels_position_]);
+ values_to_read += def_levels[levels_position_] == this->max_def_level_;
+ ++levels_position_;
// We have decided to consume the level at this position; therefore we
// must advance until we find another record boundary
at_record_start_ = false;
+ }
- const int16_t def_level = *def_levels++;
- if (def_level == this->max_def_level_) {
- ++values_to_read;
+ // Count logical records and number of non-null values to read
Review Comment:
With:
```diff
--- a/cpp/src/parquet/column_reader.cc
+++ b/cpp/src/parquet/column_reader.cc
@@ -1699,13 +1699,14 @@ class TypedRecordReader : public
TypedColumnReaderImpl<DType>,
// Count logical records and number of non-null values to read
ARROW_DCHECK(!at_record_start_);
+ int64_t begin_levels_position = levels_position_;
while (levels_position_ < levels_written_) {
int64_t stride =
std::min(levels_written_ - levels_position_, num_records -
records_read);
const int64_t position_end = levels_position_ + stride;
for (int64_t i = levels_position_; i < position_end; ++i) {
records_read += rep_levels[i] == 0;
- values_to_read += def_levels[i] == this->max_def_level_;
+ // values_to_read += def_levels[i] == this->max_def_level_;
}
levels_position_ = position_end;
if (records_read == num_records) {
@@ -1716,11 +1717,11 @@ class TypedRecordReader : public
TypedColumnReaderImpl<DType>,
at_record_start_ = true;
// Remove last value if we have reached the end of the record
levels_position_ = levels_position_ - 1;
- values_to_read -= def_levels[levels_position_] ==
this->max_def_level_;
+ // values_to_read -= def_levels[levels_position_] ==
this->max_def_level_;
break;
}
}
- *values_seen = values_to_read;
+ *values_seen = std::count(def_levels + begin_levels_position,
def_levels + levels_position_, this->max_def_level_);
return records_read;
}
```
```
RecordReaderSkipRecords/Repetition:0/BatchSize:1000
142677 ns 136192 ns 5298 bytes_per_second=35.0122G/s
items_per_second=9.39851G/s
RecordReaderSkipRecords/Repetition:1/BatchSize:1000
773494 ns 716574 ns 986 bytes_per_second=3.5454G/s
items_per_second=1.78628G/s
RecordReaderSkipRecords/Repetition:2/BatchSize:1000
2730163 ns 2184899 ns 337 bytes_per_second=1.23993G/s
items_per_second=585.839M/s
RecordReaderReadRecords/Repetition:0/BatchSize:1000/ReadDense:1
148304 ns 143400 ns 4599 bytes_per_second=33.2522G/s
items_per_second=8.92606G/s
RecordReaderReadRecords/Repetition:0/BatchSize:1000/ReadDense:0
145492 ns 142997 ns 4651 bytes_per_second=33.3459G/s
items_per_second=8.95122G/s
RecordReaderReadRecords/Repetition:1/BatchSize:1000/ReadDense:1
819043 ns 813848 ns 840 bytes_per_second=3.12164G/s
items_per_second=1.57278G/s
RecordReaderReadRecords/Repetition:1/BatchSize:1000/ReadDense:0
4554815 ns 4539451 ns 153 bytes_per_second=573.09M/s
items_per_second=281.972M/s
RecordReaderReadRecords/Repetition:2/BatchSize:1000/ReadDense:1
2384075 ns 2146202 ns 336 bytes_per_second=1.26229G/s
items_per_second=596.402M/s
RecordReaderReadRecords/Repetition:2/BatchSize:1000/ReadDense:0
6736048 ns 6484066 ns 106 bytes_per_second=427.84M/s
items_per_second=197.407M/s
RecordReaderReadAndSkipRecords/Repetition:0/BatchSize:10/LevelsPerPage:80000
3446307 ns 3155848 ns 223 bytes_per_second=1.51096G/s
items_per_second=405.596M/s
RecordReaderReadAndSkipRecords/Repetition:0/BatchSize:1000/LevelsPerPage:80000
139276 ns 128461 ns 5561 bytes_per_second=37.1192G/s
items_per_second=9.96412G/s
RecordReaderReadAndSkipRecords/Repetition:0/BatchSize:10000/LevelsPerPage:1000000
1609839 ns 1591719 ns 424 bytes_per_second=37.4467G/s
items_per_second=10.052G/s
RecordReaderReadAndSkipRecords/Repetition:1/BatchSize:10/LevelsPerPage:80000
13379542 ns 13350075 ns 53 bytes_per_second=194.869M/s
items_per_second=95.8796M/s
RecordReaderReadAndSkipRecords/Repetition:1/BatchSize:1000/LevelsPerPage:80000
2575969 ns 2563573 ns 274 bytes_per_second=1014.8M/s
items_per_second=499.303M/s
RecordReaderReadAndSkipRecords/Repetition:1/BatchSize:10000/LevelsPerPage:1000000
30089898 ns 29941087 ns 23 bytes_per_second=1084.81M/s
items_per_second=534.383M/s
RecordReaderReadAndSkipRecords/Repetition:2/BatchSize:10/LevelsPerPage:80000
17028265 ns 16966634 ns 41 bytes_per_second=163.506M/s
items_per_second=75.4422M/s
RecordReaderReadAndSkipRecords/Repetition:2/BatchSize:100/LevelsPerPage:80000
5797889 ns 5680685 ns 124 bytes_per_second=488.346M/s
items_per_second=225.325M/s
RecordReaderReadAndSkipRecords/Repetition:2/BatchSize:10000/LevelsPerPage:1000000
46209783 ns 46127533 ns 15 bytes_per_second=750.959M/s
items_per_second=346.864M/s
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]