mapleFU commented on code in PR #41362:
URL: https://github.com/apache/arrow/pull/41362#discussion_r1584183075
##########
cpp/src/parquet/column_reader.cc:
##########
@@ -1675,42 +1675,50 @@ class TypedRecordReader : public TypedColumnReaderImpl<DType>,
   //
   // \return Number of records delimited
   int64_t DelimitRecords(int64_t num_records, int64_t* values_seen) {
+    if (ARROW_PREDICT_FALSE(num_records == 0 || levels_position_ == levels_written_)) {
+      *values_seen = 0;
+      return 0;
+    }
     int64_t values_to_read = 0;
     int64_t records_read = 0;
-
-    const int16_t* def_levels = this->def_levels() + levels_position_;
-    const int16_t* rep_levels = this->rep_levels() + levels_position_;
-
+    const int16_t* const rep_levels = this->rep_levels();
+    const int16_t* const def_levels = this->def_levels();
     ARROW_DCHECK_GT(this->max_rep_level_, 0);
-
-    // Count logical records and number of values to read
-    while (levels_position_ < levels_written_) {
-      const int16_t rep_level = *rep_levels++;
-      if (rep_level == 0) {
-        // If at_record_start_ is true, we are seeing the start of a record
-        // for the second time, such as after repeated calls to
-        // DelimitRecords. In this case we must continue until we find
-        // another record start or exhausting the ColumnChunk
-        if (!at_record_start_) {
-          // We've reached the end of a record; increment the record count.
-          ++records_read;
-          if (records_read == num_records) {
-            // We've found the number of records we were looking for. Set
-            // at_record_start_ to true and break
-            at_record_start_ = true;
-            break;
-          }
-        }
-      }
+    // If at_record_start_ is true, we are seeing the start of a record
+    // for the second time, such as after repeated calls to
+    // DelimitRecords. In this case we must continue until we find
+    // another record start or exhausting the ColumnChunk
+    if (at_record_start_) {
+      ARROW_DCHECK_EQ(0, rep_levels[levels_position_]);
+      values_to_read += def_levels[levels_position_] == this->max_def_level_;
+      ++levels_position_;
       // We have decided to consume the level at this position; therefore we
       // must advance until we find another record boundary
       at_record_start_ = false;
+    }
-      const int16_t def_level = *def_levels++;
-      if (def_level == this->max_def_level_) {
-        ++values_to_read;
+    // Count logical records and number of non-null values to read
+    ARROW_DCHECK(!at_record_start_);
+    while (levels_position_ < levels_written_) {
+      int64_t stride =
+          std::min(levels_written_ - levels_position_, num_records - records_read);
+      const int64_t position_end = levels_position_ + stride;
+      for (int64_t i = levels_position_; i < position_end; ++i) {
Review Comment:
```diff
diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc
index d3f918186..25ac0b513 100644
--- a/cpp/src/parquet/column_reader.cc
+++ b/cpp/src/parquet/column_reader.cc
@@ -1703,10 +1703,9 @@ class TypedRecordReader : public TypedColumnReaderImpl<DType>,
       int64_t stride =
           std::min(levels_written_ - levels_position_, num_records - records_read);
       const int64_t position_end = levels_position_ + stride;
-      for (int64_t i = levels_position_; i < position_end; ++i) {
-        records_read += rep_levels[i] == 0;
-        values_to_read += def_levels[i] == this->max_def_level_;
-      }
+      records_read += std::count(rep_levels + levels_position_, rep_levels + position_end, 0);
+      values_to_read += std::count(def_levels + levels_position_, def_levels + position_end,
+                                   this->max_def_level_);
       levels_position_ = position_end;
       if (records_read == num_records) {
```
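Both forms compute the same counts. For context, here is a minimal self-contained sketch of the two strategies; the level buffers and `max_def_level` below are made-up illustrative values, not data from this PR:

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  // Made-up levels: rep_level == 0 starts a record, and
  // def_level == max_def_level marks a non-null leaf value.
  const std::vector<int16_t> rep_levels = {0, 1, 1, 0, 0, 1};
  const std::vector<int16_t> def_levels = {2, 2, 1, 0, 2, 2};
  const int16_t max_def_level = 2;

  // Fused single pass over both buffers, as in the PR's current loop.
  int64_t records = 0, values = 0;
  for (size_t i = 0; i < rep_levels.size(); ++i) {
    records += rep_levels[i] == 0;
    values += def_levels[i] == max_def_level;
  }

  // Two std::count passes, as in the suggestion above.
  const int64_t records2 =
      std::count(rep_levels.begin(), rep_levels.end(), int16_t{0});
  const int64_t values2 =
      std::count(def_levels.begin(), def_levels.end(), max_def_level);

  // Both strategies agree: 3 records, 4 non-null values.
  std::cout << records << " " << values << " | " << records2 << " " << values2
            << "\n";
  return 0;
}
```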
The `std::count` version runs a bit slower on my macOS machine with a Release (-O2) build:
```
RecordReaderSkipRecords/Repetition:0/BatchSize:1000                                  162494 ns    134518 ns   5261  bytes_per_second=35.4479G/s items_per_second=9.51548G/s
RecordReaderSkipRecords/Repetition:1/BatchSize:1000                                  950868 ns    730564 ns    960  bytes_per_second=3.47751G/s items_per_second=1.75207G/s
RecordReaderSkipRecords/Repetition:2/BatchSize:1000                                 2590388 ns   2251851 ns    315  bytes_per_second=1.20306G/s items_per_second=568.421M/s
RecordReaderReadRecords/Repetition:0/BatchSize:1000/ReadDense:1                      187780 ns    149951 ns   4921  bytes_per_second=31.7994G/s items_per_second=8.5361G/s
RecordReaderReadRecords/Repetition:0/BatchSize:1000/ReadDense:0                      193405 ns    152667 ns   4498  bytes_per_second=31.2339G/s items_per_second=8.38428G/s
RecordReaderReadRecords/Repetition:1/BatchSize:1000/ReadDense:1                     1076330 ns    860129 ns    836  bytes_per_second=2.95368G/s items_per_second=1.48815G/s
RecordReaderReadRecords/Repetition:1/BatchSize:1000/ReadDense:0                     6279539 ns   4948944 ns    143  bytes_per_second=525.671M/s items_per_second=258.641M/s
RecordReaderReadRecords/Repetition:2/BatchSize:1000/ReadDense:1                     3087965 ns   2390981 ns    314  bytes_per_second=1.13306G/s items_per_second=535.345M/s
RecordReaderReadRecords/Repetition:2/BatchSize:1000/ReadDense:0                     7156952 ns   6809000 ns    100  bytes_per_second=407.423M/s items_per_second=187.986M/s
RecordReaderReadAndSkipRecords/Repetition:0/BatchSize:10/LevelsPerPage:80000        5616643 ns   3536000 ns    207  bytes_per_second=1.34852G/s items_per_second=361.991M/s
RecordReaderReadAndSkipRecords/Repetition:0/BatchSize:1000/LevelsPerPage:80000       180200 ns    146042 ns   5469  bytes_per_second=32.6507G/s items_per_second=8.7646G/s
RecordReaderReadAndSkipRecords/Repetition:0/BatchSize:10000/LevelsPerPage:1000000   1629441 ns   1572834 ns    451  bytes_per_second=37.8963G/s items_per_second=10.1727G/s
RecordReaderReadAndSkipRecords/Repetition:1/BatchSize:10/LevelsPerPage:80000       14144133 ns  13841654 ns     52  bytes_per_second=187.948M/s items_per_second=92.4745M/s
RecordReaderReadAndSkipRecords/Repetition:1/BatchSize:1000/LevelsPerPage:80000      2673686 ns   2613805 ns    272  bytes_per_second=995.298M/s items_per_second=489.708M/s
RecordReaderReadAndSkipRecords/Repetition:1/BatchSize:10000/LevelsPerPage:1000000  32442177 ns  30155571 ns     21  bytes_per_second=1077.09M/s items_per_second=530.582M/s
RecordReaderReadAndSkipRecords/Repetition:2/BatchSize:10/LevelsPerPage:80000       17366862 ns  17322366 ns     41  bytes_per_second=160.148M/s items_per_second=73.8929M/s
RecordReaderReadAndSkipRecords/Repetition:2/BatchSize:100/LevelsPerPage:80000       5906808 ns   5796854 ns    123  bytes_per_second=478.56M/s items_per_second=220.809M/s
RecordReaderReadAndSkipRecords/Repetition:2/BatchSize:10000/LevelsPerPage:1000000  47098050 ns  46723000 ns     14  bytes_per_second=741.389M/s items_per_second=342.444M/s
```
I've decided to keep the current code unless there is a strong reason to change it (the loop was previously changed here: https://github.com/apache/arrow/pull/41362#discussion_r1580516737).
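
For anyone who wants to reproduce the comparison outside the Arrow benchmark suite, a rough standalone timing sketch might look like the following. This is a hypothetical harness, not the `RecordReader` benchmarks above; the buffer size and the level distribution are arbitrary assumptions:

```cpp
#include <algorithm>
#include <chrono>
#include <cstdint>
#include <iostream>
#include <random>
#include <vector>

int main() {
  // Arbitrary assumptions: 4M levels, values uniform in [0, 2].
  constexpr size_t kNumLevels = size_t{1} << 22;
  constexpr int16_t kMaxDefLevel = 2;
  std::mt19937 rng(42);
  std::uniform_int_distribution<int> dist(0, 2);
  std::vector<int16_t> rep_levels(kNumLevels), def_levels(kNumLevels);
  for (auto& v : rep_levels) v = static_cast<int16_t>(dist(rng));
  for (auto& v : def_levels) v = static_cast<int16_t>(dist(rng));

  // Time a counting strategy and print its result so the work is not
  // optimized away.
  auto time_it = [](const char* label, auto&& fn) {
    const auto t0 = std::chrono::steady_clock::now();
    const int64_t result = fn();
    const auto t1 = std::chrono::steady_clock::now();
    const std::chrono::duration<double, std::milli> ms = t1 - t0;
    std::cout << label << ": " << result << " in " << ms.count() << " ms\n";
  };

  // One fused pass with two counters (the shape of the current PR code).
  time_it("fused loop", [&]() -> int64_t {
    int64_t records = 0, values = 0;
    for (size_t i = 0; i < kNumLevels; ++i) {
      records += rep_levels[i] == 0;
      values += def_levels[i] == kMaxDefLevel;
    }
    return records + values;
  });

  // Two separate std::count passes (the shape of the suggested diff).
  time_it("std::count", [&]() -> int64_t {
    const int64_t records =
        std::count(rep_levels.begin(), rep_levels.end(), int16_t{0});
    const int64_t values =
        std::count(def_levels.begin(), def_levels.end(), kMaxDefLevel);
    return records + values;
  });
  return 0;
}
```

One plausible explanation for the numbers above (an assumption, not something profiled here): with -O2 the fused loop keeps both counters live over a single pass through memory, while the `std::count` form reads each level buffer in a separate pass, so the relative cost depends on how the buffers fit in cache.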
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]