mapleFU commented on code in PR #41362:
URL: https://github.com/apache/arrow/pull/41362#discussion_r1594919172


##########
cpp/src/parquet/column_reader.cc:
##########
@@ -1675,42 +1675,50 @@ class TypedRecordReader : public 
TypedColumnReaderImpl<DType>,
   //
   // \return Number of records delimited
   int64_t DelimitRecords(int64_t num_records, int64_t* values_seen) {
+    if (ARROW_PREDICT_FALSE(num_records == 0 || levels_position_ == 
levels_written_)) {
+      *values_seen = 0;
+      return 0;
+    }
     int64_t values_to_read = 0;
     int64_t records_read = 0;
-
-    const int16_t* def_levels = this->def_levels() + levels_position_;
-    const int16_t* rep_levels = this->rep_levels() + levels_position_;
-
+    const int16_t* const rep_levels = this->rep_levels();
+    const int16_t* const def_levels = this->def_levels();
     ARROW_DCHECK_GT(this->max_rep_level_, 0);
-
-    // Count logical records and number of values to read
-    while (levels_position_ < levels_written_) {
-      const int16_t rep_level = *rep_levels++;
-      if (rep_level == 0) {
-        // If at_record_start_ is true, we are seeing the start of a record
-        // for the second time, such as after repeated calls to
-        // DelimitRecords. In this case we must continue until we find
-        // another record start or exhausting the ColumnChunk
-        if (!at_record_start_) {
-          // We've reached the end of a record; increment the record count.
-          ++records_read;
-          if (records_read == num_records) {
-            // We've found the number of records we were looking for. Set
-            // at_record_start_ to true and break
-            at_record_start_ = true;
-            break;
-          }
-        }
-      }
+    // If at_record_start_ is true, we are seeing the start of a record
+    // for the second time, such as after repeated calls to
+    // DelimitRecords. In this case we must continue until we find
+    // another record start or exhausting the ColumnChunk
+    if (at_record_start_) {
+      ARROW_DCHECK_EQ(0, rep_levels[levels_position_]);
+      values_to_read += def_levels[levels_position_] == this->max_def_level_;
+      ++levels_position_;
       // We have decided to consume the level at this position; therefore we
       // must advance until we find another record boundary
       at_record_start_ = false;
+    }
 
-      const int16_t def_level = *def_levels++;
-      if (def_level == this->max_def_level_) {
-        ++values_to_read;
+    // Count logical records and number of non-null values to read

Review Comment:
   With:
   
   ```diff
   --- a/cpp/src/parquet/column_reader.cc
   +++ b/cpp/src/parquet/column_reader.cc
   @@ -1699,13 +1699,14 @@ class TypedRecordReader : public 
TypedColumnReaderImpl<DType>,
    
        // Count logical records and number of non-null values to read
        ARROW_DCHECK(!at_record_start_);
   +    int64_t begin_levels_position = levels_position_;
        while (levels_position_ < levels_written_) {
          int64_t stride =
              std::min(levels_written_ - levels_position_, num_records - 
records_read);
          const int64_t position_end = levels_position_ + stride;
          for (int64_t i = levels_position_; i < position_end; ++i) {
            records_read += rep_levels[i] == 0;
   -        values_to_read += def_levels[i] == this->max_def_level_;
   +        // values_to_read += def_levels[i] == this->max_def_level_;
          }
          levels_position_ = position_end;
          if (records_read == num_records) {
   @@ -1716,11 +1717,11 @@ class TypedRecordReader : public 
TypedColumnReaderImpl<DType>,
            at_record_start_ = true;
            // Remove last value if we have reaches the end of the record
            levels_position_ = levels_position_ - 1;
   -        values_to_read -= def_levels[levels_position_] == 
this->max_def_level_;
   +        // values_to_read -= def_levels[levels_position_] == 
this->max_def_level_;
            break;
          }
        }
   -    *values_seen = values_to_read;
   +    *values_seen = std::count(def_levels + begin_levels_position, 
def_levels + levels_position_, this->max_def_level_);
        return records_read;
      }
   ```
   
   ```
   RecordReaderSkipRecords/Repetition:0/BatchSize:1000                          
         142677 ns       136192 ns         5298 bytes_per_second=35.0122G/s 
items_per_second=9.39851G/s
   RecordReaderSkipRecords/Repetition:1/BatchSize:1000                          
         773494 ns       716574 ns          986 bytes_per_second=3.5454G/s 
items_per_second=1.78628G/s
   RecordReaderSkipRecords/Repetition:2/BatchSize:1000                          
        2730163 ns      2184899 ns          337 bytes_per_second=1.23993G/s 
items_per_second=585.839M/s
   RecordReaderReadRecords/Repetition:0/BatchSize:1000/ReadDense:1              
         148304 ns       143400 ns         4599 bytes_per_second=33.2522G/s 
items_per_second=8.92606G/s
   RecordReaderReadRecords/Repetition:0/BatchSize:1000/ReadDense:0              
         145492 ns       142997 ns         4651 bytes_per_second=33.3459G/s 
items_per_second=8.95122G/s
   RecordReaderReadRecords/Repetition:1/BatchSize:1000/ReadDense:1              
         819043 ns       813848 ns          840 bytes_per_second=3.12164G/s 
items_per_second=1.57278G/s
   RecordReaderReadRecords/Repetition:1/BatchSize:1000/ReadDense:0              
        4554815 ns      4539451 ns          153 bytes_per_second=573.09M/s 
items_per_second=281.972M/s
   RecordReaderReadRecords/Repetition:2/BatchSize:1000/ReadDense:1              
        2384075 ns      2146202 ns          336 bytes_per_second=1.26229G/s 
items_per_second=596.402M/s
   RecordReaderReadRecords/Repetition:2/BatchSize:1000/ReadDense:0              
        6736048 ns      6484066 ns          106 bytes_per_second=427.84M/s 
items_per_second=197.407M/s
   RecordReaderReadAndSkipRecords/Repetition:0/BatchSize:10/LevelsPerPage:80000 
        3446307 ns      3155848 ns          223 bytes_per_second=1.51096G/s 
items_per_second=405.596M/s
   
RecordReaderReadAndSkipRecords/Repetition:0/BatchSize:1000/LevelsPerPage:80000  
      139276 ns       128461 ns         5561 bytes_per_second=37.1192G/s 
items_per_second=9.96412G/s
   
RecordReaderReadAndSkipRecords/Repetition:0/BatchSize:10000/LevelsPerPage:1000000
    1609839 ns      1591719 ns          424 bytes_per_second=37.4467G/s 
items_per_second=10.052G/s
   RecordReaderReadAndSkipRecords/Repetition:1/BatchSize:10/LevelsPerPage:80000 
       13379542 ns     13350075 ns           53 bytes_per_second=194.869M/s 
items_per_second=95.8796M/s
   
RecordReaderReadAndSkipRecords/Repetition:1/BatchSize:1000/LevelsPerPage:80000  
     2575969 ns      2563573 ns          274 bytes_per_second=1014.8M/s 
items_per_second=499.303M/s
   
RecordReaderReadAndSkipRecords/Repetition:1/BatchSize:10000/LevelsPerPage:1000000
   30089898 ns     29941087 ns           23 bytes_per_second=1084.81M/s 
items_per_second=534.383M/s
   RecordReaderReadAndSkipRecords/Repetition:2/BatchSize:10/LevelsPerPage:80000 
       17028265 ns     16966634 ns           41 bytes_per_second=163.506M/s 
items_per_second=75.4422M/s
   
RecordReaderReadAndSkipRecords/Repetition:2/BatchSize:100/LevelsPerPage:80000   
     5797889 ns      5680685 ns          124 bytes_per_second=488.346M/s 
items_per_second=225.325M/s
   
RecordReaderReadAndSkipRecords/Repetition:2/BatchSize:10000/LevelsPerPage:1000000
   46209783 ns     46127533 ns           15 bytes_per_second=750.959M/s 
items_per_second=346.864M/s
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to