mapleFU commented on code in PR #41362:
URL: https://github.com/apache/arrow/pull/41362#discussion_r1584183075
##########
cpp/src/parquet/column_reader.cc:
##########
@@ -1675,42 +1675,50 @@ class TypedRecordReader : public TypedColumnReaderImpl<DType>,
   //
   // \return Number of records delimited
   int64_t DelimitRecords(int64_t num_records, int64_t* values_seen) {
+    if (ARROW_PREDICT_FALSE(num_records == 0 || levels_position_ == levels_written_)) {
+      *values_seen = 0;
+      return 0;
+    }
     int64_t values_to_read = 0;
     int64_t records_read = 0;
-
-    const int16_t* def_levels = this->def_levels() + levels_position_;
-    const int16_t* rep_levels = this->rep_levels() + levels_position_;
-
+    const int16_t* const rep_levels = this->rep_levels();
+    const int16_t* const def_levels = this->def_levels();
     ARROW_DCHECK_GT(this->max_rep_level_, 0);
-
-    // Count logical records and number of values to read
-    while (levels_position_ < levels_written_) {
-      const int16_t rep_level = *rep_levels++;
-      if (rep_level == 0) {
-        // If at_record_start_ is true, we are seeing the start of a record
-        // for the second time, such as after repeated calls to
-        // DelimitRecords. In this case we must continue until we find
-        // another record start or exhausting the ColumnChunk
-        if (!at_record_start_) {
-          // We've reached the end of a record; increment the record count.
-          ++records_read;
-          if (records_read == num_records) {
-            // We've found the number of records we were looking for. Set
-            // at_record_start_ to true and break
-            at_record_start_ = true;
-            break;
-          }
-        }
-      }
+    // If at_record_start_ is true, we are seeing the start of a record
+    // for the second time, such as after repeated calls to
+    // DelimitRecords. In this case we must continue until we find
+    // another record start or exhausting the ColumnChunk
+    if (at_record_start_) {
+      ARROW_DCHECK_EQ(0, rep_levels[levels_position_]);
+      values_to_read += def_levels[levels_position_] == this->max_def_level_;
+      ++levels_position_;
       // We have decided to consume the level at this position; therefore we
       // must advance until we find another record boundary
       at_record_start_ = false;
+    }
-      const int16_t def_level = *def_levels++;
-      if (def_level == this->max_def_level_) {
-        ++values_to_read;
+    // Count logical records and number of non-null values to read
+    ARROW_DCHECK(!at_record_start_);
+    while (levels_position_ < levels_written_) {
+      int64_t stride =
+          std::min(levels_written_ - levels_position_, num_records - records_read);
+      const int64_t position_end = levels_position_ + stride;
+      for (int64_t i = levels_position_; i < position_end; ++i) {
Review Comment:
```diff
diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc
index d3f918186..25ac0b513 100644
--- a/cpp/src/parquet/column_reader.cc
+++ b/cpp/src/parquet/column_reader.cc
@@ -1703,10 +1703,9 @@ class TypedRecordReader : public TypedColumnReaderImpl<DType>,
       int64_t stride =
           std::min(levels_written_ - levels_position_, num_records - records_read);
       const int64_t position_end = levels_position_ + stride;
-      for (int64_t i = levels_position_; i < position_end; ++i) {
-        records_read += rep_levels[i] == 0;
-        values_to_read += def_levels[i] == this->max_def_level_;
-      }
+      records_read += std::count(rep_levels + levels_position_, rep_levels + position_end, 0);
+      values_to_read += std::count(def_levels + levels_position_, def_levels + position_end,
+                                   this->max_def_level_);
       levels_position_ = position_end;
       if (records_read == num_records) {
```
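Both forms compute the same counts. For context, here is a minimal self-contained sketch of the two strategies; the level buffers and `max_def_level` below are made-up illustrative values, not data from this PR:

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  // Made-up levels: rep_level == 0 starts a record, and
  // def_level == max_def_level marks a non-null leaf value.
  const std::vector<int16_t> rep_levels = {0, 1, 1, 0, 0, 1};
  const std::vector<int16_t> def_levels = {2, 2, 1, 0, 2, 2};
  const int16_t max_def_level = 2;

  // Fused single pass over both buffers, as in the PR's current loop.
  int64_t records = 0, values = 0;
  for (size_t i = 0; i < rep_levels.size(); ++i) {
    records += rep_levels[i] == 0;
    values += def_levels[i] == max_def_level;
  }

  // Two std::count passes, as in the suggestion above.
  const int64_t records2 =
      std::count(rep_levels.begin(), rep_levels.end(), int16_t{0});
  const int64_t values2 =
      std::count(def_levels.begin(), def_levels.end(), max_def_level);

  // Both strategies agree: 3 records, 4 non-null values.
  std::cout << records << " " << values << " | " << records2 << " " << values2
            << "\n";
  return 0;
}
```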
The `std::count` version runs a bit slower on my macOS machine with a Release (-O2) build:
```
RecordReaderSkipRecords/Repetition:0/BatchSize:1000                                  162494 ns    134518 ns   5261  bytes_per_second=35.4479G/s items_per_second=9.51548G/s
RecordReaderSkipRecords/Repetition:1/BatchSize:1000                                  950868 ns    730564 ns    960  bytes_per_second=3.47751G/s items_per_second=1.75207G/s
RecordReaderSkipRecords/Repetition:2/BatchSize:1000                                 2590388 ns   2251851 ns    315  bytes_per_second=1.20306G/s items_per_second=568.421M/s
RecordReaderReadRecords/Repetition:0/BatchSize:1000/ReadDense:1                      187780 ns    149951 ns   4921  bytes_per_second=31.7994G/s items_per_second=8.5361G/s
RecordReaderReadRecords/Repetition:0/BatchSize:1000/ReadDense:0                      193405 ns    152667 ns   4498  bytes_per_second=31.2339G/s items_per_second=8.38428G/s
RecordReaderReadRecords/Repetition:1/BatchSize:1000/ReadDense:1                     1076330 ns    860129 ns    836  bytes_per_second=2.95368G/s items_per_second=1.48815G/s
RecordReaderReadRecords/Repetition:1/BatchSize:1000/ReadDense:0                     6279539 ns   4948944 ns    143  bytes_per_second=525.671M/s items_per_second=258.641M/s
RecordReaderReadRecords/Repetition:2/BatchSize:1000/ReadDense:1                     3087965 ns   2390981 ns    314  bytes_per_second=1.13306G/s items_per_second=535.345M/s
RecordReaderReadRecords/Repetition:2/BatchSize:1000/ReadDense:0                     7156952 ns   6809000 ns    100  bytes_per_second=407.423M/s items_per_second=187.986M/s
RecordReaderReadAndSkipRecords/Repetition:0/BatchSize:10/LevelsPerPage:80000        5616643 ns   3536000 ns    207  bytes_per_second=1.34852G/s items_per_second=361.991M/s
RecordReaderReadAndSkipRecords/Repetition:0/BatchSize:1000/LevelsPerPage:80000       180200 ns    146042 ns   5469  bytes_per_second=32.6507G/s items_per_second=8.7646G/s
RecordReaderReadAndSkipRecords/Repetition:0/BatchSize:10000/LevelsPerPage:1000000   1629441 ns   1572834 ns    451  bytes_per_second=37.8963G/s items_per_second=10.1727G/s
RecordReaderReadAndSkipRecords/Repetition:1/BatchSize:10/LevelsPerPage:80000       14144133 ns  13841654 ns     52  bytes_per_second=187.948M/s items_per_second=92.4745M/s
RecordReaderReadAndSkipRecords/Repetition:1/BatchSize:1000/LevelsPerPage:80000      2673686 ns   2613805 ns    272  bytes_per_second=995.298M/s items_per_second=489.708M/s
RecordReaderReadAndSkipRecords/Repetition:1/BatchSize:10000/LevelsPerPage:1000000  32442177 ns  30155571 ns     21  bytes_per_second=1077.09M/s items_per_second=530.582M/s
RecordReaderReadAndSkipRecords/Repetition:2/BatchSize:10/LevelsPerPage:80000       17366862 ns  17322366 ns     41  bytes_per_second=160.148M/s items_per_second=73.8929M/s
RecordReaderReadAndSkipRecords/Repetition:2/BatchSize:100/LevelsPerPage:80000       5906808 ns   5796854 ns    123  bytes_per_second=478.56M/s items_per_second=220.809M/s
RecordReaderReadAndSkipRecords/Repetition:2/BatchSize:10000/LevelsPerPage:1000000  47098050 ns  46723000 ns     14  bytes_per_second=741.389M/s items_per_second=342.444M/s
```
I've decided to keep the current code unless there is a strong reason to change it (the loop was previously changed here: https://github.com/apache/arrow/pull/41362#discussion_r1580516737).
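
For anyone who wants to reproduce the comparison outside the Arrow benchmark suite, a rough standalone timing sketch might look like the following. This is a hypothetical harness, not the `RecordReader` benchmarks above; the buffer size and the level distribution are arbitrary assumptions:

```cpp
#include <algorithm>
#include <chrono>
#include <cstdint>
#include <iostream>
#include <random>
#include <vector>

int main() {
  // Arbitrary assumptions: 4M levels, values uniform in [0, 2].
  constexpr size_t kNumLevels = size_t{1} << 22;
  constexpr int16_t kMaxDefLevel = 2;
  std::mt19937 rng(42);
  std::uniform_int_distribution<int> dist(0, 2);
  std::vector<int16_t> rep_levels(kNumLevels), def_levels(kNumLevels);
  for (auto& v : rep_levels) v = static_cast<int16_t>(dist(rng));
  for (auto& v : def_levels) v = static_cast<int16_t>(dist(rng));

  // Time a counting strategy and print its result so the work is not
  // optimized away.
  auto time_it = [](const char* label, auto&& fn) {
    const auto t0 = std::chrono::steady_clock::now();
    const int64_t result = fn();
    const auto t1 = std::chrono::steady_clock::now();
    const std::chrono::duration<double, std::milli> ms = t1 - t0;
    std::cout << label << ": " << result << " in " << ms.count() << " ms\n";
  };

  // One fused pass with two counters (the shape of the current PR code).
  time_it("fused loop", [&]() -> int64_t {
    int64_t records = 0, values = 0;
    for (size_t i = 0; i < kNumLevels; ++i) {
      records += rep_levels[i] == 0;
      values += def_levels[i] == kMaxDefLevel;
    }
    return records + values;
  });

  // Two separate std::count passes (the shape of the suggested diff).
  time_it("std::count", [&]() -> int64_t {
    const int64_t records =
        std::count(rep_levels.begin(), rep_levels.end(), int16_t{0});
    const int64_t values =
        std::count(def_levels.begin(), def_levels.end(), kMaxDefLevel);
    return records + values;
  });
  return 0;
}
```

One plausible explanation for the numbers above (an assumption, not something profiled here): with -O2 the fused loop keeps both counters live over a single pass through memory, while the `std::count` form reads each level buffer in a separate pass, so the relative cost depends on how the buffers fit in cache.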
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]