rtpsw commented on PR #13880:
URL: https://github.com/apache/arrow/pull/13880#issuecomment-1235474550

   For posterity, I also made an attempt to improve performance by refactoring 
`AdvanceAndMemoize` for the different latest-key paths, but this resulted in 
slightly decreased performance yet.
   ```
   $ archery benchmark diff ARROW-17412.template-memoize.json baseline.json
   INFO:numexpr.utils:NumExpr defaulting to 8 threads.
   
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
   Non-regressions: (1)
   
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
                                                                                
                                                                benchmark       
 baseline       contender  change %                                             
                                                                                
                                                                                
                                                                                
                                                                              
counters
   
AsOfJoinOverhead/left_freq:400/left_cols:20/left_ids:500/left_batch_size:32000/num_right_tables:1/right_freq:400/right_cols:20/right_ids:500/real_time
 142.364 MiB/sec 166.963 MiB/sec    17.279 {'family_index': 0, 
'per_family_instance_index': 14, 'run_name': 
'AsOfJoinOverhead/left_freq:400/left_cols:20/left_ids:500/left_batch_size:32000/num_right_tables:1/right_freq:400/right_cols:20/right_ids:500/real_time',
 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 7, 
'input_rows_per_second': 867904.5056496062, 'maximum_peak_memory': 686301504.0}
   
   
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
   Regressions: (10)
   
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
                                                                                
                                                                 benchmark      
  baseline       contender  change %                                            
                                                                                
                                                                                
                                                                                
                                                                                
counters
    
AsOfJoinOverhead/left_freq:400/left_cols:20/left_ids:500/left_batch_size:4000/num_right_tables:10/right_freq:400/right_cols:20/right_ids:500/real_time
 344.387 MiB/sec 300.780 MiB/sec   -12.662 {'family_index': 0, 
'per_family_instance_index': 10, 'run_name': 
'AsOfJoinOverhead/left_freq:400/left_cols:20/left_ids:500/left_batch_size:4000/num_right_tables:10/right_freq:400/right_cols:20/right_ids:500/real_time',
 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 3, 
'input_rows_per_second': 2099512.4178041373, 'maximum_peak_memory': 148407104.0}
     
AsOfJoinOverhead/left_freq:400/left_cols:20/left_ids:500/left_batch_size:1000/num_right_tables:1/right_freq:400/right_cols:20/right_ids:500/real_time
 315.636 MiB/sec 269.865 MiB/sec   -14.501 {'family_index': 0, 
'per_family_instance_index': 12, 'run_name': 
'AsOfJoinOverhead/left_freq:400/left_cols:20/left_ids:500/left_batch_size:1000/num_right_tables:1/right_freq:400/right_cols:20/right_ids:500/real_time',
 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 16, 
'input_rows_per_second': 1924236.4216933611, 'maximum_peak_memory': 686301504.0}
    
AsOfJoinOverhead/left_freq:400/left_cols:20/left_ids:500/left_batch_size:4000/num_right_tables:50/right_freq:400/right_cols:20/right_ids:500/real_time
 323.553 MiB/sec 275.253 MiB/sec   -14.928  {'family_index': 0, 
'per_family_instance_index': 11, 'run_name': 
'AsOfJoinOverhead/left_freq:400/left_cols:20/left_ids:500/left_batch_size:4000/num_right_tables:50/right_freq:400/right_cols:20/right_ids:500/real_time',
 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 1, 
'input_rows_per_second': 1972499.868090142, 'maximum_peak_memory': 686301504.0}
   
AsOfJoinOverhead/left_freq:400/left_cols:100/left_ids:500/left_batch_size:4000/num_right_tables:1/right_freq:400/right_cols:100/right_ids:500/real_time
 404.655 MiB/sec 336.061 MiB/sec   -16.951  {'family_index': 0, 
'per_family_instance_index': 5, 'run_name': 
'AsOfJoinOverhead/left_freq:400/left_cols:100/left_ids:500/left_batch_size:4000/num_right_tables:1/right_freq:400/right_cols:100/right_ids:500/real_time',
 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 4, 
'input_rows_per_second': 522550.5184384509, 'maximum_peak_memory': 131071104.0}
     
AsOfJoinOverhead/left_freq:200/left_cols:20/left_ids:500/left_batch_size:4000/num_right_tables:1/right_freq:200/right_cols:20/right_ids:500/real_time
 355.718 MiB/sec 293.828 MiB/sec   -17.399    {'family_index': 0, 
'per_family_instance_index': 0, 'run_name': 
'AsOfJoinOverhead/left_freq:200/left_cols:20/left_ids:500/left_batch_size:4000/num_right_tables:1/right_freq:200/right_cols:20/right_ids:500/real_time',
 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 8, 
'input_rows_per_second': 2168591.3894433207, 'maximum_peak_memory': 54420864.0}
     
AsOfJoinOverhead/left_freq:400/left_cols:20/left_ids:100/left_batch_size:4000/num_right_tables:1/right_freq:400/right_cols:20/right_ids:100/real_time
 339.172 MiB/sec 277.452 MiB/sec   -18.197    {'family_index': 0, 
'per_family_instance_index': 6, 'run_name': 
'AsOfJoinOverhead/left_freq:400/left_cols:20/left_ids:100/left_batch_size:4000/num_right_tables:1/right_freq:400/right_cols:20/right_ids:100/real_time',
 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 84, 
'input_rows_per_second': 2067719.28348546, 'maximum_peak_memory': 131071104.0}
   
AsOfJoinOverhead/left_freq:400/left_cols:20/left_ids:1000/left_batch_size:4000/num_right_tables:1/right_freq:400/right_cols:20/right_ids:1000/real_time
 351.580 MiB/sec 285.147 MiB/sec   -18.896 {'family_index': 0, 
'per_family_instance_index': 8, 'run_name': 
'AsOfJoinOverhead/left_freq:400/left_cols:20/left_ids:1000/left_batch_size:4000/num_right_tables:1/right_freq:400/right_cols:20/right_ids:1000/real_time',
 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 9, 
'input_rows_per_second': 2143364.0434252876, 'maximum_peak_memory': 131071104.0}
     
AsOfJoinOverhead/left_freq:400/left_cols:20/left_ids:500/left_batch_size:4000/num_right_tables:1/right_freq:400/right_cols:20/right_ids:500/real_time
 349.482 MiB/sec 282.849 MiB/sec   -19.066   {'family_index': 0, 
'per_family_instance_index': 9, 'run_name': 
'AsOfJoinOverhead/left_freq:400/left_cols:20/left_ids:500/left_batch_size:4000/num_right_tables:1/right_freq:400/right_cols:20/right_ids:500/real_time',
 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 19, 
'input_rows_per_second': 2111813.458550057, 'maximum_peak_memory': 131071104.0}
   
AsOfJoinOverhead/left_freq:1000/left_cols:20/left_ids:500/left_batch_size:4000/num_right_tables:1/right_freq:1000/right_cols:20/right_ids:500/real_time
 348.038 MiB/sec 278.493 MiB/sec   -19.982 {'family_index': 0, 
'per_family_instance_index': 2, 'run_name': 
'AsOfJoinOverhead/left_freq:1000/left_cols:20/left_ids:500/left_batch_size:4000/num_right_tables:1/right_freq:1000/right_cols:20/right_ids:500/real_time',
 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 44, 
'input_rows_per_second': 2121769.8288677963, 'maximum_peak_memory': 54420864.0}
     
AsOfJoinOverhead/left_freq:400/left_cols:10/left_ids:500/left_batch_size:4000/num_right_tables:1/right_freq:400/right_cols:10/right_ids:500/real_time
 305.968 MiB/sec 223.128 MiB/sec   -27.075    {'family_index': 0, 
'per_family_instance_index': 3, 'run_name': 
'AsOfJoinOverhead/left_freq:400/left_cols:10/left_ids:500/left_batch_size:4000/num_right_tables:1/right_freq:400/right_cols:10/right_ids:500/real_time',
 'repetitions': 1, 'repetition_index': 0, 'threads': 1, 'iterations': 29, 
'input_rows_per_second': 3487291.181883503, 'maximum_peak_memory': 54420864.0}
   ```
   The code diff for this benchmark is:
   ```
   diff --git a/cpp/src/arrow/compute/exec/asof_join_node.cc 
b/cpp/src/arrow/compute/exec/asof_join_node.cc
   index d4e704ba6..eb13ba5a7 100644
   --- a/cpp/src/arrow/compute/exec/asof_join_node.cc
   +++ b/cpp/src/arrow/compute/exec/asof_join_node.cc
   @@ -302,6 +302,10 @@ class InputState {
        if (key_col_index_.size() == 0) {
          return 0;
        }
   +    return GetLatestKeyFastPath(batch, row);
   +  }
   +
   +  inline ByType GetLatestKeyFastPath(const RecordBatch* batch, row_index_t 
row) const {
        auto data = batch->column_data(key_col_index_[0]);
        switch (key_type_id_[0]) {
          LATEST_VAL_CASE(INT8, key_value)
   @@ -382,7 +386,8 @@ class InputState {
      // latest_time and latest_ref_row to the value that immediately pass the
      // specified timestamp.
      // Returns true if updates were made, false if not.
   -  Result<bool> AdvanceAndMemoize(OnType ts) {
   +  template <typename LatestKey>
   +  Result<bool> AdvanceAndMemoize(OnType ts, LatestKey get_latest_key) {
        // Advance the right side row index until we reach the latest right row 
(for each key)
        // for the given left timestamp.
    
   @@ -405,14 +410,33 @@ class InputState {
            must_hash_ = true;
            may_rehash_ = false;
            Rehash();
   +       break;
          }
   -      memo_.Store(rb, latest_ref_row_, latest_time, GetLatestKey());
   +      memo_.Store(rb, latest_ref_row_, latest_time, get_latest_key());
          updated = true;
          ARROW_ASSIGN_OR_RAISE(advanced, Advance());
        } while (advanced);
        return updated;
      }
    
   +  Result<bool> AdvanceAndMemoize(OnType ts) {
   +    if (key_col_index_.size() == 0) {
   +      return AdvanceAndMemoize(ts, []() { return 0; });
   +    }
   +    if (must_hash_) {
   +      return AdvanceAndMemoize(ts, [this]() { return 
key_hasher_->HashesFor(queue_.UnsyncFront().get())[latest_ref_row_]; });
   +    }
   +    ARROW_ASSIGN_OR_RAISE(bool updated,
   +                          AdvanceAndMemoize(ts, [this]() { return 
GetLatestKeyFastPath(queue_.UnsyncFront().get(), latest_ref_row_); }));
   +    if (must_hash_) {
   +      ARROW_ASSIGN_OR_RAISE(
   +          bool updated_after_rehash,
   +          AdvanceAndMemoize(ts, [this]() { return 
key_hasher_->HashesFor(queue_.UnsyncFront().get())[latest_ref_row_]; }));
   +      return updated || updated_after_rehash;
   +    }
   +    return updated;
   +  }
   +
      void Rehash() {
        MemoStore new_memo;
        for (const auto& entry : memo_.entries_) {
   ```


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to