github-actions[bot] commented on code in PR #25785:
URL: https://github.com/apache/doris/pull/25785#discussion_r1368420691


##########
be/src/vec/common/hash_table/hash_map.h:
##########
@@ -216,53 +216,102 @@ class JoinHashMapTable : public HashMapTable<Key, Cell, Hash, Grower, Allocator>
         return phmap::priv::NormalizeCapacity(expect_bucket_size) + 1;
     }
 
-    void reserve(int num_elem) {
+    template <int JoinOpType>
+    void prepare_build(size_t num_elem, int batch_size) {
+        max_batch_size = batch_size;
         bucket_size = calc_bucket_size(num_elem + 1);
         first.resize(bucket_size, 0);
         next.resize(num_elem);
-    }
 
-    void build(const Key* __restrict keys, const size_t* __restrict 
hash_values, size_t num_elem,
-               int batch_size) {
-        _batch_size = batch_size;
-        bucket_size = calc_bucket_size(num_elem);
-        first.resize(bucket_size, 0);
-        next.resize(num_elem);
+        if constexpr (JoinOpType == doris::TJoinOp::FULL_OUTER_JOIN ||
+                      JoinOpType == doris::TJoinOp::RIGHT_OUTER_JOIN ||
+                      JoinOpType == doris::TJoinOp::RIGHT_ANTI_JOIN ||
+                      JoinOpType == doris::TJoinOp::RIGHT_SEMI_JOIN) {
+            visited.resize(num_elem, 0);
+        }
+    }
 
+    void build(const Key* __restrict keys, const size_t* __restrict 
bucket_nums,
+               const size_t num_elem) {
         build_keys = keys;
         for (size_t i = 1; i < num_elem; i++) {
-            next[i] = first[hash_values[i]];
-            first[hash_values[i]] = i;
+            next[i] = first[bucket_nums[i]];
+            first[bucket_nums[i]] = i;
         }
     }
 
     template <int JoinOpType>
-    auto find_batch(const Key* __restrict keys, const size_t* __restrict 
hash_values, int probe_idx,
+    auto find_batch(const Key* __restrict keys, const size_t* __restrict 
bucket_nums, int probe_idx,
                     int probe_rows, std::vector<uint32_t>& probe_idxs,
                     std::vector<uint32_t>& build_idxs) {
         if constexpr (JoinOpType == doris::TJoinOp::INNER_JOIN ||
-                      JoinOpType == doris::TJoinOp::LEFT_OUTER_JOIN) {
-            return _find_batch_inner_outer_join<JoinOpType>(keys, hash_values, 
probe_idx,
+                      JoinOpType == doris::TJoinOp::FULL_OUTER_JOIN ||
+                      JoinOpType == doris::TJoinOp::LEFT_OUTER_JOIN ||
+                      JoinOpType == doris::TJoinOp::RIGHT_OUTER_JOIN) {
+            return _find_batch_inner_outer_join<JoinOpType>(keys, bucket_nums, 
probe_idx,
                                                             probe_rows, 
probe_idxs, build_idxs);
         }
         if constexpr (JoinOpType == doris::TJoinOp::LEFT_ANTI_JOIN ||
                       JoinOpType == doris::TJoinOp::LEFT_SEMI_JOIN) {
-            return _find_batch_left_semi_anti<JoinOpType>(keys, hash_values, 
probe_idx, probe_rows,
+            return _find_batch_left_semi_anti<JoinOpType>(keys, bucket_nums, 
probe_idx, probe_rows,
                                                           probe_idxs);
         }
+        if constexpr (JoinOpType == doris::TJoinOp::RIGHT_ANTI_JOIN ||
+                      JoinOpType == doris::TJoinOp::RIGHT_SEMI_JOIN) {
+            return _find_batch_right_semi_anti(keys, bucket_nums, probe_idx, 
probe_rows);
+        }
         return std::pair {0, 0};
     }
 
     size_t get_bucket_mask() { return bucket_size - 1; }
 
+    template <int JoinOpType>
+    bool iterate_map(std::vector<uint32_t>& build_idxs) const {
+        const auto batch_size = max_batch_size;
+        const auto elem_num = visited.size();
+        int count = 0;
+        build_idxs.reserve(batch_size);
+
+        while (count < batch_size && iter_idx < elem_num) {
+            const auto matched = visited[iter_idx];
+            build_idxs[count] = iter_idx;
+            if constexpr (JoinOpType != doris::TJoinOp::RIGHT_ANTI_JOIN) {
+                count += !matched;
+            } else {
+                count += matched;
+            }
+            iter_idx++;
+        }
+
+        build_idxs.resize(count);
+        return iter_idx == elem_num;
+    }
+
 private:
+    auto _find_batch_right_semi_anti(const Key* __restrict keys,
+                                     const size_t* __restrict bucket_nums, int 
probe_idx,
+                                     int probe_rows) {
+        while (LIKELY(probe_idx < probe_rows)) {
+            auto build_idx = first[bucket_nums[probe_idx]];
+
+            while (build_idx) {
+                if (keys[probe_idx] == build_keys[build_idx]) {
+                    visited[build_idx] = 1;
+                }
+                build_idx = next[build_idx];
+            }
+        }
+        return std::pair {probe_rows, 0};
+    }
+
     template <int JoinOpType>
     auto _find_batch_left_semi_anti(const Key* __restrict keys,
-                                    const size_t* __restrict hash_values, int 
probe_idx,
+                                    const size_t* __restrict bucket_nums, int 
probe_idx,
                                     int probe_rows, std::vector<uint32_t>& 
probe_idxs) {
+        const auto batch_size = max_batch_size;
         int matched_cnt = 0;
-        while (LIKELY(probe_idx < probe_rows && matched_cnt < _batch_size)) {
-            uint32_t build_idx = first[hash_values[probe_idx]];
+        while (LIKELY(probe_idx < probe_rows && matched_cnt < batch_size)) {

Review Comment:
   warning: boolean expression can be simplified by DeMorgan's theorem [readability-simplify-boolean-expr]
   ```cpp
           while (LIKELY(probe_idx < probe_rows && matched_cnt < batch_size)) {
                  ^
   ```
   <details>
   <summary>Additional context</summary>
   
   **be/src/common/compiler_util.h:34:** expanded from macro 'LIKELY'
   ```cpp
   #define LIKELY(expr) __builtin_expect(!!(expr), 1)
                                          ^
   ```
   
   </details>
   



##########
be/src/vec/common/hash_table/hash_map.h:
##########
@@ -314,25 +368,31 @@
             current_build_idx = 0;
             do_the_probe();
         }
-        while (LIKELY(probe_idx < probe_rows && matched_cnt < _batch_size)) {
-            build_idx = first[hash_values[probe_idx]];
+        while (LIKELY(probe_idx < probe_rows && matched_cnt < batch_size)) {

Review Comment:
   warning: boolean expression can be simplified by DeMorgan's theorem [readability-simplify-boolean-expr]
   ```cpp
           while (LIKELY(probe_idx < probe_rows && matched_cnt < batch_size)) {
                  ^
   ```
   <details>
   <summary>Additional context</summary>
   
   **be/src/common/compiler_util.h:34:** expanded from macro 'LIKELY'
   ```cpp
   #define LIKELY(expr) __builtin_expect(!!(expr), 1)
                                          ^
   ```
   
   </details>
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to