mihaibudiu commented on code in PR #4725:
URL: https://github.com/apache/calcite/pull/4725#discussion_r2669821618
##########
linq4j/src/main/java/org/apache/calcite/linq4j/EnumerableDefaults.java:
##########
@@ -1685,6 +1686,279 @@ private static <TSource, TInner, TKey, TResult>
Enumerable<TResult> hashJoinWith
};
}
+ /**
+ * Left mark join implementation based on hash. It will keep all rows from
the left side and
+ * creates a new attribute to mark the rows from left input as having join
partners from right
+ * side or not.
+ *
+ * <p> Left mark join is produced by rewriting IN/SOME/EXISTS subqueries, so
left mark join will
+ * never contain both not null-safe join keys and non-equi predicates.
+ *
+ * @param outer Left input
+ * @param inner Right input
+ * @param outerKeyNullAwareSelector Function that extracts keys from the
row of left input
+ * (return NULL when a not null-safe key
has a NULL value)
+ * @param innerKeyNullAwareSelector Function that extracts keys from the
row of right input
+ * (return NULL when a not null-safe key
has a NULL value)
+ * @param outerNullSafeKeySelector Function that extracts the null-safe
keys from the row of
+ * left input
+ * @param innerNullSafeKeySelector Function that extracts the null-safe
keys from the row of
+ * right input
+ * @param atMostOneNotNullSafeKey There is at most one not null-safe key
in join keys
+ * @param resultSelector Function that concats the row of left
input and marker
+ * @param comparer Function that compares the keys
+ * @param nullSafeComparer Function that compares the null-safe
keys
+ * @param nonEquiPredicate Non-equi predicate that can return NULL
+ * @param equiPredicate Equi predicate that can return NULL
+ */
+ public static <TSource, TInner, TKey, TNsKey, TResult> Enumerable<TResult>
leftMarkHashJoin(
+ final Enumerable<TSource> outer, final Enumerable<TInner> inner,
+ final Function1<TSource, TKey> outerKeyNullAwareSelector,
+ final Function1<TInner, TKey> innerKeyNullAwareSelector,
+ final @Nullable Function1<TSource, TNsKey> outerNullSafeKeySelector,
+ final @Nullable Function1<TInner, TNsKey> innerNullSafeKeySelector,
+ final boolean atMostOneNotNullSafeKey,
+ final Function2<TSource, @Nullable Boolean, TResult> resultSelector,
+ final @Nullable EqualityComparer<TKey> comparer,
+ final @Nullable EqualityComparer<TNsKey> nullSafeComparer,
+ final @Nullable NullablePredicate2<TSource, TInner> nonEquiPredicate,
+ final NullablePredicate2<TSource, TInner> equiPredicate) {
+ if (atMostOneNotNullSafeKey) {
+ return leftMarkHashJoinOptimized(outer, inner, outerKeyNullAwareSelector,
+ innerKeyNullAwareSelector, outerNullSafeKeySelector,
innerNullSafeKeySelector,
+ resultSelector, comparer, nullSafeComparer, nonEquiPredicate);
+ }
+ return leftMarkHashJoinGeneral(outer, inner, outerKeyNullAwareSelector,
+ innerKeyNullAwareSelector, outerNullSafeKeySelector,
innerNullSafeKeySelector,
+ resultSelector, comparer, nullSafeComparer, nonEquiPredicate,
equiPredicate);
+ }
+
+ /**
+ * For other join types (especially INNER join), the hash table can be used
to quickly determine
+ * which right-side rows should be joined with a given left-side row. But
left mark join is more
+ * complicated. The <code>marker</code> indicates whether a left row has a
join partner on the
+ * right side, it is a three-valued boolean, meaning we need to know the
actual result (TRUE,
+ * FALSE, or NULL) of the join condition.
+ *
+ * <p> Join key comes in two categories:
+ * <ul>
+ * <li>null-safe key (IS NOT DISTINCT FROM): it only produces
TRUE/FALSE.</li>
+ * <li>not null-safe key (EQUALS): it produces a three-valued boolean.</li>
+ * </ul>
+ *
+ * <p> If all join keys are null-safe, we can get the comparison result
(TRUE or FALSE) based on
+ * hash table matching.
+ *
+ * <p> If there are multiple not null-safe join keys, the hash table
matching alone is
+ * insufficient to determine the comparison result. However, if there is
only one not null-safe
+ * key, we can record whether this key has any NULL values when constructing
the hash table.
+ * During probing, if no match is found in the hash table and there are any
NULL values on this
+ * unique not null-safe key, we can know that the <code>marker</code> is
NULL.
+ */
+ static <TSource, TInner, TKey, TNsKey, TResult> Enumerable<TResult>
leftMarkHashJoinOptimized(
+ final Enumerable<TSource> outer, final Enumerable<TInner> inner,
+ final Function1<TSource, TKey> outerKeyNullAwareSelector,
+ final Function1<TInner, TKey> innerKeyNullAwareSelector,
+ final @Nullable Function1<TSource, TNsKey> outerNullSafeKeySelector,
+ final @Nullable Function1<TInner, TNsKey> innerNullSafeKeySelector,
+ final Function2<TSource, @Nullable Boolean, TResult> resultSelector,
+ final @Nullable EqualityComparer<TKey> comparer,
+ final @Nullable EqualityComparer<TNsKey> nullSafeComparer,
+ final @Nullable NullablePredicate2<TSource, TInner> nonEquiPredicate) {
+ return new AbstractEnumerable<TResult>() {
Review Comment:
you could add this to the comments, in case someone compares this to the
paper.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]