mihaibudiu commented on code in PR #4725:
URL: https://github.com/apache/calcite/pull/4725#discussion_r2665714446
##########
core/src/main/java/org/apache/calcite/rel/core/Join.java:
##########
@@ -260,9 +268,20 @@ protected int deepHashCode0() {
}
@Override protected RelDataType deriveRowType() {
+ RelDataTypeFactory typeFactory = getCluster().getTypeFactory();
+ RelDataType rightType = right.getRowType();
+ if (joinType == JoinRelType.LEFT_MARK) {
Review Comment:
this is weird, the type depends on the condition type but not on the actual
right input type?
##########
linq4j/src/main/java/org/apache/calcite/linq4j/ExtendedEnumerable.java:
##########
@@ -646,6 +647,41 @@ <TInner, TKey, TResult> Enumerable<TResult>
hashJoin(Enumerable<TInner> inner,
boolean generateNullsOnLeft, boolean generateNullsOnRight,
Predicate2<TSource, TInner> predicate);
+ /**
+ * Mark each row of the current enumerable to see if it has a join partner
in the
+ * <code>inner</code>. Whether a join partner exists depends on:
+ * - matching keys
+ * - non-equi predicate (if provided)
+ *
+ * @param inner Inner enumerable
+ * @param outerKeyNullAwareSelector Function that extracts keys from the
current enumerable
+ * (return NULL when a not null-safe key
has a NULL value)
+ * @param innerKeyNullAwareSelector Function that extracts keys from the
inner enumerable
+ * (return NULL when a not null-safe key
has a NULL value)
+ * @param outerNullSafeKeySelector Function that extracts null-safe keys
from the current
+ * enumerable
+ * @param innerNullSafeKeySelector Function that extracts null-safe keys
from the inner
+ * enumerable
+ * @param atMostOneNotNullSafeKey There is at most one not null-safe key
in join keys
Review Comment:
True when ...
##########
linq4j/src/main/java/org/apache/calcite/linq4j/EnumerableDefaults.java:
##########
@@ -1685,6 +1686,279 @@ private static <TSource, TInner, TKey, TResult>
Enumerable<TResult> hashJoinWith
};
}
+ /**
+ * Left mark join implementation based on hash. It will keep all rows from
the left side and
+ * creates a new attribute to mark the rows from left input as having join
partners from right
+ * side or not.
+ *
+ * <p> Left mark join is produced by rewriting IN/SOME/EXISTS subqueries, so
left mark join will
+ * never contain both not null-safe join keys and non-equi predicates.
+ *
+ * @param outer Left input
+ * @param inner Right input
+ * @param outerKeyNullAwareSelector Function that extracts keys from the
row of left input
+ * (return NULL when a not null-safe key
has a NULL value)
+ * @param innerKeyNullAwareSelector Function that extracts keys from the
row of right input
+ * (return NULL when a not null-safe key
has a NULL value)
+ * @param outerNullSafeKeySelector Function that extracts the null-safe
keys from the row of
+ * left input
+ * @param innerNullSafeKeySelector Function that extracts the null-safe
keys from the row of
+ * right input
+ * @param atMostOneNotNullSafeKey There is at most one not null-safe key
in join keys
+ * @param resultSelector Function that concats the row of left
input and marker
+ * @param comparer Function that compares the keys
+ * @param nullSafeComparer Function that compares the null-safe
keys
+ * @param nonEquiPredicate Non-equi predicate that can return NULL
+ * @param equiPredicate Equi predicate that can return NULL
+ */
+ public static <TSource, TInner, TKey, TNsKey, TResult> Enumerable<TResult>
leftMarkHashJoin(
+ final Enumerable<TSource> outer, final Enumerable<TInner> inner,
+ final Function1<TSource, TKey> outerKeyNullAwareSelector,
+ final Function1<TInner, TKey> innerKeyNullAwareSelector,
+ final @Nullable Function1<TSource, TNsKey> outerNullSafeKeySelector,
+ final @Nullable Function1<TInner, TNsKey> innerNullSafeKeySelector,
+ final boolean atMostOneNotNullSafeKey,
+ final Function2<TSource, @Nullable Boolean, TResult> resultSelector,
+ final @Nullable EqualityComparer<TKey> comparer,
+ final @Nullable EqualityComparer<TNsKey> nullSafeComparer,
+ final @Nullable NullablePredicate2<TSource, TInner> nonEquiPredicate,
+ final NullablePredicate2<TSource, TInner> equiPredicate) {
+ if (atMostOneNotNullSafeKey) {
+ return leftMarkHashJoinOptimized(outer, inner, outerKeyNullAwareSelector,
+ innerKeyNullAwareSelector, outerNullSafeKeySelector,
innerNullSafeKeySelector,
+ resultSelector, comparer, nullSafeComparer, nonEquiPredicate);
+ }
+ return leftMarkHashJoinGeneral(outer, inner, outerKeyNullAwareSelector,
+ innerKeyNullAwareSelector, outerNullSafeKeySelector,
innerNullSafeKeySelector,
+ resultSelector, comparer, nullSafeComparer, nonEquiPredicate,
equiPredicate);
+ }
+
+ /**
+ * For other join types (especially INNER join), the hash table can be used
to quickly determine
+ * which right-side rows should be joined with a given left-side row. But
left mark join is more
+ * complicated. The <code>marker</code> indicates whether a left row has a
join partner on the
+ * right side, it is a three-valued boolean, meaning we need to know the
actual result (TRUE,
+ * FALSE, or NULL) of the join condition.
+ *
+ * <p> Join key comes in two categories:
+ * <ul>
+ * <li>null-safe key (IS NOT DISTINCT FROM): it only produces
TRUE/FALSE.</li>
+ * <li>not null-safe key (EQUALS): it produces a three-valued boolean.</li>
+ * </ul>
+ *
+ * <p> If all join keys are null-safe, we can get the comparison result
(TRUE or FALSE) based on
+ * hash table matching.
+ *
+ * <p> If there are multiple not null-safe join keys, the hash table
matching alone is
+ * insufficient to determine the comparison result. However, if there is
only one not null-safe
+ * key, we can record whether this key has any NULL values when constructing
the hash table.
+ * During probing, if no match is found in the hash table and there are any
NULL values on this
+ * unique not null-safe key, we can know that the <code>marker</code> is
NULL.
+ */
+ static <TSource, TInner, TKey, TNsKey, TResult> Enumerable<TResult>
leftMarkHashJoinOptimized(
+ final Enumerable<TSource> outer, final Enumerable<TInner> inner,
+ final Function1<TSource, TKey> outerKeyNullAwareSelector,
+ final Function1<TInner, TKey> innerKeyNullAwareSelector,
+ final @Nullable Function1<TSource, TNsKey> outerNullSafeKeySelector,
+ final @Nullable Function1<TInner, TNsKey> innerNullSafeKeySelector,
+ final Function2<TSource, @Nullable Boolean, TResult> resultSelector,
+ final @Nullable EqualityComparer<TKey> comparer,
+ final @Nullable EqualityComparer<TNsKey> nullSafeComparer,
+ final @Nullable NullablePredicate2<TSource, TInner> nonEquiPredicate) {
+ return new AbstractEnumerable<TResult>() {
Review Comment:
I have to confess that I haven't compared this implementation to the one in
the paper carefully.
The fact that the tests produce the right result is what makes me approve
this.
I think that the unit tests should have good coverage.
##########
core/src/main/java/org/apache/calcite/adapter/enumerable/EnumUtils.java:
##########
@@ -891,6 +923,20 @@ static Expression generatePredicate(
PhysType leftPhysType,
PhysType rightPhysType,
RexNode condition) {
+ return generatePredicate(implementor, rexBuilder, left, right,
+ leftPhysType, rightPhysType, condition, false);
+ }
+
+ /** Returns a predicate expression based on a join condition. */
+ static Expression generatePredicate(
+ EnumerableRelImplementor implementor,
+ RexBuilder rexBuilder,
+ RelNode left,
+ RelNode right,
+ PhysType leftPhysType,
+ PhysType rightPhysType,
+ RexNode condition,
+ boolean nullable) {
Review Comment:
I think at least this parameter needs to be documented
##########
linq4j/src/main/java/org/apache/calcite/linq4j/EnumerableDefaults.java:
##########
@@ -3840,6 +4101,24 @@ public static <TSource, TKey, TElement> Lookup<TKey,
TElement> toLookup(
elementSelector);
}
+ private static <TKey, TElement> void updateMapDuringBuildLookup(
Review Comment:
Why not call this just "appendDataForKey" or something more descriptive?
##########
linq4j/src/main/java/org/apache/calcite/linq4j/EnumerableDefaults.java:
##########
@@ -4024,6 +4303,63 @@ public static <T, C extends Collection<? super T>> C
remove(
return sink;
}
+ /**
+ * Hash table with null-safe key set.
+ *
+ * @param <TKey> key type
+ * @param <TNsKey> null-safe key type
+ * @param <TInner> build side row type
+ */
+ static class HashTableWithNullSafeKeySet<TKey, TNsKey, TInner> {
+ final Lookup<TKey, TInner> lookup;
+ final Set<Wrapped<TNsKey>> nullSafeKeySet;
+ final EqualityComparer<TNsKey> nullSafeComparer;
+ // whether the build side is empty set
+ final boolean innerIsEmpty;
+
+ private HashTableWithNullSafeKeySet(
+ Lookup<TKey, TInner> lookup,
+ Set<Wrapped<TNsKey>> nullSafeKeySet,
+ EqualityComparer<TNsKey> nullSafeComparer) {
+ this.lookup = lookup;
+ this.nullSafeKeySet = nullSafeKeySet;
+ this.nullSafeComparer = nullSafeComparer;
+ this.innerIsEmpty = lookup.isEmpty();
+ }
+
+ static <TKey, TNsKey, TInner> HashTableWithNullSafeKeySet<TKey, TNsKey,
TInner> build(
+ Enumerable<TInner> inner,
+ Function1<TInner, TKey> innerKeyNullAwareSelector,
Review Comment:
Maybe "inner" is not really necessary in these names: ther is no confusion.
##########
linq4j/src/main/java/org/apache/calcite/linq4j/EnumerableDefaults.java:
##########
@@ -4024,6 +4303,63 @@ public static <T, C extends Collection<? super T>> C
remove(
return sink;
}
+ /**
+ * Hash table with null-safe key set.
+ *
+ * @param <TKey> key type
+ * @param <TNsKey> null-safe key type
+ * @param <TInner> build side row type
+ */
+ static class HashTableWithNullSafeKeySet<TKey, TNsKey, TInner> {
+ final Lookup<TKey, TInner> lookup;
+ final Set<Wrapped<TNsKey>> nullSafeKeySet;
+ final EqualityComparer<TNsKey> nullSafeComparer;
+ // whether the build side is empty set
+ final boolean innerIsEmpty;
+
+ private HashTableWithNullSafeKeySet(
+ Lookup<TKey, TInner> lookup,
+ Set<Wrapped<TNsKey>> nullSafeKeySet,
+ EqualityComparer<TNsKey> nullSafeComparer) {
+ this.lookup = lookup;
+ this.nullSafeKeySet = nullSafeKeySet;
+ this.nullSafeComparer = nullSafeComparer;
+ this.innerIsEmpty = lookup.isEmpty();
+ }
+
+ static <TKey, TNsKey, TInner> HashTableWithNullSafeKeySet<TKey, TNsKey,
TInner> build(
+ Enumerable<TInner> inner,
Review Comment:
why not just call this parameter `data`? The type parameter can be similarly
called `TData`
##########
core/src/main/java/org/apache/calcite/adapter/enumerable/EnumUtils.java:
##########
@@ -875,14 +876,45 @@ static JoinType toLinq4jJoinType(JoinRelType joinRelType)
{
return JoinType.ASOF;
case LEFT_ASOF:
return JoinType.LEFT_ASOF;
+ case LEFT_MARK:
+ return JoinType.LEFT_MARK;
default:
break;
}
throw new IllegalStateException(
"Unable to convert " + joinRelType + " to Linq4j JoinType");
}
- /** Returns a predicate expression based on a join condition. */
+ /**
+ * Return the result selector of a mark join. It is a Expression that will
generate a Function2 in
+ * runtime, the Function2 will concat the left/right side row and the marker.
Review Comment:
perhaps giving an example of the generated Java code will make this easier
to understsand
##########
core/src/main/java/org/apache/calcite/rel/core/Join.java:
##########
@@ -190,6 +191,13 @@ public JoinRelType getJoinType() {
+ " failures in condition " + condition);
}
}
+ if (joinType == JoinRelType.LEFT_MARK
+ && joinInfo.nullExclusionFlags.contains(true)
+ && !joinInfo.nonEquiConditions.isEmpty()) {
+ return litmus.fail("Left mark join is produced by rewriting
IN/SOME/EXISTS "
Review Comment:
could other optimizations produce left mark joins which do not have this
property?
##########
linq4j/src/main/java/org/apache/calcite/linq4j/ExtendedEnumerable.java:
##########
@@ -646,6 +647,41 @@ <TInner, TKey, TResult> Enumerable<TResult>
hashJoin(Enumerable<TInner> inner,
boolean generateNullsOnLeft, boolean generateNullsOnRight,
Predicate2<TSource, TInner> predicate);
+ /**
+ * Mark each row of the current enumerable to see if it has a join partner
in the
Review Comment:
please add a reference to the paper defining the mark join here as well
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]