gustavodemorais commented on code in PR #26313: URL: https://github.com/apache/flink/pull/26313#discussion_r2084064814
########## flink-table/flink-table-runtime/src/main/java/org/apache/flink/table/runtime/operators/join/stream/state/MultiJoinStateViews.java: ########## @@ -0,0 +1,356 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.table.runtime.operators.join.stream.state; + +import org.apache.flink.api.common.functions.RuntimeContext; +import org.apache.flink.api.common.state.MapState; +import org.apache.flink.api.common.state.MapStateDescriptor; +import org.apache.flink.api.common.state.StateTtlConfig; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.common.typeinfo.Types; +import org.apache.flink.api.java.functions.KeySelector; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.runtime.operators.join.stream.utils.JoinInputSideSpec; +import org.apache.flink.table.runtime.typeutils.InternalTypeInfo; +import org.apache.flink.util.IterableIterator; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.NoSuchElementException; + +import static 
org.apache.flink.table.runtime.util.StateConfigUtil.createTtlConfig; +import static org.apache.flink.util.Preconditions.checkNotNull; + +/** + * Factory class to create different implementations of {@link MultiJoinStateView} based on the + * characteristics described in {@link JoinInputSideSpec}. + * + * <p>Each state view uses a {@link MapState} where the primary key is the `mapKey` derived from the + * join conditions (via {@link + * org.apache.flink.table.runtime.operators.join.stream.keyselector.JoinKeyExtractor}). The value + * stored within this map depends on whether the input side has a unique key and how it relates to + * the join key, optimizing storage and access patterns. + */ +public final class MultiJoinStateViews { + + /** Creates a {@link MultiJoinStateView} depends on {@link JoinInputSideSpec}. */ + public static MultiJoinStateView create( + RuntimeContext ctx, + String stateName, + JoinInputSideSpec inputSideSpec, + InternalTypeInfo<RowData> mapKeyType, // Type info for the outer map key + InternalTypeInfo<RowData> recordType, + long retentionTime) { + StateTtlConfig ttlConfig = createTtlConfig(retentionTime); + + if (inputSideSpec.hasUniqueKey()) { + if (inputSideSpec.joinKeyContainsUniqueKey()) { + return new JoinKeyContainsUniqueKey( + ctx, stateName, mapKeyType, recordType, ttlConfig); + } else { + return new InputSideHasUniqueKey( + ctx, + stateName, + mapKeyType, + recordType, + inputSideSpec.getUniqueKeyType(), + inputSideSpec.getUniqueKeySelector(), + ttlConfig); + } + } else { + return new InputSideHasNoUniqueKey(ctx, stateName, mapKeyType, recordType, ttlConfig); + } + } + + /** + * Creates a {@link MapStateDescriptor} with the given parameters and applies TTL configuration. 
+ * + * @param <K> Key type + * @param <V> Value type + * @param stateName Unique name for the state + * @param keyTypeInfo Type information for the key + * @param valueTypeInfo Type information for the value + * @param ttlConfig State TTL configuration + * @return Configured MapStateDescriptor + */ + private static <K, V> MapStateDescriptor<K, V> createStateDescriptor( + String stateName, + TypeInformation<K> keyTypeInfo, + TypeInformation<V> valueTypeInfo, + StateTtlConfig ttlConfig) { + MapStateDescriptor<K, V> descriptor = + new MapStateDescriptor<>(stateName, keyTypeInfo, valueTypeInfo); + if (ttlConfig.isEnabled()) { + descriptor.enableTimeToLive(ttlConfig); + } + return descriptor; + } + + // ------------------------------------------------------------------------------------ + // Multi Join State View Implementations + // ------------------------------------------------------------------------------------ + + /** + * State view for input sides where the unique key is fully contained within the join key. + * + * <p>Stores data as {@code MapState<MapKey, Record>}. + */ + private static final class JoinKeyContainsUniqueKey implements MultiJoinStateView { Review Comment: > The state is scoped by the mapKey already I don't think that's always the case. For example, we may partition the input using only one key while the join condition contains two keys: `SELECT * FROM T1 JOIN T2 on T1.k1 = T2.k2 JOIN T3 on T2.k2 = T3.k3 AND T2.k99 = T3.k99` Let's suppose (T2.k2, T2.k99) and (T3.k3, T3.k99) are composite unique keys of T2 and T3, respectively. We will partition rows across multi-join operators based on T1.k1, T2.k2 and T3.k3. >There will be at most 1. record per key Inside the operator, we can use the composite unique key as the key of the map and store one record per key. What do you think? Either way, I'll add tests for multiple conditions like this. -- This is an automated message from the Apache Git Service. 
To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@flink.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org