Re: [PR] fix(flink): enable batch read it for flink source v2 [hudi]

via GitHub Fri, 20 Mar 2026 18:57:27 -0700


xushiyan commented on code in PR #18325:
URL: https://github.com/apache/hudi/pull/18325#discussion_r2968657261



##########
hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/source/reader/DefaultBatchReader.java:
##########
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.source.reader;
+
+import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds;
+import org.apache.flink.util.CloseableIterator;
+import org.apache.hudi.source.reader.function.SplitReaderFunction;
+import org.apache.hudi.source.split.HoodieSourceSplit;
+
+import java.io.IOException;
+import java.io.Serializable;
+import java.util.List;
+
+/**
+ * The default BatchReader implementation.
+ */
+public class DefaultBatchReader implements BatchReader, Serializable {

Review Comment:
   We should avoid using a raw type like this. Parameterize it with `RowData` or `T`.
   
   ```suggestion
   public class DefaultBatchReader implements BatchReader<RowData>, Serializable {
   ```



##########
hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/source/reader/DefaultBatchReader.java:
##########
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.source.reader;
+
+import org.apache.flink.connector.base.source.reader.RecordsWithSplitIds;
+import org.apache.flink.util.CloseableIterator;
+import org.apache.hudi.source.reader.function.SplitReaderFunction;
+import org.apache.hudi.source.split.HoodieSourceSplit;
+
+import java.io.IOException;
+import java.io.Serializable;
+import java.util.List;
+
+/**
+ * The default BatchReader implementation.
+ */
+public class DefaultBatchReader implements BatchReader, Serializable {
+  private SplitReaderFunction splitReaderFunction;
+
+  DefaultBatchReader(SplitReaderFunction splitReaderFunction) {
+    this.splitReaderFunction = splitReaderFunction;
+  }
+
+  @Override
+  public CloseableIterator<RecordsWithSplitIds> read(HoodieSourceSplit split) {
+    return 
CloseableIterator.adapterForIterator(List.of(splitReaderFunction.read(split)).iterator());

Review Comment:
   This wraps a plain iterator, so the resulting `close()` is a no-op. We need to
   close `RecordsWithSplitIds` properly.



##########
hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/source/reader/HoodieSourceSplitReader.java:
##########
@@ -43,42 +44,59 @@
 /**
  * The split reader of Hoodie source.
  *
+ * <p>Each call to {@link #fetch()} reads one split and returns it as a single
+ * {@link RecordsWithSplitIds} batch. Flink's {@code SourceReaderBase} is 
responsible for
+ * draining all records from the batch (via {@code nextRecordFromSplit()}) and 
marking
+ * the split finished (via {@code finishedSplits()}) before calling {@link 
#fetch()} again.
+ *
  * @param <T> record type
  */
 public class HoodieSourceSplitReader<T> implements 
SplitReader<HoodieRecordWithPosition<T>, HoodieSourceSplit> {
   private static final Logger LOG = 
LoggerFactory.getLogger(HoodieSourceSplitReader.class);
 
   private final SerializableComparator<HoodieSourceSplit> splitComparator;
-  private final SplitReaderFunction<T> readerFunction;
   private final Queue<HoodieSourceSplit> splits;
-  private final SourceReaderContext context;
   private final FlinkStreamReadMetrics readerMetrics;
-
-  private HoodieSourceSplit currentSplit;
+  private final BatchReader batchReader;
+  private transient HoodieSourceSplit currentSplit;
+  private transient 
CloseableIterator<RecordsWithSplitIds<HoodieRecordWithPosition<T>>> 
currentReader;
 
   public HoodieSourceSplitReader(
       String tableName,
       SourceReaderContext context,
       SplitReaderFunction<T> readerFunction,
       SerializableComparator<HoodieSourceSplit> splitComparator) {
-    this.context = context;
     this.splitComparator = splitComparator;
-    this.readerFunction = readerFunction;
     this.splits = new ArrayDeque<>();
+    this.batchReader = new DefaultBatchReader(readerFunction);
     this.readerMetrics = new FlinkStreamReadMetrics(context.metricGroup(), 
tableName);
     this.readerMetrics.registerMetrics();
   }
 
   @Override
   public RecordsWithSplitIds<HoodieRecordWithPosition<T>> fetch() throws 
IOException {
-    HoodieSourceSplit nextSplit = splits.poll();
-    if (nextSplit != null) {
-      currentSplit = nextSplit;
-      return readerFunction.read(currentSplit);
+    if (currentReader == null) {
+      HoodieSourceSplit nextSplit = splits.poll();
+      if (nextSplit != null) {
+        currentSplit = nextSplit;
+        currentReader = batchReader.read(nextSplit);
+      } else {
+        // return an empty result, which will lead to split fetch to be idle.
+        // SplitFetcherManager will then close idle fetcher.
+        return new RecordsBySplits<>(Collections.emptyMap(), 
Collections.emptySet());
+      }
+    }
+
+    if (currentReader != null && currentReader.hasNext()) {
+      // Because Iterator#next() doesn't support checked exception,
+      // we need to wrap and unwrap the checked IOException with 
UncheckedIOException
+      try {
+        return currentReader.next();
+      } catch (Exception e) {
+        throw new IOException(e.getCause());

Review Comment:
   ```suggestion
           throw new IOException(e);
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] fix(flink): enable batch read it for flink source v2 [hudi]

Reply via email to