luoyuxia commented on code in PR #1480: URL: https://github.com/apache/fluss/pull/1480#discussion_r2257012751
########## fluss-common/src/main/java/com/alibaba/fluss/lake/source/SortedRecordReader.java: ########## @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.alibaba.fluss.lake.source; + +import com.alibaba.fluss.row.InternalRow; + +import java.util.Comparator; + +/** + * A specialized {@link RecordReader} that produces records in a defined sorted order. + * + * <p>Extends the basic record reading capability with sorting semantics, ensuring that records are + * returned according to a specified ordering. + * + * <p>Implementations must guarantee that the {@link #read()} method returns records in the order + * defined by the comparator from {@link #order()}. + * + * <p>Note: This is mainly used for union read primary key table since we will do sort merge records + * in lake and fluss. The records in primary key table for lake should implement this method for + * union read with a better performance. 
+ */ +public interface SortedRecordReader extends RecordReader { Review Comment: nit: add @since and @PublicEvolving ########## fluss-common/src/main/java/com/alibaba/fluss/lake/source/RecordReader.java: ########## @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.alibaba.fluss.lake.source; + +import com.alibaba.fluss.record.LogRecord; +import com.alibaba.fluss.utils.CloseableIterator; + +import java.io.IOException; + +/** + * An interface for reading records from {@link LakeSplit}. + * + * <p>Implementations of this interface provide an iterator-style access to records, allowing + * efficient sequential reading of potentially large datasets without loading all data into memory + * at once. The reading should consider the pushed-down optimizations (project, filters, limits, + * etc.) from {@link LakeSource}. + */ +public interface RecordReader { Review Comment: add `@since` and `@PublicEvolving` ########## fluss-common/src/main/java/com/alibaba/fluss/lake/source/LakeSource.java: ########## @@ -0,0 +1,97 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.alibaba.fluss.lake.source; + +import com.alibaba.fluss.lake.serializer.SimpleVersionedSerializer; + +import java.io.IOException; +import java.io.Serializable; + +/** + * A generic interface for lake data sources that defines how to plan splits and read data. Any data + * lake format supporting reading from data tiered in lake as Fluss records should implement this + * interface. + * + * <p>This interface provides methods for projection, filtering, limiting to enable query engine to + * push to lake source. Implementations must ensure that split planning and record reading + * operations properly account for these pushed-down operations during execution. + * + * @param <Split> The type of data split, which must extend {@link LakeSplit} + * @since 0.8 + */ +public interface LakeSource<Split extends LakeSplit> extends Serializable { + + /** + * Applies column projection to the data source. It provides the field index paths that should + * be used for a projection. The indices are 0-based and support fields within (possibly nested) + * structures. 
+ * + * <p>For nested, given the following SQL, CREATE TABLE t (i INT, r ROW < d DOUBLE, b BOOLEAN>, + * s STRING); SELECT s, r.d FROM t; the project will be [[2], [1, 0]] + */ + void withProject(int[][] project); + + /** Applies a row limit to the data source. */ + void withLimit(int limit); + + // TODO: Support paimon filter pushdown + // FilterPushDownResult withFilters(List<Predicate> predicates); + + /** + * Creates a planner for plan splits to be read. + * + * @param context The planning context providing necessary planning information + * @return A planner instance for this data source + * @throws IOException if an error occurs during planner creation + */ + Planner<Split> createPlanner(PlannerContext context) throws IOException; + + /** + * Creates a record reader for reading data of data lake for the specified split . Review Comment: nit ```suggestion * Creates a record reader for reading data of data lake for the specified split. ``` ########## fluss-common/src/main/java/com/alibaba/fluss/lake/source/Planner.java: ########## @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.alibaba.fluss.lake.source; + +import java.io.IOException; +import java.util.List; + +/** + * A planner interface for generating readable splits for lake data sources. + * + * <p>Implementations of this interface are responsible for determining how to divide the data into + * manageable splits that can be read in parallel. The planning should consider the pushed-down + * optimizations (filters, limits, etc.) from {@link LakeSource}. + * + * @param <Split> the type of data split this planner generates, must extend {@link LakeSplit} + */ +public interface Planner<Split extends LakeSplit> { Review Comment: nit: add `@since` and `@PublicEvolving` ########## fluss-common/src/main/java/com/alibaba/fluss/lake/source/LakeSource.java: ########## @@ -0,0 +1,97 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.alibaba.fluss.lake.source; + +import com.alibaba.fluss.lake.serializer.SimpleVersionedSerializer; + +import java.io.IOException; +import java.io.Serializable; + +/** + * A generic interface for lake data sources that defines how to plan splits and read data. Any data + * lake format supporting reading from data tiered in lake as Fluss records should implement this + * interface. 
+ * + * <p>This interface provides methods for projection, filtering, limiting to enable query engine to + * push to lake source. Implementations must ensure that split planning and record reading + * operations properly account for these pushed-down operations during execution. + * + * @param <Split> The type of data split, which must extend {@link LakeSplit} + * @since 0.8 + */ Review Comment: nit: add `@PublicEvolving` ########## fluss-common/src/main/java/com/alibaba/fluss/lake/source/LakeSource.java: ########## @@ -0,0 +1,97 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.alibaba.fluss.lake.source; + +import com.alibaba.fluss.lake.serializer.SimpleVersionedSerializer; + +import java.io.IOException; +import java.io.Serializable; + +/** + * A generic interface for lake data sources that defines how to plan splits and read data. Any data + * lake format supporting reading from data tiered in lake as Fluss records should implement this + * interface. + * + * <p>This interface provides methods for projection, filtering, limiting to enable query engine to + * push to lake source. 
Implementations must ensure that split planning and record reading + * operations properly account for these pushed-down operations during execution. + * + * @param <Split> The type of data split, which must extend {@link LakeSplit} + * @since 0.8 + */ +public interface LakeSource<Split extends LakeSplit> extends Serializable { + + /** + * Applies column projection to the data source. It provides the field index paths that should + * be used for a projection. The indices are 0-based and support fields within (possibly nested) + * structures. + * + * <p>For nested, given the following SQL, CREATE TABLE t (i INT, r ROW < d DOUBLE, b BOOLEAN>, + * s STRING); SELECT s, r.d FROM t; the project will be [[2], [1, 0]] + */ + void withProject(int[][] project); + + /** Applies a row limit to the data source. */ + void withLimit(int limit); + + // TODO: Support paimon filter pushdown Review Comment: FYI: #515 is ready to introduce the Predicate ########## fluss-common/src/main/java/com/alibaba/fluss/lake/source/LakeSplit.java: ########## @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.alibaba.fluss.lake.source; + +import java.util.List; + +/** Represents a logical partition or segment of data in data-lake. */ Review Comment: add `@since` and `@PublicEvolving` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
