JingsongLi commented on code in PR #7919: URL: https://github.com/apache/paimon/pull/7919#discussion_r3278317535
########## paimon-common/src/main/java/org/apache/paimon/fileindex/rtree/RTree.java: ########## @@ -0,0 +1,212 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.fileindex.rtree; + +import java.util.ArrayList; +import java.util.List; + +/** R-Tree spatial index implementation. */ +public class RTree { + private RTreeNode root; + private final int dimensions; + private final int minEntries; + private final int maxEntries; + private int size; + + public RTree(int dimensions, int maxEntries) { + this.dimensions = dimensions; + this.maxEntries = maxEntries; + this.minEntries = Math.max(2, maxEntries / 2); + this.root = new RTreeNode(dimensions, maxEntries, true); + this.size = 0; + } + + public int getDimensions() { + return dimensions; + } + + public int getMinEntries() { + return minEntries; + } + + public int getMaxEntries() { + return maxEntries; + } + + public RTreeNode getRoot() { + return root; + } + + public void setRoot(RTreeNode newRoot) { + this.root = newRoot; + } + + public void setSize(int newSize) { + this.size = newSize; + } + + public int getSize() { + return size; + } + + public void insert(double[] point, int rowId) { + BoundingBox pointBox = BoundingBox.fromPoint(point); + insert(pointBox, rowId, root); + size++; + } + + private void insert(BoundingBox bbox, int rowId, RTreeNode node) { + if (node.isLeaf()) { + node.addLeafEntry(new LeafEntry(rowId, bbox)); + + if (node.canSplit()) { + splitNode(node); + } + } else { + RTreeNode bestChild = chooseBestChild(node, bbox); + insert(bbox, rowId, bestChild); + + if (bestChild.canSplit()) { + splitNode(bestChild); + } + node.getBoundingBox().expand(bbox); + } + } + + private RTreeNode chooseBestChild(RTreeNode node, BoundingBox bbox) { + RTreeNode best = null; + double minExpansion = Double.POSITIVE_INFINITY; + double minArea = Double.POSITIVE_INFINITY; + + for (RTreeNode child : node.getChildren()) { + double expansion = child.getBoundingBox().getExpansionArea(bbox); + double area = child.getBoundingBox().getArea(); + + if (expansion < minExpansion || (expansion == minExpansion && area < minArea)) { + minExpansion = expansion; + minArea = area; + best = child; + } + } + + if (best == null) { + throw new IllegalStateException("No child found in non-leaf node"); + } + return best; + } + + private void splitNode(RTreeNode node) { + if (node.isLeaf()) { + splitLeafNode(node); + } else { + splitInternalNode(node); + } + } + + private void splitLeafNode(RTreeNode node) { + List<LeafEntry> entries = new ArrayList<>(node.getLeafEntries()); + node.getLeafRowIds().clear(); + node.getLeafEntries().clear(); + node.getBoundingBox().clear(); + + RTreeNode newNode = new RTreeNode(dimensions, maxEntries, true); + + int mid = entries.size() / 2; + for (int i = 0; i < mid; i++) { + node.addLeafEntry(entries.get(i)); + } + for (int i = mid; i < entries.size(); i++) { + newNode.addLeafEntry(entries.get(i)); + } + + if (node == root) { + RTreeNode newRoot = new RTreeNode(dimensions, maxEntries, false); + newRoot.addChild(node); + newRoot.addChild(newNode); + root = newRoot; + } else { + adjustParent(node, newNode); + } + } + + private void splitInternalNode(RTreeNode node) { + List<RTreeNode> children = new ArrayList<>(node.getChildren()); + node.getChildren().clear(); + node.getBoundingBox().clear(); + + RTreeNode newNode = new RTreeNode(dimensions, maxEntries, false); + + int mid = children.size() / 2; + for (int i = 0; i < mid; i++) { + node.addChild(children.get(i)); + } + for (int i = mid; i < children.size(); i++) { + newNode.addChild(children.get(i)); + } + + if (node == root) { + RTreeNode newRoot = new RTreeNode(dimensions, maxEntries, false); + newRoot.addChild(node); + newRoot.addChild(newNode); + root = newRoot; + } else { + adjustParent(node, newNode); + } + } + + private void adjustParent(RTreeNode node, RTreeNode newNode) { + RTreeNode parent = node.getParent(); + if (parent == null) { + return; + } + + parent.addChild(newNode); + + if (parent.canSplit()) { + splitNode(parent); + } + } + + public List<Integer> search(BoundingBox searchBox) { + List<Integer> results = new ArrayList<>(); + search(searchBox, root, results); + return results; + } + + private void search(BoundingBox searchBox, RTreeNode node, List<Integer> results) { + if (!node.getBoundingBox().intersects(searchBox)) { + return; + } + + if (node.isLeaf()) { + for (Integer rowId : node.getLeafRowIds()) { + results.add(rowId); Review Comment: Directly joined without checking if entry.bbox intersects with searchBox ########## paimon-common/src/main/java/org/apache/paimon/fileindex/rtree/RTree.java: ########## @@ -0,0 +1,212 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.fileindex.rtree; + +import java.util.ArrayList; +import java.util.List; + +/** R-Tree spatial index implementation. */ +public class RTree { + private RTreeNode root; + private final int dimensions; + private final int minEntries; + private final int maxEntries; + private int size; + + public RTree(int dimensions, int maxEntries) { + this.dimensions = dimensions; + this.maxEntries = maxEntries; + this.minEntries = Math.max(2, maxEntries / 2); + this.root = new RTreeNode(dimensions, maxEntries, true); + this.size = 0; + } + + public int getDimensions() { + return dimensions; + } + + public int getMinEntries() { + return minEntries; + } + + public int getMaxEntries() { + return maxEntries; + } + + public RTreeNode getRoot() { + return root; + } + + public void setRoot(RTreeNode newRoot) { + this.root = newRoot; + } + + public void setSize(int newSize) { + this.size = newSize; + } + + public int getSize() { + return size; + } + + public void insert(double[] point, int rowId) { + BoundingBox pointBox = BoundingBox.fromPoint(point); + insert(pointBox, rowId, root); + size++; + } + + private void insert(BoundingBox bbox, int rowId, RTreeNode node) { + if (node.isLeaf()) { + node.addLeafEntry(new LeafEntry(rowId, bbox)); + + if (node.canSplit()) { + splitNode(node); + } + } else { + RTreeNode bestChild = chooseBestChild(node, bbox); + insert(bbox, rowId, bestChild); + + if (bestChild.canSplit()) { + splitNode(bestChild); + } + node.getBoundingBox().expand(bbox); + } + } + + private RTreeNode chooseBestChild(RTreeNode node, BoundingBox bbox) { + RTreeNode best = null; + double minExpansion = Double.POSITIVE_INFINITY; + double minArea = Double.POSITIVE_INFINITY; + + for (RTreeNode child : node.getChildren()) { + double expansion = child.getBoundingBox().getExpansionArea(bbox); + double area = child.getBoundingBox().getArea(); + + if (expansion < minExpansion || (expansion == minExpansion && area < minArea)) { + minExpansion = expansion; + minArea = area; + best = child; + } + } + + if (best == null) { + throw new IllegalStateException("No child found in non-leaf node"); + } + return best; + } + + private void splitNode(RTreeNode node) { + if (node.isLeaf()) { + splitLeafNode(node); + } else { + splitInternalNode(node); + } + } + + private void splitLeafNode(RTreeNode node) { + List<LeafEntry> entries = new ArrayList<>(node.getLeafEntries()); + node.getLeafRowIds().clear(); + node.getLeafEntries().clear(); + node.getBoundingBox().clear(); + + RTreeNode newNode = new RTreeNode(dimensions, maxEntries, true); + + int mid = entries.size() / 2; Review Comment: This completely disregards spatial location. The correct R-Tree splitting should minimize the MBR overlap between two result nodes, otherwise a large number of nodes will "intersect" during queries, degenerating into linear scans. ########## paimon-common/src/main/java/org/apache/paimon/fileindex/rtree/RTreeFileIndexReader.java: ########## @@ -0,0 +1,141 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.fileindex.rtree; + +import org.apache.paimon.fileindex.FileIndexReader; +import org.apache.paimon.fileindex.FileIndexResult; +import org.apache.paimon.fs.SeekableInputStream; +import org.apache.paimon.options.Options; +import org.apache.paimon.predicate.FieldRef; +import org.apache.paimon.utils.RoaringBitmap32; + +import java.io.DataInputStream; +import java.io.IOException; +import java.util.List; + +/** Reader for R-Tree file index. */ +public class RTreeFileIndexReader extends FileIndexReader { + + private final SeekableInputStream stream; + private final int start; + private final Options options; + private RTree rtree; + private int dimensions; + private int maxEntries; + private int treeSize; + + public RTreeFileIndexReader(SeekableInputStream stream, int start, Options options) + throws IOException { + this.stream = stream; + this.start = start; + this.options = options; + deserializeRTree(); + } + + private void deserializeRTree() throws IOException { + stream.seek(start); + DataInputStream dis = new DataInputStream(stream); + + this.dimensions = dis.readInt(); + this.maxEntries = dis.readInt(); + this.treeSize = dis.readInt(); + + this.rtree = new RTree(dimensions, maxEntries); + + if (treeSize > 0) { + deserializeNodes(dis, rtree); + } + } + + private void deserializeNodes(DataInputStream dis, RTree tree) throws IOException { + RTreeNode root = tree.getRoot(); + deserializeNode(dis, root, true); + } + + private void deserializeNode(DataInputStream dis, RTreeNode node, boolean isRoot) + throws IOException { + boolean isLeaf = dis.readBoolean(); + int entryCount = dis.readInt(); + + BoundingBox bbox = BoundingBox.deserialize(dis, dimensions); + node.getBoundingBox().expand(bbox); + + if (isLeaf) { + for (int i = 0; i < entryCount; i++) { + int rowId = dis.readInt(); + BoundingBox entryBbox = BoundingBox.deserialize(dis, dimensions); + node.addLeafEntry(new LeafEntry(rowId, entryBbox)); + } + } else { + for (int i = 0; i < entryCount; i++) { + RTreeNode child = new RTreeNode(dimensions, maxEntries, false); + node.addChild(child); Review Comment: RTreeNode.isLeaf is final. When the RTree constructor creates root, isLeaf=true. However, during deserialization, if the root of the tree is an internal node, the deserialization code will add children to the node with isLeaf=true. Afterwards, when searching, node. isLeaf() returns true and will look for leafRowIds instead of recursive children. The deserialized tree cannot be queried correctly at all. ########## paimon-common/src/main/java/org/apache/paimon/fileindex/rtree/RTreeFileIndex.java: ########## @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.fileindex.rtree; + +import org.apache.paimon.fileindex.FileIndexReader; +import org.apache.paimon.fileindex.FileIndexWriter; +import org.apache.paimon.fileindex.FileIndexer; +import org.apache.paimon.fs.SeekableInputStream; +import org.apache.paimon.options.Options; +import org.apache.paimon.types.DataType; + +/** The implementation of R-Tree file index. */ +public class RTreeFileIndex implements FileIndexer { Review Comment: As a File Index, all data is already known at the time of writing, and the quality of the tree constructed by inserting each item is much lower than that of STR (Sort Tile Recursive) bulk loading -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
