This is an automated email from the ASF dual-hosted git repository.

JingsongLi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/paimon.git


The following commit(s) were added to refs/heads/master by this push:
     new 0e13c28ce2 [core] Fix BTree index meta for empty keys (#8140)
0e13c28ce2 is described below

commit 0e13c28ce2a6624b3d470416a3b23a7a4661348c
Author: YeJunHao <[email protected]>
AuthorDate: Sat Jun 6 16:37:26 2026 +0800

    [core] Fix BTree index meta for empty keys (#8140)
    
    BTreeIndexMeta used zero-length encoded keys to represent null
    boundaries. This collides with valid empty serialized keys, for example
    an empty string, and can deserialize BTree metadata with a null boundary
    key. Readers or compactors that wrap the boundary key can then hit a
    NullPointerException.
---
 .../paimon/globalindex/btree/BTreeIndexMeta.java   |  37 ++++-
 .../globalindex/btree/BTreeIndexMetaTest.java      | 171 +++++++++++++++++++++
 2 files changed, 203 insertions(+), 5 deletions(-)

diff --git 
a/paimon-common/src/main/java/org/apache/paimon/globalindex/btree/BTreeIndexMeta.java
 
b/paimon-common/src/main/java/org/apache/paimon/globalindex/btree/BTreeIndexMeta.java
index 83d12b217a..96e0d85a74 100644
--- 
a/paimon-common/src/main/java/org/apache/paimon/globalindex/btree/BTreeIndexMeta.java
+++ 
b/paimon-common/src/main/java/org/apache/paimon/globalindex/btree/BTreeIndexMeta.java
@@ -30,6 +30,10 @@ import javax.annotation.Nullable;
  */
 public class BTreeIndexMeta {
 
+    private static final byte FORMAT_VERSION_WITH_NULL_FLAGS = 1;
+    private static final byte FIRST_KEY_IS_NULL = 1;
+    private static final byte LAST_KEY_IS_NULL = 1 << 1;
+
     @Nullable private final byte[] firstKey;
     @Nullable private final byte[] lastKey;
     private final boolean hasNulls;
@@ -61,36 +65,59 @@ public class BTreeIndexMeta {
     private int memorySize() {
         return (firstKey == null ? 0 : firstKey.length)
                 + (lastKey == null ? 0 : lastKey.length)
-                + 9;
+                + 11;
     }
 
     public byte[] serialize() {
         MemorySliceOutput sliceOutput = new MemorySliceOutput(memorySize());
+        byte nullKeyFlags = 0;
         if (firstKey != null) {
             sliceOutput.writeInt(firstKey.length);
             sliceOutput.writeBytes(firstKey);
         } else {
             sliceOutput.writeInt(0);
+            nullKeyFlags |= FIRST_KEY_IS_NULL;
         }
         if (lastKey != null) {
             sliceOutput.writeInt(lastKey.length);
             sliceOutput.writeBytes(lastKey);
         } else {
             sliceOutput.writeInt(0);
+            nullKeyFlags |= LAST_KEY_IS_NULL;
         }
         sliceOutput.writeByte(hasNulls ? 1 : 0);
+        sliceOutput.writeByte(FORMAT_VERSION_WITH_NULL_FLAGS);
+        sliceOutput.writeByte(nullKeyFlags);
         return sliceOutput.toSlice().getHeapMemory();
     }
 
     public static BTreeIndexMeta deserialize(byte[] data) {
         MemorySliceInput sliceInput = MemorySlice.wrap(data).toInput();
         int firstKeyLength = sliceInput.readInt();
-        byte[] firstKey =
-                firstKeyLength == 0 ? null : 
sliceInput.readSlice(firstKeyLength).copyBytes();
+        byte[] firstKey = readKey(sliceInput, firstKeyLength);
         int lastKeyLength = sliceInput.readInt();
-        byte[] lastKey =
-                lastKeyLength == 0 ? null : 
sliceInput.readSlice(lastKeyLength).copyBytes();
+        byte[] lastKey = readKey(sliceInput, lastKeyLength);
         boolean hasNulls = sliceInput.readByte() == 1;
+
+        if (sliceInput.available() >= 2) {
+            int formatVersion = sliceInput.readByte();
+            if (formatVersion == FORMAT_VERSION_WITH_NULL_FLAGS) {
+                int nullKeyFlags = sliceInput.readByte();
+                firstKey = (nullKeyFlags & FIRST_KEY_IS_NULL) != 0 ? null : 
firstKey;
+                lastKey = (nullKeyFlags & LAST_KEY_IS_NULL) != 0 ? null : 
lastKey;
+            }
+        } else if (firstKeyLength == 0 && lastKeyLength == 0 && hasNulls) {
+            // Compatibility with old metadata which used zero length to 
represent null keys.
+            // A legacy all-null index file can be identified by both key 
lengths being zero with
+            // null bitmap. When only one length is zero, the zero-length key 
is a valid serialized
+            // key, for example an empty string.
+            firstKey = null;
+            lastKey = null;
+        }
         return new BTreeIndexMeta(firstKey, lastKey, hasNulls);
     }
+
+    private static byte[] readKey(MemorySliceInput sliceInput, int keyLength) {
+        return sliceInput.readSlice(keyLength).copyBytes();
+    }
 }
diff --git 
a/paimon-common/src/test/java/org/apache/paimon/globalindex/btree/BTreeIndexMetaTest.java
 
b/paimon-common/src/test/java/org/apache/paimon/globalindex/btree/BTreeIndexMetaTest.java
new file mode 100644
index 0000000000..5b37aac50d
--- /dev/null
+++ 
b/paimon-common/src/test/java/org/apache/paimon/globalindex/btree/BTreeIndexMetaTest.java
@@ -0,0 +1,171 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.globalindex.btree;
+
+import org.apache.paimon.data.BinaryString;
+import org.apache.paimon.fs.FileIO;
+import org.apache.paimon.fs.Path;
+import org.apache.paimon.fs.PositionOutputStream;
+import org.apache.paimon.fs.local.LocalFileIO;
+import org.apache.paimon.globalindex.GlobalIndexParallelWriter;
+import org.apache.paimon.globalindex.ResultEntry;
+import org.apache.paimon.globalindex.io.GlobalIndexFileWriter;
+import org.apache.paimon.memory.MemorySliceOutput;
+import org.apache.paimon.options.Options;
+import org.apache.paimon.types.DataField;
+import org.apache.paimon.types.VarCharType;
+
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.io.TempDir;
+
+import java.io.IOException;
+import java.util.List;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+/** Test for {@link BTreeIndexMeta}. */
+class BTreeIndexMetaTest {
+
+    @TempDir private java.nio.file.Path tempPath;
+
+    @Test
+    void testEmptyFirstKey() {
+        byte[] lastKey = new byte[] {1, 2, 3};
+
+        BTreeIndexMeta meta = roundTrip(new BTreeIndexMeta(new byte[0], 
lastKey, false));
+
+        assertThat(meta.getFirstKey()).isEmpty();
+        assertThat(meta.getLastKey()).containsExactly(lastKey);
+        assertThat(meta.hasNulls()).isFalse();
+        assertThat(meta.onlyNulls()).isFalse();
+    }
+
+    @Test
+    void testEmptyFirstAndLastKeyWithNulls() {
+        BTreeIndexMeta meta = roundTrip(new BTreeIndexMeta(new byte[0], new 
byte[0], true));
+
+        assertThat(meta.getFirstKey()).isEmpty();
+        assertThat(meta.getLastKey()).isEmpty();
+        assertThat(meta.hasNulls()).isTrue();
+        assertThat(meta.onlyNulls()).isFalse();
+    }
+
+    @Test
+    void testOnlyNulls() {
+        BTreeIndexMeta meta = roundTrip(new BTreeIndexMeta(null, null, true));
+
+        assertThat(meta.getFirstKey()).isNull();
+        assertThat(meta.getLastKey()).isNull();
+        assertThat(meta.hasNulls()).isTrue();
+        assertThat(meta.onlyNulls()).isTrue();
+    }
+
+    @Test
+    void testWriterEmptyStringKeyWithNulls() throws Exception {
+        List<ResultEntry> results =
+                writeVarCharIndex(
+                        null, BinaryString.fromString(""), 
BinaryString.fromString("abc"));
+
+        BTreeIndexMeta meta = 
BTreeIndexMeta.deserialize(results.get(0).meta());
+
+        assertThat(meta.getFirstKey()).isEmpty();
+        
assertThat(meta.getLastKey()).containsExactly(BinaryString.fromString("abc").toBytes());
+        assertThat(meta.hasNulls()).isTrue();
+        assertThat(meta.onlyNulls()).isFalse();
+    }
+
+    @Test
+    void testLegacyOnlyNulls() {
+        BTreeIndexMeta meta =
+                BTreeIndexMeta.deserialize(legacyMetaBytes(new byte[0], new 
byte[0], true));
+
+        assertThat(meta.getFirstKey()).isNull();
+        assertThat(meta.getLastKey()).isNull();
+        assertThat(meta.hasNulls()).isTrue();
+        assertThat(meta.onlyNulls()).isTrue();
+    }
+
+    @Test
+    void testLegacyEmptyFirstKey() {
+        byte[] lastKey = new byte[] {1, 2, 3};
+
+        BTreeIndexMeta meta =
+                BTreeIndexMeta.deserialize(legacyMetaBytes(new byte[0], 
lastKey, false));
+
+        assertThat(meta.getFirstKey()).isEmpty();
+        assertThat(meta.getLastKey()).containsExactly(lastKey);
+        assertThat(meta.hasNulls()).isFalse();
+        assertThat(meta.onlyNulls()).isFalse();
+    }
+
+    @Test
+    void testLegacyEmptyFirstAndLastKeyWithoutNulls() {
+        BTreeIndexMeta meta =
+                BTreeIndexMeta.deserialize(legacyMetaBytes(new byte[0], new 
byte[0], false));
+
+        assertThat(meta.getFirstKey()).isEmpty();
+        assertThat(meta.getLastKey()).isEmpty();
+        assertThat(meta.hasNulls()).isFalse();
+        assertThat(meta.onlyNulls()).isFalse();
+    }
+
+    private BTreeIndexMeta roundTrip(BTreeIndexMeta meta) {
+        return BTreeIndexMeta.deserialize(meta.serialize());
+    }
+
+    private byte[] legacyMetaBytes(byte[] firstKey, byte[] lastKey, boolean 
hasNulls) {
+        MemorySliceOutput output = new MemorySliceOutput(firstKey.length + 
lastKey.length + 9);
+        output.writeInt(firstKey.length);
+        output.writeBytes(firstKey);
+        output.writeInt(lastKey.length);
+        output.writeBytes(lastKey);
+        output.writeByte(hasNulls ? 1 : 0);
+        return output.toSlice().copyBytes();
+    }
+
+    private List<ResultEntry> writeVarCharIndex(Object... keys) throws 
IOException {
+        FileIO fileIO = LocalFileIO.create();
+        GlobalIndexFileWriter fileWriter =
+                new GlobalIndexFileWriter() {
+                    @Override
+                    public String newFileName(String prefix) {
+                        return "test-btree-" + prefix;
+                    }
+
+                    @Override
+                    public PositionOutputStream newOutputStream(String 
fileName)
+                            throws IOException {
+                        return fileIO.newOutputStream(
+                                new Path(new Path(tempPath.toUri()), 
fileName), true);
+                    }
+                };
+        GlobalIndexParallelWriter indexWriter =
+                new BTreeGlobalIndexer(
+                                new DataField(
+                                        1, "testField", new 
VarCharType(VarCharType.MAX_LENGTH)),
+                                new Options())
+                        .createWriter(fileWriter);
+        for (int index = 0; index < keys.length; index++) {
+            indexWriter.write(keys[index], index);
+        }
+        List<ResultEntry> results = indexWriter.finish();
+        assertThat(results).hasSize(1);
+        return results;
+    }
+}

Reply via email to