This is an automated email from the ASF dual-hosted git repository.

Fokko pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/parquet-java.git


The following commit(s) were added to refs/heads/master by this push:
     new aaff5db42 GH-3489: Fix Binary.copy() for direct ByteBuffer-backed 
values (#3490)
aaff5db42 is described below

commit aaff5db42a59ad1e528e88d2d236c1793f95c010
Author: André Rouél <[email protected]>
AuthorDate: Sun Jun 21 21:23:19 2026 +0200

    GH-3489: Fix Binary.copy() for direct ByteBuffer-backed values (#3490)
    
    Why
    
    Binary.copy() is a no-op for ByteBufferBackedBinary instances when 
isBackingBytesReused is false, returning this instead of a heap-backed copy. 
When the off-heap decompression path is enabled, column readers produce 
ByteBufferBackedBinary values that are zero-copy views into Arena-managed 
direct buffers. Code that calls copy() to take ownership of the data -- such as 
DictionaryValuesWriter.writeBytes() -- silently receives the original object 
still pointing into the direct buffer. Whe [...]
    
    What
    
    Override copy() in Binary.ByteBufferBackedBinary to always materialize to a 
heap-backed Binary via fromConstantByteArray(getBytes()) when the backing 
ByteBuffer is direct. Heap-backed ByteBuffer values delegate to the existing 
super.copy() behavior to avoid unnecessary copies. The change is in 
parquet-column/src/main/java/org/apache/parquet/io/api/Binary.java.
---
 .../java/org/apache/parquet/io/api/Binary.java     |  10 ++
 .../java/org/apache/parquet/io/api/TestBinary.java | 137 +++++++++++++++++++++
 2 files changed, 147 insertions(+)

diff --git a/parquet-column/src/main/java/org/apache/parquet/io/api/Binary.java 
b/parquet-column/src/main/java/org/apache/parquet/io/api/Binary.java
index 173581bdd..3160f091e 100644
--- a/parquet-column/src/main/java/org/apache/parquet/io/api/Binary.java
+++ b/parquet-column/src/main/java/org/apache/parquet/io/api/Binary.java
@@ -498,6 +498,16 @@ public abstract class Binary implements 
Comparable<Binary>, Serializable {
       return Binary.fromConstantByteArray(getBytesUnsafe(), start, length);
     }
 
+    @Override
+    public Binary copy() {
+      if (value.isDirect()) {
+        // Direct ByteBuffers may be backed by memory that can be freed 
independently, so always materialize to
+        // a heap-backed copy to avoid use-after-free.
+        return Binary.fromConstantByteArray(getBytes());
+      }
+      return super.copy();
+    }
+
     @Override
     public int hashCode() {
       if (value.hasArray()) {
diff --git 
a/parquet-column/src/test/java/org/apache/parquet/io/api/TestBinary.java 
b/parquet-column/src/test/java/org/apache/parquet/io/api/TestBinary.java
index 3dcb878d2..515d4d5b9 100644
--- a/parquet-column/src/test/java/org/apache/parquet/io/api/TestBinary.java
+++ b/parquet-column/src/test/java/org/apache/parquet/io/api/TestBinary.java
@@ -20,6 +20,7 @@ package org.apache.parquet.io.api;
 
 import static org.junit.Assert.assertArrayEquals;
 import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotSame;
 import static org.junit.Assert.assertSame;
 import static org.junit.Assert.assertTrue;
 import static org.junit.Assert.fail;
@@ -109,6 +110,27 @@ public class TestBinary {
     }
   };
 
+  private static final BinaryFactory DIRECT_BUFFER_BF = new BinaryFactory() {
+    @Override
+    public BinaryAndOriginal get(byte[] bytes, boolean reused) throws 
Exception {
+      ByteBuffer direct = ByteBuffer.allocateDirect(bytes.length);
+      direct.put(bytes);
+      direct.flip();
+      Binary b;
+
+      if (reused) {
+        b = Binary.fromReusedByteBuffer(direct);
+      } else {
+        b = Binary.fromConstantByteBuffer(direct);
+      }
+
+      assertArrayEquals(bytes, b.getBytes());
+      // Return the backing byte[] so tests can mutate it, though for direct 
buffers
+      // there is no accessible backing array. We return a copy of the 
original bytes.
+      return new BinaryAndOriginal(b, bytes);
+    }
+  };
+
   private static final BinaryFactory STRING_BF = new BinaryFactory() {
     @Override
     public BinaryAndOriginal get(byte[] bytes, boolean reused) throws 
Exception {
@@ -151,6 +173,71 @@ public class TestBinary {
     testBinary(BUFFER_BF, false);
   }
 
+  @Test
+  public void testDirectByteBufferBackedBinary() throws Exception {
+    // Direct buffers have different copy() semantics (always materializes to 
heap),
+    // so we test them separately instead of using the generic testBinary flow.
+    testSlice(DIRECT_BUFFER_BF, true);
+    testSlice(DIRECT_BUFFER_BF, false);
+    testDirectConstantCopy(DIRECT_BUFFER_BF);
+    testDirectReusedCopy(DIRECT_BUFFER_BF);
+    testSerializable(DIRECT_BUFFER_BF, true);
+    testSerializable(DIRECT_BUFFER_BF, false);
+  }
+
+  @Test
+  public void testDirectByteBufferCopyAlwaysMaterializesToHeap() throws 
Exception {
+    // For constant (non-reused) direct ByteBuffers, copy() must return a new 
Binary
+    // rather than 'this', because the direct memory can be freed 
independently.
+    byte[] data = testString.getBytes(UTF8);
+    ByteBuffer direct = ByteBuffer.allocateDirect(data.length);
+    direct.put(data);
+    direct.flip();
+
+    Binary binary = Binary.fromConstantByteBuffer(direct);
+    Binary copy = binary.copy();
+
+    // The copy must NOT be the same object, even though the binary is constant
+    assertNotSame("copy() of a direct ByteBuffer-backed constant Binary must 
not return 'this'", binary, copy);
+    assertArrayEquals(data, copy.getBytes());
+    assertArrayEquals(data, copy.getBytesUnsafe());
+  }
+
+  @Test
+  public void testDirectByteBufferCopyIsIndependentOfOriginalBuffer() throws 
Exception {
+    // Verify the copied Binary is independent of the original direct 
ByteBuffer.
+    // Simulates the scenario where direct memory is overwritten after copy.
+    byte[] data = testString.getBytes(UTF8);
+    ByteBuffer direct = ByteBuffer.allocateDirect(data.length);
+    direct.put(data);
+    direct.flip();
+
+    Binary binary = Binary.fromReusedByteBuffer(direct);
+    Binary copy = binary.copy();
+
+    // Overwrite the direct buffer content to simulate memory reuse / free
+    direct.clear();
+    for (int i = 0; i < data.length; i++) {
+      direct.put((byte) 0);
+    }
+
+    // The copy should still hold the original data
+    assertArrayEquals(data, copy.getBytes());
+    assertArrayEquals(data, copy.getBytesUnsafe());
+  }
+
+  @Test
+  public void testHeapByteBufferConstantCopyReturnsSame() throws Exception {
+    // For heap-backed constant ByteBuffers, copy() should return 'this' 
(existing behavior)
+    byte[] data = testString.getBytes(UTF8);
+    ByteBuffer heap = ByteBuffer.wrap(data);
+
+    Binary binary = Binary.fromConstantByteBuffer(heap);
+    Binary copy = binary.copy();
+
+    assertSame("copy() of a heap ByteBuffer-backed constant Binary should 
return 'this'", binary, copy);
+  }
+
   @Test
   public void testEqualityMethods() throws Exception {
     Binary bin1 = Binary.fromConstantByteArray("alice".getBytes(), 1, 3);
@@ -226,6 +313,56 @@ public class TestBinary {
     assertArrayEquals(testString.getBytes(UTF8), copy.copy().getBytes());
   }
 
+  /**
+   * Tests copy() on a constant (non-reused) direct ByteBuffer-backed Binary.
+   * Unlike heap-backed binaries, copy() must return a new object because the 
direct
+   * memory can be freed independently.
+   */
+  private void testDirectConstantCopy(BinaryFactory bf) throws Exception {
+    BinaryAndOriginal bao = bf.get(testString.getBytes(UTF8), false);
+    assertEquals(false, bao.binary.isBackingBytesReused());
+
+    assertArrayEquals(testString.getBytes(UTF8), bao.binary.getBytes());
+    assertArrayEquals(testString.getBytes(UTF8), bao.binary.getBytesUnsafe());
+    assertArrayEquals(testString.getBytes(UTF8), 
bao.binary.copy().getBytesUnsafe());
+    assertArrayEquals(testString.getBytes(UTF8), bao.binary.copy().getBytes());
+
+    bao = bf.get(testString.getBytes(UTF8), false);
+    assertEquals(false, bao.binary.isBackingBytesReused());
+
+    Binary copy = bao.binary.copy();
+
+    // Direct ByteBuffer-backed constant Binary.copy() must NOT return 'this'
+    assertNotSame(copy, bao.binary);
+    // But the data must be equal
+    assertEquals(bao.binary, copy);
+  }
+
+  /**
+   * Tests copy() on a reused direct ByteBuffer-backed Binary.
+   * The copy must be fully independent and survive mutation of the original 
buffer.
+   */
+  private void testDirectReusedCopy(BinaryFactory bf) throws Exception {
+    BinaryAndOriginal bao = bf.get(testString.getBytes(UTF8), true);
+    assertEquals(true, bao.binary.isBackingBytesReused());
+
+    assertArrayEquals(testString.getBytes(UTF8), bao.binary.getBytes());
+    assertArrayEquals(testString.getBytes(UTF8), bao.binary.getBytesUnsafe());
+    assertArrayEquals(testString.getBytes(UTF8), 
bao.binary.copy().getBytesUnsafe());
+    assertArrayEquals(testString.getBytes(UTF8), bao.binary.copy().getBytes());
+
+    bao = bf.get(testString.getBytes(UTF8), true);
+    assertEquals(true, bao.binary.isBackingBytesReused());
+
+    Binary copy = bao.binary.copy();
+    assertNotSame(copy, bao.binary);
+
+    assertArrayEquals(testString.getBytes(UTF8), copy.getBytes());
+    assertArrayEquals(testString.getBytes(UTF8), copy.getBytesUnsafe());
+    assertArrayEquals(testString.getBytes(UTF8), copy.copy().getBytesUnsafe());
+    assertArrayEquals(testString.getBytes(UTF8), copy.copy().getBytes());
+  }
+
   private void testSerializable(BinaryFactory bf, boolean reused) throws 
Exception {
     BinaryAndOriginal bao = bf.get("polygon".getBytes(UTF8), reused);
 

Reply via email to