(paimon) branch master updated: [python] Fix string size being an integer multiple of bytes (#6225)

lzljs3620320 Tue, 09 Sep 2025 08:54:57 -0700

This is an automated email from the ASF dual-hosted git repository.

lzljs3620320 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/paimon.git



The following commit(s) were added to refs/heads/master by this push:
     new 6e7481ea95 [python] Fix string size being an integer multiple of bytes (#6225)
6e7481ea95 is described below

commit 6e7481ea95bfec3fa6d89b3850d1b3653cd85084
Author: umi <55790489+discivig...@users.noreply.github.com>
AuthorDate: Tue Sep 9 23:54:44 2025 +0800

    [python] Fix string size being an integer multiple of bytes (#6225)
---
 paimon-python/pypaimon/table/row/binary_row.py     | 14 ++++++++--
 .../pypaimon/tests/py36/ao_read_write_test.py      | 30 ++++++++++++++++++++++
 paimon-python/pypaimon/tests/reader_basic_test.py  | 30 ++++++++++++++++++++++
 3 files changed, 72 insertions(+), 2 deletions(-)

diff --git a/paimon-python/pypaimon/table/row/binary_row.py b/paimon-python/pypaimon/table/row/binary_row.py
index 468556dcb0..d52e38cf11 100644
--- a/paimon-python/pypaimon/table/row/binary_row.py
+++ b/paimon-python/pypaimon/table/row/binary_row.py
@@ -276,9 +276,11 @@ class BinaryRowSerializer:
                     header_byte = 0x80 | length
                     fixed_part[field_fixed_offset + 7] = header_byte
                 else:
+                    var_length = cls._round_number_of_bytes_to_nearest_word(len(value_bytes))
+                    var_value_bytes = value_bytes + b'\x00' * (var_length - length)
                     offset_in_variable_part = current_variable_offset
-                    variable_part_data.append(value_bytes)
-                    current_variable_offset += length
+                    variable_part_data.append(var_value_bytes)
+                    current_variable_offset += var_length
 
                     absolute_offset = fixed_part_size + offset_in_variable_part
                     offset_and_len = (absolute_offset << 32) | length
@@ -401,3 +403,11 @@ class BinaryRowSerializer:
         else:
             millis = value.hour * 3600000 + value.minute * 60000 + value.second * 1000 + value.microsecond // 1000
         return struct.pack('<i', millis)
+
+    @classmethod
+    def _round_number_of_bytes_to_nearest_word(cls, num_bytes: int) -> int:
+        remainder = num_bytes & 0x07
+        if remainder == 0:
+            return num_bytes
+        else:
+            return num_bytes + (8 - remainder)
diff --git a/paimon-python/pypaimon/tests/py36/ao_read_write_test.py b/paimon-python/pypaimon/tests/py36/ao_read_write_test.py
index 7ae847c9f3..de174e6413 100644
--- a/paimon-python/pypaimon/tests/py36/ao_read_write_test.py
+++ b/paimon-python/pypaimon/tests/py36/ao_read_write_test.py
@@ -19,6 +19,11 @@ import logging
 
 import pandas as pd
 import pyarrow as pa
+from pypaimon.schema.data_types import DataField, AtomicType
+
+from pypaimon.table.row.row_kind import RowKind
+
+from pypaimon.table.row.binary_row import BinaryRow, BinaryRowSerializer, BinaryRowDeserializer
 
 from pypaimon.api.options import Options
 from pypaimon.catalog.catalog_context import CatalogContext
@@ -364,3 +369,28 @@ class RESTTableReadWritePy36Test(RESTCatalogBaseTest):
         table_read = read_builder.new_read()
         splits = read_builder.new_scan().plan().splits()
         self.assertEqual(table_read.to_arrow(splits).num_rows, total_rows)
+
+    def test_to_bytes_with_long_string(self):
+        """Test serialization of strings longer than 7 bytes which require variable part storage."""
+        # Create fields with a long string value
+        fields = [
+            DataField(0, "long_string", AtomicType("STRING")),
+        ]
+
+        # String longer than 7 bytes will be stored in variable part
+        long_string = "This is a long string that exceeds 7 bytes"
+        values = [long_string]
+
+        binary_row = BinaryRow(values, fields, RowKind.INSERT)
+        serialized_bytes = BinaryRowSerializer.to_bytes(binary_row)
+
+        # Verify the last 6 bytes are 0
+        # This is because the variable part data is rounded to the nearest word (8 bytes)
+        # The last 6 bytes check is to ensure proper padding
+        self.assertEqual(serialized_bytes[-6:], b'\x00\x00\x00\x00\x00\x00')
+        self.assertEqual(serialized_bytes[20:62].decode('utf-8'), long_string)
+        # Deserialize to verify
+        deserialized_row = BinaryRowDeserializer.from_bytes(serialized_bytes, fields)
+
+        self.assertEqual(deserialized_row.values[0], long_string)
+        self.assertEqual(deserialized_row.row_kind, RowKind.INSERT)
diff --git a/paimon-python/pypaimon/tests/reader_basic_test.py b/paimon-python/pypaimon/tests/reader_basic_test.py
index 445a65763f..9402ae93f7 100644
--- a/paimon-python/pypaimon/tests/reader_basic_test.py
+++ b/paimon-python/pypaimon/tests/reader_basic_test.py
@@ -23,6 +23,11 @@ import unittest
 
 import pandas as pd
 import pyarrow as pa
+from pypaimon.table.row.row_kind import RowKind
+
+from pypaimon.table.row.binary_row import BinaryRow, BinaryRowSerializer, BinaryRowDeserializer
+
+from pypaimon.schema.data_types import DataField, AtomicType
 
 from pypaimon.catalog.catalog_factory import CatalogFactory
 from pypaimon.schema.schema import Schema
@@ -192,3 +197,28 @@ class ReaderBasicTest(unittest.TestCase):
         actual = duckdb_con.query("SELECT * FROM duckdb_table").fetchdf()
         expect = pd.DataFrame(self.raw_data)
         pd.testing.assert_frame_equal(actual.reset_index(drop=True), expect.reset_index(drop=True))
+
+    def test_to_bytes_with_long_string(self):
+        """Test serialization of strings longer than 7 bytes which require variable part storage."""
+        # Create fields with a long string value
+        fields = [
+            DataField(0, "long_string", AtomicType("STRING")),
+        ]
+
+        # String longer than 7 bytes will be stored in variable part
+        long_string = "This is a long string that exceeds 7 bytes"
+        values = [long_string]
+
+        binary_row = BinaryRow(values, fields, RowKind.INSERT)
+        serialized_bytes = BinaryRowSerializer.to_bytes(binary_row)
+
+        # Verify the last 6 bytes are 0
+        # This is because the variable part data is rounded to the nearest word (8 bytes)
+        # The last 6 bytes check is to ensure proper padding
+        self.assertEqual(serialized_bytes[-6:], b'\x00\x00\x00\x00\x00\x00')
+        self.assertEqual(serialized_bytes[20:62].decode('utf-8'), long_string)
+        # Deserialize to verify
+        deserialized_row = BinaryRowDeserializer.from_bytes(serialized_bytes, fields)
+
+        self.assertEqual(deserialized_row.values[0], long_string)
+        self.assertEqual(deserialized_row.row_kind, RowKind.INSERT)

(paimon) branch master updated: [python] Fix string size being an integer multiple of bytes (#6225)

Reply via email to