This is an automated email from the ASF dual-hosted git repository.

lzljs3620320 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/paimon.git
The following commit(s) were added to refs/heads/master by this push: new 6e7481ea95 [python] Fix string size being an integer multiple of bytes (#6225) 6e7481ea95 is described below commit 6e7481ea95bfec3fa6d89b3850d1b3653cd85084 Author: umi <55790489+discivig...@users.noreply.github.com> AuthorDate: Tue Sep 9 23:54:44 2025 +0800 [python] Fix string size being an integer multiple of bytes (#6225) --- paimon-python/pypaimon/table/row/binary_row.py | 14 ++++++++-- .../pypaimon/tests/py36/ao_read_write_test.py | 30 ++++++++++++++++++++++ paimon-python/pypaimon/tests/reader_basic_test.py | 30 ++++++++++++++++++++++ 3 files changed, 72 insertions(+), 2 deletions(-) diff --git a/paimon-python/pypaimon/table/row/binary_row.py b/paimon-python/pypaimon/table/row/binary_row.py index 468556dcb0..d52e38cf11 100644 --- a/paimon-python/pypaimon/table/row/binary_row.py +++ b/paimon-python/pypaimon/table/row/binary_row.py @@ -276,9 +276,11 @@ class BinaryRowSerializer: header_byte = 0x80 | length fixed_part[field_fixed_offset + 7] = header_byte else: + var_length = cls._round_number_of_bytes_to_nearest_word(len(value_bytes)) + var_value_bytes = value_bytes + b'\x00' * (var_length - length) offset_in_variable_part = current_variable_offset - variable_part_data.append(value_bytes) - current_variable_offset += length + variable_part_data.append(var_value_bytes) + current_variable_offset += var_length absolute_offset = fixed_part_size + offset_in_variable_part offset_and_len = (absolute_offset << 32) | length @@ -401,3 +403,11 @@ class BinaryRowSerializer: else: millis = value.hour * 3600000 + value.minute * 60000 + value.second * 1000 + value.microsecond // 1000 return struct.pack('<i', millis) + + @classmethod + def _round_number_of_bytes_to_nearest_word(cls, num_bytes: int) -> int: + remainder = num_bytes & 0x07 + if remainder == 0: + return num_bytes + else: + return num_bytes + (8 - remainder) diff --git a/paimon-python/pypaimon/tests/py36/ao_read_write_test.py 
b/paimon-python/pypaimon/tests/py36/ao_read_write_test.py index 7ae847c9f3..de174e6413 100644 --- a/paimon-python/pypaimon/tests/py36/ao_read_write_test.py +++ b/paimon-python/pypaimon/tests/py36/ao_read_write_test.py @@ -19,6 +19,11 @@ import logging import pandas as pd import pyarrow as pa +from pypaimon.schema.data_types import DataField, AtomicType + +from pypaimon.table.row.row_kind import RowKind + +from pypaimon.table.row.binary_row import BinaryRow, BinaryRowSerializer, BinaryRowDeserializer from pypaimon.api.options import Options from pypaimon.catalog.catalog_context import CatalogContext @@ -364,3 +369,28 @@ class RESTTableReadWritePy36Test(RESTCatalogBaseTest): table_read = read_builder.new_read() splits = read_builder.new_scan().plan().splits() self.assertEqual(table_read.to_arrow(splits).num_rows, total_rows) + + def test_to_bytes_with_long_string(self): + """Test serialization of strings longer than 7 bytes which require variable part storage.""" + # Create fields with a long string value + fields = [ + DataField(0, "long_string", AtomicType("STRING")), + ] + + # String longer than 7 bytes will be stored in variable part + long_string = "This is a long string that exceeds 7 bytes" + values = [long_string] + + binary_row = BinaryRow(values, fields, RowKind.INSERT) + serialized_bytes = BinaryRowSerializer.to_bytes(binary_row) + + # Verify the last 6 bytes are 0 + # This is because the variable part data is rounded to the nearest word (8 bytes) + # The last 6 bytes check is to ensure proper padding + self.assertEqual(serialized_bytes[-6:], b'\x00\x00\x00\x00\x00\x00') + self.assertEqual(serialized_bytes[20:62].decode('utf-8'), long_string) + # Deserialize to verify + deserialized_row = BinaryRowDeserializer.from_bytes(serialized_bytes, fields) + + self.assertEqual(deserialized_row.values[0], long_string) + self.assertEqual(deserialized_row.row_kind, RowKind.INSERT) diff --git a/paimon-python/pypaimon/tests/reader_basic_test.py 
b/paimon-python/pypaimon/tests/reader_basic_test.py index 445a65763f..9402ae93f7 100644 --- a/paimon-python/pypaimon/tests/reader_basic_test.py +++ b/paimon-python/pypaimon/tests/reader_basic_test.py @@ -23,6 +23,11 @@ import unittest import pandas as pd import pyarrow as pa +from pypaimon.table.row.row_kind import RowKind + +from pypaimon.table.row.binary_row import BinaryRow, BinaryRowSerializer, BinaryRowDeserializer + +from pypaimon.schema.data_types import DataField, AtomicType from pypaimon.catalog.catalog_factory import CatalogFactory from pypaimon.schema.schema import Schema @@ -192,3 +197,28 @@ class ReaderBasicTest(unittest.TestCase): actual = duckdb_con.query("SELECT * FROM duckdb_table").fetchdf() expect = pd.DataFrame(self.raw_data) pd.testing.assert_frame_equal(actual.reset_index(drop=True), expect.reset_index(drop=True)) + + def test_to_bytes_with_long_string(self): + """Test serialization of strings longer than 7 bytes which require variable part storage.""" + # Create fields with a long string value + fields = [ + DataField(0, "long_string", AtomicType("STRING")), + ] + + # String longer than 7 bytes will be stored in variable part + long_string = "This is a long string that exceeds 7 bytes" + values = [long_string] + + binary_row = BinaryRow(values, fields, RowKind.INSERT) + serialized_bytes = BinaryRowSerializer.to_bytes(binary_row) + + # Verify the last 6 bytes are 0 + # This is because the variable part data is rounded to the nearest word (8 bytes) + # The last 6 bytes check is to ensure proper padding + self.assertEqual(serialized_bytes[-6:], b'\x00\x00\x00\x00\x00\x00') + self.assertEqual(serialized_bytes[20:62].decode('utf-8'), long_string) + # Deserialize to verify + deserialized_row = BinaryRowDeserializer.from_bytes(serialized_bytes, fields) + + self.assertEqual(deserialized_row.values[0], long_string) + self.assertEqual(deserialized_row.row_kind, RowKind.INSERT)