This is an automated email from the ASF dual-hosted git repository.
XiaoHongbo-Hope pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/paimon.git
The following commit(s) were added to refs/heads/master by this push:
new 9571f14d26 [python] Fix null blob write (#7901)
9571f14d26 is described below
commit 9571f14d26b9ca002c646429b72c468f56679d59
Author: XiaoHongbo <[email protected]>
AuthorDate: Wed May 20 22:11:02 2026 +0800
[python] Fix null blob write (#7901)
### Purpose
PR #7847 added null support to `BlobFormatWriter.add_element` /
`write_value` and `FormatBlobReader`, but the public `write_arrow` path
(`DataBlobWriter` → `BlobWriter` → `BlobFileWriter._to_blob`) still
rejects `None` and raises `ValueError`, so writing a batch with a NULL
inline blob fails end-to-end:
```
ValueError: Blob field value must be bytes/blob or serialized
BlobDescriptor bytes, got <class 'NoneType'>.
```
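This PR makes `BlobFileWriter._to_blob` return `None` for `None` input
instead of raising, so a NULL blob value is passed on to
`BlobFormatWriter`, which has accepted nulls since #7847.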
### Tests
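- New e2e test `DataBlobWriterTest.test_null_blob`: writes five rows, two
  of them with a NULL `blob_data`, through `write_arrow`, commits, and
  verifies that the values read back match the input, nulls included.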
---
paimon-python/pypaimon/tests/blob_table_test.py | 49 ++++++++++++++++++++++
.../pypaimon/write/writer/blob_file_writer.py | 8 +++-
2 files changed, 55 insertions(+), 2 deletions(-)
diff --git a/paimon-python/pypaimon/tests/blob_table_test.py b/paimon-python/pypaimon/tests/blob_table_test.py
index 56359c1d1f..95f89ff8de 100755
--- a/paimon-python/pypaimon/tests/blob_table_test.py
+++ b/paimon-python/pypaimon/tests/blob_table_test.py
@@ -1014,6 +1014,55 @@ class DataBlobWriterTest(unittest.TestCase):
print(
f"✅ End-to-end blob write/read test passed: wrote and read back
{len(blob_data)} blob records correctly") # noqa: E501
+ def test_null_blob(self):
+ from pypaimon import Schema
+
+ pa_schema = pa.schema([
+ ('id', pa.int32()),
+ ('name', pa.string()),
+ ('blob_data', pa.large_binary()),
+ ])
+
+ schema = Schema.from_pyarrow_schema(
+ pa_schema,
+ options={
+ 'row-tracking.enabled': 'true',
+ 'data-evolution.enabled': 'true'
+ }
+ )
+ self.catalog.create_table('test_db.blob_write_read_e2e_null', schema, False)
+ table = self.catalog.get_table('test_db.blob_write_read_e2e_null')
+
+ test_data = pa.Table.from_pydict({
+ 'id': [1, 2, 3, 4, 5],
+ 'name': ['a', 'b', 'c', 'd', 'e'],
+ 'blob_data': [
+ b'first_blob',
+ None,
+ b'third_blob',
+ None,
+ b'fifth_blob',
+ ],
+ }, schema=pa_schema)
+
+ write_builder = table.new_batch_write_builder()
+ writer = write_builder.new_write()
+ writer.write_arrow(test_data)
+
+ commit_messages = writer.prepare_commit()
+ write_builder.new_commit().commit(commit_messages)
+ writer.close()
+
+ read_builder = table.new_read_builder()
+ result = read_builder.new_read().to_arrow(read_builder.new_scan().plan().splits())
+
+ self.assertEqual(result.column('id').to_pylist(), [1, 2, 3, 4, 5])
+ self.assertEqual(result.column('name').to_pylist(), ['a', 'b', 'c', 'd', 'e'])
+ self.assertEqual(
+ result.column('blob_data').to_pylist(),
+ [b'first_blob', None, b'third_blob', None, b'fifth_blob'],
+ )
+
def test_blob_write_read_partition(self):
"""Test complete end-to-end blob functionality: write blob data and
read it back to verify correctness."""
from pypaimon import Schema
diff --git a/paimon-python/pypaimon/write/writer/blob_file_writer.py b/paimon-python/pypaimon/write/writer/blob_file_writer.py
index 2a7e9580e9..31d945a4ed 100644
--- a/paimon-python/pypaimon/write/writer/blob_file_writer.py
+++ b/paimon-python/pypaimon/write/writer/blob_file_writer.py
@@ -15,8 +15,10 @@
# specific language governing permissions and limitations
# under the License.
-import pyarrow as pa
from pathlib import Path
+from typing import Optional
+
+import pyarrow as pa
from pypaimon.write.blob_format_writer import BlobFormatWriter
from pypaimon.table.row.generic_row import GenericRow, RowKind
@@ -58,9 +60,11 @@ class BlobFileWriter:
self.writer.add_element(row)
self.row_count += 1
- def _to_blob(self, col_data) -> Blob:
+ def _to_blob(self, col_data) -> Optional[Blob]:
if hasattr(col_data, 'as_py'):
col_data = col_data.as_py()
+ if col_data is None:
+ return None
if isinstance(col_data, str):
col_data = col_data.encode('utf-8')
if isinstance(col_data, bytearray):