This is an automated email from the ASF dual-hosted git repository.
lzljs3620320 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/paimon.git
The following commit(s) were added to refs/heads/master by this push:
new a1c6f06800 [python] Light refactor: move _is_blob_file check into DataFileMeta (#7256)
a1c6f06800 is described below
commit a1c6f06800899a2df869a560f38dbb7cc1c57d80
Author: XiaoHongbo <[email protected]>
AuthorDate: Tue Feb 10 23:15:02 2026 +0800
[python] Light refactor: move _is_blob_file check into DataFileMeta (#7256)
---
paimon-python/pypaimon/manifest/schema/data_file_meta.py | 4 ++++
paimon-python/pypaimon/read/reader/field_bunch.py | 7 +------
.../pypaimon/read/scanner/data_evolution_split_generator.py | 10 +++++-----
paimon-python/pypaimon/read/scanner/split_generator.py | 5 -----
paimon-python/pypaimon/read/split_read.py | 13 ++++---------
paimon-python/pypaimon/write/file_store_commit.py | 8 ++------
6 files changed, 16 insertions(+), 31 deletions(-)
diff --git a/paimon-python/pypaimon/manifest/schema/data_file_meta.py b/paimon-python/pypaimon/manifest/schema/data_file_meta.py
index 239c63dbba..84d7db20af 100644
--- a/paimon-python/pypaimon/manifest/schema/data_file_meta.py
+++ b/paimon-python/pypaimon/manifest/schema/data_file_meta.py
@@ -158,6 +158,10 @@ class DataFileMeta:
file_path=self.file_path
)
+ @staticmethod
+ def is_blob_file(file_name: str) -> bool:
+ return file_name.endswith(".blob")
+
def assign_first_row_id(self, first_row_id: int) -> 'DataFileMeta':
"""Create a new DataFileMeta with the assigned first_row_id."""
return DataFileMeta(
diff --git a/paimon-python/pypaimon/read/reader/field_bunch.py b/paimon-python/pypaimon/read/reader/field_bunch.py
index 4ba82bd80e..0e20654b69 100644
--- a/paimon-python/pypaimon/read/reader/field_bunch.py
+++ b/paimon-python/pypaimon/read/reader/field_bunch.py
@@ -64,7 +64,7 @@ class BlobBunch(FieldBunch):
def add(self, file: DataFileMeta) -> None:
"""Add a blob file to this bunch."""
- if not self._is_blob_file(file.file_name):
+ if not DataFileMeta.is_blob_file(file.file_name):
raise ValueError("Only blob file can be added to a blob bunch.")
if file.first_row_id == self.latest_first_row_id:
@@ -113,8 +113,3 @@ class BlobBunch(FieldBunch):
def files(self) -> List[DataFileMeta]:
return self._files
-
- @staticmethod
- def _is_blob_file(file_name: str) -> bool:
- """Check if a file is a blob file based on its extension."""
- return file_name.endswith('.blob')
diff --git a/paimon-python/pypaimon/read/scanner/data_evolution_split_generator.py b/paimon-python/pypaimon/read/scanner/data_evolution_split_generator.py
index d9329af302..241966134f 100644
--- a/paimon-python/pypaimon/read/scanner/data_evolution_split_generator.py
+++ b/paimon-python/pypaimon/read/scanner/data_evolution_split_generator.py
@@ -54,7 +54,7 @@ class DataEvolutionSplitGenerator(AbstractSplitGenerator):
if manifest_entry.file.first_row_id is not None
else float('-inf')
)
- is_blob = 1 if self._is_blob_file(manifest_entry.file.file_name) else 0
+ is_blob = 1 if DataFileMeta.is_blob_file(manifest_entry.file.file_name) else 0
max_seq = manifest_entry.file.max_sequence_number
return first_row_id, is_blob, -max_seq
@@ -341,7 +341,7 @@ class DataEvolutionSplitGenerator(AbstractSplitGenerator):
split_by_row_id.append([file])
continue
- if not self._is_blob_file(file.file_name) and first_row_id != last_row_id:
+ if not DataFileMeta.is_blob_file(file.file_name) and first_row_id != last_row_id:
if current_split:
split_by_row_id.append(current_split)
@@ -382,7 +382,7 @@ class DataEvolutionSplitGenerator(AbstractSplitGenerator):
current_pos = file_end_pos
data_file_infos = []
for file in split.files:
- if self._is_blob_file(file.file_name):
+ if DataFileMeta.is_blob_file(file.file_name):
continue
file_begin_pos = current_pos
current_pos += file.row_count
@@ -402,7 +402,7 @@ class DataEvolutionSplitGenerator(AbstractSplitGenerator):
# Second pass: only blob files (data files already in shard_file_idx_map from first pass)
for file in split.files:
- if not self._is_blob_file(file.file_name):
+ if not DataFileMeta.is_blob_file(file.file_name):
continue
blob_first_row_id = file.first_row_id if file.first_row_id is not None else 0
data_file_range = None
@@ -489,7 +489,7 @@ class DataEvolutionSplitGenerator(AbstractSplitGenerator):
row_id_end = -1
for file in files:
- if not DataEvolutionSplitGenerator._is_blob_file(file.file_name):
+ if not DataFileMeta.is_blob_file(file.file_name):
if file.first_row_id is not None:
row_id_start = file.first_row_id
row_id_end = file.first_row_id + file.row_count
diff --git a/paimon-python/pypaimon/read/scanner/split_generator.py b/paimon-python/pypaimon/read/scanner/split_generator.py
index 6dab4fc12a..f4f2cebbe7 100644
--- a/paimon-python/pypaimon/read/scanner/split_generator.py
+++ b/paimon-python/pypaimon/read/scanner/split_generator.py
@@ -240,8 +240,3 @@ class AbstractSplitGenerator(ABC):
return -1, -1
# File is completely within the shard range
return None
-
- @staticmethod
- def _is_blob_file(file_name: str) -> bool:
- """Check if a file is a blob file."""
- return file_name.endswith('.blob')
diff --git a/paimon-python/pypaimon/read/split_read.py b/paimon-python/pypaimon/read/split_read.py
index 2088310aa4..5b876e9488 100644
--- a/paimon-python/pypaimon/read/split_read.py
+++ b/paimon-python/pypaimon/read/split_read.py
@@ -475,7 +475,7 @@ class DataEvolutionSplitRead(SplitRead):
# Sort files by firstRowId and then by maxSequenceNumber
def sort_key(file: DataFileMeta) -> tuple:
first_row_id = file.first_row_id if file.first_row_id is not None else float('-inf')
- is_blob = 1 if self._is_blob_file(file.file_name) else 0
+ is_blob = 1 if DataFileMeta.is_blob_file(file.file_name) else 0
max_seq = file.max_sequence_number
return (first_row_id, is_blob, -max_seq)
@@ -493,7 +493,7 @@ class DataEvolutionSplitRead(SplitRead):
split_by_row_id.append([file])
continue
- if not self._is_blob_file(file.file_name) and first_row_id != last_row_id:
+ if not DataFileMeta.is_blob_file(file.file_name) and first_row_id != last_row_id:
if current_split:
split_by_row_id.append(current_split)
if first_row_id < check_row_id_start:
@@ -541,7 +541,7 @@ class DataEvolutionSplitRead(SplitRead):
first_file = bunch.files()[0]
# Get field IDs for this bunch
- if self._is_blob_file(first_file.file_name):
+ if DataFileMeta.is_blob_file(first_file.file_name):
# For blob files, we need to get the field ID from the write columns
field_ids = [self._get_field_id_from_write_cols(first_file)]
elif first_file.write_cols:
@@ -615,7 +615,7 @@ class DataEvolutionSplitRead(SplitRead):
row_count = -1
for file in need_merge_files:
- if self._is_blob_file(file.file_name):
+ if DataFileMeta.is_blob_file(file.file_name):
field_id = self._get_field_id_from_write_cols(file)
if field_id not in blob_bunch_map:
blob_bunch_map[field_id] = BlobBunch(row_count)
@@ -650,10 +650,5 @@ class DataEvolutionSplitRead(SplitRead):
field_ids.append(SpecialFields.SEQUENCE_NUMBER.id)
return field_ids
- @staticmethod
- def _is_blob_file(file_name: str) -> bool:
- """Check if a file is a blob file based on its extension."""
- return file_name.endswith('.blob')
-
def _get_all_data_fields(self):
return SpecialFields.row_type_with_row_tracking(self.table.fields)
diff --git a/paimon-python/pypaimon/write/file_store_commit.py b/paimon-python/pypaimon/write/file_store_commit.py
index 881c9d52c3..65b5611cf8 100644
--- a/paimon-python/pypaimon/write/file_store_commit.py
+++ b/paimon-python/pypaimon/write/file_store_commit.py
@@ -25,6 +25,7 @@ from typing import Dict, List, Optional
from pypaimon.common.predicate_builder import PredicateBuilder
from pypaimon.manifest.manifest_file_manager import ManifestFileManager
from pypaimon.manifest.manifest_list_manager import ManifestListManager
+from pypaimon.manifest.schema.data_file_meta import DataFileMeta
from pypaimon.manifest.schema.manifest_entry import ManifestEntry
from pypaimon.manifest.schema.manifest_file_meta import ManifestFileMeta
from pypaimon.manifest.schema.simple_stats import SimpleStats
@@ -648,7 +649,7 @@ class FileStoreCommit:
entry.file.file_source == 0 and # APPEND file source
entry.file.first_row_id is None): # No existing first_row_id
- if self._is_blob_file(entry.file.file_name):
+ if DataFileMeta.is_blob_file(entry.file.file_name):
# Handle blob files specially
if blob_start >= start:
raise RuntimeError(
@@ -669,8 +670,3 @@ class FileStoreCommit:
row_id_assigned.append(entry)
return row_id_assigned, start
-
- @staticmethod
- def _is_blob_file(file_name: str) -> bool:
- """Check if a file is a blob file based on its extension."""
- return file_name.endswith('.blob')
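
For readers skimming the change: the refactor deletes four private _is_blob_file copies (BlobBunch, AbstractSplitGenerator, DataEvolutionSplitRead, FileStoreCommit) and routes every caller through a single static helper on DataFileMeta. A minimal sketch of the helper in use after this commit; the file names below are illustrative, not taken from the commit:

    from pypaimon.manifest.schema.data_file_meta import DataFileMeta

    # Static helper: callable without a DataFileMeta instance.
    assert DataFileMeta.is_blob_file("data-f0.blob") is True
    assert DataFileMeta.is_blob_file("data-f0.parquet") is False

    # Typical call-site pattern from the diff, e.g. in sort keys:
    #   is_blob = 1 if DataFileMeta.is_blob_file(file.file_name) else 0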