This is an automated email from the ASF dual-hosted git repository.
JingsongLi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/paimon.git
The following commit(s) were added to refs/heads/master by this push:
new 6f942d558f [python][daft] Use external paths for native Parquet reads (#7979)
6f942d558f is described below
commit 6f942d558feaf0c0195580d9553c56726234438d
Author: QuakeWang <[email protected]>
AuthorDate: Tue May 26 18:14:50 2026 +0800
[python][daft] Use external paths for native Parquet reads (#7979)
---
paimon-python/pypaimon/daft/daft_datasource.py | 7 ++++-
.../pypaimon/tests/daft/daft_datasource_test.py | 33 ++++++++++++++++++++++
2 files changed, 39 insertions(+), 1 deletion(-)
diff --git a/paimon-python/pypaimon/daft/daft_datasource.py b/paimon-python/pypaimon/daft/daft_datasource.py
index de4b22bda1..457fae375c 100644
--- a/paimon-python/pypaimon/daft/daft_datasource.py
+++ b/paimon-python/pypaimon/daft/daft_datasource.py
@@ -38,6 +38,7 @@ if TYPE_CHECKING:
from collections.abc import AsyncIterator
from pypaimon.common.predicate import Predicate
+ from pypaimon.manifest.schema.data_file_meta import DataFileMeta
from pypaimon.read.table_read import TableRead
from pypaimon.read.split import Split
from pypaimon.table.file_store_table import FileStoreTable
@@ -285,7 +286,7 @@ class PaimonDataSource(DataSource):
pv = pv_cache[pv_key]
for data_file in split.files:
- file_uri = self._build_file_uri(data_file.file_path)
+ file_uri = self._build_file_uri(self._data_file_path(data_file))
yield DataSourceTask.parquet(
path=file_uri,
schema=self._schema,
@@ -330,6 +331,10 @@ class PaimonDataSource(DataSource):
return f"{self._warehouse_scheme}://{file_path}"
return f"file://{file_path}"
+ @staticmethod
+ def _data_file_path(data_file: DataFileMeta) -> str:
+ return data_file.external_path if data_file.external_path else data_file.file_path
+
def _build_partition_values(self, split: Split) -> daft.recordbatch.RecordBatch | None:
"""Build a single-row RecordBatch encoding the partition values for a split."""
if not self._table.partition_keys:
diff --git a/paimon-python/pypaimon/tests/daft/daft_datasource_test.py b/paimon-python/pypaimon/tests/daft/daft_datasource_test.py
index 1ad6e7024b..fd49adf242 100644
--- a/paimon-python/pypaimon/tests/daft/daft_datasource_test.py
+++ b/paimon-python/pypaimon/tests/daft/daft_datasource_test.py
@@ -16,6 +16,8 @@
# limitations under the License.
################################################################################
import unittest
+from dataclasses import dataclass
+from typing import Optional
import pytest
@@ -25,6 +27,12 @@ daft = pytest.importorskip("daft")
from pypaimon.daft.daft_datasource import PaimonDataSource
+@dataclass
+class _DataFile:
+ file_path: str
+ external_path: Optional[str] = None
+
+
def _build_uri(warehouse_scheme: str, file_path: str) -> str:
class _Stub:
pass
@@ -63,5 +71,30 @@ class BuildFileUriTest(unittest.TestCase):
)
+class DataFilePathTest(unittest.TestCase):
+
+ def test_prefers_external_path(self):
+ data_file = _DataFile(
+ file_path="file:///warehouse/db.db/tbl/bucket-0/data.parquet",
+ external_path="s3://external-bucket/data/db.db/tbl/bucket-0/data.parquet",
+ )
+
+ self.assertEqual(
+ PaimonDataSource._data_file_path(data_file),
+ "s3://external-bucket/data/db.db/tbl/bucket-0/data.parquet",
+ )
+
+ def test_falls_back_to_file_path(self):
+ data_file = _DataFile(
+ file_path="file:///warehouse/db.db/tbl/bucket-0/data.parquet",
+ external_path=None,
+ )
+
+ self.assertEqual(
+ PaimonDataSource._data_file_path(data_file),
+ "file:///warehouse/db.db/tbl/bucket-0/data.parquet",
+ )
+
+
if __name__ == "__main__":
unittest.main()