This is an automated email from the ASF dual-hosted git repository.

JingsongLi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/paimon.git


The following commit(s) were added to refs/heads/master by this push:
     new 6f942d558f [python][daft] Use external paths for native Parquet reads (#7979)
6f942d558f is described below

commit 6f942d558feaf0c0195580d9553c56726234438d
Author: QuakeWang <[email protected]>
AuthorDate: Tue May 26 18:14:50 2026 +0800

    [python][daft] Use external paths for native Parquet reads (#7979)
---
 paimon-python/pypaimon/daft/daft_datasource.py     |  7 ++++-
 .../pypaimon/tests/daft/daft_datasource_test.py    | 33 ++++++++++++++++++++++
 2 files changed, 39 insertions(+), 1 deletion(-)

diff --git a/paimon-python/pypaimon/daft/daft_datasource.py b/paimon-python/pypaimon/daft/daft_datasource.py
index de4b22bda1..457fae375c 100644
--- a/paimon-python/pypaimon/daft/daft_datasource.py
+++ b/paimon-python/pypaimon/daft/daft_datasource.py
@@ -38,6 +38,7 @@ if TYPE_CHECKING:
     from collections.abc import AsyncIterator
 
     from pypaimon.common.predicate import Predicate
+    from pypaimon.manifest.schema.data_file_meta import DataFileMeta
     from pypaimon.read.table_read import TableRead
     from pypaimon.read.split import Split
     from pypaimon.table.file_store_table import FileStoreTable
@@ -285,7 +286,7 @@ class PaimonDataSource(DataSource):
                     pv = pv_cache[pv_key]
 
                 for data_file in split.files:
-                    file_uri = self._build_file_uri(data_file.file_path)
+                    file_uri = self._build_file_uri(self._data_file_path(data_file))
                     yield DataSourceTask.parquet(
                         path=file_uri,
                         schema=self._schema,
@@ -330,6 +331,10 @@ class PaimonDataSource(DataSource):
             return f"{self._warehouse_scheme}://{file_path}"
         return f"file://{file_path}"
 
+    @staticmethod
+    def _data_file_path(data_file: DataFileMeta) -> str:
+        return data_file.external_path if data_file.external_path else data_file.file_path
+
     def _build_partition_values(self, split: Split) -> daft.recordbatch.RecordBatch | None:
         """Build a single-row RecordBatch encoding the partition values for a split."""
         if not self._table.partition_keys:
diff --git a/paimon-python/pypaimon/tests/daft/daft_datasource_test.py b/paimon-python/pypaimon/tests/daft/daft_datasource_test.py
index 1ad6e7024b..fd49adf242 100644
--- a/paimon-python/pypaimon/tests/daft/daft_datasource_test.py
+++ b/paimon-python/pypaimon/tests/daft/daft_datasource_test.py
@@ -16,6 +16,8 @@
 # limitations under the License.
 ################################################################################
 import unittest
+from dataclasses import dataclass
+from typing import Optional
 
 import pytest
 
@@ -25,6 +27,12 @@ daft = pytest.importorskip("daft")
 from pypaimon.daft.daft_datasource import PaimonDataSource
 
 
+@dataclass
+class _DataFile:
+    file_path: str
+    external_path: Optional[str] = None
+
+
 def _build_uri(warehouse_scheme: str, file_path: str) -> str:
     class _Stub:
         pass
@@ -63,5 +71,30 @@ class BuildFileUriTest(unittest.TestCase):
         )
 
 
+class DataFilePathTest(unittest.TestCase):
+
+    def test_prefers_external_path(self):
+        data_file = _DataFile(
+            file_path="file:///warehouse/db.db/tbl/bucket-0/data.parquet",
+            external_path="s3://external-bucket/data/db.db/tbl/bucket-0/data.parquet",
+        )
+
+        self.assertEqual(
+            PaimonDataSource._data_file_path(data_file),
+            "s3://external-bucket/data/db.db/tbl/bucket-0/data.parquet",
+        )
+
+    def test_falls_back_to_file_path(self):
+        data_file = _DataFile(
+            file_path="file:///warehouse/db.db/tbl/bucket-0/data.parquet",
+            external_path=None,
+        )
+
+        self.assertEqual(
+            PaimonDataSource._data_file_path(data_file),
+            "file:///warehouse/db.db/tbl/bucket-0/data.parquet",
+        )
+
+
 if __name__ == "__main__":
     unittest.main()

Reply via email to