(iceberg-python) branch main updated: Add tests and fixes for Daft integration (#381)

fokko Wed, 07 Feb 2024 05:30:43 -0800

This is an automated email from the ASF dual-hosted git repository.

fokko pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-python.git



The following commit(s) were added to refs/heads/main by this push:
     new cec051f  Add tests and fixes for Daft integration (#381)
cec051f is described below

commit cec051f230edfb584f1267e505ee218305389c11
Author: Jay Chia <[email protected]>
AuthorDate: Wed Feb 7 05:30:34 2024 -0800

    Add tests and fixes for Daft integration (#381)
    
    * Implement to_daft on Table instead of Scan
    
    * Add integration tests
    
    ---------
    
    Co-authored-by: Jay Chia <[email protected]@users.noreply.github.com>
---
 pyiceberg/table/__init__.py     | 20 ++++++++++----------
 tests/integration/test_reads.py | 22 ++++++++++++++++++++++
 2 files changed, 32 insertions(+), 10 deletions(-)

diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py
index b9d44b7..1feffc6 100644
--- a/pyiceberg/table/__init__.py
+++ b/pyiceberg/table/__init__.py
@@ -1025,6 +1025,16 @@ class Table:
         result_str = f"{table_name}(\n  
{schema_str}\n),\n{partition_str},\n{sort_order_str},\n{snapshot_str}"
         return result_str
 
+    def to_daft(self) -> daft.DataFrame:
+        """Read a Daft DataFrame lazily from this Iceberg table.
+
+        Returns:
+            daft.DataFrame: Unmaterialized Daft Dataframe created from the 
Iceberg table
+        """
+        import daft
+
+        return daft.read_iceberg(self)
+
 
 class StaticTable(Table):
     """Load a table directly from a metadata file (i.e., without using a 
catalog)."""
@@ -1382,16 +1392,6 @@ class DataScan(TableScan):
 
         return ray.data.from_arrow(self.to_arrow())
 
-    def to_daft(self) -> daft.DataFrame:
-        """Read a Daft DataFrame lazily from this Iceberg table.
-
-        Returns:
-            daft.DataFrame: Unmaterialized Daft Dataframe created from the 
Iceberg table
-        """
-        import daft
-
-        return daft.read_iceberg(self)
-
 
 class MoveOperation(Enum):
     First = 1
diff --git a/tests/integration/test_reads.py b/tests/integration/test_reads.py
index b35b348..d487a64 100644
--- a/tests/integration/test_reads.py
+++ b/tests/integration/test_reads.py
@@ -178,6 +178,28 @@ def test_pyarrow_limit(catalog: Catalog) -> None:
     assert len(full_result) == 10
 
 
[email protected]
[email protected]("ignore")
[email protected]('catalog', [pytest.lazy_fixture('catalog_hive'), 
pytest.lazy_fixture('catalog_rest')])
+def test_daft_nan(catalog: Catalog) -> None:
+    table_test_null_nan_rewritten = 
catalog.load_table("default.test_null_nan_rewritten")
+    df = table_test_null_nan_rewritten.to_daft()
+    assert df.count_rows() == 3
+    assert math.isnan(df.to_pydict()["col_numeric"][0])
+
+
[email protected]
[email protected]('catalog', [pytest.lazy_fixture('catalog_hive'), 
pytest.lazy_fixture('catalog_rest')])
+def test_daft_nan_rewritten(catalog: Catalog) -> None:
+    table_test_null_nan_rewritten = 
catalog.load_table("default.test_null_nan_rewritten")
+    df = table_test_null_nan_rewritten.to_daft()
+    df = df.where(df["col_numeric"].float.is_nan())
+    df = df.select("idx", "col_numeric")
+    assert df.count_rows() == 1
+    assert df.to_pydict()["idx"][0] == 1
+    assert math.isnan(df.to_pydict()["col_numeric"][0])
+
+
 @pytest.mark.integration
 @pytest.mark.filterwarnings("ignore")
 @pytest.mark.parametrize('catalog', [pytest.lazy_fixture('catalog_hive'), 
pytest.lazy_fixture('catalog_rest')])

(iceberg-python) branch main updated: Add tests and fixes for Daft integration (#381)

Reply via email to