This is an automated email from the ASF dual-hosted git repository.
fokko pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-python.git
The following commit(s) were added to refs/heads/main by this push:
new cc0fd86 Add Daft examples and code into PyIceberg docs and Table
(#355)
cc0fd86 is described below
commit cc0fd860a5d4a48edac83e539674735fb3e86015
Author: Jay Chia <[email protected]>
AuthorDate: Tue Feb 6 07:04:38 2024 -0800
Add Daft examples and code into PyIceberg docs and Table (#355)
* Add Daft integration docs
* Add to_daft() implementation
* nit: slight amendment to show call
* Proper dependency handling for poetry
* lints
* lint
---------
Co-authored-by: Jay Chia <[email protected]@users.noreply.github.com>
Co-authored-by: Fokko Driesprong <[email protected]>
---
Makefile | 2 +-
mkdocs/docs/api.md | 53 ++++++++++++++++++++++++++++
mkdocs/docs/index.md | 1 +
poetry.lock | 86 ++++++++++++++++++++++++++++++++++++++++++++-
pyiceberg/table/__init__.py | 11 ++++++
pyproject.toml | 6 ++++
6 files changed, 157 insertions(+), 2 deletions(-)
diff --git a/Makefile b/Makefile
index 4226614..c3e816e 100644
--- a/Makefile
+++ b/Makefile
@@ -19,7 +19,7 @@ install-poetry:
pip install poetry==1.7.1
install-dependencies:
- poetry install -E pyarrow -E hive -E s3fs -E glue -E adlfs -E duckdb -E
ray -E sql-postgres -E gcsfs -E sql-sqlite
+ poetry install -E pyarrow -E hive -E s3fs -E glue -E adlfs -E duckdb -E
ray -E sql-postgres -E gcsfs -E sql-sqlite -E daft
install: | install-poetry install-dependencies
diff --git a/mkdocs/docs/api.md b/mkdocs/docs/api.md
index 650d391..5380192 100644
--- a/mkdocs/docs/api.md
+++ b/mkdocs/docs/api.md
@@ -636,3 +636,56 @@ print(ray_dataset.take(2))
},
]
```
+
+### Daft
+
+PyIceberg interfaces closely with Daft Dataframes (see also: [Daft integration
with
Iceberg](https://www.getdaft.io/projects/docs/en/latest/user_guide/integrations/iceberg.html))
which provide a full, lazily optimized query engine interface on top of
PyIceberg tables.
+
+<!-- prettier-ignore-start -->
+
+!!! note "Requirements"
+ This requires [Daft to be installed](index.md).
+
+<!-- prettier-ignore-end -->
+
+A table can be read easily into a Daft Dataframe:
+
+```python
+df = table.to_daft() # equivalent to `daft.read_iceberg(table)`
+df = df.where(df["trip_distance"] >= 10.0)
+df = df.select("VendorID", "tpep_pickup_datetime", "tpep_dropoff_datetime")
+```
+
+This returns a Daft Dataframe which is lazily materialized. Printing `df` will
display the schema:
+
+```
+╭──────────┬───────────────────────────────┬───────────────────────────────╮
+│ VendorID ┆ tpep_pickup_datetime ┆ tpep_dropoff_datetime │
+│ --- ┆ --- ┆ --- │
+│ Int64 ┆ Timestamp(Microseconds, None) ┆ Timestamp(Microseconds, None) │
+╰──────────┴───────────────────────────────┴───────────────────────────────╯
+
+(No data to display: Dataframe not materialized)
+```
+
+We can execute the Dataframe to preview the first few rows of the query with
`df.show()`.
+
+The query is optimized to take advantage of Iceberg features such as
hidden partitioning and file-level statistics for efficient reads.
+
+```python
+df.show(2)
+```
+
+```
+╭──────────┬───────────────────────────────┬───────────────────────────────╮
+│ VendorID ┆ tpep_pickup_datetime ┆ tpep_dropoff_datetime │
+│ --- ┆ --- ┆ --- │
+│ Int64 ┆ Timestamp(Microseconds, None) ┆ Timestamp(Microseconds, None) │
+╞══════════╪═══════════════════════════════╪═══════════════════════════════╡
+│ 2 ┆ 2008-12-31T23:23:50.000000 ┆ 2009-01-01T00:34:31.000000 │
+├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
+│ 2 ┆ 2008-12-31T23:05:03.000000 ┆ 2009-01-01T16:10:18.000000 │
+╰──────────┴───────────────────────────────┴───────────────────────────────╯
+
+(Showing first 2 rows)
+```
diff --git a/mkdocs/docs/index.md b/mkdocs/docs/index.md
index f1c1fa9..ffe3ab4 100644
--- a/mkdocs/docs/index.md
+++ b/mkdocs/docs/index.md
@@ -51,6 +51,7 @@ You can mix and match optional dependencies depending on your
needs:
| pandas | Installs both PyArrow and Pandas
|
| duckdb | Installs both PyArrow and DuckDB
|
| ray | Installs PyArrow, Pandas, and Ray
|
+| daft | Installs Daft
|
| s3fs | S3FS as a FileIO implementation to interact with the object
store |
| adlfs | ADLFS as a FileIO implementation to interact with the object
store |
| snappy | Support for snappy Avro compression
|
diff --git a/poetry.lock b/poetry.lock
index e6e679a..8e2241a 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1157,6 +1157,10 @@ files = [
{file = "fsspec-2023.12.2.tar.gz", hash =
"sha256:8548d39e8810b59c38014934f6b31e57f40c1b20f911f4cc2b85389c7e9bf0cb"},
]
+[package.dependencies]
+aiohttp = {version = "<4.0.0a0 || >4.0.0a0,<4.0.0a1 || >4.0.0a1", optional =
true, markers = "extra == \"http\""}
+requests = {version = "*", optional = true, markers = "extra == \"http\""}
+
[package.extras]
abfs = ["adlfs"]
adl = ["adlfs"]
@@ -1205,6 +1209,38 @@ requests = "*"
crc = ["crcmod"]
gcsfuse = ["fusepy"]
+[[package]]
+name = "getdaft"
+version = "0.2.12"
+description = "A Distributed DataFrame library for large scale complex data
processing."
+optional = true
+python-versions = ">=3.7"
+files = [
+ {file = "getdaft-0.2.12-cp37-abi3-macosx_10_7_x86_64.whl", hash =
"sha256:ec3d9693e6d859a7604e7597695e51383bdd4a0100fa892a030510c81dc7a214"},
+ {file = "getdaft-0.2.12-cp37-abi3-macosx_11_0_arm64.whl", hash =
"sha256:44f12f57b8dfd35c6a8ff9da749f4243c9558403e0c65720298f65079436e505"},
+ {file =
"getdaft-0.2.12-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl",
hash =
"sha256:565713fa393970b3a891e9c02e8fb20ebce9b7b1278a00299e5159b71a76fac7"},
+ {file =
"getdaft-0.2.12-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash
= "sha256:ed061e93c55cb757061eac48675f4d9bb8960ed3ee1b2df64ade848946875f2f"},
+ {file = "getdaft-0.2.12-cp37-abi3-win_amd64.whl", hash =
"sha256:7970903d4986acbf0a99e812d71fd12c9b63911cb7b601085354864e032dfa94"},
+ {file = "getdaft-0.2.12.tar.gz", hash =
"sha256:9a8698cf04b8e6ed11c1cc960498d438748e8d1a316b4da2e7faed62e90f00b8"},
+]
+
+[package.dependencies]
+fsspec = {version = "*", extras = ["http"]}
+psutil = "*"
+pyarrow = ">=6.0.1"
+tqdm = "*"
+typing-extensions = {version = ">=4.0.0", markers = "python_version <
\"3.10\""}
+
+[package.extras]
+all = ["getdaft[aws,azure,gcp,iceberg,numpy,pandas,ray]"]
+aws = ["s3fs"]
+azure = ["adlfs"]
+gcp = ["gcsfs"]
+iceberg = ["packaging", "pyiceberg (>=0.4.0)"]
+numpy = ["numpy"]
+pandas = ["pandas"]
+ray = ["packaging", "ray[client,data] (>=2.0.0)"]
+
[[package]]
name = "google-api-core"
version = "2.15.0"
@@ -2601,6 +2637,34 @@ files = [
{file = "protobuf-4.25.2.tar.gz", hash =
"sha256:fe599e175cb347efc8ee524bcd4b902d11f7262c0e569ececcb89995c15f0a5e"},
]
+[[package]]
+name = "psutil"
+version = "5.9.8"
+description = "Cross-platform lib for process and system monitoring in Python."
+optional = true
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*"
+files = [
+ {file = "psutil-5.9.8-cp27-cp27m-macosx_10_9_x86_64.whl", hash =
"sha256:26bd09967ae00920df88e0352a91cff1a78f8d69b3ecabbfe733610c0af486c8"},
+ {file = "psutil-5.9.8-cp27-cp27m-manylinux2010_i686.whl", hash =
"sha256:05806de88103b25903dff19bb6692bd2e714ccf9e668d050d144012055cbca73"},
+ {file = "psutil-5.9.8-cp27-cp27m-manylinux2010_x86_64.whl", hash =
"sha256:611052c4bc70432ec770d5d54f64206aa7203a101ec273a0cd82418c86503bb7"},
+ {file = "psutil-5.9.8-cp27-cp27mu-manylinux2010_i686.whl", hash =
"sha256:50187900d73c1381ba1454cf40308c2bf6f34268518b3f36a9b663ca87e65e36"},
+ {file = "psutil-5.9.8-cp27-cp27mu-manylinux2010_x86_64.whl", hash =
"sha256:02615ed8c5ea222323408ceba16c60e99c3f91639b07da6373fb7e6539abc56d"},
+ {file = "psutil-5.9.8-cp27-none-win32.whl", hash =
"sha256:36f435891adb138ed3c9e58c6af3e2e6ca9ac2f365efe1f9cfef2794e6c93b4e"},
+ {file = "psutil-5.9.8-cp27-none-win_amd64.whl", hash =
"sha256:bd1184ceb3f87651a67b2708d4c3338e9b10c5df903f2e3776b62303b26cb631"},
+ {file = "psutil-5.9.8-cp36-abi3-macosx_10_9_x86_64.whl", hash =
"sha256:aee678c8720623dc456fa20659af736241f575d79429a0e5e9cf88ae0605cc81"},
+ {file =
"psutil-5.9.8-cp36-abi3-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl",
hash =
"sha256:8cb6403ce6d8e047495a701dc7c5bd788add903f8986d523e3e20b98b733e421"},
+ {file =
"psutil-5.9.8-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl",
hash =
"sha256:d06016f7f8625a1825ba3732081d77c94589dca78b7a3fc072194851e88461a4"},
+ {file = "psutil-5.9.8-cp36-cp36m-win32.whl", hash =
"sha256:7d79560ad97af658a0f6adfef8b834b53f64746d45b403f225b85c5c2c140eee"},
+ {file = "psutil-5.9.8-cp36-cp36m-win_amd64.whl", hash =
"sha256:27cc40c3493bb10de1be4b3f07cae4c010ce715290a5be22b98493509c6299e2"},
+ {file = "psutil-5.9.8-cp37-abi3-win32.whl", hash =
"sha256:bc56c2a1b0d15aa3eaa5a60c9f3f8e3e565303b465dbf57a1b730e7a2b9844e0"},
+ {file = "psutil-5.9.8-cp37-abi3-win_amd64.whl", hash =
"sha256:8db4c1b57507eef143a15a6884ca10f7c73876cdf5d51e713151c1236a0e68cf"},
+ {file = "psutil-5.9.8-cp38-abi3-macosx_11_0_arm64.whl", hash =
"sha256:d16bbddf0693323b8c6123dd804100241da461e41d6e332fb0ba6058f630f8c8"},
+ {file = "psutil-5.9.8.tar.gz", hash =
"sha256:6be126e3225486dff286a8fb9a06246a5253f4c7c53b475ea5f5ac934e64194c"},
+]
+
+[package.extras]
+test = ["enum34", "ipaddress", "mock", "pywin32", "wmi"]
+
[[package]]
name = "psycopg2-binary"
version = "2.9.9"
@@ -3908,6 +3972,26 @@ files = [
{file = "tomli-2.0.1.tar.gz", hash =
"sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"},
]
+[[package]]
+name = "tqdm"
+version = "4.66.1"
+description = "Fast, Extensible Progress Meter"
+optional = true
+python-versions = ">=3.7"
+files = [
+ {file = "tqdm-4.66.1-py3-none-any.whl", hash =
"sha256:d302b3c5b53d47bce91fea46679d9c3c6508cf6332229aa1e7d8653723793386"},
+ {file = "tqdm-4.66.1.tar.gz", hash =
"sha256:d88e651f9db8d8551a62556d3cff9e3034274ca5d66e93197cf2490e2dcb69c7"},
+]
+
+[package.dependencies]
+colorama = {version = "*", markers = "platform_system == \"Windows\""}
+
+[package.extras]
+dev = ["pytest (>=6)", "pytest-cov", "pytest-timeout", "pytest-xdist"]
+notebook = ["ipywidgets (>=6)"]
+slack = ["slack-sdk"]
+telegram = ["requests"]
+
[[package]]
name = "typing-extensions"
version = "4.9.0"
@@ -4271,6 +4355,7 @@ cffi = ["cffi (>=1.11)"]
[extras]
adlfs = ["adlfs"]
+daft = ["getdaft"]
duckdb = ["duckdb", "pyarrow"]
dynamodb = ["boto3"]
gcsfs = ["gcsfs"]
@@ -4288,4 +4373,3 @@ zstandard = ["zstandard"]
[metadata]
lock-version = "2.0"
python-versions = "^3.8"
-content-hash =
"2d2b08bb4e99f940d5a428593438ea5fe28eaec7962b3171ff340195463379d8"
diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py
index 7692bb0..b9d44b7 100644
--- a/pyiceberg/table/__init__.py
+++ b/pyiceberg/table/__init__.py
@@ -120,6 +120,7 @@ from pyiceberg.utils.concurrent import ExecutorFactory
from pyiceberg.utils.datetime import datetime_to_millis
if TYPE_CHECKING:
+ import daft
import pandas as pd
import pyarrow as pa
import ray
@@ -1381,6 +1382,16 @@ class DataScan(TableScan):
return ray.data.from_arrow(self.to_arrow())
+ def to_daft(self) -> daft.DataFrame:
+ """Read a Daft DataFrame lazily from this Iceberg table.
+
+ Returns:
+ daft.DataFrame: Unmaterialized Daft DataFrame created from the
Iceberg table
+ """
+ import daft
+
+ return daft.read_iceberg(self)
+
class MoveOperation(Enum):
First = 1
diff --git a/pyproject.toml b/pyproject.toml
index dcc91fb..a741ee4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -70,6 +70,7 @@ adlfs = { version = ">=2023.1.0,<2024.3.0", optional = true }
gcsfs = { version = ">=2023.1.0,<2024.1.0", optional = true }
psycopg2-binary = { version = ">=2.9.6", optional = true }
sqlalchemy = { version = "^2.0.18", optional = true }
+getdaft = { version = ">=0.2.12", optional = true }
[tool.poetry.dev-dependencies]
pytest = "7.4.4"
@@ -105,6 +106,7 @@ pyarrow = ["pyarrow"]
pandas = ["pandas", "pyarrow"]
duckdb = ["duckdb", "pyarrow"]
ray = ["ray", "pyarrow", "pandas"]
+daft = ["getdaft"]
snappy = ["python-snappy"]
hive = ["thrift"]
s3fs = ["s3fs"]
@@ -263,6 +265,10 @@ ignore_missing_imports = true
module = "ray.*"
ignore_missing_imports = true
+[[tool.mypy.overrides]]
+module = "daft.*"
+ignore_missing_imports = true
+
[[tool.mypy.overrides]]
module = "pyparsing.*"
ignore_missing_imports = true