This is an automated email from the ASF dual-hosted git repository.
fokko pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/iceberg.git
The following commit(s) were added to refs/heads/master by this push:
new 8770503fd4 Python: Implement `to_pandas` (#6254)
8770503fd4 is described below
commit 8770503fd4051c44b90b7ad7f57cc81efb2cb6b2
Author: Đặng Minh Dũng <[email protected]>
AuthorDate: Mon Dec 12 02:30:20 2022 +0700
Python: Implement `to_pandas` (#6254)
---
python/poetry.lock | 54 ++++++++++++++++++++++++++++++++++++--
python/pyiceberg/table/__init__.py | 12 +++++++--
python/pyproject.toml | 7 +++++
3 files changed, 69 insertions(+), 4 deletions(-)
diff --git a/python/poetry.lock b/python/poetry.lock
index 0a54622ac8..7c249e2858 100644
--- a/python/poetry.lock
+++ b/python/poetry.lock
@@ -500,6 +500,26 @@ python-versions = ">=3.6"
[package.dependencies]
pyparsing = ">=2.0.2,<3.0.5 || >3.0.5"
+[[package]]
+name = "pandas"
+version = "1.5.2"
+description = "Powerful data structures for data analysis, time series, and statistics"
+category = "main"
+optional = true
+python-versions = ">=3.8"
+
+[package.dependencies]
+numpy = [
+ {version = ">=1.20.3", markers = "python_version < \"3.10\""},
+ {version = ">=1.21.0", markers = "python_version >= \"3.10\""},
+ {version = ">=1.23.2", markers = "python_version >= \"3.11\""},
+]
+python-dateutil = ">=2.8.1"
+pytz = ">=2020.1"
+
+[package.extras]
+test = ["hypothesis (>=5.5.3)", "pytest (>=6.0)", "pytest-xdist (>=1.31)"]
+
[[package]]
name = "pep517"
version = "0.13.0"
@@ -667,7 +687,7 @@ python-versions = "*"
name = "pytz"
version = "2022.6"
description = "World timezone definitions, modern and historical"
-category = "dev"
+category = "main"
optional = false
python-versions = "*"
@@ -948,6 +968,7 @@ cffi = ["cffi (>=1.11)"]
duckdb = ["duckdb", "pyarrow"]
glue = ["boto3"]
hive = ["thrift"]
+pandas = ["pandas", "pyarrow"]
pyarrow = ["pyarrow"]
s3fs = ["s3fs"]
snappy = ["python-snappy"]
@@ -955,7 +976,7 @@ snappy = ["python-snappy"]
[metadata]
lock-version = "1.1"
python-versions = "^3.8"
-content-hash = "8fa466512e3f74dd8f985ed4c85d2698407bffda8f23858700f5089e904f3982"
+content-hash = "0a9c1eb50886e25d628f1c78c4dc236a2cb28225340aa853eb91aab7bdc96a9b"
[metadata.files]
aiobotocore = [
@@ -1622,6 +1643,35 @@ packaging = [
{file = "packaging-21.3-py3-none-any.whl", hash = "sha256:ef103e05f519cdc783ae24ea4e2e0f508a9c99b2d4969652eed6a2e1ea5bd522"},
{file = "packaging-21.3.tar.gz", hash = "sha256:dd47c42927d89ab911e606518907cc2d3a1f38bbd026385970643f9c5b8ecfeb"},
]
+pandas = [
+ {file = "pandas-1.5.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:e9dbacd22555c2d47f262ef96bb4e30880e5956169741400af8b306bbb24a273"},
+ {file = "pandas-1.5.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e2b83abd292194f350bb04e188f9379d36b8dfac24dd445d5c87575f3beaf789"},
+ {file = "pandas-1.5.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2552bffc808641c6eb471e55aa6899fa002ac94e4eebfa9ec058649122db5824"},
+ {file = "pandas-1.5.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1fc87eac0541a7d24648a001d553406f4256e744d92df1df8ebe41829a915028"},
+ {file = "pandas-1.5.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d0d8fd58df5d17ddb8c72a5075d87cd80d71b542571b5f78178fb067fa4e9c72"},
+ {file = "pandas-1.5.2-cp310-cp310-win_amd64.whl", hash = "sha256:4aed257c7484d01c9a194d9a94758b37d3d751849c05a0050c087a358c41ad1f"},
+ {file = "pandas-1.5.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:375262829c8c700c3e7cbb336810b94367b9c4889818bbd910d0ecb4e45dc261"},
+ {file = "pandas-1.5.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:cc3cd122bea268998b79adebbb8343b735a5511ec14efb70a39e7acbc11ccbdc"},
+ {file = "pandas-1.5.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:b4f5a82afa4f1ff482ab8ded2ae8a453a2cdfde2001567b3ca24a4c5c5ca0db3"},
+ {file = "pandas-1.5.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8092a368d3eb7116e270525329a3e5c15ae796ccdf7ccb17839a73b4f5084a39"},
+ {file = "pandas-1.5.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f6257b314fc14958f8122779e5a1557517b0f8e500cfb2bd53fa1f75a8ad0af2"},
+ {file = "pandas-1.5.2-cp311-cp311-win_amd64.whl", hash = "sha256:82ae615826da838a8e5d4d630eb70c993ab8636f0eff13cb28aafc4291b632b5"},
+ {file = "pandas-1.5.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:457d8c3d42314ff47cc2d6c54f8fc0d23954b47977b2caed09cd9635cb75388b"},
+ {file = "pandas-1.5.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:c009a92e81ce836212ce7aa98b219db7961a8b95999b97af566b8dc8c33e9519"},
+ {file = "pandas-1.5.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:71f510b0efe1629bf2f7c0eadb1ff0b9cf611e87b73cd017e6b7d6adb40e2b3a"},
+ {file = "pandas-1.5.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a40dd1e9f22e01e66ed534d6a965eb99546b41d4d52dbdb66565608fde48203f"},
+ {file = "pandas-1.5.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5ae7e989f12628f41e804847a8cc2943d362440132919a69429d4dea1f164da0"},
+ {file = "pandas-1.5.2-cp38-cp38-win32.whl", hash = "sha256:530948945e7b6c95e6fa7aa4be2be25764af53fba93fe76d912e35d1c9ee46f5"},
+ {file = "pandas-1.5.2-cp38-cp38-win_amd64.whl", hash = "sha256:73f219fdc1777cf3c45fde7f0708732ec6950dfc598afc50588d0d285fddaefc"},
+ {file = "pandas-1.5.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:9608000a5a45f663be6af5c70c3cbe634fa19243e720eb380c0d378666bc7702"},
+ {file = "pandas-1.5.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:315e19a3e5c2ab47a67467fc0362cb36c7c60a93b6457f675d7d9615edad2ebe"},
+ {file = "pandas-1.5.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:e18bc3764cbb5e118be139b3b611bc3fbc5d3be42a7e827d1096f46087b395eb"},
+ {file = "pandas-1.5.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0183cb04a057cc38fde5244909fca9826d5d57c4a5b7390c0cc3fa7acd9fa883"},
+ {file = "pandas-1.5.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:344021ed3e639e017b452aa8f5f6bf38a8806f5852e217a7594417fb9bbfa00e"},
+ {file = "pandas-1.5.2-cp39-cp39-win32.whl", hash = "sha256:e7469271497960b6a781eaa930cba8af400dd59b62ec9ca2f4d31a19f2f91090"},
+ {file = "pandas-1.5.2-cp39-cp39-win_amd64.whl", hash = "sha256:c218796d59d5abd8780170c937b812c9637e84c32f8271bbf9845970f8c1351f"},
+ {file = "pandas-1.5.2.tar.gz", hash = "sha256:220b98d15cee0b2cd839a6358bd1f273d0356bf964c1a1aeb32d47db0215488b"},
+]
pep517 = [
{file = "pep517-0.13.0-py3-none-any.whl", hash = "sha256:4ba4446d80aed5b5eac6509ade100bff3e7943a8489de249654a5ae9b33ee35b"},
{file = "pep517-0.13.0.tar.gz", hash = "sha256:ae69927c5c172be1add9203726d4b84cf3ebad1edcd5f71fcdc746e66e829f59"},
diff --git a/python/pyiceberg/table/__init__.py b/python/pyiceberg/table/__init__.py
index 904b234f3a..5dbc2f22ae 100644
--- a/python/pyiceberg/table/__init__.py
+++ b/python/pyiceberg/table/__init__.py
@@ -59,6 +59,7 @@ from pyiceberg.typedef import (
from pyiceberg.types import StructType
if TYPE_CHECKING:
+ import pandas as pd
import pyarrow as pa
from duckdb import DuckDBPyConnection
@@ -211,7 +212,11 @@ class TableScan(Generic[S], ABC):
...
@abstractmethod
- def to_arrow(self) -> pa.table:
+ def to_arrow(self) -> pa.Table:
+ ...
+
+ @abstractmethod
+ def to_pandas(self, **kwargs: Any) -> pd.DataFrame:
...
def update(self: S, **overrides: Any) -> S:
@@ -338,7 +343,7 @@ class DataScan(TableScan["DataScan"]):
yield from (FileScanTask(file) for file in matching_partition_files)
- def to_arrow(self) -> pa.table:
+ def to_arrow(self) -> pa.Table:
from pyiceberg.io.pyarrow import PyArrowFileIO, expression_to_pyarrow, schema_to_pyarrow
warnings.warn(
@@ -380,6 +385,9 @@ class DataScan(TableScan["DataScan"]):
return ds.to_table(filter=pyarrow_filter, columns=columns)
+ def to_pandas(self, **kwargs: Any) -> pd.DataFrame:
+ return self.to_arrow().to_pandas(**kwargs)
+
def to_duckdb(self, table_name: str, connection: Optional[DuckDBPyConnection] = None) -> DuckDBPyConnection:
import duckdb
diff --git a/python/pyproject.toml b/python/pyproject.toml
index 5ebe8e196e..5f46a26859 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -60,6 +60,8 @@ zstandard = "0.19.0"
pyarrow = { version = "10.0.1", optional = true }
+pandas = { version = "1.5.2", optional = true }
+
duckdb = { version = "0.6.0", optional = true }
python-snappy = { version = "0.6.1", optional = true }
@@ -89,6 +91,7 @@ build-backend = "poetry.core.masonry.api"
[tool.poetry.extras]
pyarrow = ["pyarrow"]
+pandas = ["pandas", "pyarrow"]
duckdb = ["duckdb", "pyarrow"]
snappy = ["python-snappy"]
hive = ["thrift"]
@@ -127,6 +130,10 @@ disallow_untyped_defs = true
module = "pyarrow.*"
ignore_missing_imports = true
+[[tool.mypy.overrides]]
+module = "pandas.*"
+ignore_missing_imports = true
+
[[tool.mypy.overrides]]
module = "snappy.*"
ignore_missing_imports = true