This is an automated email from the ASF dual-hosted git repository.
timsaucer pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion-python.git
The following commit(s) were added to refs/heads/main by this push:
new f59dd08 feat: add `head`, `tail` methods (#915)
f59dd08 is described below
commit f59dd08bfbc0f01cc16b858465d03c3a01ba647c
Author: Ion Koutsouris <[email protected]>
AuthorDate: Mon Oct 21 23:02:49 2024 +0200
feat: add `head`, `tail` methods (#915)
* feat: add head, tail methods
* chore: add default head/tail
---
python/datafusion/dataframe.py | 25 +++++++++++++++++++++++++
python/tests/test_dataframe.py | 22 ++++++++++++++++++++++
2 files changed, 47 insertions(+)
diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py
index 3ed6d40..e4f8073 100644
--- a/python/datafusion/dataframe.py
+++ b/python/datafusion/dataframe.py
@@ -292,6 +292,31 @@ class DataFrame:
"""
return DataFrame(self.df.limit(count, offset))
+ def head(self, n: int = 5) -> DataFrame:
+ """Return a new :py:class:`DataFrame` with at most the first ``n`` rows.
+
+ Args:
+ n: Number of rows to take from the head of the DataFrame.
+
+ Returns:
+ DataFrame after limiting.
+ """
+ return DataFrame(self.df.limit(n, 0))
+
+ def tail(self, n: int = 5) -> DataFrame:
+ """Return a new :py:class:`DataFrame` with at most the last ``n`` rows.
+
+ Be aware this could be potentially expensive since the number of rows in
+ the DataFrame needs to be determined first. This is done by collecting it.
+
+ Args:
+ n: Number of rows to take from the tail of the DataFrame.
+
+ Returns:
+ DataFrame after limiting.
+ """
+ return DataFrame(self.df.limit(n, max(0, self.count() - n)))
+
def collect(self) -> list[pa.RecordBatch]:
"""Execute this :py:class:`DataFrame` and collect results into memory.
diff --git a/python/tests/test_dataframe.py b/python/tests/test_dataframe.py
index bb408c9..d73f5eb 100644
--- a/python/tests/test_dataframe.py
+++ b/python/tests/test_dataframe.py
@@ -201,6 +201,28 @@ def test_limit_with_offset(df):
assert len(result.column(1)) == 1
+def test_head(df):
+ df = df.head(1)
+
+ # execute and collect the first (and only) batch
+ result = df.collect()[0]
+
+ assert result.column(0) == pa.array([1])
+ assert result.column(1) == pa.array([4])
+ assert result.column(2) == pa.array([8])
+
+
+def test_tail(df):
+ df = df.tail(1)
+
+ # execute and collect the first (and only) batch
+ result = df.collect()[0]
+
+ assert result.column(0) == pa.array([3])
+ assert result.column(1) == pa.array([6])
+ assert result.column(2) == pa.array([8])
+
+
def test_with_column(df):
df = df.with_column("c", column("a") + column("b"))
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]