This is an automated email from the ASF dual-hosted git repository.
agrove pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion-python.git
The following commit(s) were added to refs/heads/main by this push:
new d51c54d test: Window functions (#182)
d51c54d is described below
commit d51c54d7b0d299ad5e1a14d4801624d4345fba26
Author: Dejan Simic <[email protected]>
AuthorDate: Wed Feb 15 23:51:34 2023 +0100
test: Window functions (#182)
* Write unit tests for window functions
* Add test for date_bin function
---
datafusion/tests/test_dataframe.py | 86 +++++++++++++++++++++++++++++++++-----
datafusion/tests/test_functions.py | 21 ++++++++--
2 files changed, 93 insertions(+), 14 deletions(-)
diff --git a/datafusion/tests/test_dataframe.py b/datafusion/tests/test_dataframe.py
index 30327ee..dcab86a 100644
--- a/datafusion/tests/test_dataframe.py
+++ b/datafusion/tests/test_dataframe.py
@@ -33,8 +33,8 @@ def df():
# create a RecordBatch and a new DataFrame from it
batch = pa.RecordBatch.from_arrays(
- [pa.array([1, 2, 3]), pa.array([4, 5, 6])],
- names=["a", "b"],
+ [pa.array([1, 2, 3]), pa.array([4, 5, 6]), pa.array([8, 5, 8])],
+ names=["a", "b", "c"],
)
return ctx.create_dataframe([[batch]])
@@ -100,7 +100,7 @@ def test_sort(df):
df = df.sort(column("b").sort(ascending=False))
table = pa.Table.from_batches(df.collect())
- expected = {"a": [3, 2, 1], "b": [6, 5, 4]}
+ expected = {"a": [3, 2, 1], "b": [6, 5, 4], "c": [8, 5, 8]}
assert table.to_pydict() == expected
@@ -204,24 +204,90 @@ def test_distinct():
assert df_a.collect() == df_b.collect()
[email protected](
- reason="https://github.com/apache/arrow-datafusion-python/issues/135"
-)
-def test_window_lead(df):
+def test_window_functions(df):
df = df.select(
column("a"),
+ column("b"),
+ column("c"),
+ f.alias(
+ f.window("row_number", [], order_by=[f.order_by(column("c"))]),
+ "row",
+ ),
+ f.alias(
+ f.window("rank", [], order_by=[f.order_by(column("c"))]),
+ "rank",
+ ),
+ f.alias(
+ f.window("dense_rank", [], order_by=[f.order_by(column("c"))]),
+ "dense_rank",
+ ),
+ f.alias(
+ f.window("percent_rank", [], order_by=[f.order_by(column("c"))]),
+ "percent_rank",
+ ),
+ f.alias(
+ f.window("cume_dist", [], order_by=[f.order_by(column("b"))]),
+ "cume_dist",
+ ),
+ f.alias(
+ f.window(
+ "ntile", [literal(2)], order_by=[f.order_by(column("c"))]
+ ),
+ "ntile",
+ ),
+ f.alias(
+ f.window("lag", [column("b")], order_by=[f.order_by(column("b"))]),
+ "previous",
+ ),
f.alias(
f.window(
"lead", [column("b")], order_by=[f.order_by(column("b"))]
),
- "a_next",
+ "next",
+ ),
+ f.alias(
+ f.window(
+ "first_value",
+ [column("a")],
+ order_by=[f.order_by(column("b"))],
+ ),
+ "first_value",
+ ),
+ f.alias(
+ f.window(
+ "last_value", [column("b")], order_by=[f.order_by(column("b"))]
+ ),
+ "last_value",
+ ),
+ f.alias(
+ f.window(
+ "nth_value",
+ [column("b"), literal(2)],
+ order_by=[f.order_by(column("b"))],
+ ),
+ "2nd_value",
),
)
table = pa.Table.from_batches(df.collect())
- expected = {"a": [1, 2, 3], "a_next": [5, 6, None]}
- assert table.to_pydict() == expected
+ expected = {
+ "a": [1, 2, 3],
+ "b": [4, 5, 6],
+ "c": [8, 5, 8],
+ "row": [2, 1, 3],
+ "rank": [2, 1, 2],
+ "dense_rank": [2, 1, 2],
+ "percent_rank": [0.5, 0, 0.5],
+ "cume_dist": [0.3333333333333333, 0.6666666666666666, 1.0],
+ "ntile": [1, 1, 2],
+ "next": [5, 6, None],
+ "previous": [None, 4, 5],
+ "first_value": [1, 1, 1],
+ "last_value": [4, 5, 6],
+ "2nd_value": [None, 5, 5],
+ }
+ assert table.sort_by("a").to_pydict() == expected
def test_get_dataframe(tmp_path):
diff --git a/datafusion/tests/test_functions.py b/datafusion/tests/test_functions.py
index 76edfa2..bea5808 100644
--- a/datafusion/tests/test_functions.py
+++ b/datafusion/tests/test_functions.py
@@ -364,6 +364,11 @@ def test_temporal_functions(df):
f.datepart(literal("year"), column("d")),
f.date_trunc(literal("month"), column("d")),
f.datetrunc(literal("day"), column("d")),
+ f.date_bin(
+ literal("15 minutes"),
+ column("d"),
+ literal("2001-01-01 00:02:30"),
+ ),
f.from_unixtime(literal(1673383974)),
f.to_timestamp(literal("2023-09-07 05:06:14.523952")),
f.to_timestamp_seconds(literal("2023-09-07 05:06:14.523952")),
@@ -384,17 +389,25 @@ def test_temporal_functions(df):
type=pa.timestamp("ns"),
)
assert result.column(4) == pa.array(
- [datetime(2023, 1, 10, 20, 52, 54)] * 3, type=pa.timestamp("s")
+ [
+ datetime(2022, 12, 30, 23, 47, 30),
+ datetime(2027, 6, 25, 23, 47, 30),
+ datetime(2020, 7, 1, 23, 47, 30),
+ ],
+ type=pa.timestamp("ns"),
)
assert result.column(5) == pa.array(
- [datetime(2023, 9, 7, 5, 6, 14, 523952)] * 3, type=pa.timestamp("ns")
+ [datetime(2023, 1, 10, 20, 52, 54)] * 3, type=pa.timestamp("s")
)
assert result.column(6) == pa.array(
- [datetime(2023, 9, 7, 5, 6, 14)] * 3, type=pa.timestamp("s")
+ [datetime(2023, 9, 7, 5, 6, 14, 523952)] * 3, type=pa.timestamp("ns")
)
assert result.column(7) == pa.array(
- [datetime(2023, 9, 7, 5, 6, 14, 523000)] * 3, type=pa.timestamp("ms")
+ [datetime(2023, 9, 7, 5, 6, 14)] * 3, type=pa.timestamp("s")
)
assert result.column(8) == pa.array(
+ [datetime(2023, 9, 7, 5, 6, 14, 523000)] * 3, type=pa.timestamp("ms")
+ )
+ assert result.column(9) == pa.array(
[datetime(2023, 9, 7, 5, 6, 14, 523952)] * 3, type=pa.timestamp("us")
)