[arrow-datafusion-python] branch main updated: test: Window functions (#182)

agrove Wed, 15 Feb 2023 14:51:52 -0800

This is an automated email from the ASF dual-hosted git repository.

agrove pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion-python.git



The following commit(s) were added to refs/heads/main by this push:
     new d51c54d  test: Window functions (#182)
d51c54d is described below

commit d51c54d7b0d299ad5e1a14d4801624d4345fba26
Author: Dejan Simic <[email protected]>
AuthorDate: Wed Feb 15 23:51:34 2023 +0100

    test: Window functions (#182)
    
    * Write unit tests for window functions
    
    * Add test for date_bin function
---
 datafusion/tests/test_dataframe.py | 86 +++++++++++++++++++++++++++++++++-----
 datafusion/tests/test_functions.py | 21 ++++++++--
 2 files changed, 93 insertions(+), 14 deletions(-)

diff --git a/datafusion/tests/test_dataframe.py b/datafusion/tests/test_dataframe.py
index 30327ee..dcab86a 100644
--- a/datafusion/tests/test_dataframe.py
+++ b/datafusion/tests/test_dataframe.py
@@ -33,8 +33,8 @@ def df():
 
     # create a RecordBatch and a new DataFrame from it
     batch = pa.RecordBatch.from_arrays(
-        [pa.array([1, 2, 3]), pa.array([4, 5, 6])],
-        names=["a", "b"],
+        [pa.array([1, 2, 3]), pa.array([4, 5, 6]), pa.array([8, 5, 8])],
+        names=["a", "b", "c"],
     )
 
     return ctx.create_dataframe([[batch]])
@@ -100,7 +100,7 @@ def test_sort(df):
     df = df.sort(column("b").sort(ascending=False))
 
     table = pa.Table.from_batches(df.collect())
-    expected = {"a": [3, 2, 1], "b": [6, 5, 4]}
+    expected = {"a": [3, 2, 1], "b": [6, 5, 4], "c": [8, 5, 8]}
 
     assert table.to_pydict() == expected
 
@@ -204,24 +204,90 @@ def test_distinct():
     assert df_a.collect() == df_b.collect()
 
 
-@pytest.mark.skip(
-    reason="https://github.com/apache/arrow-datafusion-python/issues/135"
-)
-def test_window_lead(df):
+def test_window_functions(df):
     df = df.select(
         column("a"),
+        column("b"),
+        column("c"),
+        f.alias(
+            f.window("row_number", [], order_by=[f.order_by(column("c"))]),
+            "row",
+        ),
+        f.alias(
+            f.window("rank", [], order_by=[f.order_by(column("c"))]),
+            "rank",
+        ),
+        f.alias(
+            f.window("dense_rank", [], order_by=[f.order_by(column("c"))]),
+            "dense_rank",
+        ),
+        f.alias(
+            f.window("percent_rank", [], order_by=[f.order_by(column("c"))]),
+            "percent_rank",
+        ),
+        f.alias(
+            f.window("cume_dist", [], order_by=[f.order_by(column("b"))]),
+            "cume_dist",
+        ),
+        f.alias(
+            f.window(
+                "ntile", [literal(2)], order_by=[f.order_by(column("c"))]
+            ),
+            "ntile",
+        ),
+        f.alias(
+            f.window("lag", [column("b")], order_by=[f.order_by(column("b"))]),
+            "previous",
+        ),
         f.alias(
             f.window(
                 "lead", [column("b")], order_by=[f.order_by(column("b"))]
             ),
-            "a_next",
+            "next",
+        ),
+        f.alias(
+            f.window(
+                "first_value",
+                [column("a")],
+                order_by=[f.order_by(column("b"))],
+            ),
+            "first_value",
+        ),
+        f.alias(
+            f.window(
+                "last_value", [column("b")], order_by=[f.order_by(column("b"))]
+            ),
+            "last_value",
+        ),
+        f.alias(
+            f.window(
+                "nth_value",
+                [column("b"), literal(2)],
+                order_by=[f.order_by(column("b"))],
+            ),
+            "2nd_value",
         ),
     )
 
     table = pa.Table.from_batches(df.collect())
 
-    expected = {"a": [1, 2, 3], "a_next": [5, 6, None]}
-    assert table.to_pydict() == expected
+    expected = {
+        "a": [1, 2, 3],
+        "b": [4, 5, 6],
+        "c": [8, 5, 8],
+        "row": [2, 1, 3],
+        "rank": [2, 1, 2],
+        "dense_rank": [2, 1, 2],
+        "percent_rank": [0.5, 0, 0.5],
+        "cume_dist": [0.3333333333333333, 0.6666666666666666, 1.0],
+        "ntile": [1, 1, 2],
+        "next": [5, 6, None],
+        "previous": [None, 4, 5],
+        "first_value": [1, 1, 1],
+        "last_value": [4, 5, 6],
+        "2nd_value": [None, 5, 5],
+    }
+    assert table.sort_by("a").to_pydict() == expected
 
 
 def test_get_dataframe(tmp_path):
diff --git a/datafusion/tests/test_functions.py b/datafusion/tests/test_functions.py
index 76edfa2..bea5808 100644
--- a/datafusion/tests/test_functions.py
+++ b/datafusion/tests/test_functions.py
@@ -364,6 +364,11 @@ def test_temporal_functions(df):
         f.datepart(literal("year"), column("d")),
         f.date_trunc(literal("month"), column("d")),
         f.datetrunc(literal("day"), column("d")),
+        f.date_bin(
+            literal("15 minutes"),
+            column("d"),
+            literal("2001-01-01 00:02:30"),
+        ),
         f.from_unixtime(literal(1673383974)),
         f.to_timestamp(literal("2023-09-07 05:06:14.523952")),
         f.to_timestamp_seconds(literal("2023-09-07 05:06:14.523952")),
@@ -384,17 +389,25 @@ def test_temporal_functions(df):
         type=pa.timestamp("ns"),
     )
     assert result.column(4) == pa.array(
-        [datetime(2023, 1, 10, 20, 52, 54)] * 3, type=pa.timestamp("s")
+        [
+            datetime(2022, 12, 30, 23, 47, 30),
+            datetime(2027, 6, 25, 23, 47, 30),
+            datetime(2020, 7, 1, 23, 47, 30),
+        ],
+        type=pa.timestamp("ns"),
     )
     assert result.column(5) == pa.array(
-        [datetime(2023, 9, 7, 5, 6, 14, 523952)] * 3, type=pa.timestamp("ns")
+        [datetime(2023, 1, 10, 20, 52, 54)] * 3, type=pa.timestamp("s")
     )
     assert result.column(6) == pa.array(
-        [datetime(2023, 9, 7, 5, 6, 14)] * 3, type=pa.timestamp("s")
+        [datetime(2023, 9, 7, 5, 6, 14, 523952)] * 3, type=pa.timestamp("ns")
     )
     assert result.column(7) == pa.array(
-        [datetime(2023, 9, 7, 5, 6, 14, 523000)] * 3, type=pa.timestamp("ms")
+        [datetime(2023, 9, 7, 5, 6, 14)] * 3, type=pa.timestamp("s")
     )
     assert result.column(8) == pa.array(
+        [datetime(2023, 9, 7, 5, 6, 14, 523000)] * 3, type=pa.timestamp("ms")
+    )
+    assert result.column(9) == pa.array(
         [datetime(2023, 9, 7, 5, 6, 14, 523952)] * 3, type=pa.timestamp("us")
     )

[arrow-datafusion-python] branch main updated: test: Window functions (#182)

Reply via email to