This is an automated email from the ASF dual-hosted git repository.
agrove pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion-python.git
The following commit(s) were added to refs/heads/master by this push:
new d083d97 [DataFrame] - Add DataFrame::distinct binding (#34)
d083d97 is described below
commit d083d97daf8597e4b42052fd8d366c419e3e2c43
Author: Francis Du <[email protected]>
AuthorDate: Thu Sep 8 11:07:09 2022 +0800
[DataFrame] - Add DataFrame::distinct binding (#34)
* feat: add DataFrame::distinct binding
* fix: fmt
* fix: python linter
---
.gitignore | 1 +
datafusion/tests/test_dataframe.py | 24 ++++++++++++++++++++++++
src/dataframe.rs | 8 +++++++-
3 files changed, 32 insertions(+), 1 deletion(-)
diff --git a/.gitignore b/.gitignore
index 5e6b18b..2e03daf 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,6 @@
target
Cargo.lock
+/venv
.idea
# Byte-compiled / optimized / DLL files
diff --git a/datafusion/tests/test_dataframe.py b/datafusion/tests/test_dataframe.py
index bed0a91..30506a7 100644
--- a/datafusion/tests/test_dataframe.py
+++ b/datafusion/tests/test_dataframe.py
@@ -156,6 +156,30 @@ def test_join():
assert table.to_pydict() == expected
+def test_distinct():
+ ctx = SessionContext()
+
+ batch = pa.RecordBatch.from_arrays(
+ [pa.array([1, 2, 3, 1, 2, 3]), pa.array([4, 5, 6, 4, 5, 6])],
+ names=["a", "b"],
+ )
+ df_a = (
+ ctx.create_dataframe([[batch]])
+ .distinct()
+ .sort(column("a").sort(ascending=True))
+ )
+
+ batch = pa.RecordBatch.from_arrays(
+ [pa.array([1, 2, 3]), pa.array([4, 5, 6])],
+ names=["a", "b"],
+ )
+ df_b = ctx.create_dataframe([[batch]]).sort(
+ column("a").sort(ascending=True)
+ )
+
+ assert df_a.collect() == df_b.collect()
+
+
def test_window_lead(df):
df = df.select(
column("a"),
diff --git a/src/dataframe.rs b/src/dataframe.rs
index 80963f7..87f38c1 100644
--- a/src/dataframe.rs
+++ b/src/dataframe.rs
@@ -129,6 +129,12 @@ impl PyDataFrame {
Ok(pretty::print_batches(&batches)?)
}
+ /// Filter out duplicate rows
+ fn distinct(&self) -> PyResult<Self> {
+ let df = self.df.distinct()?;
+ Ok(Self::new(df))
+ }
+
fn join(
&self,
right: PyDataFrame,
@@ -147,7 +153,7 @@ impl PyDataFrame {
"The join type {} does not exist or is not implemented",
how
))
- .into())
+ .into());
}
};