This is an automated email from the ASF dual-hosted git repository.
baunsgaard pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/main by this push:
new 3d244415da [SYSTEMDS-3741] Python API Builtin countDistinctApprox
3d244415da is described below
commit 3d244415dae9b615a8ada6c16b7deabeabd9bef8
Author: e-strauss <[email protected]>
AuthorDate: Tue Sep 3 18:45:04 2024 +0200
[SYSTEMDS-3741] Python API Builtin countDistinctApprox
Closes #2088
---
src/main/python/systemds/operator/nodes/matrix.py | 14 ++++++++++++
src/main/python/tests/matrix/test_aggregations.py | 26 +++++++++++++++++++++++
2 files changed, 40 insertions(+)
diff --git a/src/main/python/systemds/operator/nodes/matrix.py
b/src/main/python/systemds/operator/nodes/matrix.py
index a7c4202e88..07566ebfd0 100644
--- a/src/main/python/systemds/operator/nodes/matrix.py
+++ b/src/main/python/systemds/operator/nodes/matrix.py
@@ -260,6 +260,20 @@ class Matrix(OperationNode):
f"Axis has to be either 0, 1 or None, for column, row or complete
{self.operation}")
+ def countDistinctApprox(self, axis: int = None) -> 'OperationNode':
+ """Calculate the approximate number of distinct values of matrix.
+ :param axis: 0 for column-wise or 1 for row-wise aggregation; None (default) aggregates the complete matrix
+ :return: `Matrix` (axis 0 or 1) or `Scalar` (axis None) representing operation
+ """
+ if axis == 0:
+ return Matrix(self.sds_context, 'colCountDistinctApprox', [self])
+ elif axis == 1:
+ return Matrix(self.sds_context, 'rowCountDistinctApprox', [self])
+ elif axis is None:
+ return Scalar(self.sds_context, 'countDistinctApprox', [self])
+ raise ValueError(
+ f"Axis has to be either 0, 1 or None, for column, row or complete
{self.operation}")
+
def var(self, axis: int = None) -> 'OperationNode':
"""Calculate variance of matrix.
diff --git a/src/main/python/tests/matrix/test_aggregations.py
b/src/main/python/tests/matrix/test_aggregations.py
index 6313615122..8627d2547c 100644
--- a/src/main/python/tests/matrix/test_aggregations.py
+++ b/src/main/python/tests/matrix/test_aggregations.py
@@ -120,6 +120,32 @@ class TestMatrixAggFn(unittest.TestCase):
self.assertTrue(np.allclose(
self.sds.from_numpy(m2).trace().compute(), m2.trace()))
+ def test_countDistinctApprox1(self):
+ distinct = 100
+ m = np.round(np.random.random((1000, 1000))*(distinct - 1))
+ # allow an error of 1%
+ self.assertTrue(np.allclose(
+ self.sds.from_numpy(m).countDistinctApprox().compute(),
len(np.unique(m)), 1))
+
+ def test_countDistinctApprox2(self):
+ distinct = 1000
+ m = np.round(np.random.random((10000, 100))*(distinct - 1))
+ # allow an error of 1%
+ self.assertTrue(np.allclose(
+ self.sds.from_numpy(m).countDistinctApprox(0).compute(),
[len(np.unique(col))*100 for col in m.T], 10))
+
+ def test_countDistinctApprox3(self):
+ distinct = 1000
+ m = np.round(np.random.random((100, 10000))*(distinct - 1))
+ # allow an error of 1%
+ self.assertTrue(np.allclose(
+ self.sds.from_numpy(m).countDistinctApprox(1).compute(),
np.array([[len(np.unique(col))] for col in m]), 10))
+
+ def test_countDistinctApprox4(self):
+ m = np.round(np.random.random((2, 2)))
+ with self.assertRaises(ValueError):
+ self.sds.from_numpy(m).countDistinctApprox(2)
+
def test_countDistinct1(self):
self.assertTrue(np.allclose(