This is an automated email from the ASF dual-hosted git repository.
mboehm7 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/main by this push:
new d78165b3ae [SYSTEMDS-3863] New robust scaling built-in function
d78165b3ae is described below
commit d78165b3ae89091ba421d375b5eec8bc916a36ce
Author: KilianBat <[email protected]>
AuthorDate: Sat Dec 27 14:43:26 2025 +0100
[SYSTEMDS-3863] New robust scaling built-in function
Closes #2278.
---
scripts/builtin/scaleRobust.dml | 60 ++++++++++++++++
scripts/builtin/scaleRobustApply.dml | 48 +++++++++++++
src/main/java/org/apache/sysds/api/DMLScript.java | 2 +-
.../java/org/apache/sysds/common/Builtins.java | 2 +
.../builtin/part2/BuiltinScaleRobustTest.java | 83 ++++++++++++++++++++++
src/test/scripts/functions/builtin/scaleRobust.R | 42 +++++++++++
src/test/scripts/functions/builtin/scaleRobust.dml | 24 +++++++
src/test/scripts/functions/builtin/scaleRobust.py | 38 ++++++++++
8 files changed, 298 insertions(+), 1 deletion(-)
diff --git a/scripts/builtin/scaleRobust.dml b/scripts/builtin/scaleRobust.dml
new file mode 100644
index 0000000000..ce309fabe5
--- /dev/null
+++ b/scripts/builtin/scaleRobust.dml
@@ -0,0 +1,60 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# Robust scaling using median and IQR (Interquartile Range)
+# Resistant to outliers by centering with the median and scaling with IQR.
+#
+# INPUT:
+#
-------------------------------------------------------------------------------------
+# X Input feature matrix of shape n-by-m
+#
-------------------------------------------------------------------------------------
+#
+# OUTPUT:
+#
-------------------------------------------------------------------------------------
+# Y Scaled output matrix of shape n-by-m
+# med Column medians (Q2) of shape 1-by-m
+# q1 Column first quantiles (Q1) of shape 1-by-m
+# q3 Column first quantiles (Q3) of shape 1-by-m
+#
-------------------------------------------------------------------------------------
+
+m_scaleRobust = function(Matrix[Double] X)
+ return (Matrix[Double] Y, Matrix[Double] med, Matrix[Double] q1,
Matrix[Double] q3)
+{
+ n = nrow(X)
+ m = ncol(X)
+
+ med = matrix(0.0, rows=1, cols=m)
+ q1 = matrix(0.0, rows=1, cols=m)
+ q3 = matrix(0.0, rows=1, cols=m)
+
+ # Define quantile probabilities once, outside the loop
+ q_probs = as.matrix(list(0.25, 0.5, 0.75));
+
+ # Loop over columns to compute quantiles
+ parfor (j in 1:m) {
+ q = quantile(X[,j], q_probs)
+ med[1,j] = q[2,1]
+ q1[1,j] = q[1,1]
+ q3[1,j] = q[3,1]
+ }
+
+ Y = scaleRobustApply(X, med, q1, q3);
+}
diff --git a/scripts/builtin/scaleRobustApply.dml
b/scripts/builtin/scaleRobustApply.dml
new file mode 100644
index 0000000000..11461731b3
--- /dev/null
+++ b/scripts/builtin/scaleRobustApply.dml
@@ -0,0 +1,48 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# Apply robust scaling using precomputed medians and IQRs
+#
+# INPUT:
+#
-------------------------------------------------------------------------------------
+# X Input feature matrix of shape n-by-m
+# med Column medians (Q2) of shape 1-by-m
+# q1 Column first quantiles (Q1) of shape 1-by-m
+# q3 Column first quantiles (Q3) of shape 1-by-m
+#
-------------------------------------------------------------------------------------
+#
+# OUTPUT:
+#
-------------------------------------------------------------------------------------
+# Y Scaled output matrix of shape n-by-m
+#
-------------------------------------------------------------------------------------
+
+m_scaleRobustApply = function(Matrix[Double] X, Matrix[Double] med,
Matrix[Double] q1, Matrix[Double] q3)
+ return (Matrix[Double] Y)
+{
+ iqr = q3 - q1
+
+ # Ensure robust scaling is safe by replacing invalid IQRs
+ iqr = replace(target=iqr, pattern=0, replacement=1)
+ iqr = replace(target=iqr, pattern=NaN, replacement=1)
+
+ # Apply robust transformation
+ Y = (X - med) / iqr
+}
diff --git a/src/main/java/org/apache/sysds/api/DMLScript.java
b/src/main/java/org/apache/sysds/api/DMLScript.java
index 65805b5c2e..e6becd83d1 100644
--- a/src/main/java/org/apache/sysds/api/DMLScript.java
+++ b/src/main/java/org/apache/sysds/api/DMLScript.java
@@ -327,7 +327,7 @@ public class DMLScript
Map<String, String> argVals = dmlOptions.argVals;
DML_FILE_PATH_ANTLR_PARSER = dmlOptions.filePath;
-
+
//Step 3: invoke dml script
printInvocationInfo(fileOrScript, fnameOptConfig,
argVals);
execute(dmlScriptStr, fnameOptConfig, argVals, args);
diff --git a/src/main/java/org/apache/sysds/common/Builtins.java
b/src/main/java/org/apache/sysds/common/Builtins.java
index 4feab311c7..dc1f23b83f 100644
--- a/src/main/java/org/apache/sysds/common/Builtins.java
+++ b/src/main/java/org/apache/sysds/common/Builtins.java
@@ -392,6 +392,8 @@ public enum Builtins {
RMEMPTY("removeEmpty", false, true),
SCALE("scale", true, false),
SCALEAPPLY("scaleApply", true, false),
+ SCALEROBUST("scaleRobust", true, false),
+ SCALEROBUSTAPPLY("scaleRobustApply", true, false),
SCALE_MINMAX("scaleMinMax", true, false),
TIME("time", false),
TOKENIZE("tokenize", false, true),
diff --git
a/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinScaleRobustTest.java
b/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinScaleRobustTest.java
new file mode 100644
index 0000000000..630b149aae
--- /dev/null
+++
b/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinScaleRobustTest.java
@@ -0,0 +1,83 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.test.functions.builtin.part2;
+
+import java.util.HashMap;
+
+import org.junit.Test;
+
+import org.apache.sysds.common.Types.ExecMode;
+import org.apache.sysds.common.Types.ExecType;
+import org.apache.sysds.runtime.matrix.data.MatrixValue.CellIndex;
+import org.apache.sysds.test.AutomatedTestBase;
+import org.apache.sysds.test.TestConfiguration;
+import org.apache.sysds.test.TestUtils;
+
+public class BuiltinScaleRobustTest extends AutomatedTestBase {
+ private final static String TEST_NAME = "scaleRobust";
+ private final static String TEST_DIR = "functions/builtin/";
+ private final static String TEST_CLASS_DIR = TEST_DIR +
BuiltinScaleRobustTest.class.getSimpleName() + "/";
+ private final static double eps = 1e-10;
+ private final static int rows = 70;
+ private final static int cols = 50;
+
+
+ @Override
+ public void setUp() {
+ addTestConfiguration(TEST_NAME, new
TestConfiguration(TEST_CLASS_DIR, TEST_NAME, new String[]{"B"}));
+ }
+
+ @Test
+ public void testScaleRobustDenseCP() {
+ runTest(false, ExecType.CP);
+ }
+
+ private void runTest(boolean sparse, ExecType et) {
+ ExecMode old = setExecMode(et);
+ try {
+ loadTestConfiguration(getTestConfiguration(TEST_NAME));
+ double sparsity = sparse ? 0.1 : 0.9;
+ String HOME = SCRIPT_DIR + TEST_DIR;
+ fullDMLScriptName = HOME + TEST_NAME + ".dml";
+ fullRScriptName = HOME + TEST_NAME + ".R";
+ programArgs = new String[]{"-args", input("A"),
output("B")};
+ programArgs = new String[]{"-exec", "singlenode",
"-args", input("A"), output("B")};
+ rCmd = "Rscript " + fullRScriptName + " " + inputDir()
+ " " + expectedDir();
+
+ double[][] A = getRandomMatrix(rows, cols, -10, 10,
sparsity, 7);
+ writeInputMatrixWithMTD("A", A, true);
+
+ // Run DML
+ runTest(true, false, null, -1);
+
+ // Run R
+ runRScript(true);
+
+ // Read matrices and compare
+ HashMap<CellIndex, Double> dmlfile =
readDMLMatrixFromOutputDir("B");
+ HashMap<CellIndex, Double> rfile =
readRMatrixFromExpectedDir("B");
+ TestUtils.compareMatrices(dmlfile, rfile, eps, "DML",
"R");
+ } catch (Exception e) {
+ throw new RuntimeException(e);
+ } finally {
+ resetExecMode(old);
+ }
+ }
+}
diff --git a/src/test/scripts/functions/builtin/scaleRobust.R
b/src/test/scripts/functions/builtin/scaleRobust.R
new file mode 100644
index 0000000000..553555cb39
--- /dev/null
+++ b/src/test/scripts/functions/builtin/scaleRobust.R
@@ -0,0 +1,42 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+library("Matrix")
+
+args <- commandArgs(TRUE)
+options(digits=22)
+
+
+X = as.matrix(readMM(paste(args[1], "A.mtx", sep="")))
+colnames(X) = colnames(X, do.NULL=FALSE, prefix="C")
+Y = X
+
+for (j in 1:ncol(X)) {
+ col = X[, j]
+ med = quantile(col, probs=0.5, type=1, names=FALSE, na.rm=FALSE)
+ q1 = quantile(col, probs=0.25, type=1, names=FALSE, na.rm=FALSE)
+ q3 = quantile(col, probs=0.75, type=1, names=FALSE, na.rm=FALSE)
+ iqr = q3 - q1
+ if (iqr == 0 || is.nan(iqr)) iqr = 1
+ Y[, j] = (col - med) / iqr
+}
+
+writeMM(as(Y, "CsparseMatrix"), paste(args[2], "B", sep=""))
diff --git a/src/test/scripts/functions/builtin/scaleRobust.dml
b/src/test/scripts/functions/builtin/scaleRobust.dml
new file mode 100644
index 0000000000..23dcd5f97a
--- /dev/null
+++ b/src/test/scripts/functions/builtin/scaleRobust.dml
@@ -0,0 +1,24 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+X = read($1);
+[Y, med, iqr] = scaleRobust(X);
+write(Y, $2);
diff --git a/src/test/scripts/functions/builtin/scaleRobust.py
b/src/test/scripts/functions/builtin/scaleRobust.py
new file mode 100644
index 0000000000..37d13f41e6
--- /dev/null
+++ b/src/test/scripts/functions/builtin/scaleRobust.py
@@ -0,0 +1,38 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+import sys
+import numpy as np
+from scipy.io import mmread, mmwrite
+from scipy.sparse import csc_matrix
+from sklearn.preprocessing import RobustScaler
+
+if __name__ == "__main__":
+ input_path = sys.argv[1] + "A.mtx"
+ output_path = sys.argv[2] + "B"
+
+ X = mmread(input_path).toarray()
+
+ # Apply RobustScaler
+ scaler = RobustScaler()
+ Y = scaler.fit_transform(X)
+
+ mmwrite(output_path, csc_matrix(Y))