This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git


The following commit(s) were added to refs/heads/main by this push:
     new ffe8c2f767 [SYSTEMDS-3151] New detectMissingType builtin function
ffe8c2f767 is described below

commit ffe8c2f767c353c54c7613166592476e1dfbda61
Author: saimonx2 <[email protected]>
AuthorDate: Sun Mar 29 12:14:00 2026 +0200

    [SYSTEMDS-3151] New detectMissingType builtin function
    
    Closes #2410.
---
 .../staging/detectMissingType/create_dataset.dml   | 114 +++++++++++++
 .../detectMissingType/detectMissingType.dml        | 181 +++++++++++++++++++++
 .../detectMissingType/testDetectMissingType.dml    |  68 ++++++++
 3 files changed, 363 insertions(+)

diff --git a/scripts/staging/detectMissingType/create_dataset.dml 
b/scripts/staging/detectMissingType/create_dataset.dml
new file mode 100644
index 0000000000..2852b48cef
--- /dev/null
+++ b/scripts/staging/detectMissingType/create_dataset.dml
@@ -0,0 +1,114 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+apply_mcar = function(Matrix[Double] col_in, Double p_missing, Integer seed) 
return (Matrix[Double] col_out) {
+    n = nrow(col_in);
+    is_missing = (rand(rows=n, cols=1, min=0, max=1, seed=seed) < p_missing);
+
+    # create mask for columns
+    nan_mask = replace(target=is_missing * 1.0, pattern=1, replacement=NaN);
+    nan_mask = replace(target=nan_mask, pattern=0, replacement=1);
+
+    # apply mask
+    col_out = col_in * nan_mask;
+}
+
+apply_mar = function(Matrix[Double] col_in, Matrix[Double] ref1, 
Matrix[Double] ref2, Integer seed, Double base_prob, Double strength) return 
(Matrix[Double] col_out) {
+    
+    ref1_std = scale(ref1, TRUE, TRUE);
+    ref2_std = scale(ref2, TRUE, TRUE);
+    
+    # gen prob
+    logit_base = log(base_prob / (1.0 - base_prob));
+    logit = logit_base + strength * (1.2 * ref1_std - 0.8 * ref2_std);
+    p_missing = 1 / (1 + exp(-logit));
+    p_missing = max(min(p_missing, 0.85), 0.10); 
+    
+    # gen mask
+    is_missing = (rand(rows=nrow(col_in), cols=1, seed=seed) < p_missing);
+    nan_mask = replace(target=is_missing * 1.0, pattern=1, replacement=NaN);
+    nan_mask = replace(target=nan_mask, pattern=0, replacement=1);
+    
+    # apply mask
+    col_out = col_in * nan_mask;
+}
+
+apply_mnar = function(Matrix[Double] col_in, Double p_missing, Integer seed) 
return (Matrix[Double] col_out) {
+
+    y_norm = (col_in - min(col_in)) / (max(col_in) - min(col_in));
+    
+    # gen missing
+    is_missing = (rand(rows=nrow(col_in), cols=1, seed=seed) < (y_norm * 
p_missing));
+    
+    # apply Nan
+    col_out = col_in + (ifelse(is_missing, NaN, 0));
+}
+
+get_dataset = function(Integer n, Matrix[Double] GroundTruth = matrix(0,0,0)) 
return (Matrix[Double] X, Matrix[Double] GT_out) {
+    
+    # create groundTruth
+    if (nrow(GroundTruth) == 0) {
+        # default groundTruth
+        GT_out = matrix("0 0 0 1 1 2 2 3 3", rows=9, cols=1);
+    } else {
+        GT_out = GroundTruth;
+    }
+
+    num_cols = nrow(GT_out);
+    X = matrix(0, rows=n, cols=0);
+    
+    # gen anchor columns
+    anchor1 = rand(rows=n, cols=1, pdf="normal", seed=42);
+    anchor2 = rand(rows=n, cols=1, pdf="normal", seed=43);
+
+    # gen columns
+    for (i in 1:num_cols) {
+        code = as.scalar(GT_out[i, 1]);
+        col_base = matrix(0, rows=n, cols=1);
+
+        if (i <= 2) {
+            if (i == 1) {
+                col_base = anchor1;
+            } else {
+                col_base = anchor2;
+            }
+        } else {
+            # gen noise & correlation
+            noise = rand(rows=n, cols=1, seed=i*123);
+            col_base = 0.5 * anchor1 + 0.3 * anchor2 + noise * 0.2;
+        }
+
+        # apply missing type
+        if (code == 0) { 
+            X = cbind(X, col_base); 
+        }
+        else if (code == 1) { 
+            X = cbind(X, apply_mcar(col_base, 0.20, i)); 
+        }
+        else if (code == 2) { 
+            X = cbind(X, apply_mar(col_base, anchor1, anchor2, i, 0.15, 3.0)); 
+        }
+        else if (code == 3) { 
+            X = cbind(X, apply_mnar(col_base, 0.25, i)); 
+        }
+    }
+    print("Generation dataset completed");
+}
\ No newline at end of file
diff --git a/scripts/staging/detectMissingType/detectMissingType.dml 
b/scripts/staging/detectMissingType/detectMissingType.dml
new file mode 100644
index 0000000000..1f522d5fc8
--- /dev/null
+++ b/scripts/staging/detectMissingType/detectMissingType.dml
@@ -0,0 +1,181 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+
+# INPUT:
+# 
------------------------------------------------------------------------------------
+# X            Matrix[Double] (n x p)
+#              Data matrix with possible missing values (NA)
+# alpha        Double
+#              Significance level for MCAR chi-square tests (default = 0.05)
+# auc_thresh   Double
+#              Threshold on AUC to distinguish MAR vs MNAR (default = 0.70)
+# 
------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# 
------------------------------------------------------------------------------------
+# DetectedCodes   Matrix[Double] (p x 1)
+#                 Missingness type detected for each column:
+#                   0 = no missing values
+#                   1 = MCAR
+#                   2 = MAR
+#                   3 = MNAR
+# 
------------------------------------------------------------------------------------
+#
+# The following function analyzes every column in the dataset to identify the 
missingness' mechanism
+# For each column having missing values, at first it verifies if the missing 
value depends on other variables (MCAR test via chi-squared)
+# If MCAR gets excluded, the function considers if the missing can be explain 
by the other observed columns using a logistic regression 
+# and AUc instead. In this way we differentiate between MAR and MNAR
+# In the end, it returns a code per column indicating the type of missingness
+
+m_detectMissingType = function(Matrix[Double] X, Double alpha = 0.05, Double 
auc_thresh = 0.70) 
+return (Matrix[Double] DetectedCodes) {
+    
+    p = ncol(X);
+    DetectedCodes = matrix(0, rows=p, cols=1);
+    min_rows = 20;
+
+    print("Col\tMissing\tMin_PVal\tAUC\tPredizione");
+    print("---------------------------------------------------------");
+
+    for (i in 1:p) {
+        target_col = X[, i];
+        missing_count = sum(is.na(target_col));
+
+        detected_code = 0;
+        label = "NONE";
+        best_pval = 1.0;
+        auc_val = 0.5;
+        
+        if (missing_count > 0) {
+            miss_flag = is.na(target_col) + 1;
+            
+            # MCAR check
+            for (j in 1:p) {
+                if (i != j) {
+                    col_j = X[, j];
+                    observed = (is.na(col_j) == 0);
+
+                    if (sum(observed) > min_rows) {
+                        R_v = removeEmpty(target=miss_flag, margin="rows", 
select=observed);
+                        P_v = removeEmpty(target=col_j,     margin="rows", 
select=observed);
+
+                        min_v = min(P_v); 
+                        max_v = max(P_v); 
+                        range_v = max_v - min_v;
+
+                        if (range_v > 0) {
+                            P_bin = bin_into_5(P_v);
+                            F = table(R_v, P_bin);
+                            
+                            if (nrow(F) > 1 & ncol(F) > 1) {
+                                pval = chisq_pval(F);
+                                if (pval < best_pval) { best_pval = pval; }
+                            }
+                        }
+                    }
+                }
+            }
+            
+            if (best_pval > alpha) {
+                label = "MCAR"; 
+                detected_code = 1;
+            } else {
+                # MAR vs MNAR check
+                features = drop_column(X, i);
+                auc_val = mar_auc_score(miss_flag, features);
+                
+                if (auc_val >= auc_thresh) {
+                    label = "MAR "; 
+                    detected_code = 2;
+                } else {
+                    label = "MNAR"; 
+                    detected_code = 3;
+                }
+            }
+        }
+
+        DetectedCodes[i, 1] = detected_code;
+        print(i + "\t" + missing_count + "\t" + toString(best_pval) + "\t" + 
auc_val + "\t" + label);
+    }
+}
+
+
+
+# Evaluate AUC of a logistic regression that tries to explain the missingness 
using other columns
+# Higher values indicate the dependence of the observed variables (probably 
MAR)
+
+mar_auc_score = function(Matrix[Double] miss_flag, Matrix[Double] features) 
return (Double auc_val) {
+    # need to analyze only full rows
+    complete_rows = (rowSums(is.na(features)) == 0);
+    
+    X_clean = removeEmpty(target=features, margin="rows", 
select=complete_rows);
+    R_clean = removeEmpty(target=miss_flag, margin="rows", 
select=complete_rows);
+    
+    # need to check if we have more than 2 rows to compare
+    auc_val = 0.5;
+    if (nrow(R_clean) >= 2 & min(R_clean) != max(R_clean)) {
+        B = multiLogReg(X=X_clean, Y=R_clean, icpt=2, verbose=FALSE);
+        [probs, yhat, acc] = multiLogRegPredict(X=X_clean, B=B, Y=R_clean);
+        auc_val = auc(Y=(R_clean - 1), P=probs[, 2]);
+    }
+}
+
+# Bin a numeric vector into 5 equal-width bins (1..5)
+bin_into_5 = function(Matrix[Double] v)
+return (Matrix[Double] b)
+{
+    vmin = min(v);
+    vmax = max(v);
+    b = floor(((v - vmin) / (vmax - vmin)) * 4.99) + 1;
+}
+
+# Chi-square independence test p-value for a contingency table F
+chisq_pval = function(Matrix[Double] F)
+return (Double p)
+{
+    eps = 1e-10;
+
+    row_s = rowSums(F);
+    col_s = colSums(F);
+    expected = (row_s %*% col_s) / sum(F);
+
+    chi2 = sum((F - expected)^2 / (expected + eps));
+    dfv = (nrow(F) - 1) * (ncol(F) - 1);
+
+    p = 1.0 - cdf(target=chi2, dist="chisq", df=dfv);
+}
+
+# Remove column idx from matrix X (keeps column order)
+drop_column = function(Matrix[Double] X, Integer idx)
+return (Matrix[Double] X_out)
+{
+    p = ncol(X);
+
+    if (idx == 1) {
+        X_out = X[, 2:p];
+    } else if (idx == p) {
+        X_out = X[, 1:(p-1)];
+    } else {
+        X_out = cbind(X[, 1:(idx-1)], X[, (idx+1):p]);
+    }
+}
+
diff --git a/scripts/staging/detectMissingType/testDetectMissingType.dml 
b/scripts/staging/detectMissingType/testDetectMissingType.dml
new file mode 100644
index 0000000000..2d3f7d897a
--- /dev/null
+++ b/scripts/staging/detectMissingType/testDetectMissingType.dml
@@ -0,0 +1,68 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+
+source("src/test/scripts/functions/builtin/create_dataset.dml") as gen;
+
+# four different tests
+for (t in 1:4) {
+    print("=========================================================");
+    
+    n_rows = 5000;
+    test_name = "";
+    
+    if (t == 1) { 
+        test_name = "9 column MIX (Default)";
+        gt_test = matrix("0 0 0 1 1 2 2 3 3", rows=9, cols=1); 
+    } 
+    else if (t == 2) { 
+        test_name = "4 Easy column";
+        gt_test = matrix("0 1 2 3", rows=4, cols=1); 
+    }
+    else if (t == 3) { 
+        test_name = "10 unbalanced columns (MAR)";
+        gt_test = matrix("0 0 2 2 2 2 2 2 2 2", rows=10, cols=1); 
+    }
+    else { 
+        test_name = "small number of rows (N=150)";
+        gt_test = matrix("0 1 2 3", rows=4, cols=1);
+        n_rows = 150; 
+    }
+
+    print("Execute TEST " + t + ": " + test_name);
+
+    # gen dataset
+    [data, truth] = gen::get_dataset(n=n_rows, GroundTruth=gt_test);
+
+    # analyze dataset
+    results = detectMissingType(data, 0.05, 0.60);
+
+    # check the results
+    match_matrix = (results == truth);
+    correct_count = sum(match_matrix);
+    accuracy = (correct_count / nrow(truth)) * 100;
+    
+    print("---------------------------------------------------------");
+    print("Accuracy: " + toString(accuracy) + "%");
+}
+
+print("=========================================================");
+print("Test suite completed");
\ No newline at end of file

Reply via email to