[systemds] branch master updated: [MINOR] Added support for categorical features in SMOTE

ssiddiqi Wed, 17 Mar 2021 15:34:44 -0700

This is an automated email from the ASF dual-hosted git repository.

ssiddiqi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git



The following commit(s) were added to refs/heads/master by this push:
     new 797ab88  [MINOR] Added support for categorical features in SMOTE
797ab88 is described below

commit 797ab881507ad2389aa947430411e04256fc1801
Author: Shafaq Siddiqi <[email protected]>
AuthorDate: Wed Mar 17 22:48:11 2021 +0100

    [MINOR] Added support for categorical features in SMOTE
---
 scripts/builtin/smote.dml                          | 56 ++++++++++++++++++----
 .../builtin/BuiltinGaussianClassifierTest.java     |  3 +-
 .../test/functions/builtin/BuiltinSmoteTest.java   | 41 +++++++++++-----
 src/test/scripts/functions/builtin/smote.dml       |  3 +-
 4 files changed, 82 insertions(+), 21 deletions(-)

diff --git a/scripts/builtin/smote.dml b/scripts/builtin/smote.dml
index 1a416ea..c6fc751 100644
--- a/scripts/builtin/smote.dml
+++ b/scripts/builtin/smote.dml
@@ -21,12 +21,14 @@
 
 
 # Builtin function for handing class imbalance using Synthetic Minority 
Over-sampling Technique (SMOTE)
+# by Nitesh V. Chawla et al., Journal of Artificial Intelligence Research 
16 (2002), 321–357
 #
 # INPUT PARAMETERS:
 # 
---------------------------------------------------------------------------------------------
 # NAME            TYPE    DEFAULT     MEANING
 # 
---------------------------------------------------------------------------------------------
 # X               Double   ---       Matrix of minority class samples 
+# mask             Double   ---       0/1 mask vector where 0 represents a 
numeric column and 1 represents a categorical column
 # s               Integer   25       Amount of SMOTE (percentage of 
oversampling), integral multiple of 100
 # k               Integer   1        Number of nearest neighbour
 # 
---------------------------------------------------------------------------------------------
@@ -38,7 +40,7 @@
 # 
---------------------------------------------------------------------------------------------
 # Y               Double   ---       Matrix of (N/100)-1 * nrow(X) synthetic 
minority class samples 
 
-m_smote = function(Matrix[Double] X, Integer s = 200, Integer k = 1, Boolean 
verbose = FALSE) 
+m_smote = function(Matrix[Double] X, Matrix[Double] mask, Integer s = 200, 
Integer k = 1, Boolean verbose = FALSE) 
 return (Matrix[Double] Y) {
 
   if(s < 100 | (s%%100) != 0)
@@ -46,18 +48,19 @@ return (Matrix[Double] Y) {
     print("the number of samples should be an integral multiple of 100. 
Setting s = 100")
     s = 100
   }
-  
   if(k < 1) {
     print("k should not be less than 1. Setting k value to default k = 1.")
     k = 1
   }
+  if(ncol(mask) != ncol(X))
+    stop("column mismatch: no. of columns in mask vector should be equal to 
no. of columns in data matrix")
   
   # matrix to keep the index of KNN for each minority sample
   knn_index = matrix(0,k,nrow(X))
   # find nearest neighbour
   for(i in 1:nrow(X))
   {
-    knn = nn(X, X[i, ], k)
+    knn = nn(X, X[i, ], mask, k)
     knn_index[, i] = knn
   }
   
@@ -79,13 +82,28 @@ return (Matrix[Double] Y) {
     # pick the random NN
     knn_sample = knn_index[as.scalar(rand_index[iter+1]),] 
     # generate sample    
-    for(i in 1:ncol(knn_index))
-    {
+    for(i in 1:ncol(knn_index)) {
       index = as.scalar(knn_sample[1,i])
+
       X_diff = X[index,] - X[i, ]
       gap = as.scalar(Rand(rows=1, cols=1, min=0, max=1, seed = 42))
+      # generate synthetic sample
       X_sys = X[i, ] + (gap*X_diff)
+      # for nominal features replace their value with majority voting
+      if(sum(mask) > 0) {
+        categorical = X_sys * mask
+        # get all nn values
+        computation_matrix = table(knn_index[,i], knn_index[, i], nrow(X), 
nrow(X))
+        nn_X = computation_matrix %*% X
+        nn_X = removeEmpty(target=nn_X, margin = "rows")
+        nn_X = nn_X * mask
+        freq = getFrequentValue(nn_X)
+        categorical = (categorical > 0) * freq
+        X_sys = X_sys * (mask == 0)
+        X_sys = X_sys + categorical
+      }
       synthetic_samples[iter*ncol(knn_index)+i,] = X_sys;
+
     }
     iter = iter + 1
   }
@@ -97,19 +115,41 @@ return (Matrix[Double] Y) {
 
 }
   
+# as described in the paper, for categorical columns compute the difference by 
replacing the 
+# differing categorical values with the median of the standard deviations of the numerical columns
 
-
-nn = function(Matrix[Double] X, Matrix[Double] instance, Integer k )
+nn = function(Matrix[Double] X, Matrix[Double] instance, Matrix[Double] mask, 
Integer k )
 return (Matrix[Double] knn_)
 {
   if(nrow(X) < k)
     stop("can not pick "+k+" nearest neighbours from "+nrow(X)+" total 
instances")
 
-  # compute the euclidean distance
   diff = X - instance
+  diff_nominal  = diff * mask
+  if(sum(diff_nominal) != 0) {
+    only_number = removeEmpty(target=X, margin="cols", select=(mask==0))
+    num_std = colSds(only_number)
+    num_std_median = median(t(num_std))
+    diff_nominal = (diff_nominal != 0)
+    diff_nominal = diff_nominal * num_std_median 
+    diff = diff_nominal + (diff * (mask==0))  
+  }
   square_diff = diff^2
   distance = sqrt(rowSums(square_diff))
   sort_dist = order(target = distance, by = 1, decreasing= FALSE, index.return 
=  TRUE)
   knn_ = sort_dist[2:k+1,]
 }
 
+getFrequentValue = function(Matrix[Double] X)
+return (Matrix[Double] freq)
+{
+  freq = matrix(0, rows=1, cols=ncol(X))
+  for(i in 1:ncol(X))
+  {
+    if(sum(X[, i]) != 0) {
+      cat_counts = table(X[, i], 1, nrow(X), 1);  # counts for each category
+      freq[1,i] = as.scalar(rowIndexMax(t(cat_counts))) # mode
+    }
+  }
+}
+
diff --git 
a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinGaussianClassifierTest.java
 
b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinGaussianClassifierTest.java
index 38ac980..c1a277b 100644
--- 
a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinGaussianClassifierTest.java
+++ 
b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinGaussianClassifierTest.java
@@ -85,6 +85,7 @@ public class BuiltinGaussianClassifierTest extends 
AutomatedTestBase
 
        public void testGaussianClassifier(int rows, int cols, double sparsity, 
int classes)
        {
+               setOutputBuffering(true);
                loadTestConfiguration(getTestConfiguration(TEST_NAME));
                String HOME = SCRIPT_DIR + TEST_DIR;
                fullDMLScriptName = HOME + TEST_NAME + ".dml";
@@ -136,7 +137,7 @@ public class BuiltinGaussianClassifierTest extends 
AutomatedTestBase
                double[][] invcovsSYSTEMDS = 
TestUtils.convertHashMapToDoubleArray(invcovsSYSTEMDStemp);
 
                TestUtils.compareMatrices(priorR, priorSYSTEMDS, Math.pow(10, 
-5.0), "priorR", "priorSYSTEMDS");
-               TestUtils.compareMatricesBitAvgDistance(meansR, meansSYSTEMDS, 
5L,5L, this.toString());
+               TestUtils.compareMatricesBitAvgDistance(meansR, meansSYSTEMDS, 
10L,10L, this.toString());
                TestUtils.compareMatricesBitAvgDistance(determinantsR, 
determinantsSYSTEMDS, (long)2E+12,(long)2E+12, this.toString());
                TestUtils.compareMatricesBitAvgDistance(invcovsR, 
invcovsSYSTEMDS, (long)2E+20,(long)2E+20, this.toString());
        }
diff --git 
a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinSmoteTest.java 
b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinSmoteTest.java
index 0c1fd77..c750719 100644
--- 
a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinSmoteTest.java
+++ 
b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinSmoteTest.java
@@ -49,29 +49,45 @@ public class BuiltinSmoteTest extends AutomatedTestBase {
 
        @Test
        public void testSmote0CP() {
-               runSmoteTest(100, 1, LopProperties.ExecType.CP);
+               double[][] mask =  new 
double[][]{{1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}};
+               runSmoteTest(100, 3, mask, LopProperties.ExecType.CP);
        }
 
        @Test
        public void testSmote1CP() {
-               runSmoteTest(300, 10, LopProperties.ExecType.CP);
+               double[][] mask =  new 
double[][]{{1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1}};
+               runSmoteTest(300, 10, mask, LopProperties.ExecType.CP);
        }
 
        @Test
        public void testSmote2CP() {
-               runSmoteTest(400, 5, LopProperties.ExecType.CP);
+               double[][] mask =  new 
double[][]{{1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}};
+               runSmoteTest(400, 5, mask, LopProperties.ExecType.CP);
        }
 
        @Test
-       public void testSmote1Spark() {
-               runSmoteTest(300, 3, LopProperties.ExecType.SPARK);
+       public void testSmote3CP() {
+               double[][] mask =  new 
double[][]{{1,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0}};
+               runSmoteTest(300, 3, mask, LopProperties.ExecType.CP);
        }
 
        @Test
-       public void testSmote2Spark() { runSmoteTest(400, 5, 
LopProperties.ExecType.SPARK);     }
+       public void testSmote4CP() {
+               double[][] mask =  new 
double[][]{{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}};
+               runSmoteTest(400, 5, mask, LopProperties.ExecType.CP);  }
 
+       public void testSmote3Spark() {
+               double[][] mask =  new 
double[][]{{1,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0}};
+               runSmoteTest(300, 3, mask, LopProperties.ExecType.SPARK);
+       }
+
+       @Test
+       public void testSmote4Spark() {
+               double[][] mask =  new 
double[][]{{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}};
+               runSmoteTest(400, 5, mask, LopProperties.ExecType.SPARK);       
}
+               
 
-       private void runSmoteTest(int sample, int nn, LopProperties.ExecType 
instType) {
+       private void runSmoteTest(int sample, int nn, double[][] mask, 
LopProperties.ExecType instType) {
                Types.ExecMode platformOld = setExecMode(instType);
 
                boolean oldFlag = OptimizerUtils.ALLOW_ALGEBRAIC_SIMPLIFICATION;
@@ -81,13 +97,16 @@ public class BuiltinSmoteTest extends AutomatedTestBase {
                        loadTestConfiguration(getTestConfiguration(TEST_NAME));
                        String HOME = SCRIPT_DIR + TEST_DIR;
                        fullDMLScriptName = HOME + TEST_NAME + ".dml";
-                       programArgs = new String[] {"-nvargs", "X=" + 
input("X"), "S=" + sample, "K=" + nn , "Z="+output("Sum"), "T="+input("T")};
-
-                       double[][] X = getRandomMatrix(rows, colsX, 0, 1, 0.3, 
1);
+                       programArgs = new String[] {"-nvargs", "X=" + 
input("X"), "S=" + sample, "M="+input("M"),
+                               "K=" + nn , "Z="+output("Sum"), 
"T="+input("T")};
 
+                       double[][] X = getRandomMatrix(rows, colsX, 1, 10, 1, 
1);
+                       X = TestUtils.round(X);
                        writeInputMatrixWithMTD("X", X, true);
+                       writeInputMatrixWithMTD("M", mask, true);
 
-                       double[][] T = getRandomMatrix(rows, colsX, 2, 3.0, 
0.3, 3);
+                       double[][] T = getRandomMatrix(rows, colsX, 20, 30, 1, 
3);
+                       T = TestUtils.round(T);
 
                        writeInputMatrixWithMTD("T", T, true);
 
diff --git a/src/test/scripts/functions/builtin/smote.dml 
b/src/test/scripts/functions/builtin/smote.dml
index 5a8d5d6..8385f1b 100644
--- a/src/test/scripts/functions/builtin/smote.dml
+++ b/src/test/scripts/functions/builtin/smote.dml
@@ -21,7 +21,8 @@
 
 
 A = read($X);
-B = smote(X = A, s=$S, k=$K, verbose=TRUE);
+M = read($M)
+B = smote(X = A, mask=M, s=$S, k=$K, verbose=TRUE);
 
 # test if all point fall in same cluster (closed to each other)
 # read some new data T != A

[systemds] branch master updated: [MINOR] Added support for categorical features in SMOTE

Reply via email to