[systemds] branch main updated: [SYSTEMDS-3153] Improved MV imputation by KNN (different methods)

mboehm7 Wed, 30 Aug 2023 12:05:59 -0700

This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git



The following commit(s) were added to refs/heads/main by this push:
     new 73555e932a [SYSTEMDS-3153] Improved MV imputation by KNN (different 
methods)
73555e932a is described below

commit 73555e932a516063c860f5d05c84e6523cc7619b
Author: regaleo605 <[email protected]>
AuthorDate: Wed Aug 30 20:25:59 2023 +0200

    [SYSTEMDS-3153] Improved MV imputation by KNN (different methods)
    
    LDE project SoSe'23, part 2
    Closes #1888.
---
 scripts/builtin/imputeByKNN.dml                    | 132 +++++++++------------
 src/test/scripts/functions/builtin/imputeByKNN.dml |  10 +-
 2 files changed, 61 insertions(+), 81 deletions(-)

diff --git a/scripts/builtin/imputeByKNN.dml b/scripts/builtin/imputeByKNN.dml
index b560c94799..240631be47 100644
--- a/scripts/builtin/imputeByKNN.dml
+++ b/scripts/builtin/imputeByKNN.dml
@@ -29,19 +29,20 @@
 # 
------------------------------------------------------------------------------
 # INPUT:
 # 
------------------------------------------------------------------------------
-# X          Matrix with missing values, which are represented as NaNs
-# method     Method used for imputing missing values with different performance
-#            and accuracy tradeoffs:
-#            'dist' (default): Compute all-pairs distances and impute the
-#                              missing values by closest. O(N^2 * #features)
-#            'dist_missing':   Compute distances between data and records with
-#                              missing values. O(N*M * #features), assuming
-#                              that the number of records with MV is M<<N.
-#            'dist_sample':    Compute distances between sample of data and
-#                              records with missing values. O(S*M * #features)
-#                              with M<<N and S<<N, but suboptimal imputation.
-# seed       Root seed value for random/sample calls for deterministic behavior
-#            -1 for true randomization
+# X           Matrix with missing values, which are represented as NaNs
+# method      Method used for imputing missing values with different 
performance
+#             and accuracy tradeoffs:
+#             'dist' (default): Compute all-pairs distances and impute the
+#                               missing values by closest. O(N^2 * #features)
+#             'dist_missing':   Compute distances between data and records with
+#                               missing values. O(N*M * #features), assuming
+#                               that the number of records with MV is M<<N.
+#             'dist_sample':    Compute distances between sample of data and
+#                               records with missing values. O(S*M * #features)
+#                               with M<<N and S<<N, but suboptimal imputation.
+# seed        Root seed value for random/sample calls for deterministic 
behavior
+#             -1 for true randomization
+# sample_frac Sample fraction for 'dist_sample' (value between 0 and 1)
 # 
------------------------------------------------------------------------------
 #
 # OUTPUT:
@@ -49,20 +50,14 @@
 # result     Imputed dataset
 # 
------------------------------------------------------------------------------
 
-m_imputeByKNN = function(Matrix[Double] X, String method="dist", Int seed=-1)
+m_imputeByKNN = function(Matrix[Double] X, String method="dist", Int seed=-1, 
Double sample_frac = 0.1)
   return(Matrix[Double] result)
 {
-  #TODO fix seed handling (only root seed)
-  #TODO fix imputation for all columns with missing values
-
   #KNN-Imputation Script
 
   #Create a mask for placeholder and to check for missing values
   masked = is.nan(X)
 
-  #Find the column containing multiple missing values
-  missing_col = rowIndexMax(colSums(is.nan(X)))
-
   #Impute NaN value with temporary mean value of the column
   filled_matrix = imputeByMean(X, matrix(0, cols = ncol(X), rows = 1))
 
@@ -76,30 +71,14 @@ m_imputeByKNN = function(Matrix[Double] X, String 
method="dist", Int seed=-1)
     #Get the minimum distance row-wise computation
     minimum_index = rowIndexMin(distance_matrix)
 
-    #Position of missing values in per row in which column
-    position = rowSums(is.nan(X))
-    position = position * minimum_index
-
-    #Filter the 0 out
-    I = (rowSums(is.nan(X))!=0)
-    missing = removeEmpty(target=position, margin="rows", select=I)
-
-    #Convert the value indices into 0/1 matrix to find location
-    indices = table(missing, 
seq(1,nrow(filled_matrix)),odim1=nrow(filled_matrix),odim2=nrow(missing))
-
-    #Replace the index with value
-    imputedValue = t(indices) %*% filled_matrix[,as.scalar(missing_col)]
-
-    #Get the index location of the missing value
-    pos = rowSums(is.nan(X))
-    missing_indices = seq(1, nrow(pos)) * pos
+    #Create aligned matrix from minimum index
+    aligned = table(minimum_index, seq(1, nrow(X)), odim1 = nrow(X), odim2 = 
nrow(X))
 
-    #Put the replacement value in the missing indices
-    I2 = removeEmpty(target=missing_indices, margin="rows")
-    R = table(I2,1,imputedValue,odim1 = nrow(X), odim2=1)
+    #Get the X records that need to be imputed
+    imputedValue = t(filled_matrix) %*% aligned
 
-    #Replace the masked column with to be imputed Value
-    masked[,as.scalar(missing_col)] = masked[,as.scalar(missing_col)] * R
+    #Update the mask value
+    masked = t(imputedValue) * masked
   }
   else if(method == "dist_missing") {
     #assuming small missing values
@@ -116,41 +95,38 @@ m_imputeByKNN = function(Matrix[Double] X, String 
method="dist", Int seed=-1)
     D = sqrt(dotM2 + dotMissing - 2 * (M2 %*% t(missing)))
     minD = rowIndexMin(t(D))
 
-    #Convert the value indices into 0/1 matrix to find location
-    indices = table(minD, seq(1,nrow(M2)),odim1=nrow(M2),odim2=nrow(minD))
-
-    #Replace the value
-    imputedValue = t(indices) %*% M2[,as.scalar(missing_col)]
-
     #Get the index location of the missing value
-    pos = rowSums(is.nan(X))
+    pos = rowMaxs(is.nan(X))
     missing_indices = seq(1, nrow(pos)) * pos
 
     #Put the replacement value in the missing indices
     I2 = removeEmpty(target=missing_indices, margin="rows")
-    R = table(I2,1,imputedValue,odim1 = nrow(X), odim2=1)
+    R = table(I2,1,minD,odim1 = nrow(X), odim2=1)
+
+    #Replace the 0 to avoid error in table()
+    R = replace(target = R, pattern = 0, replacement = nrow(X)+1)
 
-    #Update the masked value
-    masked[,as.scalar(missing_col)] = masked[,as.scalar(missing_col)] * R
+    #Create aligned matrix from minimum index
+    aligned = table(R, seq(1, nrow(X)), odim1 = nrow(X), odim2 = nrow(X))
+
+    #Reshape the subset
+    reshaped = rbind(M2, matrix(0, rows = nrow(X) - nrow(M2), cols = ncol(X)))
+
+    #Get the M2 records that need to be imputed
+    imputedValue = t(reshaped) %*% aligned
+
+    #Update the mask value
+    masked = t(imputedValue) * masked
   }
   else if(method == "dist_sample"){
     #assuming large missing values
     #Split the matrix into containing NaN values (missing records) and not 
containing NaN values (M2 records)
-    I = (rowSums(is.nan(X))!=0)
+    I = rowSums(is.nan(X)) != 0
     missing = removeEmpty(target=filled_matrix, margin="rows", select=I)
 
-    Y = (rowSums(is.nan(X))==0)
-    M3 = removeEmpty(target=filled_matrix, margin = "rows", select = Y)
-
-    #Create a random subset
-    random_matrix = ceiling(rand(rows = nrow(M3), cols = 1, min = 0, max = 1, 
sparsity = 0.5, seed = seed))
-
-    #ensure that random_matrix has at least 1 value
-    if(as.scalar(colSums(random_matrix)) < 1)
-      random_matrix = matrix(1, rows = nrow(M3), cols = 1)
-
-    subset = M3 * random_matrix
-    subset = removeEmpty(target=subset, margin = "rows", select = 
random_matrix)
+    #Create permutation matrix for sampling sample_frac*nrow(X) rows
+    I = rand(rows=nrow(X), cols=1, seed=seed) <= sample_frac;
+    subset = removeEmpty(target=filled_matrix, margin="rows", select=I);
 
     #Calculate the euclidean distance between fully records and missing 
records, and then find the min value row wise
     dotSubset = rowSums(subset * subset) %*% matrix(1, rows = 1, cols = 
nrow(missing))
@@ -158,22 +134,28 @@ m_imputeByKNN = function(Matrix[Double] X, String 
method="dist", Int seed=-1)
     D = sqrt(dotSubset + dotMissing - 2 * (subset %*% t(missing)))
     minD = rowIndexMin(t(D))
 
-    #Convert the value indices into 0/1 matrix to find location
-    indices = table(minD, 
seq(1,nrow(subset)),odim1=nrow(subset),odim2=nrow(minD))
-
-    #Replace the value
-    imputedValue = t(indices) %*% subset[,as.scalar(missing_col)]
-
     #Get the index location of the missing value
-    pos = rowSums(is.nan(X))
+    pos = rowMaxs(is.nan(X))
     missing_indices = seq(1, nrow(pos)) * pos
 
     #Put the replacement value in the missing indices
     I2 = removeEmpty(target=missing_indices, margin="rows")
-    R = table(I2,1,imputedValue,odim1 = nrow(X), odim2=1)
+    R = table(I2,1,minD,odim1 = nrow(X), odim2=1)
+
+    #Replace the 0 to avoid error in table()
+    R = replace(target = R, pattern = 0, replacement = nrow(X)+1)
+
+    #Create aligned matrix from minimum index
+    aligned = table(R, seq(1, nrow(X)), odim1 = nrow(X), odim2 = nrow(X))
+
+    #Reshape the subset
+    reshaped = rbind(subset, matrix(0, rows = nrow(X) - nrow(subset), cols = 
ncol(X)))
+
+    #Get the subset records that need to be imputed
+    imputedValue = t(reshaped) %*% aligned
 
-    #Update the masked value
-    masked[,as.scalar(missing_col)] = masked[,as.scalar(missing_col)] * R
+    #Update the mask value
+    masked = t(imputedValue) * masked
   }
   else {
     print("Method is unknown or not yet implemented")
diff --git a/src/test/scripts/functions/builtin/imputeByKNN.dml 
b/src/test/scripts/functions/builtin/imputeByKNN.dml
index f02ac90eac..299c1dda30 100644
--- a/src/test/scripts/functions/builtin/imputeByKNN.dml
+++ b/src/test/scripts/functions/builtin/imputeByKNN.dml
@@ -22,14 +22,12 @@
 # Prepare the data
 X = read($1, data_type="frame", format="csv", header=TRUE, naStrings= ["20"]);
 X = cbind(as.matrix(X[,4:5]), as.matrix(X[,7]))
-remove_col = is.nan(X)
 
-data = removeEmpty(target = X, margin = "rows", select = (remove_col[,1] != 1))
-mask = is.nan(data)
+mask = is.nan(X)
 
 #Perform the KNN imputation
-result = imputeByKNN(X = data, method = $2)
-result2 = imputeByKNN(X = data, method = $3)
+result = imputeByKNN(X = X, method = $2)
+result2 = imputeByKNN(X = X, method = $3)
 
 #Get the imputed value
 I = (mask[,2] == 1);
@@ -41,4 +39,4 @@ value = colSums(value[,2])
 value2 = colSums(value2[,2])
 
 write(value, $4)
-write(value2, $5)
+write(value2, $5)
\ No newline at end of file

[systemds] branch main updated: [SYSTEMDS-3153] Improved MV imputation by KNN (different methods)

Reply via email to