Re: [PR] [SYSTEMDS-3153] Missing value imputation using KNN [systemds]

via GitHub Wed, 18 Oct 2023 05:15:24 -0700


Baunsgaard commented on code in PR #1925:
URL: https://github.com/apache/systemds/pull/1925#discussion_r1363757838



##########
scripts/builtin/imputeByKNN.dml:
##########
@@ -66,103 +64,76 @@ m_imputeByKNN = function(Matrix[Double] X, String 
method="dist", Int seed=-1, Do
     distance_matrix = dist(filled_matrix)
 
     #Change 0 value so rowIndexMin will ignore that diagonal value
-    distance_matrix = replace(target = distance_matrix, pattern = 0, 
replacement = 999)
+    distance_matrix = replace(target=distance_matrix, pattern=0, 
replacement=999)
 
     #Get the minimum distance row-wise computation
     minimum_index = rowIndexMin(distance_matrix)
 
     #Create aligned matrix from minimum index
-    aligned = table(minimum_index, seq(1, nrow(X)), odim1 = nrow(X), odim2 = 
nrow(X))
+    aligned = table(minimum_index, seq(1, nrow(X)), odim1=nrow(X), 
odim2=nrow(X))
 
     #Get the X records that need to be imputed
     imputedValue = t(filled_matrix) %*% aligned
-
-    #Update the mask value
-    masked = t(imputedValue) * masked
+    imputedValue = t(imputedValue)
   }
   else if(method == "dist_missing") {
     #assuming small missing values
-    #Split the matrix into containing NaN values (missing records) and not 
containing NaN values (M2 records)
-    I = (rowSums(is.nan(X))!=0)
-    missing = removeEmpty(target=filled_matrix, margin="rows", select=I)
-
-    Y = (rowSums(is.nan(X))==0)
-    M2 = removeEmpty(target=filled_matrix, margin = "rows", select = Y)
-
-    #Calculate the euclidean distance between fully records and missing 
records, and then find the min value row wise
-    dotM2 = rowSums(M2 * M2) %*% matrix(1, rows = 1, cols = nrow(missing))
-    dotMissing = t(rowSums(missing * missing) %*% matrix(1, rows = 1, cols = 
nrow(M2)))
-    D = sqrt(dotM2 + dotMissing - 2 * (M2 %*% t(missing)))
-    minD = rowIndexMin(t(D))
-
-    #Get the index location of the missing value
-    pos = rowMaxs(is.nan(X))
-    missing_indices = seq(1, nrow(pos)) * pos
-
-    #Put the replacement value in the missing indices
-    I2 = removeEmpty(target=missing_indices, margin="rows")
-    R = table(I2,1,minD,odim1 = nrow(X), odim2=1)
-
-    #Replace the 0 to avoid error in table()
-    R = replace(target = R, pattern = 0, replacement = nrow(X)+1)
-
-    #Create aligned matrix from minimum index
-    aligned = table(R, seq(1, nrow(X)), odim1 = nrow(X), odim2 = nrow(X))
-
-    #Reshape the subset
-    reshaped = rbind(M2, matrix(0, rows = nrow(X) - nrow(M2), cols = ncol(X)))
-
-    #Get the M2 records that need to be imputed
-    imputedValue = t(reshaped) %*% aligned
-
-    #Update the mask value
-    masked = t(imputedValue) * masked
+    imputedValue = compute_missing_values(X, filled_matrix, seed, 1.0)
   }
   else if(method == "dist_sample"){
     #assuming large missing values
+    imputedValue = compute_missing_values(X, filled_matrix, seed, sample_frac)
+  }
+  else {
+    print("Method is unknown or not yet implemented")

Review Comment:
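   Change this to `stop()` instead of `print()`, so that an unknown method
   aborts the script rather than continuing with `imputedValue` undefined.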



##########
scripts/builtin/imputeByKNN.dml:
##########
@@ -66,103 +64,76 @@ m_imputeByKNN = function(Matrix[Double] X, String 
method="dist", Int seed=-1, Do
     distance_matrix = dist(filled_matrix)
 
     #Change 0 value so rowIndexMin will ignore that diagonal value
-    distance_matrix = replace(target = distance_matrix, pattern = 0, 
replacement = 999)
+    distance_matrix = replace(target=distance_matrix, pattern=0, 
replacement=999)
 
     #Get the minimum distance row-wise computation
     minimum_index = rowIndexMin(distance_matrix)
 
     #Create aligned matrix from minimum index
-    aligned = table(minimum_index, seq(1, nrow(X)), odim1 = nrow(X), odim2 = 
nrow(X))
+    aligned = table(minimum_index, seq(1, nrow(X)), odim1=nrow(X), 
odim2=nrow(X))
 
     #Get the X records that need to be imputed
     imputedValue = t(filled_matrix) %*% aligned
-
-    #Update the mask value
-    masked = t(imputedValue) * masked
+    imputedValue = t(imputedValue)
   }
   else if(method == "dist_missing") {
     #assuming small missing values
-    #Split the matrix into containing NaN values (missing records) and not 
containing NaN values (M2 records)
-    I = (rowSums(is.nan(X))!=0)
-    missing = removeEmpty(target=filled_matrix, margin="rows", select=I)
-
-    Y = (rowSums(is.nan(X))==0)
-    M2 = removeEmpty(target=filled_matrix, margin = "rows", select = Y)
-
-    #Calculate the euclidean distance between fully records and missing 
records, and then find the min value row wise
-    dotM2 = rowSums(M2 * M2) %*% matrix(1, rows = 1, cols = nrow(missing))
-    dotMissing = t(rowSums(missing * missing) %*% matrix(1, rows = 1, cols = 
nrow(M2)))
-    D = sqrt(dotM2 + dotMissing - 2 * (M2 %*% t(missing)))
-    minD = rowIndexMin(t(D))
-
-    #Get the index location of the missing value
-    pos = rowMaxs(is.nan(X))
-    missing_indices = seq(1, nrow(pos)) * pos
-
-    #Put the replacement value in the missing indices
-    I2 = removeEmpty(target=missing_indices, margin="rows")
-    R = table(I2,1,minD,odim1 = nrow(X), odim2=1)
-
-    #Replace the 0 to avoid error in table()
-    R = replace(target = R, pattern = 0, replacement = nrow(X)+1)
-
-    #Create aligned matrix from minimum index
-    aligned = table(R, seq(1, nrow(X)), odim1 = nrow(X), odim2 = nrow(X))
-
-    #Reshape the subset
-    reshaped = rbind(M2, matrix(0, rows = nrow(X) - nrow(M2), cols = ncol(X)))
-
-    #Get the M2 records that need to be imputed
-    imputedValue = t(reshaped) %*% aligned
-
-    #Update the mask value
-    masked = t(imputedValue) * masked
+    imputedValue = compute_missing_values(X, filled_matrix, seed, 1.0)
   }
   else if(method == "dist_sample"){
     #assuming large missing values
+    imputedValue = compute_missing_values(X, filled_matrix, seed, sample_frac)
+  }
+  else {
+    print("Method is unknown or not yet implemented")
+  }
+
+  #Impute the value
+  result = replace(target=X, pattern=NaN, replacement=0)
+  result = result + (imputedValue * is.nan(X))
+}
+
+compute_missing_values = function (Matrix[Double] X, Matrix[Double] 
filled_matrix, Int seed, Double sample_frac)
+return (Matrix[Double] imputedValue)

Review Comment:
   Indent the `return` line, and any wrapped continuation lines, of function
   definitions with 4 spaces
   (just to keep it consistent with the other builtins).



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: dev-unsubscr...@systemds.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

Re: [PR] [SYSTEMDS-3153] Missing value imputation using KNN [systemds]

Reply via email to