Baunsgaard commented on code in PR #1925:
URL: https://github.com/apache/systemds/pull/1925#discussion_r1363757838
##########
scripts/builtin/imputeByKNN.dml:
##########
@@ -66,103 +64,76 @@ m_imputeByKNN = function(Matrix[Double] X, String
method="dist", Int seed=-1, Do
distance_matrix = dist(filled_matrix)
#Change 0 value so rowIndexMin will ignore that diagonal value
- distance_matrix = replace(target = distance_matrix, pattern = 0,
replacement = 999)
+ distance_matrix = replace(target=distance_matrix, pattern=0,
replacement=999)
#Get the minimum distance row-wise computation
minimum_index = rowIndexMin(distance_matrix)
#Create aligned matrix from minimum index
- aligned = table(minimum_index, seq(1, nrow(X)), odim1 = nrow(X), odim2 =
nrow(X))
+ aligned = table(minimum_index, seq(1, nrow(X)), odim1=nrow(X),
odim2=nrow(X))
#Get the X records that need to be imputed
imputedValue = t(filled_matrix) %*% aligned
-
- #Update the mask value
- masked = t(imputedValue) * masked
+ imputedValue = t(imputedValue)
}
else if(method == "dist_missing") {
#assuming small missing values
- #Split the matrix into containing NaN values (missing records) and not
containing NaN values (M2 records)
- I = (rowSums(is.nan(X))!=0)
- missing = removeEmpty(target=filled_matrix, margin="rows", select=I)
-
- Y = (rowSums(is.nan(X))==0)
- M2 = removeEmpty(target=filled_matrix, margin = "rows", select = Y)
-
- #Calculate the euclidean distance between fully records and missing
records, and then find the min value row wise
- dotM2 = rowSums(M2 * M2) %*% matrix(1, rows = 1, cols = nrow(missing))
- dotMissing = t(rowSums(missing * missing) %*% matrix(1, rows = 1, cols =
nrow(M2)))
- D = sqrt(dotM2 + dotMissing - 2 * (M2 %*% t(missing)))
- minD = rowIndexMin(t(D))
-
- #Get the index location of the missing value
- pos = rowMaxs(is.nan(X))
- missing_indices = seq(1, nrow(pos)) * pos
-
- #Put the replacement value in the missing indices
- I2 = removeEmpty(target=missing_indices, margin="rows")
- R = table(I2,1,minD,odim1 = nrow(X), odim2=1)
-
- #Replace the 0 to avoid error in table()
- R = replace(target = R, pattern = 0, replacement = nrow(X)+1)
-
- #Create aligned matrix from minimum index
- aligned = table(R, seq(1, nrow(X)), odim1 = nrow(X), odim2 = nrow(X))
-
- #Reshape the subset
- reshaped = rbind(M2, matrix(0, rows = nrow(X) - nrow(M2), cols = ncol(X)))
-
- #Get the M2 records that need to be imputed
- imputedValue = t(reshaped) %*% aligned
-
- #Update the mask value
- masked = t(imputedValue) * masked
+ imputedValue = compute_missing_values(X, filled_matrix, seed, 1.0)
}
else if(method == "dist_sample"){
#assuming large missing values
+ imputedValue = compute_missing_values(X, filled_matrix, seed, sample_frac)
+ }
+ else {
+ print("Method is unknown or not yet implemented")
Review Comment:
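Change this to `stop` instead of `print`, so that an unknown method aborts
the script with an error; otherwise execution continues past this branch and
`imputedValue` is used below without ever having been assigned.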
##########
scripts/builtin/imputeByKNN.dml:
##########
@@ -66,103 +64,76 @@ m_imputeByKNN = function(Matrix[Double] X, String
method="dist", Int seed=-1, Do
distance_matrix = dist(filled_matrix)
#Change 0 value so rowIndexMin will ignore that diagonal value
- distance_matrix = replace(target = distance_matrix, pattern = 0,
replacement = 999)
+ distance_matrix = replace(target=distance_matrix, pattern=0,
replacement=999)
#Get the minimum distance row-wise computation
minimum_index = rowIndexMin(distance_matrix)
#Create aligned matrix from minimum index
- aligned = table(minimum_index, seq(1, nrow(X)), odim1 = nrow(X), odim2 =
nrow(X))
+ aligned = table(minimum_index, seq(1, nrow(X)), odim1=nrow(X),
odim2=nrow(X))
#Get the X records that need to be imputed
imputedValue = t(filled_matrix) %*% aligned
-
- #Update the mask value
- masked = t(imputedValue) * masked
+ imputedValue = t(imputedValue)
}
else if(method == "dist_missing") {
#assuming small missing values
- #Split the matrix into containing NaN values (missing records) and not
containing NaN values (M2 records)
- I = (rowSums(is.nan(X))!=0)
- missing = removeEmpty(target=filled_matrix, margin="rows", select=I)
-
- Y = (rowSums(is.nan(X))==0)
- M2 = removeEmpty(target=filled_matrix, margin = "rows", select = Y)
-
- #Calculate the euclidean distance between fully records and missing
records, and then find the min value row wise
- dotM2 = rowSums(M2 * M2) %*% matrix(1, rows = 1, cols = nrow(missing))
- dotMissing = t(rowSums(missing * missing) %*% matrix(1, rows = 1, cols =
nrow(M2)))
- D = sqrt(dotM2 + dotMissing - 2 * (M2 %*% t(missing)))
- minD = rowIndexMin(t(D))
-
- #Get the index location of the missing value
- pos = rowMaxs(is.nan(X))
- missing_indices = seq(1, nrow(pos)) * pos
-
- #Put the replacement value in the missing indices
- I2 = removeEmpty(target=missing_indices, margin="rows")
- R = table(I2,1,minD,odim1 = nrow(X), odim2=1)
-
- #Replace the 0 to avoid error in table()
- R = replace(target = R, pattern = 0, replacement = nrow(X)+1)
-
- #Create aligned matrix from minimum index
- aligned = table(R, seq(1, nrow(X)), odim1 = nrow(X), odim2 = nrow(X))
-
- #Reshape the subset
- reshaped = rbind(M2, matrix(0, rows = nrow(X) - nrow(M2), cols = ncol(X)))
-
- #Get the M2 records that need to be imputed
- imputedValue = t(reshaped) %*% aligned
-
- #Update the mask value
- masked = t(imputedValue) * masked
+ imputedValue = compute_missing_values(X, filled_matrix, seed, 1.0)
}
else if(method == "dist_sample"){
#assuming large missing values
+ imputedValue = compute_missing_values(X, filled_matrix, seed, sample_frac)
+ }
+ else {
+ print("Method is unknown or not yet implemented")
+ }
+
+ #Impute the value
+ result = replace(target=X, pattern=NaN, replacement=0)
+ result = result + (imputedValue * is.nan(X))
+}
+
+compute_missing_values = function (Matrix[Double] X, Matrix[Double]
filled_matrix, Int seed, Double sample_frac)
+return (Matrix[Double] imputedValue)
Review Comment:
Indent the continuation lines of a function definition (the wrapped
parameter lines and the `return` line) with 4 spaces,
just to keep it consistent with the other builtins.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]