Baunsgaard commented on code in PR #1925: URL: https://github.com/apache/systemds/pull/1925#discussion_r1363757838
########## scripts/builtin/imputeByKNN.dml: ########## @@ -66,103 +64,76 @@ m_imputeByKNN = function(Matrix[Double] X, String method="dist", Int seed=-1, Do distance_matrix = dist(filled_matrix) #Change 0 value so rowIndexMin will ignore that diagonal value - distance_matrix = replace(target = distance_matrix, pattern = 0, replacement = 999) + distance_matrix = replace(target=distance_matrix, pattern=0, replacement=999) #Get the minimum distance row-wise computation minimum_index = rowIndexMin(distance_matrix) #Create aligned matrix from minimum index - aligned = table(minimum_index, seq(1, nrow(X)), odim1 = nrow(X), odim2 = nrow(X)) + aligned = table(minimum_index, seq(1, nrow(X)), odim1=nrow(X), odim2=nrow(X)) #Get the X records that need to be imputed imputedValue = t(filled_matrix) %*% aligned - - #Update the mask value - masked = t(imputedValue) * masked + imputedValue = t(imputedValue) } else if(method == "dist_missing") { #assuming small missing values - #Split the matrix into containing NaN values (missing records) and not containing NaN values (M2 records) - I = (rowSums(is.nan(X))!=0) - missing = removeEmpty(target=filled_matrix, margin="rows", select=I) - - Y = (rowSums(is.nan(X))==0) - M2 = removeEmpty(target=filled_matrix, margin = "rows", select = Y) - - #Calculate the euclidean distance between fully records and missing records, and then find the min value row wise - dotM2 = rowSums(M2 * M2) %*% matrix(1, rows = 1, cols = nrow(missing)) - dotMissing = t(rowSums(missing * missing) %*% matrix(1, rows = 1, cols = nrow(M2))) - D = sqrt(dotM2 + dotMissing - 2 * (M2 %*% t(missing))) - minD = rowIndexMin(t(D)) - - #Get the index location of the missing value - pos = rowMaxs(is.nan(X)) - missing_indices = seq(1, nrow(pos)) * pos - - #Put the replacement value in the missing indices - I2 = removeEmpty(target=missing_indices, margin="rows") - R = table(I2,1,minD,odim1 = nrow(X), odim2=1) - - #Replace the 0 to avoid error in table() - R = replace(target = R, pattern = 0, replacement = nrow(X)+1) - - #Create aligned matrix from minimum index - aligned = table(R, seq(1, nrow(X)), odim1 = nrow(X), odim2 = nrow(X)) - - #Reshape the subset - reshaped = rbind(M2, matrix(0, rows = nrow(X) - nrow(M2), cols = ncol(X))) - - #Get the M2 records that need to be imputed - imputedValue = t(reshaped) %*% aligned - - #Update the mask value - masked = t(imputedValue) * masked + imputedValue = compute_missing_values(X, filled_matrix, seed, 1.0) } else if(method == "dist_sample"){ #assuming large missing values + imputedValue = compute_missing_values(X, filled_matrix, seed, sample_frac) + } + else { + print("Method is unknown or not yet implemented") Review Comment: change this to stop instead of print ########## scripts/builtin/imputeByKNN.dml: ########## @@ -66,103 +64,76 @@ m_imputeByKNN = function(Matrix[Double] X, String method="dist", Int seed=-1, Do distance_matrix = dist(filled_matrix) #Change 0 value so rowIndexMin will ignore that diagonal value - distance_matrix = replace(target = distance_matrix, pattern = 0, replacement = 999) + distance_matrix = replace(target=distance_matrix, pattern=0, replacement=999) #Get the minimum distance row-wise computation minimum_index = rowIndexMin(distance_matrix) #Create aligned matrix from minimum index - aligned = table(minimum_index, seq(1, nrow(X)), odim1 = nrow(X), odim2 = nrow(X)) + aligned = table(minimum_index, seq(1, nrow(X)), odim1=nrow(X), odim2=nrow(X)) #Get the X records that need to be imputed imputedValue = t(filled_matrix) %*% aligned - - #Update the mask value - masked = t(imputedValue) * masked + imputedValue = t(imputedValue) } else if(method == "dist_missing") { #assuming small missing values - #Split the matrix into containing NaN values (missing records) and not containing NaN values (M2 records) - I = (rowSums(is.nan(X))!=0) - missing = removeEmpty(target=filled_matrix, margin="rows", select=I) - - Y = (rowSums(is.nan(X))==0) - M2 = removeEmpty(target=filled_matrix, margin = "rows", select = Y) - - #Calculate the euclidean distance between fully records and missing records, and then find the min value row wise - dotM2 = rowSums(M2 * M2) %*% matrix(1, rows = 1, cols = nrow(missing)) - dotMissing = t(rowSums(missing * missing) %*% matrix(1, rows = 1, cols = nrow(M2))) - D = sqrt(dotM2 + dotMissing - 2 * (M2 %*% t(missing))) - minD = rowIndexMin(t(D)) - - #Get the index location of the missing value - pos = rowMaxs(is.nan(X)) - missing_indices = seq(1, nrow(pos)) * pos - - #Put the replacement value in the missing indices - I2 = removeEmpty(target=missing_indices, margin="rows") - R = table(I2,1,minD,odim1 = nrow(X), odim2=1) - - #Replace the 0 to avoid error in table() - R = replace(target = R, pattern = 0, replacement = nrow(X)+1) - - #Create aligned matrix from minimum index - aligned = table(R, seq(1, nrow(X)), odim1 = nrow(X), odim2 = nrow(X)) - - #Reshape the subset - reshaped = rbind(M2, matrix(0, rows = nrow(X) - nrow(M2), cols = ncol(X))) - - #Get the M2 records that need to be imputed - imputedValue = t(reshaped) %*% aligned - - #Update the mask value - masked = t(imputedValue) * masked + imputedValue = compute_missing_values(X, filled_matrix, seed, 1.0) } else if(method == "dist_sample"){ #assuming large missing values + imputedValue = compute_missing_values(X, filled_matrix, seed, sample_frac) + } + else { + print("Method is unknown or not yet implemented") + } + + #Impute the value + result = replace(target=X, pattern=NaN, replacement=0) + result = result + (imputedValue * is.nan(X)) +} + +compute_missing_values = function (Matrix[Double] X, Matrix[Double] filled_matrix, Int seed, Double sample_frac) +return (Matrix[Double] imputedValue) Review Comment: indent with 4 spaces on the return or break lines in the function definitions. (just to keep it consistent with other builtins.) -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: dev-unsubscr...@systemds.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org