This is an automated email from the ASF dual-hosted git repository.
mboehm7 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/main by this push:
new 73555e932a [SYSTEMDS-3153] Improved MV imputation by KNN (different
methods)
73555e932a is described below
commit 73555e932a516063c860f5d05c84e6523cc7619b
Author: regaleo605 <[email protected]>
AuthorDate: Wed Aug 30 20:25:59 2023 +0200
[SYSTEMDS-3153] Improved MV imputation by KNN (different methods)
LDE project SoSe'23, part 2
Closes #1888.
---
scripts/builtin/imputeByKNN.dml | 132 +++++++++------------
src/test/scripts/functions/builtin/imputeByKNN.dml | 10 +-
2 files changed, 61 insertions(+), 81 deletions(-)
diff --git a/scripts/builtin/imputeByKNN.dml b/scripts/builtin/imputeByKNN.dml
index b560c94799..240631be47 100644
--- a/scripts/builtin/imputeByKNN.dml
+++ b/scripts/builtin/imputeByKNN.dml
@@ -29,19 +29,20 @@
#
------------------------------------------------------------------------------
# INPUT:
#
------------------------------------------------------------------------------
-# X Matrix with missing values, which are represented as NaNs
-# method Method used for imputing missing values with different performance
-# and accuracy tradeoffs:
-# 'dist' (default): Compute all-pairs distances and impute the
-# missing values by closest. O(N^2 * #features)
-# 'dist_missing': Compute distances between data and records with
-# missing values. O(N*M * #features), assuming
-# that the number of records with MV is M<<N.
-# 'dist_sample': Compute distances between sample of data and
-# records with missing values. O(S*M * #features)
-# with M<<N and S<<N, but suboptimal imputation.
-# seed Root seed value for random/sample calls for deterministic behavior
-# -1 for true randomization
+# X Matrix with missing values, which are represented as NaNs
+# method Method used for imputing missing values with different
performance
+# and accuracy tradeoffs:
+# 'dist' (default): Compute all-pairs distances and impute the
+# missing values by closest. O(N^2 * #features)
+# 'dist_missing': Compute distances between data and records with
+# missing values. O(N*M * #features), assuming
+# that the number of records with MV is M<<N.
+# 'dist_sample': Compute distances between sample of data and
+# records with missing values. O(S*M * #features)
+# with M<<N and S<<N, but suboptimal imputation.
+# seed Root seed value for random/sample calls for deterministic
behavior
+# -1 for true randomization
+# sample_frac Sample fraction for 'dist_sample' (value between 0 and 1)
#
------------------------------------------------------------------------------
#
# OUTPUT:
@@ -49,20 +50,14 @@
# result Imputed dataset
#
------------------------------------------------------------------------------
-m_imputeByKNN = function(Matrix[Double] X, String method="dist", Int seed=-1)
+m_imputeByKNN = function(Matrix[Double] X, String method="dist", Int seed=-1,
Double sample_frac = 0.1)
return(Matrix[Double] result)
{
- #TODO fix seed handling (only root seed)
- #TODO fix imputation for all columns with missing values
-
#KNN-Imputation Script
#Create a mask for placeholder and to check for missing values
masked = is.nan(X)
- #Find the column containing multiple missing values
- missing_col = rowIndexMax(colSums(is.nan(X)))
-
#Impute NaN value with temporary mean value of the column
filled_matrix = imputeByMean(X, matrix(0, cols = ncol(X), rows = 1))
@@ -76,30 +71,14 @@ m_imputeByKNN = function(Matrix[Double] X, String
method="dist", Int seed=-1)
#Get the minimum distance row-wise computation
minimum_index = rowIndexMin(distance_matrix)
- #Position of missing values in per row in which column
- position = rowSums(is.nan(X))
- position = position * minimum_index
-
- #Filter the 0 out
- I = (rowSums(is.nan(X))!=0)
- missing = removeEmpty(target=position, margin="rows", select=I)
-
- #Convert the value indices into 0/1 matrix to find location
- indices = table(missing,
seq(1,nrow(filled_matrix)),odim1=nrow(filled_matrix),odim2=nrow(missing))
-
- #Replace the index with value
- imputedValue = t(indices) %*% filled_matrix[,as.scalar(missing_col)]
-
- #Get the index location of the missing value
- pos = rowSums(is.nan(X))
- missing_indices = seq(1, nrow(pos)) * pos
+ #Create aligned matrix from minimum index
+ aligned = table(minimum_index, seq(1, nrow(X)), odim1 = nrow(X), odim2 =
nrow(X))
- #Put the replacement value in the missing indices
- I2 = removeEmpty(target=missing_indices, margin="rows")
- R = table(I2,1,imputedValue,odim1 = nrow(X), odim2=1)
+ #Get the X records that need to be imputed
+ imputedValue = t(filled_matrix) %*% aligned
- #Replace the masked column with to be imputed Value
- masked[,as.scalar(missing_col)] = masked[,as.scalar(missing_col)] * R
+ #Update the mask value
+ masked = t(imputedValue) * masked
}
else if(method == "dist_missing") {
#assuming small missing values
@@ -116,41 +95,38 @@ m_imputeByKNN = function(Matrix[Double] X, String
method="dist", Int seed=-1)
D = sqrt(dotM2 + dotMissing - 2 * (M2 %*% t(missing)))
minD = rowIndexMin(t(D))
- #Convert the value indices into 0/1 matrix to find location
- indices = table(minD, seq(1,nrow(M2)),odim1=nrow(M2),odim2=nrow(minD))
-
- #Replace the value
- imputedValue = t(indices) %*% M2[,as.scalar(missing_col)]
-
#Get the index location of the missing value
- pos = rowSums(is.nan(X))
+ pos = rowMaxs(is.nan(X))
missing_indices = seq(1, nrow(pos)) * pos
#Put the replacement value in the missing indices
I2 = removeEmpty(target=missing_indices, margin="rows")
- R = table(I2,1,imputedValue,odim1 = nrow(X), odim2=1)
+ R = table(I2,1,minD,odim1 = nrow(X), odim2=1)
+
+ #Replace the 0 to avoid error in table()
+ R = replace(target = R, pattern = 0, replacement = nrow(X)+1)
- #Update the masked value
- masked[,as.scalar(missing_col)] = masked[,as.scalar(missing_col)] * R
+ #Create aligned matrix from minimum index
+ aligned = table(R, seq(1, nrow(X)), odim1 = nrow(X), odim2 = nrow(X))
+
+ #Reshape the subset
+ reshaped = rbind(M2, matrix(0, rows = nrow(X) - nrow(M2), cols = ncol(X)))
+
+ #Get the M2 records that need to be imputed
+ imputedValue = t(reshaped) %*% aligned
+
+ #Update the mask value
+ masked = t(imputedValue) * masked
}
else if(method == "dist_sample"){
#assuming large missing values
#Split the matrix into containing NaN values (missing records) and not
containing NaN values (M2 records)
- I = (rowSums(is.nan(X))!=0)
+ I = rowSums(is.nan(X)) != 0
missing = removeEmpty(target=filled_matrix, margin="rows", select=I)
- Y = (rowSums(is.nan(X))==0)
- M3 = removeEmpty(target=filled_matrix, margin = "rows", select = Y)
-
- #Create a random subset
- random_matrix = ceiling(rand(rows = nrow(M3), cols = 1, min = 0, max = 1,
sparsity = 0.5, seed = seed))
-
- #ensure that random_matrix has at least 1 value
- if(as.scalar(colSums(random_matrix)) < 1)
- random_matrix = matrix(1, rows = nrow(M3), cols = 1)
-
- subset = M3 * random_matrix
- subset = removeEmpty(target=subset, margin = "rows", select =
random_matrix)
+ #Create permutation matrix for sampling sample_frac*nrow(X) rows
+ I = rand(rows=nrow(X), cols=1, seed=seed) <= sample_frac;
+ subset = removeEmpty(target=filled_matrix, margin="rows", select=I);
#Calculate the euclidean distance between fully records and missing
records, and then find the min value row wise
dotSubset = rowSums(subset * subset) %*% matrix(1, rows = 1, cols =
nrow(missing))
@@ -158,22 +134,28 @@ m_imputeByKNN = function(Matrix[Double] X, String
method="dist", Int seed=-1)
D = sqrt(dotSubset + dotMissing - 2 * (subset %*% t(missing)))
minD = rowIndexMin(t(D))
- #Convert the value indices into 0/1 matrix to find location
- indices = table(minD,
seq(1,nrow(subset)),odim1=nrow(subset),odim2=nrow(minD))
-
- #Replace the value
- imputedValue = t(indices) %*% subset[,as.scalar(missing_col)]
-
#Get the index location of the missing value
- pos = rowSums(is.nan(X))
+ pos = rowMaxs(is.nan(X))
missing_indices = seq(1, nrow(pos)) * pos
#Put the replacement value in the missing indices
I2 = removeEmpty(target=missing_indices, margin="rows")
- R = table(I2,1,imputedValue,odim1 = nrow(X), odim2=1)
+ R = table(I2,1,minD,odim1 = nrow(X), odim2=1)
+
+ #Replace the 0 to avoid error in table()
+ R = replace(target = R, pattern = 0, replacement = nrow(X)+1)
+
+ #Create aligned matrix from minimum index
+ aligned = table(R, seq(1, nrow(X)), odim1 = nrow(X), odim2 = nrow(X))
+
+ #Reshape the subset
+ reshaped = rbind(subset, matrix(0, rows = nrow(X) - nrow(subset), cols =
ncol(X)))
+
+ #Get the subset records that need to be imputed
+ imputedValue = t(reshaped) %*% aligned
- #Update the masked value
- masked[,as.scalar(missing_col)] = masked[,as.scalar(missing_col)] * R
+ #Update the mask value
+ masked = t(imputedValue) * masked
}
else {
print("Method is unknown or not yet implemented")
diff --git a/src/test/scripts/functions/builtin/imputeByKNN.dml
b/src/test/scripts/functions/builtin/imputeByKNN.dml
index f02ac90eac..299c1dda30 100644
--- a/src/test/scripts/functions/builtin/imputeByKNN.dml
+++ b/src/test/scripts/functions/builtin/imputeByKNN.dml
@@ -22,14 +22,12 @@
# Prepare the data
X = read($1, data_type="frame", format="csv", header=TRUE, naStrings= ["20"]);
X = cbind(as.matrix(X[,4:5]), as.matrix(X[,7]))
-remove_col = is.nan(X)
-data = removeEmpty(target = X, margin = "rows", select = (remove_col[,1] != 1))
-mask = is.nan(data)
+mask = is.nan(X)
#Perform the KNN imputation
-result = imputeByKNN(X = data, method = $2)
-result2 = imputeByKNN(X = data, method = $3)
+result = imputeByKNN(X = X, method = $2)
+result2 = imputeByKNN(X = X, method = $3)
#Get the imputed value
I = (mask[,2] == 1);
@@ -41,4 +39,4 @@ value = colSums(value[,2])
value2 = colSums(value2[,2])
write(value, $4)
-write(value2, $5)
+write(value2, $5)
\ No newline at end of file