This is an automated email from the ASF dual-hosted git repository.
ssiddiqi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/master by this push:
new 36eaaeb [SYSTEMDS-2659] imputeByFD now accepts the matrix input
36eaaeb is described below
commit 36eaaeb961130471c7d8f19456a7848312ff25b5
Author: Shafaq Siddiqi <[email protected]>
AuthorDate: Tue Sep 1 22:47:36 2020 +0200
[SYSTEMDS-2659] imputeByFD now accepts the matrix input
The initial version of imputeByFD accepted a frame input, internally recoded
the frame, and then performed the imputations. Now the method accepts a matrix
input (a recoded matrix for non-numeric data) and performs the imputations
directly on the matrix values.
---
scripts/builtin/imputeByFD.dml | 30 +++++--------------------
src/test/scripts/functions/builtin/imputeFD.dml | 22 ++++++++++++++++--
2 files changed, 26 insertions(+), 26 deletions(-)
diff --git a/scripts/builtin/imputeByFD.dml b/scripts/builtin/imputeByFD.dml
index 8ad523a..01281d2 100644
--- a/scripts/builtin/imputeByFD.dml
+++ b/scripts/builtin/imputeByFD.dml
@@ -25,7 +25,7 @@
#
---------------------------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
#
---------------------------------------------------------------------------------------------
-# F String -- Data frame
+# X Double -- Matrix X
# source Integer -- source attribute to use for imputation
and error correction
# target Integer -- attribute to be fixed
# threshold Double -- threshold value in interval [0, 1] for
robust FDs
@@ -36,39 +36,21 @@
#
---------------------------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
#
---------------------------------------------------------------------------------------------
-# imputed_F String --- Frame with possible imputations
+# X Double --- Matrix with possible imputations
-s_imputeByFD = function(Frame[String] F, Integer sourceAttribute, Integer
targetAttribute, Double threshold)
- return(Frame[String] imputed_F)
+m_imputeByFD = function(Matrix[Double] X, Integer sourceAttribute, Integer
targetAttribute, Double threshold)
+ return(Matrix[Double] X)
{
-
# sanity checks
if( threshold < 0 | threshold > 1 )
stop("Stopping due to invalid input, threshold required in interval [0, 1]
found "+threshold)
- if(sourceAttribute < 0 | sourceAttribute > ncol(F) | targetAttribute < 0 |
targetAttribute > ncol(F))
+ if(sourceAttribute < 0 | sourceAttribute > ncol(X) | targetAttribute < 0 |
targetAttribute > ncol(X))
stop("Stopping due to invalid source and target")
-
-
- # detect schema for transformation
- schema = detectSchema(F)
- s=""
- for(i in 1: ncol(F)) {
- if(as.scalar(schema[1,i]) == "STRING" | as.scalar(schema[1,i]) ==
"BOOLEAN" )
- s = s+as.integer(i)+",";
- }
-
- # recode data frame
- jspecR = "{ids:true, recode:["+s+"]}";
- [X, M] = transformencode(target=F, spec=jspecR);
-
+
# impute missing values and fix errors
X[,targetAttribute] = imputeAndCorrect(X[,sourceAttribute],
X[,targetAttribute], threshold)
-
- # getting the actual data back
- dF = transformdecode(target=X, spec=jspecR, meta=M);
- imputed_F = dF;
}
imputeAndCorrect = function(Matrix[Double] X, Matrix[Double] Y, Double
threshold)
diff --git a/src/test/scripts/functions/builtin/imputeFD.dml
b/src/test/scripts/functions/builtin/imputeFD.dml
index 9325562..4782921 100644
--- a/src/test/scripts/functions/builtin/imputeFD.dml
+++ b/src/test/scripts/functions/builtin/imputeFD.dml
@@ -16,6 +16,24 @@
#
#-------------------------------------------------------------
-X = read($1, data_type="frame", format="csv", header=FALSE);
+F = read($1, data_type="frame", format="csv", header=FALSE);
+# as the method accepts the matrix so convert the non-numeric data into matrix
+
+# detect schema for transformation
+schema = detectSchema(F)
+s=""
+for(i in 1: ncol(F)) {
+ if(as.scalar(schema[1,i]) == "STRING" | as.scalar(schema[1,i]) == "BOOLEAN" )
+ s = s+as.integer(i)+",";
+}
+
+# recode data frame
+jspecR = "{ids:true, recode:["+s+"]}";
+[X, M] = transformencode(target=F, spec=jspecR);
+# call the method
Y = imputeByFD(X, $2, $3, $4);
-write(Y, $5, format="binary")
+
+# getting the actual data back
+dF = transformdecode(target=Y, spec=jspecR, meta=M);
+
+write(dF, $5, format="binary")