http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/m-svm/m-svm.dml ---------------------------------------------------------------------- diff --git a/src/test/scripts/applications/m-svm/m-svm.dml b/src/test/scripts/applications/m-svm/m-svm.dml index 1307707..bbf5acc 100644 --- a/src/test/scripts/applications/m-svm/m-svm.dml +++ b/src/test/scripts/applications/m-svm/m-svm.dml @@ -1,145 +1,145 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# -#------------------------------------------------------------- - -# Implements multiclass SVM with squared slack variables, -# learns one-against-the-rest binary-class classifiers -# -# Example Usage: -# Assume SVM_HOME is set to the home of the dml script -# Assume input and output directories are on hdfs as INPUT_DIR and OUTPUT_DIR -# Assume number of classes is 10, epsilon = 0.001, lambda=1.0, max_iterations = 100 -# -# hadoop jar SystemML.jar -f $SVM_HOME/m-svm.dml -nvargs X=$INPUT_DIR/X Y=$INPUT_DIR/y icpt=intercept classes=10 tol=.001 reg=1.0 maxiter=100 model=$OUTPUT_DIR/w Log=$OUTPUT_DIR/Log fmt="text" -# - -cmdLine_fmt=ifdef($fmt, "text") -cmdLine_icpt = ifdef($icpt, 0) -cmdLine_tol=ifdef($tol, 0.001) -cmdLine_reg=ifdef($reg, 1.0) -cmdLine_maxiter=ifdef($maxiter, 100) - -print("icpt=" + cmdLine_icpt + " tol=" + cmdLine_tol + " reg=" + cmdLine_reg + " maxiter=" + cmdLine_maxiter) - -X = read($X) - -check_X = sum(X) -if(check_X == 0){ - print("X has no non-zeros") -}else{ - Y = read($Y) - intercept = cmdLine_icpt - num_classes = $classes - epsilon = cmdLine_tol - lambda = cmdLine_reg - max_iterations = cmdLine_maxiter - - num_samples = nrow(X) - num_features = ncol(X) - - if (intercept == 1) { - ones = matrix(1, rows=num_samples, cols=1); - X = append(X, ones); - } - - num_rows_in_w = num_features - if(intercept == 1){ - num_rows_in_w = num_rows_in_w + 1 - } - w = matrix(0, rows=num_rows_in_w, cols=num_classes) - - debug_mat = matrix(-1, rows=max_iterations, cols=num_classes) - parfor(iter_class in 1:num_classes){ - Y_local = 2 * ppred(Y, iter_class, "==") - 1 - w_class = matrix(0, rows=num_features, cols=1) - if (intercept == 1) { - zero_matrix = matrix(0, rows=1, cols=1); - w_class = t(append(t(w_class), zero_matrix)); - } - - g_old = t(X) %*% Y_local - s = g_old - - Xw = matrix(0, rows=nrow(X), cols=1) - iter = 0 - continue = 1 - while(continue == 1) { - # minimizing primal obj along direction s - step_sz = 0 - Xd = X %*% s - wd = lambda * sum(w_class 
* s) - dd = lambda * sum(s * s) - continue1 = 1 - while(continue1 == 1){ - tmp_Xw = Xw + step_sz*Xd - out = 1 - Y_local * (tmp_Xw) - sv = ppred(out, 0, ">") - out = out * sv - g = wd + step_sz*dd - sum(out * Y_local * Xd) - h = dd + sum(Xd * sv * Xd) - step_sz = step_sz - g/h - if (g*g/h < 0.0000000001){ - continue1 = 0 - } - } - - #update weights - w_class = w_class + step_sz*s - Xw = Xw + step_sz*Xd - - out = 1 - Y_local * Xw - sv = ppred(out, 0, ">") - out = sv * out - obj = 0.5 * sum(out * out) + lambda/2 * sum(w_class * w_class) - g_new = t(X) %*% (out * Y_local) - lambda * w_class - - tmp = sum(s * g_old) - - train_acc = sum(ppred(Y_local*(X%*%w_class), 0, ">="))/num_samples*100 - print("For class " + iter_class + " iteration " + iter + " training accuracy: " + train_acc) - debug_mat[iter+1,iter_class] = obj - - if((step_sz*tmp < epsilon*obj) | (iter >= max_iterations-1)){ - continue = 0 - } - - #non-linear CG step - be = sum(g_new * g_new)/sum(g_old * g_old) - s = be * s + g_new - g_old = g_new - - iter = iter + 1 - } - - w[,iter_class] = w_class - } - - write(w, $model, format=cmdLine_fmt) - - debug_str = "# Class, Iter, Obj" - for(iter_class in 1:ncol(debug_mat)){ - for(iter in 1:nrow(debug_mat)){ - obj = castAsScalar(debug_mat[iter, iter_class]) - if(obj != -1) - debug_str = append(debug_str, iter_class + "," + iter + "," + obj) - } - } - write(debug_str, $Log) -} +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# Implements multiclass SVM with squared slack variables, +# learns one-against-the-rest binary-class classifiers +# +# Example Usage: +# Assume SVM_HOME is set to the home of the dml script +# Assume input and output directories are on hdfs as INPUT_DIR and OUTPUT_DIR +# Assume number of classes is 10, epsilon = 0.001, lambda=1.0, max_iterations = 100 +# +# hadoop jar SystemML.jar -f $SVM_HOME/m-svm.dml -nvargs X=$INPUT_DIR/X Y=$INPUT_DIR/y icpt=intercept classes=10 tol=.001 reg=1.0 maxiter=100 model=$OUTPUT_DIR/w Log=$OUTPUT_DIR/Log fmt="text" +# + +cmdLine_fmt=ifdef($fmt, "text") +cmdLine_icpt = ifdef($icpt, 0) +cmdLine_tol=ifdef($tol, 0.001) +cmdLine_reg=ifdef($reg, 1.0) +cmdLine_maxiter=ifdef($maxiter, 100) + +print("icpt=" + cmdLine_icpt + " tol=" + cmdLine_tol + " reg=" + cmdLine_reg + " maxiter=" + cmdLine_maxiter) + +X = read($X) + +check_X = sum(X) +if(check_X == 0){ + print("X has no non-zeros") +}else{ + Y = read($Y) + intercept = cmdLine_icpt + num_classes = $classes + epsilon = cmdLine_tol + lambda = cmdLine_reg + max_iterations = cmdLine_maxiter + + num_samples = nrow(X) + num_features = ncol(X) + + if (intercept == 1) { + ones = matrix(1, rows=num_samples, cols=1); + X = append(X, ones); + } + + num_rows_in_w = num_features + if(intercept == 1){ + num_rows_in_w = num_rows_in_w + 1 + } + w = matrix(0, rows=num_rows_in_w, cols=num_classes) + + debug_mat = matrix(-1, rows=max_iterations, cols=num_classes) + parfor(iter_class in 1:num_classes){ + Y_local = 2 * ppred(Y, 
iter_class, "==") - 1 + w_class = matrix(0, rows=num_features, cols=1) + if (intercept == 1) { + zero_matrix = matrix(0, rows=1, cols=1); + w_class = t(append(t(w_class), zero_matrix)); + } + + g_old = t(X) %*% Y_local + s = g_old + + Xw = matrix(0, rows=nrow(X), cols=1) + iter = 0 + continue = 1 + while(continue == 1) { + # minimizing primal obj along direction s + step_sz = 0 + Xd = X %*% s + wd = lambda * sum(w_class * s) + dd = lambda * sum(s * s) + continue1 = 1 + while(continue1 == 1){ + tmp_Xw = Xw + step_sz*Xd + out = 1 - Y_local * (tmp_Xw) + sv = ppred(out, 0, ">") + out = out * sv + g = wd + step_sz*dd - sum(out * Y_local * Xd) + h = dd + sum(Xd * sv * Xd) + step_sz = step_sz - g/h + if (g*g/h < 0.0000000001){ + continue1 = 0 + } + } + + #update weights + w_class = w_class + step_sz*s + Xw = Xw + step_sz*Xd + + out = 1 - Y_local * Xw + sv = ppred(out, 0, ">") + out = sv * out + obj = 0.5 * sum(out * out) + lambda/2 * sum(w_class * w_class) + g_new = t(X) %*% (out * Y_local) - lambda * w_class + + tmp = sum(s * g_old) + + train_acc = sum(ppred(Y_local*(X%*%w_class), 0, ">="))/num_samples*100 + print("For class " + iter_class + " iteration " + iter + " training accuracy: " + train_acc) + debug_mat[iter+1,iter_class] = obj + + if((step_sz*tmp < epsilon*obj) | (iter >= max_iterations-1)){ + continue = 0 + } + + #non-linear CG step + be = sum(g_new * g_new)/sum(g_old * g_old) + s = be * s + g_new + g_old = g_new + + iter = iter + 1 + } + + w[,iter_class] = w_class + } + + write(w, $model, format=cmdLine_fmt) + + debug_str = "# Class, Iter, Obj" + for(iter_class in 1:ncol(debug_mat)){ + for(iter in 1:nrow(debug_mat)){ + obj = castAsScalar(debug_mat[iter, iter_class]) + if(obj != -1) + debug_str = append(debug_str, iter_class + "," + iter + "," + obj) + } + } + write(debug_str, $Log) +}
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/m-svm/m-svm.pydml ---------------------------------------------------------------------- diff --git a/src/test/scripts/applications/m-svm/m-svm.pydml b/src/test/scripts/applications/m-svm/m-svm.pydml index 83f17cf..348f599 100644 --- a/src/test/scripts/applications/m-svm/m-svm.pydml +++ b/src/test/scripts/applications/m-svm/m-svm.pydml @@ -1,136 +1,136 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# -#------------------------------------------------------------- - -# Implements multiclass SVM with squared slack variables, -# learns one-against-the-rest binary-class classifiers -# -# Example Usage: -# Assume SVM_HOME is set to the home of the pydml script -# Assume input and output directories are on hdfs as INPUT_DIR and OUTPUT_DIR -# Assume number of classes is 10, epsilon = 0.001, lambda=1.0, max_iterations = 100 -# -# hadoop jar SystemML.jar -f $SVM_HOME/m-svm.pydml -python -nvargs X=$INPUT_DIR/X Y=$INPUT_DIR/y icpt=intercept classes=10 tol=.001 reg=1.0 maxiter=100 model=$OUTPUT_DIR/w Log=$OUTPUT_DIR/Log fmt="text" -# - -cmdLine_fmt=ifdef($fmt, "text") -cmdLine_icpt = ifdef($icpt, 0) -cmdLine_tol=ifdef($tol, 0.001) -cmdLine_reg=ifdef($reg, 1.0) -cmdLine_maxiter=ifdef($maxiter, 100) - -print("icpt=" + cmdLine_icpt + " tol=" + cmdLine_tol + " reg=" + cmdLine_reg + " maxiter=" + cmdLine_maxiter) - -X = load($X) - -check_X = sum(X) -if(check_X == 0): - print("X has no non-zeros") -else: - Y = load($Y) - intercept = cmdLine_icpt - num_classes = $classes - epsilon = cmdLine_tol - lambda = cmdLine_reg - max_iterations = cmdLine_maxiter - - num_samples = nrow(X) - num_features = ncol(X) - - if (intercept == 1): - ones = full(1, rows=num_samples, cols=1) - X = append(X, ones) - - num_rows_in_w = num_features - if(intercept == 1): - num_rows_in_w = num_rows_in_w + 1 - w = full(0, rows=num_rows_in_w, cols=num_classes) - - debug_mat = full(-1, rows=max_iterations, cols=num_classes) - parfor(iter_class in 1:num_classes): - Y_local = 2 * ppred(Y, iter_class, "==") - 1 - w_class = full(0, rows=num_features, cols=1) - if (intercept == 1): - zero_matrix = full(0, rows=1, cols=1) - w_class = transpose(append(transpose(w_class), zero_matrix)) - g_old = dot(transpose(X), Y_local) - s = g_old - - Xw = full(0, rows=nrow(X), cols=1) - iter = 0 - continue = 1 - while(continue == 1): - # minimizing primal obj along direction s - step_sz = 0 - Xd = dot(X, s) - wd = lambda * 
sum(w_class * s) - dd = lambda * sum(s * s) - continue1 = 1 - while(continue1 == 1): - tmp_Xw = Xw + step_sz*Xd - out = 1 - Y_local * (tmp_Xw) - sv = ppred(out, 0, ">") - out = out * sv - g = wd + step_sz*dd - sum(out * Y_local * Xd) - h = dd + sum(Xd * sv * Xd) - step_sz = step_sz - g/h - if (g*g/h < 0.0000000001): - continue1 = 0 - - #update weights - w_class = w_class + step_sz*s - Xw = Xw + step_sz*Xd - - out = 1 - Y_local * Xw - sv = ppred(out, 0, ">") - out = sv * out - obj = 0.5 * sum(out * out) + lambda/2 * sum(w_class * w_class) - g_new = dot(transpose(X), (out * Y_local)) - lambda * w_class - - tmp = sum(s * g_old) - - train_acc = sum(ppred(Y_local*(dot(X, w_class)), 0, ">="))/num_samples*100 - print("For class " + iter_class + " iteration " + iter + " training accuracy: " + train_acc) - debug_mat[iter+1,iter_class] = obj - - if((step_sz*tmp < epsilon*obj) | (iter >= max_iterations-1)): - continue = 0 - - #non-linear CG step - be = sum(g_new * g_new)/sum(g_old * g_old) - s = be * s + g_new - g_old = g_new - - iter = iter + 1 - # end while(continue == 1) - - w[,iter_class] = w_class - # end parfor(iter_class in 1:num_classes) - - save(w, $model, format=cmdLine_fmt) - - debug_str = "# Class, Iter, Obj" - for(iter_class in 1:ncol(debug_mat)): - for(iter in 1:nrow(debug_mat)): - obj = castAsScalar(debug_mat[iter, iter_class]) - if(obj != -1): - debug_str = append(debug_str, iter_class + "," + iter + "," + obj) - save(debug_str, $Log) - +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# Implements multiclass SVM with squared slack variables, +# learns one-against-the-rest binary-class classifiers +# +# Example Usage: +# Assume SVM_HOME is set to the home of the pydml script +# Assume input and output directories are on hdfs as INPUT_DIR and OUTPUT_DIR +# Assume number of classes is 10, epsilon = 0.001, lambda=1.0, max_iterations = 100 +# +# hadoop jar SystemML.jar -f $SVM_HOME/m-svm.pydml -python -nvargs X=$INPUT_DIR/X Y=$INPUT_DIR/y icpt=intercept classes=10 tol=.001 reg=1.0 maxiter=100 model=$OUTPUT_DIR/w Log=$OUTPUT_DIR/Log fmt="text" +# + +cmdLine_fmt=ifdef($fmt, "text") +cmdLine_icpt = ifdef($icpt, 0) +cmdLine_tol=ifdef($tol, 0.001) +cmdLine_reg=ifdef($reg, 1.0) +cmdLine_maxiter=ifdef($maxiter, 100) + +print("icpt=" + cmdLine_icpt + " tol=" + cmdLine_tol + " reg=" + cmdLine_reg + " maxiter=" + cmdLine_maxiter) + +X = load($X) + +check_X = sum(X) +if(check_X == 0): + print("X has no non-zeros") +else: + Y = load($Y) + intercept = cmdLine_icpt + num_classes = $classes + epsilon = cmdLine_tol + lambda = cmdLine_reg + max_iterations = cmdLine_maxiter + + num_samples = nrow(X) + num_features = ncol(X) + + if (intercept == 1): + ones = full(1, rows=num_samples, cols=1) + X = append(X, ones) + + num_rows_in_w = num_features + if(intercept == 1): + num_rows_in_w = num_rows_in_w + 1 + w = full(0, rows=num_rows_in_w, cols=num_classes) + + debug_mat = full(-1, rows=max_iterations, cols=num_classes) + parfor(iter_class in 1:num_classes): + Y_local = 2 * ppred(Y, iter_class, 
"==") - 1 + w_class = full(0, rows=num_features, cols=1) + if (intercept == 1): + zero_matrix = full(0, rows=1, cols=1) + w_class = transpose(append(transpose(w_class), zero_matrix)) + g_old = dot(transpose(X), Y_local) + s = g_old + + Xw = full(0, rows=nrow(X), cols=1) + iter = 0 + continue = 1 + while(continue == 1): + # minimizing primal obj along direction s + step_sz = 0 + Xd = dot(X, s) + wd = lambda * sum(w_class * s) + dd = lambda * sum(s * s) + continue1 = 1 + while(continue1 == 1): + tmp_Xw = Xw + step_sz*Xd + out = 1 - Y_local * (tmp_Xw) + sv = ppred(out, 0, ">") + out = out * sv + g = wd + step_sz*dd - sum(out * Y_local * Xd) + h = dd + sum(Xd * sv * Xd) + step_sz = step_sz - g/h + if (g*g/h < 0.0000000001): + continue1 = 0 + + #update weights + w_class = w_class + step_sz*s + Xw = Xw + step_sz*Xd + + out = 1 - Y_local * Xw + sv = ppred(out, 0, ">") + out = sv * out + obj = 0.5 * sum(out * out) + lambda/2 * sum(w_class * w_class) + g_new = dot(transpose(X), (out * Y_local)) - lambda * w_class + + tmp = sum(s * g_old) + + train_acc = sum(ppred(Y_local*(dot(X, w_class)), 0, ">="))/num_samples*100 + print("For class " + iter_class + " iteration " + iter + " training accuracy: " + train_acc) + debug_mat[iter+1,iter_class] = obj + + if((step_sz*tmp < epsilon*obj) | (iter >= max_iterations-1)): + continue = 0 + + #non-linear CG step + be = sum(g_new * g_new)/sum(g_old * g_old) + s = be * s + g_new + g_old = g_new + + iter = iter + 1 + # end while(continue == 1) + + w[,iter_class] = w_class + # end parfor(iter_class in 1:num_classes) + + save(w, $model, format=cmdLine_fmt) + + debug_str = "# Class, Iter, Obj" + for(iter_class in 1:ncol(debug_mat)): + for(iter in 1:nrow(debug_mat)): + obj = castAsScalar(debug_mat[iter, iter_class]) + if(obj != -1): + debug_str = append(debug_str, iter_class + "," + iter + "," + obj) + save(debug_str, $Log) + 
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/mdabivar/MDABivariateStats.R ---------------------------------------------------------------------- diff --git a/src/test/scripts/applications/mdabivar/MDABivariateStats.R b/src/test/scripts/applications/mdabivar/MDABivariateStats.R index 7715c5e..1844bbb 100644 --- a/src/test/scripts/applications/mdabivar/MDABivariateStats.R +++ b/src/test/scripts/applications/mdabivar/MDABivariateStats.R @@ -1,294 +1,294 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# -#------------------------------------------------------------- - -bivar_ss = function(X, Y) { - - # Unweighted co-variance - covXY = cov(X,Y) - - # compute standard deviations for both X and Y by computing 2^nd central moment - m2X = var(X) - m2Y = var(Y) - sigmaX = sqrt(m2X) - sigmaY = sqrt(m2Y) - - # Pearson's R - R = covXY / (sigmaX*sigmaY) - - return(list("R" = R, "covXY" = covXY, "sigmaX" = sigmaX, "sigmaY" = sigmaY)) -} - -# ----------------------------------------------------------------------------------------------------------- - -bivar_cc = function(A, B) { - - # Contingency Table - F = table(A,B) - - # Chi-Squared - cst = chisq.test(F) - - r = rowSums(F) - c = colSums(F) - - chi_squared = as.numeric(cst[1]) - - # compute p-value - pValue = as.numeric(cst[3]) - - # Assign return values - pval = pValue - contingencyTable = F - rowMarginals = r - colMarginals = c - - return(list("pval" = pval, "contingencyTable" = contingencyTable, "rowMarginals" = rowMarginals, "colMarginals" = colMarginals)) -} - -# ----------------------------------------------------------------------------------------------------------- - -# Y points to SCALE variable -# A points to CATEGORICAL variable -bivar_sc = function(Y, A) { - # mean and variance in target variable - W = length(A) - my = mean(Y) - varY = var(Y) - - # category-wise (frequencies, means, variances) - CFreqs = as.matrix(table(A)) - - CMeans = as.matrix(aggregate(Y, by=list(A), "mean")$x) - - CVars = as.matrix(aggregate(Y, by=list(A), "var")$x) - CVars[is.na(CVars)] <- 0 - - # number of categories - R = nrow(CFreqs) - df1 = R-1 - df2 = W-R - - anova_num = sum( (CFreqs*(CMeans-my)^2) )/(R-1) - anova_den = sum( (CFreqs-1)*CVars )/(W-R) - AnovaF = anova_num/anova_den - pVal = 1-pf(AnovaF, df1, df2) - - return(list("pVal" = pVal, "CFreqs" = CFreqs, "CMeans" = CMeans, "CVars" = CVars)) -} - -# Main starts here ----------------------------------------------------------------------------------------------------------- - 
-args <- commandArgs(TRUE) - -library(Matrix) - -# input data set -D = readMM(paste(args[1], "X.mtx", sep="")); - -# label attr id (must be a valid index > 0) -label_index = as.integer(args[2]) - -# feature attributes, column vector of indices -feature_indices = readMM(paste(args[1], "feature_indices.mtx", sep="")) - -# can be either 1 (scale) or 0 (categorical) -label_measurement_level = as.integer(args[3]) - -# measurement levels for features, 0/1 column vector -feature_measurement_levels = readMM(paste(args[1], "feature_measurement_levels.mtx", sep="")) - -sz = ncol(D) - -# store for pvalues and pearson's r -stats = matrix(0, sz, 1) -# store for type of test performed: 1 is chi-sq, 2 is ftest, 3 is pearson's -tests = matrix(0, sz, 1) -# store for covariances used to compute pearson's r -covariances = matrix(0, sz, 1) -# store for standard deviations used to compute pearson's r -standard_deviations = matrix(0, sz, 1) - -labels = D[,label_index] - -labelCorrection = 0 -if(label_measurement_level == 1){ - numLabels = length(labels) - cmLabels = var(labels) - stdLabels = sqrt(cmLabels) - standard_deviations[label_index,1] = stdLabels -}else{ - labelCorrection = 1 - min(labels) - labels = labels + labelCorrection -} - -mx = apply(D, 2, max) -mn = apply(D, 2, min) -num_distinct_values = mx-mn+1 -max_num_distinct_values = 0 -for(i1 in 1:nrow(feature_indices)){ - feature_index1 = feature_indices[i1,1] - num = num_distinct_values[feature_index1] - if(feature_measurement_levels[i1,1] == 0 & num >= max_num_distinct_values){ - max_num_distinct_values = num - } -} -distinct_label_values = matrix(0, 1, 1) -contingencyTableSz = 1 -maxNumberOfGroups = 1 -if(max_num_distinct_values != 0){ - maxNumberOfGroups = max_num_distinct_values -} -if(label_measurement_level==0){ - distinct_label_values = as.data.frame(table(labels))$Freq - if(max_num_distinct_values != 0){ - contingencyTableSz = max_num_distinct_values*length(distinct_label_values) - } - maxNumberOfGroups = 
max(maxNumberOfGroups, length(distinct_label_values)) -} -# store for contingency table cell values -contingencyTablesCounts = matrix(0, sz, contingencyTableSz) -# store for contingency table label(row) assignments -contingencyTablesLabelValues = matrix(0, sz, contingencyTableSz) -# store for contingency table feature(col) assignments -contingencyTablesFeatureValues = matrix(0, sz, contingencyTableSz) -# store for distinct values -featureValues = matrix(0, sz, maxNumberOfGroups) -# store for counts of distinct values -featureCounts = matrix(0, sz, maxNumberOfGroups) -# store for group means -featureMeans = matrix(0, sz, maxNumberOfGroups) -# store for group standard deviations -featureSTDs = matrix(0, sz, maxNumberOfGroups) - -if(label_measurement_level == 0){ - featureCounts[label_index,1:length(distinct_label_values)] = distinct_label_values - for(i2 in 1:length(distinct_label_values)){ - featureValues[label_index,i2] = i2-labelCorrection - } -} - -for(i3 in 1:nrow(feature_indices)){ - feature_index2 = feature_indices[i3,1] - feature_measurement_level = feature_measurement_levels[i3,1] - - feature = D[,feature_index2] - - if(feature_measurement_level == 0){ - featureCorrection = 1 - min(feature) - feature = feature + featureCorrection - - if(label_measurement_level == feature_measurement_level){ - # categorical-categorical - tests[feature_index2,1] = 1 - - ret = bivar_cc(labels, feature) - pVal = ret$pval - contingencyTable = ret$contingencyTable - rowMarginals = ret$rowMarginals - colMarginals = ret$colMarginals - - stats[feature_index2,1] = pVal - - sz3 = nrow(contingencyTable)*ncol(contingencyTable) - - contingencyTableCounts = matrix(0, 1, sz3) - contingencyTableLabelValues = matrix(0, 1, sz3) - contingencyTableFeatureValues = matrix(0, 1, sz3) - - for(i4 in 1:nrow(contingencyTable)){ - for(j in 1:ncol(contingencyTable)){ - #get rid of this, see *1 below - contingencyTableCounts[1, ncol(contingencyTable)*(i4-1)+j] = contingencyTable[i4,j] - - 
contingencyTableLabelValues[1, ncol(contingencyTable)*(i4-1)+j] = i4-labelCorrection - contingencyTableFeatureValues[1, ncol(contingencyTable)*(i4-1)+j] = j-featureCorrection - } - } - contingencyTablesCounts[feature_index2,1:sz3] = contingencyTableCounts - - contingencyTablesLabelValues[feature_index2,1:sz3] = contingencyTableLabelValues - contingencyTablesFeatureValues[feature_index2,1:sz3] = contingencyTableFeatureValues - - featureCounts[feature_index2,1:length(colMarginals)] = colMarginals - for(i5 in 1:length(colMarginals)){ - featureValues[feature_index2,i5] = i5-featureCorrection - } - }else{ - # label is scale, feature is categorical - tests[feature_index2,1] = 2 - - ret = bivar_sc(labels, feature) - pVal = ret$pVal - frequencies = ret$CFreqs - means = ret$CMeans - variances = ret$CVars - - stats[feature_index2,1] = pVal - featureCounts[feature_index2,1:nrow(frequencies)] = t(frequencies) - for(i6 in 1:nrow(frequencies)){ - featureValues[feature_index2,i6] = i6 - featureCorrection - } - featureMeans[feature_index2,1:nrow(means)] = t(means) - featureSTDs[feature_index2,1:nrow(variances)] = t(sqrt(variances)) - } - }else{ - if(label_measurement_level == feature_measurement_level){ - # scale-scale - tests[feature_index2,1] = 3 - - ret = bivar_ss(labels, feature) - r = ret$R - covariance = ret$covXY - stdX = ret$sigmaX - stdY = ret$sigmaY - - stats[feature_index2,1] = r - covariances[feature_index2,1] = covariance - standard_deviations[feature_index2,1] = stdY - }else{ - # label is categorical, feature is scale - tests[feature_index2,1] = 2 - - ret = bivar_sc(feature, labels) - pVal = ret$pVal - frequencies = ret$CFreqs - means = ret$CMeans - variances = ret$CVars - - stats[feature_index2,1] = pVal - featureMeans[feature_index2,1:nrow(means)] = t(means) - featureSTDs[feature_index2,1:nrow(variances)] = t(sqrt(variances)) - } - } -} - -writeMM(as(stats, "CsparseMatrix"), paste(args[4], "stats", sep="")) -writeMM(as(tests, "CsparseMatrix"), paste(args[4], 
"tests", sep="")) -writeMM(as(covariances, "CsparseMatrix"), paste(args[4], "covariances", sep="")) -writeMM(as(standard_deviations, "CsparseMatrix"), paste(args[4], "standard_deviations", sep="")) -writeMM(as(contingencyTablesCounts, "CsparseMatrix"), paste(args[4], "contingencyTablesCounts", sep="")) -writeMM(as(contingencyTablesLabelValues, "CsparseMatrix"), paste(args[4], "contingencyTablesLabelValues", sep="")) -writeMM(as(contingencyTablesFeatureValues, "CsparseMatrix"), paste(args[4], "contingencyTablesFeatureValues", sep="")) -writeMM(as(featureValues, "CsparseMatrix"), paste(args[4], "featureValues", sep="")) -writeMM(as(featureCounts, "CsparseMatrix"), paste(args[4], "featureCounts", sep="")) -writeMM(as(featureMeans, "CsparseMatrix"), paste(args[4], "featureMeans", sep="")) -writeMM(as(featureSTDs, "CsparseMatrix"), paste(args[4], "featureSTDs", sep="")) - +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +#------------------------------------------------------------- + +bivar_ss = function(X, Y) { + + # Unweighted co-variance + covXY = cov(X,Y) + + # compute standard deviations for both X and Y by computing 2^nd central moment + m2X = var(X) + m2Y = var(Y) + sigmaX = sqrt(m2X) + sigmaY = sqrt(m2Y) + + # Pearson's R + R = covXY / (sigmaX*sigmaY) + + return(list("R" = R, "covXY" = covXY, "sigmaX" = sigmaX, "sigmaY" = sigmaY)) +} + +# ----------------------------------------------------------------------------------------------------------- + +bivar_cc = function(A, B) { + + # Contingency Table + F = table(A,B) + + # Chi-Squared + cst = chisq.test(F) + + r = rowSums(F) + c = colSums(F) + + chi_squared = as.numeric(cst[1]) + + # compute p-value + pValue = as.numeric(cst[3]) + + # Assign return values + pval = pValue + contingencyTable = F + rowMarginals = r + colMarginals = c + + return(list("pval" = pval, "contingencyTable" = contingencyTable, "rowMarginals" = rowMarginals, "colMarginals" = colMarginals)) +} + +# ----------------------------------------------------------------------------------------------------------- + +# Y points to SCALE variable +# A points to CATEGORICAL variable +bivar_sc = function(Y, A) { + # mean and variance in target variable + W = length(A) + my = mean(Y) + varY = var(Y) + + # category-wise (frequencies, means, variances) + CFreqs = as.matrix(table(A)) + + CMeans = as.matrix(aggregate(Y, by=list(A), "mean")$x) + + CVars = as.matrix(aggregate(Y, by=list(A), "var")$x) + CVars[is.na(CVars)] <- 0 + + # number of categories + R = nrow(CFreqs) + df1 = R-1 + df2 = W-R + + anova_num = sum( (CFreqs*(CMeans-my)^2) )/(R-1) + anova_den = sum( (CFreqs-1)*CVars )/(W-R) + AnovaF = anova_num/anova_den + pVal = 1-pf(AnovaF, df1, df2) + + return(list("pVal" = pVal, "CFreqs" = CFreqs, "CMeans" = CMeans, "CVars" = CVars)) +} + +# Main starts here ----------------------------------------------------------------------------------------------------------- + 
# ---------------------------------------------------------------------------
# Main: MDA bivariate statistics driver (R reference implementation).
#
# Command-line arguments:
#   args[1] - input directory prefix holding X.mtx, feature_indices.mtx and
#             feature_measurement_levels.mtx (MatrixMarket format)
#   args[2] - label attribute id (must be a valid column index > 0)
#   args[3] - label measurement level: 1 (scale) or 0 (categorical)
#   args[4] - output directory prefix for the result matrices
#
# For every listed feature column the appropriate bivariate test against the
# label is chosen (1 = chi-squared, 2 = F-test, 3 = Pearson's R) and the
# statistics plus per-test side results are written as sparse matrices.
# Relies on bivar_ss / bivar_cc / bivar_sc defined earlier in this file.
# ---------------------------------------------------------------------------

args <- commandArgs(TRUE)

library(Matrix)

# input data set
D <- readMM(paste0(args[1], "X.mtx"))

# label attr id (must be a valid index > 0)
label_index <- as.integer(args[2])

# feature attributes, column vector of indices
feature_indices <- readMM(paste0(args[1], "feature_indices.mtx"))

# can be either 1 (scale) or 0 (categorical)
label_measurement_level <- as.integer(args[3])

# measurement levels for features, 0/1 column vector
feature_measurement_levels <- readMM(paste0(args[1], "feature_measurement_levels.mtx"))

sz <- ncol(D)

# store for pvalues and pearson's r
stats <- matrix(0, sz, 1)
# store for type of test performed: 1 is chi-sq, 2 is ftest, 3 is pearson's
tests <- matrix(0, sz, 1)
# store for covariances used to compute pearson's r
covariances <- matrix(0, sz, 1)
# store for standard deviations used to compute pearson's r
standard_deviations <- matrix(0, sz, 1)

labels <- D[, label_index]

labelCorrection <- 0
if (label_measurement_level == 1) {
  # scale label: record its sample standard deviation
  standard_deviations[label_index, 1] <- sqrt(var(labels))
} else {
  # categorical label: shift codes so the smallest category becomes 1
  labelCorrection <- 1 - min(labels)
  labels <- labels + labelCorrection
}

# per-column value range gives an upper bound on the category count
mx <- apply(D, 2, max)
mn <- apply(D, 2, min)
num_distinct_values <- mx - mn + 1
max_num_distinct_values <- 0
for (i1 in seq_len(nrow(feature_indices))) {
  feature_index1 <- feature_indices[i1, 1]
  num <- num_distinct_values[feature_index1]
  if (feature_measurement_levels[i1, 1] == 0 && num >= max_num_distinct_values) {
    max_num_distinct_values <- num
  }
}

distinct_label_values <- matrix(0, 1, 1)
contingencyTableSz <- 1
maxNumberOfGroups <- 1
if (max_num_distinct_values != 0) {
  maxNumberOfGroups <- max_num_distinct_values
}
if (label_measurement_level == 0) {
  distinct_label_values <- as.data.frame(table(labels))$Freq
  if (max_num_distinct_values != 0) {
    contingencyTableSz <- max_num_distinct_values * length(distinct_label_values)
  }
  maxNumberOfGroups <- max(maxNumberOfGroups, length(distinct_label_values))
}

# store for contingency table cell values
contingencyTablesCounts <- matrix(0, sz, contingencyTableSz)
# store for contingency table label(row) assignments
contingencyTablesLabelValues <- matrix(0, sz, contingencyTableSz)
# store for contingency table feature(col) assignments
contingencyTablesFeatureValues <- matrix(0, sz, contingencyTableSz)
# store for distinct values
featureValues <- matrix(0, sz, maxNumberOfGroups)
# store for counts of distinct values
featureCounts <- matrix(0, sz, maxNumberOfGroups)
# store for group means
featureMeans <- matrix(0, sz, maxNumberOfGroups)
# store for group standard deviations
featureSTDs <- matrix(0, sz, maxNumberOfGroups)

if (label_measurement_level == 0) {
  featureCounts[label_index, seq_along(distinct_label_values)] <- distinct_label_values
  for (i2 in seq_along(distinct_label_values)) {
    featureValues[label_index, i2] <- i2 - labelCorrection
  }
}

for (i3 in seq_len(nrow(feature_indices))) {
  feature_index2 <- feature_indices[i3, 1]
  feature_measurement_level <- feature_measurement_levels[i3, 1]

  feature <- D[, feature_index2]

  if (feature_measurement_level == 0) {
    # categorical feature: shift codes so the smallest category becomes 1
    featureCorrection <- 1 - min(feature)
    feature <- feature + featureCorrection

    if (label_measurement_level == feature_measurement_level) {
      # categorical-categorical: chi-squared test
      tests[feature_index2, 1] <- 1

      ret <- bivar_cc(labels, feature)
      contingencyTable <- ret$contingencyTable
      colMarginals <- ret$colMarginals

      stats[feature_index2, 1] <- ret$pval

      nCols <- ncol(contingencyTable)
      sz3 <- nrow(contingencyTable) * nCols

      # flatten the contingency table row-major, together with the
      # (label value, feature value) pair each cell corresponds to
      contingencyTableCounts <- matrix(0, 1, sz3)
      contingencyTableLabelValues <- matrix(0, 1, sz3)
      contingencyTableFeatureValues <- matrix(0, 1, sz3)
      for (i4 in seq_len(nrow(contingencyTable))) {
        for (j in seq_len(nCols)) {
          cell <- nCols * (i4 - 1) + j
          contingencyTableCounts[1, cell] <- contingencyTable[i4, j]
          contingencyTableLabelValues[1, cell] <- i4 - labelCorrection
          contingencyTableFeatureValues[1, cell] <- j - featureCorrection
        }
      }
      contingencyTablesCounts[feature_index2, 1:sz3] <- contingencyTableCounts
      contingencyTablesLabelValues[feature_index2, 1:sz3] <- contingencyTableLabelValues
      contingencyTablesFeatureValues[feature_index2, 1:sz3] <- contingencyTableFeatureValues

      featureCounts[feature_index2, seq_along(colMarginals)] <- colMarginals
      for (i5 in seq_along(colMarginals)) {
        featureValues[feature_index2, i5] <- i5 - featureCorrection
      }
    } else {
      # label is scale, feature is categorical: one-way ANOVA F-test
      tests[feature_index2, 1] <- 2

      ret <- bivar_sc(labels, feature)
      frequencies <- ret$CFreqs
      means <- ret$CMeans
      variances <- ret$CVars

      stats[feature_index2, 1] <- ret$pVal
      featureCounts[feature_index2, 1:nrow(frequencies)] <- t(frequencies)
      for (i6 in seq_len(nrow(frequencies))) {
        featureValues[feature_index2, i6] <- i6 - featureCorrection
      }
      featureMeans[feature_index2, 1:nrow(means)] <- t(means)
      featureSTDs[feature_index2, 1:nrow(variances)] <- t(sqrt(variances))
    }
  } else {
    if (label_measurement_level == feature_measurement_level) {
      # scale-scale: Pearson's R
      tests[feature_index2, 1] <- 3

      ret <- bivar_ss(labels, feature)
      stats[feature_index2, 1] <- ret$R
      covariances[feature_index2, 1] <- ret$covXY
      standard_deviations[feature_index2, 1] <- ret$sigmaY
    } else {
      # label is categorical, feature is scale: one-way ANOVA F-test
      tests[feature_index2, 1] <- 2

      ret <- bivar_sc(feature, labels)
      means <- ret$CMeans
      variances <- ret$CVars

      stats[feature_index2, 1] <- ret$pVal
      featureMeans[feature_index2, 1:nrow(means)] <- t(means)
      featureSTDs[feature_index2, 1:nrow(variances)] <- t(sqrt(variances))
    }
  }
}

# emit every result matrix in sparse MatrixMarket format under args[4]
write_result <- function(m, name) {
  writeMM(as(m, "CsparseMatrix"), paste0(args[4], name))
}
write_result(stats, "stats")
write_result(tests, "tests")
write_result(covariances, "covariances")
write_result(standard_deviations, "standard_deviations")
write_result(contingencyTablesCounts, "contingencyTablesCounts")
write_result(contingencyTablesLabelValues, "contingencyTablesLabelValues")
write_result(contingencyTablesFeatureValues, "contingencyTablesFeatureValues")
write_result(featureValues, "featureValues")
write_result(featureCounts, "featureCounts")
write_result(featureMeans, "featureMeans")
write_result(featureSTDs, "featureSTDs")
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------

# Main starts here -----------------------------------------------------------------------------------------------------------

# input data set
D = read($1)

# label attr id (must be a valid index > 0)
label_index = $2

# feature attributes, column vector of indices
feature_indices = read($3)

# can be either 1 (scale) or 0 (categorical)
label_measurement_level = $4

# measurement levels for features, 0/1 column vector
feature_measurement_levels = read($5)

sz = ncol(D)

# store for pvalues and pearson's r
stats = matrix(0, rows=sz, cols=1)
# store for type of test performed: 1 is chi-sq, 2 is ftest, 3 is pearson's
tests = matrix(0, rows=sz, cols=1)
# store for covariances used to compute pearson's r
covariances = matrix(0, rows=sz, cols=1)
# store for standard deviations used to compute pearson's r
standard_deviations = matrix(0, rows=sz, cols=1)

labels = D[,label_index]

labelCorrection = 0
if(label_measurement_level == 1){
  # scale label: record its sample standard deviation
  numLabels = nrow(labels)
  cmLabels = moment(labels,2)
  stdLabels = sqrt(cmLabels * (numLabels/(numLabels-1.0)) )
  standard_deviations[label_index,1] = stdLabels
}else{
  # categorical label: shift codes so the smallest category becomes 1
  labelCorrection = 1 - min(labels)
  labels = labels + labelCorrection
}

# per-column value range gives an upper bound on the category count
mx = colMaxs(D)
mn = colMins(D)
num_distinct_values = mx-mn+1
max_num_distinct_values = 0
for(i1 in 1:nrow(feature_indices)){
  feature_index1 = castAsScalar(feature_indices[i1,1])
  num = castAsScalar(num_distinct_values[1,feature_index1])
  if(castAsScalar(feature_measurement_levels[i1,1]) == 0 & num >= max_num_distinct_values){
    max_num_distinct_values = num
  }
}
distinct_label_values = matrix(0, rows=1, cols=1)
contingencyTableSz = 1
maxNumberOfGroups = 1
if(max_num_distinct_values != 0){
  maxNumberOfGroups = max_num_distinct_values
}
if(label_measurement_level==0){
  distinct_label_values = aggregate(target=labels, groups=labels, fn="count")
  if(max_num_distinct_values != 0){
    contingencyTableSz = max_num_distinct_values*nrow(distinct_label_values)
  }
  maxNumberOfGroups = max(maxNumberOfGroups, nrow(distinct_label_values))
}
# store for contingency table cell values
contingencyTablesCounts = matrix(0, rows=sz, cols=contingencyTableSz)
# store for contingency table label(row) assignments
contingencyTablesLabelValues = matrix(0, rows=sz, cols=contingencyTableSz)
# store for contingency table feature(col) assignments
contingencyTablesFeatureValues = matrix(0, rows=sz, cols=contingencyTableSz)
# store for distinct values
featureValues = matrix(0, rows=sz, cols=maxNumberOfGroups)
# store for counts of distinct values
featureCounts = matrix(0, rows=sz, cols=maxNumberOfGroups)
# store for group means
featureMeans = matrix(0, rows=sz, cols=maxNumberOfGroups)
# store for group standard deviations
featureSTDs = matrix(0, rows=sz, cols=maxNumberOfGroups)

if(label_measurement_level == 0){
  featureCounts[label_index,1:nrow(distinct_label_values)] = t(distinct_label_values)
  parfor(i2 in 1:nrow(distinct_label_values)){
    featureValues[label_index,i2] = i2-labelCorrection
  }
}

parfor(i3 in 1:nrow(feature_indices), check=0){
  feature_index2 = castAsScalar(feature_indices[i3,1])
  feature_measurement_level = castAsScalar(feature_measurement_levels[i3,1])

  feature = D[,feature_index2]

  if(feature_measurement_level == 0){
    # categorical feature: shift codes so the smallest category becomes 1
    featureCorrection = 1 - min(feature)
    feature = feature + featureCorrection

    if(label_measurement_level == feature_measurement_level){
      # categorical-categorical: chi-squared test
      tests[feature_index2,1] = 1
      [pVal, contingencyTable, rowMarginals, colMarginals] = bivar_cc(labels, feature)
      stats[feature_index2,1] = pVal

      sz3=1
      # the redundant if forces sz3 to be re-evaluated as a scalar
      if(1==1){
        sz3 = nrow(contingencyTable)*ncol(contingencyTable)
      }
      contingencyTableLabelValues = matrix(0, rows=1, cols=sz3)
      contingencyTableFeatureValues = matrix(0, rows=1, cols=sz3)

      # flatten (label value, feature value) assignments row-major
      parfor(i4 in 1:nrow(contingencyTable), check=0){
        parfor(j in 1:ncol(contingencyTable), check=0){
          contingencyTableLabelValues[1, ncol(contingencyTable)*(i4-1)+j] = i4-labelCorrection
          contingencyTableFeatureValues[1, ncol(contingencyTable)*(i4-1)+j] = j-featureCorrection
        }
      }
      # flatten the contingency table itself row-major
      contingencyTableCounts = matrix(contingencyTable, rows=1, cols=sz3, byrow=TRUE)
      contingencyTablesCounts[feature_index2,1:sz3] = contingencyTableCounts

      contingencyTablesLabelValues[feature_index2,1:sz3] = contingencyTableLabelValues
      contingencyTablesFeatureValues[feature_index2,1:sz3] = contingencyTableFeatureValues

      featureCounts[feature_index2,1:ncol(colMarginals)] = colMarginals
      parfor(i5 in 1:ncol(colMarginals), check=0){
        featureValues[feature_index2,i5] = i5-featureCorrection
      }
    }else{
      # label is scale, feature is categorical: one-way ANOVA F-test
      tests[feature_index2,1] = 2
      [pVal, frequencies, means, variances] = bivar_sc(labels, feature)
      stats[feature_index2,1] = pVal
      featureCounts[feature_index2,1:nrow(frequencies)] = t(frequencies)
      parfor(i6 in 1:nrow(frequencies), check=0){
        featureValues[feature_index2,i6] = i6 - featureCorrection
      }
      featureMeans[feature_index2,1:nrow(means)] = t(means)
      featureSTDs[feature_index2,1:nrow(variances)] = t(sqrt(variances))
    }
  }else{
    if(label_measurement_level == feature_measurement_level){
      # scale-scale: Pearson's R
      tests[feature_index2,1] = 3
      [r, covariance, stdX, stdY] = bivar_ss(labels, feature)
      stats[feature_index2,1] = r
      covariances[feature_index2,1] = covariance
      standard_deviations[feature_index2,1] = stdY
    }else{
      # label is categorical, feature is scale: one-way ANOVA F-test
      tests[feature_index2,1] = 2
      [pVal, frequencies, means, variances] = bivar_sc(feature, labels)
      stats[feature_index2,1] = pVal
      featureMeans[feature_index2,1:nrow(means)] = t(means)
      featureSTDs[feature_index2,1:nrow(variances)] = t(sqrt(variances))
    }
  }
}

write(stats, $6, format="text")
write(tests, $7, format="text")
write(covariances, $8, format="text")
write(standard_deviations, $9, format="text")
write(contingencyTablesCounts, $10, format="text")
write(contingencyTablesLabelValues, $11, format="text")
write(contingencyTablesFeatureValues, $12, format="text")
write(featureValues, $13, format="text")
write(featureCounts, $14, format="text")
write(featureMeans, $15, format="text")
write(featureSTDs, $16, format="text")

# -----------------------------------------------------------------------------------------------------------

# Pearson correlation between two scale variables.
bivar_ss = function(Matrix[Double] X, Matrix[Double] Y) return (Double R, Double covXY, Double sigmaX, Double sigmaY) {

  # Unweighted co-variance
  covXY = cov(X,Y)

  # compute standard deviations for both X and Y by computing 2^nd central moment
  W = nrow(X)
  m2X = moment(X,2)
  m2Y = moment(Y,2)
  sigmaX = sqrt(m2X * (W/(W-1.0)) )
  sigmaY = sqrt(m2Y * (W/(W-1.0)) )

  # Pearson's R
  R = covXY / (sigmaX*sigmaY)
}

# -----------------------------------------------------------------------------------------------------------

# Chi-squared test of independence between two categorical variables.
bivar_cc = function(Matrix[Double] A, Matrix[Double] B) return (Double pval, Matrix[Double] contingencyTable, Matrix[Double] rowMarginals, Matrix[Double] colMarginals) {

  # Contingency Table
  FF = table(A,B)
  # drop empty rows/columns so the degrees of freedom reflect
  # only categories that actually occur
  tmp = removeEmpty(target=FF, margin="rows");
  F = removeEmpty(target=tmp, margin="cols");

  # Chi-Squared
  W = sum(F)
  r = rowSums(F)
  c = colSums(F)
  E = (r %*% c)/W
  # guard against division by zero in empty expected cells
  E = ppred(E, 0, "==")*0.0001 + E
  T = (F-E)^2/E
  chi_squared = sum(T)

  # compute p-value
  degFreedom = (nrow(F)-1)*(ncol(F)-1)
  pValue = pchisq(target=chi_squared, df=degFreedom, lower.tail=FALSE)

  # Assign return values
  pval = pValue
  contingencyTable = F
  rowMarginals = r
  colMarginals = c
}

# -----------------------------------------------------------------------------------------------------------

# One-way ANOVA F-test.
# Y points to SCALE variable
# A points to CATEGORICAL variable
bivar_sc = function(Matrix[Double] Y, Matrix[Double] A) return (Double pVal, Matrix[Double] CFreqs, Matrix[Double] CMeans, Matrix[Double] CVars ) {
  # mean and variance in target variable
  W = nrow(A)
  my = mean(Y)
  varY = moment(Y,2) * W/(W-1.0)

  # category-wise (frequencies, means, variances)
  CFreqs1 = aggregate(target=Y, groups=A, fn="count")
  # keep only categories that actually occur in A
  present_domain_vals_mat = removeEmpty(target=diag(1-ppred(CFreqs1, 0, "==")), margin="rows")
  CFreqs = present_domain_vals_mat %*% CFreqs1
  CMeans = present_domain_vals_mat %*% aggregate(target=Y, groups=A, fn="mean")
  CVars = present_domain_vals_mat %*% aggregate(target=Y, groups=A, fn="variance")

  # number of categories
  R = nrow(CFreqs)
  df1 = R-1
  df2 = W-R

  anova_num = sum( (CFreqs*(CMeans-my)^2) )/(R-1)
  anova_den = sum( (CFreqs-1)*CVars )/(W-R)
  AnovaF = anova_num/anova_den
  pVal = pf(target=AnovaF, df1=df1, df2=df2, lower.tail=FALSE)
}
license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -# Main starts here ----------------------------------------------------------------------------------------------------------- - -# input data set -D = load($1) - -# label attr id (must be a valid index > 0) -label_index = $2 - -# feature attributes, column vector of indices -feature_indices = load($3) - -# can be either 1 (scale) or 0 (categorical) -label_measurement_level = $4 - -# measurement levels for features, 0/1 column vector -feature_measurement_levels = read($5) - -sz = ncol(D) - -# store for pvalues and pearson's r -stats = full(0, rows=sz, cols=1) -# store for type of test performed: 1 is chi-sq, 2 is ftest, 3 is pearson's -tests = full(0, rows=sz, cols=1) -# store for covariances used to compute pearson's r -covariances = full(0, rows=sz, cols=1) -# store for standard deviations used to compute pearson's r -standard_deviations = full(0, rows=sz, cols=1) - -labels = D[,label_index] - -labelCorrection = 0 -if(label_measurement_level == 1): - numLabels = nrow(labels) - cmLabels = moment(labels,2) - stdLabels = sqrt(cmLabels * (numLabels/(numLabels-1.0)) ) - standard_deviations[label_index,1] = stdLabels -else: - labelCorrection = 1 - min(labels) - labels = labels + labelCorrection - -mx = 
colMaxs(D) -mn = colMins(D) -num_distinct_values = mx-mn+1 -max_num_distinct_values = 0 -for(i1 in 1:nrow(feature_indices)): - feature_index1 = castAsScalar(feature_indices[i1,1]) - num = castAsScalar(num_distinct_values[1,feature_index1]) - if(castAsScalar(feature_measurement_levels[i1,1]) == 0 & num >= max_num_distinct_values): - max_num_distinct_values = num -distinct_label_values = full(0, rows=1, cols=1) -contingencyTableSz = 1 -maxNumberOfGroups = 1 -if(max_num_distinct_values != 0): - maxNumberOfGroups = max_num_distinct_values -if(label_measurement_level==0): - distinct_label_values = aggregate(target=labels, groups=labels, fn="count") - if(max_num_distinct_values != 0): - contingencyTableSz = max_num_distinct_values*nrow(distinct_label_values) - maxNumberOfGroups = max(maxNumberOfGroups, nrow(distinct_label_values)) -# store for contingency table cell values -contingencyTablesCounts = full(0, rows=sz, cols=contingencyTableSz) -# store for contingency table label(row) assignments -contingencyTablesLabelValues = full(0, rows=sz, cols=contingencyTableSz) -# store for contingency table feature(col) assignments -contingencyTablesFeatureValues = full(0, rows=sz, cols=contingencyTableSz) -# store for distinct values -featureValues = full(0, rows=sz, cols=maxNumberOfGroups) -# store for counts of distinct values -featureCounts = full(0, rows=sz, cols=maxNumberOfGroups) -# store for group means -featureMeans = full(0, rows=sz, cols=maxNumberOfGroups) -# store for group standard deviations -featureSTDs = full(0, rows=sz, cols=maxNumberOfGroups) - -if(label_measurement_level == 0): - featureCounts[label_index,1:nrow(distinct_label_values)] = transpose(distinct_label_values) - parfor(i2 in 1:nrow(distinct_label_values)): - featureValues[label_index,i2] = i2-labelCorrection - -parfor(i3 in 1:nrow(feature_indices), check=0): - feature_index2 = castAsScalar(feature_indices[i3,1]) - feature_measurement_level = castAsScalar(feature_measurement_levels[i3,1]) - - feature = 
D[,feature_index2] - - if(feature_measurement_level == 0): - featureCorrection = 1 - min(feature) - feature = feature + featureCorrection - - if(label_measurement_level == feature_measurement_level): - # categorical-categorical - tests[feature_index2,1] = 1 - [pVal, contingencyTable, rowMarginals, colMarginals] = bivar_cc(labels, feature) - stats[feature_index2,1] = pVal - - sz3=1 - if(1==1): - sz3 = nrow(contingencyTable)*ncol(contingencyTable) - contingencyTableLabelValues = full(0, rows=1, cols=sz3) - contingencyTableFeatureValues = full(0, rows=1, cols=sz3) +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +#------------------------------------------------------------- + +# Main starts here ----------------------------------------------------------------------------------------------------------- + +# input data set +D = load($1) + +# label attr id (must be a valid index > 0) +label_index = $2 + +# feature attributes, column vector of indices +feature_indices = load($3) + +# can be either 1 (scale) or 0 (categorical) +label_measurement_level = $4 + +# measurement levels for features, 0/1 column vector +feature_measurement_levels = read($5) + +sz = ncol(D) + +# store for pvalues and pearson's r +stats = full(0, rows=sz, cols=1) +# store for type of test performed: 1 is chi-sq, 2 is ftest, 3 is pearson's +tests = full(0, rows=sz, cols=1) +# store for covariances used to compute pearson's r +covariances = full(0, rows=sz, cols=1) +# store for standard deviations used to compute pearson's r +standard_deviations = full(0, rows=sz, cols=1) + +labels = D[,label_index] + +labelCorrection = 0 +if(label_measurement_level == 1): + numLabels = nrow(labels) + cmLabels = moment(labels,2) + stdLabels = sqrt(cmLabels * (numLabels/(numLabels-1.0)) ) + standard_deviations[label_index,1] = stdLabels +else: + labelCorrection = 1 - min(labels) + labels = labels + labelCorrection + +mx = colMaxs(D) +mn = colMins(D) +num_distinct_values = mx-mn+1 +max_num_distinct_values = 0 +for(i1 in 1:nrow(feature_indices)): + feature_index1 = castAsScalar(feature_indices[i1,1]) + num = castAsScalar(num_distinct_values[1,feature_index1]) + if(castAsScalar(feature_measurement_levels[i1,1]) == 0 & num >= max_num_distinct_values): + max_num_distinct_values = num +distinct_label_values = full(0, rows=1, cols=1) +contingencyTableSz = 1 +maxNumberOfGroups = 1 +if(max_num_distinct_values != 0): + maxNumberOfGroups = max_num_distinct_values +if(label_measurement_level==0): + distinct_label_values = aggregate(target=labels, groups=labels, fn="count") + if(max_num_distinct_values != 0): + contingencyTableSz = 
max_num_distinct_values*nrow(distinct_label_values) + maxNumberOfGroups = max(maxNumberOfGroups, nrow(distinct_label_values)) +# store for contingency table cell values +contingencyTablesCounts = full(0, rows=sz, cols=contingencyTableSz) +# store for contingency table label(row) assignments +contingencyTablesLabelValues = full(0, rows=sz, cols=contingencyTableSz) +# store for contingency table feature(col) assignments +contingencyTablesFeatureValues = full(0, rows=sz, cols=contingencyTableSz) +# store for distinct values +featureValues = full(0, rows=sz, cols=maxNumberOfGroups) +# store for counts of distinct values +featureCounts = full(0, rows=sz, cols=maxNumberOfGroups) +# store for group means +featureMeans = full(0, rows=sz, cols=maxNumberOfGroups) +# store for group standard deviations +featureSTDs = full(0, rows=sz, cols=maxNumberOfGroups) + +if(label_measurement_level == 0): + featureCounts[label_index,1:nrow(distinct_label_values)] = transpose(distinct_label_values) + parfor(i2 in 1:nrow(distinct_label_values)): + featureValues[label_index,i2] = i2-labelCorrection + +parfor(i3 in 1:nrow(feature_indices), check=0): + feature_index2 = castAsScalar(feature_indices[i3,1]) + feature_measurement_level = castAsScalar(feature_measurement_levels[i3,1]) + + feature = D[,feature_index2] + + if(feature_measurement_level == 0): + featureCorrection = 1 - min(feature) + feature = feature + featureCorrection + + if(label_measurement_level == feature_measurement_level): + # categorical-categorical + tests[feature_index2,1] = 1 + [pVal, contingencyTable, rowMarginals, colMarginals] = bivar_cc(labels, feature) + stats[feature_index2,1] = pVal + + sz3=1 + if(1==1): + sz3 = nrow(contingencyTable)*ncol(contingencyTable) + contingencyTableLabelValues = full(0, rows=1, cols=sz3) + contingencyTableFeatureValues = full(0, rows=1, cols=sz3) - parfor(i4 in 1:nrow(contingencyTable), check=0): - parfor(j in 1:ncol(contingencyTable), check=0): - contingencyTableLabelValues[1, 
ncol(contingencyTable)*(i4-1)+j] = i4-labelCorrection - contingencyTableFeatureValues[1, ncol(contingencyTable)*(i4-1)+j] = j-featureCorrection + parfor(i4 in 1:nrow(contingencyTable), check=0): + parfor(j in 1:ncol(contingencyTable), check=0): + contingencyTableLabelValues[1, ncol(contingencyTable)*(i4-1)+j] = i4-labelCorrection + contingencyTableFeatureValues[1, ncol(contingencyTable)*(i4-1)+j] = j-featureCorrection contingencyTableCounts = contingencyTable.reshape(rows=1, cols=sz3) - contingencyTablesCounts[feature_index2,1:sz3] = contingencyTableCounts - - contingencyTablesLabelValues[feature_index2,1:sz3] = contingencyTableLabelValues - contingencyTablesFeatureValues[feature_index2,1:sz3] = contingencyTableFeatureValues - - featureCounts[feature_index2,1:ncol(colMarginals)] = colMarginals - parfor(i5 in 1:ncol(colMarginals), check=0): - featureValues[feature_index2,i5] = i5-featureCorrection - else: - # label is scale, feature is categorical - tests[feature_index2,1] = 2 - [pVal, frequencies, means, variances] = bivar_sc(labels, feature) - stats[feature_index2,1] = pVal - featureCounts[feature_index2,1:nrow(frequencies)] = transpose(frequencies) - parfor(i6 in 1:nrow(frequencies), check=0): - featureValues[feature_index2,i6] = i6 - featureCorrection - featureMeans[feature_index2,1:nrow(means)] = transpose(means) - featureSTDs[feature_index2,1:nrow(variances)] = transpose(sqrt(variances)) - else: - if(label_measurement_level == feature_measurement_level): - # scale-scale - tests[feature_index2,1] = 3 - [r, covariance, stdX, stdY] = bivar_ss(labels, feature) - stats[feature_index2,1] = r - covariances[feature_index2,1] = covariance - standard_deviations[feature_index2,1] = stdY - else: - # label is categorical, feature is scale - tests[feature_index2,1] = 2 - [pVal, frequencies, means, variances] = bivar_sc(feature, labels) - stats[feature_index2,1] = pVal - featureMeans[feature_index2,1:nrow(means)] = transpose(means) - 
featureSTDs[feature_index2,1:nrow(variances)] = transpose(sqrt(variances)) - # end if(feature_measurement_level == 0) -# end parfor(i3 in 1:nrow(feature_indices), check=0) - -save(stats, $6, format="text") -save(tests, $7, format="text") -save(covariances, $8, format="text") -save(standard_deviations, $9, format="text") -save(contingencyTablesCounts, $10, format="text") -save(contingencyTablesLabelValues, $11, format="text") -save(contingencyTablesFeatureValues, $12, format="text") -save(featureValues, $13, format="text") -save(featureCounts, $14, format="text") -save(featureMeans, $15, format="text") -save(featureSTDs, $16, format="text") - -# ----------------------------------------------------------------------------------------------------------- - -def bivar_ss(X:matrix[float], Y:matrix[float]) -> (R:float, covXY:float, sigmaX:float, sigmaY:float): - # Unweighted co-variance - covXY = cov(X,Y) - - # compute standard deviations for both X and Y by computing 2^nd central moment - W = nrow(X) - m2X = moment(X,2) - m2Y = moment(Y,2) - sigmaX = sqrt(m2X * (W/(W-1.0)) ) - sigmaY = sqrt(m2Y * (W/(W-1.0)) ) - - # Pearson's R - R = covXY / (sigmaX*sigmaY) - -# ----------------------------------------------------------------------------------------------------------- - -def bivar_cc(A:matrix[float], B:matrix[float]) -> (pval:float, contingencyTable:matrix[float], rowMarginals:matrix[float], colMarginals:matrix[float]): - # Contingency Table - FF = table(A,B) - - tmp = removeEmpty(target=FF, axis=0) - F = removeEmpty(target=tmp, axis=1) - - # Chi-Squared - W = sum(F) - r = rowSums(F) - c = colSums(F) - E = (dot(r, c))/W - E = ppred(E, 0, "==")*0.0001 + E - T = (F-E)**2/E - chi_squared = sum(T) - # compute p-value - degFreedom = (nrow(F)-1)*(ncol(F)-1) - pValue = pchisq(target=chi_squared, df=degFreedom, lower.tail=False) - - # Assign return values - pval = pValue - contingencyTable = F - rowMarginals = r - colMarginals = c - -# 
----------------------------------------------------------------------------------------------------------- - -# Y points to SCALE variable -# A points to CATEGORICAL variable -def bivar_sc(Y:matrix[float], A:matrix[float]) -> (pVal:float, CFreqs:matrix[float], CMeans:matrix[float], CVars:matrix[float]): - # mean and variance in target variable - W = nrow(A) - my = mean(Y) - varY = moment(Y,2) * W/(W-1.0) - - # category-wise (frequencies, means, variances) - CFreqs1 = aggregate(target=Y, groups=A, fn="count") - present_domain_vals_mat = removeEmpty(target=diag(1-ppred(CFreqs1, 0, "==")), axis=0) - CFreqs = dot(present_domain_vals_mat, CFreqs1) - - CMeans = dot(present_domain_vals_mat, aggregate(target=Y, groups=A, fn="mean")) - CVars = dot(present_domain_vals_mat, aggregate(target=Y, groups=A, fn="variance")) - - # number of categories - R = nrow(CFreqs) - df1 = R-1 - df2 = W-R - - anova_num = sum( (CFreqs*(CMeans-my)**2) )/(R-1) - anova_den = sum( (CFreqs-1)*CVars )/(W-R) - AnovaF = anova_num/anova_den - pVal = pf(target=AnovaF, df1=df1, df2=df2, lower.tail=False) - + contingencyTablesCounts[feature_index2,1:sz3] = contingencyTableCounts + + contingencyTablesLabelValues[feature_index2,1:sz3] = contingencyTableLabelValues + contingencyTablesFeatureValues[feature_index2,1:sz3] = contingencyTableFeatureValues + + featureCounts[feature_index2,1:ncol(colMarginals)] = colMarginals + parfor(i5 in 1:ncol(colMarginals), check=0): + featureValues[feature_index2,i5] = i5-featureCorrection + else: + # label is scale, feature is categorical + tests[feature_index2,1] = 2 + [pVal, frequencies, means, variances] = bivar_sc(labels, feature) + stats[feature_index2,1] = pVal + featureCounts[feature_index2,1:nrow(frequencies)] = transpose(frequencies) + parfor(i6 in 1:nrow(frequencies), check=0): + featureValues[feature_index2,i6] = i6 - featureCorrection + featureMeans[feature_index2,1:nrow(means)] = transpose(means) + featureSTDs[feature_index2,1:nrow(variances)] = 
transpose(sqrt(variances)) + else: + if(label_measurement_level == feature_measurement_level): + # scale-scale + tests[feature_index2,1] = 3 + [r, covariance, stdX, stdY] = bivar_ss(labels, feature) + stats[feature_index2,1] = r + covariances[feature_index2,1] = covariance + standard_deviations[feature_index2,1] = stdY + else: + # label is categorical, feature is scale + tests[feature_index2,1] = 2 + [pVal, frequencies, means, variances] = bivar_sc(feature, labels) + stats[feature_index2,1] = pVal + featureMeans[feature_index2,1:nrow(means)] = transpose(means) + featureSTDs[feature_index2,1:nrow(variances)] = transpose(sqrt(variances)) + # end if(feature_measurement_level == 0) +# end parfor(i3 in 1:nrow(feature_indices), check=0) + +save(stats, $6, format="text") +save(tests, $7, format="text") +save(covariances, $8, format="text") +save(standard_deviations, $9, format="text") +save(contingencyTablesCounts, $10, format="text") +save(contingencyTablesLabelValues, $11, format="text") +save(contingencyTablesFeatureValues, $12, format="text") +save(featureValues, $13, format="text") +save(featureCounts, $14, format="text") +save(featureMeans, $15, format="text") +save(featureSTDs, $16, format="text") + +# ----------------------------------------------------------------------------------------------------------- + +def bivar_ss(X:matrix[float], Y:matrix[float]) -> (R:float, covXY:float, sigmaX:float, sigmaY:float): + # Unweighted co-variance + covXY = cov(X,Y) + + # compute standard deviations for both X and Y by computing 2^nd central moment + W = nrow(X) + m2X = moment(X,2) + m2Y = moment(Y,2) + sigmaX = sqrt(m2X * (W/(W-1.0)) ) + sigmaY = sqrt(m2Y * (W/(W-1.0)) ) + + # Pearson's R + R = covXY / (sigmaX*sigmaY) + +# ----------------------------------------------------------------------------------------------------------- + +def bivar_cc(A:matrix[float], B:matrix[float]) -> (pval:float, contingencyTable:matrix[float], rowMarginals:matrix[float], 
colMarginals:matrix[float]): + # Contingency Table + FF = table(A,B) + + tmp = removeEmpty(target=FF, axis=0) + F = removeEmpty(target=tmp, axis=1) + + # Chi-Squared + W = sum(F) + r = rowSums(F) + c = colSums(F) + E = (dot(r, c))/W + E = ppred(E, 0, "==")*0.0001 + E + T = (F-E)**2/E + chi_squared = sum(T) + # compute p-value + degFreedom = (nrow(F)-1)*(ncol(F)-1) + pValue = pchisq(target=chi_squared, df=degFreedom, lower.tail=False) + + # Assign return values + pval = pValue + contingencyTable = F + rowMarginals = r + colMarginals = c + +# ----------------------------------------------------------------------------------------------------------- + +# Y points to SCALE variable +# A points to CATEGORICAL variable +def bivar_sc(Y:matrix[float], A:matrix[float]) -> (pVal:float, CFreqs:matrix[float], CMeans:matrix[float], CVars:matrix[float]): + # mean and variance in target variable + W = nrow(A) + my = mean(Y) + varY = moment(Y,2) * W/(W-1.0) + + # category-wise (frequencies, means, variances) + CFreqs1 = aggregate(target=Y, groups=A, fn="count") + present_domain_vals_mat = removeEmpty(target=diag(1-ppred(CFreqs1, 0, "==")), axis=0) + CFreqs = dot(present_domain_vals_mat, CFreqs1) + + CMeans = dot(present_domain_vals_mat, aggregate(target=Y, groups=A, fn="mean")) + CVars = dot(present_domain_vals_mat, aggregate(target=Y, groups=A, fn="variance")) + + # number of categories + R = nrow(CFreqs) + df1 = R-1 + df2 = W-R + + anova_num = sum( (CFreqs*(CMeans-my)**2) )/(R-1) + anova_den = sum( (CFreqs-1)*CVars )/(W-R) + AnovaF = anova_num/anova_den + pVal = pf(target=AnovaF, df1=df1, df2=df2, lower.tail=False) + http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/naive-bayes-parfor/naive-bayes.R ---------------------------------------------------------------------- diff --git a/src/test/scripts/applications/naive-bayes-parfor/naive-bayes.R b/src/test/scripts/applications/naive-bayes-parfor/naive-bayes.R index 
dc65b8a..a3ca47a 100644 --- a/src/test/scripts/applications/naive-bayes-parfor/naive-bayes.R +++ b/src/test/scripts/applications/naive-bayes-parfor/naive-bayes.R @@ -1,71 +1,71 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# -#------------------------------------------------------------- - -args <- commandArgs(TRUE) - -library("Matrix") - -D = as.matrix(readMM(paste(args[1], "X.mtx", sep=""))) -C = as.matrix(readMM(paste(args[1], "Y.mtx", sep=""))) - -# reading input args -numClasses = as.integer(args[2]); -laplace_correction = as.double(args[3]); - -numRows = nrow(D) -numFeatures = ncol(D) - -# Compute conditionals - -# Compute the feature counts for each class -classFeatureCounts = matrix(0, numClasses, numFeatures) -for (i in 1:numFeatures) { - Col = D[,i] - classFeatureCounts[,i] = aggregate(as.vector(Col), by=list(as.vector(C)), FUN=sum)[,2]; -} - -# Compute the total feature count for each class -# and add the number of features to this sum -# for subsequent regularization (Laplace's rule) -classSums = rowSums(classFeatureCounts) + numFeatures*laplace_correction - -# Compute class conditional probabilities -ones = matrix(1, 1, numFeatures) -repClassSums = classSums %*% ones; -class_conditionals = (classFeatureCounts + laplace_correction) / repClassSums; - -# Compute class priors -class_counts = aggregate(as.vector(C), by=list(as.vector(C)), FUN=length)[,2] -class_prior = class_counts / numRows; - -# Compute accuracy on training set -ones = matrix(1, numRows, 1) -D_w_ones = cbind(D, ones) -model = cbind(class_conditionals, class_prior) -log_probs = D_w_ones %*% t(log(model)) -pred = max.col(log_probs,ties.method="last"); -acc = sum(pred == C) / numRows * 100 - -print(paste("Training Accuracy (%): ", acc, sep="")) - -# write out the model -writeMM(as(class_prior, "CsparseMatrix"), paste(args[4], "prior", sep="")); -writeMM(as(class_conditionals, "CsparseMatrix"), paste(args[4], "conditionals", sep="")); +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +args <- commandArgs(TRUE) + +library("Matrix") + +D = as.matrix(readMM(paste(args[1], "X.mtx", sep=""))) +C = as.matrix(readMM(paste(args[1], "Y.mtx", sep=""))) + +# reading input args +numClasses = as.integer(args[2]); +laplace_correction = as.double(args[3]); + +numRows = nrow(D) +numFeatures = ncol(D) + +# Compute conditionals + +# Compute the feature counts for each class +classFeatureCounts = matrix(0, numClasses, numFeatures) +for (i in 1:numFeatures) { + Col = D[,i] + classFeatureCounts[,i] = aggregate(as.vector(Col), by=list(as.vector(C)), FUN=sum)[,2]; +} + +# Compute the total feature count for each class +# and add the number of features to this sum +# for subsequent regularization (Laplace's rule) +classSums = rowSums(classFeatureCounts) + numFeatures*laplace_correction + +# Compute class conditional probabilities +ones = matrix(1, 1, numFeatures) +repClassSums = classSums %*% ones; +class_conditionals = (classFeatureCounts + laplace_correction) / repClassSums; + +# Compute class priors +class_counts = aggregate(as.vector(C), by=list(as.vector(C)), FUN=length)[,2] +class_prior = class_counts / numRows; + +# Compute accuracy on training set +ones = matrix(1, numRows, 1) +D_w_ones = cbind(D, ones) +model = 
cbind(class_conditionals, class_prior) +log_probs = D_w_ones %*% t(log(model)) +pred = max.col(log_probs,ties.method="last"); +acc = sum(pred == C) / numRows * 100 + +print(paste("Training Accuracy (%): ", acc, sep="")) + +# write out the model +writeMM(as(class_prior, "CsparseMatrix"), paste(args[4], "prior", sep="")); +writeMM(as(class_conditionals, "CsparseMatrix"), paste(args[4], "conditionals", sep=""));
