http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/parfor/parfor_naive-bayes.R ---------------------------------------------------------------------- diff --git a/src/test/scripts/applications/parfor/parfor_naive-bayes.R b/src/test/scripts/applications/parfor/parfor_naive-bayes.R index f455c2c..cb0d00f 100644 --- a/src/test/scripts/applications/parfor/parfor_naive-bayes.R +++ b/src/test/scripts/applications/parfor/parfor_naive-bayes.R @@ -1,61 +1,61 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -args <- commandArgs(TRUE) -options(digits=22) - -library("Matrix") - -D = as.matrix(readMM(paste(args[1], "D.mtx", sep=""))) -C = as.matrix(readMM(paste(args[1], "C.mtx", sep=""))) - -# reading input args -numClasses = as.integer(args[2]); -laplace_correction = 1 - -numRows = nrow(D) -numFeatures = ncol(D) - -# Compute conditionals - -# Compute the feature counts for each class -classFeatureCounts = matrix(0, numClasses, numFeatures) -for (i in 1:numFeatures) { - Col = D[,i] - classFeatureCounts[,i] = aggregate(as.vector(Col), by=list(as.vector(C)), FUN=sum)[,2]; -} - -# Compute the total feature count for each class -# and add the number of features to this sum -# for subsequent regularization (Laplace's rule) -classSums = rowSums(classFeatureCounts) + numFeatures*laplace_correction - -# Compute class conditional probabilities -repClassSums = classSums %*% matrix(1,1,numFeatures); -class_conditionals = (classFeatureCounts + laplace_correction) / repClassSums; - -# Compute class priors -class_counts = aggregate(as.vector(C), by=list(as.vector(C)), FUN=length)[,2] -class_prior = class_counts / numRows; - -# write out the model -writeMM(as(class_prior, "CsparseMatrix"), paste(args[3], "class_prior", sep="")); -writeMM(as(class_conditionals, "CsparseMatrix"), paste(args[3], "class_conditionals", sep="")); +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +args <- commandArgs(TRUE) +options(digits=22) + +library("Matrix") + +D = as.matrix(readMM(paste(args[1], "D.mtx", sep=""))) +C = as.matrix(readMM(paste(args[1], "C.mtx", sep=""))) + +# reading input args +numClasses = as.integer(args[2]); +laplace_correction = 1 + +numRows = nrow(D) +numFeatures = ncol(D) + +# Compute conditionals + +# Compute the feature counts for each class +classFeatureCounts = matrix(0, numClasses, numFeatures) +for (i in 1:numFeatures) { + Col = D[,i] + classFeatureCounts[,i] = aggregate(as.vector(Col), by=list(as.vector(C)), FUN=sum)[,2]; +} + +# Compute the total feature count for each class +# and add the number of features to this sum +# for subsequent regularization (Laplace's rule) +classSums = rowSums(classFeatureCounts) + numFeatures*laplace_correction + +# Compute class conditional probabilities +repClassSums = classSums %*% matrix(1,1,numFeatures); +class_conditionals = (classFeatureCounts + laplace_correction) / repClassSums; + +# Compute class priors +class_counts = aggregate(as.vector(C), by=list(as.vector(C)), FUN=length)[,2] +class_prior = class_counts / numRows; + +# write out the model +writeMM(as(class_prior, "CsparseMatrix"), paste(args[3], "class_prior", sep="")); +writeMM(as(class_conditionals, "CsparseMatrix"), paste(args[3], "class_conditionals", sep=""));
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/parfor/parfor_univariate.R ---------------------------------------------------------------------- diff --git a/src/test/scripts/applications/parfor/parfor_univariate.R b/src/test/scripts/applications/parfor/parfor_univariate.R index cb5dfb1..14f9f95 100644 --- a/src/test/scripts/applications/parfor/parfor_univariate.R +++ b/src/test/scripts/applications/parfor/parfor_univariate.R @@ -1,155 +1,155 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -args <- commandArgs(TRUE) -options(digits=22) - -library("Matrix") -library("moments") - -A1 <- readMM(paste(args[1], "D.mtx", sep="")) -K1 <- readMM(paste(args[1], "K.mtx", sep="")) -A <- as.matrix(A1); -K <- as.matrix(K1); -maxC = args[2]; - - -# number of features/attributes -n = ncol(A); - -# number of data records -m = nrow(A); - -# number of statistics -numBaseStats = 17; # (14 scale stats, 3 categorical stats) - -max_kind = max(K); - -# matrices to store computed statistics -baseStats = array(0,dim=c(numBaseStats,n)); - -if (maxC > 0) { - countsArray = array(0,dim=c(maxC,n)); -} - -for(i in 1:n) { - - # project out the i^th column - F = as.matrix(A[,i]); - - kind = K[1,i]; - - if ( kind == 1 ) { - print("scale"); - # compute SCALE statistics on the projected column - minimum = min(F); - maximum = max(F); - rng = maximum - minimum; - - mu = mean(F); - m2 = moment(F, order=2, central=TRUE); - m3 = moment(F, order=3, central=TRUE); - m4 = moment(F, order=4, central=TRUE); - - var = m/(m-1.0)*m2; - - std_dev = sqrt(var); - se = std_dev/sqrt(m); - cv = std_dev/mu; - - g1 = m3/(std_dev^3); - g2 = m4/(std_dev^4) - 3; - #se_g1=sqrt( 6*m*(m-1.0) / ((m-2.0)*(m+1.0)*(m+3.0)) ); - se_g1=sqrt( (6/(m-2.0)) * (m/(m+1.0)) * ((m-1.0)/(m+3.0)) ); - - #se_g2= sqrt( (4*(m^2-1)*se_g1^2)/((m+5.0)*(m-3.0)) ); - se_g2=sqrt( (4/(m+5.0)) * ((m^2-1)/(m-3.0)) * se_g1^2 ); - - md = median(F); #quantile(F, 0.5, type = 1); - - S = sort(F) - q25d=m*0.25 - q75d=m*0.75 - q25i=ceiling(q25d) - q75i=ceiling(q75d) - - iqm = sum(S[(q25i+1):q75i]) - iqm = iqm + (q25i-q25d)*S[q25i] - (q75i-q75d)*S[q75i] - iqm = iqm/(m*0.5) - - #iqm = mean( subset(F, F>quantile(F,1/4,type = 1) & F<=quantile(F,3/4,type = 1) ) ) - - # place the computed statistics in output matrices - baseStats[1,i] = minimum; - baseStats[2,i] = maximum; - baseStats[3,i] = rng; - - baseStats[4,i] = mu; - baseStats[5,i] = var; - baseStats[6,i] = std_dev; - baseStats[7,i] = se; - baseStats[8,i] = cv; - - baseStats[9,i] = g1; - baseStats[10,i] = g2; - baseStats[11,i] = se_g1; - baseStats[12,i] = se_g2; - - baseStats[13,i] = md; - baseStats[14,i] = iqm; - } - else { - if (kind == 2 | kind == 3) { - print("categorical"); - - # check if the categorical column has valid values - minF = min(F); - if (minF <=0) { - print("ERROR: Categorical attributes can only take values starting from 1."); - } - else { - # compute CATEGORICAL statistics on the projected column - cat_counts = table(F); # counts for each category - num_cat = nrow(cat_counts); # number of categories - - mx = max(t(as.vector(cat_counts))) - mode = which(cat_counts == mx) - - numModes = length(cat_counts[ cat_counts==mx ]); - - # place the computed statistics in output matrices - baseStats[15,i] = num_cat; - baseStats[16,i] = mode; - baseStats[17,i] = numModes; - - if (max_kind > 1) { - countsArray[1:length(cat_counts),i] = cat_counts; - } - } - } - } -} - -writeMM(as(baseStats, "CsparseMatrix"), paste(args[3], "base.stats", sep="")); -if (max_kind > 1) { - writeMM(as(countsArray, "CsparseMatrix"), paste(args[3], "categorical.counts", sep="")); -} - +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +args <- commandArgs(TRUE) +options(digits=22) + +library("Matrix") +library("moments") + +A1 <- readMM(paste(args[1], "D.mtx", sep="")) +K1 <- readMM(paste(args[1], "K.mtx", sep="")) +A <- as.matrix(A1); +K <- as.matrix(K1); +maxC = args[2]; + + +# number of features/attributes +n = ncol(A); + +# number of data records +m = nrow(A); + +# number of statistics +numBaseStats = 17; # (14 scale stats, 3 categorical stats) + +max_kind = max(K); + +# matrices to store computed statistics +baseStats = array(0,dim=c(numBaseStats,n)); + +if (maxC > 0) { + countsArray = array(0,dim=c(maxC,n)); +} + +for(i in 1:n) { + + # project out the i^th column + F = as.matrix(A[,i]); + + kind = K[1,i]; + + if ( kind == 1 ) { + print("scale"); + # compute SCALE statistics on the projected column + minimum = min(F); + maximum = max(F); + rng = maximum - minimum; + + mu = mean(F); + m2 = moment(F, order=2, central=TRUE); + m3 = moment(F, order=3, central=TRUE); + m4 = moment(F, order=4, central=TRUE); + + var = m/(m-1.0)*m2; + + std_dev = sqrt(var); + se = std_dev/sqrt(m); + cv = std_dev/mu; + + g1 = m3/(std_dev^3); + g2 = m4/(std_dev^4) - 3; + #se_g1=sqrt( 6*m*(m-1.0) / ((m-2.0)*(m+1.0)*(m+3.0)) ); + se_g1=sqrt( (6/(m-2.0)) * (m/(m+1.0)) * ((m-1.0)/(m+3.0)) ); + + #se_g2= sqrt( (4*(m^2-1)*se_g1^2)/((m+5.0)*(m-3.0)) ); + se_g2=sqrt( (4/(m+5.0)) * ((m^2-1)/(m-3.0)) * se_g1^2 ); + + md = median(F); #quantile(F, 0.5, type = 1); + + S = sort(F) + q25d=m*0.25 + q75d=m*0.75 + q25i=ceiling(q25d) + q75i=ceiling(q75d) + + iqm = sum(S[(q25i+1):q75i]) + iqm = iqm + (q25i-q25d)*S[q25i] - (q75i-q75d)*S[q75i] + iqm = iqm/(m*0.5) + + #iqm = mean( subset(F, F>quantile(F,1/4,type = 1) & F<=quantile(F,3/4,type = 1) ) ) + + # place the computed statistics in output matrices + baseStats[1,i] = minimum; + baseStats[2,i] = maximum; + baseStats[3,i] = rng; + + baseStats[4,i] = mu; + baseStats[5,i] = var; + baseStats[6,i] = std_dev; + baseStats[7,i] = se; + baseStats[8,i] = cv; + + baseStats[9,i] = g1; + baseStats[10,i] = g2; + baseStats[11,i] = se_g1; + baseStats[12,i] = se_g2; + + baseStats[13,i] = md; + baseStats[14,i] = iqm; + } + else { + if (kind == 2 | kind == 3) { + print("categorical"); + + # check if the categorical column has valid values + minF = min(F); + if (minF <=0) { + print("ERROR: Categorical attributes can only take values starting from 1."); + } + else { + # compute CATEGORICAL statistics on the projected column + cat_counts = table(F); # counts for each category + num_cat = nrow(cat_counts); # number of categories + + mx = max(t(as.vector(cat_counts))) + mode = which(cat_counts == mx) + + numModes = length(cat_counts[ cat_counts==mx ]); + + # place the computed statistics in output matrices + baseStats[15,i] = num_cat; + baseStats[16,i] = mode; + baseStats[17,i] = numModes; + + if (max_kind > 1) { + countsArray[1:length(cat_counts),i] = cat_counts; + } + } + } + } +} + +writeMM(as(baseStats, "CsparseMatrix"), paste(args[3], "base.stats", sep="")); +if (max_kind > 1) { + writeMM(as(countsArray, "CsparseMatrix"), paste(args[3], "categorical.counts", sep="")); +} + http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/parfor/parfor_univariate0.dml ---------------------------------------------------------------------- diff --git a/src/test/scripts/applications/parfor/parfor_univariate0.dml b/src/test/scripts/applications/parfor/parfor_univariate0.dml index 061d4a0..2a6a9c5 100644 --- a/src/test/scripts/applications/parfor/parfor_univariate0.dml +++ b/src/test/scripts/applications/parfor/parfor_univariate0.dml @@ -1,166 +1,166 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -# -# DML Script to compute univariate statistics for all attributes -# in a given data set -# -# Three inputs: -# $1) A - input data -# $2) K - row matrix that denotes the "kind" for each -# attribute -# kind=1 for scale, kind=2 for nominal, -# kind=3 for ordinal -# $3) maxC - maximum number of categories in any categorical -# attribute -# -# One output: -# $4) output directory in which following three statistics -# files are created -# + base.stats - matrix with all 17 statistics (14 scale, -# 3 categorical) computed for all attributes -# + categorical.counts - matrix in which each column -# gives the category-wise counts for all categories in -# that attribute -# -# - -A = read($1); # data file -K = read($2); # attribute kind file -maxC = $3; # max number of categories in any categorical attribute - - -if (maxC < 0) { - print("ERROR: maximum number maxC of categories must be a positve value."); -} -else { - - - # number of features/attributes - n = ncol(A); - - # number of data records - m = nrow(A); - - # number of statistics - numBaseStats = 17; # (14 scale stats, 3 categorical stats) - - max_kind = max(K); - - # matrices to store computed statistics - baseStats = matrix(0, rows=numBaseStats, cols=n); - - if (maxC > 0) { - countsArray = matrix(0, rows=maxC, cols=n); - } - - for(i in 1:n) { - - # project out the i^th column - F = A[,i]; - - kind = castAsScalar(K[1,i]); - - if ( kind == 1 ) { - print("[" + i + "] Scale"); - # compute SCALE statistics on the projected column - minimum = min(F); - maximum = max(F); - rng = maximum - minimum; - - mu = mean(F); - m2 = moment(F, 2); - m3 = moment(F, 3); - m4 = moment(F, 4); - - var = m/(m-1.0)*m2; - std_dev = sqrt(var); - se = std_dev/sqrt(m); - cv = std_dev/mu; - - g1 = m3/(std_dev^3); - g2 = m4/(std_dev^4) - 3; - #se_g1=sqrt( 6*m*(m-1.0) / ((m-2.0)*(m+1.0)*(m+3.0)) ); - se_g1=sqrt( (6/(m-2.0)) * (m/(m+1.0)) * ((m-1.0)/(m+3.0)) ); - - #se_g2= sqrt( (4*(m^2-1)*se_g1^2)/((m+5.0)*(m-3.0)) ); - se_g2=sqrt( (4/(m+5.0)) * ((m^2-1)/(m-3.0)) * se_g1^2 ); - - md = median(F); #quantile(F, 0.5); - iqm = interQuartileMean(F); - - # place the computed statistics in output matrices - baseStats[1,i] = minimum; - baseStats[2,i] = maximum; - baseStats[3,i] = rng; - - baseStats[4,i] = mu; - baseStats[5,i] = var; - baseStats[6,i] = std_dev; - baseStats[7,i] = se; - baseStats[8,i] = cv; - - baseStats[9,i] = g1; - baseStats[10,i] = g2; - baseStats[11,i] = se_g1; - baseStats[12,i] = se_g2; - - baseStats[13,i] = md; - baseStats[14,i] = iqm; - } - else { - if (kind == 2 | kind == 3) { - print("[" + i + "] Categorical"); - - # check if the categorical column has valid values - minF = min(F); - if (minF <=0) { - print("ERROR: Categorical attributes can only take values starting from 1. Encountered a value " + minF + " in attribute " + i); - } - else { - # compute CATEGORICAL statistics on the projected column - cat_counts = table(F,1); # counts for each category - num_cat = nrow(cat_counts); # number of categories - - mode = rowIndexMax(t(cat_counts)); - mx = max(cat_counts) - modeArr = ppred(cat_counts, mx, "==") - numModes = sum(modeArr); - - # place the computed statistics in output matrices - baseStats[15,i] = num_cat; - baseStats[16,i] = mode; - baseStats[17,i] = numModes; - - if (max_kind > 1) { - countsArray[,i] = cat_counts; - } - } - } - } - } - - write(baseStats, $4+"/base.stats"); - if (max_kind > 1) { - write(countsArray, $4+"/categorical.counts"); - } - -} +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# +# DML Script to compute univariate statistics for all attributes +# in a given data set +# +# Three inputs: +# $1) A - input data +# $2) K - row matrix that denotes the "kind" for each +# attribute +# kind=1 for scale, kind=2 for nominal, +# kind=3 for ordinal +# $3) maxC - maximum number of categories in any categorical +# attribute +# +# One output: +# $4) output directory in which following three statistics +# files are created +# + base.stats - matrix with all 17 statistics (14 scale, +# 3 categorical) computed for all attributes +# + categorical.counts - matrix in which each column +# gives the category-wise counts for all categories in +# that attribute +# +# + +A = read($1); # data file +K = read($2); # attribute kind file +maxC = $3; # max number of categories in any categorical attribute + + +if (maxC < 0) { + print("ERROR: maximum number maxC of categories must be a positve value."); +} +else { + + + # number of features/attributes + n = ncol(A); + + # number of data records + m = nrow(A); + + # number of statistics + numBaseStats = 17; # (14 scale stats, 3 categorical stats) + + max_kind = max(K); + + # matrices to store computed statistics + baseStats = matrix(0, rows=numBaseStats, cols=n); + + if (maxC > 0) { + countsArray = matrix(0, rows=maxC, cols=n); + } + + for(i in 1:n) { + + # project out the i^th column + F = A[,i]; + + kind = castAsScalar(K[1,i]); + + if ( kind == 1 ) { + print("[" + i + "] Scale"); + # compute SCALE statistics on the projected column + minimum = min(F); + maximum = max(F); + rng = maximum - minimum; + + mu = mean(F); + m2 = moment(F, 2); + m3 = moment(F, 3); + m4 = moment(F, 4); + + var = m/(m-1.0)*m2; + std_dev = sqrt(var); + se = std_dev/sqrt(m); + cv = std_dev/mu; + + g1 = m3/(std_dev^3); + g2 = m4/(std_dev^4) - 3; + #se_g1=sqrt( 6*m*(m-1.0) / ((m-2.0)*(m+1.0)*(m+3.0)) ); + se_g1=sqrt( (6/(m-2.0)) * (m/(m+1.0)) * ((m-1.0)/(m+3.0)) ); + + #se_g2= sqrt( (4*(m^2-1)*se_g1^2)/((m+5.0)*(m-3.0)) ); + se_g2=sqrt( (4/(m+5.0)) * ((m^2-1)/(m-3.0)) * se_g1^2 ); + + md = median(F); #quantile(F, 0.5); + iqm = interQuartileMean(F); + + # place the computed statistics in output matrices + baseStats[1,i] = minimum; + baseStats[2,i] = maximum; + baseStats[3,i] = rng; + + baseStats[4,i] = mu; + baseStats[5,i] = var; + baseStats[6,i] = std_dev; + baseStats[7,i] = se; + baseStats[8,i] = cv; + + baseStats[9,i] = g1; + baseStats[10,i] = g2; + baseStats[11,i] = se_g1; + baseStats[12,i] = se_g2; + + baseStats[13,i] = md; + baseStats[14,i] = iqm; + } + else { + if (kind == 2 | kind == 3) { + print("[" + i + "] Categorical"); + + # check if the categorical column has valid values + minF = min(F); + if (minF <=0) { + print("ERROR: Categorical attributes can only take values starting from 1. Encountered a value " + minF + " in attribute " + i); + } + else { + # compute CATEGORICAL statistics on the projected column + cat_counts = table(F,1); # counts for each category + num_cat = nrow(cat_counts); # number of categories + + mode = rowIndexMax(t(cat_counts)); + mx = max(cat_counts) + modeArr = ppred(cat_counts, mx, "==") + numModes = sum(modeArr); + + # place the computed statistics in output matrices + baseStats[15,i] = num_cat; + baseStats[16,i] = mode; + baseStats[17,i] = numModes; + + if (max_kind > 1) { + countsArray[,i] = cat_counts; + } + } + } + } + } + + write(baseStats, $4+"/base.stats"); + if (max_kind > 1) { + write(countsArray, $4+"/categorical.counts"); + } + +} http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/parfor/parfor_univariate1.dml ---------------------------------------------------------------------- diff --git a/src/test/scripts/applications/parfor/parfor_univariate1.dml b/src/test/scripts/applications/parfor/parfor_univariate1.dml index e22fd86..1f120ef 100644 --- a/src/test/scripts/applications/parfor/parfor_univariate1.dml +++ b/src/test/scripts/applications/parfor/parfor_univariate1.dml @@ -1,166 +1,166 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -# -# DML Script to compute univariate statistics for all attributes -# in a given data set -# -# Three inputs: -# $1) A - input data -# $2) K - row matrix that denotes the "kind" for each -# attribute -# kind=1 for scale, kind=2 for nominal, -# kind=3 for ordinal -# $3) maxC - maximum number of categories in any categorical -# attribute -# -# One output: -# $4) output directory in which following three statistics -# files are created -# + base.stats - matrix with all 17 statistics (14 scale, -# 3 categorical) computed for all attributes -# + categorical.counts - matrix in which each column -# gives the category-wise counts for all categories in -# that attribute -# -# - -A = read($1); # data file -K = read($2); # attribute kind file -maxC = $3; # max number of categories in any categorical attribute - - -if (maxC < 0) { - print("ERROR: maximum number maxC of categories must be a positve value."); -} -else { - - - # number of features/attributes - n = ncol(A); - - # number of data records - m = nrow(A); - - # number of statistics - numBaseStats = 17; # (14 scale stats, 3 categorical stats) - - max_kind = max(K); - - # matrices to store computed statistics - baseStats = matrix(0, rows=numBaseStats, cols=n); - - if (maxC > 0) { - countsArray = matrix(0, rows=maxC, cols=n); - } - - parfor(i in 1:n, par=4, mode=LOCAL, check=0, opt=NONE) { - - # project out the i^th column - F = A[,i]; - - kind = castAsScalar(K[1,i]); - - if ( kind == 1 ) { - print("[" + i + "] Scale"); - # compute SCALE statistics on the projected column - minimum = min(F); - maximum = max(F); - rng = maximum - minimum; - - mu = mean(F); - m2 = moment(F, 2); - m3 = moment(F, 3); - m4 = moment(F, 4); - - var = m/(m-1.0)*m2; - std_dev = sqrt(var); - se = std_dev/sqrt(m); - cv = std_dev/mu; - - g1 = m3/(std_dev^3); - g2 = m4/(std_dev^4) - 3; - #se_g1=sqrt( 6*m*(m-1.0) / ((m-2.0)*(m+1.0)*(m+3.0)) ); - se_g1=sqrt( (6/(m-2.0)) * (m/(m+1.0)) * ((m-1.0)/(m+3.0)) ); - - #se_g2= sqrt( (4*(m^2-1)*se_g1^2)/((m+5.0)*(m-3.0)) ); - se_g2=sqrt( (4/(m+5.0)) * ((m^2-1)/(m-3.0)) * se_g1^2 ); - - md = median(F); #quantile(F, 0.5); - iqm = interQuartileMean(F); - - # place the computed statistics in output matrices - baseStats[1,i] = minimum; - baseStats[2,i] = maximum; - baseStats[3,i] = rng; - - baseStats[4,i] = mu; - baseStats[5,i] = var; - baseStats[6,i] = std_dev; - baseStats[7,i] = se; - baseStats[8,i] = cv; - - baseStats[9,i] = g1; - baseStats[10,i] = g2; - baseStats[11,i] = se_g1; - baseStats[12,i] = se_g2; - - baseStats[13,i] = md; - baseStats[14,i] = iqm; - } - else { - if (kind == 2 | kind == 3) { - print("[" + i + "] Categorical"); - - # check if the categorical column has valid values - minF = min(F); - if (minF <=0) { - print("ERROR: Categorical attributes can only take values starting from 1. Encountered a value " + minF + " in attribute " + i); - } - else { - # compute CATEGORICAL statistics on the projected column - cat_counts = table(F,1); # counts for each category - num_cat = nrow(cat_counts); # number of categories - - mode = rowIndexMax(t(cat_counts)); - mx = max(cat_counts) - modeArr = ppred(cat_counts, mx, "==") - numModes = sum(modeArr); - - # place the computed statistics in output matrices - baseStats[15,i] = num_cat; - baseStats[16,i] = mode; - baseStats[17,i] = numModes; - - if (max_kind > 1) { - countsArray[,i] = cat_counts; - } - } - } - } - } - - write(baseStats, $4+"/base.stats"); - if (max_kind > 1) { - write(countsArray, $4+"/categorical.counts"); - } - -} +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# +# DML Script to compute univariate statistics for all attributes +# in a given data set +# +# Three inputs: +# $1) A - input data +# $2) K - row matrix that denotes the "kind" for each +# attribute +# kind=1 for scale, kind=2 for nominal, +# kind=3 for ordinal +# $3) maxC - maximum number of categories in any categorical +# attribute +# +# One output: +# $4) output directory in which following three statistics +# files are created +# + base.stats - matrix with all 17 statistics (14 scale, +# 3 categorical) computed for all attributes +# + categorical.counts - matrix in which each column +# gives the category-wise counts for all categories in +# that attribute +# +# + +A = read($1); # data file +K = read($2); # attribute kind file +maxC = $3; # max number of categories in any categorical attribute + + +if (maxC < 0) { + print("ERROR: maximum number maxC of categories must be a positve value."); +} +else { + + + # number of features/attributes + n = ncol(A); + + # number of data records + m = nrow(A); + + # number of statistics + numBaseStats = 17; # (14 scale stats, 3 categorical stats) + + max_kind = max(K); + + # matrices to store computed statistics + baseStats = matrix(0, rows=numBaseStats, cols=n); + + if (maxC > 0) { + countsArray = matrix(0, rows=maxC, cols=n); + } + + parfor(i in 1:n, par=4, mode=LOCAL, check=0, opt=NONE) { + + # project out the i^th column + F = A[,i]; + + kind = castAsScalar(K[1,i]); + + if ( kind == 1 ) { + print("[" + i + "] Scale"); + # compute SCALE statistics on the projected column + minimum = min(F); + maximum = max(F); + rng = maximum - minimum; + + mu = mean(F); + m2 = moment(F, 2); + m3 = moment(F, 3); + m4 = moment(F, 4); + + var = m/(m-1.0)*m2; + std_dev = sqrt(var); + se = std_dev/sqrt(m); + cv = std_dev/mu; + + g1 = m3/(std_dev^3); + g2 = m4/(std_dev^4) - 3; + #se_g1=sqrt( 6*m*(m-1.0) / ((m-2.0)*(m+1.0)*(m+3.0)) ); + se_g1=sqrt( (6/(m-2.0)) * (m/(m+1.0)) * ((m-1.0)/(m+3.0)) ); + + #se_g2= sqrt( (4*(m^2-1)*se_g1^2)/((m+5.0)*(m-3.0)) ); + se_g2=sqrt( (4/(m+5.0)) * ((m^2-1)/(m-3.0)) * se_g1^2 ); + + md = median(F); #quantile(F, 0.5); + iqm = interQuartileMean(F); + + # place the computed statistics in output matrices + baseStats[1,i] = minimum; + baseStats[2,i] = maximum; + baseStats[3,i] = rng; + + baseStats[4,i] = mu; + baseStats[5,i] = var; + baseStats[6,i] = std_dev; + baseStats[7,i] = se; + baseStats[8,i] = cv; + + baseStats[9,i] = g1; + baseStats[10,i] = g2; + baseStats[11,i] = se_g1; + baseStats[12,i] = se_g2; + + baseStats[13,i] = md; + baseStats[14,i] = iqm; + } + else { + if (kind == 2 | kind == 3) { + print("[" + i + "] Categorical"); + + # check if the categorical column has valid values + minF = min(F); + if (minF <=0) { + print("ERROR: Categorical attributes can only take values starting from 1. Encountered a value " + minF + " in attribute " + i); + } + else { + # compute CATEGORICAL statistics on the projected column + cat_counts = table(F,1); # counts for each category + num_cat = nrow(cat_counts); # number of categories + + mode = rowIndexMax(t(cat_counts)); + mx = max(cat_counts) + modeArr = ppred(cat_counts, mx, "==") + numModes = sum(modeArr); + + # place the computed statistics in output matrices + baseStats[15,i] = num_cat; + baseStats[16,i] = mode; + baseStats[17,i] = numModes; + + if (max_kind > 1) { + countsArray[,i] = cat_counts; + } + } + } + } + } + + write(baseStats, $4+"/base.stats"); + if (max_kind > 1) { + write(countsArray, $4+"/categorical.counts"); + } + +} http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/parfor/parfor_univariate4.dml ---------------------------------------------------------------------- diff --git a/src/test/scripts/applications/parfor/parfor_univariate4.dml b/src/test/scripts/applications/parfor/parfor_univariate4.dml index 1ebfcbd..8953c64 100644 --- a/src/test/scripts/applications/parfor/parfor_univariate4.dml +++ b/src/test/scripts/applications/parfor/parfor_univariate4.dml @@ -1,166 +1,166 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -# -# DML Script to compute univariate statistics for all attributes -# in a given data set -# -# Three inputs: -# $1) A - input data -# $2) K - row matrix that denotes the "kind" for each -# attribute -# kind=1 for scale, kind=2 for nominal, -# kind=3 for ordinal -# $3) maxC - maximum number of categories in any categorical -# attribute -# -# One output: -# $4) output directory in which following three statistics -# files are created -# + base.stats - matrix with all 17 statistics (14 scale, -# 3 categorical) computed for all attributes -# + categorical.counts - matrix in which each column -# gives the category-wise counts for all categories in -# that attribute -# -# - -A = read($1); # data file -K = read($2); # attribute kind file -maxC = $3; # max number of categories in any categorical attribute - - -if (maxC < 0) { - print("ERROR: maximum number maxC of categories must be a positve value."); -} -else { - - - # number of features/attributes - n = ncol(A); - - # number of data records - m = nrow(A); - - # number of statistics - numBaseStats = 17; # (14 scale stats, 3 categorical stats) - - max_kind = max(K); - - # matrices to store computed statistics - baseStats = matrix(0, rows=numBaseStats, cols=n); - - if (maxC > 0) { - countsArray = matrix(0, rows=maxC, cols=n); - } - - parfor(i in 1:n, check=0) { - - # project out the i^th column - F = A[,i]; - - kind = castAsScalar(K[1,i]); - - if ( kind == 1 ) { - print("[" + i + "] Scale"); - # compute SCALE statistics on the projected column - minimum = min(F); - maximum = max(F); - rng = maximum - minimum; - - mu = mean(F); - m2 = moment(F, 2); - m3 = moment(F, 3); - m4 = moment(F, 4); - - var = m/(m-1.0)*m2; - std_dev = sqrt(var); - se = std_dev/sqrt(m); - cv = std_dev/mu; - - g1 = m3/(std_dev^3); - g2 = m4/(std_dev^4) - 3; - #se_g1=sqrt( 6*m*(m-1.0) / ((m-2.0)*(m+1.0)*(m+3.0)) ); - se_g1=sqrt( (6/(m-2.0)) * (m/(m+1.0)) * ((m-1.0)/(m+3.0)) ); - - #se_g2= sqrt( (4*(m^2-1)*se_g1^2)/((m+5.0)*(m-3.0)) ); - se_g2=sqrt( (4/(m+5.0)) * ((m^2-1)/(m-3.0)) * se_g1^2 ); - - md = median(F) #quantile(F, 0.5); - iqm = interQuartileMean(F); - - # place the computed statistics in output matrices - baseStats[1,i] = minimum; - baseStats[2,i] = maximum; - baseStats[3,i] = rng; - - baseStats[4,i] = mu; - baseStats[5,i] = var; - baseStats[6,i] = std_dev; - baseStats[7,i] = se; - baseStats[8,i] = cv; - - baseStats[9,i] = g1; - baseStats[10,i] = g2; - baseStats[11,i] = se_g1; - baseStats[12,i] = se_g2; - - baseStats[13,i] = md; - baseStats[14,i] = iqm; - } - else { - if (kind == 2 | kind == 3) { - print("[" + i + "] Categorical"); - - # check if the categorical column has valid values - minF = min(F); - if (minF <=0) { - print("ERROR: Categorical attributes can only take values starting from 1. Encountered a value " + minF + " in attribute " + i); - } - else { - # compute CATEGORICAL statistics on the projected column - cat_counts = table(F,1); # counts for each category - num_cat = nrow(cat_counts); # number of categories - - mode = rowIndexMax(t(cat_counts)); - mx = max(cat_counts) - modeArr = ppred(cat_counts, mx, "==") - numModes = sum(modeArr); - - # place the computed statistics in output matrices - baseStats[15,i] = num_cat; - baseStats[16,i] = mode; - baseStats[17,i] = numModes; - - if (max_kind > 1) { - countsArray[,i] = cat_counts; - } - } - } - } - } - - write(baseStats, $4+"/base.stats"); - if (max_kind > 1) { - write(countsArray, $4+"/categorical.counts"); - } - -} +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# +# DML Script to compute univariate statistics for all attributes +# in a given data set +# +# Three inputs: +# $1) A - input data +# $2) K - row matrix that denotes the "kind" for each +# attribute +# kind=1 for scale, kind=2 for nominal, +# kind=3 for ordinal +# $3) maxC - maximum number of categories in any categorical +# attribute +# +# One output: +# $4) output directory in which following three statistics +# files are created +# + base.stats - matrix with all 17 statistics (14 scale, +# 3 categorical) computed for all attributes +# + categorical.counts - matrix in which each column +# gives the category-wise counts for all categories in +# that attribute +# +# + +A = read($1); # data file +K = read($2); # attribute kind file +maxC = $3; # max number of categories in any categorical attribute + + +if (maxC < 0) { + print("ERROR: maximum number maxC of categories must be a positve value."); +} +else { + + + # number of features/attributes + n = ncol(A); + + # number of data records + m = nrow(A); + + # number of statistics + numBaseStats = 17; # (14 scale stats, 3 categorical stats) + + max_kind = max(K); + + # matrices to store computed statistics + baseStats = matrix(0, rows=numBaseStats, cols=n); + + if (maxC > 0) { + countsArray = matrix(0, rows=maxC, cols=n); + } + + parfor(i in 1:n, check=0) { + + # project out the i^th column + F = A[,i]; + + kind = castAsScalar(K[1,i]); + + if ( kind == 1 ) { + print("[" + i + "] Scale"); + # compute SCALE statistics on the projected column + minimum = min(F); + maximum = max(F); + rng = maximum - minimum; + + mu = mean(F); + m2 = moment(F, 2); + m3 = moment(F, 3); + m4 = moment(F, 4); + + var = m/(m-1.0)*m2; + std_dev = sqrt(var); + se = std_dev/sqrt(m); + cv = std_dev/mu; + + g1 = m3/(std_dev^3); + g2 = m4/(std_dev^4) - 3; + #se_g1=sqrt( 6*m*(m-1.0) / ((m-2.0)*(m+1.0)*(m+3.0)) ); + se_g1=sqrt( (6/(m-2.0)) * (m/(m+1.0)) * ((m-1.0)/(m+3.0)) ); + + #se_g2= sqrt( (4*(m^2-1)*se_g1^2)/((m+5.0)*(m-3.0)) ); + se_g2=sqrt( (4/(m+5.0)) * ((m^2-1)/(m-3.0)) * se_g1^2 ); + + md = median(F) #quantile(F, 0.5); + iqm = interQuartileMean(F); + + # place the computed statistics in output matrices + baseStats[1,i] = minimum; + baseStats[2,i] = maximum; + baseStats[3,i] = rng; + + baseStats[4,i] = mu; + baseStats[5,i] = var; + baseStats[6,i] = std_dev; + baseStats[7,i] = se; + baseStats[8,i] = cv; + + baseStats[9,i] = g1; + baseStats[10,i] = g2; + baseStats[11,i] = se_g1; + baseStats[12,i] = se_g2; + + baseStats[13,i] = md; + baseStats[14,i] = iqm; + } + else { + if (kind == 2 | kind == 3) { + print("[" + i + "] Categorical"); + + # check if the categorical column has valid values + minF = min(F); + if (minF <=0) { + print("ERROR: Categorical attributes can only take values starting from 1. Encountered a value " + minF + " in attribute " + i); + } + else { + # compute CATEGORICAL statistics on the projected column + cat_counts = table(F,1); # counts for each category + num_cat = nrow(cat_counts); # number of categories + + mode = rowIndexMax(t(cat_counts)); + mx = max(cat_counts) + modeArr = ppred(cat_counts, mx, "==") + numModes = sum(modeArr); + + # place the computed statistics in output matrices + baseStats[15,i] = num_cat; + baseStats[16,i] = mode; + baseStats[17,i] = numModes; + + if (max_kind > 1) { + countsArray[,i] = cat_counts; + } + } + } + } + } + + write(baseStats, $4+"/base.stats"); + if (max_kind > 1) { + write(countsArray, $4+"/categorical.counts"); + } + +} http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/validation/LinearLogisticRegression.dml ---------------------------------------------------------------------- diff --git a/src/test/scripts/applications/validation/LinearLogisticRegression.dml b/src/test/scripts/applications/validation/LinearLogisticRegression.dml index b5d3955..473b2dc 100644 --- a/src/test/scripts/applications/validation/LinearLogisticRegression.dml +++ b/src/test/scripts/applications/validation/LinearLogisticRegression.dml @@ -1,246 +1,246 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -# Solves Linear Logistic Regression using Trust Region methods. -# Can be adapted for L2-SVMs and more general unconstrained optimization problems also -# setup optimization parameters (See: Trust Region Newton Method for Logistic Regression, Lin, Weng and Keerthi, JMLR 9 (2008) 627-650) -# The parameter C is the weight that the algorithm puts on the loss function, instead of the regularizer. -# if intercept = 1, then w has one extra value than the dimensions of X. Predictions are computed as X*w[1:n-1,1] + w[n,1] -# Arguments: 1.X 2.y 3.intercept 4.max_iteration 5.C 6.w - -# 100K dataset -# hadoop jar SystemML.jar -f LinearLogisticRegression.dml -args itau/logreg/X_100k_500 itau/logreg/y_100k 0 50 0.001 itau/logreg/w_100k_1 - -# 1M dataset -# hadoop jar SystemML.jar -f LinearLogisticRegression.dml -args itau/logreg/X_100m_5k itau/logreg/y_100m_1 0 50 0.001 itau/demo/logreg/w_100m_1 - - -# internal parameters -tol = 0.001 -eta0 = 0.0001 -eta1 = 0.25 -eta2 = 0.75 -sigma1 = 0.25 -sigma2 = 0.5 -sigma3 = 4.0 -psi = 0.1 - -# read training data files -X = read($1) -intercept = $3 - -D = ncol(X) -#initialize w -w = Rand(rows=D, cols=1, min=0.0, max=0.0); -zeros_D = Rand(rows = D, cols = 1, min = 0.0, max = 0.0); - -if (intercept == 1) { - num_samples = nrow(X); - ones = Rand(rows=num_samples, cols=1, min=1, max=1, pdf="uniform"); - X = append(X, ones); - zero_matrix = Rand(rows=1, cols=1, min=0.0, max=0.0); - w = t(append(t(w), zero_matrix)); - zeros_D = t(append(t(zeros_D), zero_matrix)); -} - -N = nrow(X) - -# read (training and test) labels -y = read($2) - -maxiter = $4 -maxinneriter = 1000 - -C = $5 - -e = Rand(rows=1, cols=1, min=1.0, max=1.0); -o = X %*% w -logistic = 1.0/(1.0 + exp( -y * o)) - -obj = 0.5 * t(w) %*% w + C*sum(-log(logistic)) -grad = w + C*t(X) %*% ((logistic - 1)*y) -logisticD = logistic*(1-logistic) -delta = sqrt(sum(grad*grad)) - -# number of iterations -iter = 0 - -# starting point for CG - -# VS: change -zeros_N = Rand(rows = N, cols = 1, min = 0.0, max = 0.0); - -# boolean for convergence check - -converge = (delta < tol) | (iter > maxiter) -norm_r2 = sum(grad*grad) - -# VS: change -norm_grad = sqrt(norm_r2) -norm_grad_initial = norm_grad - -alpha = t(w) %*% w -alpha2 = alpha - -while(!converge) { - - norm_grad = sqrt(sum(grad*grad)) - - print("-- Outer Iteration = " + iter) - objScalar = castAsScalar(obj) - print(" Iterations = " + iter + ", Objective = " + objScalar + ", Gradient Norm = " + norm_grad) - - # SOLVE TRUST REGION SUB-PROBLEM - s = zeros_D - os = zeros_N - r = -grad - d = r - inneriter = 0 - innerconverge = ( sqrt(sum(r*r)) <= psi * norm_grad) - while (!innerconverge) { - inneriter = inneriter + 1 - norm_r2 = sum(r*r) - od = X %*% d - Hd = d + C*(t(X) %*% (logisticD*od)) - alpha_deno = t(d) %*% Hd - alpha = norm_r2 / alpha_deno - - s = s + castAsScalar(alpha) * d - os = os + castAsScalar(alpha) * od - - sts = t(s) %*% s - delta2 = delta*delta - stsScalar = castAsScalar(sts) - - shouldBreak = FALSE; # to mimic "break" in the following 'if' condition - if (stsScalar > delta2) { - print(" --- cg reaches trust region boundary") - s = s - castAsScalar(alpha) * d - os = os - castAsScalar(alpha) * od - std = t(s) %*% d - dtd = t(d) %*% d - sts = t(s) %*% s - rad = sqrt(std*std + dtd*(delta2 - sts)) - stdScalar = castAsScalar(std) - if(stdScalar >= 0) { - tau = (delta2 - sts)/(std + rad) - } - else { - tau = (rad - std)/dtd - } - - s = s + castAsScalar(tau) * d - os = os + castAsScalar(tau) * od - r = r - castAsScalar(tau) * Hd - - #break - shouldBreak = TRUE; - innerconverge = TRUE; - - } - - if (!shouldBreak) { - r = r - castAsScalar(alpha) * Hd - old_norm_r2 = norm_r2 - norm_r2 = sum(r*r) - beta = norm_r2/old_norm_r2 - d = r + beta*d - innerconverge = (sqrt(norm_r2) <= psi * norm_grad) | (inneriter > maxinneriter) - } - } - - print(" --- Inner CG Iteration = " + inneriter) - # END TRUST REGION SUB-PROBLEM - # compute rho, update w, obtain delta - gs = t(s) %*% grad - qk = -0.5*(gs - (t(s) %*% r)) - - wnew = w + s - onew = o + os - logisticnew = 1.0/(1.0 + exp(-y * onew )) - objnew = 0.5 * t(wnew) %*% wnew + C * sum(-log(logisticnew)) - - actred = (obj - objnew) - actredScalar = castAsScalar(actred) - rho = actred / qk - qkScalar = castAsScalar(qk) - rhoScalar = castAsScalar(rho); - snorm = sqrt(sum( s * s )) - - print(" Actual = " + actredScalar) - print(" Predicted = " + qkScalar) - - if (iter==0) { - delta = min(delta, snorm) - } - alpha2 = objnew - obj - gs - alpha2Scalar = castAsScalar(alpha2) - if (alpha2Scalar <= 0) { - alpha = sigma3*e - } - else { - ascalar = max(sigma1, -0.5*castAsScalar(gs)/alpha2Scalar) - alpha = ascalar*e - } - - if (rhoScalar > eta0) { - - w = wnew - o = onew - grad = w + C*t(X) %*% ((logisticnew - 1) * y ) - norm_grad = sqrt(sum(grad*grad)) - logisticD = logisticnew * (1 - logisticnew) - obj = objnew - } - - alphaScalar = castAsScalar(alpha) - if (rhoScalar < eta0){ - delta = min(max( alphaScalar , sigma1) * snorm, sigma2 * delta ) - } - else { - if (rhoScalar < eta1){ - delta = max(sigma1 * delta, min( alphaScalar * snorm, sigma2 * delta)) - } - else { - if (rhoScalar < eta2) { - delta = max(sigma1 * delta, min( alphaScalar * snorm, sigma3 * delta)) - } - else { - delta = max(delta, min( alphaScalar * snorm, sigma3 * delta)) - } - } - } - - o2 = y * o - correct = sum(ppred(o2, 0, ">")) - accuracy = correct*100.0/N - iter = iter + 1 - #converge = (norm_grad < (tol * norm_grad_initial)) | (iter > maxiter) - converge = (norm_grad < tol) | (iter > maxiter) - - print(" Delta = " + delta) - print(" Training Accuracy = " + accuracy) - print(" Correct = " + correct) - print(" OuterIter = " + iter) - print(" Converge = " + converge) -} - -write(w, $6, format="text"); +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# Solves Linear Logistic Regression using Trust Region methods. +# Can be adapted for L2-SVMs and more general unconstrained optimization problems also +# setup optimization parameters (See: Trust Region Newton Method for Logistic Regression, Lin, Weng and Keerthi, JMLR 9 (2008) 627-650) +# The parameter C is the weight that the algorithm puts on the loss function, instead of the regularizer. +# if intercept = 1, then w has one extra value than the dimensions of X. Predictions are computed as X*w[1:n-1,1] + w[n,1] +# Arguments: 1.X 2.y 3.intercept 4.max_iteration 5.C 6.w + +# 100K dataset +# hadoop jar SystemML.jar -f LinearLogisticRegression.dml -args itau/logreg/X_100k_500 itau/logreg/y_100k 0 50 0.001 itau/logreg/w_100k_1 + +# 1M dataset +# hadoop jar SystemML.jar -f LinearLogisticRegression.dml -args itau/logreg/X_100m_5k itau/logreg/y_100m_1 0 50 0.001 itau/demo/logreg/w_100m_1 + + +# internal parameters +tol = 0.001 +eta0 = 0.0001 +eta1 = 0.25 +eta2 = 0.75 +sigma1 = 0.25 +sigma2 = 0.5 +sigma3 = 4.0 +psi = 0.1 + +# read training data files +X = read($1) +intercept = $3 + +D = ncol(X) +#initialize w +w = Rand(rows=D, cols=1, min=0.0, max=0.0); +zeros_D = Rand(rows = D, cols = 1, min = 0.0, max = 0.0); + +if (intercept == 1) { + num_samples = nrow(X); + ones = Rand(rows=num_samples, cols=1, min=1, max=1, pdf="uniform"); + X = append(X, ones); + zero_matrix = Rand(rows=1, cols=1, min=0.0, max=0.0); + w = t(append(t(w), zero_matrix)); + zeros_D = t(append(t(zeros_D), zero_matrix)); +} + +N = nrow(X) + +# read (training and test) labels +y = read($2) + +maxiter = $4 +maxinneriter = 1000 + +C = $5 + +e = Rand(rows=1, cols=1, min=1.0, max=1.0); +o = X %*% w +logistic = 1.0/(1.0 + exp( -y * o)) + +obj = 0.5 * t(w) %*% w + C*sum(-log(logistic)) +grad = w + C*t(X) %*% ((logistic - 1)*y) +logisticD = logistic*(1-logistic) +delta = sqrt(sum(grad*grad)) + +# number of iterations +iter = 0 + +# starting point for CG + +# VS: change +zeros_N = Rand(rows = N, cols = 1, min = 0.0, max = 0.0); + +# boolean for convergence check + +converge = (delta < tol) | (iter > maxiter) +norm_r2 = sum(grad*grad) + +# VS: change +norm_grad = sqrt(norm_r2) +norm_grad_initial = norm_grad + +alpha = t(w) %*% w +alpha2 = alpha + +while(!converge) { + + norm_grad = sqrt(sum(grad*grad)) + + print("-- Outer Iteration = " + iter) + objScalar = castAsScalar(obj) + print(" Iterations = " + iter + ", Objective = " + objScalar + ", Gradient Norm = " + norm_grad) + + # SOLVE TRUST REGION SUB-PROBLEM + s = zeros_D + os = zeros_N + r = -grad + d = r + inneriter = 0 + innerconverge = ( sqrt(sum(r*r)) <= psi * norm_grad) + while (!innerconverge) { + inneriter = inneriter + 1 + norm_r2 = sum(r*r) + od = X %*% d + Hd = d + C*(t(X) %*% (logisticD*od)) + alpha_deno = t(d) %*% Hd + alpha = norm_r2 / alpha_deno + + s = s + castAsScalar(alpha) * d + os = os + castAsScalar(alpha) * od + + sts = t(s) %*% s + delta2 = delta*delta + stsScalar = castAsScalar(sts) + + shouldBreak = FALSE; # to mimic "break" in the following 'if' condition + if (stsScalar > delta2) { + print(" --- cg reaches trust region boundary") + s = s - castAsScalar(alpha) * d + os = os - castAsScalar(alpha) * od + std = t(s) %*% d + dtd = t(d) %*% d + sts = t(s) %*% s + rad = sqrt(std*std + dtd*(delta2 - sts)) + stdScalar = castAsScalar(std) + if(stdScalar >= 0) { + tau = (delta2 - sts)/(std + rad) + } + else { + tau = (rad - std)/dtd + } + + s = s + castAsScalar(tau) * d + os = os + castAsScalar(tau) * od + r = r - castAsScalar(tau) * Hd + + #break + shouldBreak = TRUE; + innerconverge = TRUE; + + } + + if (!shouldBreak) { + r = r - castAsScalar(alpha) * Hd + old_norm_r2 = norm_r2 + norm_r2 = sum(r*r) + beta = norm_r2/old_norm_r2 + d = r + beta*d + innerconverge = (sqrt(norm_r2) <= psi * norm_grad) | (inneriter > maxinneriter) + } + } + + print(" --- Inner CG Iteration = " + inneriter) + # END TRUST REGION SUB-PROBLEM + # compute rho, update w, obtain delta + gs = t(s) %*% grad + qk = -0.5*(gs - (t(s) %*% r)) + + wnew = w + s + onew = o + os + logisticnew = 1.0/(1.0 + exp(-y * onew )) + objnew = 0.5 * t(wnew) %*% wnew + C * sum(-log(logisticnew)) + + actred = (obj - objnew) + actredScalar = castAsScalar(actred) + rho = actred / qk + qkScalar = castAsScalar(qk) + rhoScalar = castAsScalar(rho); + snorm = sqrt(sum( s * s )) + + print(" Actual = " + actredScalar) + print(" Predicted = " + qkScalar) + + if (iter==0) { + delta = min(delta, snorm) + } + alpha2 = objnew - obj - gs + alpha2Scalar = castAsScalar(alpha2) + if (alpha2Scalar <= 0) { + alpha = sigma3*e + } + else { + ascalar = max(sigma1, -0.5*castAsScalar(gs)/alpha2Scalar) + alpha = ascalar*e + } + + if (rhoScalar > eta0) { + + w = wnew + o = onew + grad = w + C*t(X) %*% ((logisticnew - 1) * y ) + norm_grad = sqrt(sum(grad*grad)) + logisticD = logisticnew * (1 - logisticnew) + obj = objnew + } + + alphaScalar = castAsScalar(alpha) + if (rhoScalar < eta0){ + delta = min(max( alphaScalar , sigma1) * snorm, sigma2 * delta ) + } + else { + if (rhoScalar < eta1){ + delta = max(sigma1 * delta, min( alphaScalar * snorm, sigma2 * delta)) + } + else { + if (rhoScalar < eta2) { + delta = max(sigma1 * delta, min( alphaScalar * snorm, sigma3 * delta)) + } + else { + delta = max(delta, min( alphaScalar * snorm, sigma3 * delta)) + } + } + } + + o2 = y * o + correct = sum(ppred(o2, 0, ">")) + accuracy = correct*100.0/N + iter = iter + 1 + #converge = (norm_grad < (tol * norm_grad_initial)) | (iter > maxiter) + converge = (norm_grad < tol) | (iter > maxiter) + + print(" Delta = " + delta) + print(" Training Accuracy = " + accuracy) + print(" Correct = " + correct) + print(" OuterIter = " + iter) + print(" Converge = " + converge) +} + +write(w, $6, format="text"); http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/validation/genRandData4LogisticRegression.dml ---------------------------------------------------------------------- diff --git a/src/test/scripts/applications/validation/genRandData4LogisticRegression.dml b/src/test/scripts/applications/validation/genRandData4LogisticRegression.dml index d06de67..b42a315 100644 --- a/src/test/scripts/applications/validation/genRandData4LogisticRegression.dml +++ b/src/test/scripts/applications/validation/genRandData4LogisticRegression.dml @@ -1,122 +1,122 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -# generates random data to test linear logistic regression - -# 100K dataset -# hadoop jar SystemML.jar -f genRandData4LogisticRegression.dml -args 100000 500 0.0 5.0 itau/logreg/w_100k itau/logreg/X_100k_500 itau/logreg/y_100k 0 0 0.01 - -# 1M dataset -# hadoop jar SystemML.jar -f genRandData4LogisticRegression.dml -args 1000000 1000 0.0 5.0 itau/logreg/w_1m itau/logreg/X_1m_1k /logreg/y_1m 0 0 0.0001 - -# $1 is number of samples -# $2 is number of features (independent variables) -# $3 is the mean of the linear form (w^T X) -# $4 is the st.dev. of the linear form (w^T X) -# $5 is location to store generated weights -# $6 is location to store generated data -# $7 is location to store generated labels -# $8 addNoise. if 0 then no noise is added, to add noise set this to 1 -# $9 is 0 if no intercept and 1 if there is intercept -# $10 controls sparsity in the generated data - -numSamples = $1 -numFeatures = $2 -meanLF = $3 -sigmaLF = $4 -addNoise = $8 -b = $9 - -X = Rand (rows=numSamples, cols=numFeatures, min=-1, max=2, pdf="uniform", seed=0, sparsity=$10); -w = Rand (rows=numFeatures, cols=1, min=-1, max=1, pdf="uniform", seed=0) - -if (b != 0) { - b_mat = Rand (rows=numSamples, cols=1, min=1, max=1); - X = append (X, b_mat); - numFeatures_plus_one = numFeatures + 1; - w = Rand (rows=numFeatures_plus_one, cols=1, min=-1, max=1, pdf="uniform", seed=0); -} - -[w, new_sigmaLF] = scaleWeights (X, w, meanLF, sigmaLF); -if (sigmaLF != new_sigmaLF) { - print ("The standard deviation requirement on the linear form is TOO TIGHT!"); - print ("We relaxed sigmaLF from " + sigmaLF + " to " + new_sigmaLF + "."); -} -ot = X %*% w; - -if (b != 0) { - X = X [, 1:numFeatures]; -} - -emp_meanLF = sum (ot) / numSamples; -emp_sigmaLF = sqrt (sum (ot * ot) / numSamples - emp_meanLF * emp_meanLF); -print ("Empirical meanLF = " + emp_meanLF + "; Empirical sigmaLF = " + emp_sigmaLF); - -prob = 1 / (1 + exp (- ot)); - -if(addNoise == 1){ - r = Rand(rows=numSamples, cols=1, min=0, max=1, pdf="uniform", seed=0) -}else{ - print("this data generator generates the same dataset for both noise=0 and noise=1") - r = Rand(rows=numSamples, cols=1, min=0, max=1, pdf="uniform", seed=0) - #r = Rand(rows=numSamples, cols=1, min=0.5, max=0.5, pdf="uniform") -} - -print ("nrow(prob) = " + nrow(prob) + ", ncol(prob) = " + ncol(prob) + "; nrow(r) = " + nrow(r) + ", ncol(r) = " + ncol(r)); - -Y = 1 - 2*ppred(prob, r, "<") - -write (w, $5, format="text"); -write (X, $6, format="binary"); -write (Y, $7, format="binary"); - - -# Shifts and scales the weights to ensure the desired statistics for Linear Form = w^T X -# Used in data and/or weight generation in the testing of GLM, Logistic Regression etc. -# new_sigmaLF == sigmaLF if successful, new_sigmaLF > sigmaLF if had to relax this constraint -scaleWeights = - function (Matrix[double] X_data, Matrix[double] w_unscaled, double meanLF, double sigmaLF) - return (Matrix[double] w_scaled, double new_sigmaLF) -{ - numFeatures = nrow (w_unscaled); - W_ext = Rand (rows = numFeatures, cols = 2, min = 1, max = 1); - W_ext [, 1] = w_unscaled; - S1 = colSums (X_data %*% W_ext); - TF = Rand (rows = 2, cols = 2, min = 1, max = 1); - TF [1, 1] = S1 [1, 1] * meanLF * nrow (X_data) / castAsScalar (S1 %*% t(S1)); - TF [1, 2] = S1 [1, 2]; - TF [2, 1] = S1 [1, 2] * meanLF * nrow (X_data) / castAsScalar (S1 %*% t(S1)); - TF [2, 2] = - S1 [1, 1]; - TF = W_ext %*% TF; - Q = t(TF) %*% t(X_data) %*% X_data %*% TF; - Q [1, 1] = Q [1, 1] - nrow (X_data) * meanLF * meanLF; - new_sigmaLF = sigmaLF; - discr = castAsScalar (Q [1, 1] * Q [2, 2] - Q [1, 2] * Q [2, 1] - nrow (X_data) * Q [2, 2] * sigmaLF * sigmaLF); - if (discr > 0.0) { - new_sigmaLF = sqrt (castAsScalar ((Q [1, 1] * Q [2, 2] - Q [1, 2] * Q [2, 1]) / (nrow (X_data) * Q [2, 2]))); - discr = -0.0; - } - t = Rand (rows = 2, cols = 1, min = 1, max = 1); - t [2, 1] = (- Q [1, 2] + sqrt (- discr)) / Q [2, 2]; - w_scaled = TF %*% t; -} - - +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# generates random data to test linear logistic regression + +# 100K dataset +# hadoop jar SystemML.jar -f genRandData4LogisticRegression.dml -args 100000 500 0.0 5.0 itau/logreg/w_100k itau/logreg/X_100k_500 itau/logreg/y_100k 0 0 0.01 + +# 1M dataset +# hadoop jar SystemML.jar -f genRandData4LogisticRegression.dml -args 1000000 1000 0.0 5.0 itau/logreg/w_1m itau/logreg/X_1m_1k /logreg/y_1m 0 0 0.0001 + +# $1 is number of samples +# $2 is number of features (independent variables) +# $3 is the mean of the linear form (w^T X) +# $4 is the st.dev. of the linear form (w^T X) +# $5 is location to store generated weights +# $6 is location to store generated data +# $7 is location to store generated labels +# $8 addNoise. if 0 then no noise is added, to add noise set this to 1 +# $9 is 0 if no intercept and 1 if there is intercept +# $10 controls sparsity in the generated data + +numSamples = $1 +numFeatures = $2 +meanLF = $3 +sigmaLF = $4 +addNoise = $8 +b = $9 + +X = Rand (rows=numSamples, cols=numFeatures, min=-1, max=2, pdf="uniform", seed=0, sparsity=$10); +w = Rand (rows=numFeatures, cols=1, min=-1, max=1, pdf="uniform", seed=0) + +if (b != 0) { + b_mat = Rand (rows=numSamples, cols=1, min=1, max=1); + X = append (X, b_mat); + numFeatures_plus_one = numFeatures + 1; + w = Rand (rows=numFeatures_plus_one, cols=1, min=-1, max=1, pdf="uniform", seed=0); +} + +[w, new_sigmaLF] = scaleWeights (X, w, meanLF, sigmaLF); +if (sigmaLF != new_sigmaLF) { + print ("The standard deviation requirement on the linear form is TOO TIGHT!"); + print ("We relaxed sigmaLF from " + sigmaLF + " to " + new_sigmaLF + "."); +} +ot = X %*% w; + +if (b != 0) { + X = X [, 1:numFeatures]; +} + +emp_meanLF = sum (ot) / numSamples; +emp_sigmaLF = sqrt (sum (ot * ot) / numSamples - emp_meanLF * emp_meanLF); +print ("Empirical meanLF = " + emp_meanLF + "; Empirical sigmaLF = " + emp_sigmaLF); + +prob = 1 / (1 + exp (- ot)); + +if(addNoise == 1){ + r = Rand(rows=numSamples, cols=1, min=0, max=1, pdf="uniform", seed=0) +}else{ + print("this data generator generates the same dataset for both noise=0 and noise=1") + r = Rand(rows=numSamples, cols=1, min=0, max=1, pdf="uniform", seed=0) + #r = Rand(rows=numSamples, cols=1, min=0.5, max=0.5, pdf="uniform") +} + +print ("nrow(prob) = " + nrow(prob) + ", ncol(prob) = " + ncol(prob) + "; nrow(r) = " + nrow(r) + ", ncol(r) = " + ncol(r)); + +Y = 1 - 2*ppred(prob, r, "<") + +write (w, $5, format="text"); +write (X, $6, format="binary"); +write (Y, $7, format="binary"); + + +# Shifts and scales the weights to ensure the desired statistics for Linear Form = w^T X +# Used in data and/or weight generation in the testing of GLM, Logistic Regression etc. +# new_sigmaLF == sigmaLF if successful, new_sigmaLF > sigmaLF if had to relax this constraint +scaleWeights = + function (Matrix[double] X_data, Matrix[double] w_unscaled, double meanLF, double sigmaLF) + return (Matrix[double] w_scaled, double new_sigmaLF) +{ + numFeatures = nrow (w_unscaled); + W_ext = Rand (rows = numFeatures, cols = 2, min = 1, max = 1); + W_ext [, 1] = w_unscaled; + S1 = colSums (X_data %*% W_ext); + TF = Rand (rows = 2, cols = 2, min = 1, max = 1); + TF [1, 1] = S1 [1, 1] * meanLF * nrow (X_data) / castAsScalar (S1 %*% t(S1)); + TF [1, 2] = S1 [1, 2]; + TF [2, 1] = S1 [1, 2] * meanLF * nrow (X_data) / castAsScalar (S1 %*% t(S1)); + TF [2, 2] = - S1 [1, 1]; + TF = W_ext %*% TF; + Q = t(TF) %*% t(X_data) %*% X_data %*% TF; + Q [1, 1] = Q [1, 1] - nrow (X_data) * meanLF * meanLF; + new_sigmaLF = sigmaLF; + discr = castAsScalar (Q [1, 1] * Q [2, 2] - Q [1, 2] * Q [2, 1] - nrow (X_data) * Q [2, 2] * sigmaLF * sigmaLF); + if (discr > 0.0) { + new_sigmaLF = sqrt (castAsScalar ((Q [1, 1] * Q [2, 2] - Q [1, 2] * Q [2, 1]) / (nrow (X_data) * Q [2, 2]))); + discr = -0.0; + } + t = Rand (rows = 2, cols = 1, min = 1, max = 1); + t [2, 1] = (- Q [1, 2] + sqrt (- discr)) / Q [2, 2]; + w_scaled = TF %*% t; +} + + http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/welchTTest/welchTTest.R ---------------------------------------------------------------------- diff --git a/src/test/scripts/applications/welchTTest/welchTTest.R b/src/test/scripts/applications/welchTTest/welchTTest.R index 66da912..ff74d71 100644 --- a/src/test/scripts/applications/welchTTest/welchTTest.R +++ b/src/test/scripts/applications/welchTTest/welchTTest.R @@ -1,49 +1,49 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -args <- commandArgs(TRUE) -library(Matrix) - -posSamples = readMM(paste(args[1], "posSamples.mtx", sep="")) -negSamples = readMM(paste(args[1], "negSamples.mtx", sep="")) - -#computing sample sizes -posSampleSize = nrow(posSamples) -negSampleSize = nrow(negSamples) - -#computing means -posSampleMeans = colMeans(posSamples) -negSampleMeans = colMeans(negSamples) - -#computing (unbiased) variances -posSampleVariances = (colSums(posSamples^2) - posSampleSize * posSampleMeans^2) / (posSampleSize-1) -negSampleVariances = (colSums(negSamples^2) - negSampleSize * negSampleMeans^2) / (negSampleSize-1) - -#computing t-statistics and degrees of freedom -t_statistics = (posSampleMeans - negSampleMeans) / sqrt(posSampleVariances/posSampleSize + negSampleVariances/negSampleSize) -degrees_of_freedom = round(((posSampleVariances/posSampleSize + negSampleVariances/negSampleSize) ^ 2) / (posSampleVariances^2/(posSampleSize^2 * (posSampleSize-1)) + negSampleVariances^2/(negSampleSize^2 * (negSampleSize-1)))) - -#R will write a vector as a 1-column matrix, forcing it to write a 1-row matrix -t_statistics_mat = matrix(t_statistics, 1, length(t_statistics)) -degrees_of_freedom_mat = matrix(degrees_of_freedom, 1, length(degrees_of_freedom)) - -writeMM(as(t_statistics_mat, "CsparseMatrix"), paste(args[2], "t_statistics", sep="")) -writeMM(as(degrees_of_freedom_mat, "CsparseMatrix"), paste(args[2], "degrees_of_freedom", sep="")) +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +args <- commandArgs(TRUE) +library(Matrix) + +posSamples = readMM(paste(args[1], "posSamples.mtx", sep="")) +negSamples = readMM(paste(args[1], "negSamples.mtx", sep="")) + +#computing sample sizes +posSampleSize = nrow(posSamples) +negSampleSize = nrow(negSamples) + +#computing means +posSampleMeans = colMeans(posSamples) +negSampleMeans = colMeans(negSamples) + +#computing (unbiased) variances +posSampleVariances = (colSums(posSamples^2) - posSampleSize * posSampleMeans^2) / (posSampleSize-1) +negSampleVariances = (colSums(negSamples^2) - negSampleSize * negSampleMeans^2) / (negSampleSize-1) + +#computing t-statistics and degrees of freedom +t_statistics = (posSampleMeans - negSampleMeans) / sqrt(posSampleVariances/posSampleSize + negSampleVariances/negSampleSize) +degrees_of_freedom = round(((posSampleVariances/posSampleSize + negSampleVariances/negSampleSize) ^ 2) / (posSampleVariances^2/(posSampleSize^2 * (posSampleSize-1)) + negSampleVariances^2/(negSampleSize^2 * (negSampleSize-1)))) + +#R will write a vector as a 1-column matrix, forcing it to write a 1-row matrix +t_statistics_mat = matrix(t_statistics, 1, length(t_statistics)) +degrees_of_freedom_mat = matrix(degrees_of_freedom, 1, length(degrees_of_freedom)) + +writeMM(as(t_statistics_mat, "CsparseMatrix"), paste(args[2], "t_statistics", sep="")) +writeMM(as(degrees_of_freedom_mat, "CsparseMatrix"), paste(args[2], "degrees_of_freedom", sep="")) http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/welchTTest/welchTTest.dml ---------------------------------------------------------------------- diff --git a/src/test/scripts/applications/welchTTest/welchTTest.dml b/src/test/scripts/applications/welchTTest/welchTTest.dml index 4e42f03..7bc0144 100644 --- a/src/test/scripts/applications/welchTTest/welchTTest.dml +++ b/src/test/scripts/applications/welchTTest/welchTTest.dml @@ -1,43 +1,43 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - - -posSamples = read($1, format="text") -negSamples = read($2, format="text") - -#computing sample sizes -posSampleSize = nrow(posSamples) -negSampleSize = nrow(negSamples) - -#computing means -posSampleMeans = colMeans(posSamples) -negSampleMeans = colMeans(negSamples) - -#computing (unbiased) variances -posSampleVariances = (colSums(posSamples^2) - posSampleSize * posSampleMeans^2) / (posSampleSize-1) -negSampleVariances = (colSums(negSamples^2) - negSampleSize * negSampleMeans^2) / (negSampleSize-1) - -#computing t-statistics and degrees of freedom -t_statistics = (posSampleMeans - negSampleMeans) / sqrt(posSampleVariances/posSampleSize + negSampleVariances/negSampleSize) -degrees_of_freedom = round(((posSampleVariances/posSampleSize + negSampleVariances/negSampleSize) ^ 2) / (posSampleVariances^2/(posSampleSize^2 * (posSampleSize-1)) + negSampleVariances^2/(negSampleSize^2 * (negSampleSize-1)))) - -write(t_statistics, $3, format="text") -write(degrees_of_freedom, $4, format="text") +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + + +posSamples = read($1, format="text") +negSamples = read($2, format="text") + +#computing sample sizes +posSampleSize = nrow(posSamples) +negSampleSize = nrow(negSamples) + +#computing means +posSampleMeans = colMeans(posSamples) +negSampleMeans = colMeans(negSamples) + +#computing (unbiased) variances +posSampleVariances = (colSums(posSamples^2) - posSampleSize * posSampleMeans^2) / (posSampleSize-1) +negSampleVariances = (colSums(negSamples^2) - negSampleSize * negSampleMeans^2) / (negSampleSize-1) + +#computing t-statistics and degrees of freedom +t_statistics = (posSampleMeans - negSampleMeans) / sqrt(posSampleVariances/posSampleSize + negSampleVariances/negSampleSize) +degrees_of_freedom = round(((posSampleVariances/posSampleSize + negSampleVariances/negSampleSize) ^ 2) / (posSampleVariances^2/(posSampleSize^2 * (posSampleSize-1)) + negSampleVariances^2/(negSampleSize^2 * (negSampleSize-1)))) + +write(t_statistics, $3, format="text") +write(degrees_of_freedom, $4, format="text") http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/welchTTest/welchTTest.pydml ---------------------------------------------------------------------- diff --git a/src/test/scripts/applications/welchTTest/welchTTest.pydml b/src/test/scripts/applications/welchTTest/welchTTest.pydml index abfff3a..5dbd049 100644 --- a/src/test/scripts/applications/welchTTest/welchTTest.pydml +++ b/src/test/scripts/applications/welchTTest/welchTTest.pydml @@ -1,43 +1,43 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - - -posSamples = load($1, format="text") -negSamples = load($2, format="text") - -#computing sample sizes -posSampleSize = nrow(posSamples) -negSampleSize = nrow(negSamples) - -#computing means -posSampleMeans = colMeans(posSamples) -negSampleMeans = colMeans(negSamples) - -#computing (unbiased) variances -posSampleVariances = (colSums(posSamples ** 2) - posSampleSize * posSampleMeans ** 2) / (posSampleSize-1) -negSampleVariances = (colSums(negSamples ** 2) - negSampleSize * negSampleMeans ** 2) / (negSampleSize-1) - -#computing t-statistics and degrees of freedom -t_statistics = (posSampleMeans - negSampleMeans) / sqrt(posSampleVariances/posSampleSize + negSampleVariances/negSampleSize) -degrees_of_freedom = round(((posSampleVariances/posSampleSize + negSampleVariances/negSampleSize) ** 2) / (posSampleVariances ** 2/((posSampleSize ** 2) * (posSampleSize-1)) + (negSampleVariances ** 2)/((negSampleSize ** 2) * (negSampleSize-1)))) - -save(t_statistics, $3, format="text") -save(degrees_of_freedom, $4, format="text") +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + + +posSamples = load($1, format="text") +negSamples = load($2, format="text") + +#computing sample sizes +posSampleSize = nrow(posSamples) +negSampleSize = nrow(negSamples) + +#computing means +posSampleMeans = colMeans(posSamples) +negSampleMeans = colMeans(negSamples) + +#computing (unbiased) variances +posSampleVariances = (colSums(posSamples ** 2) - posSampleSize * posSampleMeans ** 2) / (posSampleSize-1) +negSampleVariances = (colSums(negSamples ** 2) - negSampleSize * negSampleMeans ** 2) / (negSampleSize-1) + +#computing t-statistics and degrees of freedom +t_statistics = (posSampleMeans - negSampleMeans) / sqrt(posSampleVariances/posSampleSize + negSampleVariances/negSampleSize) +degrees_of_freedom = round(((posSampleVariances/posSampleSize + negSampleVariances/negSampleSize) ** 2) / (posSampleVariances ** 2/((posSampleSize ** 2) * (posSampleSize-1)) + (negSampleVariances ** 2)/((negSampleSize ** 2) * (negSampleSize-1)))) + +save(t_statistics, $3, format="text") +save(degrees_of_freedom, $4, format="text") http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/functions/aggregate/AllMax.R ---------------------------------------------------------------------- diff --git a/src/test/scripts/functions/aggregate/AllMax.R b/src/test/scripts/functions/aggregate/AllMax.R index 3ed23d6..73cde8d 100644 --- a/src/test/scripts/functions/aggregate/AllMax.R +++ b/src/test/scripts/functions/aggregate/AllMax.R @@ -1,29 +1,29 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -args <- commandArgs(TRUE) - -library("Matrix") - -A <- as.matrix(readMM(paste(args[1], "A.mtx", sep=""))) -B <- as.matrix(max(A)); - +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +args <- commandArgs(TRUE) + +library("Matrix") + +A <- as.matrix(readMM(paste(args[1], "A.mtx", sep=""))) +B <- as.matrix(max(A)); + writeMM(as(B, "CsparseMatrix"), paste(args[2], "B", sep="")); \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/functions/aggregate/AllMean.R ---------------------------------------------------------------------- diff --git a/src/test/scripts/functions/aggregate/AllMean.R b/src/test/scripts/functions/aggregate/AllMean.R index 4e315a4..07ee1a9 100644 --- a/src/test/scripts/functions/aggregate/AllMean.R +++ b/src/test/scripts/functions/aggregate/AllMean.R @@ -1,29 +1,29 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -args <- commandArgs(TRUE) - -library("Matrix") - -A <- as.matrix(readMM(paste(args[1], "A.mtx", sep=""))) -B <- as.matrix(mean(A)); - +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +args <- commandArgs(TRUE) + +library("Matrix") + +A <- as.matrix(readMM(paste(args[1], "A.mtx", sep=""))) +B <- as.matrix(mean(A)); + writeMM(as(B, "CsparseMatrix"), paste(args[2], "B", sep="")); \ No newline at end of file
