http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/naive-bayes-parfor/naive-bayes.dml ---------------------------------------------------------------------- diff --git a/src/test/scripts/applications/naive-bayes-parfor/naive-bayes.dml b/src/test/scripts/applications/naive-bayes-parfor/naive-bayes.dml index d0ce8d1..77df03b 100644 --- a/src/test/scripts/applications/naive-bayes-parfor/naive-bayes.dml +++ b/src/test/scripts/applications/naive-bayes-parfor/naive-bayes.dml @@ -1,78 +1,78 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -# Implements multinomial naive Bayes classifier with Laplace correction -# -# Example Usage: -# hadoop jar SystemML.jar -f naive-bayes.dml -nvargs X=<Data> Y=<labels> classes=<Num Classes> laplace=<Laplace Correction> prior=<Model file1> conditionals=<Model file2> accuracy=<accuracy file> fmt="text" -# - -# defaults -# $laplace = 1 -fmt = ifdef($fmt, "text") - -# reading input args -numClasses = $classes -D = read($X) -C = read($Y) -laplace_correction = ifdef($laplace, 1) - -numRows = nrow(D) -numFeatures = ncol(D) - -# Compute conditionals - -# Compute the feature counts for each class -classFeatureCounts = matrix(0, rows=numClasses, cols=numFeatures) -parfor (i in 1:numFeatures) { - Col = D[,i] - classFeatureCounts[,i] = aggregate(target=Col, groups=C, fn="sum", ngroups=as.integer(numClasses)) -} - -# Compute the total feature count for each class -# and add the number of features to this sum -# for subsequent regularization (Laplace's rule) -classSums = rowSums(classFeatureCounts) + numFeatures*laplace_correction - -# Compute class conditional probabilities -ones = matrix(1, rows=1, cols=numFeatures) -repClassSums = classSums %*% ones -class_conditionals = (classFeatureCounts + laplace_correction) / repClassSums - -# Compute class priors -class_counts = aggregate(target=C, groups=C, fn="count", ngroups=as.integer(numClasses)) -class_prior = class_counts / numRows; - -# Compute accuracy on training set -ones = matrix(1, rows=numRows, cols=1) -D_w_ones = append(D, ones) -model = append(class_conditionals, class_prior) -log_probs = D_w_ones %*% t(log(model)) -pred = rowIndexMax(log_probs) -acc = sum(ppred(pred, C, "==")) / numRows * 100 - -acc_str = "Training Accuracy (%): " + acc -print(acc_str) -write(acc_str, $accuracy) - -# write out the model -write(class_prior, $prior, format=fmt); -write(class_conditionals, $conditionals, format=fmt); +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# Implements multinomial naive Bayes classifier with Laplace correction +# +# Example Usage: +# hadoop jar SystemML.jar -f naive-bayes.dml -nvargs X=<Data> Y=<labels> classes=<Num Classes> laplace=<Laplace Correction> prior=<Model file1> conditionals=<Model file2> accuracy=<accuracy file> fmt="text" +# + +# defaults +# $laplace = 1 +fmt = ifdef($fmt, "text") + +# reading input args +numClasses = $classes +D = read($X) +C = read($Y) +laplace_correction = ifdef($laplace, 1) + +numRows = nrow(D) +numFeatures = ncol(D) + +# Compute conditionals + +# Compute the feature counts for each class +classFeatureCounts = matrix(0, rows=numClasses, cols=numFeatures) +parfor (i in 1:numFeatures) { + Col = D[,i] + classFeatureCounts[,i] = aggregate(target=Col, groups=C, fn="sum", ngroups=as.integer(numClasses)) +} + +# Compute the total feature count for each class +# and add the number of features to this sum +# for subsequent regularization (Laplace's rule) +classSums = rowSums(classFeatureCounts) + numFeatures*laplace_correction + +# Compute class conditional probabilities +ones = matrix(1, rows=1, cols=numFeatures) +repClassSums = classSums %*% ones +class_conditionals = (classFeatureCounts + laplace_correction) / repClassSums + +# Compute class priors +class_counts = aggregate(target=C, groups=C, fn="count", ngroups=as.integer(numClasses)) +class_prior = class_counts / numRows; + +# Compute accuracy on training set +ones = matrix(1, rows=numRows, cols=1) +D_w_ones = append(D, ones) +model = append(class_conditionals, class_prior) +log_probs = D_w_ones %*% t(log(model)) +pred = rowIndexMax(log_probs) +acc = sum(ppred(pred, C, "==")) / numRows * 100 + +acc_str = "Training Accuracy (%): " + acc +print(acc_str) +write(acc_str, $accuracy) + +# write out the model +write(class_prior, $prior, format=fmt); +write(class_conditionals, $conditionals, format=fmt);
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/naive-bayes-parfor/naive-bayes.pydml ---------------------------------------------------------------------- diff --git a/src/test/scripts/applications/naive-bayes-parfor/naive-bayes.pydml b/src/test/scripts/applications/naive-bayes-parfor/naive-bayes.pydml index 5d84951..462b330 100644 --- a/src/test/scripts/applications/naive-bayes-parfor/naive-bayes.pydml +++ b/src/test/scripts/applications/naive-bayes-parfor/naive-bayes.pydml @@ -1,79 +1,79 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -# Implements multinomial naive Bayes classifier with Laplace correction -# -# Example Usage: -# hadoop jar SystemML.jar -f naive-bayes.pydml -python -nvargs X=<Data> Y=<labels> classes=<Num Classes> laplace=<Laplace Correction> prior=<Model file1> conditionals=<Model file2> accuracy=<accuracy file> fmt="text" -# - -# defaults -# $laplace = 1 -fmt = ifdef($fmt, "text") - -# reading input args -numClasses = $classes -D = load($X) -C = load($Y) -laplace_correction = ifdef($laplace, 1) - -numRows = nrow(D) -numFeatures = ncol(D) - -# Compute conditionals - -# Compute the feature counts for each class -classFeatureCounts = full(0, rows=numClasses, cols=numFeatures) -parfor (i in 1:numFeatures): - Col = D[,i] - classFeatureCounts[,i] = aggregate(target=Col, groups=C, fn="sum", ngroups=numClasses) - -# Compute the total feature count for each class -# and add the number of features to this sum -# for subsequent regularization (Laplace's rule) -classSums = rowSums(classFeatureCounts) + numFeatures*laplace_correction - -# Compute class conditional probabilities -ones = full(1, rows=1, cols=numFeatures) -repClassSums = dot(classSums, ones) -class_conditionals = (classFeatureCounts + laplace_correction) / repClassSums - -# Compute class priors -class_counts = aggregate(target=C, groups=C, fn="count", ngroups=numClasses) -class_prior = class_counts / numRows - -# Compute accuracy on training set -ones = full(1, rows=numRows, cols=1) -D_w_ones = append(D, ones) -model = append(class_conditionals, class_prior) -log_model = log(model) -transpose_log_model = log_model.transpose() -log_probs = dot(D_w_ones, transpose_log_model) -pred = rowIndexMax(log_probs) -acc = sum(ppred(pred, C, "==")) / numRows * 100 - -acc_str = "Training Accuracy (%): " + acc -print(acc_str) -save(acc_str, $accuracy) - -# write out the model -save(class_prior, $prior, format=fmt) -save(class_conditionals, $conditionals, format=fmt) +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# Implements multinomial naive Bayes classifier with Laplace correction +# +# Example Usage: +# hadoop jar SystemML.jar -f naive-bayes.pydml -python -nvargs X=<Data> Y=<labels> classes=<Num Classes> laplace=<Laplace Correction> prior=<Model file1> conditionals=<Model file2> accuracy=<accuracy file> fmt="text" +# + +# defaults +# $laplace = 1 +fmt = ifdef($fmt, "text") + +# reading input args +numClasses = $classes +D = load($X) +C = load($Y) +laplace_correction = ifdef($laplace, 1) + +numRows = nrow(D) +numFeatures = ncol(D) + +# Compute conditionals + +# Compute the feature counts for each class +classFeatureCounts = full(0, rows=numClasses, cols=numFeatures) +parfor (i in 1:numFeatures): + Col = D[,i] + classFeatureCounts[,i] = aggregate(target=Col, groups=C, fn="sum", ngroups=numClasses) + +# Compute the total feature count for each class +# and add the number of features to this sum +# for subsequent regularization (Laplace's rule) +classSums = rowSums(classFeatureCounts) + numFeatures*laplace_correction + +# Compute class conditional probabilities +ones = full(1, rows=1, cols=numFeatures) +repClassSums = dot(classSums, ones) +class_conditionals = (classFeatureCounts + laplace_correction) / repClassSums + +# Compute class priors +class_counts = aggregate(target=C, groups=C, fn="count", ngroups=numClasses) +class_prior = class_counts / numRows + +# Compute accuracy on training set +ones = full(1, rows=numRows, cols=1) +D_w_ones = append(D, ones) +model = append(class_conditionals, class_prior) +log_model = log(model) +transpose_log_model = log_model.transpose() +log_probs = dot(D_w_ones, transpose_log_model) +pred = rowIndexMax(log_probs) +acc = sum(ppred(pred, C, "==")) / numRows * 100 + +acc_str = "Training Accuracy (%): " + acc +print(acc_str) +save(acc_str, $accuracy) + +# write out the model +save(class_prior, $prior, format=fmt) +save(class_conditionals, $conditionals, format=fmt) http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/naive-bayes/naive-bayes.R ---------------------------------------------------------------------- diff --git a/src/test/scripts/applications/naive-bayes/naive-bayes.R b/src/test/scripts/applications/naive-bayes/naive-bayes.R index dc65b8a..a3ca47a 100644 --- a/src/test/scripts/applications/naive-bayes/naive-bayes.R +++ b/src/test/scripts/applications/naive-bayes/naive-bayes.R @@ -1,71 +1,71 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -args <- commandArgs(TRUE) - -library("Matrix") - -D = as.matrix(readMM(paste(args[1], "X.mtx", sep=""))) -C = as.matrix(readMM(paste(args[1], "Y.mtx", sep=""))) - -# reading input args -numClasses = as.integer(args[2]); -laplace_correction = as.double(args[3]); - -numRows = nrow(D) -numFeatures = ncol(D) - -# Compute conditionals - -# Compute the feature counts for each class -classFeatureCounts = matrix(0, numClasses, numFeatures) -for (i in 1:numFeatures) { - Col = D[,i] - classFeatureCounts[,i] = aggregate(as.vector(Col), by=list(as.vector(C)), FUN=sum)[,2]; -} - -# Compute the total feature count for each class -# and add the number of features to this sum -# for subsequent regularization (Laplace's rule) -classSums = rowSums(classFeatureCounts) + numFeatures*laplace_correction - -# Compute class conditional probabilities -ones = matrix(1, 1, numFeatures) -repClassSums = classSums %*% ones; -class_conditionals = (classFeatureCounts + laplace_correction) / repClassSums; - -# Compute class priors -class_counts = aggregate(as.vector(C), by=list(as.vector(C)), FUN=length)[,2] -class_prior = class_counts / numRows; - -# Compute accuracy on training set -ones = matrix(1, numRows, 1) -D_w_ones = cbind(D, ones) -model = cbind(class_conditionals, class_prior) -log_probs = D_w_ones %*% t(log(model)) -pred = max.col(log_probs,ties.method="last"); -acc = sum(pred == C) / numRows * 100 - -print(paste("Training Accuracy (%): ", acc, sep="")) - -# write out the model -writeMM(as(class_prior, "CsparseMatrix"), paste(args[4], "prior", sep="")); -writeMM(as(class_conditionals, "CsparseMatrix"), paste(args[4], "conditionals", sep="")); +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +args <- commandArgs(TRUE) + +library("Matrix") + +D = as.matrix(readMM(paste(args[1], "X.mtx", sep=""))) +C = as.matrix(readMM(paste(args[1], "Y.mtx", sep=""))) + +# reading input args +numClasses = as.integer(args[2]); +laplace_correction = as.double(args[3]); + +numRows = nrow(D) +numFeatures = ncol(D) + +# Compute conditionals + +# Compute the feature counts for each class +classFeatureCounts = matrix(0, numClasses, numFeatures) +for (i in 1:numFeatures) { + Col = D[,i] + classFeatureCounts[,i] = aggregate(as.vector(Col), by=list(as.vector(C)), FUN=sum)[,2]; +} + +# Compute the total feature count for each class +# and add the number of features to this sum +# for subsequent regularization (Laplace's rule) +classSums = rowSums(classFeatureCounts) + numFeatures*laplace_correction + +# Compute class conditional probabilities +ones = matrix(1, 1, numFeatures) +repClassSums = classSums %*% ones; +class_conditionals = (classFeatureCounts + laplace_correction) / repClassSums; + +# Compute class priors +class_counts = aggregate(as.vector(C), by=list(as.vector(C)), FUN=length)[,2] +class_prior = class_counts / numRows; + +# Compute accuracy on training set +ones = matrix(1, numRows, 1) +D_w_ones = cbind(D, ones) +model = cbind(class_conditionals, class_prior) +log_probs = D_w_ones %*% t(log(model)) +pred = max.col(log_probs,ties.method="last"); +acc = sum(pred == C) / numRows * 100 + +print(paste("Training Accuracy (%): ", acc, sep="")) + +# write out the model +writeMM(as(class_prior, "CsparseMatrix"), paste(args[4], "prior", sep="")); +writeMM(as(class_conditionals, "CsparseMatrix"), paste(args[4], "conditionals", sep="")); http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/naive-bayes/naive-bayes.dml ---------------------------------------------------------------------- diff --git a/src/test/scripts/applications/naive-bayes/naive-bayes.dml b/src/test/scripts/applications/naive-bayes/naive-bayes.dml index 1fe1bf4..a81edea 100644 --- a/src/test/scripts/applications/naive-bayes/naive-bayes.dml +++ b/src/test/scripts/applications/naive-bayes/naive-bayes.dml @@ -1,67 +1,67 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -# Implements multinomial naive Bayes classifier with Laplace correction -# -# Example Usage: -# hadoop jar SystemML.jar -f naive-bayes.dml -nvargs X=<Data> Y=<labels> classes=<Num Classes> laplace=<Laplace Correction> prior=<Model file1> conditionals=<Model file2> accuracy=<accuracy file> fmt="text" -# - -# defaults -# $laplace = 1 -fmt = ifdef($fmt, "text") - -# reading input args -numClasses = $classes -D = read($X) -C = read($Y) -laplaceCorrection = ifdef($laplace, 1) - -numRows = nrow(D) -numFeatures = ncol(D) - -# Compute conditionals -# Compute the feature counts for each class -classFeatureCounts = aggregate(target=D, groups=C, fn="sum", ngroups=as.integer(numClasses)); - -# Compute the total feature count for each class -# and add the number of features to this sum -# for subsequent regularization (Laplace's rule) -classSums = rowSums(classFeatureCounts) + numFeatures*laplaceCorrection - -# Compute class conditional probabilities -classConditionals = (classFeatureCounts + laplaceCorrection) / classSums - -# Compute class priors -classCounts = aggregate(target=C, groups=C, fn="count", ngroups=as.integer(numClasses)) -classPrior = classCounts / numRows; - -# Compute accuracy on training set -logProbs = D %*% t(log(classConditionals)) + t(log(classPrior)); -acc = sum(rowIndexMax(logProbs) == C) / numRows * 100 - -acc_str = "Training Accuracy (%): " + acc -print(acc_str) -write(acc_str, $accuracy) - -# write out the model -write(classPrior, $prior, format=fmt); -write(classConditionals, $conditionals, format=fmt); +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# Implements multinomial naive Bayes classifier with Laplace correction +# +# Example Usage: +# hadoop jar SystemML.jar -f naive-bayes.dml -nvargs X=<Data> Y=<labels> classes=<Num Classes> laplace=<Laplace Correction> prior=<Model file1> conditionals=<Model file2> accuracy=<accuracy file> fmt="text" +# + +# defaults +# $laplace = 1 +fmt = ifdef($fmt, "text") + +# reading input args +numClasses = $classes +D = read($X) +C = read($Y) +laplaceCorrection = ifdef($laplace, 1) + +numRows = nrow(D) +numFeatures = ncol(D) + +# Compute conditionals +# Compute the feature counts for each class +classFeatureCounts = aggregate(target=D, groups=C, fn="sum", ngroups=as.integer(numClasses)); + +# Compute the total feature count for each class +# and add the number of features to this sum +# for subsequent regularization (Laplace's rule) +classSums = rowSums(classFeatureCounts) + numFeatures*laplaceCorrection + +# Compute class conditional probabilities +classConditionals = (classFeatureCounts + laplaceCorrection) / classSums + +# Compute class priors +classCounts = aggregate(target=C, groups=C, fn="count", ngroups=as.integer(numClasses)) +classPrior = classCounts / numRows; + +# Compute accuracy on training set +logProbs = D %*% t(log(classConditionals)) + t(log(classPrior)); +acc = sum(rowIndexMax(logProbs) == C) / numRows * 100 + +acc_str = "Training Accuracy (%): " + acc +print(acc_str) +write(acc_str, $accuracy) + +# write out the model +write(classPrior, $prior, format=fmt); +write(classConditionals, $conditionals, format=fmt); http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/naive-bayes/naive-bayes.pydml ---------------------------------------------------------------------- diff --git a/src/test/scripts/applications/naive-bayes/naive-bayes.pydml b/src/test/scripts/applications/naive-bayes/naive-bayes.pydml index 25fbf84..480265d 100644 --- a/src/test/scripts/applications/naive-bayes/naive-bayes.pydml +++ b/src/test/scripts/applications/naive-bayes/naive-bayes.pydml @@ -1,71 +1,71 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -# Implements multinomial naive Bayes classifier with Laplace correction -# -# Example Usage: -# hadoop jar SystemML.jar -f naive-bayes.pydml -python -nvargs X=<Data> Y=<labels> classes=<Num Classes> laplace=<Laplace Correction> prior=<Model file1> conditionals=<Model file2> accuracy=<accuracy file> fmt="text" -# - -# defaults -# $laplace = 1 -fmt = ifdef($fmt, "text") - -# reading input args -numClasses = $classes -D = load($X) -C = load($Y) -laplaceCorrection = ifdef($laplace, 1) - -numRows = nrow(D) -numFeatures = ncol(D) - -# Compute conditionals -# Compute the feature counts for each class -classFeatureCounts = aggregate(target=D, groups=C, fn="sum", ngroups=numClasses); - -# Compute the total feature count for each class -# and add the number of features to this sum -# for subsequent regularization (Laplace's rule) -classSums = rowSums(classFeatureCounts) + numFeatures*laplaceCorrection - -# Compute class conditional probabilities -classConditionals = (classFeatureCounts + laplaceCorrection) / classSums - -# Compute class priors -classCounts = aggregate(target=C, groups=C, fn="count", ngroups=numClasses) -classPrior = classCounts / numRows - -# Compute accuracy on training set -lmodel1 = log(classConditionals) -lmodel2 = log(classPrior) -tlmodel1 = lmodel1.transpose() -tlmodel2 = lmodel2.transpose() -logProbs = dot(D, tlmodel1) + tlmodel2 -acc = sum(rowIndexMax(logProbs) == C) / numRows * 100 - -acc_str = "Training Accuracy (%): " + acc -print(acc_str) -save(acc_str, $accuracy) - -# write out the model -save(classPrior, $prior, format=fmt) -save(classConditionals, $conditionals, format=fmt) +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# Implements multinomial naive Bayes classifier with Laplace correction +# +# Example Usage: +# hadoop jar SystemML.jar -f naive-bayes.pydml -python -nvargs X=<Data> Y=<labels> classes=<Num Classes> laplace=<Laplace Correction> prior=<Model file1> conditionals=<Model file2> accuracy=<accuracy file> fmt="text" +# + +# defaults +# $laplace = 1 +fmt = ifdef($fmt, "text") + +# reading input args +numClasses = $classes +D = load($X) +C = load($Y) +laplaceCorrection = ifdef($laplace, 1) + +numRows = nrow(D) +numFeatures = ncol(D) + +# Compute conditionals +# Compute the feature counts for each class +classFeatureCounts = aggregate(target=D, groups=C, fn="sum", ngroups=numClasses); + +# Compute the total feature count for each class +# and add the number of features to this sum +# for subsequent regularization (Laplace's rule) +classSums = rowSums(classFeatureCounts) + numFeatures*laplaceCorrection + +# Compute class conditional probabilities +classConditionals = (classFeatureCounts + laplaceCorrection) / classSums + +# Compute class priors +classCounts = aggregate(target=C, groups=C, fn="count", ngroups=numClasses) +classPrior = classCounts / numRows + +# Compute accuracy on training set +lmodel1 = log(classConditionals) +lmodel2 = log(classPrior) +tlmodel1 = lmodel1.transpose() +tlmodel2 = lmodel2.transpose() +logProbs = dot(D, tlmodel1) + tlmodel2 +acc = sum(rowIndexMax(logProbs) == C) / numRows * 100 + +acc_str = "Training Accuracy (%): " + acc +print(acc_str) +save(acc_str, $accuracy) + +# write out the model +save(classPrior, $prior, format=fmt) +save(classConditionals, $conditionals, format=fmt) http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/parfor/parfor_bivariate.R ---------------------------------------------------------------------- diff --git a/src/test/scripts/applications/parfor/parfor_bivariate.R b/src/test/scripts/applications/parfor/parfor_bivariate.R index 889b22b..82a5be5 100644 --- a/src/test/scripts/applications/parfor/parfor_bivariate.R +++ b/src/test/scripts/applications/parfor/parfor_bivariate.R @@ -1,155 +1,155 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -args <- commandArgs(TRUE) -options(digits=22) - -library("Matrix") - - -D1 <- readMM(paste(args[1], "D.mtx", sep="")) -S11 <- readMM(paste(args[1], "S1.mtx", sep="")) -S21 <- readMM(paste(args[1], "S2.mtx", sep="")) -K11 <- readMM(paste(args[1], "K1.mtx", sep="")) -K21 <- readMM(paste(args[1], "K2.mtx", sep="")) -D <- as.matrix(D1); -S1 <- as.matrix(S11); -S2 <- as.matrix(S21); -K1 <- as.matrix(K11); -K2 <- as.matrix(K21); - -numPairs <- ncol(S1) * ncol(S2); # number of attribute pairs (|S1|*|S2|) -maxC <- args[2]; # max number of categories in any categorical attribute - -s1size <- ncol(S1); -s2size <- ncol(S2); - -# R, chisq, cramers, spearman, eta, anovaf -numstats <- 8; -basestats <- array(0,dim=c(numstats,numPairs)); -cat_counts <- array(0,dim=c(maxC,numPairs)); -cat_means <- array(0,dim=c(maxC,numPairs)); -cat_vars <- array(0,dim=c(maxC,numPairs)); - - -for( i in 1:s1size ) { - a1 <- S1[,i]; - k1 <- K1[1,i]; - A1 <- as.matrix(D[,a1]); - - for( j in 1:s2size ) { - pairID <-(i-1)*s2size+j; - a2 <- S2[,j]; - k2 <- K2[1,j]; - A2 <- as.matrix(D[,a2]); - - if (k1 == k2) { - if (k1 == 1) { - # scale-scale - print("scale-scale"); - basestats[1,pairID] <- cor(D[,a1], D[,a2]); - #basestats[1,pairID] <- cor(A1, A2); - - print(basestats[1,pairID]); - } else { - # nominal-nominal or ordinal-ordinal - print("categorical-categorical"); - F <- table(A1,A2); - cst <- chisq.test(F); - chi_squared <- as.numeric(cst[1]); - degFreedom <- (nrow(F)-1)*(ncol(F)-1); - pValue <- as.numeric(cst[3]); - q <- min(dim(F)); - W <- sum(F); - cramers_v <- sqrt(chi_squared/(W*(q-1))); - - basestats[2,pairID] <- chi_squared; - basestats[3,pairID] <- degFreedom; - basestats[4,pairID] <- pValue; - basestats[5,pairID] <- cramers_v; - - if ( k1 == 3 ) { - # ordinal-ordinal - print("ordinal-ordinal"); - basestats[6,pairID] <- cor(A1,A2, method="spearman"); - } - } - } - else { - if (k1 == 1 || k2 == 1) { - # Scale-nominal/ordinal - print("scale-categorical"); - if ( k1 == 1 ) { - Av <- as.matrix(A2); - Yv <- as.matrix(A1); - } - else { - Av <- as.matrix(A1); - Yv <- as.matrix(A2); - } - - W <- nrow(Av); - my <- mean(Yv); - varY <- var(Yv); - - CFreqs <- as.matrix(table(Av)); - CMeans <- as.matrix(aggregate(Yv, by=list(Av), "mean")$V1); - CVars <- as.matrix(aggregate(Yv, by=list(Av), "var")$V1); - R <- nrow(CFreqs); - - Eta <- sqrt(1 - ( sum((CFreqs-1)*CVars) / ((W-1)*varY) )); - anova_num <- sum( (CFreqs*(CMeans-my)^2) )/(R-1); - anova_den <- sum( (CFreqs-1)*CVars )/(W-R); - ANOVAF <- anova_num/anova_den; - - basestats[7,pairID] <- Eta; - basestats[8,pairID] <- ANOVAF; - - cat_counts[ 1:length(CFreqs),pairID] <- CFreqs; - cat_means[ 1:length(CMeans),pairID] <- CMeans; - cat_vars[ 1:length(CVars),pairID] <- CVars; - } - else { - # nominal-ordinal or ordinal-nominal - print("nomial-ordinal"); #TODO should not be same code - F <- table(A1,A2); - cst <- chisq.test(F); - chi_squared <- as.numeric(cst[1]); - degFreedom <- (nrow(F)-1)*(ncol(F)-1); - pValue <- as.numeric(cst[3]); - q <- min(dim(F)); - W <- sum(F); - cramers_v <- sqrt(chi_squared/(W*(q-1))); - - basestats[2,pairID] <- chi_squared; - basestats[3,pairID] <- degFreedom; - basestats[4,pairID] <- pValue; - basestats[5,pairID] <- cramers_v; - } - } - } -} - -writeMM(as(basestats, "CsparseMatrix"), paste(args[3], "bivar.stats", sep="")); -writeMM(as(cat_counts, "CsparseMatrix"), paste(args[3], "category.counts", sep="")); -writeMM(as(cat_means, "CsparseMatrix"), paste(args[3], "category.means", sep="")); -writeMM(as(cat_vars, "CsparseMatrix"), paste(args[3], "category.variances", sep="")); - +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +args <- commandArgs(TRUE) +options(digits=22) + +library("Matrix") + + +D1 <- readMM(paste(args[1], "D.mtx", sep="")) +S11 <- readMM(paste(args[1], "S1.mtx", sep="")) +S21 <- readMM(paste(args[1], "S2.mtx", sep="")) +K11 <- readMM(paste(args[1], "K1.mtx", sep="")) +K21 <- readMM(paste(args[1], "K2.mtx", sep="")) +D <- as.matrix(D1); +S1 <- as.matrix(S11); +S2 <- as.matrix(S21); +K1 <- as.matrix(K11); +K2 <- as.matrix(K21); + +numPairs <- ncol(S1) * ncol(S2); # number of attribute pairs (|S1|*|S2|) +maxC <- args[2]; # max number of categories in any categorical attribute + +s1size <- ncol(S1); +s2size <- ncol(S2); + +# R, chisq, cramers, spearman, eta, anovaf +numstats <- 8; +basestats <- array(0,dim=c(numstats,numPairs)); +cat_counts <- array(0,dim=c(maxC,numPairs)); +cat_means <- array(0,dim=c(maxC,numPairs)); +cat_vars <- array(0,dim=c(maxC,numPairs)); + + +for( i in 1:s1size ) { + a1 <- S1[,i]; + k1 <- K1[1,i]; + A1 <- as.matrix(D[,a1]); + + for( j in 1:s2size ) { + pairID <-(i-1)*s2size+j; + a2 <- S2[,j]; + k2 <- K2[1,j]; + A2 <- as.matrix(D[,a2]); + + if (k1 == k2) { + if (k1 == 1) { + # scale-scale + print("scale-scale"); + basestats[1,pairID] <- cor(D[,a1], D[,a2]); + #basestats[1,pairID] <- cor(A1, A2); + + print(basestats[1,pairID]); + } else { + # nominal-nominal or ordinal-ordinal + print("categorical-categorical"); + F <- table(A1,A2); + cst <- chisq.test(F); + chi_squared <- as.numeric(cst[1]); + degFreedom <- (nrow(F)-1)*(ncol(F)-1); + pValue <- as.numeric(cst[3]); + q <- min(dim(F)); + W <- sum(F); + cramers_v <- sqrt(chi_squared/(W*(q-1))); + + basestats[2,pairID] <- chi_squared; + basestats[3,pairID] <- degFreedom; + basestats[4,pairID] <- pValue; + basestats[5,pairID] <- cramers_v; + + if ( k1 == 3 ) { + # ordinal-ordinal + print("ordinal-ordinal"); + basestats[6,pairID] <- cor(A1,A2, method="spearman"); + } + } + } + else { + if (k1 == 1 || k2 == 1) { + # Scale-nominal/ordinal + print("scale-categorical"); + if ( k1 == 1 ) { + Av <- as.matrix(A2); + Yv <- as.matrix(A1); + } + else { + Av <- as.matrix(A1); + Yv <- as.matrix(A2); + } + + W <- nrow(Av); + my <- mean(Yv); + varY <- var(Yv); + + CFreqs <- as.matrix(table(Av)); + CMeans <- as.matrix(aggregate(Yv, by=list(Av), "mean")$V1); + CVars <- as.matrix(aggregate(Yv, by=list(Av), "var")$V1); + R <- nrow(CFreqs); + + Eta <- sqrt(1 - ( sum((CFreqs-1)*CVars) / ((W-1)*varY) )); + anova_num <- sum( (CFreqs*(CMeans-my)^2) )/(R-1); + anova_den <- sum( (CFreqs-1)*CVars )/(W-R); + ANOVAF <- anova_num/anova_den; + + basestats[7,pairID] <- Eta; + basestats[8,pairID] <- ANOVAF; + + cat_counts[ 1:length(CFreqs),pairID] <- CFreqs; + cat_means[ 1:length(CMeans),pairID] <- CMeans; + cat_vars[ 1:length(CVars),pairID] <- CVars; + } + else { + # nominal-ordinal or ordinal-nominal + print("nomial-ordinal"); #TODO should not be same code + F <- table(A1,A2); + cst <- chisq.test(F); + chi_squared <- as.numeric(cst[1]); + degFreedom <- (nrow(F)-1)*(ncol(F)-1); + pValue <- as.numeric(cst[3]); + q <- min(dim(F)); + W <- sum(F); + cramers_v <- sqrt(chi_squared/(W*(q-1))); + + basestats[2,pairID] <- chi_squared; + basestats[3,pairID] <- degFreedom; + basestats[4,pairID] <- pValue; + basestats[5,pairID] <- cramers_v; + } + } + } +} + +writeMM(as(basestats, "CsparseMatrix"), paste(args[3], "bivar.stats", sep="")); +writeMM(as(cat_counts, "CsparseMatrix"), paste(args[3], "category.counts", sep="")); +writeMM(as(cat_means, "CsparseMatrix"), paste(args[3], "category.means", sep="")); +writeMM(as(cat_vars, "CsparseMatrix"), paste(args[3], "category.variances", sep="")); + http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/parfor/parfor_bivariate0.dml ---------------------------------------------------------------------- diff --git a/src/test/scripts/applications/parfor/parfor_bivariate0.dml b/src/test/scripts/applications/parfor/parfor_bivariate0.dml index ac79abd..341a0b1 100644 --- a/src/test/scripts/applications/parfor/parfor_bivariate0.dml +++ b/src/test/scripts/applications/parfor/parfor_bivariate0.dml @@ -1,265 +1,265 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - - -/* - * - * For a given pair of attribute sets, compute bivariate statistics between all attribute pairs - * Given, S_1 = {A_11, A_12, ... A_1m} and S_2 = {A_21, A_22, ... A_2n} - * compute bivariate stats for m*n pairs (A_1i, A_2j), (1<= i <=m) and (1<= j <=n) - * - * Seven inputs: - * $1) D - input data - * $2) S1 - First attribute set {A_11, A_12, ... A_1m} - * $3) S2 - Second attribute set {A_21, A_22, ... A_2n} - * $4) K1 - kind for attributes in S1 - * $5) K2 - kind for attributes in S2 - * kind=1 for scale, kind=2 for nominal, kind=3 for ordinal - * $6) numPairs - total number of pairs (m*n) - * $7) maxC - maximum number of categories in any categorical attribute - * - * One output: - * $6) output directory in which following four statistics files are created - * + bivar.stats - matrix with all 8 bivariate statistics computed for different attribute pairs - * (R, (chi-sq, df, pval, cramersv), spearman, Eta, F) - * + categorical.counts - - * + categorical.means - - * + categorical.variances - - * -> Values in these three matrices are applicable only for scale-categorical attribute pairs. - * k^th column in these matrices denote the attribute pair (A_1i,A_2j) where i*j = k. - */ - -D = read($1, rows=$7, cols=$8); # input data set -S1 = read($2, rows=1, cols=$9); # attribute set 1 -S2 = read($3, rows=1, cols=$9); # attribute set 2 -K1 = read($4, rows=1, cols=$9); # kind for attributes in S1 -K2 = read($5, rows=1, cols=$9); # kind for attributes in S2 -numPairs = $10; # number of attribute pairs (|S1|*|S2|) -maxC = $11; # max number of categories in any categorical attribute - -s1size = ncol(S1); -s2size = ncol(S2); - -# R, chisq, cramers, spearman, eta, anovaf -numstats = 8; -basestats = matrix(0, rows=numstats, cols=numPairs); -cat_counts = matrix(0, rows=maxC, cols=numPairs); -cat_means = matrix(0, rows=maxC, cols=numPairs); -cat_vars = matrix(0, rows=maxC, cols=numPairs); - - -for( i in 1:s1size ) { - a1 = castAsScalar(S1[,i]); - k1 = castAsScalar(K1[1,i]); - A1 = D[,a1]; - #print("a1="+a1); - - for( j in 1:s2size ) { - pairID = (i-1)*s2size+j; - #print("ID="+pairID+"(i="+i+",j="+j+")"); - a2 = castAsScalar(S2[,j]); - k2 = castAsScalar(K2[1,j]); - A2 = D[,a2]; - #print("a2="+a2); - - if (k1 == k2) { - if (k1 == 1) { - # scale-scale - print("[" + i + "," + j + "] scale-scale"); - r = bivar_ss(A1,A2); - basestats[1,pairID] = r; - #print("scale:"+r); - } else { - # nominal-nominal or ordinal-ordinal - print("[" + i + "," + j + "] categorical-categorical"); - [chisq, df, pval, cramersv] = bivar_cc(A1,A2); - basestats[2,pairID] = chisq; - basestats[3,pairID] = df; - basestats[4,pairID] = pval; - basestats[5,pairID] = cramersv; - - if ( k1 == 3 ) { - # ordinal-ordinal - print("[" + i + "," + j + "] ordinal-ordinal"); - sp = bivar_oo(A1, A2); - basestats[6,pairID] = sp; - } - } - } - else { - if (k1 == 1 | k2 == 1) { - # Scale-nominal/ordinal TODO MB correctness errors - print("[" + i + "," + j + "] scale-categorical"); - - if ( k1 == 1 ) { - [eta,f, counts, means, vars] = bivar_sc(A1,A2); - } - else { - [eta,f, counts, means, vars] = bivar_sc(A2,A1); - } - basestats[7,pairID] = eta; - basestats[8,pairID] = f; - cat_counts[,pairID] = counts; - cat_means[,pairID] = means; - cat_vars[,pairID] = vars; - } - else { - # nominal-ordinal or ordinal-nominal - print("[" + i + "," + j + "] categorical-categorical"); - [chisq, df, pval, cramersv] = bivar_cc(A1,A2); - basestats[2,pairID] = chisq; - basestats[3,pairID] = df; - basestats[4,pairID] = pval; - basestats[5,pairID] = cramersv; - } - } - } -} - -write(basestats, $6 + "/bivar.stats"); -write(cat_counts, $6 + "/category.counts"); -write(cat_means, $6 + "/category.means"); -write(cat_vars, $6 + "/category.variances"); - - -# ----------------------------------------------------------------------------------------------------------- - -bivar_cc = function(Matrix[Double] A, Matrix[Double] B) return (Double chisq, Double df, Double pval, Double cramersv) { - - # Contingency Table - F = table(A,B); - - # Chi-Squared - W = sum(F); - r = rowSums(F); - c = colSums(F); - E = (r %*% c)/W; - T = (F-E)^2/E; - chi_squared = sum(T); - - # compute p-value - degFreedom = (nrow(F)-1)*(ncol(F)-1); - pValue = pchisq(target=chi_squared, df=degFreedom, lower.tail=FALSE); - - # Cramer's V - R = nrow(F); - C = ncol(F); - q = min(R,C); - cramers_v = sqrt(chi_squared/(W*(q-1))); - - # Assign return values - chisq = chi_squared; - df = as.double(degFreedom); - pval = pValue; - cramersv = cramers_v; -} - -# ----------------------------------------------------------------------------------------------------------- - -bivar_ss = function(Matrix[Double] X, Matrix[Double] Y) return (Double R) { - - # Unweighted co-variance - covXY = cov(X,Y); - - # compute standard deviations for both X and Y by computing 2^nd central moment - W = nrow(X); - m2X = moment(X,2); - m2Y = moment(Y,2); - sigmaX = sqrt(m2X * (W/(W-1.0)) ); - sigmaY = sqrt(m2Y * (W/(W-1.0)) ); - - # Pearson's R - R = covXY / (sigmaX*sigmaY); - #print("Rmx="+m2X); - #print("Rmy="+m2Y); - #print("Rcov="+covXY); - #print("Rsx="+sigmaX); - #print("Rsy="+sigmaY); - #print("R="+R); -} - -# ----------------------------------------------------------------------------------------------------------- - -# Y points to SCALE variable -# A points to CATEGORICAL variable -bivar_sc = function(Matrix[Double] Y, Matrix[Double] A) return (Double Eta, Double AnovaF, Matrix[Double] CFreqs, Matrix[Double] CMeans, Matrix[Double] CVars ) { - - # mean and variance in target variable - W = nrow(A); - my = mean(Y); - varY = moment(Y,2) * W/(W-1.0) - - # category-wise (frequencies, means, variances) - CFreqs = aggregate(target=Y, groups=A, fn="count"); - CMeans = aggregate(target=Y, groups=A, fn="mean"); - CVars = aggregate(target=Y, groups=A, fn="variance"); - - # number of categories - R = nrow(CFreqs); - - Eta = sqrt(1 - ( sum((CFreqs-1)*CVars) / ((W-1)*varY) )); - - anova_num = sum( (CFreqs*(CMeans-my)^2) )/(R-1); - anova_den = sum( (CFreqs-1)*CVars )/(W-R); - AnovaF = anova_num/anova_den; -} - -# ----------------------------------------------------------------------------------------------------------- -# Function to compute ranks -# takes a column vector as input, and produces a vector of same size in which each cell denotes to the computed score for that category -computeRanks = function(Matrix[Double] X) return (Matrix[Double] Ranks) { - Ranks = cumsum(X) - X/2 + 1/2; -} - -#------------------------------------------------------------------------- - -bivar_oo = function(Matrix[Double] A, Matrix[Double] B) return (Double sp) { - - # compute contingency table - F = table(A,B); - - catA = nrow(F); # number of categories in A - catB = ncol(F); # number of categories in B - - # compute category-wise counts for both the attributes - R = rowSums(F); - S = colSums(F); - - # compute scores, both are column vectors - [C] = computeRanks(R); - meanX = mean(C,R); - - columnS = t(S); - [D] = computeRanks(columnS); - - # scores (C,D) are individual values, and counts (R,S) act as weights - meanY = mean(D,columnS); - - W = sum(F); # total weight, or total #cases - varX = moment(C,R,2)*(W/(W-1.0)); - varY = moment(D,columnS,2)*(W/(W-1.0)); - covXY = sum( t(F/(W-1) * (C-meanX)) * (D-meanY) ); - - sp = covXY/(sqrt(varX)*sqrt(varY)); -} - -# ----------------------------------------------------------------------------------------------------------- +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + + +/* + * + * For a given pair of attribute sets, compute bivariate statistics between all attribute pairs + * Given, S_1 = {A_11, A_12, ... A_1m} and S_2 = {A_21, A_22, ... A_2n} + * compute bivariate stats for m*n pairs (A_1i, A_2j), (1<= i <=m) and (1<= j <=n) + * + * Seven inputs: + * $1) D - input data + * $2) S1 - First attribute set {A_11, A_12, ... A_1m} + * $3) S2 - Second attribute set {A_21, A_22, ... A_2n} + * $4) K1 - kind for attributes in S1 + * $5) K2 - kind for attributes in S2 + * kind=1 for scale, kind=2 for nominal, kind=3 for ordinal + * $6) numPairs - total number of pairs (m*n) + * $7) maxC - maximum number of categories in any categorical attribute + * + * One output: + * $6) output directory in which following four statistics files are created + * + bivar.stats - matrix with all 8 bivariate statistics computed for different attribute pairs + * (R, (chi-sq, df, pval, cramersv), spearman, Eta, F) + * + categorical.counts - + * + categorical.means - + * + categorical.variances - + * -> Values in these three matrices are applicable only for scale-categorical attribute pairs. + * k^th column in these matrices denote the attribute pair (A_1i,A_2j) where i*j = k. + */ + +D = read($1, rows=$7, cols=$8); # input data set +S1 = read($2, rows=1, cols=$9); # attribute set 1 +S2 = read($3, rows=1, cols=$9); # attribute set 2 +K1 = read($4, rows=1, cols=$9); # kind for attributes in S1 +K2 = read($5, rows=1, cols=$9); # kind for attributes in S2 +numPairs = $10; # number of attribute pairs (|S1|*|S2|) +maxC = $11; # max number of categories in any categorical attribute + +s1size = ncol(S1); +s2size = ncol(S2); + +# R, chisq, cramers, spearman, eta, anovaf +numstats = 8; +basestats = matrix(0, rows=numstats, cols=numPairs); +cat_counts = matrix(0, rows=maxC, cols=numPairs); +cat_means = matrix(0, rows=maxC, cols=numPairs); +cat_vars = matrix(0, rows=maxC, cols=numPairs); + + +for( i in 1:s1size ) { + a1 = castAsScalar(S1[,i]); + k1 = castAsScalar(K1[1,i]); + A1 = D[,a1]; + #print("a1="+a1); + + for( j in 1:s2size ) { + pairID = (i-1)*s2size+j; + #print("ID="+pairID+"(i="+i+",j="+j+")"); + a2 = castAsScalar(S2[,j]); + k2 = castAsScalar(K2[1,j]); + A2 = D[,a2]; + #print("a2="+a2); + + if (k1 == k2) { + if (k1 == 1) { + # scale-scale + print("[" + i + "," + j + "] scale-scale"); + r = bivar_ss(A1,A2); + basestats[1,pairID] = r; + #print("scale:"+r); + } else { + # nominal-nominal or ordinal-ordinal + print("[" + i + "," + j + "] categorical-categorical"); + [chisq, df, pval, cramersv] = bivar_cc(A1,A2); + basestats[2,pairID] = chisq; + basestats[3,pairID] = df; + basestats[4,pairID] = pval; + basestats[5,pairID] = cramersv; + + if ( k1 == 3 ) { + # ordinal-ordinal + print("[" + i + "," + j + "] ordinal-ordinal"); + sp = bivar_oo(A1, A2); + basestats[6,pairID] = sp; + } + } + } + else { + if (k1 == 1 | k2 == 1) { + # Scale-nominal/ordinal TODO MB correctness errors + print("[" + i + "," + j + "] scale-categorical"); + + if ( k1 == 1 ) { + [eta,f, counts, means, vars] = bivar_sc(A1,A2); + } + else { + [eta,f, counts, means, vars] = bivar_sc(A2,A1); + } + basestats[7,pairID] = eta; + basestats[8,pairID] = f; + cat_counts[,pairID] = counts; + cat_means[,pairID] = means; + cat_vars[,pairID] = vars; + } + else { + # nominal-ordinal or ordinal-nominal + print("[" + i + "," + j + "] categorical-categorical"); + [chisq, df, pval, cramersv] = bivar_cc(A1,A2); + basestats[2,pairID] = chisq; + basestats[3,pairID] = df; + basestats[4,pairID] = pval; + basestats[5,pairID] = cramersv; + } + } + } +} + +write(basestats, $6 + "/bivar.stats"); +write(cat_counts, $6 + "/category.counts"); +write(cat_means, $6 + "/category.means"); +write(cat_vars, $6 + "/category.variances"); + + +# ----------------------------------------------------------------------------------------------------------- + +bivar_cc = function(Matrix[Double] A, Matrix[Double] B) return (Double chisq, Double df, Double pval, Double cramersv) { + + # Contingency Table + F = table(A,B); + + # Chi-Squared + W = sum(F); + r = rowSums(F); + c = colSums(F); + E = (r %*% c)/W; + T = (F-E)^2/E; + chi_squared = sum(T); + + # compute p-value + degFreedom = (nrow(F)-1)*(ncol(F)-1); + pValue = pchisq(target=chi_squared, df=degFreedom, lower.tail=FALSE); + + # Cramer's V + R = nrow(F); + C = ncol(F); + q = min(R,C); + cramers_v = sqrt(chi_squared/(W*(q-1))); + + # Assign return values + chisq = chi_squared; + df = as.double(degFreedom); + pval = pValue; + cramersv = cramers_v; +} + +# ----------------------------------------------------------------------------------------------------------- + +bivar_ss = function(Matrix[Double] X, Matrix[Double] Y) return (Double R) { + + # Unweighted co-variance + covXY = cov(X,Y); + + # compute standard deviations for both X and Y by computing 2^nd central moment + W = nrow(X); + m2X = moment(X,2); + m2Y = moment(Y,2); + sigmaX = sqrt(m2X * (W/(W-1.0)) ); + sigmaY = sqrt(m2Y * (W/(W-1.0)) ); + + # Pearson's R + R = covXY / (sigmaX*sigmaY); + #print("Rmx="+m2X); + #print("Rmy="+m2Y); + #print("Rcov="+covXY); + #print("Rsx="+sigmaX); + #print("Rsy="+sigmaY); + #print("R="+R); +} + +# ----------------------------------------------------------------------------------------------------------- + +# Y points to SCALE variable +# A points to CATEGORICAL variable +bivar_sc = function(Matrix[Double] Y, Matrix[Double] A) return (Double Eta, Double AnovaF, Matrix[Double] CFreqs, Matrix[Double] CMeans, Matrix[Double] CVars ) { + + # mean and variance in target variable + W = nrow(A); + my = mean(Y); + varY = moment(Y,2) * W/(W-1.0) + + # category-wise (frequencies, means, variances) + CFreqs = aggregate(target=Y, groups=A, fn="count"); + CMeans = aggregate(target=Y, groups=A, fn="mean"); + CVars = aggregate(target=Y, groups=A, fn="variance"); + + # number of categories + R = nrow(CFreqs); + + Eta = sqrt(1 - ( sum((CFreqs-1)*CVars) / ((W-1)*varY) )); + + anova_num = sum( (CFreqs*(CMeans-my)^2) )/(R-1); + anova_den = sum( (CFreqs-1)*CVars )/(W-R); + AnovaF = anova_num/anova_den; +} + +# ----------------------------------------------------------------------------------------------------------- +# Function to compute ranks +# takes a column vector as input, and produces a vector of same size in which each cell denotes to the computed score for that category +computeRanks = function(Matrix[Double] X) return (Matrix[Double] Ranks) { + Ranks = cumsum(X) - X/2 + 1/2; +} + +#------------------------------------------------------------------------- + +bivar_oo = function(Matrix[Double] A, Matrix[Double] B) return (Double sp) { + + # compute contingency table + F = table(A,B); + + catA = nrow(F); # number of categories in A + catB = ncol(F); # number of categories in B + + # compute category-wise counts for both the attributes + R = rowSums(F); + S = colSums(F); + + # compute scores, both are column vectors + [C] = computeRanks(R); + meanX = mean(C,R); + + columnS = t(S); + [D] = computeRanks(columnS); + + # scores (C,D) are individual values, and counts (R,S) act as weights + meanY = mean(D,columnS); + + W = sum(F); # total weight, or total #cases + varX = moment(C,R,2)*(W/(W-1.0)); + varY = moment(D,columnS,2)*(W/(W-1.0)); + covXY = sum( t(F/(W-1) * (C-meanX)) * (D-meanY) ); + + sp = covXY/(sqrt(varX)*sqrt(varY)); +} + +# ----------------------------------------------------------------------------------------------------------- \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/parfor/parfor_bivariate1.dml ---------------------------------------------------------------------- diff --git a/src/test/scripts/applications/parfor/parfor_bivariate1.dml b/src/test/scripts/applications/parfor/parfor_bivariate1.dml index 69a7ad6..aa40d7c 100644 --- a/src/test/scripts/applications/parfor/parfor_bivariate1.dml +++ b/src/test/scripts/applications/parfor/parfor_bivariate1.dml @@ -1,256 +1,256 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - - -/* - * - * For a given pair of attribute sets, compute bivariate statistics between all attribute pairs - * Given, S_1 = {A_11, A_12, ... A_1m} and S_2 = {A_21, A_22, ... A_2n} - * compute bivariate stats for m*n pairs (A_1i, A_2j), (1<= i <=m) and (1<= j <=n) - * - * Seven inputs: - * $1) D - input data - * $2) S1 - First attribute set {A_11, A_12, ... A_1m} - * $3) S2 - Second attribute set {A_21, A_22, ... A_2n} - * $4) K1 - kind for attributes in S1 - * $5) K2 - kind for attributes in S2 - * kind=1 for scale, kind=2 for nominal, kind=3 for ordinal - * $6) numPairs - total number of pairs (m*n) - * $7) maxC - maximum number of categories in any categorical attribute - * - * One output: - * $6) output directory in which following four statistics files are created - * + bivar.stats - matrix with all 8 bivariate statistics computed for different attribute pairs - * (R, (chi-sq, df, pval, cramersv), spearman, Eta, F) - * + categorical.counts - - * + categorical.means - - * + categorical.variances - - * -> Values in these three matrices are applicable only for scale-categorical attribute pairs. - * k^th column in these matrices denote the attribute pair (A_1i,A_2j) where i*j = k. - */ - -D = read($1, rows=$7, cols=$8); # input data set -S1 = read($2, rows=1, cols=$9); # attribute set 1 -S2 = read($3, rows=1, cols=$9); # attribute set 2 -K1 = read($4, rows=1, cols=$9); # kind for attributes in S1 -K2 = read($5, rows=1, cols=$9); # kind for attributes in S2 -numPairs = $10; # number of attribute pairs (|S1|*|S2|) -maxC = $11; # max number of categories in any categorical attribute - -s1size = ncol(S1); -s2size = ncol(S2); - -# R, chisq, cramers, spearman, eta, anovaf -numstats = 8; -basestats = matrix(0, rows=numstats, cols=numPairs); -cat_counts = matrix(0, rows=maxC, cols=numPairs); -cat_means = matrix(0, rows=maxC, cols=numPairs); -cat_vars = matrix(0, rows=maxC, cols=numPairs); - - -parfor( i in 1:s1size, par=4, mode=LOCAL, check=0, opt=NONE) { - a1 = castAsScalar(S1[,i]); - k1 = castAsScalar(K1[1,i]); - A1 = D[,a1]; - - parfor( j in 1:s2size, par=4, mode=LOCAL, check=0, opt=NONE) { - pairID = (i-1)*s2size+j; - a2 = castAsScalar(S2[,j]); - k2 = castAsScalar(K2[1,j]); - A2 = D[,a2]; - - if (k1 == k2) { - if (k1 == 1) { - # scale-scale - print("[" + i + "," + j + "] scale-scale"); - r = bivar_ss(A1,A2); - basestats[1,pairID] = r; - } else { - # nominal-nominal or ordinal-ordinal - print("[" + i + "," + j + "] categorical-categorical"); - [chisq, df, pval, cramersv] = bivar_cc(A1,A2); - basestats[2,pairID] = chisq; - basestats[3,pairID] = df; - basestats[4,pairID] = pval; - basestats[5,pairID] = cramersv; - - if ( k1 == 3 ) { - # ordinal-ordinal - print("[" + i + "," + j + "] ordinal-ordinal"); - sp = bivar_oo(A1, A2); - basestats[6,pairID] = sp; - } - } - } - else { - if (k1 == 1 | k2 == 1) { - # Scale-nominal/ordinal TODO MB correctness errors - print("[" + i + "," + j + "] scale-categorical"); - - if ( k1 == 1 ) { - [eta,f, counts, means, vars] = bivar_sc(A1,A2); - } - else { - [eta,f, counts, means, vars] = bivar_sc(A2,A1); - } - basestats[7,pairID] = eta; - basestats[8,pairID] = f; - cat_counts[,pairID] = counts; - cat_means[,pairID] = means; - cat_vars[,pairID] = vars; - } - else { - # nominal-ordinal or ordinal-nominal - print("[" + i + "," + j + "] categorical-categorical"); - [chisq, df, pval, cramersv] = bivar_cc(A1,A2); - basestats[2,pairID] = chisq; - basestats[3,pairID] = df; - basestats[4,pairID] = pval; - basestats[5,pairID] = cramersv; - } - } - } -} - -write(basestats, $6 + "/bivar.stats"); -write(cat_counts, $6 + "/category.counts"); -write(cat_means, $6 + "/category.means"); -write(cat_vars, $6 + "/category.variances"); - - -# ----------------------------------------------------------------------------------------------------------- - -bivar_cc = function(Matrix[Double] A, Matrix[Double] B) return (Double chisq, Double df, Double pval, Double cramersv) { - - # Contingency Table - F = table(A,B); - - # Chi-Squared - W = sum(F); - r = rowSums(F); - c = colSums(F); - E = (r %*% c)/W; - T = (F-E)^2/E; - chi_squared = sum(T); - - # compute p-value - degFreedom = (nrow(F)-1)*(ncol(F)-1); - pValue = pchisq(target=chi_squared, df=degFreedom, lower.tail=FALSE); - - # Cramer's V - R = nrow(F); - C = ncol(F); - q = min(R,C); - cramers_v = sqrt(chi_squared/(W*(q-1))); - - # Assign return values - chisq = chi_squared; - df = as.double(degFreedom); - pval = pValue; - cramersv = cramers_v; -} - -# ----------------------------------------------------------------------------------------------------------- - -bivar_ss = function(Matrix[Double] X, Matrix[Double] Y) return (Double R) { - - # Unweighted co-variance - covXY = cov(X,Y); - - # compute standard deviations for both X and Y by computing 2^nd central moment - W = nrow(X); - m2X = moment(X,2); - m2Y = moment(Y,2); - sigmaX = sqrt(m2X * (W/(W-1.0)) ); - sigmaY = sqrt(m2Y * (W/(W-1.0)) ); - - # Pearson's R - R = covXY / (sigmaX*sigmaY); -} - -# ----------------------------------------------------------------------------------------------------------- - -# Y points to SCALE variable -# A points to CATEGORICAL variable -bivar_sc = function(Matrix[Double] Y, Matrix[Double] A) return (Double Eta, Double AnovaF, Matrix[Double] CFreqs, Matrix[Double] CMeans, Matrix[Double] CVars ) { - - # mean and variance in target variable - W = nrow(A); - my = mean(Y); - varY = moment(Y,2) * W/(W-1.0) - - # category-wise (frequencies, means, variances) - CFreqs = aggregate(target=Y, groups=A, fn="count"); - CMeans = aggregate(target=Y, groups=A, fn="mean"); - CVars = aggregate(target=Y, groups=A, fn="variance"); - - # number of categories - R = nrow(CFreqs); - - Eta = sqrt(1 - ( sum((CFreqs-1)*CVars) / ((W-1)*varY) )); - - anova_num = sum( (CFreqs*(CMeans-my)^2) )/(R-1); - anova_den = sum( (CFreqs-1)*CVars )/(W-R); - AnovaF = anova_num/anova_den; -} - -# ----------------------------------------------------------------------------------------------------------- -# Function to compute ranks -# takes a column vector as input, and produces a vector of same size in which each cell denotes to the computed score for that category -computeRanks = function(Matrix[Double] X) return (Matrix[Double] Ranks) { - Ranks = cumsum(X) - X/2 + 1/2; -} - -#------------------------------------------------------------------------- - -bivar_oo = function(Matrix[Double] A, Matrix[Double] B) return (Double sp) { - - # compute contingency table - F = table(A,B); - - catA = nrow(F); # number of categories in A - catB = ncol(F); # number of categories in B - - # compute category-wise counts for both the attributes - R = rowSums(F); - S = colSums(F); - - # compute scores, both are column vectors - [C] = computeRanks(R); - meanX = mean(C,R); - - columnS = t(S); - [D] = computeRanks(columnS); - - # scores (C,D) are individual values, and counts (R,S) act as weights - meanY = mean(D,columnS); - - W = sum(F); # total weight, or total #cases - varX = moment(C,R,2)*(W/(W-1.0)); - varY = moment(D,columnS,2)*(W/(W-1.0)); - covXY = sum( t(F/(W-1) * (C-meanX)) * (D-meanY) ); - - sp = covXY/(sqrt(varX)*sqrt(varY)); -} - -# ----------------------------------------------------------------------------------------------------------- - - +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + + +/* + * + * For a given pair of attribute sets, compute bivariate statistics between all attribute pairs + * Given, S_1 = {A_11, A_12, ... A_1m} and S_2 = {A_21, A_22, ... A_2n} + * compute bivariate stats for m*n pairs (A_1i, A_2j), (1<= i <=m) and (1<= j <=n) + * + * Seven inputs: + * $1) D - input data + * $2) S1 - First attribute set {A_11, A_12, ... A_1m} + * $3) S2 - Second attribute set {A_21, A_22, ... A_2n} + * $4) K1 - kind for attributes in S1 + * $5) K2 - kind for attributes in S2 + * kind=1 for scale, kind=2 for nominal, kind=3 for ordinal + * $6) numPairs - total number of pairs (m*n) + * $7) maxC - maximum number of categories in any categorical attribute + * + * One output: + * $6) output directory in which following four statistics files are created + * + bivar.stats - matrix with all 8 bivariate statistics computed for different attribute pairs + * (R, (chi-sq, df, pval, cramersv), spearman, Eta, F) + * + categorical.counts - + * + categorical.means - + * + categorical.variances - + * -> Values in these three matrices are applicable only for scale-categorical attribute pairs. + * k^th column in these matrices denote the attribute pair (A_1i,A_2j) where i*j = k. + */ + +D = read($1, rows=$7, cols=$8); # input data set +S1 = read($2, rows=1, cols=$9); # attribute set 1 +S2 = read($3, rows=1, cols=$9); # attribute set 2 +K1 = read($4, rows=1, cols=$9); # kind for attributes in S1 +K2 = read($5, rows=1, cols=$9); # kind for attributes in S2 +numPairs = $10; # number of attribute pairs (|S1|*|S2|) +maxC = $11; # max number of categories in any categorical attribute + +s1size = ncol(S1); +s2size = ncol(S2); + +# R, chisq, cramers, spearman, eta, anovaf +numstats = 8; +basestats = matrix(0, rows=numstats, cols=numPairs); +cat_counts = matrix(0, rows=maxC, cols=numPairs); +cat_means = matrix(0, rows=maxC, cols=numPairs); +cat_vars = matrix(0, rows=maxC, cols=numPairs); + + +parfor( i in 1:s1size, par=4, mode=LOCAL, check=0, opt=NONE) { + a1 = castAsScalar(S1[,i]); + k1 = castAsScalar(K1[1,i]); + A1 = D[,a1]; + + parfor( j in 1:s2size, par=4, mode=LOCAL, check=0, opt=NONE) { + pairID = (i-1)*s2size+j; + a2 = castAsScalar(S2[,j]); + k2 = castAsScalar(K2[1,j]); + A2 = D[,a2]; + + if (k1 == k2) { + if (k1 == 1) { + # scale-scale + print("[" + i + "," + j + "] scale-scale"); + r = bivar_ss(A1,A2); + basestats[1,pairID] = r; + } else { + # nominal-nominal or ordinal-ordinal + print("[" + i + "," + j + "] categorical-categorical"); + [chisq, df, pval, cramersv] = bivar_cc(A1,A2); + basestats[2,pairID] = chisq; + basestats[3,pairID] = df; + basestats[4,pairID] = pval; + basestats[5,pairID] = cramersv; + + if ( k1 == 3 ) { + # ordinal-ordinal + print("[" + i + "," + j + "] ordinal-ordinal"); + sp = bivar_oo(A1, A2); + basestats[6,pairID] = sp; + } + } + } + else { + if (k1 == 1 | k2 == 1) { + # Scale-nominal/ordinal TODO MB correctness errors + print("[" + i + "," + j + "] scale-categorical"); + + if ( k1 == 1 ) { + [eta,f, counts, means, vars] = bivar_sc(A1,A2); + } + else { + [eta,f, counts, means, vars] = bivar_sc(A2,A1); + } + basestats[7,pairID] = eta; + basestats[8,pairID] = f; + cat_counts[,pairID] = counts; + cat_means[,pairID] = means; + cat_vars[,pairID] = vars; + } + else { + # nominal-ordinal or ordinal-nominal + print("[" + i + "," + j + "] categorical-categorical"); + [chisq, df, pval, cramersv] = bivar_cc(A1,A2); + basestats[2,pairID] = chisq; + basestats[3,pairID] = df; + basestats[4,pairID] = pval; + basestats[5,pairID] = cramersv; + } + } + } +} + +write(basestats, $6 + "/bivar.stats"); +write(cat_counts, $6 + "/category.counts"); +write(cat_means, $6 + "/category.means"); +write(cat_vars, $6 + "/category.variances"); + + +# ----------------------------------------------------------------------------------------------------------- + +bivar_cc = function(Matrix[Double] A, Matrix[Double] B) return (Double chisq, Double df, Double pval, Double cramersv) { + + # Contingency Table + F = table(A,B); + + # Chi-Squared + W = sum(F); + r = rowSums(F); + c = colSums(F); + E = (r %*% c)/W; + T = (F-E)^2/E; + chi_squared = sum(T); + + # compute p-value + degFreedom = (nrow(F)-1)*(ncol(F)-1); + pValue = pchisq(target=chi_squared, df=degFreedom, lower.tail=FALSE); + + # Cramer's V + R = nrow(F); + C = ncol(F); + q = min(R,C); + cramers_v = sqrt(chi_squared/(W*(q-1))); + + # Assign return values + chisq = chi_squared; + df = as.double(degFreedom); + pval = pValue; + cramersv = cramers_v; +} + +# ----------------------------------------------------------------------------------------------------------- + +bivar_ss = function(Matrix[Double] X, Matrix[Double] Y) return (Double R) { + + # Unweighted co-variance + covXY = cov(X,Y); + + # compute standard deviations for both X and Y by computing 2^nd central moment + W = nrow(X); + m2X = moment(X,2); + m2Y = moment(Y,2); + sigmaX = sqrt(m2X * (W/(W-1.0)) ); + sigmaY = sqrt(m2Y * (W/(W-1.0)) ); + + # Pearson's R + R = covXY / (sigmaX*sigmaY); +} + +# ----------------------------------------------------------------------------------------------------------- + +# Y points to SCALE variable +# A points to CATEGORICAL variable +bivar_sc = function(Matrix[Double] Y, Matrix[Double] A) return (Double Eta, Double AnovaF, Matrix[Double] CFreqs, Matrix[Double] CMeans, Matrix[Double] CVars ) { + + # mean and variance in target variable + W = nrow(A); + my = mean(Y); + varY = moment(Y,2) * W/(W-1.0) + + # category-wise (frequencies, means, variances) + CFreqs = aggregate(target=Y, groups=A, fn="count"); + CMeans = aggregate(target=Y, groups=A, fn="mean"); + CVars = aggregate(target=Y, groups=A, fn="variance"); + + # number of categories + R = nrow(CFreqs); + + Eta = sqrt(1 - ( sum((CFreqs-1)*CVars) / ((W-1)*varY) )); + + anova_num = sum( (CFreqs*(CMeans-my)^2) )/(R-1); + anova_den = sum( (CFreqs-1)*CVars )/(W-R); + AnovaF = anova_num/anova_den; +} + +# ----------------------------------------------------------------------------------------------------------- +# Function to compute ranks +# takes a column vector as input, and produces a vector of same size in which each cell denotes to the computed score for that category +computeRanks = function(Matrix[Double] X) return (Matrix[Double] Ranks) { + Ranks = cumsum(X) - X/2 + 1/2; +} + +#------------------------------------------------------------------------- + +bivar_oo = function(Matrix[Double] A, Matrix[Double] B) return (Double sp) { + + # compute contingency table + F = table(A,B); + + catA = nrow(F); # number of categories in A + catB = ncol(F); # number of categories in B + + # compute category-wise counts for both the attributes + R = rowSums(F); + S = colSums(F); + + # compute scores, both are column vectors + [C] = computeRanks(R); + meanX = mean(C,R); + + columnS = t(S); + [D] = computeRanks(columnS); + + # scores (C,D) are individual values, and counts (R,S) act as weights + meanY = mean(D,columnS); + + W = sum(F); # total weight, or total #cases + varX = moment(C,R,2)*(W/(W-1.0)); + varY = moment(D,columnS,2)*(W/(W-1.0)); + covXY = sum( t(F/(W-1) * (C-meanX)) * (D-meanY) ); + + sp = covXY/(sqrt(varX)*sqrt(varY)); +} + +# ----------------------------------------------------------------------------------------------------------- + +
