http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/descriptivestats/IQMTest.dml ---------------------------------------------------------------------- diff --git a/src/test/scripts/applications/descriptivestats/IQMTest.dml b/src/test/scripts/applications/descriptivestats/IQMTest.dml index f0b9477..d6234ab 100644 --- a/src/test/scripts/applications/descriptivestats/IQMTest.dml +++ b/src/test/scripts/applications/descriptivestats/IQMTest.dml @@ -1,35 +1,35 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -$$readhelper$$ - -V = read("$$indir$$vector", rows=$$rows$$, cols=1, format="text") -W = read("$$indir$$weight", rows=$$rows$$, cols=1, format="text") - -# inter quartile mean -iqm = interQuartileMean(V) -iqmHelper1 = iqm * Helper; -write(iqmHelper1, "$$outdir$$iqm", format="text"); - -# weighted inter quartile mean -wiqm = interQuartileMean(V, W) -iqmHelper2 = wiqm * Helper; +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +$$readhelper$$ + +V = read("$$indir$$vector", rows=$$rows$$, cols=1, format="text") +W = read("$$indir$$weight", rows=$$rows$$, cols=1, format="text") + +# inter quartile mean +iqm = interQuartileMean(V) +iqmHelper1 = iqm * Helper; +write(iqmHelper1, "$$outdir$$iqm", format="text"); + +# weighted inter quartile mean +wiqm = interQuartileMean(V, W) +iqmHelper2 = wiqm * Helper; write(iqmHelper2, "$$outdir$$weighted_iqm", format="text"); \ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/descriptivestats/OddsRatio.R ---------------------------------------------------------------------- diff --git a/src/test/scripts/applications/descriptivestats/OddsRatio.R b/src/test/scripts/applications/descriptivestats/OddsRatio.R index 6649e94..4994935 100644 --- a/src/test/scripts/applications/descriptivestats/OddsRatio.R +++ b/src/test/scripts/applications/descriptivestats/OddsRatio.R @@ -1,75 +1,75 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -# JUnit test class: dml.test.integration.descriptivestats.CategoricalCategoricalTest.java -# command line invocation assuming $CC_HOME is set to the home of the R script -# Rscript $CC_HOME/OddsRato.R $CC_HOME/in/ $CC_HOME/expected/ - -args <- commandArgs(TRUE) -options(digits=22) - -library("Matrix") - -A = readMM(paste(args[1], "A.mtx", sep="")); -B = readMM(paste(args[1], "B.mtx", sep="")); - -F = table(A[,1],B[,1]); - -a11 = F[1,1]; -a12 = F[1,2]; -a21 = F[2,1]; -a22 = F[2,2]; - -#print(paste(a11, " ", a12, " ", a21, " ", a22)); - -oddsRatio = (a11*a22)/(a12*a21); -sigma = sqrt(1/a11 + 1/a12 + 1/a21 + 1/a22); -left_conf = exp( log(oddsRatio) - 2*sigma ) -right_conf = exp( log(oddsRatio) + 2*sigma ) -sigma_away = abs( log(oddsRatio)/sigma ) - -# chisq.test returns a list containing statistic, p-value, etc. -cst = chisq.test(F); - -# get the chi-squared coefficient from the list -chi_squared = as.numeric(cst[1]); -degFreedom = as.numeric(cst[2]); -pValue = as.numeric(cst[3]); - -q = min(dim(F)); -W = sum(F); -cramers_v = sqrt(chi_squared/(W*(q-1))); - - -#print(paste(oddsRatio, " ", sigma, " [", left_conf, ",", right_conf, "] ", sigma_away, " ")); -#print(paste(chi_squared, " ", degFreedom, " [", pValue, ",", cramers_v, "] ")); - -write(oddsRatio, paste(args[2], "oddsRatio", sep="")); -write(sigma, paste(args[2], "sigma", sep="")); -write(left_conf, paste(args[2], "leftConf", sep="")); -write(right_conf, paste(args[2], "rightConf", sep="")); -write(sigma_away, paste(args[2], "sigmasAway", sep="")); - -#write(chi_squared, paste(args[2], "chiSquared", sep="")); -#write(degFreedom, paste(args[2], "degFreedom", sep="")); -#write(pValue, paste(args[2], "pValue", sep="")); -#write(cramers_v, paste(args[2], "cramersV", sep="")); - +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# JUnit test class: dml.test.integration.descriptivestats.CategoricalCategoricalTest.java +# command line invocation assuming $CC_HOME is set to the home of the R script +# Rscript $CC_HOME/OddsRato.R $CC_HOME/in/ $CC_HOME/expected/ + +args <- commandArgs(TRUE) +options(digits=22) + +library("Matrix") + +A = readMM(paste(args[1], "A.mtx", sep="")); +B = readMM(paste(args[1], "B.mtx", sep="")); + +F = table(A[,1],B[,1]); + +a11 = F[1,1]; +a12 = F[1,2]; +a21 = F[2,1]; +a22 = F[2,2]; + +#print(paste(a11, " ", a12, " ", a21, " ", a22)); + +oddsRatio = (a11*a22)/(a12*a21); +sigma = sqrt(1/a11 + 1/a12 + 1/a21 + 1/a22); +left_conf = exp( log(oddsRatio) - 2*sigma ) +right_conf = exp( log(oddsRatio) + 2*sigma ) +sigma_away = abs( log(oddsRatio)/sigma ) + +# chisq.test returns a list containing statistic, p-value, etc. +cst = chisq.test(F); + +# get the chi-squared coefficient from the list +chi_squared = as.numeric(cst[1]); +degFreedom = as.numeric(cst[2]); +pValue = as.numeric(cst[3]); + +q = min(dim(F)); +W = sum(F); +cramers_v = sqrt(chi_squared/(W*(q-1))); + + +#print(paste(oddsRatio, " ", sigma, " [", left_conf, ",", right_conf, "] ", sigma_away, " ")); +#print(paste(chi_squared, " ", degFreedom, " [", pValue, ",", cramers_v, "] ")); + +write(oddsRatio, paste(args[2], "oddsRatio", sep="")); +write(sigma, paste(args[2], "sigma", sep="")); +write(left_conf, paste(args[2], "leftConf", sep="")); +write(right_conf, paste(args[2], "rightConf", sep="")); +write(sigma_away, paste(args[2], "sigmasAway", sep="")); + +#write(chi_squared, paste(args[2], "chiSquared", sep="")); +#write(degFreedom, paste(args[2], "degFreedom", sep="")); +#write(pValue, paste(args[2], "pValue", sep="")); +#write(cramers_v, paste(args[2], "cramersV", sep="")); + http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/descriptivestats/OddsRatio.dml ---------------------------------------------------------------------- diff --git a/src/test/scripts/applications/descriptivestats/OddsRatio.dml b/src/test/scripts/applications/descriptivestats/OddsRatio.dml index 8a62389..ae52e03 100644 --- a/src/test/scripts/applications/descriptivestats/OddsRatio.dml +++ b/src/test/scripts/applications/descriptivestats/OddsRatio.dml @@ -1,114 +1,114 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - - -# DML script to compute oddsRatio and related confidence intervals -# Input: two column vectors with categorical values w/ number of categories = 2 - -#A1 = Rand(rows=1400, cols=1, min=1, max=2); -#A2 = Rand(rows=1400, cols=1, min=1, max=2); -#A1 = round(A1); -#A2 = round(A2); - -A1 = read($1, rows=$2, cols=1); -A2 = read($3, rows=$2, cols=1); - -F = table(A1, A2); - -# Perform computations only on 2x2 contingency tables -if (nrow(F) != 2 | ncol(F) != 2) { - print(max(A1) + ", " + max(A2)); - print("Only 2x2 tables are supported. Contingency table constructed from given data is [" + nrow(F) + ", " + ncol(F) + "]"); -} -else { - [oddsRatio, left_conf, right_conf, sd, chisquared, pvalue, crv, sigma_away, degf] = pair_corr(F); - #print("Odds Ratio " + oddsRatio); - #print("Standard Devication " + sd); - #print("Confidence Interval [" + left_conf + "," + right_conf + "]"); - #print("Howmany sigma's away [" + sigma_away); - #print("Chi-squared Test: statistic = " + chisquared + ", pValue = " + pvalue + ", Cramer's V = " + crv + ", Degrees of Freedom = " + degf); - - write(oddsRatio, $4); - write(sd, $5); - write(left_conf, $6); - write(right_conf, $7); - write(sigma_away, $8); - #write(chisquared, $9); - #write(degf, $10); - #write(pvalue, $11); - #write(crv, $12); -} - -# ----------------------------------------------------------------------------------------------- - -# Given a 2x2 contingency table, it computes oddsRatio and the corresponding confidence interval -pair_corr = function(Matrix[Double] A) return (Double oddsRatio, Double left_conf, Double right_conf, Double sd, Double chisquared, Double pvalue, Double crv, Double sigma_away, Double df) { - a11 = castAsScalar(A[1,1]); - a12 = castAsScalar(A[1,2]); - a21 = castAsScalar(A[2,1]); - a22 = castAsScalar(A[2,2]); - - sd = sqrt(1/a11 + 1/a12 + 1/a21 + 1/a22); - oddsRatio = (a11*a22)/(a12*a21); - - [chisquared, df, pvalue, crv] = bivar_cc(A); - - left_conf = exp( log(oddsRatio) - 2*sd ) - right_conf = exp( log(oddsRatio) + 2*sd ) - sigma_away = abs( log(oddsRatio)/sd ) -} - -# ----------------------------------------------------------------------------------------------- - -# Given a contingency table, perform the chi-squared test. -bivar_cc = function(Matrix[Double] F) return (Double chisq, Double df, Double pval, Double cramersv) { - - # Contingency Table - # F = ctable(A,B); - - # Chi-Squared - W = sum(F); - r = rowSums(F); - c = colSums(F); - E = (r %*% c)/W; - T = (F-E)^2/E; - chi_squared = sum(T); - - # compute p-value - degFreedom = (nrow(F)-1)*(ncol(F)-1); - pValue = pchisq(target=chi_squared, df=degFreedom, lower.tail=FALSE); - - # Cramer's V - R = nrow(F); - C = ncol(F); - q = min(R,C); - cramers_v = sqrt(chi_squared/(W*(q-1))); - - # Assign return values - chisq = chi_squared; - df = degFreedom; - pval = pValue; - cramersv = cramers_v; -} - -# ----------------------------------------------------------------------------------------------- - - +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + + +# DML script to compute oddsRatio and related confidence intervals +# Input: two column vectors with categorical values w/ number of categories = 2 + +#A1 = Rand(rows=1400, cols=1, min=1, max=2); +#A2 = Rand(rows=1400, cols=1, min=1, max=2); +#A1 = round(A1); +#A2 = round(A2); + +A1 = read($1, rows=$2, cols=1); +A2 = read($3, rows=$2, cols=1); + +F = table(A1, A2); + +# Perform computations only on 2x2 contingency tables +if (nrow(F) != 2 | ncol(F) != 2) { + print(max(A1) + ", " + max(A2)); + print("Only 2x2 tables are supported. Contingency table constructed from given data is [" + nrow(F) + ", " + ncol(F) + "]"); +} +else { + [oddsRatio, left_conf, right_conf, sd, chisquared, pvalue, crv, sigma_away, degf] = pair_corr(F); + #print("Odds Ratio " + oddsRatio); + #print("Standard Devication " + sd); + #print("Confidence Interval [" + left_conf + "," + right_conf + "]"); + #print("Howmany sigma's away [" + sigma_away); + #print("Chi-squared Test: statistic = " + chisquared + ", pValue = " + pvalue + ", Cramer's V = " + crv + ", Degrees of Freedom = " + degf); + + write(oddsRatio, $4); + write(sd, $5); + write(left_conf, $6); + write(right_conf, $7); + write(sigma_away, $8); + #write(chisquared, $9); + #write(degf, $10); + #write(pvalue, $11); + #write(crv, $12); +} + +# ----------------------------------------------------------------------------------------------- + +# Given a 2x2 contingency table, it computes oddsRatio and the corresponding confidence interval +pair_corr = function(Matrix[Double] A) return (Double oddsRatio, Double left_conf, Double right_conf, Double sd, Double chisquared, Double pvalue, Double crv, Double sigma_away, Double df) { + a11 = castAsScalar(A[1,1]); + a12 = castAsScalar(A[1,2]); + a21 = castAsScalar(A[2,1]); + a22 = castAsScalar(A[2,2]); + + sd = sqrt(1/a11 + 1/a12 + 1/a21 + 1/a22); + oddsRatio = (a11*a22)/(a12*a21); + + [chisquared, df, pvalue, crv] = bivar_cc(A); + + left_conf = exp( log(oddsRatio) - 2*sd ) + right_conf = exp( log(oddsRatio) + 2*sd ) + sigma_away = abs( log(oddsRatio)/sd ) +} + +# ----------------------------------------------------------------------------------------------- + +# Given a contingency table, perform the chi-squared test. +bivar_cc = function(Matrix[Double] F) return (Double chisq, Double df, Double pval, Double cramersv) { + + # Contingency Table + # F = ctable(A,B); + + # Chi-Squared + W = sum(F); + r = rowSums(F); + c = colSums(F); + E = (r %*% c)/W; + T = (F-E)^2/E; + chi_squared = sum(T); + + # compute p-value + degFreedom = (nrow(F)-1)*(ncol(F)-1); + pValue = pchisq(target=chi_squared, df=degFreedom, lower.tail=FALSE); + + # Cramer's V + R = nrow(F); + C = ncol(F); + q = min(R,C); + cramers_v = sqrt(chi_squared/(W*(q-1))); + + # Assign return values + chisq = chi_squared; + df = degFreedom; + pval = pValue; + cramersv = cramers_v; +} + +# ----------------------------------------------------------------------------------------------- + + http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/descriptivestats/OrdinalOrdinal.R ---------------------------------------------------------------------- diff --git a/src/test/scripts/applications/descriptivestats/OrdinalOrdinal.R b/src/test/scripts/applications/descriptivestats/OrdinalOrdinal.R index f3fe12d..f7d6de1 100644 --- a/src/test/scripts/applications/descriptivestats/OrdinalOrdinal.R +++ b/src/test/scripts/applications/descriptivestats/OrdinalOrdinal.R @@ -1,38 +1,38 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -# JUnit test class: dml.test.integration.descriptivestats.BivariateOrdinalOrdinalTest.java -# command line invocation assuming $OO_HOME is set to the home of the R script -# Rscript $OO_HOME/OrdinalOrdinal.R $OO_HOME/in/ $OO_HOME/expected/ -args <- commandArgs(TRUE) -options(digits=22) - -library("Matrix") - -A = readMM(paste(args[1], "A.mtx", sep="")) -B = readMM(paste(args[1], "B.mtx", sep="")) - -spearman = cor(A[,1],B[,1], method="spearman"); - -#paste("R value", spearman); - -write(spearman, paste(args[2], "Spearman", sep="")); - +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# JUnit test class: dml.test.integration.descriptivestats.BivariateOrdinalOrdinalTest.java +# command line invocation assuming $OO_HOME is set to the home of the R script +# Rscript $OO_HOME/OrdinalOrdinal.R $OO_HOME/in/ $OO_HOME/expected/ +args <- commandArgs(TRUE) +options(digits=22) + +library("Matrix") + +A = readMM(paste(args[1], "A.mtx", sep="")) +B = readMM(paste(args[1], "B.mtx", sep="")) + +spearman = cor(A[,1],B[,1], method="spearman"); + +#paste("R value", spearman); + +write(spearman, paste(args[2], "Spearman", sep="")); + http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/descriptivestats/OrdinalOrdinal.dml ---------------------------------------------------------------------- diff --git a/src/test/scripts/applications/descriptivestats/OrdinalOrdinal.dml b/src/test/scripts/applications/descriptivestats/OrdinalOrdinal.dml index a30818a..cb153d0 100644 --- a/src/test/scripts/applications/descriptivestats/OrdinalOrdinal.dml +++ b/src/test/scripts/applications/descriptivestats/OrdinalOrdinal.dml @@ -1,74 +1,74 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -# Note this script is externalized to customers, please do not change w/o consulting component owner. -# How to invoke this dml script OrdinalOrdinal.dml? -# Assume OO_HOME is set to the home of the dml script -# Assume input and output directories are on hdfs as INPUT_DIR and OUTPUT_DIR -# Assume rows = 10000 for A and B -# hadoop jar SystemML.jar -f $OO_HOME/OrdinalOrdinal.dml -args "$INPUT_DIR/A" 10000 "$INPUT_DIR/B" "$OUPUT_DIR/Spearman" - -#------------------------------------------------------------------------- -# Function to compute ranks -# takes a column vector as input, and produces a vector of same size in which each cell denotes to the computed score for that category -computeRanks = function(matrix[double] X) return (matrix[double] Ranks) { - Ranks = cumsum(X) - X/2 + 1/2; -} -#------------------------------------------------------------------------- - -A = read($1, rows=$2, cols=1, format="text"); -B = read($3, rows=$2, cols=1, format="text"); - -# compute contingency table -F = table(A,B); - -catA = nrow(F); # number of categories in A -catB = ncol(F); # number of categories in B - -# compute category-wise counts for both the attributes -R = rowSums(F); -S = colSums(F); - -# compute scores, both are column vectors -[C] = computeRanks(R); -meanX = mean(C,R); - -columnS = t(S); -[D] = computeRanks(columnS); - -# scores (C,D) are individual values, and counts (R,S) act as weights -meanY = mean(D,columnS); - -W = sum(F); # total weight, or total #cases -varX = moment(C,R,2)*(W/(W-1.0)); -varY = moment(D,columnS,2)*(W/(W-1.0)); -covXY = sum( t(F/(W-1) * (C-meanX)) * (D-meanY) ); - -sp = covXY/(sqrt(varX)*sqrt(varY)); - -#print("X: mean " + meanX + ", var " + varX); -#print("Y: mean " + meanY + ", var " + varY); -#print("covXY: " + sp); - -#sp = spearman(A,B); - -write(sp, $4); - +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# Note this script is externalized to customers, please do not change w/o consulting component owner. +# How to invoke this dml script OrdinalOrdinal.dml? +# Assume OO_HOME is set to the home of the dml script +# Assume input and output directories are on hdfs as INPUT_DIR and OUTPUT_DIR +# Assume rows = 10000 for A and B +# hadoop jar SystemML.jar -f $OO_HOME/OrdinalOrdinal.dml -args "$INPUT_DIR/A" 10000 "$INPUT_DIR/B" "$OUPUT_DIR/Spearman" + +#------------------------------------------------------------------------- +# Function to compute ranks +# takes a column vector as input, and produces a vector of same size in which each cell denotes to the computed score for that category +computeRanks = function(matrix[double] X) return (matrix[double] Ranks) { + Ranks = cumsum(X) - X/2 + 1/2; +} +#------------------------------------------------------------------------- + +A = read($1, rows=$2, cols=1, format="text"); +B = read($3, rows=$2, cols=1, format="text"); + +# compute contingency table +F = table(A,B); + +catA = nrow(F); # number of categories in A +catB = ncol(F); # number of categories in B + +# compute category-wise counts for both the attributes +R = rowSums(F); +S = colSums(F); + +# compute scores, both are column vectors +[C] = computeRanks(R); +meanX = mean(C,R); + +columnS = t(S); +[D] = computeRanks(columnS); + +# scores (C,D) are individual values, and counts (R,S) act as weights +meanY = mean(D,columnS); + +W = sum(F); # total weight, or total #cases +varX = moment(C,R,2)*(W/(W-1.0)); +varY = moment(D,columnS,2)*(W/(W-1.0)); +covXY = sum( t(F/(W-1) * (C-meanX)) * (D-meanY) ); + +sp = covXY/(sqrt(varX)*sqrt(varY)); + +#print("X: mean " + meanX + ", var " + varX); +#print("Y: mean " + meanY + ", var " + varY); +#print("covXY: " + sp); + +#sp = spearman(A,B); + +write(sp, $4); + http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/descriptivestats/OrdinalOrdinalWithWeightsTest.R ---------------------------------------------------------------------- diff --git a/src/test/scripts/applications/descriptivestats/OrdinalOrdinalWithWeightsTest.R b/src/test/scripts/applications/descriptivestats/OrdinalOrdinalWithWeightsTest.R index addc779..0c244da 100644 --- a/src/test/scripts/applications/descriptivestats/OrdinalOrdinalWithWeightsTest.R +++ b/src/test/scripts/applications/descriptivestats/OrdinalOrdinalWithWeightsTest.R @@ -1,46 +1,46 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -# JUnit test class: dml.test.integration.descriptivestats.BivariateOrdinalOrdinalWithWeightsTest.java -# command line invocation assuming $OO_HOME is set to the home of the R script -# Rscript $OO_HOME/OrdinalOrdinal.R $OO_HOME/in/ $OO_HOME/expected/ -args <- commandArgs(TRUE) -options(digits=22) - -library("Matrix") - -Atemp = readMM(paste(args[1], "A.mtx", sep="")) -Btemp = readMM(paste(args[1], "B.mtx", sep="")) -WMtemp = readMM(paste(args[1], "WM.mtx", sep="")) - -#Atemp = readMM(file="$$indir$$A.mtx"); #readMM(paste(args[1], "A.mtx", sep="")) -#Btemp = readMM(file="$$indir$$B.mtx"); #readMM(paste(args[1], "B.mtx", sep="")) -#WMtemp = readMM(file="$$indir$$WM.mtx"); #readMM(paste(args[1], "WM.mtx", sep="")) - -A = rep(Atemp[,1],WMtemp[,1]) -B = rep(Btemp[,1],WMtemp[,1]) - -spearman = cor(A, B, method="spearman"); - -#paste("Weighted R value", spearman); - -write(spearman, paste(args[2], "Spearman", sep="")); - +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# JUnit test class: dml.test.integration.descriptivestats.BivariateOrdinalOrdinalWithWeightsTest.java +# command line invocation assuming $OO_HOME is set to the home of the R script +# Rscript $OO_HOME/OrdinalOrdinal.R $OO_HOME/in/ $OO_HOME/expected/ +args <- commandArgs(TRUE) +options(digits=22) + +library("Matrix") + +Atemp = readMM(paste(args[1], "A.mtx", sep="")) +Btemp = readMM(paste(args[1], "B.mtx", sep="")) +WMtemp = readMM(paste(args[1], "WM.mtx", sep="")) + +#Atemp = readMM(file="$$indir$$A.mtx"); #readMM(paste(args[1], "A.mtx", sep="")) +#Btemp = readMM(file="$$indir$$B.mtx"); #readMM(paste(args[1], "B.mtx", sep="")) +#WMtemp = readMM(file="$$indir$$WM.mtx"); #readMM(paste(args[1], "WM.mtx", sep="")) + +A = rep(Atemp[,1],WMtemp[,1]) +B = rep(Btemp[,1],WMtemp[,1]) + +spearman = cor(A, B, method="spearman"); + +#paste("Weighted R value", spearman); + +write(spearman, paste(args[2], "Spearman", sep="")); + http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/descriptivestats/OrdinalOrdinalWithWeightsTest.dml ---------------------------------------------------------------------- diff --git a/src/test/scripts/applications/descriptivestats/OrdinalOrdinalWithWeightsTest.dml b/src/test/scripts/applications/descriptivestats/OrdinalOrdinalWithWeightsTest.dml index 9af57fa..e6b5143 100644 --- a/src/test/scripts/applications/descriptivestats/OrdinalOrdinalWithWeightsTest.dml +++ b/src/test/scripts/applications/descriptivestats/OrdinalOrdinalWithWeightsTest.dml @@ -1,78 +1,78 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - - -#A <- Ordinal -#B <- Ordinal -#WM <- Weights - -#A = read("$$indir$$A", rows=$$rows$$, cols=1, format="text"); -#B = read("$$indir$$B", rows=$$rows$$, cols=1, format="text"); -#WM = read("$$indir$$WM", rows=$$rows$$, cols=1, format="text"); - -A = read($1, rows=$2, cols=1, format="text"); -B = read($3, rows=$2, cols=1, format="text"); -WM = read($4, rows=$2, cols=1, format="text"); - -# compute contingency table -F = table(A,B,WM); - -catA = nrow(F); # number of categories in A -catB = ncol(F); # number of categories in B - -# compute category-wise counts for both the attributes -R = rowSums(F); -S = colSums(F); - -# compute scores, both are column vectors -[C] = computeRanks(R); -meanX = mean(C,R); - -columnS = t(S); -[D] = computeRanks(columnS); - -# scores (C,D) are individual values, and counts (R,S) act as weights -meanY = mean(D,columnS); - -W = sum(F); # total weight, or total #cases -varX = moment(C,R,2)*(W/(W-1.0)); -varY = moment(D,columnS,2)*(W/(W-1.0)); -covXY = sum( t(F/(W-1) * (C-meanX)) * (D-meanY) ); - -sp = covXY/(sqrt(varX)*sqrt(varY)); - -#print("X: mean " + meanX + ", var " + varX); -#print("Y: mean " + meanY + ", var " + varY); -#print("covXY: " + sp); - -#sp = spearman(A,B,WM); - -write(sp, $5); - - -#------------------------------------------------------------------------- -# Function to compute ranks -# takes a column vector as input, and produces a vector of same size in which each cell denotes to the computed score for that category -computeRanks = function(Matrix[Double] X) return (Matrix[Double] Ranks) { - Ranks = cumsum(X) - X/2 + 1/2; -} -#------------------------------------------------------------------------- - +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + + +#A <- Ordinal +#B <- Ordinal +#WM <- Weights + +#A = read("$$indir$$A", rows=$$rows$$, cols=1, format="text"); +#B = read("$$indir$$B", rows=$$rows$$, cols=1, format="text"); +#WM = read("$$indir$$WM", rows=$$rows$$, cols=1, format="text"); + +A = read($1, rows=$2, cols=1, format="text"); +B = read($3, rows=$2, cols=1, format="text"); +WM = read($4, rows=$2, cols=1, format="text"); + +# compute contingency table +F = table(A,B,WM); + +catA = nrow(F); # number of categories in A +catB = ncol(F); # number of categories in B + +# compute category-wise counts for both the attributes +R = rowSums(F); +S = colSums(F); + +# compute scores, both are column vectors +[C] = computeRanks(R); +meanX = mean(C,R); + +columnS = t(S); +[D] = computeRanks(columnS); + +# scores (C,D) are individual values, and counts (R,S) act as weights +meanY = mean(D,columnS); + +W = sum(F); # total weight, or total #cases +varX = moment(C,R,2)*(W/(W-1.0)); +varY = moment(D,columnS,2)*(W/(W-1.0)); +covXY = sum( t(F/(W-1) * (C-meanX)) * (D-meanY) ); + +sp = covXY/(sqrt(varX)*sqrt(varY)); + +#print("X: mean " + meanX + ", var " + varX); +#print("Y: mean " + meanY + ", var " + varY); +#print("covXY: " + sp); + +#sp = spearman(A,B,WM); + +write(sp, $5); + + +#------------------------------------------------------------------------- +# Function to compute ranks +# takes a column vector as input, and produces a vector of same size in which each cell denotes to the computed score for that category +computeRanks = function(Matrix[Double] X) return (Matrix[Double] Ranks) { + Ranks = cumsum(X) - X/2 + 1/2; +} +#------------------------------------------------------------------------- + http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/descriptivestats/QuantileTest.dml ---------------------------------------------------------------------- diff --git a/src/test/scripts/applications/descriptivestats/QuantileTest.dml b/src/test/scripts/applications/descriptivestats/QuantileTest.dml index a8f2f6d..233fcdf 100644 --- a/src/test/scripts/applications/descriptivestats/QuantileTest.dml +++ b/src/test/scripts/applications/descriptivestats/QuantileTest.dml @@ -1,34 +1,34 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -$$readhelper$$ - -V = read("$$indir$$vector", rows=$$rows1$$, cols=1, format="text") -W = read("$$indir$$weight", rows=$$rows1$$, cols=1, format="text") -P = read("$$indir$$prob", rows=$$rows2$$, cols=1, format="text") - -# quantile -Q = quantile(V, P) -write(Q, "$$outdir$$quantile", format="text"); - -# weighted quantile -WQ = quantile(V, W, P) +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +$$readhelper$$ + +V = read("$$indir$$vector", rows=$$rows1$$, cols=1, format="text") +W = read("$$indir$$weight", rows=$$rows1$$, cols=1, format="text") +P = read("$$indir$$prob", rows=$$rows2$$, cols=1, format="text") + +# quantile +Q = quantile(V, P) +write(Q, "$$outdir$$quantile", format="text"); + +# weighted quantile +WQ = quantile(V, W, P) write(WQ, "$$outdir$$weighted_quantile", format="text"); \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/descriptivestats/Scale.R ---------------------------------------------------------------------- diff --git a/src/test/scripts/applications/descriptivestats/Scale.R b/src/test/scripts/applications/descriptivestats/Scale.R index 0027c87..45c9efb 100644 --- a/src/test/scripts/applications/descriptivestats/Scale.R +++ b/src/test/scripts/applications/descriptivestats/Scale.R @@ -1,141 +1,141 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -# JUnit test class: dml.test.integration.descriptivestats.UnivariateStatsTest.java -# command line invocation assuming $S_HOME is set to the home of the R script -# Rscript $S_HOME/Scale.R $S_HOME/in/ $S_HOME/expected/ -args <- commandArgs(TRUE) -options(digits=22) - -options(repos="http://cran.stat.ucla.edu/") -is.installed <- function(mypkg) is.element(mypkg, installed.packages()[,1]) - -is_plotrix = is.installed("plotrix"); -if ( !is_plotrix ) { -install.packages("plotrix"); -} -library("plotrix"); - -is_psych = is.installed("psych"); -if ( !is_psych ) { -install.packages("psych"); -} -library("psych") - -is_moments = is.installed("moments"); -if( !is_moments){ -install.packages("moments"); -} -library("moments") - -#library("batch") -library("Matrix") - -V = readMM(paste(args[1], "vector.mtx", sep="")) -P = readMM(paste(args[1], "prob.mtx", sep="")) - -n = nrow(V) - -# mean -mu = mean(V) - -# variances -var = var(V[,1]) - -# standard deviations -std_dev = sd(V[,1], na.rm = FALSE) - -# standard errors of mean -SE = std.error(V[,1], na.rm) - -# coefficients of variation -cv = std_dev/mu - -# harmonic means (note: may generate out of memory for large sparse matrices becauses of NaNs) -# har_mu = harmonic.mean(V[,1]) -- DML does not support this yet - -# geometric means is not currently supported. -# geom_mu = geometric.mean(V[,1]) -- DML does not support this yet - -# min and max -mn=min(V) -mx=max(V) - -# range -rng = mx - mn - -# Skewness -g1 = moment(V[,1], order=3, central=TRUE)/(std_dev^3) - -# standard error of skewness (not sure how it is defined without the weight) -se_g1=sqrt( 6*n*(n-1.0) / ((n-2.0)*(n+1.0)*(n+3.0)) ) - -# Kurtosis (using binomial formula) -g2 = moment(V[,1], order=4, central=TRUE)/(var^2)-3 - -# Standard error of Kurtosis (not sure how it is defined without the weight) -se_g2= sqrt( (4*(n^2-1)*se_g1^2)/((n+5)*(n-3)) ) - -# median -md = median(V[,1]) #quantile(V[,1], 0.5, type = 1) - -# quantile -Q = t(quantile(V[,1], P[,1], type = 1)) - -# inter-quartile mean -S=c(sort(V[,1])) - -q25d=n*0.25 -q75d=n*0.75 -q25i=ceiling(q25d) -q75i=ceiling(q75d) - -iqm = sum(S[(q25i+1):q75i]) -iqm = iqm + (q25i-q25d)*S[q25i] - (q75i-q75d)*S[q75i] -iqm = iqm/(n*0.5) - -#print(paste("IQM ", iqm)); - -# outliers use ppred to describe it -out_minus = t(as.numeric(V< mu-5*std_dev)*V) -out_plus = t(as.numeric(V> mu+5*std_dev)*V) - -write(mu, paste(args[2], "mean", sep="")); -write(std_dev, paste(args[2], "std", sep="")); -write(SE, paste(args[2], "se", sep="")); -write(var, paste(args[2], "var", sep="")); -write(cv, paste(args[2], "cv", sep="")); -# write(har_mu),paste(args[2], "har", sep="")); -# write(geom_mu, paste(args[2], "geom", sep="")); -write(mn, paste(args[2], "min", sep="")); -write(mx, paste(args[2], "max", sep="")); -write(rng, paste(args[2], "rng", sep="")); -write(g1, paste(args[2], "g1", sep="")); -write(se_g1, paste(args[2], "se_g1", sep="")); -write(g2, paste(args[2], "g2", sep="")); -write(se_g2, paste(args[2], "se_g2", sep="")); -write(md, paste(args[2], "median", sep="")); -write(iqm, paste(args[2], "iqm", sep="")); -writeMM(as(t(out_minus),"CsparseMatrix"), paste(args[2], "out_minus", sep=""), format="text"); -writeMM(as(t(out_plus),"CsparseMatrix"), paste(args[2], "out_plus", sep=""), format="text"); -writeMM(as(t(Q),"CsparseMatrix"), paste(args[2], "quantile", sep=""), format="text"); - - +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# JUnit test class: dml.test.integration.descriptivestats.UnivariateStatsTest.java +# command line invocation assuming $S_HOME is set to the home of the R script +# Rscript $S_HOME/Scale.R $S_HOME/in/ $S_HOME/expected/ +args <- commandArgs(TRUE) +options(digits=22) + +options(repos="http://cran.stat.ucla.edu/") +is.installed <- function(mypkg) is.element(mypkg, installed.packages()[,1]) + +is_plotrix = is.installed("plotrix"); +if ( !is_plotrix ) { +install.packages("plotrix"); +} +library("plotrix"); + +is_psych = is.installed("psych"); +if ( !is_psych ) { +install.packages("psych"); +} +library("psych") + +is_moments = is.installed("moments"); +if( !is_moments){ +install.packages("moments"); +} +library("moments") + +#library("batch") +library("Matrix") + +V = readMM(paste(args[1], "vector.mtx", sep="")) +P = readMM(paste(args[1], "prob.mtx", sep="")) + +n = nrow(V) + +# mean +mu = mean(V) + +# variances +var = var(V[,1]) + +# standard deviations +std_dev = sd(V[,1], na.rm = FALSE) + +# standard errors of mean +SE = std.error(V[,1], na.rm) + +# coefficients of variation +cv = std_dev/mu + +# harmonic means (note: may generate out of memory for large sparse matrices becauses of NaNs) +# har_mu = harmonic.mean(V[,1]) -- DML does not support this yet + +# geometric means is not currently supported. +# geom_mu = geometric.mean(V[,1]) -- DML does not support this yet + +# min and max +mn=min(V) +mx=max(V) + +# range +rng = mx - mn + +# Skewness +g1 = moment(V[,1], order=3, central=TRUE)/(std_dev^3) + +# standard error of skewness (not sure how it is defined without the weight) +se_g1=sqrt( 6*n*(n-1.0) / ((n-2.0)*(n+1.0)*(n+3.0)) ) + +# Kurtosis (using binomial formula) +g2 = moment(V[,1], order=4, central=TRUE)/(var^2)-3 + +# Standard error of Kurtosis (not sure how it is defined without the weight) +se_g2= sqrt( (4*(n^2-1)*se_g1^2)/((n+5)*(n-3)) ) + +# median +md = median(V[,1]) #quantile(V[,1], 0.5, type = 1) + +# quantile +Q = t(quantile(V[,1], P[,1], type = 1)) + +# inter-quartile mean +S=c(sort(V[,1])) + +q25d=n*0.25 +q75d=n*0.75 +q25i=ceiling(q25d) +q75i=ceiling(q75d) + +iqm = sum(S[(q25i+1):q75i]) +iqm = iqm + (q25i-q25d)*S[q25i] - (q75i-q75d)*S[q75i] +iqm = iqm/(n*0.5) + +#print(paste("IQM ", iqm)); + +# outliers use ppred to describe it +out_minus = t(as.numeric(V< mu-5*std_dev)*V) +out_plus = t(as.numeric(V> mu+5*std_dev)*V) + +write(mu, paste(args[2], "mean", sep="")); +write(std_dev, paste(args[2], "std", sep="")); +write(SE, paste(args[2], "se", sep="")); +write(var, paste(args[2], "var", sep="")); +write(cv, paste(args[2], "cv", sep="")); +# write(har_mu),paste(args[2], "har", sep="")); +# write(geom_mu, paste(args[2], "geom", sep="")); +write(mn, paste(args[2], "min", sep="")); +write(mx, paste(args[2], "max", sep="")); +write(rng, paste(args[2], "rng", sep="")); +write(g1, paste(args[2], "g1", sep="")); +write(se_g1, paste(args[2], "se_g1", sep="")); +write(g2, paste(args[2], "g2", sep="")); +write(se_g2, paste(args[2], "se_g2", sep="")); +write(md, paste(args[2], "median", sep="")); +write(iqm, paste(args[2], "iqm", sep="")); +writeMM(as(t(out_minus),"CsparseMatrix"), paste(args[2], "out_minus", sep=""), format="text"); +writeMM(as(t(out_plus),"CsparseMatrix"), paste(args[2], "out_plus", sep=""), format="text"); +writeMM(as(t(Q),"CsparseMatrix"), paste(args[2], "quantile", sep=""), format="text"); + + http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/descriptivestats/Scale.dml ---------------------------------------------------------------------- diff --git a/src/test/scripts/applications/descriptivestats/Scale.dml b/src/test/scripts/applications/descriptivestats/Scale.dml index b59c235..e2362cb 100644 --- a/src/test/scripts/applications/descriptivestats/Scale.dml +++ b/src/test/scripts/applications/descriptivestats/Scale.dml @@ -1,114 +1,114 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -# Note this script is externalized to customers, please do not change w/o consulting component owner. -# How to invoke this dml script Scale.dml? -# Assume S_HOME is set to the home of the dml script -# Assume input and output directories are on hdfs as INPUT_DIR and OUTPUT_DIR -# Assume rows = 10000 for V and rows = 5 for P -# hadoop jar SystemML.jar -f $S_HOME/Scale.dml -args "$INPUT_DIR/vector" 10000 "$INPUT_DIR/prob 5 -# "$OUTPUT_DIR/mean" "$OUTPUT_DIR/std" "$OUTPUT_DIR/se" "$OUTPUT_DIR/var" "$OUTPUT_DIR/cv" -# "$OUTPUT_DIR/min" "$OUTPUT_DIR/max" "$OUTPUT_DIR/rng" -# "$OUTPUT_DIR/g1" "$OUTPUT_DIR/se_g1" "$OUTPUT_DIR/g2" "$OUTPUT_DIR/se_g2" -# "$OUTPUT_DIR/median" "$OUTPUT_DIR/iqm" -# "OUTPUT_DIR/out_minus" "$OUTPUT_DIR/out_plus" "$OUTPUT_DIR/quantile" - -V = read($1, rows=$2, cols=1, format="text") -P = read($3, rows=$4, cols=1, format="text") - -n = nrow(V) - -# sum -s1 = sum(V) - -# 2nd central moment -m2 = moment(V, 2) - -# 3rd central moment -m3 = moment(V, 3) - -# 4th central moment -m4 = moment(V, 4) - -# mean -mu = mean(V) - -# variances -var = n/(n-1.0)*m2 - -# standard deviations -std_dev = sqrt(var) - -# standard errors of mean -SE = std_dev/sqrt(n) - -# coefficients of variation -cv = std_dev/mu - -# min and max -mn=min(V) -mx=max(V) - -# range -rng = mx - mn - -# Skewness -g1 = m3/(std_dev^3) - -# standard error of skewness (not sure how it is defined without the weight) -se_g1=sqrt( 6*n*(n-1.0) / ((n-2.0)*(n+1.0)*(n+3.0)) ) - -# Kurtosis (using binomial formula) -g2 = m4/(std_dev^4) - 3 - -# Standard error of Kurtosis (not sure how it is defined without the weight) -se_g2= sqrt( (4*(n^2-1)*se_g1^2)/((n+5.0)*(n-3.0)) ) - -# median -md = median(V) - -# quantile -Q = quantile(V, P) - -# inter-quartile mean -iqm = interQuartileMean(V) - -# outliers use ppred to describe it -out_minus = ppred(V, mu-5*std_dev, "<")*V -out_plus = ppred(V, mu+5*std_dev, ">")*V - -write(mu, $5); -write(std_dev, $6); -write(SE, $7); -write(var, $8); -write(cv, $9); -write(mn, $10); -write(mx, $11); -write(rng, $12); -write(g1, $13); -write(se_g1, $14); -write(g2, $15); -write(se_g2, $16); -write(md, $17); -write(iqm, $18); -write(out_minus, $19, format="text"); -write(out_plus, $20, format="text"); -write(Q, $21, format="text"); +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# Note this script is externalized to customers, please do not change w/o consulting component owner. +# How to invoke this dml script Scale.dml? +# Assume S_HOME is set to the home of the dml script +# Assume input and output directories are on hdfs as INPUT_DIR and OUTPUT_DIR +# Assume rows = 10000 for V and rows = 5 for P +# hadoop jar SystemML.jar -f $S_HOME/Scale.dml -args "$INPUT_DIR/vector" 10000 "$INPUT_DIR/prob 5 +# "$OUTPUT_DIR/mean" "$OUTPUT_DIR/std" "$OUTPUT_DIR/se" "$OUTPUT_DIR/var" "$OUTPUT_DIR/cv" +# "$OUTPUT_DIR/min" "$OUTPUT_DIR/max" "$OUTPUT_DIR/rng" +# "$OUTPUT_DIR/g1" "$OUTPUT_DIR/se_g1" "$OUTPUT_DIR/g2" "$OUTPUT_DIR/se_g2" +# "$OUTPUT_DIR/median" "$OUTPUT_DIR/iqm" +# "OUTPUT_DIR/out_minus" "$OUTPUT_DIR/out_plus" "$OUTPUT_DIR/quantile" + +V = read($1, rows=$2, cols=1, format="text") +P = read($3, rows=$4, cols=1, format="text") + +n = nrow(V) + +# sum +s1 = sum(V) + +# 2nd central moment +m2 = moment(V, 2) + +# 3rd central moment +m3 = moment(V, 3) + +# 4th central moment +m4 = moment(V, 4) + +# mean +mu = mean(V) + +# variances +var = n/(n-1.0)*m2 + +# standard deviations +std_dev = sqrt(var) + +# standard errors of mean +SE = std_dev/sqrt(n) + +# coefficients of variation +cv = std_dev/mu + +# min and max +mn=min(V) +mx=max(V) + +# range +rng = mx - mn + +# Skewness +g1 = m3/(std_dev^3) + +# standard error of skewness (not sure how it is defined without the weight) +se_g1=sqrt( 6*n*(n-1.0) / ((n-2.0)*(n+1.0)*(n+3.0)) ) + +# Kurtosis (using binomial formula) +g2 = m4/(std_dev^4) - 3 + +# Standard error of Kurtosis (not sure how it is defined without the weight) +se_g2= sqrt( (4*(n^2-1)*se_g1^2)/((n+5.0)*(n-3.0)) ) + +# median +md = median(V) + +# quantile +Q = quantile(V, P) + +# inter-quartile mean +iqm = interQuartileMean(V) + +# outliers use ppred to describe it +out_minus = ppred(V, mu-5*std_dev, "<")*V +out_plus = ppred(V, mu+5*std_dev, ">")*V + +write(mu, $5); +write(std_dev, $6); +write(SE, $7); +write(var, $8); +write(cv, $9); +write(mn, $10); +write(mx, $11); +write(rng, $12); +write(g1, $13); +write(se_g1, $14); +write(g2, $15); +write(se_g2, $16); +write(md, $17); +write(iqm, $18); +write(out_minus, $19, format="text"); +write(out_plus, $20, format="text"); +write(Q, $21, format="text"); http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/descriptivestats/ScaleCategorical.R ---------------------------------------------------------------------- diff --git a/src/test/scripts/applications/descriptivestats/ScaleCategorical.R b/src/test/scripts/applications/descriptivestats/ScaleCategorical.R index 73bf9f3..e1d0880 100644 --- a/src/test/scripts/applications/descriptivestats/ScaleCategorical.R +++ b/src/test/scripts/applications/descriptivestats/ScaleCategorical.R @@ -1,69 +1,69 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -# JUnit test class: dml.test.integration.descriptivestats.BivariateScaleCategoricalTest.java -# command line invocation assuming $SC_HOME is set to the home of the R script -# Rscript $SC_HOME/ScaleCategorical.R $SC_HOME/in/ $SC_HOME/expected/ -args <- commandArgs(TRUE) -options(digits=22) - -library("Matrix") - -A = readMM(paste(args[1], "A.mtx", sep="")); -Y = readMM(paste(args[1], "Y.mtx", sep="")); - -Av = A[,1]; -Yv = Y[,1]; - -W = nrow(A); -my = mean(Yv); #sum(Yv)/W; -varY = var(Yv); - -CFreqs = as.matrix(table(Av)); -CMeans = as.matrix(aggregate(Yv, by=list(Av), "mean")$x); -CVars = as.matrix(aggregate(Yv, by=list(Av), "var")$x); - -# number of categories -R = nrow(CFreqs); - -Eta = sqrt(1 - ( sum((CFreqs-1)*CVars) / ((W-1)*varY) )); - -anova_num = sum( (CFreqs*(CMeans-my)^2) )/(R-1); -anova_den = sum( (CFreqs-1)*CVars )/(W-R); -ANOVAF = anova_num/anova_den; - -print(anova_num, digits=15); -print(anova_den, digits=15); - -write(Eta, paste(args[2], "Eta", sep="")); - -write(ANOVAF, paste(args[2], "AnovaF", sep="")); - -write(varY, paste(args[2], "VarY", sep="")); - -write(my, paste(args[2], "MeanY", sep="")); - -writeMM(as(CVars,"CsparseMatrix"), paste(args[2], "CVars", sep=""), format="text"); -writeMM(as(CFreqs,"CsparseMatrix"), paste(args[2], "CFreqs", sep=""), format="text"); -writeMM(as(CMeans,"CsparseMatrix"), paste(args[2], "CMeans", sep=""), format="text"); - - - +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# JUnit test class: dml.test.integration.descriptivestats.BivariateScaleCategoricalTest.java +# command line invocation assuming $SC_HOME is set to the home of the R script +# Rscript $SC_HOME/ScaleCategorical.R $SC_HOME/in/ $SC_HOME/expected/ +args <- commandArgs(TRUE) +options(digits=22) + +library("Matrix") + +A = readMM(paste(args[1], "A.mtx", sep="")); +Y = readMM(paste(args[1], "Y.mtx", sep="")); + +Av = A[,1]; +Yv = Y[,1]; + +W = nrow(A); +my = mean(Yv); #sum(Yv)/W; +varY = var(Yv); + +CFreqs = as.matrix(table(Av)); +CMeans = as.matrix(aggregate(Yv, by=list(Av), "mean")$x); +CVars = as.matrix(aggregate(Yv, by=list(Av), "var")$x); + +# number of categories +R = nrow(CFreqs); + +Eta = sqrt(1 - ( sum((CFreqs-1)*CVars) / ((W-1)*varY) )); + +anova_num = sum( (CFreqs*(CMeans-my)^2) )/(R-1); +anova_den = sum( (CFreqs-1)*CVars )/(W-R); +ANOVAF = anova_num/anova_den; + +print(anova_num, digits=15); +print(anova_den, digits=15); + +write(Eta, paste(args[2], "Eta", sep="")); + +write(ANOVAF, paste(args[2], "AnovaF", sep="")); + +write(varY, paste(args[2], "VarY", sep="")); + +write(my, paste(args[2], "MeanY", sep="")); + +writeMM(as(CVars,"CsparseMatrix"), paste(args[2], "CVars", sep=""), format="text"); +writeMM(as(CFreqs,"CsparseMatrix"), paste(args[2], "CFreqs", sep=""), format="text"); +writeMM(as(CMeans,"CsparseMatrix"), paste(args[2], "CMeans", sep=""), format="text"); + + + http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/descriptivestats/ScaleCategorical.dml ---------------------------------------------------------------------- diff --git a/src/test/scripts/applications/descriptivestats/ScaleCategorical.dml b/src/test/scripts/applications/descriptivestats/ScaleCategorical.dml index d0d5135..f9d7efb 100644 --- a/src/test/scripts/applications/descriptivestats/ScaleCategorical.dml +++ b/src/test/scripts/applications/descriptivestats/ScaleCategorical.dml @@ -1,62 +1,62 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -# Note this script is externalized to customers, please do not change w/o consulting component owner. -# How to invoke this dml script ScaleCategorical.dml? -# Assume SC_HOME is set to the home of the dml script -# Assume input and output directories are on hdfs as INPUT_DIR and OUTPUT_DIR -# Assume rows = 10000 for A and Y, A is categorical variable and Y is scale variable -# hadoop jar SystemML.jar -f $SC_HOME/ScaleCategorical.dml -args "$INPUT_DIR/A" 10000 "$INPUT_DIR/Y" -# "$OUPUT_DIR/VarY" "$OUTPUT_DIR/MeanY" "$OUTPUT_DIR/CFreqs" "$OUTPUT_DIR/CMeans" "$OUTPUT_DIR/CVars" -# "$OUTPUT_DIR/Eta", "$OUTPUT_DIR/AnovaF" - -A = read($1, rows=$2, cols=1, format="text"); -Y = read($3, rows=$2, cols=1, format="text"); - -# mean and variance in target variable -W = nrow(A); -my = mean(Y); -varY = moment(Y,2) * W/(W-1.0); - -# category-wise (frequencies, means, variances) -CFreqs = aggregate(target=Y, groups=A, fn="count"); -CMeans = aggregate(target=Y, groups=A, fn="mean"); -CVars = aggregate(target=Y, groups=A, fn="variance"); - -# number of categories -R = nrow(CFreqs); - -Eta = sqrt(1 - ( sum((CFreqs-1)*CVars) / ((W-1)*varY) )); - -anova_num = sum( (CFreqs*(CMeans-my)^2) )/(R-1); -anova_den = sum( (CFreqs-1)*CVars )/(W-R); -ANOVAF = anova_num/anova_den; - -# output required statistics -write(varY, $4); -write(my, $5); - -write(CFreqs, $6, format="text"); -write(CMeans, $7, format="text"); -write(CVars, $8, format="text"); - -write(Eta, $9); -write(ANOVAF, $10); +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# Note this script is externalized to customers, please do not change w/o consulting component owner. +# How to invoke this dml script ScaleCategorical.dml? +# Assume SC_HOME is set to the home of the dml script +# Assume input and output directories are on hdfs as INPUT_DIR and OUTPUT_DIR +# Assume rows = 10000 for A and Y, A is categorical variable and Y is scale variable +# hadoop jar SystemML.jar -f $SC_HOME/ScaleCategorical.dml -args "$INPUT_DIR/A" 10000 "$INPUT_DIR/Y" +# "$OUPUT_DIR/VarY" "$OUTPUT_DIR/MeanY" "$OUTPUT_DIR/CFreqs" "$OUTPUT_DIR/CMeans" "$OUTPUT_DIR/CVars" +# "$OUTPUT_DIR/Eta", "$OUTPUT_DIR/AnovaF" + +A = read($1, rows=$2, cols=1, format="text"); +Y = read($3, rows=$2, cols=1, format="text"); + +# mean and variance in target variable +W = nrow(A); +my = mean(Y); +varY = moment(Y,2) * W/(W-1.0); + +# category-wise (frequencies, means, variances) +CFreqs = aggregate(target=Y, groups=A, fn="count"); +CMeans = aggregate(target=Y, groups=A, fn="mean"); +CVars = aggregate(target=Y, groups=A, fn="variance"); + +# number of categories +R = nrow(CFreqs); + +Eta = sqrt(1 - ( sum((CFreqs-1)*CVars) / ((W-1)*varY) )); + +anova_num = sum( (CFreqs*(CMeans-my)^2) )/(R-1); +anova_den = sum( (CFreqs-1)*CVars )/(W-R); +ANOVAF = anova_num/anova_den; + +# output required statistics +write(varY, $4); +write(my, $5); + +write(CFreqs, $6, format="text"); +write(CMeans, $7, format="text"); +write(CVars, $8, format="text"); + +write(Eta, $9); +write(ANOVAF, $10); http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/descriptivestats/ScaleCategoricalWithWeightsTest.R ---------------------------------------------------------------------- diff --git a/src/test/scripts/applications/descriptivestats/ScaleCategoricalWithWeightsTest.R b/src/test/scripts/applications/descriptivestats/ScaleCategoricalWithWeightsTest.R index 6e67716..bdf1cc2 100644 --- a/src/test/scripts/applications/descriptivestats/ScaleCategoricalWithWeightsTest.R +++ b/src/test/scripts/applications/descriptivestats/ScaleCategoricalWithWeightsTest.R @@ -1,78 +1,78 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -# JUnit test class: dml.test.integration.descriptivestats.BivariateScaleCategoricalTest.java -# command line invocation assuming $SC_HOME is set to the home of the R script -# Rscript $SC_HOME/ScaleCategorical.R $SC_HOME/in/ $SC_HOME/expected/ - -args <- commandArgs(TRUE) -options(digits=22) - -library("Matrix") -# Usage: R --vanilla -args Xfile X < ScaleCategoricalTest.R - -#parseCommandArgs() -###################### -Atemp = readMM(paste(args[1], "A.mtx", sep="")); -Ytemp = readMM(paste(args[1], "Y.mtx", sep="")); -WM = readMM(paste(args[1], "WM.mtx", sep="")); - -Yv=rep(Ytemp[,1],WM[,1]) -Av=rep(Atemp[,1],WM[,1]) - -W = sum(WM); -my = sum(Yv)/W; -varY = var(Yv); - -CFreqs = as.matrix(table(Av)); -CMeans = as.matrix(aggregate(Yv, by=list(Av), "mean")$x); -CVars = as.matrix(aggregate(Yv, by=list(Av), "var")$x); - -# number of categories -R = nrow(CFreqs); - -Eta = sqrt(1 - ( sum((CFreqs-1)*CVars) / ((W-1)*varY) )); - -anova_num = sum( (CFreqs*(CMeans-my)^2) )/(R-1); -anova_den = sum( (CFreqs-1)*CVars )/(W-R); -ANOVAF = anova_num/anova_den; - -print(W, digits=15); -print(R, digits=15); -print(anova_num, digits=15); -print(anova_den, digits=15); - -####################### - -write(Eta, paste(args[2], "Eta", sep="")); - -write(ANOVAF, paste(args[2], "AnovaF", sep="")); - -write(varY, paste(args[2], "VarY", sep="")); - -write(my, paste(args[2], "MeanY", sep="")); - -writeMM(as(CVars,"CsparseMatrix"), paste(args[2], "CVars", sep=""), format="text"); -writeMM(as(CFreqs,"CsparseMatrix"), paste(args[2], "CFreqs", sep=""), format="text"); -writeMM(as(CMeans,"CsparseMatrix"), paste(args[2], "CMeans", sep=""), format="text"); - - - +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# JUnit test class: dml.test.integration.descriptivestats.BivariateScaleCategoricalTest.java +# command line invocation assuming $SC_HOME is set to the home of the R script +# Rscript $SC_HOME/ScaleCategorical.R $SC_HOME/in/ $SC_HOME/expected/ + +args <- commandArgs(TRUE) +options(digits=22) + +library("Matrix") +# Usage: R --vanilla -args Xfile X < ScaleCategoricalTest.R + +#parseCommandArgs() +###################### +Atemp = readMM(paste(args[1], "A.mtx", sep="")); +Ytemp = readMM(paste(args[1], "Y.mtx", sep="")); +WM = readMM(paste(args[1], "WM.mtx", sep="")); + +Yv=rep(Ytemp[,1],WM[,1]) +Av=rep(Atemp[,1],WM[,1]) + +W = sum(WM); +my = sum(Yv)/W; +varY = var(Yv); + +CFreqs = as.matrix(table(Av)); +CMeans = as.matrix(aggregate(Yv, by=list(Av), "mean")$x); +CVars = as.matrix(aggregate(Yv, by=list(Av), "var")$x); + +# number of categories +R = nrow(CFreqs); + +Eta = sqrt(1 - ( sum((CFreqs-1)*CVars) / ((W-1)*varY) )); + +anova_num = sum( (CFreqs*(CMeans-my)^2) )/(R-1); +anova_den = sum( (CFreqs-1)*CVars )/(W-R); +ANOVAF = anova_num/anova_den; + +print(W, digits=15); +print(R, digits=15); +print(anova_num, digits=15); +print(anova_den, digits=15); + +####################### + +write(Eta, paste(args[2], "Eta", sep="")); + +write(ANOVAF, paste(args[2], "AnovaF", sep="")); + +write(varY, paste(args[2], "VarY", sep="")); + +write(my, paste(args[2], "MeanY", sep="")); + +writeMM(as(CVars,"CsparseMatrix"), paste(args[2], "CVars", sep=""), format="text"); +writeMM(as(CFreqs,"CsparseMatrix"), paste(args[2], "CFreqs", sep=""), format="text"); +writeMM(as(CMeans,"CsparseMatrix"), paste(args[2], "CMeans", sep=""), format="text"); + + + http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/descriptivestats/ScaleCategoricalWithWeightsTest.dml ---------------------------------------------------------------------- diff --git a/src/test/scripts/applications/descriptivestats/ScaleCategoricalWithWeightsTest.dml b/src/test/scripts/applications/descriptivestats/ScaleCategoricalWithWeightsTest.dml index b8186fb..7615d54 100644 --- a/src/test/scripts/applications/descriptivestats/ScaleCategoricalWithWeightsTest.dml +++ b/src/test/scripts/applications/descriptivestats/ScaleCategoricalWithWeightsTest.dml @@ -1,65 +1,65 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -# Note this script is externalized to customers, please do not change w/o consulting component owner. -# How to invoke this dml script ScaleCategorical.dml? -# Assume SC_HOME is set to the home of the dml script -# Assume input and output directories are on hdfs as INPUT_DIR and OUTPUT_DIR -# Assume rows = 10000 for A and Y, A is categorical variable and Y is scale variable -# hadoop jar SystemML.jar -f $SC_HOME/ScaleCategorical.dml -args "$INPUT_DIR/A" 10000 "$INPUT_DIR/Y" "$INPUT_DIR/WM" -# "$OUPUT_DIR/VarY" "$OUTPUT_DIR/MeanY" "$OUTPUT_DIR/CFreqs" "$OUTPUT_DIR/CMeans" "$OUTPUT_DIR/CVars" -# "$OUTPUT_DIR/Eta", "$OUTPUT_DIR/AnovaF" - -#A <- nominal variable -#Y <- scale variable -#WM <- weights - -A = read($1, rows=$2, cols=1, format="text"); -Y = read($3, rows=$2, cols=1, format="text"); -WM = read($4, rows=$2, cols=1, format="text"); - -W = sum(WM); -my = sum(Y*WM)/W; -varY = moment(Y,WM,2) * W/(W-1.0); - -CFreqs = aggregate(target=WM, groups=A, fn="sum"); -CMeans = aggregate(target=Y, groups=A, weights=WM, fn="mean"); -CVars = aggregate(target=Y, groups=A, weights=WM, fn="variance"); - -# number of categories -R = nrow(CFreqs); - -Eta = sqrt(1 - ( sum((CFreqs-1)*CVars) / ((W-1)*varY) )); - -anova_num = sum( (CFreqs*(CMeans-my)^2) )/(R-1); -anova_den = sum( (CFreqs-1)*CVars )/(W-R); -ANOVAF = anova_num/anova_den; - -# output required statistics -write(varY, $5); -write(my, $6); - -write(CFreqs, $7, format="text"); -write(CMeans, $8, format="text"); -write(CVars, $9, format="text"); - -write(Eta, $10); -write(ANOVAF, $11); +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# Note this script is externalized to customers, please do not change w/o consulting component owner. +# How to invoke this dml script ScaleCategorical.dml? +# Assume SC_HOME is set to the home of the dml script +# Assume input and output directories are on hdfs as INPUT_DIR and OUTPUT_DIR +# Assume rows = 10000 for A and Y, A is categorical variable and Y is scale variable +# hadoop jar SystemML.jar -f $SC_HOME/ScaleCategorical.dml -args "$INPUT_DIR/A" 10000 "$INPUT_DIR/Y" "$INPUT_DIR/WM" +# "$OUPUT_DIR/VarY" "$OUTPUT_DIR/MeanY" "$OUTPUT_DIR/CFreqs" "$OUTPUT_DIR/CMeans" "$OUTPUT_DIR/CVars" +# "$OUTPUT_DIR/Eta", "$OUTPUT_DIR/AnovaF" + +#A <- nominal variable +#Y <- scale variable +#WM <- weights + +A = read($1, rows=$2, cols=1, format="text"); +Y = read($3, rows=$2, cols=1, format="text"); +WM = read($4, rows=$2, cols=1, format="text"); + +W = sum(WM); +my = sum(Y*WM)/W; +varY = moment(Y,WM,2) * W/(W-1.0); + +CFreqs = aggregate(target=WM, groups=A, fn="sum"); +CMeans = aggregate(target=Y, groups=A, weights=WM, fn="mean"); +CVars = aggregate(target=Y, groups=A, weights=WM, fn="variance"); + +# number of categories +R = nrow(CFreqs); + +Eta = sqrt(1 - ( sum((CFreqs-1)*CVars) / ((W-1)*varY) )); + +anova_num = sum( (CFreqs*(CMeans-my)^2) )/(R-1); +anova_den = sum( (CFreqs-1)*CVars )/(W-R); +ANOVAF = anova_num/anova_den; + +# output required statistics +write(varY, $5); +write(my, $6); + +write(CFreqs, $7, format="text"); +write(CMeans, $8, format="text"); +write(CVars, $9, format="text"); + +write(Eta, $10); +write(ANOVAF, $11); http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/descriptivestats/ScaleScale.R ---------------------------------------------------------------------- diff --git a/src/test/scripts/applications/descriptivestats/ScaleScale.R b/src/test/scripts/applications/descriptivestats/ScaleScale.R index 202a057..690f3a3 100644 --- a/src/test/scripts/applications/descriptivestats/ScaleScale.R +++ b/src/test/scripts/applications/descriptivestats/ScaleScale.R @@ -1,38 +1,38 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -# JUnit test class: dml.test.integration.descriptivestats.CategoricalCategoricalTest.java -# command line invocation assuming $SS_HOME is set to the home of the R script -# Rscript $SS_HOME/ScaleScale.R $SS_HOME/in/ $SS_HOME/expected/ -args <- commandArgs(TRUE) -options(digits=22) - -library("Matrix") - -X = readMM(paste(args[1], "X.mtx", sep="")) -Y = readMM(paste(args[1], "Y.mtx", sep="")) - -# cor.test returns a list containing t-statistic, df, p-value, and R -cort = cor.test(X[,1], Y[,1]); - -R = as.numeric(cort[4]); - -write(R, paste(args[2], "PearsonR", sep="")); +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# JUnit test class: dml.test.integration.descriptivestats.CategoricalCategoricalTest.java +# command line invocation assuming $SS_HOME is set to the home of the R script +# Rscript $SS_HOME/ScaleScale.R $SS_HOME/in/ $SS_HOME/expected/ +args <- commandArgs(TRUE) +options(digits=22) + +library("Matrix") + +X = readMM(paste(args[1], "X.mtx", sep="")) +Y = readMM(paste(args[1], "Y.mtx", sep="")) + +# cor.test returns a list containing t-statistic, df, p-value, and R +cort = cor.test(X[,1], Y[,1]); + +R = as.numeric(cort[4]); + +write(R, paste(args[2], "PearsonR", sep="")); http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/descriptivestats/ScaleScale.dml ---------------------------------------------------------------------- diff --git a/src/test/scripts/applications/descriptivestats/ScaleScale.dml b/src/test/scripts/applications/descriptivestats/ScaleScale.dml index e3d2183..0fd2179 100644 --- a/src/test/scripts/applications/descriptivestats/ScaleScale.dml +++ b/src/test/scripts/applications/descriptivestats/ScaleScale.dml @@ -1,48 +1,48 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -# Note this script is externalized to customers, please do not change w/o consulting component owner. -# How to invoke this dml script ScaleScale.dml? -# Assume $SS_HOME is set to the home of the dml script -# Assume input and output directories are on hdfs as INPUT_DIR and OUTPUT_DIR -# Assume X and Y are scale variables and both have 100000 rows -# hadoop jar SystemML.jar -f $SS_HOME/ScaleScale.dml -args "$INPUT_DIR/X" 100000 "$INPUT_DIR/Y" "$OUPUT_DIR/PearsonR" - -X = read($1, rows=$2, cols=1, format="text"); -Y = read($3, rows=$2, cols=1, format="text"); - -W = nrow(X); - -# Unweighted co-variance -covXY = cov(X,Y); - -# compute standard deviations for both X and Y by computing 2^nd central moment -m2X = moment(X,2); -m2Y = moment(Y,2); -sigmaX = sqrt(m2X * (W/(W-1.0)) ); -sigmaY = sqrt(m2Y * (W/(W-1.0)) ); - -# Pearson's R -R = covXY / (sigmaX*sigmaY); - -write(R, $4); - - +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# Note this script is externalized to customers, please do not change w/o consulting component owner. +# How to invoke this dml script ScaleScale.dml? +# Assume $SS_HOME is set to the home of the dml script +# Assume input and output directories are on hdfs as INPUT_DIR and OUTPUT_DIR +# Assume X and Y are scale variables and both have 100000 rows +# hadoop jar SystemML.jar -f $SS_HOME/ScaleScale.dml -args "$INPUT_DIR/X" 100000 "$INPUT_DIR/Y" "$OUPUT_DIR/PearsonR" + +X = read($1, rows=$2, cols=1, format="text"); +Y = read($3, rows=$2, cols=1, format="text"); + +W = nrow(X); + +# Unweighted co-variance +covXY = cov(X,Y); + +# compute standard deviations for both X and Y by computing 2^nd central moment +m2X = moment(X,2); +m2Y = moment(Y,2); +sigmaX = sqrt(m2X * (W/(W-1.0)) ); +sigmaY = sqrt(m2Y * (W/(W-1.0)) ); + +# Pearson's R +R = covXY / (sigmaX*sigmaY); + +write(R, $4); + +
