http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/descriptivestats/ScaleScalePearsonRWithWeightsTest.R ---------------------------------------------------------------------- diff --git a/src/test/scripts/applications/descriptivestats/ScaleScalePearsonRWithWeightsTest.R b/src/test/scripts/applications/descriptivestats/ScaleScalePearsonRWithWeightsTest.R index fa3b66a..1375435 100644 --- a/src/test/scripts/applications/descriptivestats/ScaleScalePearsonRWithWeightsTest.R +++ b/src/test/scripts/applications/descriptivestats/ScaleScalePearsonRWithWeightsTest.R @@ -1,45 +1,45 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -# JUnit test class: dml.test.integration.descriptivestats.CategoricalCategoricalTest.java -# command line invocation assuming $SS_HOME is set to the home of the R script -# Rscript $SS_HOME/ScaleScalePearsonRWithWeightsTest.R $SS_HOME/in/ $SS_HOME/expected/ -args <- commandArgs(TRUE) -options(digits=22) - -library("Matrix") -library("boot") -# Usage: R --vanilla -args Xfile X < ScaleScaleTest.R - -#parseCommandArgs() -###################### - -X = readMM(paste(args[1], "X.mtx", sep="")) -Y = readMM(paste(args[1], "Y.mtx", sep="")) -WM = readMM(paste(args[1],"WM.mtx", sep="")) - -# create a matrix from X and Y vectors -mat = cbind(X[,1], Y[,1]); - -# corr is a function in "boot" package -R = corr(mat, WM[,1]); - -write(R, paste(args[2], "PearsonR", sep="")); +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# JUnit test class: dml.test.integration.descriptivestats.CategoricalCategoricalTest.java +# command line invocation assuming $SS_HOME is set to the home of the R script +# Rscript $SS_HOME/ScaleScalePearsonRWithWeightsTest.R $SS_HOME/in/ $SS_HOME/expected/ +args <- commandArgs(TRUE) +options(digits=22) + +library("Matrix") +library("boot") +# Usage: R --vanilla -args Xfile X < ScaleScaleTest.R + +#parseCommandArgs() +###################### + +X = readMM(paste(args[1], "X.mtx", sep="")) +Y = readMM(paste(args[1], "Y.mtx", sep="")) +WM = readMM(paste(args[1],"WM.mtx", sep="")) + +# create a matrix from X and Y vectors +mat = cbind(X[,1], Y[,1]); + +# corr is a function in "boot" package +R = corr(mat, WM[,1]); + +write(R, paste(args[2], "PearsonR", sep=""));
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/descriptivestats/ScaleScalePearsonRWithWeightsTest.dml ---------------------------------------------------------------------- diff --git a/src/test/scripts/applications/descriptivestats/ScaleScalePearsonRWithWeightsTest.dml b/src/test/scripts/applications/descriptivestats/ScaleScalePearsonRWithWeightsTest.dml index 82ec837..a3bcf33 100644 --- a/src/test/scripts/applications/descriptivestats/ScaleScalePearsonRWithWeightsTest.dml +++ b/src/test/scripts/applications/descriptivestats/ScaleScalePearsonRWithWeightsTest.dml @@ -1,51 +1,51 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -# Note this script is externalized to customers, please do not change w/o consulting component owner. -# How to invoke this dml script ScaleScale.dml? -# Assume $SS_HOME is set to the home of the dml script -# Assume input and output directories are on hdfs as INPUT_DIR and OUTPUT_DIR -# Assume X and Y are scale variables and both have 100000 rows -# hadoop jar SystemML.jar -f $SS_HOME/ScaleScale.dml -args "$INPUT_DIR/X" 100000 "$INPUT_DIR/Y" "$INPUT_DIR/WM" "$OUPUT_DIR/PearsonR" - -#X <- scale variable -#Y <- scale variable -#WM <- weights - -X = read($1, rows=$2, cols=1, format="text"); -Y = read($3, rows=$2, cols=1, format="text"); -WM = read($4, rows=$2, cols=1, format="text"); - -W = sum(WM); - -# weighted co-variance -covXY = cov(X,Y,WM); - -# compute standard deviations for both X and Y by computing 2^nd central moment -m2X = moment(X,WM,2); -m2Y = moment(Y,WM,2); -sigmaX = sqrt(m2X * (W/(W-1.0)) ); -sigmaY = sqrt(m2Y * (W/(W-1.0)) ); - -# Pearson's R -R = covXY / (sigmaX*sigmaY); - -write(R, $5); +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# Note this script is externalized to customers, please do not change w/o consulting component owner. +# How to invoke this dml script ScaleScale.dml? +# Assume $SS_HOME is set to the home of the dml script +# Assume input and output directories are on hdfs as INPUT_DIR and OUTPUT_DIR +# Assume X and Y are scale variables and both have 100000 rows +# hadoop jar SystemML.jar -f $SS_HOME/ScaleScale.dml -args "$INPUT_DIR/X" 100000 "$INPUT_DIR/Y" "$INPUT_DIR/WM" "$OUPUT_DIR/PearsonR" + +#X <- scale variable +#Y <- scale variable +#WM <- weights + +X = read($1, rows=$2, cols=1, format="text"); +Y = read($3, rows=$2, cols=1, format="text"); +WM = read($4, rows=$2, cols=1, format="text"); + +W = sum(WM); + +# weighted co-variance +covXY = cov(X,Y,WM); + +# compute standard deviations for both X and Y by computing 2^nd central moment +m2X = moment(X,WM,2); +m2Y = moment(Y,WM,2); +sigmaX = sqrt(m2X * (W/(W-1.0)) ); +sigmaY = sqrt(m2Y * (W/(W-1.0)) ); + +# Pearson's R +R = covXY / (sigmaX*sigmaY); + +write(R, $5); http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/descriptivestats/SimpleQuantileTest.dml ---------------------------------------------------------------------- diff --git a/src/test/scripts/applications/descriptivestats/SimpleQuantileTest.dml b/src/test/scripts/applications/descriptivestats/SimpleQuantileTest.dml index 53aea26..9f7a9be 100644 --- a/src/test/scripts/applications/descriptivestats/SimpleQuantileTest.dml +++ b/src/test/scripts/applications/descriptivestats/SimpleQuantileTest.dml @@ -1,35 +1,35 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -$$readhelper$$ - -V = read("$$indir$$vector", rows=$$rows$$, cols=1, format="text") -W = read("$$indir$$weight", rows=$$rows$$, cols=1, format="text") - -# median -md = median(V) #quantile(V, 0.5) -medianHelper1 = md * Helper; -write(medianHelper1, "$$outdir$$median", format="text"); - -# weighted median -wmd = median(V,W) #quantile(V, W, 0.5) -medianHelper2 = wmd * Helper; +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +$$readhelper$$ + +V = read("$$indir$$vector", rows=$$rows$$, cols=1, format="text") +W = read("$$indir$$weight", rows=$$rows$$, cols=1, format="text") + +# median +md = median(V) #quantile(V, 0.5) +medianHelper1 = md * Helper; +write(medianHelper1, "$$outdir$$median", format="text"); + +# weighted median +wmd = median(V,W) #quantile(V, W, 0.5) +medianHelper2 = wmd * Helper; write(medianHelper2, "$$outdir$$weighted_median", format="text"); \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/descriptivestats/WeightedCategoricalTest.R ---------------------------------------------------------------------- diff --git a/src/test/scripts/applications/descriptivestats/WeightedCategoricalTest.R b/src/test/scripts/applications/descriptivestats/WeightedCategoricalTest.R index a5986db..c7b8e8f 100644 --- a/src/test/scripts/applications/descriptivestats/WeightedCategoricalTest.R +++ b/src/test/scripts/applications/descriptivestats/WeightedCategoricalTest.R @@ -1,63 +1,63 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -# JUnit test class: dml.test.integration.descriptivestats.UnivariateStatsTest.java -# command line invocation assuming $C_HOME is set to the home of the R script -# Rscript $C_HOME/Categorical.R $C_HOME/in/ $C_HOME/expected/ -args <- commandArgs(TRUE) -options(digits=22) - -#library("batch") -library("Matrix") -# Usage: R --vanilla -args Xfile X < DescriptiveStatistics.R - -#parseCommandArgs() -###################### - -V = readMM(paste(args[1], "vector.mtx", sep="")) -W = readMM(paste(args[1], "weight.mtx", sep="")) - -tab = table(rep(V[,1],W[,1])) -cat = t(as.numeric(names(tab))) -Nc = t(as.vector(tab)) - -# the number of categories of a categorical variable -R = length(Nc) - -# total count -s = sum(Nc) - -# percentage values of each categorical compare to the total case number -Pc = Nc / s - -# all categorical values of a categorical variable -#C = t(as.matrix(as.numeric(Nc > 0))) -C= (Nc > 0) - -# mode -mx = max(Nc) -Mode = (Nc == mx) - -writeMM(as(t(Nc),"CsparseMatrix"), paste(args[2], "Nc", sep=""), format="text"); -write(R, paste(args[2], "R", sep="")); -writeMM(as(t(Pc),"CsparseMatrix"), paste(args[2], "Pc", sep=""), format="text"); -writeMM(as(t(C),"CsparseMatrix"), paste(args[2], "C", sep=""), format="text"); -writeMM(as(t(Mode),"CsparseMatrix"), paste(args[2], "Mode", sep=""), format="text"); +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# JUnit test class: dml.test.integration.descriptivestats.UnivariateStatsTest.java +# command line invocation assuming $C_HOME is set to the home of the R script +# Rscript $C_HOME/Categorical.R $C_HOME/in/ $C_HOME/expected/ +args <- commandArgs(TRUE) +options(digits=22) + +#library("batch") +library("Matrix") +# Usage: R --vanilla -args Xfile X < DescriptiveStatistics.R + +#parseCommandArgs() +###################### + +V = readMM(paste(args[1], "vector.mtx", sep="")) +W = readMM(paste(args[1], "weight.mtx", sep="")) + +tab = table(rep(V[,1],W[,1])) +cat = t(as.numeric(names(tab))) +Nc = t(as.vector(tab)) + +# the number of categories of a categorical variable +R = length(Nc) + +# total count +s = sum(Nc) + +# percentage values of each categorical compare to the total case number +Pc = Nc / s + +# all categorical values of a categorical variable +#C = t(as.matrix(as.numeric(Nc > 0))) +C= (Nc > 0) + +# mode +mx = max(Nc) +Mode = (Nc == mx) + +writeMM(as(t(Nc),"CsparseMatrix"), paste(args[2], "Nc", sep=""), format="text"); +write(R, paste(args[2], "R", sep="")); +writeMM(as(t(Pc),"CsparseMatrix"), paste(args[2], "Pc", sep=""), format="text"); +writeMM(as(t(C),"CsparseMatrix"), paste(args[2], "C", sep=""), format="text"); +writeMM(as(t(Mode),"CsparseMatrix"), paste(args[2], "Mode", sep=""), format="text"); http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/descriptivestats/WeightedCategoricalTest.dml ---------------------------------------------------------------------- diff --git a/src/test/scripts/applications/descriptivestats/WeightedCategoricalTest.dml b/src/test/scripts/applications/descriptivestats/WeightedCategoricalTest.dml index d838cff..cacc311 100644 --- a/src/test/scripts/applications/descriptivestats/WeightedCategoricalTest.dml +++ b/src/test/scripts/applications/descriptivestats/WeightedCategoricalTest.dml @@ -1,56 +1,56 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -# Note this script is externalized to customers, please do not change w/o consulting component owner. -# How to invoke this dml script Categorical.dml? -# Assume C_HOME is set to the home of the dml script -# Assume input and output directories are on hdfs as INPUT_DIR and OUTPUT_DIR -# Assume rows = 10000 for vector -# hadoop jar SystemML.jar -f $C_HOME/Categorical.dml -args "$INPUT_DIR/vector" 10000 "$INPUT_DIR/W" "$OUTPUT_DIR/Nc" "$OUPUT_DIR/R" "$OUTPUT_DIR/Pc" "$OUTPUT_DIR/C" "$OUTPUT_DIR/Mode" - -V = read($1, rows=$2, cols=1, format="text") -W = read($3, rows=$2, cols=1, format="text") - -# a set of number of values specify the number of cases of each categorical -Nc = table(V, 1, W); - -# the number of categories of a categorical variable -R = nrow(Nc) - -# total count -s = sum(Nc) - -# percentage values of each categorical compare to the total case number -Pc = Nc / s - -# all categorical values of a categorical variable -C = ppred(Nc, 0, ">") - -# mode -mx = max(Nc) -Mode = ppred(Nc, mx, "==") - -write(Nc, $4, format="text") -write(R, $5) -write(Pc, $6, format="text") -write(C, $7, format="text") -write(Mode, $8, format="text") - +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# Note this script is externalized to customers, please do not change w/o consulting component owner. +# How to invoke this dml script Categorical.dml? +# Assume C_HOME is set to the home of the dml script +# Assume input and output directories are on hdfs as INPUT_DIR and OUTPUT_DIR +# Assume rows = 10000 for vector +# hadoop jar SystemML.jar -f $C_HOME/Categorical.dml -args "$INPUT_DIR/vector" 10000 "$INPUT_DIR/W" "$OUTPUT_DIR/Nc" "$OUPUT_DIR/R" "$OUTPUT_DIR/Pc" "$OUTPUT_DIR/C" "$OUTPUT_DIR/Mode" + +V = read($1, rows=$2, cols=1, format="text") +W = read($3, rows=$2, cols=1, format="text") + +# a set of number of values specify the number of cases of each categorical +Nc = table(V, 1, W); + +# the number of categories of a categorical variable +R = nrow(Nc) + +# total count +s = sum(Nc) + +# percentage values of each categorical compare to the total case number +Pc = Nc / s + +# all categorical values of a categorical variable +C = ppred(Nc, 0, ">") + +# mode +mx = max(Nc) +Mode = ppred(Nc, mx, "==") + +write(Nc, $4, format="text") +write(R, $5) +write(Pc, $6, format="text") +write(C, $7, format="text") +write(Mode, $8, format="text") + http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/descriptivestats/WeightedScaleTest.R ---------------------------------------------------------------------- diff --git a/src/test/scripts/applications/descriptivestats/WeightedScaleTest.R b/src/test/scripts/applications/descriptivestats/WeightedScaleTest.R index 866ca02..eba3d1c 100644 --- a/src/test/scripts/applications/descriptivestats/WeightedScaleTest.R +++ b/src/test/scripts/applications/descriptivestats/WeightedScaleTest.R @@ -1,155 +1,155 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -# JUnit test class: dml.test.integration.descriptivestats.UnivariateStatsTest.java -# command line invocation assuming $S_HOME is set to the home of the R script -# Rscript $S_HOME/WeightedScaleTest.R $S_HOME/in/ $S_HOME/expected/ -args <- commandArgs(TRUE) -options(digits=22) - -options(repos="http://cran.stat.ucla.edu/") -is.installed <- function(mypkg) is.element(mypkg, installed.packages()[,1]) - -is_plotrix = is.installed("plotrix"); -if ( !is_plotrix ) { -install.packages("plotrix"); -} -library("plotrix"); - -is_psych = is.installed("psych"); -if ( !is_psych ) { -install.packages("psych"); -} -library("psych") - -is_moments = is.installed("moments"); -if( !is_moments){ -install.packages("moments"); -} -library("moments") - -#library("batch") -library("Matrix") - -# Usage: R --vanilla -args Xfile X < DescriptiveStatistics.R - -#parseCommandArgs() -###################### - -Temp = readMM(paste(args[1], "vector.mtx", sep="")) -W = readMM(paste(args[1], "weight.mtx", sep="")) -P = readMM(paste(args[1], "prob.mtx", sep="")) - -W = round(W) - -V=rep(Temp[,1],W[,1]) - -n = sum(W) - -# sum -s1 = sum(V) - -# mean -mu = s1/n - -# variances -var = var(V) - -# standard deviations -std_dev = sd(V, na.rm = FALSE) - -# standard errors of mean -SE = std.error(V, na.rm) - -# coefficients of variation -cv = std_dev/mu - -# harmonic means (note: may generate out of memory for large sparse matrices becauses of NaNs) -#har_mu = harmonic.mean(V) - -# geometric means is not currently supported. -#geom_mu = geometric.mean(V) - -# min and max -mn=min(V) -mx=max(V) - -# range -rng = mx - mn - -# Skewness -g1 = n^2*moment(V, order=3, central=TRUE)/((n-1)*(n-2)*std_dev^3) - -# standard error of skewness (not sure how it is defined without the weight) -se_g1=sqrt( 6*n*(n-1.0) / ((n-2.0)*(n+1.0)*(n+3.0)) ) - -m2 = moment(V, order=2, central=TRUE) -m4 = moment(V, order=4, central=TRUE) - -# Kurtosis (using binomial formula) -g2 = (n^2*(n+1)*m4-3*m2^2*n^2*(n-1))/((n-1)*(n-2)*(n-3)*var^2) - -# Standard error of Kurtosis (not sure how it is defined without the weight) -se_g2= sqrt( (4*(n^2-1)*se_g1^2)/((n+5)*(n-3)) ) - -# median -md = median(V) #quantile(V, 0.5, type = 1) - -# quantile -Q = t(quantile(V, P[,1], type = 1)) - -# inter-quartile mean -S=c(sort(V)) - -q25d=n*0.25 -q75d=n*0.75 -q25i=ceiling(q25d) -q75i=ceiling(q75d) - -iqm = sum(S[(q25i+1):q75i]) -iqm = iqm + (q25i-q25d)*S[q25i] - (q75i-q75d)*S[q75i] -iqm = iqm/(n*0.5) - -#print(paste("IQM ", iqm)); - -# outliers use ppred to describe it -out_minus = t(as.numeric(Temp < mu-5*std_dev)*Temp) -out_plus = t(as.numeric(Temp > mu+5*std_dev)*Temp) - -write(mu, paste(args[2], "mean", sep="")); -write(std_dev, paste(args[2], "std", sep="")); -write(SE, paste(args[2], "se", sep="")); -write(var, paste(args[2], "var", sep="")); -write(cv, paste(args[2], "cv", sep="")); -# write(har_mu),paste(args[2], "har", sep="")); -# write(geom_mu, paste(args[2], "geom", sep="")); -write(mn, paste(args[2], "min", sep="")); -write(mx, paste(args[2], "max", sep="")); -write(rng, paste(args[2], "rng", sep="")); -write(g1, paste(args[2], "g1", sep="")); -write(se_g1, paste(args[2], "se_g1", sep="")); -write(g2, paste(args[2], "g2", sep="")); -write(se_g2, paste(args[2], "se_g2", sep="")); -write(md, paste(args[2], "median", sep="")); -write(iqm, paste(args[2], "iqm", sep="")); -writeMM(as(t(out_minus),"CsparseMatrix"), paste(args[2], "out_minus", sep=""), format="text"); -writeMM(as(t(out_plus),"CsparseMatrix"), paste(args[2], "out_plus", sep=""), format="text"); -writeMM(as(t(Q),"CsparseMatrix"), paste(args[2], "quantile", sep=""), format="text"); +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# JUnit test class: dml.test.integration.descriptivestats.UnivariateStatsTest.java +# command line invocation assuming $S_HOME is set to the home of the R script +# Rscript $S_HOME/WeightedScaleTest.R $S_HOME/in/ $S_HOME/expected/ +args <- commandArgs(TRUE) +options(digits=22) + +options(repos="http://cran.stat.ucla.edu/") +is.installed <- function(mypkg) is.element(mypkg, installed.packages()[,1]) + +is_plotrix = is.installed("plotrix"); +if ( !is_plotrix ) { +install.packages("plotrix"); +} +library("plotrix"); + +is_psych = is.installed("psych"); +if ( !is_psych ) { +install.packages("psych"); +} +library("psych") + +is_moments = is.installed("moments"); +if( !is_moments){ +install.packages("moments"); +} +library("moments") + +#library("batch") +library("Matrix") + +# Usage: R --vanilla -args Xfile X < DescriptiveStatistics.R + +#parseCommandArgs() +###################### + +Temp = readMM(paste(args[1], "vector.mtx", sep="")) +W = readMM(paste(args[1], "weight.mtx", sep="")) +P = readMM(paste(args[1], "prob.mtx", sep="")) + +W = round(W) + +V=rep(Temp[,1],W[,1]) + +n = sum(W) + +# sum +s1 = sum(V) + +# mean +mu = s1/n + +# variances +var = var(V) + +# standard deviations +std_dev = sd(V, na.rm = FALSE) + +# standard errors of mean +SE = std.error(V, na.rm) + +# coefficients of variation +cv = std_dev/mu + +# harmonic means (note: may generate out of memory for large sparse matrices becauses of NaNs) +#har_mu = harmonic.mean(V) + +# geometric means is not currently supported. +#geom_mu = geometric.mean(V) + +# min and max +mn=min(V) +mx=max(V) + +# range +rng = mx - mn + +# Skewness +g1 = n^2*moment(V, order=3, central=TRUE)/((n-1)*(n-2)*std_dev^3) + +# standard error of skewness (not sure how it is defined without the weight) +se_g1=sqrt( 6*n*(n-1.0) / ((n-2.0)*(n+1.0)*(n+3.0)) ) + +m2 = moment(V, order=2, central=TRUE) +m4 = moment(V, order=4, central=TRUE) + +# Kurtosis (using binomial formula) +g2 = (n^2*(n+1)*m4-3*m2^2*n^2*(n-1))/((n-1)*(n-2)*(n-3)*var^2) + +# Standard error of Kurtosis (not sure how it is defined without the weight) +se_g2= sqrt( (4*(n^2-1)*se_g1^2)/((n+5)*(n-3)) ) + +# median +md = median(V) #quantile(V, 0.5, type = 1) + +# quantile +Q = t(quantile(V, P[,1], type = 1)) + +# inter-quartile mean +S=c(sort(V)) + +q25d=n*0.25 +q75d=n*0.75 +q25i=ceiling(q25d) +q75i=ceiling(q75d) + +iqm = sum(S[(q25i+1):q75i]) +iqm = iqm + (q25i-q25d)*S[q25i] - (q75i-q75d)*S[q75i] +iqm = iqm/(n*0.5) + +#print(paste("IQM ", iqm)); + +# outliers use ppred to describe it +out_minus = t(as.numeric(Temp < mu-5*std_dev)*Temp) +out_plus = t(as.numeric(Temp > mu+5*std_dev)*Temp) + +write(mu, paste(args[2], "mean", sep="")); +write(std_dev, paste(args[2], "std", sep="")); +write(SE, paste(args[2], "se", sep="")); +write(var, paste(args[2], "var", sep="")); +write(cv, paste(args[2], "cv", sep="")); +# write(har_mu),paste(args[2], "har", sep="")); +# write(geom_mu, paste(args[2], "geom", sep="")); +write(mn, paste(args[2], "min", sep="")); +write(mx, paste(args[2], "max", sep="")); +write(rng, paste(args[2], "rng", sep="")); +write(g1, paste(args[2], "g1", sep="")); +write(se_g1, paste(args[2], "se_g1", sep="")); +write(g2, paste(args[2], "g2", sep="")); +write(se_g2, paste(args[2], "se_g2", sep="")); +write(md, paste(args[2], "median", sep="")); +write(iqm, paste(args[2], "iqm", sep="")); +writeMM(as(t(out_minus),"CsparseMatrix"), paste(args[2], "out_minus", sep=""), format="text"); +writeMM(as(t(out_plus),"CsparseMatrix"), paste(args[2], "out_plus", sep=""), format="text"); +writeMM(as(t(Q),"CsparseMatrix"), paste(args[2], "quantile", sep=""), format="text"); http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/descriptivestats/WeightedScaleTest.dml ---------------------------------------------------------------------- diff --git a/src/test/scripts/applications/descriptivestats/WeightedScaleTest.dml b/src/test/scripts/applications/descriptivestats/WeightedScaleTest.dml index 6968c1e..aac58de 100644 --- a/src/test/scripts/applications/descriptivestats/WeightedScaleTest.dml +++ b/src/test/scripts/applications/descriptivestats/WeightedScaleTest.dml @@ -1,125 +1,125 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -# Note this script is externalized to customers, please do not change w/o consulting component owner. -# How to invoke this dml script Scale.dml? -# Assume S_HOME is set to the home of the dml script -# Assume input and output directories are on hdfs as INPUT_DIR and OUTPUT_DIR -# Assume rows = 10000 for V and rows = 5 for P -# hadoop jar SystemML.jar -f $S_HOME/Scale.dml -args "$INPUT_DIR/vector" 10000 "$INPUT_DIR/weight "$INPUT_DIR/prob 5 -# "$OUTPUT_DIR/mean" "$OUTPUT_DIR/std" "$OUTPUT_DIR/se" "$OUTPUT_DIR/var" "$OUTPUT_DIR/cv" -# "$OUTPUT_DIR/min" "$OUTPUT_DIR/max" "$OUTPUT_DIR/rng" -# "$OUTPUT_DIR/g1" "$OUTPUT_DIR/se_g1" "$OUTPUT_DIR/g2" "$OUTPUT_DIR/se_g2" -# "$OUTPUT_DIR/median" "$OUTPUT_DIR/iqm" -# "OUTPUT_DIR/out_minus" "$OUTPUT_DIR/out_plus" "$OUTPUT_DIR/quantile" - -V = read($1, rows=$2, cols=1, format="text") -W = read($3, rows=$2, cols=1, format="text") -P = read($4, rows=$5, cols=1, format="text") - -W = round(W) - -n = nrow(V) - -wt = sum(W) - -# sum -s1 = sum(V*W) - -# 2nd central moment -m2 = moment(V, W, 2) - -# 3rd central moment -m3 = moment(V, W, 3) - -# 4th central moment -m4 = moment(V, W, 4) - -# mean -mu = mean(V, W) - -# variances -var = m2*wt/(wt-1.0) - -# standard deviations -std_dev = sqrt(var) - -# standard errors of mean -SE = std_dev/sqrt(wt) - -# coefficients of variation -cv = std_dev/mu - -# harmonic means (note: may generate out of memory for large sparse matrices becauses of NaNs) -#har_mu = wt/(sum((1.0/V)*W)) - -# geometric means is not currently supported. -#geom_mu = wt*exp(sum(log(V)*W)/wt) - -# min and max -mn=min(V) -mx=max(V) - -# range -rng = mx - mn - -# Skewness -g1 = wt^2*m3/((wt-1)*(wt-2)*std_dev^3) - -# standard error of skewness -se_g1=sqrt( 6*wt*(wt-1) / ((wt-2)*(wt+1)*(wt+3)) ) - -# Kurtosis (using binomial formula) -g2 = (wt^2*(wt+1)*m4-3*m2^2*wt^2*(wt-1))/((wt-1)*(wt-2)*(wt-3)*std_dev^4) - -# Standard error of Kurtosis -se_g2= sqrt( (4*(wt^2-1)*se_g1^2)/((wt+5)*(wt-3)) ) - -# outliers use ppred to describe it -out_minus = ppred(V, mu-5*std_dev, "<")*V -out_plus = ppred(V, mu+5*std_dev, ">")*V - -# median -md = median(V,W); #quantile(V, W, 0.5) - -# quantile -Q = quantile(V, W, P) - -# inter-quartile mean -iqm = interQuartileMean(V, W) - -write(mu, $6); -write(std_dev, $7); -write(SE, $8); -write(var, $9); -write(cv, $10); -write(mn, $11); -write(mx, $12); -write(rng, $13); -write(g1, $14); -write(se_g1, $15); -write(g2, $16); -write(se_g2, $17); -write(md, $18); -write(iqm, $19); -write(out_minus, $20, format="text"); -write(out_plus, $21, format="text"); -write(Q, $22, format="text"); +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# Note this script is externalized to customers, please do not change w/o consulting component owner. +# How to invoke this dml script Scale.dml? +# Assume S_HOME is set to the home of the dml script +# Assume input and output directories are on hdfs as INPUT_DIR and OUTPUT_DIR +# Assume rows = 10000 for V and rows = 5 for P +# hadoop jar SystemML.jar -f $S_HOME/Scale.dml -args "$INPUT_DIR/vector" 10000 "$INPUT_DIR/weight "$INPUT_DIR/prob 5 +# "$OUTPUT_DIR/mean" "$OUTPUT_DIR/std" "$OUTPUT_DIR/se" "$OUTPUT_DIR/var" "$OUTPUT_DIR/cv" +# "$OUTPUT_DIR/min" "$OUTPUT_DIR/max" "$OUTPUT_DIR/rng" +# "$OUTPUT_DIR/g1" "$OUTPUT_DIR/se_g1" "$OUTPUT_DIR/g2" "$OUTPUT_DIR/se_g2" +# "$OUTPUT_DIR/median" "$OUTPUT_DIR/iqm" +# "OUTPUT_DIR/out_minus" "$OUTPUT_DIR/out_plus" "$OUTPUT_DIR/quantile" + +V = read($1, rows=$2, cols=1, format="text") +W = read($3, rows=$2, cols=1, format="text") +P = read($4, rows=$5, cols=1, format="text") + +W = round(W) + +n = nrow(V) + +wt = sum(W) + +# sum +s1 = sum(V*W) + +# 2nd central moment +m2 = moment(V, W, 2) + +# 3rd central moment +m3 = moment(V, W, 3) + +# 4th central moment +m4 = moment(V, W, 4) + +# mean +mu = mean(V, W) + +# variances +var = m2*wt/(wt-1.0) + +# standard deviations +std_dev = sqrt(var) + +# standard errors of mean +SE = std_dev/sqrt(wt) + +# coefficients of variation +cv = std_dev/mu + +# harmonic means (note: may generate out of memory for large sparse matrices becauses of NaNs) +#har_mu = wt/(sum((1.0/V)*W)) + +# geometric means is not currently supported. +#geom_mu = wt*exp(sum(log(V)*W)/wt) + +# min and max +mn=min(V) +mx=max(V) + +# range +rng = mx - mn + +# Skewness +g1 = wt^2*m3/((wt-1)*(wt-2)*std_dev^3) + +# standard error of skewness +se_g1=sqrt( 6*wt*(wt-1) / ((wt-2)*(wt+1)*(wt+3)) ) + +# Kurtosis (using binomial formula) +g2 = (wt^2*(wt+1)*m4-3*m2^2*wt^2*(wt-1))/((wt-1)*(wt-2)*(wt-3)*std_dev^4) + +# Standard error of Kurtosis +se_g2= sqrt( (4*(wt^2-1)*se_g1^2)/((wt+5)*(wt-3)) ) + +# outliers use ppred to describe it +out_minus = ppred(V, mu-5*std_dev, "<")*V +out_plus = ppred(V, mu+5*std_dev, ">")*V + +# median +md = median(V,W); #quantile(V, W, 0.5) + +# quantile +Q = quantile(V, W, P) + +# inter-quartile mean +iqm = interQuartileMean(V, W) + +write(mu, $6); +write(std_dev, $7); +write(SE, $8); +write(var, $9); +write(cv, $10); +write(mn, $11); +write(mx, $12); +write(rng, $13); +write(g1, $14); +write(se_g1, $15); +write(g2, $16); +write(se_g2, $17); +write(md, $18); +write(iqm, $19); +write(out_minus, $20, format="text"); +write(out_plus, $21, format="text"); +write(Q, $22, format="text"); http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/glm/GLM.R ---------------------------------------------------------------------- diff --git a/src/test/scripts/applications/glm/GLM.R b/src/test/scripts/applications/glm/GLM.R index d569319..32b51f9 100644 --- a/src/test/scripts/applications/glm/GLM.R +++ b/src/test/scripts/applications/glm/GLM.R @@ -1,141 +1,141 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -# JUnit test class: dml.test.integration.applications.GLMTest.java -# Intended to solve GLM Regression using R, in order to compare against the DML implementation -# INPUT 1: Matrix X [rows, columns] -# INPUT 2: Matrix y [rows, 1] -# INPUT 3-6: Distribution family and link, see below: -# --------------------------------------------- -# Dst Var Lnk Lnk Distribution Cano- -# typ pow typ pow Family.link nical? -# --------------------------------------------- -# 1 0.0 1 -1.0 Gaussian.inverse -# 1 0.0 1 0.0 Gaussian.log -# 1 0.0 1 1.0 Gaussian.id Yes -# 1 1.0 1 0.0 Poisson.log Yes -# 1 1.0 1 0.5 Poisson.sqrt -# 1 1.0 1 1.0 Poisson.id -# 1 2.0 1 -1.0 Gamma.inverse Yes -# 1 2.0 1 0.0 Gamma.log -# 1 2.0 1 1.0 Gamma.id -# 1 3.0 1 -2.0 InvGaussian.1/mu^2 Yes -# 1 3.0 1 -1.0 InvGaussian.inverse -# 1 3.0 1 0.0 InvGaussian.log -# 1 3.0 1 1.0 InvGaussian.id -# 1 * 1 * AnyVariance.AnyLink -# --------------------------------------------- -# 2 -1.0 * * Binomial {-1, 1} -# 2 0.0 * * Binomial { 0, 1} -# 2 1.0 * * Binomial two-column -# 2 * 1 0.0 Binomial.log -# 2 * 2 * Binomial.logit Yes -# 2 * 3 * Binomial.probit -# 2 * 4 * Binomial.cloglog -# 2 * 5 * Binomial.cauchit -# --------------------------------------------- -# INPUT 3: (int) Distribution type -# INPUT 4: (double) For Power families: Variance power of the mean -# INPUT 5: (int) Link function type -# INPUT 6: (double) Link as power of the mean -# INPUT 7: (int) Intercept: 0 = no, 1 = yes -# INPUT 8: (double) tolerance (epsilon) -# INPUT 9: the regression coefficients output file -# OUTPUT : Matrix beta [columns, 1] -# -# Assume that $GLMR_HOME is set to the home of the R script -# Assume input and output directories are $GLMR_HOME/in/ and $GLMR_HOME/expected/ -# Rscript $GLMR_HOME/GLM.R $GLMR_HOME/in/X.mtx $GLMR_HOME/in/y.mtx 2 0.0 2 0.0 1 0.00000001 $GLMR_HOME/expected/w.mtx - -args <- commandArgs (TRUE); - -library ("Matrix"); -# library ("batch"); - -options (warn = -1); - -X_here <- readMM (args[1]); # (paste (args[1], "X.mtx", sep="")); -y_here <- readMM (args[2]); # (paste (args[1], "y.mtx", sep="")); - -num_records <- nrow (X_here); -num_features <- ncol (X_here); -dist_type <- as.integer (args[3]); -dist_param <- as.numeric (args[4]); -link_type <- as.integer (args[5]); -link_power <- as.numeric (args[6]); -icept <- as.integer (args[7]); -eps_n <- as.numeric (args[8]); - -f_ly <- gaussian (); -var_power <- dist_param; - -if (dist_type == 1 & var_power == 0.0 & link_type == 1 & link_power == 1.0) { f_ly <- gaussian (link = "identity"); } else -if (dist_type == 1 & var_power == 0.0 & link_type == 1 & link_power == -1.0) { f_ly <- gaussian (link = "inverse"); } else -if (dist_type == 1 & var_power == 0.0 & link_type == 1 & link_power == 0.0) { f_ly <- gaussian (link = "log"); } else -if (dist_type == 1 & var_power == 1.0 & link_type == 1 & link_power == 1.0) { f_ly <- poisson (link = "identity"); } else -if (dist_type == 1 & var_power == 1.0 & link_type == 1 & link_power == 0.0) { f_ly <- poisson (link = "log"); } else -if (dist_type == 1 & var_power == 1.0 & link_type == 1 & link_power == 0.5) { f_ly <- poisson (link = "sqrt"); } else -if (dist_type == 1 & var_power == 2.0 & link_type == 1 & link_power == 1.0) { f_ly <- Gamma (link = "identity"); } else -if (dist_type == 1 & var_power == 2.0 & link_type == 1 & link_power == -1.0) { f_ly <- Gamma (link = "inverse"); } else -if (dist_type == 1 & var_power == 2.0 & link_type == 1 & link_power == 0.0) { f_ly <- Gamma (link = "log"); } else -if (dist_type == 1 & var_power == 3.0 & link_type == 1 & link_power == 1.0) { f_ly <- inverse.gaussian (link = "identity"); } else -if (dist_type == 1 & var_power == 3.0 & link_type == 1 & link_power == -1.0) { f_ly <- inverse.gaussian (link = "inverse"); } else -if (dist_type == 1 & var_power == 3.0 & link_type == 1 & link_power == 0.0) { f_ly <- inverse.gaussian (link = "log"); } else -if (dist_type == 1 & var_power == 3.0 & link_type == 1 & link_power == -2.0) { f_ly <- inverse.gaussian (link = "1/mu^2"); } else -if (dist_type == 2 & link_type == 1 & link_power == 0.0) { f_ly <- binomial (link = "log"); } else -if (dist_type == 2 & link_type == 1 & link_power == 1.0) { f_ly <- binomial (link = "identity"); } else -if (dist_type == 2 & link_type == 1 & link_power == 0.5) { f_ly <- binomial (link = "sqrt"); } else -if (dist_type == 2 & link_type == 2 ) { f_ly <- binomial (link = "logit"); } else -if (dist_type == 2 & link_type == 3 ) { f_ly <- binomial (link = "probit"); } else -if (dist_type == 2 & link_type == 4 ) { f_ly <- binomial (link = "cloglog"); } else -if (dist_type == 2 & link_type == 5 ) { f_ly <- binomial (link = "cauchit"); } - -# quasi(link = "identity", variance = "constant") -# quasibinomial(link = "logit") -# quasipoisson(link = "log") - -if (dist_type == 2 & dist_param != 1.0) { - y_here <- (y_here - dist_param) / (1.0 - dist_param); -} - -# epsilon tolerance: the iterations converge when |dev - devold|/(|dev| + 0.1) < epsilon. -# maxit integer giving the maximal number of IWLS iterations. -# trace logical indicating if output should be produced for each iteration. -# -c_rol <- glm.control (epsilon = eps_n, maxit = 100, trace = FALSE); - -X_matrix = as.matrix (X_here); -y_matrix = as.matrix (y_here); - -if (icept == 0) { - glmOut <- glm (y_matrix ~ X_matrix - 1, family = f_ly, control = c_rol); - betas <- coef (glmOut); -} else { - glmOut <- glm (y_matrix ~ X_matrix , family = f_ly, control = c_rol); - betas <- coef (glmOut); - beta_intercept = betas [1]; - betas [1 : num_features] = betas [2 : (num_features + 1)]; - betas [num_features + 1] = beta_intercept; -} - -print (c("Deviance", glmOut$deviance)); -writeMM (as (betas, "CsparseMatrix"), args[9], format = "text"); - +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# JUnit test class: dml.test.integration.applications.GLMTest.java +# Intended to solve GLM Regression using R, in order to compare against the DML implementation +# INPUT 1: Matrix X [rows, columns] +# INPUT 2: Matrix y [rows, 1] +# INPUT 3-6: Distribution family and link, see below: +# --------------------------------------------- +# Dst Var Lnk Lnk Distribution Cano- +# typ pow typ pow Family.link nical? +# --------------------------------------------- +# 1 0.0 1 -1.0 Gaussian.inverse +# 1 0.0 1 0.0 Gaussian.log +# 1 0.0 1 1.0 Gaussian.id Yes +# 1 1.0 1 0.0 Poisson.log Yes +# 1 1.0 1 0.5 Poisson.sqrt +# 1 1.0 1 1.0 Poisson.id +# 1 2.0 1 -1.0 Gamma.inverse Yes +# 1 2.0 1 0.0 Gamma.log +# 1 2.0 1 1.0 Gamma.id +# 1 3.0 1 -2.0 InvGaussian.1/mu^2 Yes +# 1 3.0 1 -1.0 InvGaussian.inverse +# 1 3.0 1 0.0 InvGaussian.log +# 1 3.0 1 1.0 InvGaussian.id +# 1 * 1 * AnyVariance.AnyLink +# --------------------------------------------- +# 2 -1.0 * * Binomial {-1, 1} +# 2 0.0 * * Binomial { 0, 1} +# 2 1.0 * * Binomial two-column +# 2 * 1 0.0 Binomial.log +# 2 * 2 * Binomial.logit Yes +# 2 * 3 * Binomial.probit +# 2 * 4 * Binomial.cloglog +# 2 * 5 * Binomial.cauchit +# --------------------------------------------- +# INPUT 3: (int) Distribution type +# INPUT 4: (double) For Power families: Variance power of the mean +# INPUT 5: (int) Link function type +# INPUT 6: (double) Link as power of the mean +# INPUT 7: (int) Intercept: 0 = no, 1 = yes +# INPUT 8: (double) tolerance (epsilon) +# INPUT 9: the regression coefficients output file +# OUTPUT : Matrix beta [columns, 1] +# +# Assume that $GLMR_HOME is set to the home of the R script +# Assume input and output directories are $GLMR_HOME/in/ and $GLMR_HOME/expected/ +# Rscript $GLMR_HOME/GLM.R $GLMR_HOME/in/X.mtx $GLMR_HOME/in/y.mtx 2 0.0 2 0.0 1 0.00000001 $GLMR_HOME/expected/w.mtx + +args <- commandArgs (TRUE); + +library ("Matrix"); +# library ("batch"); + +options (warn = -1); + +X_here <- readMM (args[1]); # (paste (args[1], "X.mtx", sep="")); +y_here <- readMM (args[2]); # (paste (args[1], "y.mtx", sep="")); + +num_records <- nrow (X_here); +num_features <- ncol (X_here); +dist_type <- as.integer (args[3]); +dist_param <- as.numeric (args[4]); +link_type <- as.integer (args[5]); +link_power <- as.numeric (args[6]); +icept <- as.integer (args[7]); +eps_n <- as.numeric (args[8]); + +f_ly <- gaussian (); +var_power <- dist_param; + +if (dist_type == 1 & var_power == 0.0 & link_type == 1 & link_power == 1.0) { f_ly <- gaussian (link = "identity"); } else +if (dist_type == 1 & var_power == 0.0 & link_type == 1 & link_power == -1.0) { f_ly <- gaussian (link = "inverse"); } else +if (dist_type == 1 & var_power == 0.0 & link_type == 1 & link_power == 0.0) { f_ly <- gaussian (link = "log"); } else +if (dist_type == 1 & var_power == 1.0 & link_type == 1 & link_power == 1.0) { f_ly <- poisson (link = "identity"); } else +if (dist_type == 1 & var_power == 1.0 & link_type == 1 & link_power == 0.0) { f_ly <- poisson (link = "log"); } else +if (dist_type == 1 & var_power == 1.0 & link_type == 1 & link_power == 0.5) { f_ly <- poisson (link = "sqrt"); } else +if (dist_type == 1 & var_power == 2.0 & link_type == 1 & link_power == 1.0) { f_ly <- Gamma (link = "identity"); } else +if (dist_type == 1 & var_power == 2.0 & link_type == 1 & link_power == -1.0) { f_ly <- Gamma (link = "inverse"); } else +if (dist_type == 1 & var_power == 2.0 & link_type == 1 & link_power == 0.0) { f_ly <- Gamma (link = "log"); } else +if (dist_type == 1 & var_power == 3.0 & link_type == 1 & link_power == 1.0) { f_ly <- inverse.gaussian (link = "identity"); } else +if (dist_type == 1 & var_power == 3.0 & link_type == 1 & link_power == -1.0) { f_ly <- inverse.gaussian (link = "inverse"); } else +if (dist_type == 1 & var_power == 3.0 & link_type == 1 & link_power == 0.0) { f_ly <- inverse.gaussian (link = "log"); } else +if (dist_type == 1 & var_power == 3.0 & link_type == 1 & link_power == -2.0) { f_ly <- inverse.gaussian (link = "1/mu^2"); } else +if (dist_type == 2 & link_type == 1 & link_power == 0.0) { f_ly <- binomial (link = "log"); } else +if (dist_type == 2 & link_type == 1 & link_power == 1.0) { f_ly <- binomial (link = "identity"); } else +if (dist_type == 2 & link_type == 1 & link_power == 0.5) { f_ly <- binomial (link = "sqrt"); } else +if (dist_type == 2 & link_type == 2 ) { f_ly <- binomial (link = "logit"); } else +if (dist_type == 2 & link_type == 3 ) { f_ly <- binomial (link = "probit"); } else +if (dist_type == 2 & link_type == 4 ) { f_ly <- binomial (link = "cloglog"); } else +if (dist_type == 2 & link_type == 5 ) { f_ly <- binomial (link = "cauchit"); } + +# quasi(link = "identity", variance = "constant") +# quasibinomial(link = "logit") +# quasipoisson(link = "log") + +if (dist_type == 2 & dist_param != 1.0) { + y_here <- (y_here - dist_param) / (1.0 - dist_param); +} + +# epsilon tolerance: the iterations converge when |dev - devold|/(|dev| + 0.1) < epsilon. +# maxit integer giving the maximal number of IWLS iterations. +# trace logical indicating if output should be produced for each iteration. +# +c_rol <- glm.control (epsilon = eps_n, maxit = 100, trace = FALSE); + +X_matrix = as.matrix (X_here); +y_matrix = as.matrix (y_here); + +if (icept == 0) { + glmOut <- glm (y_matrix ~ X_matrix - 1, family = f_ly, control = c_rol); + betas <- coef (glmOut); +} else { + glmOut <- glm (y_matrix ~ X_matrix , family = f_ly, control = c_rol); + betas <- coef (glmOut); + beta_intercept = betas [1]; + betas [1 : num_features] = betas [2 : (num_features + 1)]; + betas [num_features + 1] = beta_intercept; +} + +print (c("Deviance", glmOut$deviance)); +writeMM (as (betas, "CsparseMatrix"), args[9], format = "text"); +
