http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/ctableStats/stratstats.dml ---------------------------------------------------------------------- diff --git a/src/test/scripts/applications/ctableStats/stratstats.dml b/src/test/scripts/applications/ctableStats/stratstats.dml index 3ac684a..7eb1858 100644 --- a/src/test/scripts/applications/ctableStats/stratstats.dml +++ b/src/test/scripts/applications/ctableStats/stratstats.dml @@ -1,350 +1,350 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -# STRATIFIED BIVARIATE STATISTICS, VERSION 2 -# -# INPUT 1: Dataset with records as rows (matrix filename) -# INPUT 2: The stratum ID column number (integer) -# Stratum ID must be a small positive integer; fractional values are rounded; if 0 or less, shifted to positive. -# INPUT 3: 1st variate column numbers (matrix filename) -# INPUT 4: 2nd variate column numbers (matrix filename) -# INPUT 5: Output (matrix filename) -# -# OUTPUT 1: Output Matrix with 40 columns, containing the following information: -# Rows: One row per each distinct pair (1st variate, 2nd variate) -# Col 01: 1st variate column number -# Col 02: 1st variate global presence count -# Col 03: 1st variate global mean -# Col 04: 1st variate global standard deviation -# Col 05: 1st variate stratified standard deviation -# Col 06: R-squared, 1st variate vs. strata -# Col 07: P-value, 1st variate vs. strata -# Col 08-10: Reserved -# Col 11: 2nd variate column number -# Col 12: 2nd variate global presence count -# Col 13: 2nd variate global mean -# Col 14: 2nd variate global standard deviation -# Col 15: 2nd variate stratified standard deviation -# Col 16: R-squared, 2nd variate vs. strata -# Col 17: P-value, 2nd variate vs. strata -# Col 18-20: Reserved -# Col 21: Global 1st & 2nd variate presence count -# Col 22: Global regression slope (2nd variate vs. 1st variate) -# Col 23: Global regression slope standard deviation -# Col 24: Global correlation = +/- sqrt(R-squared) -# Col 25: Global residual standard deviation -# Col 26: Global R-squared -# Col 27: Global P-value for hypothesis "slope = 0" -# Col 28-30: Reserved -# Col 31: Stratified 1st & 2nd variate presence count -# Col 32: Stratified regression slope (2nd variate vs. 
1st variate) -# Col 33: Stratified regression slope standard deviation -# Col 34: Stratified correlation = +/- sqrt(R-squared) -# Col 35: Stratified residual standard deviation -# Col 36: Stratified R-squared -# Col 37: Stratified P-value for hypothesis "slope = 0" -# Col 38: Number of strata with at least two counted points -# Col 39-40: Reserved -# TO DO: GOODNESS OF FIT MEASURE -# -# EXAMPLE: -# hadoop jar SystemML.jar -f PATH/stratstats.dml -exec singlenode -args PATH/stratstats_test_data.mtx 1 PATH/stratstats_test_X.mtx PATH/stratstats_test_Y.mtx PATH/stratstats_test_output.mtx - -NaN = 0/0; - -print ("BEGIN STRATIFIED STATISTICS SCRIPT"); - -print ("Reading the input matrices..."); - -DataWithNaNs = read ($1, format = "text"); -Xcols = read ($3, format = "text"); -Ycols = read ($4, format = "text"); -stratum_column_id = $2; -num_records = nrow(DataWithNaNs); -num_attrs = ncol(DataWithNaNs); -num_attrs_X = ncol(Xcols); -num_attrs_Y = ncol(Ycols); -num_attrs_XY = num_attrs_X * num_attrs_Y; - - -print ("Preparing the variates..."); - -Data = deNaN (DataWithNaNs); -DataNaNmask = ppred (DataWithNaNs, NaN, "=="); - -tXcols = t(Xcols); -ones = matrix (1.0, rows = num_attrs_X, cols = 1); -one_to_num_attrs_X = sumup (ones); -ProjX = matrix (0.0, rows = num_attrs, cols = num_attrs_X); -ProjX_ctable = table (tXcols, one_to_num_attrs_X); -ProjX [1:nrow(ProjX_ctable), ] = ProjX_ctable; -X = Data %*% ProjX; -X_mask = 1 - (DataNaNmask %*% ProjX); - -tYcols = t(Ycols); -ones = matrix (1.0, rows = num_attrs_Y, cols = 1); -one_to_num_attrs_Y = sumup (ones); -ProjY = matrix (0.0, rows = num_attrs, cols = num_attrs_Y); -ProjY_ctable = table (tYcols, one_to_num_attrs_Y); -ProjY [1:nrow(ProjY_ctable), ] = ProjY_ctable; -Y = Data %*% ProjY; -Y_mask = 1 - (DataNaNmask %*% ProjY); - - -print ("Preparing the strata..."); - -Proj_to_deNaN_strata = diag (1 - DataNaNmask [, stratum_column_id]); -Proj_to_deNaN_strata = removeEmpty (target = Proj_to_deNaN_strata, margin = "rows"); -vector_of_strata_with_empty_but_no_NaNs = round (Proj_to_deNaN_strata %*% (Data [, stratum_column_id])); -vector_of_strata_with_empty_but_no_NaNs = vector_of_strata_with_empty_but_no_NaNs + (1 - min (vector_of_strata_with_empty_but_no_NaNs)); -num_strata_with_empty_but_no_NaNs = max (vector_of_strata_with_empty_but_no_NaNs); -num_records_with_nonNaN_strata = nrow (Proj_to_deNaN_strata); -ones = matrix (1.0, rows = num_records_with_nonNaN_strata, cols = 1); -one_to_num_records_with_nonNaN_strata = sumup (ones); -StrataSummator_with_empty_from_nonNaNs = table (vector_of_strata_with_empty_but_no_NaNs, one_to_num_records_with_nonNaN_strata); -StrataSummator_from_nonNaNs = removeEmpty (target = StrataSummator_with_empty_from_nonNaNs, margin = "rows"); -StrataSummator = StrataSummator_from_nonNaNs %*% Proj_to_deNaN_strata; -num_strata = nrow (StrataSummator); -num_empty_strata = num_strata_with_empty_but_no_NaNs - num_strata; -print ("There are " + num_strata + " nonempty strata and " + num_empty_strata + " empty but non-NaN strata."); - -print ("Computing the global single-variate statistics..."); - -cnt_X_global = colSums (X_mask); -cnt_Y_global = colSums (Y_mask); -avg_X_global = colSums (X) / cnt_X_global; -avg_Y_global = colSums (Y) / cnt_Y_global; -var_sumX_global = colSums (X * X) - cnt_X_global * (avg_X_global * avg_X_global); -var_sumY_global = colSums (Y * Y) - cnt_Y_global * (avg_Y_global * avg_Y_global); - sqrt_failsafe_input_1 = var_sumX_global / (cnt_X_global - 1); -stdev_X_global = sqrt_failsafe (sqrt_failsafe_input_1); - 
sqrt_failsafe_input_2 = var_sumY_global / (cnt_Y_global - 1) -stdev_Y_global = sqrt_failsafe (sqrt_failsafe_input_2); - -print ("Computing the stratified single-variate statistics..."); - -# Compute per-stratum statistics, prevent div-0 for locally empty (NaN-filled) strata - -Cnt_X_per_stratum = StrataSummator %*% X_mask; -Cnt_Y_per_stratum = StrataSummator %*% Y_mask; -Is_none_X_per_stratum = ppred (Cnt_X_per_stratum, 0, "=="); -Is_none_Y_per_stratum = ppred (Cnt_Y_per_stratum, 0, "=="); -One_over_cnt_X_per_stratum = (1 - Is_none_X_per_stratum) / (Cnt_X_per_stratum + Is_none_X_per_stratum); -One_over_cnt_Y_per_stratum = (1 - Is_none_Y_per_stratum) / (Cnt_Y_per_stratum + Is_none_Y_per_stratum); -num_X_nonempty_strata = num_strata - colSums (Is_none_X_per_stratum); -num_Y_nonempty_strata = num_strata - colSums (Is_none_Y_per_stratum); - -Sum_X_per_stratum = StrataSummator %*% X; -Sum_Y_per_stratum = StrataSummator %*% Y; - -# Recompute some global statistics to exclude NaN-stratum records - -cnt_X_excluding_NaNstrata = colSums (Cnt_X_per_stratum); -cnt_Y_excluding_NaNstrata = colSums (Cnt_Y_per_stratum); -sum_X_excluding_NaNstrata = colSums (Sum_X_per_stratum); -sum_Y_excluding_NaNstrata = colSums (Sum_Y_per_stratum); -var_sumX_excluding_NaNstrata = colSums (StrataSummator %*% (X * X)) - (sum_X_excluding_NaNstrata * sum_X_excluding_NaNstrata) / cnt_X_excluding_NaNstrata; -var_sumY_excluding_NaNstrata = colSums (StrataSummator %*% (Y * Y)) - (sum_Y_excluding_NaNstrata * sum_Y_excluding_NaNstrata) / cnt_Y_excluding_NaNstrata; - -# Compute the stratified statistics - -var_sumX_stratified = colSums (StrataSummator %*% (X * X)) - colSums (One_over_cnt_X_per_stratum * Sum_X_per_stratum * Sum_X_per_stratum); -var_sumY_stratified = colSums (StrataSummator %*% (Y * Y)) - colSums (One_over_cnt_Y_per_stratum * Sum_Y_per_stratum * Sum_Y_per_stratum); - sqrt_failsafe_input_3 = var_sumX_stratified / (cnt_X_excluding_NaNstrata - num_X_nonempty_strata); -stdev_X_stratified = sqrt_failsafe (sqrt_failsafe_input_3); - sqrt_failsafe_input_4 = var_sumY_stratified / (cnt_Y_excluding_NaNstrata - num_Y_nonempty_strata); -stdev_Y_stratified = sqrt_failsafe (sqrt_failsafe_input_4); -r_sqr_X_vs_strata = 1 - var_sumX_stratified / var_sumX_excluding_NaNstrata; -r_sqr_Y_vs_strata = 1 - var_sumY_stratified / var_sumY_excluding_NaNstrata; -fStat_X_vs_strata = ((var_sumX_excluding_NaNstrata - var_sumX_stratified) / (num_X_nonempty_strata - 1)) / (var_sumX_stratified / (cnt_X_excluding_NaNstrata - num_X_nonempty_strata)); -fStat_Y_vs_strata = ((var_sumY_excluding_NaNstrata - var_sumY_stratified) / (num_Y_nonempty_strata - 1)) / (var_sumY_stratified / (cnt_Y_excluding_NaNstrata - num_Y_nonempty_strata)); -p_val_X_vs_strata = fStat_tailprob (fStat_X_vs_strata, num_X_nonempty_strata - 1, cnt_X_excluding_NaNstrata - num_X_nonempty_strata); -p_val_Y_vs_strata = fStat_tailprob (fStat_Y_vs_strata, num_Y_nonempty_strata - 1, cnt_Y_excluding_NaNstrata - num_Y_nonempty_strata); - -print ("Computing the global bivariate statistics..."); - -# Compute the aggregate X vs. 
Y statistics and map them into proper positions - -cnt_XY_rectangle = t(X_mask) %*% Y_mask; -sum_X_forXY_rectangle = t(X) %*% Y_mask; -sum_XX_forXY_rectangle = t(X * X) %*% Y_mask; -sum_Y_forXY_rectangle = t(X_mask) %*% Y; -sum_YY_forXY_rectangle = t(X_mask) %*% (Y * Y); -sum_XY_rectangle = t(X) %*% Y; -cnt_XY_global = matrix (cnt_XY_rectangle, rows = 1, cols = num_attrs_XY, byrow = TRUE); -sum_X_forXY_global = matrix (sum_X_forXY_rectangle, rows = 1, cols = num_attrs_XY, byrow = TRUE); -sum_XX_forXY_global = matrix (sum_XX_forXY_rectangle, rows = 1, cols = num_attrs_XY, byrow = TRUE); -sum_Y_forXY_global = matrix (sum_Y_forXY_rectangle, rows = 1, cols = num_attrs_XY, byrow = TRUE); -sum_YY_forXY_global = matrix (sum_YY_forXY_rectangle, rows = 1, cols = num_attrs_XY, byrow = TRUE); -sum_XY_global = matrix (sum_XY_rectangle, rows = 1, cols = num_attrs_XY, byrow = TRUE); -ones_XY = matrix (1.0, rows = 1, cols = num_attrs_XY); - -# Compute the global bivariate statistics for output - -cov_sumX_sumY_global = sum_XY_global - sum_X_forXY_global * sum_Y_forXY_global / cnt_XY_global; -var_sumX_forXY_global = sum_XX_forXY_global - sum_X_forXY_global * sum_X_forXY_global / cnt_XY_global; -var_sumY_forXY_global = sum_YY_forXY_global - sum_Y_forXY_global * sum_Y_forXY_global / cnt_XY_global; -slope_XY_global = cov_sumX_sumY_global / var_sumX_forXY_global; - sqrt_failsafe_input_5 = var_sumX_forXY_global * var_sumY_forXY_global; - sqrt_failsafe_output_5 = sqrt_failsafe (sqrt_failsafe_input_5); -corr_XY_global = cov_sumX_sumY_global / sqrt_failsafe_output_5; -r_sqr_X_vs_Y_global = cov_sumX_sumY_global * cov_sumX_sumY_global / (var_sumX_forXY_global * var_sumY_forXY_global); - sqrt_failsafe_input_6 = (1 - r_sqr_X_vs_Y_global) * var_sumY_forXY_global / var_sumX_forXY_global / (cnt_XY_global - 2) -stdev_slope_XY_global = sqrt_failsafe (sqrt_failsafe_input_6); - sqrt_failsafe_input_7 = (1 - r_sqr_X_vs_Y_global) * var_sumY_forXY_global / (cnt_XY_global - 2) -stdev_errY_vs_X_global = sqrt_failsafe (sqrt_failsafe_input_7); -fStat_Y_vs_X_global = (cnt_XY_global - 2) * r_sqr_X_vs_Y_global / (1 - r_sqr_X_vs_Y_global); -p_val_Y_vs_X_global = fStat_tailprob (fStat_Y_vs_X_global, ones_XY, cnt_XY_global - 2); - -print ("Computing the stratified bivariate statistics..."); - -# Create projections to "intermingle" X and Y into attribute pairs - -Proj_X_to_XY = matrix (0.0, rows = num_attrs_X, cols = num_attrs_XY); -Proj_Y_to_XY = matrix (0.0, rows = num_attrs_Y, cols = num_attrs_XY); -ones_Y_row = matrix (1.0, rows = 1, cols = num_attrs_Y); -for (i in 1:num_attrs_X) { - start_cid = (i - 1) * num_attrs_Y + 1; - end_cid = i * num_attrs_Y; - Proj_X_to_XY [i, start_cid:end_cid] = ones_Y_row; - Proj_Y_to_XY [ , start_cid:end_cid] = diag (ones_Y_row); -} - -# Compute per-stratum statistics, prevent div-0 for locally empty (NaN-filled) strata - -Cnt_XY_per_stratum = StrataSummator %*% (( X_mask %*% Proj_X_to_XY) * ( Y_mask %*% Proj_Y_to_XY)); -Sum_X_forXY_per_stratum = StrataSummator %*% (( X %*% Proj_X_to_XY) * ( Y_mask %*% Proj_Y_to_XY)); -Sum_XX_forXY_per_stratum = StrataSummator %*% (((X * X) %*% Proj_X_to_XY) * ( Y_mask %*% Proj_Y_to_XY)); -Sum_Y_forXY_per_stratum = StrataSummator %*% (( X_mask %*% Proj_X_to_XY) * ( Y %*% Proj_Y_to_XY)); -Sum_YY_forXY_per_stratum = StrataSummator %*% (( X_mask %*% Proj_X_to_XY) * ((Y * Y) %*% Proj_Y_to_XY)); -Sum_XY_per_stratum = StrataSummator %*% (( X %*% Proj_X_to_XY) * ( Y %*% Proj_Y_to_XY)); - -Is_none_XY_per_stratum = ppred (Cnt_XY_per_stratum, 0, "=="); -One_over_cnt_XY_per_stratum 
= (1 - Is_none_XY_per_stratum) / (Cnt_XY_per_stratum + Is_none_XY_per_stratum); -num_XY_nonempty_strata = num_strata - colSums (Is_none_XY_per_stratum); - -# Recompute some global aggregate X vs. Y statistics to exclude NaN-stratum records - -cnt_XY_excluding_NaNstrata = colSums (Cnt_XY_per_stratum); -sum_XX_forXY_excluding_NaNstrata = colSums (Sum_XX_forXY_per_stratum); -sum_YY_forXY_excluding_NaNstrata = colSums (Sum_YY_forXY_per_stratum); -sum_XY_excluding_NaNstrata = colSums (Sum_XY_per_stratum); - -# Compute the stratified bivariate statistics - -var_sumX_forXY_stratified = sum_XX_forXY_excluding_NaNstrata - colSums (Sum_X_forXY_per_stratum * Sum_X_forXY_per_stratum * One_over_cnt_XY_per_stratum); -var_sumY_forXY_stratified = sum_YY_forXY_excluding_NaNstrata - colSums (Sum_Y_forXY_per_stratum * Sum_Y_forXY_per_stratum * One_over_cnt_XY_per_stratum); -cov_sumX_sumY_stratified = sum_XY_excluding_NaNstrata - colSums (Sum_X_forXY_per_stratum * Sum_Y_forXY_per_stratum * One_over_cnt_XY_per_stratum); - -slope_XY_stratified = cov_sumX_sumY_stratified / var_sumX_forXY_stratified; - sqrt_failsafe_input_8 = var_sumX_forXY_stratified * var_sumY_forXY_stratified; - sqrt_failsafe_output_8 = sqrt_failsafe (sqrt_failsafe_input_8); -corr_XY_stratified = cov_sumX_sumY_stratified / sqrt_failsafe_output_8; -r_sqr_X_vs_Y_stratified = cov_sumX_sumY_stratified * cov_sumX_sumY_stratified / (var_sumX_forXY_stratified * var_sumY_forXY_stratified); -r_sqr_X_vs_Y_stratified = corr_XY_stratified * corr_XY_stratified; - sqrt_failsafe_input_9 = (1 - r_sqr_X_vs_Y_stratified) * var_sumY_forXY_stratified / var_sumX_forXY_stratified / (cnt_XY_excluding_NaNstrata - num_XY_nonempty_strata - 1); -stdev_slope_XY_stratified = sqrt_failsafe (sqrt_failsafe_input_9); - sqrt_failsafe_input_10 = (1 - r_sqr_X_vs_Y_stratified) * var_sumY_forXY_stratified / (cnt_XY_excluding_NaNstrata - num_XY_nonempty_strata - 1); -stdev_errY_vs_X_stratified = sqrt_failsafe (sqrt_failsafe_input_10); -fStat_Y_vs_X_stratified = (cnt_XY_excluding_NaNstrata - num_XY_nonempty_strata - 1) * r_sqr_X_vs_Y_stratified / (1 - r_sqr_X_vs_Y_stratified); -p_val_Y_vs_X_stratified = fStat_tailprob (fStat_Y_vs_X_stratified, ones_XY, cnt_XY_excluding_NaNstrata - num_XY_nonempty_strata - 1); - -print ("Preparing the output matrix..."); -OutMtx = matrix (0.0, rows = 40, cols = num_attrs_XY); - -OutMtx [ 1, ] = Xcols %*% Proj_X_to_XY; # 1st variate column number -OutMtx [ 2, ] = cnt_X_global %*% Proj_X_to_XY; # 1st variate global presence count -OutMtx [ 3, ] = avg_X_global %*% Proj_X_to_XY; # 1st variate global mean -OutMtx [ 4, ] = stdev_X_global %*% Proj_X_to_XY; # 1st variate global standard deviation -OutMtx [ 5, ] = stdev_X_stratified %*% Proj_X_to_XY; # 1st variate stratified standard deviation -OutMtx [ 6, ] = r_sqr_X_vs_strata %*% Proj_X_to_XY; # R-squared, 1st variate vs. strata -OutMtx [ 7, ] = p_val_X_vs_strata %*% Proj_X_to_XY; # P-value, 1st variate vs. strata -OutMtx [11, ] = Ycols %*% Proj_Y_to_XY; # 2nd variate column number -OutMtx [12, ] = cnt_Y_global %*% Proj_Y_to_XY; # 2nd variate global presence count -OutMtx [13, ] = avg_Y_global %*% Proj_Y_to_XY; # 2nd variate global mean -OutMtx [14, ] = stdev_Y_global %*% Proj_Y_to_XY; # 2nd variate global standard deviation -OutMtx [15, ] = stdev_Y_stratified %*% Proj_Y_to_XY; # 2nd variate stratified standard deviation -OutMtx [16, ] = r_sqr_Y_vs_strata %*% Proj_Y_to_XY; # R-squared, 2nd variate vs. strata -OutMtx [17, ] = p_val_Y_vs_strata %*% Proj_Y_to_XY; # P-value, 2nd variate vs. 
strata - - -OutMtx [21, ] = cnt_XY_global; # Global 1st & 2nd variate presence count -OutMtx [22, ] = slope_XY_global; # Global regression slope (2nd variate vs. 1st variate) -OutMtx [23, ] = stdev_slope_XY_global; # Global regression slope standard deviation -OutMtx [24, ] = corr_XY_global; # Global correlation = +/- sqrt(R-squared) -OutMtx [25, ] = stdev_errY_vs_X_global; # Global residual standard deviation -OutMtx [26, ] = r_sqr_X_vs_Y_global; # Global R-squared -OutMtx [27, ] = p_val_Y_vs_X_global; # Global P-value for hypothesis "slope = 0" -OutMtx [31, ] = cnt_XY_excluding_NaNstrata; # Stratified 1st & 2nd variate presence count -OutMtx [32, ] = slope_XY_stratified; # Stratified regression slope (2nd variate vs. 1st variate) -OutMtx [33, ] = stdev_slope_XY_stratified; # Stratified regression slope standard deviation -OutMtx [34, ] = corr_XY_stratified; # Stratified correlation = +/- sqrt(R-squared) -OutMtx [35, ] = stdev_errY_vs_X_stratified; # Stratified residual standard deviation -OutMtx [36, ] = r_sqr_X_vs_Y_stratified; # Stratified R-squared -OutMtx [37, ] = p_val_Y_vs_X_stratified; # Stratified P-value for hypothesis "slope = 0" -OutMtx [38, ] = colSums (ppred (Cnt_XY_per_stratum, 2, ">=")); # Number of strata with at least two counted points - -OutMtx = t(OutMtx); - -print ("Writing the output matrix..."); -write (OutMtx, $5, format="text"); -print ("END STRATIFIED STATISTICS SCRIPT"); - - -deNaN = externalFunction (Matrix[Double] A) return (Matrix[Double] B) - implemented in (classname = "org.apache.sysml.udf.lib.DeNaNWrapper", exectype = "mem"); - -fStat_tailprob = function (Matrix[double] fStat, Matrix[double] df_1, Matrix[double] df_2) return (Matrix[double] tailprob) -{ # TEMPORARY IMPLEMENTATION - tailprob = fStat; - for (i in 1:nrow(fStat)) { - for (j in 1:ncol(fStat)) { - q = castAsScalar (fStat [i, j]); - d1 = castAsScalar (df_1 [i, j]); - d2 = castAsScalar (df_2 [i, j]); - if (d1 >= 1 & d2 >= 1 & q >= 0.0) { - tailprob [i, j] = pf (target = q, df1 = d1, df2 = d2, lower.tail=FALSE); - } else { - tailprob [i, j] = 0/0; - } - } } -} - -sqrt_failsafe = function (Matrix[double] input_A) return (Matrix[double] output_A) -{ - NaN = 0/0; - mask_A = ppred (input_A, 0.0, ">="); - prep_A = input_A * mask_A; - mask_A = mask_A - mask_A * (ppred (prep_A, NaN, "==")); - prep_A = deNaN (prep_A); - output_A = sqrt (prep_A) / mask_A; -} - -sumup = function (Matrix[double] A) return (Matrix[double] sum_A) -{ - shift = 1; - m_A = nrow(A); - sum_A = A; - while (shift < m_A) { - sum_A [(shift+1):m_A, ] = sum_A [(shift+1):m_A, ] + sum_A [1:(m_A-shift), ]; - shift = 2 * shift; - } -} +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +#------------------------------------------------------------- + +# STRATIFIED BIVARIATE STATISTICS, VERSION 2 +# +# INPUT 1: Dataset with records as rows (matrix filename) +# INPUT 2: The stratum ID column number (integer) +# Stratum ID must be a small positive integer; fractional values are rounded; if 0 or less, shifted to positive. +# INPUT 3: 1st variate column numbers (matrix filename) +# INPUT 4: 2nd variate column numbers (matrix filename) +# INPUT 5: Output (matrix filename) +# +# OUTPUT 1: Output Matrix with 40 columns, containing the following information: +# Rows: One row per each distinct pair (1st variate, 2nd variate) +# Col 01: 1st variate column number +# Col 02: 1st variate global presence count +# Col 03: 1st variate global mean +# Col 04: 1st variate global standard deviation +# Col 05: 1st variate stratified standard deviation +# Col 06: R-squared, 1st variate vs. strata +# Col 07: P-value, 1st variate vs. strata +# Col 08-10: Reserved +# Col 11: 2nd variate column number +# Col 12: 2nd variate global presence count +# Col 13: 2nd variate global mean +# Col 14: 2nd variate global standard deviation +# Col 15: 2nd variate stratified standard deviation +# Col 16: R-squared, 2nd variate vs. strata +# Col 17: P-value, 2nd variate vs. strata +# Col 18-20: Reserved +# Col 21: Global 1st & 2nd variate presence count +# Col 22: Global regression slope (2nd variate vs. 1st variate) +# Col 23: Global regression slope standard deviation +# Col 24: Global correlation = +/- sqrt(R-squared) +# Col 25: Global residual standard deviation +# Col 26: Global R-squared +# Col 27: Global P-value for hypothesis "slope = 0" +# Col 28-30: Reserved +# Col 31: Stratified 1st & 2nd variate presence count +# Col 32: Stratified regression slope (2nd variate vs. 
1st variate) +# Col 33: Stratified regression slope standard deviation +# Col 34: Stratified correlation = +/- sqrt(R-squared) +# Col 35: Stratified residual standard deviation +# Col 36: Stratified R-squared +# Col 37: Stratified P-value for hypothesis "slope = 0" +# Col 38: Number of strata with at least two counted points +# Col 39-40: Reserved +# TO DO: GOODNESS OF FIT MEASURE +# +# EXAMPLE: +# hadoop jar SystemML.jar -f PATH/stratstats.dml -exec singlenode -args PATH/stratstats_test_data.mtx 1 PATH/stratstats_test_X.mtx PATH/stratstats_test_Y.mtx PATH/stratstats_test_output.mtx + +NaN = 0/0; + +print ("BEGIN STRATIFIED STATISTICS SCRIPT"); + +print ("Reading the input matrices..."); + +DataWithNaNs = read ($1, format = "text"); +Xcols = read ($3, format = "text"); +Ycols = read ($4, format = "text"); +stratum_column_id = $2; +num_records = nrow(DataWithNaNs); +num_attrs = ncol(DataWithNaNs); +num_attrs_X = ncol(Xcols); +num_attrs_Y = ncol(Ycols); +num_attrs_XY = num_attrs_X * num_attrs_Y; + + +print ("Preparing the variates..."); + +Data = deNaN (DataWithNaNs); +DataNaNmask = ppred (DataWithNaNs, NaN, "=="); + +tXcols = t(Xcols); +ones = matrix (1.0, rows = num_attrs_X, cols = 1); +one_to_num_attrs_X = sumup (ones); +ProjX = matrix (0.0, rows = num_attrs, cols = num_attrs_X); +ProjX_ctable = table (tXcols, one_to_num_attrs_X); +ProjX [1:nrow(ProjX_ctable), ] = ProjX_ctable; +X = Data %*% ProjX; +X_mask = 1 - (DataNaNmask %*% ProjX); + +tYcols = t(Ycols); +ones = matrix (1.0, rows = num_attrs_Y, cols = 1); +one_to_num_attrs_Y = sumup (ones); +ProjY = matrix (0.0, rows = num_attrs, cols = num_attrs_Y); +ProjY_ctable = table (tYcols, one_to_num_attrs_Y); +ProjY [1:nrow(ProjY_ctable), ] = ProjY_ctable; +Y = Data %*% ProjY; +Y_mask = 1 - (DataNaNmask %*% ProjY); + + +print ("Preparing the strata..."); + +Proj_to_deNaN_strata = diag (1 - DataNaNmask [, stratum_column_id]); +Proj_to_deNaN_strata = removeEmpty (target = Proj_to_deNaN_strata, margin = "rows"); +vector_of_strata_with_empty_but_no_NaNs = round (Proj_to_deNaN_strata %*% (Data [, stratum_column_id])); +vector_of_strata_with_empty_but_no_NaNs = vector_of_strata_with_empty_but_no_NaNs + (1 - min (vector_of_strata_with_empty_but_no_NaNs)); +num_strata_with_empty_but_no_NaNs = max (vector_of_strata_with_empty_but_no_NaNs); +num_records_with_nonNaN_strata = nrow (Proj_to_deNaN_strata); +ones = matrix (1.0, rows = num_records_with_nonNaN_strata, cols = 1); +one_to_num_records_with_nonNaN_strata = sumup (ones); +StrataSummator_with_empty_from_nonNaNs = table (vector_of_strata_with_empty_but_no_NaNs, one_to_num_records_with_nonNaN_strata); +StrataSummator_from_nonNaNs = removeEmpty (target = StrataSummator_with_empty_from_nonNaNs, margin = "rows"); +StrataSummator = StrataSummator_from_nonNaNs %*% Proj_to_deNaN_strata; +num_strata = nrow (StrataSummator); +num_empty_strata = num_strata_with_empty_but_no_NaNs - num_strata; +print ("There are " + num_strata + " nonempty strata and " + num_empty_strata + " empty but non-NaN strata."); + +print ("Computing the global single-variate statistics..."); + +cnt_X_global = colSums (X_mask); +cnt_Y_global = colSums (Y_mask); +avg_X_global = colSums (X) / cnt_X_global; +avg_Y_global = colSums (Y) / cnt_Y_global; +var_sumX_global = colSums (X * X) - cnt_X_global * (avg_X_global * avg_X_global); +var_sumY_global = colSums (Y * Y) - cnt_Y_global * (avg_Y_global * avg_Y_global); + sqrt_failsafe_input_1 = var_sumX_global / (cnt_X_global - 1); +stdev_X_global = sqrt_failsafe (sqrt_failsafe_input_1); + 
sqrt_failsafe_input_2 = var_sumY_global / (cnt_Y_global - 1) +stdev_Y_global = sqrt_failsafe (sqrt_failsafe_input_2); + +print ("Computing the stratified single-variate statistics..."); + +# Compute per-stratum statistics, prevent div-0 for locally empty (NaN-filled) strata + +Cnt_X_per_stratum = StrataSummator %*% X_mask; +Cnt_Y_per_stratum = StrataSummator %*% Y_mask; +Is_none_X_per_stratum = ppred (Cnt_X_per_stratum, 0, "=="); +Is_none_Y_per_stratum = ppred (Cnt_Y_per_stratum, 0, "=="); +One_over_cnt_X_per_stratum = (1 - Is_none_X_per_stratum) / (Cnt_X_per_stratum + Is_none_X_per_stratum); +One_over_cnt_Y_per_stratum = (1 - Is_none_Y_per_stratum) / (Cnt_Y_per_stratum + Is_none_Y_per_stratum); +num_X_nonempty_strata = num_strata - colSums (Is_none_X_per_stratum); +num_Y_nonempty_strata = num_strata - colSums (Is_none_Y_per_stratum); + +Sum_X_per_stratum = StrataSummator %*% X; +Sum_Y_per_stratum = StrataSummator %*% Y; + +# Recompute some global statistics to exclude NaN-stratum records + +cnt_X_excluding_NaNstrata = colSums (Cnt_X_per_stratum); +cnt_Y_excluding_NaNstrata = colSums (Cnt_Y_per_stratum); +sum_X_excluding_NaNstrata = colSums (Sum_X_per_stratum); +sum_Y_excluding_NaNstrata = colSums (Sum_Y_per_stratum); +var_sumX_excluding_NaNstrata = colSums (StrataSummator %*% (X * X)) - (sum_X_excluding_NaNstrata * sum_X_excluding_NaNstrata) / cnt_X_excluding_NaNstrata; +var_sumY_excluding_NaNstrata = colSums (StrataSummator %*% (Y * Y)) - (sum_Y_excluding_NaNstrata * sum_Y_excluding_NaNstrata) / cnt_Y_excluding_NaNstrata; + +# Compute the stratified statistics + +var_sumX_stratified = colSums (StrataSummator %*% (X * X)) - colSums (One_over_cnt_X_per_stratum * Sum_X_per_stratum * Sum_X_per_stratum); +var_sumY_stratified = colSums (StrataSummator %*% (Y * Y)) - colSums (One_over_cnt_Y_per_stratum * Sum_Y_per_stratum * Sum_Y_per_stratum); + sqrt_failsafe_input_3 = var_sumX_stratified / (cnt_X_excluding_NaNstrata - num_X_nonempty_strata); +stdev_X_stratified = sqrt_failsafe (sqrt_failsafe_input_3); + sqrt_failsafe_input_4 = var_sumY_stratified / (cnt_Y_excluding_NaNstrata - num_Y_nonempty_strata); +stdev_Y_stratified = sqrt_failsafe (sqrt_failsafe_input_4); +r_sqr_X_vs_strata = 1 - var_sumX_stratified / var_sumX_excluding_NaNstrata; +r_sqr_Y_vs_strata = 1 - var_sumY_stratified / var_sumY_excluding_NaNstrata; +fStat_X_vs_strata = ((var_sumX_excluding_NaNstrata - var_sumX_stratified) / (num_X_nonempty_strata - 1)) / (var_sumX_stratified / (cnt_X_excluding_NaNstrata - num_X_nonempty_strata)); +fStat_Y_vs_strata = ((var_sumY_excluding_NaNstrata - var_sumY_stratified) / (num_Y_nonempty_strata - 1)) / (var_sumY_stratified / (cnt_Y_excluding_NaNstrata - num_Y_nonempty_strata)); +p_val_X_vs_strata = fStat_tailprob (fStat_X_vs_strata, num_X_nonempty_strata - 1, cnt_X_excluding_NaNstrata - num_X_nonempty_strata); +p_val_Y_vs_strata = fStat_tailprob (fStat_Y_vs_strata, num_Y_nonempty_strata - 1, cnt_Y_excluding_NaNstrata - num_Y_nonempty_strata); + +print ("Computing the global bivariate statistics..."); + +# Compute the aggregate X vs. 
Y statistics and map them into proper positions + +cnt_XY_rectangle = t(X_mask) %*% Y_mask; +sum_X_forXY_rectangle = t(X) %*% Y_mask; +sum_XX_forXY_rectangle = t(X * X) %*% Y_mask; +sum_Y_forXY_rectangle = t(X_mask) %*% Y; +sum_YY_forXY_rectangle = t(X_mask) %*% (Y * Y); +sum_XY_rectangle = t(X) %*% Y; +cnt_XY_global = matrix (cnt_XY_rectangle, rows = 1, cols = num_attrs_XY, byrow = TRUE); +sum_X_forXY_global = matrix (sum_X_forXY_rectangle, rows = 1, cols = num_attrs_XY, byrow = TRUE); +sum_XX_forXY_global = matrix (sum_XX_forXY_rectangle, rows = 1, cols = num_attrs_XY, byrow = TRUE); +sum_Y_forXY_global = matrix (sum_Y_forXY_rectangle, rows = 1, cols = num_attrs_XY, byrow = TRUE); +sum_YY_forXY_global = matrix (sum_YY_forXY_rectangle, rows = 1, cols = num_attrs_XY, byrow = TRUE); +sum_XY_global = matrix (sum_XY_rectangle, rows = 1, cols = num_attrs_XY, byrow = TRUE); +ones_XY = matrix (1.0, rows = 1, cols = num_attrs_XY); + +# Compute the global bivariate statistics for output + +cov_sumX_sumY_global = sum_XY_global - sum_X_forXY_global * sum_Y_forXY_global / cnt_XY_global; +var_sumX_forXY_global = sum_XX_forXY_global - sum_X_forXY_global * sum_X_forXY_global / cnt_XY_global; +var_sumY_forXY_global = sum_YY_forXY_global - sum_Y_forXY_global * sum_Y_forXY_global / cnt_XY_global; +slope_XY_global = cov_sumX_sumY_global / var_sumX_forXY_global; + sqrt_failsafe_input_5 = var_sumX_forXY_global * var_sumY_forXY_global; + sqrt_failsafe_output_5 = sqrt_failsafe (sqrt_failsafe_input_5); +corr_XY_global = cov_sumX_sumY_global / sqrt_failsafe_output_5; +r_sqr_X_vs_Y_global = cov_sumX_sumY_global * cov_sumX_sumY_global / (var_sumX_forXY_global * var_sumY_forXY_global); + sqrt_failsafe_input_6 = (1 - r_sqr_X_vs_Y_global) * var_sumY_forXY_global / var_sumX_forXY_global / (cnt_XY_global - 2) +stdev_slope_XY_global = sqrt_failsafe (sqrt_failsafe_input_6); + sqrt_failsafe_input_7 = (1 - r_sqr_X_vs_Y_global) * var_sumY_forXY_global / (cnt_XY_global - 2) +stdev_errY_vs_X_global = sqrt_failsafe (sqrt_failsafe_input_7); +fStat_Y_vs_X_global = (cnt_XY_global - 2) * r_sqr_X_vs_Y_global / (1 - r_sqr_X_vs_Y_global); +p_val_Y_vs_X_global = fStat_tailprob (fStat_Y_vs_X_global, ones_XY, cnt_XY_global - 2); + +print ("Computing the stratified bivariate statistics..."); + +# Create projections to "intermingle" X and Y into attribute pairs + +Proj_X_to_XY = matrix (0.0, rows = num_attrs_X, cols = num_attrs_XY); +Proj_Y_to_XY = matrix (0.0, rows = num_attrs_Y, cols = num_attrs_XY); +ones_Y_row = matrix (1.0, rows = 1, cols = num_attrs_Y); +for (i in 1:num_attrs_X) { + start_cid = (i - 1) * num_attrs_Y + 1; + end_cid = i * num_attrs_Y; + Proj_X_to_XY [i, start_cid:end_cid] = ones_Y_row; + Proj_Y_to_XY [ , start_cid:end_cid] = diag (ones_Y_row); +} + +# Compute per-stratum statistics, prevent div-0 for locally empty (NaN-filled) strata + +Cnt_XY_per_stratum = StrataSummator %*% (( X_mask %*% Proj_X_to_XY) * ( Y_mask %*% Proj_Y_to_XY)); +Sum_X_forXY_per_stratum = StrataSummator %*% (( X %*% Proj_X_to_XY) * ( Y_mask %*% Proj_Y_to_XY)); +Sum_XX_forXY_per_stratum = StrataSummator %*% (((X * X) %*% Proj_X_to_XY) * ( Y_mask %*% Proj_Y_to_XY)); +Sum_Y_forXY_per_stratum = StrataSummator %*% (( X_mask %*% Proj_X_to_XY) * ( Y %*% Proj_Y_to_XY)); +Sum_YY_forXY_per_stratum = StrataSummator %*% (( X_mask %*% Proj_X_to_XY) * ((Y * Y) %*% Proj_Y_to_XY)); +Sum_XY_per_stratum = StrataSummator %*% (( X %*% Proj_X_to_XY) * ( Y %*% Proj_Y_to_XY)); + +Is_none_XY_per_stratum = ppred (Cnt_XY_per_stratum, 0, "=="); +One_over_cnt_XY_per_stratum 
= (1 - Is_none_XY_per_stratum) / (Cnt_XY_per_stratum + Is_none_XY_per_stratum); +num_XY_nonempty_strata = num_strata - colSums (Is_none_XY_per_stratum); + +# Recompute some global aggregate X vs. Y statistics to exclude NaN-stratum records + +cnt_XY_excluding_NaNstrata = colSums (Cnt_XY_per_stratum); +sum_XX_forXY_excluding_NaNstrata = colSums (Sum_XX_forXY_per_stratum); +sum_YY_forXY_excluding_NaNstrata = colSums (Sum_YY_forXY_per_stratum); +sum_XY_excluding_NaNstrata = colSums (Sum_XY_per_stratum); + +# Compute the stratified bivariate statistics + +var_sumX_forXY_stratified = sum_XX_forXY_excluding_NaNstrata - colSums (Sum_X_forXY_per_stratum * Sum_X_forXY_per_stratum * One_over_cnt_XY_per_stratum); +var_sumY_forXY_stratified = sum_YY_forXY_excluding_NaNstrata - colSums (Sum_Y_forXY_per_stratum * Sum_Y_forXY_per_stratum * One_over_cnt_XY_per_stratum); +cov_sumX_sumY_stratified = sum_XY_excluding_NaNstrata - colSums (Sum_X_forXY_per_stratum * Sum_Y_forXY_per_stratum * One_over_cnt_XY_per_stratum); + +slope_XY_stratified = cov_sumX_sumY_stratified / var_sumX_forXY_stratified; + sqrt_failsafe_input_8 = var_sumX_forXY_stratified * var_sumY_forXY_stratified; + sqrt_failsafe_output_8 = sqrt_failsafe (sqrt_failsafe_input_8); +corr_XY_stratified = cov_sumX_sumY_stratified / sqrt_failsafe_output_8; +r_sqr_X_vs_Y_stratified = cov_sumX_sumY_stratified * cov_sumX_sumY_stratified / (var_sumX_forXY_stratified * var_sumY_forXY_stratified); +r_sqr_X_vs_Y_stratified = corr_XY_stratified * corr_XY_stratified; + sqrt_failsafe_input_9 = (1 - r_sqr_X_vs_Y_stratified) * var_sumY_forXY_stratified / var_sumX_forXY_stratified / (cnt_XY_excluding_NaNstrata - num_XY_nonempty_strata - 1); +stdev_slope_XY_stratified = sqrt_failsafe (sqrt_failsafe_input_9); + sqrt_failsafe_input_10 = (1 - r_sqr_X_vs_Y_stratified) * var_sumY_forXY_stratified / (cnt_XY_excluding_NaNstrata - num_XY_nonempty_strata - 1); +stdev_errY_vs_X_stratified = sqrt_failsafe (sqrt_failsafe_input_10); +fStat_Y_vs_X_stratified = (cnt_XY_excluding_NaNstrata - num_XY_nonempty_strata - 1) * r_sqr_X_vs_Y_stratified / (1 - r_sqr_X_vs_Y_stratified); +p_val_Y_vs_X_stratified = fStat_tailprob (fStat_Y_vs_X_stratified, ones_XY, cnt_XY_excluding_NaNstrata - num_XY_nonempty_strata - 1); + +print ("Preparing the output matrix..."); +OutMtx = matrix (0.0, rows = 40, cols = num_attrs_XY); + +OutMtx [ 1, ] = Xcols %*% Proj_X_to_XY; # 1st variate column number +OutMtx [ 2, ] = cnt_X_global %*% Proj_X_to_XY; # 1st variate global presence count +OutMtx [ 3, ] = avg_X_global %*% Proj_X_to_XY; # 1st variate global mean +OutMtx [ 4, ] = stdev_X_global %*% Proj_X_to_XY; # 1st variate global standard deviation +OutMtx [ 5, ] = stdev_X_stratified %*% Proj_X_to_XY; # 1st variate stratified standard deviation +OutMtx [ 6, ] = r_sqr_X_vs_strata %*% Proj_X_to_XY; # R-squared, 1st variate vs. strata +OutMtx [ 7, ] = p_val_X_vs_strata %*% Proj_X_to_XY; # P-value, 1st variate vs. strata +OutMtx [11, ] = Ycols %*% Proj_Y_to_XY; # 2nd variate column number +OutMtx [12, ] = cnt_Y_global %*% Proj_Y_to_XY; # 2nd variate global presence count +OutMtx [13, ] = avg_Y_global %*% Proj_Y_to_XY; # 2nd variate global mean +OutMtx [14, ] = stdev_Y_global %*% Proj_Y_to_XY; # 2nd variate global standard deviation +OutMtx [15, ] = stdev_Y_stratified %*% Proj_Y_to_XY; # 2nd variate stratified standard deviation +OutMtx [16, ] = r_sqr_Y_vs_strata %*% Proj_Y_to_XY; # R-squared, 2nd variate vs. strata +OutMtx [17, ] = p_val_Y_vs_strata %*% Proj_Y_to_XY; # P-value, 2nd variate vs. 
strata + + +OutMtx [21, ] = cnt_XY_global; # Global 1st & 2nd variate presence count +OutMtx [22, ] = slope_XY_global; # Global regression slope (2nd variate vs. 1st variate) +OutMtx [23, ] = stdev_slope_XY_global; # Global regression slope standard deviation +OutMtx [24, ] = corr_XY_global; # Global correlation = +/- sqrt(R-squared) +OutMtx [25, ] = stdev_errY_vs_X_global; # Global residual standard deviation +OutMtx [26, ] = r_sqr_X_vs_Y_global; # Global R-squared +OutMtx [27, ] = p_val_Y_vs_X_global; # Global P-value for hypothesis "slope = 0" +OutMtx [31, ] = cnt_XY_excluding_NaNstrata; # Stratified 1st & 2nd variate presence count +OutMtx [32, ] = slope_XY_stratified; # Stratified regression slope (2nd variate vs. 1st variate) +OutMtx [33, ] = stdev_slope_XY_stratified; # Stratified regression slope standard deviation +OutMtx [34, ] = corr_XY_stratified; # Stratified correlation = +/- sqrt(R-squared) +OutMtx [35, ] = stdev_errY_vs_X_stratified; # Stratified residual standard deviation +OutMtx [36, ] = r_sqr_X_vs_Y_stratified; # Stratified R-squared +OutMtx [37, ] = p_val_Y_vs_X_stratified; # Stratified P-value for hypothesis "slope = 0" +OutMtx [38, ] = colSums (ppred (Cnt_XY_per_stratum, 2, ">=")); # Number of strata with at least two counted points + +OutMtx = t(OutMtx); + +print ("Writing the output matrix..."); +write (OutMtx, $5, format="text"); +print ("END STRATIFIED STATISTICS SCRIPT"); + + +deNaN = externalFunction (Matrix[Double] A) return (Matrix[Double] B) + implemented in (classname = "org.apache.sysml.udf.lib.DeNaNWrapper", exectype = "mem"); + +fStat_tailprob = function (Matrix[double] fStat, Matrix[double] df_1, Matrix[double] df_2) return (Matrix[double] tailprob) +{ # TEMPORARY IMPLEMENTATION + tailprob = fStat; + for (i in 1:nrow(fStat)) { + for (j in 1:ncol(fStat)) { + q = castAsScalar (fStat [i, j]); + d1 = castAsScalar (df_1 [i, j]); + d2 = castAsScalar (df_2 [i, j]); + if (d1 >= 1 & d2 >= 1 & q >= 0.0) { + tailprob [i, j] = pf (target = q, df1 = d1, df2 = d2, lower.tail=FALSE); + } else { + tailprob [i, j] = 0/0; + } + } } +} + +sqrt_failsafe = function (Matrix[double] input_A) return (Matrix[double] output_A) +{ + NaN = 0/0; + mask_A = ppred (input_A, 0.0, ">="); + prep_A = input_A * mask_A; + mask_A = mask_A - mask_A * (ppred (prep_A, NaN, "==")); + prep_A = deNaN (prep_A); + output_A = sqrt (prep_A) / mask_A; +} + +sumup = function (Matrix[double] A) return (Matrix[double] sum_A) +{ + shift = 1; + m_A = nrow(A); + sum_A = A; + while (shift < m_A) { + sum_A [(shift+1):m_A, ] = sum_A [(shift+1):m_A, ] + sum_A [1:(m_A-shift), ]; + shift = 2 * shift; + } +}
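Side note on the stratified bivariate formulas above: each stratum contributes sums of squares and cross-products taken about its own stratum mean, so the pooled slope cov_sumX_sumY_stratified / var_sumX_forXY_stratified factors out between-stratum shifts that would bias a global regression. A minimal NumPy sketch of this idea for a single (x, y) column pair, assuming integer stratum labels and no NaN cells (the DML script additionally handles the NaN masking); this is an illustrative cross-check, not part of the commit:

    import numpy as np

    def stratified_slope(x, y, strata):
        # Pooled within-stratum regression slope of y on x: every stratum
        # contributes its (co)variance sums about its own stratum mean.
        var_x_sum = 0.0
        cov_xy_sum = 0.0
        for s in np.unique(strata):
            idx = (strata == s)
            if idx.sum() < 2:
                continue  # a single-point stratum contributes zero anyway
            dx = x[idx] - x[idx].mean()
            dy = y[idx] - y[idx].mean()
            var_x_sum += (dx * dx).sum()
            cov_xy_sum += (dx * dy).sum()
        return cov_xy_sum / var_x_sum

    # Hypothetical example data: strata shift x but not the x-to-y
    # relationship, so the global slope is badly attenuated while the
    # stratified slope recovers the true coefficient (2.0, up to noise).
    rng = np.random.default_rng(0)
    strata = np.repeat([1, 2, 3], 50)
    x = rng.normal(size=150) + 5.0 * strata
    y = 2.0 * x - 10.0 * strata + rng.normal(size=150)
    print(round(float(np.polyfit(x, y, 1)[0]), 2))   # global slope, ~0.1
    print(round(float(stratified_slope(x, y, strata)), 2))  # stratified, ~2.0
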
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/ctableStats/wilson_score.dml ---------------------------------------------------------------------- diff --git a/src/test/scripts/applications/ctableStats/wilson_score.dml b/src/test/scripts/applications/ctableStats/wilson_score.dml index 90db3fc..27d0899 100644 --- a/src/test/scripts/applications/ctableStats/wilson_score.dml +++ b/src/test/scripts/applications/ctableStats/wilson_score.dml @@ -1,145 +1,145 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -# Computes 95% confidence intervals for binomial ratios using Wilson Score and Exact Score -# INPUT 1: Matrix [rows, 2] of integer counts (m, n) where 0 <= m <= n -# INPUT 2: The number of rows -# INPUT 3: The output file -# OUTPUT : Matrix [rows, 15] of doubles, containing the following information: -# (m / sum(m), Wilson 95%-conf.left, Wilson 95%-conf.right, Exact 95%-conf.left, Exact 95%-conf.right, -# n / sum(n), Wilson 95%-conf.left, Wilson 95%-conf.right, Exact 95%-conf.left, Exact 95%-conf.right, -# m / n, Wilson 95%-conf.left, Wilson 95%-conf.right, Exact 95%-conf.left, Exact 95%-conf.right) -# PLEASE BE AWARE THAT FOR EXTREMELY SMALL COUNTS THE WILSON INTERVALS WILL BE WRONG! THEY USE GAUSSIAN APPROXIMATION! 
-# EXAMPLE: wilson_score.dml -args "test/scripts/applications/ctableStats/wilson_test_input.mtx" 7 "test/scripts/applications/ctableStats/wilson_test_output.mtx" - -setwd ("test/scripts/applications/ctableStats"); -source ("Binomial.dml"); - -# test_n = Rand (rows = 1, cols = 1, min = 6, max = 6); -# test_m = Rand (rows = 1, cols = 1, min = 0, max = 0); -# test_p = Rand (rows = 1, cols = 1, min = 0.00421, max = 0.00421); -# [alpha] = binomProb (test_n, test_m, test_p); -# print ("TEST: Prob [Binom (" + castAsScalar (test_n) + ", " + castAsScalar (test_p) + ") <= " + castAsScalar (test_m) + "] = " + castAsScalar (alpha)); - -print ("BEGIN WILSON SCORE SCRIPT"); -print ("Reading X..."); -X = read ($1, rows = $2, cols = 2, format = "text"); -num_rows = $2; -print ("Performing the computation..."); - -M = X [, 1]; -N = X [, 2]; -blahh = Rand (rows = num_rows, cols = 1, min = 1.0, max = 1.0); -sum_M = blahh * sum(M); -sum_N = blahh * sum(N); - -[p_m_sum, l_m_sum_wilson, r_m_sum_wilson] = wilson_confidence (sum_M, M); -[p_n_sum, l_n_sum_wilson, r_n_sum_wilson] = wilson_confidence (sum_N, N); -[p_m_n, l_m_n_wilson, r_m_n_wilson] = wilson_confidence (N, M); - -M_minus_1 = M - 1; -N_minus_1 = N - 1; -big_alpha = 0.975 * blahh; -small_alpha = 0.025 * blahh; - -[l_m_sum_exact] = binomQuantile (sum_M, M_minus_1, big_alpha); -[r_m_sum_exact] = binomQuantile (sum_M, M, small_alpha); -[l_n_sum_exact] = binomQuantile (sum_N, N_minus_1, big_alpha); -[r_n_sum_exact] = binomQuantile (sum_N, N, small_alpha); -[l_m_n_exact] = binomQuantile (N, M_minus_1, big_alpha); -[r_m_n_exact] = binomQuantile (N, M, small_alpha); - -result = Rand (rows = num_rows, cols = 15, min = 0.0, max = 0.0); -result [, 1] = p_m_sum; -result [, 2] = l_m_sum_wilson; -result [, 3] = r_m_sum_wilson; -result [, 4] = l_m_sum_exact; -result [, 5] = r_m_sum_exact; -result [, 6] = p_n_sum; -result [, 7] = l_n_sum_wilson; -result [, 8] = r_n_sum_wilson; -result [, 9] = l_n_sum_exact; -result [, 10] = r_n_sum_exact; -result [, 11] = p_m_n; -result [, 12] = l_m_n_wilson; -result [, 13] = r_m_n_wilson; -result [, 14] = l_m_n_exact; -result [, 15] = r_m_n_exact; - -print ("M / sum(M) RESULTS: Wilson, Exact"); - -for (i in 1:num_rows) { - p1 = castAsScalar (round (result [i, 1] * 100000) / 1000); - lw1 = castAsScalar (round (result [i, 2] * 100000) / 1000); - rw1 = castAsScalar (round (result [i, 3] * 100000) / 1000); - le1 = castAsScalar (round (result [i, 4] * 100000) / 1000); - re1 = castAsScalar (round (result [i, 5] * 100000) / 1000); - print ("Row " + i + ": " - + castAsScalar (M [i, 1]) + "/" + castAsScalar (sum_M [i, 1]) + " = " - + p1 + "% [" + lw1 + "%, " + rw1 + "%] [" + le1 + "%, " + re1 + "%]"); -} - -print ("N / sum(N) RESULTS: Wilson, Exact"); - -for (i in 1:num_rows) { - p2 = castAsScalar (round (result [i, 6] * 100000) / 1000); - lw2 = castAsScalar (round (result [i, 7] * 100000) / 1000); - rw2 = castAsScalar (round (result [i, 8] * 100000) / 1000); - le2 = castAsScalar (round (result [i, 9] * 100000) / 1000); - re2 = castAsScalar (round (result [i, 10] * 100000) / 1000); - print ("Row " + i + ": " - + castAsScalar (N [i, 1]) + "/" + castAsScalar (sum_N [i, 1]) + " = " - + p2 + "% [" + lw2 + "%, " + rw2 + "%] [" + le2 + "%, " + re2 + "%] "); -} - -print ("M / N RESULTS: Wilson, Exact"); - -for (i in 1:num_rows) { - p3 = castAsScalar (round (result [i, 11] * 100000) / 1000); - lw3 = castAsScalar (round (result [i, 12] * 100000) / 1000); - rw3 = castAsScalar (round (result [i, 13] * 100000) / 1000); - le3 = castAsScalar (round 
(result [i, 14] * 100000) / 1000); - re3 = castAsScalar (round (result [i, 15] * 100000) / 1000); - print ("Row " + i + ": " - + castAsScalar (M [i, 1]) + "/" + castAsScalar ( N [i, 1]) + " = " - + p3 + "% [" + lw3 + "%, " + rw3 + "%] [" + le3 + "%, " + re3 + "%] "); -} - - - -print ("Writing the results..."); -write (result, $3, format = "text"); -print ("END WILSON SCORE SCRIPT"); - - -wilson_confidence = function (Matrix[double] n, Matrix[double] m) -return (Matrix[double] ratio, Matrix[double] conf_left, Matrix[double] conf_right) -{ - z = 1.96; # 97.5% normal percentile, for 95% confidence interval - z_sq_n = z * z * n; - qroot = sqrt (z_sq_n * (m * (n - m) + z_sq_n / 4)); - midpt = n * m + z_sq_n / 2; - denom = n * n + z_sq_n; - ratio = m / n; - conf_left = (midpt - qroot) / denom; - conf_right = (midpt + qroot) / denom; -} - +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# Computes 95% confidence intervals for binomial ratios using Wilson Score and Exact Score +# INPUT 1: Matrix [rows, 2] of integer counts (m, n) where 0 <= m <= n +# INPUT 2: The number of rows +# INPUT 3: The output file +# OUTPUT : Matrix [rows, 15] of doubles, containing the following information: +# (m / sum(m), Wilson 95%-conf.left, Wilson 95%-conf.right, Exact 95%-conf.left, Exact 95%-conf.right, +# n / sum(n), Wilson 95%-conf.left, Wilson 95%-conf.right, Exact 95%-conf.left, Exact 95%-conf.right, +# m / n, Wilson 95%-conf.left, Wilson 95%-conf.right, Exact 95%-conf.left, Exact 95%-conf.right) +# PLEASE BE AWARE THAT FOR EXTREMELY SMALL COUNTS THE WILSON INTERVALS WILL BE WRONG! THEY USE GAUSSIAN APPROXIMATION! 
+# EXAMPLE: wilson_score.dml -args "test/scripts/applications/ctableStats/wilson_test_input.mtx" 7 "test/scripts/applications/ctableStats/wilson_test_output.mtx" + +setwd ("test/scripts/applications/ctableStats"); +source ("Binomial.dml"); + +# test_n = Rand (rows = 1, cols = 1, min = 6, max = 6); +# test_m = Rand (rows = 1, cols = 1, min = 0, max = 0); +# test_p = Rand (rows = 1, cols = 1, min = 0.00421, max = 0.00421); +# [alpha] = binomProb (test_n, test_m, test_p); +# print ("TEST: Prob [Binom (" + castAsScalar (test_n) + ", " + castAsScalar (test_p) + ") <= " + castAsScalar (test_m) + "] = " + castAsScalar (alpha)); + +print ("BEGIN WILSON SCORE SCRIPT"); +print ("Reading X..."); +X = read ($1, rows = $2, cols = 2, format = "text"); +num_rows = $2; +print ("Performing the computation..."); + +M = X [, 1]; +N = X [, 2]; +blahh = Rand (rows = num_rows, cols = 1, min = 1.0, max = 1.0); +sum_M = blahh * sum(M); +sum_N = blahh * sum(N); + +[p_m_sum, l_m_sum_wilson, r_m_sum_wilson] = wilson_confidence (sum_M, M); +[p_n_sum, l_n_sum_wilson, r_n_sum_wilson] = wilson_confidence (sum_N, N); +[p_m_n, l_m_n_wilson, r_m_n_wilson] = wilson_confidence (N, M); + +M_minus_1 = M - 1; +N_minus_1 = N - 1; +big_alpha = 0.975 * blahh; +small_alpha = 0.025 * blahh; + +[l_m_sum_exact] = binomQuantile (sum_M, M_minus_1, big_alpha); +[r_m_sum_exact] = binomQuantile (sum_M, M, small_alpha); +[l_n_sum_exact] = binomQuantile (sum_N, N_minus_1, big_alpha); +[r_n_sum_exact] = binomQuantile (sum_N, N, small_alpha); +[l_m_n_exact] = binomQuantile (N, M_minus_1, big_alpha); +[r_m_n_exact] = binomQuantile (N, M, small_alpha); + +result = Rand (rows = num_rows, cols = 15, min = 0.0, max = 0.0); +result [, 1] = p_m_sum; +result [, 2] = l_m_sum_wilson; +result [, 3] = r_m_sum_wilson; +result [, 4] = l_m_sum_exact; +result [, 5] = r_m_sum_exact; +result [, 6] = p_n_sum; +result [, 7] = l_n_sum_wilson; +result [, 8] = r_n_sum_wilson; +result [, 9] = l_n_sum_exact; +result [, 10] = r_n_sum_exact; +result [, 11] = p_m_n; +result [, 12] = l_m_n_wilson; +result [, 13] = r_m_n_wilson; +result [, 14] = l_m_n_exact; +result [, 15] = r_m_n_exact; + +print ("M / sum(M) RESULTS: Wilson, Exact"); + +for (i in 1:num_rows) { + p1 = castAsScalar (round (result [i, 1] * 100000) / 1000); + lw1 = castAsScalar (round (result [i, 2] * 100000) / 1000); + rw1 = castAsScalar (round (result [i, 3] * 100000) / 1000); + le1 = castAsScalar (round (result [i, 4] * 100000) / 1000); + re1 = castAsScalar (round (result [i, 5] * 100000) / 1000); + print ("Row " + i + ": " + + castAsScalar (M [i, 1]) + "/" + castAsScalar (sum_M [i, 1]) + " = " + + p1 + "% [" + lw1 + "%, " + rw1 + "%] [" + le1 + "%, " + re1 + "%]"); +} + +print ("N / sum(N) RESULTS: Wilson, Exact"); + +for (i in 1:num_rows) { + p2 = castAsScalar (round (result [i, 6] * 100000) / 1000); + lw2 = castAsScalar (round (result [i, 7] * 100000) / 1000); + rw2 = castAsScalar (round (result [i, 8] * 100000) / 1000); + le2 = castAsScalar (round (result [i, 9] * 100000) / 1000); + re2 = castAsScalar (round (result [i, 10] * 100000) / 1000); + print ("Row " + i + ": " + + castAsScalar (N [i, 1]) + "/" + castAsScalar (sum_N [i, 1]) + " = " + + p2 + "% [" + lw2 + "%, " + rw2 + "%] [" + le2 + "%, " + re2 + "%] "); +} + +print ("M / N RESULTS: Wilson, Exact"); + +for (i in 1:num_rows) { + p3 = castAsScalar (round (result [i, 11] * 100000) / 1000); + lw3 = castAsScalar (round (result [i, 12] * 100000) / 1000); + rw3 = castAsScalar (round (result [i, 13] * 100000) / 1000); + le3 = castAsScalar (round 
(result [i, 14] * 100000) / 1000); + re3 = castAsScalar (round (result [i, 15] * 100000) / 1000); + print ("Row " + i + ": " + + castAsScalar (M [i, 1]) + "/" + castAsScalar ( N [i, 1]) + " = " + + p3 + "% [" + lw3 + "%, " + rw3 + "%] [" + le3 + "%, " + re3 + "%] "); +} + + + +print ("Writing the results..."); +write (result, $3, format = "text"); +print ("END WILSON SCORE SCRIPT"); + + +wilson_confidence = function (Matrix[double] n, Matrix[double] m) +return (Matrix[double] ratio, Matrix[double] conf_left, Matrix[double] conf_right) +{ + z = 1.96; # 97.5% normal percentile, for 95% confidence interval + z_sq_n = z * z * n; + qroot = sqrt (z_sq_n * (m * (n - m) + z_sq_n / 4)); + midpt = n * m + z_sq_n / 2; + denom = n * n + z_sq_n; + ratio = m / n; + conf_left = (midpt - qroot) / denom; + conf_right = (midpt + qroot) / denom; +} + http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/ctableStats/zipftest.dml ---------------------------------------------------------------------- diff --git a/src/test/scripts/applications/ctableStats/zipftest.dml b/src/test/scripts/applications/ctableStats/zipftest.dml index 08fe217..50ac5f4 100644 --- a/src/test/scripts/applications/ctableStats/zipftest.dml +++ b/src/test/scripts/applications/ctableStats/zipftest.dml @@ -1,77 +1,77 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -# Generator of random records with boolean features -# Average record (row) and feature (column) densities follow -# power laws: E(#1s in line k) = const / (k + add)^pow -# Cell[1, 1] has the highest probability to be 1, also input - -# By setting num_features >> num_records we allow lots of rare -# features while keeping most records nonempty. -# The power ("pow") in the power law determines the tail behavior; -# The additive ("add") determines how steeply the density changes -# in the first few records or features. 
- -num_records = 1000; # The number of records (rows) -num_features = 50000; # The number of boolean features (columns) - -pow_records = 2.0; # The Zipf law power for record density -pow_features = 1.0; # The Zipf law power for feature density - -add_records = 100.0; # The additive shift for record density -add_features = 20.0; # The additive shift for feature density - -max_cell_prob = 1.0; # The probability for Cell[1, 1] to be 1 - -############ - -c = max_cell_prob * ((1.0 + add_records)^pow_records) * ((1.0 + add_features)^pow_features); - -vec_records = matrix (1.0, rows = num_records, cols = 1); -vec_records = sumup (vec_records); -vec_records = 1.0 / ((vec_records + add_records)^pow_records); - -vec_features = matrix (1.0, rows = num_features, cols = 1); -vec_features = sumup (vec_features); -vec_features = 1.0 / ((t(vec_features) + add_features)^pow_features); - -Probs = c * (vec_records %*% vec_features); -avg_density_records = rowSums (Probs); -avg_density_features = colSums (Probs); - -Tosses = Rand (rows = num_records, cols = num_features, min = 0.0, max = 1.0); -Data = ppred (Tosses, Probs, "<="); - -write (avg_density_records, "Zipf.AvgDensity.Rows", format="text"); -write (avg_density_features, "Zipf.AvgDensity.Cols", format="text"); -write (Data, "Zipf.Data", format="text"); - - -sumup = function (Matrix[double] A) return (Matrix[double] sum_A) -{ - shift = 1; - m_A = nrow(A); - sum_A = A; - while (shift < m_A) { - sum_A [(shift+1):m_A, ] = sum_A [(shift+1):m_A, ] + sum_A [1:(m_A-shift), ]; - shift = 2 * shift; - } -} +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# Generator of random records with boolean features +# Average record (row) and feature (column) densities follow +# power laws: E(#1s in line k) = const / (k + add)^pow +# Cell[1, 1] has the highest probability to be 1, also input + +# By setting num_features >> num_records we allow lots of rare +# features while keeping most records nonempty. +# The power ("pow") in the power law determines the tail behavior; +# The additive ("add") determines how steeply the density changes +# in the first few records or features. 
+ +num_records = 1000; # The number of records (rows) +num_features = 50000; # The number of boolean features (columns) + +pow_records = 2.0; # The Zipf law power for record density +pow_features = 1.0; # The Zipf law power for feature density + +add_records = 100.0; # The additive shift for record density +add_features = 20.0; # The additive shift for feature density + +max_cell_prob = 1.0; # The probability for Cell[1, 1] to be 1 + +############ + +c = max_cell_prob * ((1.0 + add_records)^pow_records) * ((1.0 + add_features)^pow_features); + +vec_records = matrix (1.0, rows = num_records, cols = 1); +vec_records = sumup (vec_records); +vec_records = 1.0 / ((vec_records + add_records)^pow_records); + +vec_features = matrix (1.0, rows = num_features, cols = 1); +vec_features = sumup (vec_features); +vec_features = 1.0 / ((t(vec_features) + add_features)^pow_features); + +Probs = c * (vec_records %*% vec_features); +avg_density_records = rowSums (Probs); +avg_density_features = colSums (Probs); + +Tosses = Rand (rows = num_records, cols = num_features, min = 0.0, max = 1.0); +Data = ppred (Tosses, Probs, "<="); + +write (avg_density_records, "Zipf.AvgDensity.Rows", format="text"); +write (avg_density_features, "Zipf.AvgDensity.Cols", format="text"); +write (Data, "Zipf.Data", format="text"); + + +sumup = function (Matrix[double] A) return (Matrix[double] sum_A) +{ + shift = 1; + m_A = nrow(A); + sum_A = A; + while (shift < m_A) { + sum_A [(shift+1):m_A, ] = sum_A [(shift+1):m_A, ] + sum_A [1:(m_A-shift), ]; + shift = 2 * shift; + } +} http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/descriptivestats/Categorical.R ---------------------------------------------------------------------- diff --git a/src/test/scripts/applications/descriptivestats/Categorical.R b/src/test/scripts/applications/descriptivestats/Categorical.R index 46c380d..f88c8bf 100644 --- a/src/test/scripts/applications/descriptivestats/Categorical.R +++ b/src/test/scripts/applications/descriptivestats/Categorical.R @@ -1,57 +1,57 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-#
-#-------------------------------------------------------------
-
-# JUnit test class: dml.test.integration.descriptivestats.UnivariateStatsTest.java
-# command line invocation assuming $C_HOME is set to the home of the R script
-# Rscript $C_HOME/Categorical.R $C_HOME/in/ $C_HOME/expected/
-args <- commandArgs(TRUE)
-options(digits=22)
-
-#library("batch")
-library("Matrix")
-
-V = readMM(paste(args[1], "vector.mtx", sep=""))
-
-tab = table(V[,1])
-cat = t(as.numeric(names(tab)))
-Nc = t(as.vector(tab))
-
-# the number of categories of a categorical variable
-R = length(Nc)
-
-# total count
-s = sum(Nc)
-
-# percentage values of each categorical compare to the total case number
-Pc = Nc / s
-
-# all categorical values of a categorical variable
-C = (Nc > 0)
-
-# mode
-mx = max(Nc)
-Mode = (Nc == mx)
-
-writeMM(as(t(Nc),"CsparseMatrix"), paste(args[2], "Nc", sep=""), format="text");
-write(R, paste(args[2], "R", sep=""));
-writeMM(as(t(Pc),"CsparseMatrix"), paste(args[2], "Pc", sep=""), format="text");
-writeMM(as(t(C),"CsparseMatrix"), paste(args[2], "C", sep=""), format="text");
-writeMM(as(t(Mode),"CsparseMatrix"), paste(args[2], "Mode", sep=""), format="text");
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# JUnit test class: dml.test.integration.descriptivestats.UnivariateStatsTest.java
+# command line invocation assuming $C_HOME is set to the home of the R script
+# Rscript $C_HOME/Categorical.R $C_HOME/in/ $C_HOME/expected/
+args <- commandArgs(TRUE)
+options(digits=22)
+
+#library("batch")
+library("Matrix")
+
+V = readMM(paste(args[1], "vector.mtx", sep=""))
+
+tab = table(V[,1])
+cat = t(as.numeric(names(tab)))
+Nc = t(as.vector(tab))
+
+# the number of categories of a categorical variable
+R = length(Nc)
+
+# total count
+s = sum(Nc)
+
+# percentage of each categorical value relative to the total number of cases
+Pc = Nc / s
+
+# all categorical values of a categorical variable
+C = (Nc > 0)
+
+# mode
+mx = max(Nc)
+Mode = (Nc == mx)
+
+writeMM(as(t(Nc),"CsparseMatrix"), paste(args[2], "Nc", sep=""), format="text");
+write(R, paste(args[2], "R", sep=""));
+writeMM(as(t(Pc),"CsparseMatrix"), paste(args[2], "Pc", sep=""), format="text");
+writeMM(as(t(C),"CsparseMatrix"), paste(args[2], "C", sep=""), format="text");
+writeMM(as(t(Mode),"CsparseMatrix"), paste(args[2], "Mode", sep=""), format="text");

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/descriptivestats/Categorical.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/applications/descriptivestats/Categorical.dml b/src/test/scripts/applications/descriptivestats/Categorical.dml
index 735f5d6..7599d06 100644
--- a/src/test/scripts/applications/descriptivestats/Categorical.dml
+++ b/src/test/scripts/applications/descriptivestats/Categorical.dml
@@ -1,55 +1,55 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# Note this script is externalized to customers, please do not change w/o consulting component owner.
-# How to invoke this dml script Categorical.dml?
-# Assume C_HOME is set to the home of the dml script
-# Assume input and output directories are on hdfs as INPUT_DIR and OUTPUT_DIR
-# Assume rows = 10000 for vector
-# hadoop jar SystemML.jar -f $C_HOME/Categorical.dml -args "$INPUT_DIR/vector" 10000 "$OUTPUT_DIR/Nc" "$OUPUT_DIR/R" "$OUTPUT_DIR/Pc" "$OUTPUT_DIR/C" "$OUTPUT_DIR/Mode"
-
-V = read($1, rows=$2, cols=1, format="text")
-
-# a set of number of values specify the number of cases of each categorical
-Nc = table(V,1);
-
-# the number of categories of a categorical variable
-R = nrow(Nc)
-
-# total count
-s = sum(Nc)
-
-# percentage values of each categorical compare to the total case number
-Pc = Nc / s
-
-# all categorical values of a categorical variable
-C = ppred(Nc, 0, ">")
-
-# mode
-mx = max(Nc)
-Mode = ppred(Nc, mx, "==")
-
-write(Nc, $3, format="text")
-write(R, $4)
-write(Pc, $5, format="text")
-write(C, $6, format="text")
-write(Mode, $7, format="text")
-
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# Note: this script is externalized to customers; please do not change it without consulting the component owner.
+# How to invoke this dml script Categorical.dml?
+# Assume C_HOME is set to the home of the dml script
+# Assume input and output directories are on hdfs as INPUT_DIR and OUTPUT_DIR
+# Assume rows = 10000 for vector
+# hadoop jar SystemML.jar -f $C_HOME/Categorical.dml -args "$INPUT_DIR/vector" 10000 "$OUTPUT_DIR/Nc" "$OUTPUT_DIR/R" "$OUTPUT_DIR/Pc" "$OUTPUT_DIR/C" "$OUTPUT_DIR/Mode"
+
+V = read($1, rows=$2, cols=1, format="text")
+
+# Nc: the number of cases observed for each categorical value
+Nc = table(V,1);
+
+# the number of categories of a categorical variable
+R = nrow(Nc)
+
+# total count
+s = sum(Nc)
+
+# percentage of each categorical value relative to the total number of cases
+Pc = Nc / s
+
+# all categorical values of a categorical variable
+C = ppred(Nc, 0, ">")
+
+# mode
+mx = max(Nc)
+Mode = ppred(Nc, mx, "==")
+
+write(Nc, $3, format="text")
+write(R, $4)
+write(Pc, $5, format="text")
+write(C, $6, format="text")
+write(Mode, $7, format="text")
+

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/descriptivestats/CategoricalCategorical.R
----------------------------------------------------------------------
diff --git a/src/test/scripts/applications/descriptivestats/CategoricalCategorical.R b/src/test/scripts/applications/descriptivestats/CategoricalCategorical.R
index 7c9785c..24a391f 100644
--- a/src/test/scripts/applications/descriptivestats/CategoricalCategorical.R
+++ b/src/test/scripts/applications/descriptivestats/CategoricalCategorical.R
@@ -1,49 +1,49 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# JUnit test class: dml.test.integration.descriptivestats.CategoricalCategoricalTest.java
-# command line invocation assuming $CC_HOME is set to the home of the R script
-# Rscript $CC_HOME/CategoricalCategorical.R $CC_HOME/in/ $CC_HOME/expected/
-args <- commandArgs(TRUE)
-options(digits=22)
-
-library("Matrix")
-
-A = readMM(paste(args[1], "A.mtx", sep=""));
-B = readMM(paste(args[1], "B.mtx", sep=""));
-
-F = table(A[,1],B[,1]);
-
-# chisq.test returns a list containing statistic, p-value, etc.
-cst = chisq.test(F);
-
-# get the chi-squared coefficient from the list
-chi_squared = as.numeric(cst[1]);
-pValue = as.numeric(cst[3]);
-
-write(pValue, paste(args[2], "PValue", sep=""));
-
-q = min(dim(F));
-W = sum(F);
-cramers_v = sqrt(chi_squared/(W*(q-1)));
-
-write(cramers_v, paste(args[2], "CramersV", sep=""));
-
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# JUnit test class: dml.test.integration.descriptivestats.CategoricalCategoricalTest.java
+# command line invocation assuming $CC_HOME is set to the home of the R script
+# Rscript $CC_HOME/CategoricalCategorical.R $CC_HOME/in/ $CC_HOME/expected/
+args <- commandArgs(TRUE)
+options(digits=22)
+
+library("Matrix")
+
+A = readMM(paste(args[1], "A.mtx", sep=""));
+B = readMM(paste(args[1], "B.mtx", sep=""));
+
+F = table(A[,1],B[,1]);
+
+# chisq.test returns a list containing statistic, p-value, etc.
+cst = chisq.test(F);
+
+# get the chi-squared statistic from the list
+chi_squared = as.numeric(cst[1]);
+pValue = as.numeric(cst[3]);
+
+write(pValue, paste(args[2], "PValue", sep=""));
+
+q = min(dim(F));
+W = sum(F);
+cramers_v = sqrt(chi_squared/(W*(q-1)));
+
+write(cramers_v, paste(args[2], "CramersV", sep=""));
+

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/descriptivestats/CategoricalCategorical.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/applications/descriptivestats/CategoricalCategorical.dml b/src/test/scripts/applications/descriptivestats/CategoricalCategorical.dml
index 626bfeb..4301f91 100644
--- a/src/test/scripts/applications/descriptivestats/CategoricalCategorical.dml
+++ b/src/test/scripts/applications/descriptivestats/CategoricalCategorical.dml
@@ -1,56 +1,56 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# Note this script is externalized to customers, please do not change w/o consulting component owner.
-# How to invoke this dml script CategoricalCategorical.dml?
-# Assume CC_HOME is set to the home of the dml script
-# Assume input and output directories are on hdfs as INPUT_DIR and OUTPUT_DIR
-# Assume rows = 10000 for both A and B
-# hadoop jar SystemML.jar -f $CC_HOME/CategoricalCategorical.dml -args "$INPUT_DIR/A" 10000 "$INPUT_DIR/B" "$OUPUT_DIR/PValue" "$OUTPUT_DIR/CramersV"
-
-A = read($1, rows=$2, cols=1, format="text");
-B = read($3, rows=$2, cols=1, format="text");
-
-# Contingency Table
-F = table(A,B);
-
-# Chi-Squared
-W = sum(F);
-r = rowSums(F);
-c = colSums(F);
-E = (r %*% c)/W;
-T = (F-E)^2/E;
-chi_squared = sum(T);
-
-# compute p-value
-degFreedom = (nrow(F)-1)*(ncol(F)-1);
-pValue = pchisq(target=chi_squared, df=degFreedom, lower.tail=FALSE);
-
-# Cramer's V
-R = nrow(F);
-C = ncol(F);
-q = min(R,C);
-cramers_v = sqrt(chi_squared/(W*(q-1)));
-
-write(pValue, $4);
-write(cramers_v, $5);
-
-
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# Note: this script is externalized to customers; please do not change it without consulting the component owner.
+# How to invoke this dml script CategoricalCategorical.dml?
+# Assume CC_HOME is set to the home of the dml script
+# Assume input and output directories are on hdfs as INPUT_DIR and OUTPUT_DIR
+# Assume rows = 10000 for both A and B
+# hadoop jar SystemML.jar -f $CC_HOME/CategoricalCategorical.dml -args "$INPUT_DIR/A" 10000 "$INPUT_DIR/B" "$OUTPUT_DIR/PValue" "$OUTPUT_DIR/CramersV"
+
+A = read($1, rows=$2, cols=1, format="text");
+B = read($3, rows=$2, cols=1, format="text");
+
+# Contingency Table
+F = table(A,B);
+
+# Chi-Squared
+W = sum(F);
+r = rowSums(F);
+c = colSums(F);
+E = (r %*% c)/W;
+T = (F-E)^2/E;
+chi_squared = sum(T);
+
+# compute p-value
+degFreedom = (nrow(F)-1)*(ncol(F)-1);
+pValue = pchisq(target=chi_squared, df=degFreedom, lower.tail=FALSE);
+
+# Cramer's V
+R = nrow(F);
+C = ncol(F);
+q = min(R,C);
+cramers_v = sqrt(chi_squared/(W*(q-1)));
+
+write(pValue, $4);
+write(cramers_v, $5);
+
+

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/descriptivestats/CategoricalCategoricalWithWeightsTest.R
----------------------------------------------------------------------
diff --git a/src/test/scripts/applications/descriptivestats/CategoricalCategoricalWithWeightsTest.R b/src/test/scripts/applications/descriptivestats/CategoricalCategoricalWithWeightsTest.R
index 9e3f797..bacc7e3 100644
--- a/src/test/scripts/applications/descriptivestats/CategoricalCategoricalWithWeightsTest.R
+++ b/src/test/scripts/applications/descriptivestats/CategoricalCategoricalWithWeightsTest.R
@@ -1,67 +1,67 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# JUnit test class: dml.test.integration.descriptivestats.CategoricalCategoricalTest.java
-# command line invocation assuming $CC_HOME is set to the home of the R script
-# Rscript $CC_HOME/CategoricalCategoricalWithWeightsTest.R $CC_HOME/in/ $CC_HOME/expected/
-# Usage: R --vanilla -args Xfile X < CategoricalCategoricalWithWeightsTest.R
-
-args <- commandArgs(TRUE)
-options(digits=22)
-
-library("Matrix")
-
-#parseCommandArgs()
-######################
-
-print(commandArgs(TRUE)[1])
-
-A = readMM(paste(args[1], "A.mtx", sep=""));
-B = readMM(paste(args[1], "B.mtx", sep=""));
-WM = readMM(paste(args[1], "WM.mtx", sep=""));
-
-Av = A[,1];
-Bv = B[,1];
-WMv = WM[,1];
-
-# create a data frame with vectors A, B, WM
-df = data.frame(Av,Bv,WMv);
-
-# contingency table with weights
-F = xtabs ( WMv ~ Av + Bv, df);
-
-# chisq.test returns a list containing statistic, p-value, etc.
-cst = chisq.test(F);
-
-# get the chi-squared coefficient from the list
-chi_squared = as.numeric(cst[1]);
-pValue = as.numeric(cst[3]);
-
-write(pValue, paste(args[2], "PValue", sep=""));
-
-#######################
-
-q = min(dim(F));
-W = sum(F);
-cramers_v = sqrt(chi_squared/(W*(q-1)));
-
-write(cramers_v, paste(args[2], "CramersV", sep=""));
-
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# JUnit test class: dml.test.integration.descriptivestats.CategoricalCategoricalTest.java
+# command line invocation assuming $CC_HOME is set to the home of the R script
+# Rscript $CC_HOME/CategoricalCategoricalWithWeightsTest.R $CC_HOME/in/ $CC_HOME/expected/
+# Usage: R --vanilla -args Xfile X < CategoricalCategoricalWithWeightsTest.R
+
+args <- commandArgs(TRUE)
+options(digits=22)
+
+library("Matrix")
+
+#parseCommandArgs()
+######################
+
+print(commandArgs(TRUE)[1])
+
+A = readMM(paste(args[1], "A.mtx", sep=""));
+B = readMM(paste(args[1], "B.mtx", sep=""));
+WM = readMM(paste(args[1], "WM.mtx", sep=""));
+
+Av = A[,1];
+Bv = B[,1];
+WMv = WM[,1];
+
+# create a data frame with vectors A, B, WM
+df = data.frame(Av,Bv,WMv);
+
+# contingency table with weights
+F = xtabs ( WMv ~ Av + Bv, df);
+
+# chisq.test returns a list containing statistic, p-value, etc.
+cst = chisq.test(F);
+
+# get the chi-squared statistic from the list
+chi_squared = as.numeric(cst[1]);
+pValue = as.numeric(cst[3]);
+
+write(pValue, paste(args[2], "PValue", sep=""));
+
+#######################
+
+q = min(dim(F));
+W = sum(F);
+cramers_v = sqrt(chi_squared/(W*(q-1)));
+
+write(cramers_v, paste(args[2], "CramersV", sep=""));
+

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/descriptivestats/CategoricalCategoricalWithWeightsTest.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/applications/descriptivestats/CategoricalCategoricalWithWeightsTest.dml b/src/test/scripts/applications/descriptivestats/CategoricalCategoricalWithWeightsTest.dml
index f92f42c..70e50f4 100644
--- a/src/test/scripts/applications/descriptivestats/CategoricalCategoricalWithWeightsTest.dml
+++ b/src/test/scripts/applications/descriptivestats/CategoricalCategoricalWithWeightsTest.dml
@@ -1,60 +1,60 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# Note this script is externalized to customers, please do not change w/o consulting component owner.
-# How to invoke this dml script CategoricalCategorical.dml?
-# Assume CC_HOME is set to the home of the dml script
-# Assume input and output directories are on hdfs as INPUT_DIR and OUTPUT_DIR
-# Assume rows = 10000 for both A and B
-# hadoop jar SystemML.jar -f $CC_HOME/CategoricalCategoricalWithWeightsTest.dml -args "$INPUT_DIR/A" 10000 "$INPUT_DIR/B" "$INPUT_DIR/WM" "$OUPUT_DIR/PValue" "$OUTPUT_DIR/CramersV"
-
-# A <- nominal
-# B <- nominal
-# WM <- weights
-
-A = read($1, rows=$2, cols=1, format="text");
-B = read($3, rows=$2, cols=1, format="text");
-WM = read($4, rows=$2, cols=1, format="text");
-
-# Contingency Table
-F = table(A,B,WM);
-
-# Chi-Squared
-W = sum(F);
-r = rowSums(F);
-c = colSums(F);
-E = (r %*% c)/W;
-T = (F-E)^2/E;
-chi_squared = sum(T);
-
-# compute p-value
-degFreedom = (nrow(F)-1)*(ncol(F)-1);
-pValue = pchisq(target=chi_squared, df=degFreedom, lower.tail=FALSE);
-
-# Cramer's V
-R = nrow(F);
-C = ncol(F);
-q = min(R,C);
-cramers_v = sqrt(chi_squared/(W*(q-1)));
-
-write(pValue, $5);
-write(cramers_v, $6);
-
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# Note: this script is externalized to customers; please do not change it without consulting the component owner.
+# How to invoke this dml script CategoricalCategoricalWithWeightsTest.dml?
+# Assume CC_HOME is set to the home of the dml script
+# Assume input and output directories are on hdfs as INPUT_DIR and OUTPUT_DIR
+# Assume rows = 10000 for both A and B
+# hadoop jar SystemML.jar -f $CC_HOME/CategoricalCategoricalWithWeightsTest.dml -args "$INPUT_DIR/A" 10000 "$INPUT_DIR/B" "$INPUT_DIR/WM" "$OUTPUT_DIR/PValue" "$OUTPUT_DIR/CramersV"
+
+# A <- nominal
+# B <- nominal
+# WM <- weights
+
+A = read($1, rows=$2, cols=1, format="text");
+B = read($3, rows=$2, cols=1, format="text");
+WM = read($4, rows=$2, cols=1, format="text");
+
+# Contingency Table
+F = table(A,B,WM);
+
+# Chi-Squared
+W = sum(F);
+r = rowSums(F);
+c = colSums(F);
+E = (r %*% c)/W;
+T = (F-E)^2/E;
+chi_squared = sum(T);
+
+# compute p-value
+degFreedom = (nrow(F)-1)*(ncol(F)-1);
+pValue = pchisq(target=chi_squared, df=degFreedom, lower.tail=FALSE);
+
+# Cramer's V
+R = nrow(F);
+C = ncol(F);
+q = min(R,C);
+cramers_v = sqrt(chi_squared/(W*(q-1)));
+
+write(pValue, $5);
+write(cramers_v, $6);
+
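Since both the weighted and unweighted DML variants above derive the Pearson chi-squared statistic from first principles (expected counts E = r %*% c / W), a small self-contained R check may be useful. The table below is a hypothetical toy example, and the result can be compared against R's built-in chisq.test:

    # Toy 2 x 3 contingency table (hypothetical counts).
    F <- matrix(c(20, 30, 10,
                  25, 25, 15), nrow = 2, byrow = TRUE)
    W <- sum(F)
    E <- (rowSums(F) %o% colSums(F)) / W             # expected counts, as in the DML
    chi_squared <- sum((F - E)^2 / E)
    degFreedom  <- (nrow(F) - 1) * (ncol(F) - 1)
    pValue      <- pchisq(chi_squared, df = degFreedom, lower.tail = FALSE)
    cramers_v   <- sqrt(chi_squared / (W * (min(dim(F)) - 1)))
    # chisq.test(F)$statistic and $p.value should match chi_squared and pValue
    # (no continuity correction is applied for tables larger than 2 x 2)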
