http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/scripts/algorithms/stratstats.dml ---------------------------------------------------------------------- diff --git a/scripts/algorithms/stratstats.dml b/scripts/algorithms/stratstats.dml index fc846f3..2b7425d 100644 --- a/scripts/algorithms/stratstats.dml +++ b/scripts/algorithms/stratstats.dml @@ -1,396 +1,396 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -# -# STRATIFIED BIVARIATE STATISTICS, VERSION 4 -# -# INPUT PARAMETERS: -# ----------------------------------------------------------------------------- -# NAME TYPE DEFAULT MEANING -# ----------------------------------------------------------------------------- -# X String --- Location to read matrix X that has all 1-st covariates -# Y String " " Location to read matrix Y that has all 2-nd covariates -# the default value " " means "use X in place of Y" -# S String " " Location to read matrix S that has the stratum column -# the default value " " means "use X in place of S" -# Xcid String " " Location to read the 1-st covariate X-column indices -# the default value " " means "use columns 1 : ncol(X)" -# Ycid String " " Location to read the 2-nd covariate Y-column indices -# the default value " " means "use columns 1 : ncol(Y)" -# Scid Int 1 Column index of the stratum column in S -# O String --- Location to store the output matrix (see below) -# fmt String "text" Matrix output format, usually "text" or "csv" -# ----------------------------------------------------------------------------- -# Note: the stratum column must contain small positive integers; all fractional -# values are rounded; strata with ID <= 0 or NaN are treated as missing. -# -# OUTPUT MATRIX: -# One row per each distinct pair (1st covariate, 2nd covariate) -# 40 columns containing the following information: -# Col 01: 1st covariate X-column number -# Col 02: 1st covariate global presence count -# Col 03: 1st covariate global mean -# Col 04: 1st covariate global standard deviation -# Col 05: 1st covariate stratified standard deviation -# Col 06: R-squared, 1st covariate vs. strata -# Col 07: adjusted R-squared, 1st covariate vs. strata -# Col 08: P-value, 1st covariate vs. strata -# Col 09-10: Reserved -# Col 11: 2nd covariate Y-column number -# Col 12: 2nd covariate global presence count -# Col 13: 2nd covariate global mean -# Col 14: 2nd covariate global standard deviation -# Col 15: 2nd covariate stratified standard deviation -# Col 16: R-squared, 2nd covariate vs. strata -# Col 17: adjusted R-squared, 2nd covariate vs. strata -# Col 18: P-value, 2nd covariate vs. 
strata -# Col 19-20: Reserved -# Col 21: Global 1st & 2nd covariate presence count -# Col 22: Global regression slope (2nd vs. 1st covariate) -# Col 23: Global regression slope standard deviation -# Col 24: Global correlation = +/- sqrt(R-squared) -# Col 25: Global residual standard deviation -# Col 26: Global R-squared -# Col 27: Global adjusted R-squared -# Col 28: Global P-value for hypothesis "slope = 0" -# Col 29-30: Reserved -# Col 31: Stratified 1st & 2nd covariate presence count -# Col 32: Stratified regression slope (2nd vs. 1st covariate) -# Col 33: Stratified regression slope standard deviation -# Col 34: Stratified correlation = +/- sqrt(R-squared) -# Col 35: Stratified residual standard deviation -# Col 36: Stratified R-squared -# Col 37: Stratified adjusted R-squared -# Col 38: Stratified P-value for hypothesis "slope = 0" -# Col 39: Number of strata with at least two counted points -# Col 40: Reserved -# -# EXAMPLES: -# -# hadoop jar SystemML.jar -f stratstats.dml -nvargs X=INPUT_DIR/X.mtx Xcid=INPUT_DIR/Xcid.mtx -# Y=INPUT_DIR/Y.mtx Ycid=INPUT_DIR/Ycid.mtx S=INPUT_DIR/S.mtx Scid=1 O=OUTPUT_DIR/Out.mtx fmt=csv -# -# hadoop jar SystemML.jar -f stratstats.dml -nvargs X=INPUT_DIR/Data.mtx Xcid=INPUT_DIR/Xcid.mtx -# Ycid=INPUT_DIR/Ycid.mtx Scid=1 O=OUTPUT_DIR/Out.mtx - -fileX = $X; -fileY = ifdef ($Y, " "); -fileS = ifdef ($S, " "); -fileO = $O; -fmtO = ifdef ($fmt, "text"); - -fileXcid = ifdef ($Xcid, " "); -fileYcid = ifdef ($Ycid, " "); -stratum_column_id = ifdef ($Scid, 1); - -print ("BEGIN STRATIFIED STATISTICS SCRIPT"); - -print ("Reading the input matrices..."); - -XwithNaNs = read (fileX); -if (fileY != " ") { - YwithNaNs = read (fileY); -} else { - YwithNaNs = XwithNaNs; -} -if (fileS != " ") { - SwithNaNsFull = read (fileS); - SwithNaNs = SwithNaNsFull [, stratum_column_id]; -} else { - SwithNaNs = XwithNaNs [, stratum_column_id]; -} -if (fileXcid != " ") { - Xcols = read (fileXcid); -} else { - Xcols = t(seq (1, ncol (XwithNaNs), 1)); -} -if (fileYcid != " ") { - Ycols = read (fileYcid); -} else { - Ycols = t(seq (1, ncol (YwithNaNs), 1)); -} -tXcols = t(Xcols); -tYcols = t(Ycols); - -num_records = nrow (XwithNaNs); -num_attrs = ncol (XwithNaNs); -num_attrs_X = ncol (Xcols); -num_attrs_Y = ncol (Ycols); -num_attrs_XY = num_attrs_X * num_attrs_Y; - -print ("Preparing the covariates..."); - -XnoNaNs = replace (target = XwithNaNs, pattern = 0.0/0.0, replacement = 0); -YnoNaNs = replace (target = YwithNaNs, pattern = 0.0/0.0, replacement = 0); -XNaNmask = ppred (XwithNaNs, XwithNaNs, "=="); -YNaNmask = ppred (YwithNaNs, YwithNaNs, "=="); -one_to_num_attrs_X = seq (1, num_attrs_X, 1); -one_to_num_attrs_Y = seq (1, num_attrs_Y, 1); -ProjX = matrix (0, rows = num_attrs, cols = num_attrs_X); -ProjY = matrix (0, rows = num_attrs, cols = num_attrs_Y); - -ProjX_ctable = table (tXcols, one_to_num_attrs_X); -ProjX [1 : nrow (ProjX_ctable), ] = ProjX_ctable; - -ProjY_ctable = table (tYcols, one_to_num_attrs_Y); -ProjY [1 : nrow (ProjY_ctable), ] = ProjY_ctable; - -X = XnoNaNs %*% ProjX; -Y = YnoNaNs %*% ProjY; -X_mask = XNaNmask %*% ProjX; -Y_mask = YNaNmask %*% ProjY; - -print ("Preparing the strata..."); - -SnoNaNs = replace (target = SwithNaNs, pattern = 0.0/0.0, replacement = 0); -S = round (SnoNaNs) * ppred (SnoNaNs, 0.0, ">"); -Proj_good_stratumID = diag (ppred (S, 0.0, ">")); -Proj_good_stratumID = removeEmpty (target = Proj_good_stratumID, margin = "rows"); -vector_of_good_stratumIDs = Proj_good_stratumID %*% S; -vector_of_good_stratumIDs = vector_of_good_stratumIDs + (1 - 
min (vector_of_good_stratumIDs)); -num_records_with_good_stratumID = nrow (Proj_good_stratumID); -one_to_num_records_with_good_stratumID = seq (1, num_records_with_good_stratumID, 1); - -# Create a group-by summation matrix for records over stratum IDs -# "with_empty" means with stratum IDs that never occur in records - -num_strata_with_empty = max (vector_of_good_stratumIDs); -StrataSummator_with_empty = table (vector_of_good_stratumIDs, one_to_num_records_with_good_stratumID); -StrataSummator = removeEmpty (target = StrataSummator_with_empty, margin = "rows"); -StrataSummator = StrataSummator %*% Proj_good_stratumID; -num_strata = nrow (StrataSummator); -num_empty_strata = num_strata_with_empty - num_strata; -print ("There are " + num_strata + " nonempty strata and " + num_empty_strata + " empty but positive-ID strata."); - -print ("Computing the global single-variate statistics..."); - -cnt_X_global = colSums (X_mask); -cnt_Y_global = colSums (Y_mask); -avg_X_global = colSums (X) / cnt_X_global; -avg_Y_global = colSums (Y) / cnt_Y_global; -var_sumX_global = colSums (X * X) - cnt_X_global * (avg_X_global * avg_X_global); -var_sumY_global = colSums (Y * Y) - cnt_Y_global * (avg_Y_global * avg_Y_global); - sqrt_failsafe_input_1 = var_sumX_global / (cnt_X_global - 1); -stdev_X_global = sqrt_failsafe (sqrt_failsafe_input_1); - sqrt_failsafe_input_2 = var_sumY_global / (cnt_Y_global - 1); -stdev_Y_global = sqrt_failsafe (sqrt_failsafe_input_2); - -print ("Computing the stratified single-variate statistics..."); - -# Compute per-stratum statistics, prevent div-0 for locally empty (due to NaNs in X or Y) strata - -Cnt_X_per_stratum = StrataSummator %*% X_mask; -Cnt_Y_per_stratum = StrataSummator %*% Y_mask; -Is_none_X_per_stratum = ppred (Cnt_X_per_stratum, 0, "=="); -Is_none_Y_per_stratum = ppred (Cnt_Y_per_stratum, 0, "=="); -One_over_cnt_X_per_stratum = (1 - Is_none_X_per_stratum) / (Cnt_X_per_stratum + Is_none_X_per_stratum); -One_over_cnt_Y_per_stratum = (1 - Is_none_Y_per_stratum) / (Cnt_Y_per_stratum + Is_none_Y_per_stratum); -num_X_nonempty_strata = num_strata - colSums (Is_none_X_per_stratum); -num_Y_nonempty_strata = num_strata - colSums (Is_none_Y_per_stratum); - -Sum_X_per_stratum = StrataSummator %*% X; -Sum_Y_per_stratum = StrataSummator %*% Y; - -# Recompute some global statistics to exclude bad stratum-ID records - -cnt_X_with_good_stratumID = colSums (Cnt_X_per_stratum); -cnt_Y_with_good_stratumID = colSums (Cnt_Y_per_stratum); -sum_X_with_good_stratumID = colSums (Sum_X_per_stratum); -sum_Y_with_good_stratumID = colSums (Sum_Y_per_stratum); -var_sumX_with_good_stratumID = colSums (StrataSummator %*% (X * X)) - (sum_X_with_good_stratumID * sum_X_with_good_stratumID) / cnt_X_with_good_stratumID; -var_sumY_with_good_stratumID = colSums (StrataSummator %*% (Y * Y)) - (sum_Y_with_good_stratumID * sum_Y_with_good_stratumID) / cnt_Y_with_good_stratumID; - -# Compute the stratified statistics - -var_sumX_stratified = colSums (StrataSummator %*% (X * X)) - colSums (One_over_cnt_X_per_stratum * Sum_X_per_stratum * Sum_X_per_stratum); -var_sumY_stratified = colSums (StrataSummator %*% (Y * Y)) - colSums (One_over_cnt_Y_per_stratum * Sum_Y_per_stratum * Sum_Y_per_stratum); - sqrt_failsafe_input_3 = var_sumX_stratified / (cnt_X_with_good_stratumID - num_X_nonempty_strata); -stdev_X_stratified = sqrt_failsafe (sqrt_failsafe_input_3); - sqrt_failsafe_input_4 = var_sumY_stratified / (cnt_Y_with_good_stratumID - num_Y_nonempty_strata); -stdev_Y_stratified = sqrt_failsafe 
(sqrt_failsafe_input_4); -r_sqr_X_vs_strata = 1 - var_sumX_stratified / var_sumX_with_good_stratumID; -r_sqr_Y_vs_strata = 1 - var_sumY_stratified / var_sumY_with_good_stratumID; -adj_r_sqr_X_vs_strata = 1 - (var_sumX_stratified / (cnt_X_with_good_stratumID - num_X_nonempty_strata)) / (var_sumX_with_good_stratumID / (cnt_X_with_good_stratumID - 1)); -adj_r_sqr_Y_vs_strata = 1 - (var_sumY_stratified / (cnt_Y_with_good_stratumID - num_Y_nonempty_strata)) / (var_sumY_with_good_stratumID / (cnt_Y_with_good_stratumID - 1)); -fStat_X_vs_strata = ((var_sumX_with_good_stratumID - var_sumX_stratified) / (num_X_nonempty_strata - 1)) / (var_sumX_stratified / (cnt_X_with_good_stratumID - num_X_nonempty_strata)); -fStat_Y_vs_strata = ((var_sumY_with_good_stratumID - var_sumY_stratified) / (num_Y_nonempty_strata - 1)) / (var_sumY_stratified / (cnt_Y_with_good_stratumID - num_Y_nonempty_strata)); -p_val_X_vs_strata = fStat_tailprob (fStat_X_vs_strata, num_X_nonempty_strata - 1, cnt_X_with_good_stratumID - num_X_nonempty_strata); -p_val_Y_vs_strata = fStat_tailprob (fStat_Y_vs_strata, num_Y_nonempty_strata - 1, cnt_Y_with_good_stratumID - num_Y_nonempty_strata); - -print ("Computing the global bivariate statistics..."); - -# Compute the aggregate X vs. Y statistics and map them into proper positions - -cnt_XY_rectangle = t(X_mask) %*% Y_mask; -sum_X_forXY_rectangle = t(X) %*% Y_mask; -sum_XX_forXY_rectangle = t(X * X) %*% Y_mask; -sum_Y_forXY_rectangle = t(X_mask) %*% Y; -sum_YY_forXY_rectangle = t(X_mask) %*% (Y * Y); -sum_XY_rectangle = t(X) %*% Y; -cnt_XY_global = matrix (cnt_XY_rectangle, rows = 1, cols = num_attrs_XY, byrow = TRUE); -sum_X_forXY_global = matrix (sum_X_forXY_rectangle, rows = 1, cols = num_attrs_XY, byrow = TRUE); -sum_XX_forXY_global = matrix (sum_XX_forXY_rectangle, rows = 1, cols = num_attrs_XY, byrow = TRUE); -sum_Y_forXY_global = matrix (sum_Y_forXY_rectangle, rows = 1, cols = num_attrs_XY, byrow = TRUE); -sum_YY_forXY_global = matrix (sum_YY_forXY_rectangle, rows = 1, cols = num_attrs_XY, byrow = TRUE); -sum_XY_global = matrix (sum_XY_rectangle, rows = 1, cols = num_attrs_XY, byrow = TRUE); -ones_XY = matrix (1.0, rows = 1, cols = num_attrs_XY); - -# Compute the global bivariate statistics for output - -cov_sumX_sumY_global = sum_XY_global - sum_X_forXY_global * sum_Y_forXY_global / cnt_XY_global; -var_sumX_forXY_global = sum_XX_forXY_global - sum_X_forXY_global * sum_X_forXY_global / cnt_XY_global; -var_sumY_forXY_global = sum_YY_forXY_global - sum_Y_forXY_global * sum_Y_forXY_global / cnt_XY_global; -slope_XY_global = cov_sumX_sumY_global / var_sumX_forXY_global; - sqrt_failsafe_input_5 = var_sumX_forXY_global * var_sumY_forXY_global; - sqrt_failsafe_output_5 = sqrt_failsafe (sqrt_failsafe_input_5); -corr_XY_global = cov_sumX_sumY_global / sqrt_failsafe_output_5; -r_sqr_X_vs_Y_global = cov_sumX_sumY_global * cov_sumX_sumY_global / (var_sumX_forXY_global * var_sumY_forXY_global); -adj_r_sqr_X_vs_Y_global = 1 - (1 - r_sqr_X_vs_Y_global) * (cnt_XY_global - 1) / (cnt_XY_global - 2); - sqrt_failsafe_input_6 = (1 - r_sqr_X_vs_Y_global) * var_sumY_forXY_global / var_sumX_forXY_global / (cnt_XY_global - 2) -stdev_slope_XY_global = sqrt_failsafe (sqrt_failsafe_input_6); - sqrt_failsafe_input_7 = (1 - r_sqr_X_vs_Y_global) * var_sumY_forXY_global / (cnt_XY_global - 2) -stdev_errY_vs_X_global = sqrt_failsafe (sqrt_failsafe_input_7); -fStat_Y_vs_X_global = (cnt_XY_global - 2) * r_sqr_X_vs_Y_global / (1 - r_sqr_X_vs_Y_global); -p_val_Y_vs_X_global = fStat_tailprob (fStat_Y_vs_X_global, 
ones_XY, cnt_XY_global - 2); - -print ("Computing the stratified bivariate statistics..."); - -# Create projections to "intermingle" X and Y into attribute pairs - -Proj_X_to_XY = matrix (0.0, rows = num_attrs_X, cols = num_attrs_XY); -Proj_Y_to_XY = matrix (0.0, rows = num_attrs_Y, cols = num_attrs_XY); -ones_Y_col = matrix (1.0, rows = num_attrs_Y, cols = 1); -for (i in 1:num_attrs_X) { - start_cid = (i - 1) * num_attrs_Y + 1; - end_cid = i * num_attrs_Y; - Proj_X_to_XY [i, start_cid:end_cid] = t(ones_Y_col); - Proj_Y_to_XY [ , start_cid:end_cid] = diag (ones_Y_col); -} - -# Compute per-stratum statistics, prevent div-0 for locally empty (due to NaNs in X or Y) strata - -Cnt_XY_per_stratum = StrataSummator %*% (( X_mask %*% Proj_X_to_XY) * ( Y_mask %*% Proj_Y_to_XY)); -Sum_X_forXY_per_stratum = StrataSummator %*% (( X %*% Proj_X_to_XY) * ( Y_mask %*% Proj_Y_to_XY)); -Sum_XX_forXY_per_stratum = StrataSummator %*% (((X * X) %*% Proj_X_to_XY) * ( Y_mask %*% Proj_Y_to_XY)); -Sum_Y_forXY_per_stratum = StrataSummator %*% (( X_mask %*% Proj_X_to_XY) * ( Y %*% Proj_Y_to_XY)); -Sum_YY_forXY_per_stratum = StrataSummator %*% (( X_mask %*% Proj_X_to_XY) * ((Y * Y) %*% Proj_Y_to_XY)); -Sum_XY_per_stratum = StrataSummator %*% (( X %*% Proj_X_to_XY) * ( Y %*% Proj_Y_to_XY)); - -Is_none_XY_per_stratum = ppred (Cnt_XY_per_stratum, 0, "=="); -One_over_cnt_XY_per_stratum = (1 - Is_none_XY_per_stratum) / (Cnt_XY_per_stratum + Is_none_XY_per_stratum); -num_XY_nonempty_strata = num_strata - colSums (Is_none_XY_per_stratum); - -# Recompute some global aggregate X vs. Y statistics to exclude bad stratum-ID records - -cnt_XY_with_good_stratumID = colSums (Cnt_XY_per_stratum); -sum_XX_forXY_with_good_stratumID = colSums (Sum_XX_forXY_per_stratum); -sum_YY_forXY_with_good_stratumID = colSums (Sum_YY_forXY_per_stratum); -sum_XY_with_good_stratumID = colSums (Sum_XY_per_stratum); - -# Compute the stratified bivariate statistics - -var_sumX_forXY_stratified = sum_XX_forXY_with_good_stratumID - colSums (Sum_X_forXY_per_stratum * Sum_X_forXY_per_stratum * One_over_cnt_XY_per_stratum); -var_sumY_forXY_stratified = sum_YY_forXY_with_good_stratumID - colSums (Sum_Y_forXY_per_stratum * Sum_Y_forXY_per_stratum * One_over_cnt_XY_per_stratum); -cov_sumX_sumY_stratified = sum_XY_with_good_stratumID - colSums (Sum_X_forXY_per_stratum * Sum_Y_forXY_per_stratum * One_over_cnt_XY_per_stratum); - -slope_XY_stratified = cov_sumX_sumY_stratified / var_sumX_forXY_stratified; - sqrt_failsafe_input_8 = var_sumX_forXY_stratified * var_sumY_forXY_stratified; - sqrt_failsafe_output_8 = sqrt_failsafe (sqrt_failsafe_input_8); -corr_XY_stratified = cov_sumX_sumY_stratified / sqrt_failsafe_output_8; -r_sqr_X_vs_Y_stratified = (cov_sumX_sumY_stratified ^ 2) / (var_sumX_forXY_stratified * var_sumY_forXY_stratified); -temp_X_vs_Y_stratified = (1 - r_sqr_X_vs_Y_stratified) / (cnt_XY_with_good_stratumID - num_XY_nonempty_strata - 1) -adj_r_sqr_X_vs_Y_stratified = 1 - temp_X_vs_Y_stratified * (cnt_XY_with_good_stratumID - num_XY_nonempty_strata); - sqrt_failsafe_input_9 = temp_X_vs_Y_stratified * var_sumY_forXY_stratified; -stdev_errY_vs_X_stratified = sqrt_failsafe (sqrt_failsafe_input_9); - sqrt_failsafe_input_10 = sqrt_failsafe_input_9 / var_sumX_forXY_stratified; -stdev_slope_XY_stratified = sqrt_failsafe (sqrt_failsafe_input_10); -fStat_Y_vs_X_stratified = (cnt_XY_with_good_stratumID - num_XY_nonempty_strata - 1) * r_sqr_X_vs_Y_stratified / (1 - r_sqr_X_vs_Y_stratified); -p_val_Y_vs_X_stratified = fStat_tailprob (fStat_Y_vs_X_stratified, 
ones_XY, cnt_XY_with_good_stratumID - num_XY_nonempty_strata - 1); - -print ("Preparing the output matrix..."); -OutMtx = matrix (0.0, rows = 40, cols = num_attrs_XY); - -OutMtx [ 1, ] = Xcols %*% Proj_X_to_XY; # 1st covariate column number -OutMtx [ 2, ] = cnt_X_global %*% Proj_X_to_XY; # 1st covariate global presence count -OutMtx [ 3, ] = avg_X_global %*% Proj_X_to_XY; # 1st covariate global mean -OutMtx [ 4, ] = stdev_X_global %*% Proj_X_to_XY; # 1st covariate global standard deviation -OutMtx [ 5, ] = stdev_X_stratified %*% Proj_X_to_XY; # 1st covariate stratified standard deviation -OutMtx [ 6, ] = r_sqr_X_vs_strata %*% Proj_X_to_XY; # R-squared, 1st covariate vs. strata -OutMtx [ 7, ] = adj_r_sqr_X_vs_strata %*% Proj_X_to_XY; # adjusted R-squared, 1st covariate vs. strata -OutMtx [ 8, ] = p_val_X_vs_strata %*% Proj_X_to_XY; # P-value, 1st covariate vs. strata -OutMtx [11, ] = Ycols %*% Proj_Y_to_XY; # 2nd covariate column number -OutMtx [12, ] = cnt_Y_global %*% Proj_Y_to_XY; # 2nd covariate global presence count -OutMtx [13, ] = avg_Y_global %*% Proj_Y_to_XY; # 2nd covariate global mean -OutMtx [14, ] = stdev_Y_global %*% Proj_Y_to_XY; # 2nd covariate global standard deviation -OutMtx [15, ] = stdev_Y_stratified %*% Proj_Y_to_XY; # 2nd covariate stratified standard deviation -OutMtx [16, ] = r_sqr_Y_vs_strata %*% Proj_Y_to_XY; # R-squared, 2nd covariate vs. strata -OutMtx [17, ] = adj_r_sqr_Y_vs_strata %*% Proj_Y_to_XY; # adjusted R-squared, 2nd covariate vs. strata -OutMtx [18, ] = p_val_Y_vs_strata %*% Proj_Y_to_XY; # P-value, 2nd covariate vs. strata - -OutMtx [21, ] = cnt_XY_global; # Global 1st & 2nd covariate presence count -OutMtx [22, ] = slope_XY_global; # Global regression slope (2nd vs. 1st covariate) -OutMtx [23, ] = stdev_slope_XY_global; # Global regression slope standard deviation -OutMtx [24, ] = corr_XY_global; # Global correlation = +/- sqrt(R-squared) -OutMtx [25, ] = stdev_errY_vs_X_global; # Global residual standard deviation -OutMtx [26, ] = r_sqr_X_vs_Y_global; # Global R-squared -OutMtx [27, ] = adj_r_sqr_X_vs_Y_global; # Global adjusted R-squared -OutMtx [28, ] = p_val_Y_vs_X_global; # Global P-value for hypothesis "slope = 0" -OutMtx [31, ] = cnt_XY_with_good_stratumID; # Stratified 1st & 2nd covariate presence count -OutMtx [32, ] = slope_XY_stratified; # Stratified regression slope (2nd vs. 
1st covariate) -OutMtx [33, ] = stdev_slope_XY_stratified; # Stratified regression slope standard deviation -OutMtx [34, ] = corr_XY_stratified; # Stratified correlation = +/- sqrt(R-squared) -OutMtx [35, ] = stdev_errY_vs_X_stratified; # Stratified residual standard deviation -OutMtx [36, ] = r_sqr_X_vs_Y_stratified; # Stratified R-squared -OutMtx [37, ] = adj_r_sqr_X_vs_Y_stratified; # Stratified adjusted R-squared -OutMtx [38, ] = p_val_Y_vs_X_stratified; # Stratified P-value for hypothesis "slope = 0" -OutMtx [39, ] = colSums (ppred (Cnt_XY_per_stratum, 2, ">=")); # Number of strata with at least two counted points - -OutMtx = t(OutMtx); - -print ("Writing the output matrix..."); -write (OutMtx, fileO, format=fmtO); -print ("END STRATIFIED STATISTICS SCRIPT"); - - -fStat_tailprob = function (Matrix[double] fStat, Matrix[double] df_1, Matrix[double] df_2) return (Matrix[double] tailprob) -{ # TEMPORARY IMPLEMENTATION - tailprob = fStat; - for (i in 1:nrow(fStat)) { - for (j in 1:ncol(fStat)) { - q = castAsScalar (fStat [i, j]); - d1 = castAsScalar (df_1 [i, j]); - d2 = castAsScalar (df_2 [i, j]); - if (d1 >= 1 & d2 >= 1 & q >= 0.0) { - tailprob [i, j] = pf(target = q, df1 = d1, df2 = d2, lower.tail=FALSE); - } else { - tailprob [i, j] = 0/0; - } - } } -} - -sqrt_failsafe = function (Matrix[double] input_A) return (Matrix[double] output_A) -{ - mask_A = ppred (input_A, 0.0, ">="); - prep_A = input_A * mask_A; - mask_A = mask_A * ppred (prep_A, prep_A, "=="); - prep_A = replace (target = prep_A, pattern = 0.0/0.0, replacement = 0); - output_A = sqrt (prep_A) / mask_A; -} +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +#------------------------------------------------------------- + +# +# STRATIFIED BIVARIATE STATISTICS, VERSION 4 +# +# INPUT PARAMETERS: +# ----------------------------------------------------------------------------- +# NAME TYPE DEFAULT MEANING +# ----------------------------------------------------------------------------- +# X String --- Location to read matrix X that has all 1-st covariates +# Y String " " Location to read matrix Y that has all 2-nd covariates +# the default value " " means "use X in place of Y" +# S String " " Location to read matrix S that has the stratum column +# the default value " " means "use X in place of S" +# Xcid String " " Location to read the 1-st covariate X-column indices +# the default value " " means "use columns 1 : ncol(X)" +# Ycid String " " Location to read the 2-nd covariate Y-column indices +# the default value " " means "use columns 1 : ncol(Y)" +# Scid Int 1 Column index of the stratum column in S +# O String --- Location to store the output matrix (see below) +# fmt String "text" Matrix output format, usually "text" or "csv" +# ----------------------------------------------------------------------------- +# Note: the stratum column must contain small positive integers; all fractional +# values are rounded; strata with ID <= 0 or NaN are treated as missing. +# +# OUTPUT MATRIX: +# One row per each distinct pair (1st covariate, 2nd covariate) +# 40 columns containing the following information: +# Col 01: 1st covariate X-column number +# Col 02: 1st covariate global presence count +# Col 03: 1st covariate global mean +# Col 04: 1st covariate global standard deviation +# Col 05: 1st covariate stratified standard deviation +# Col 06: R-squared, 1st covariate vs. strata +# Col 07: adjusted R-squared, 1st covariate vs. strata +# Col 08: P-value, 1st covariate vs. strata +# Col 09-10: Reserved +# Col 11: 2nd covariate Y-column number +# Col 12: 2nd covariate global presence count +# Col 13: 2nd covariate global mean +# Col 14: 2nd covariate global standard deviation +# Col 15: 2nd covariate stratified standard deviation +# Col 16: R-squared, 2nd covariate vs. strata +# Col 17: adjusted R-squared, 2nd covariate vs. strata +# Col 18: P-value, 2nd covariate vs. strata +# Col 19-20: Reserved +# Col 21: Global 1st & 2nd covariate presence count +# Col 22: Global regression slope (2nd vs. 1st covariate) +# Col 23: Global regression slope standard deviation +# Col 24: Global correlation = +/- sqrt(R-squared) +# Col 25: Global residual standard deviation +# Col 26: Global R-squared +# Col 27: Global adjusted R-squared +# Col 28: Global P-value for hypothesis "slope = 0" +# Col 29-30: Reserved +# Col 31: Stratified 1st & 2nd covariate presence count +# Col 32: Stratified regression slope (2nd vs. 
1st covariate) +# Col 33: Stratified regression slope standard deviation +# Col 34: Stratified correlation = +/- sqrt(R-squared) +# Col 35: Stratified residual standard deviation +# Col 36: Stratified R-squared +# Col 37: Stratified adjusted R-squared +# Col 38: Stratified P-value for hypothesis "slope = 0" +# Col 39: Number of strata with at least two counted points +# Col 40: Reserved +# +# EXAMPLES: +# +# hadoop jar SystemML.jar -f stratstats.dml -nvargs X=INPUT_DIR/X.mtx Xcid=INPUT_DIR/Xcid.mtx +# Y=INPUT_DIR/Y.mtx Ycid=INPUT_DIR/Ycid.mtx S=INPUT_DIR/S.mtx Scid=1 O=OUTPUT_DIR/Out.mtx fmt=csv +# +# hadoop jar SystemML.jar -f stratstats.dml -nvargs X=INPUT_DIR/Data.mtx Xcid=INPUT_DIR/Xcid.mtx +# Ycid=INPUT_DIR/Ycid.mtx Scid=1 O=OUTPUT_DIR/Out.mtx + +fileX = $X; +fileY = ifdef ($Y, " "); +fileS = ifdef ($S, " "); +fileO = $O; +fmtO = ifdef ($fmt, "text"); + +fileXcid = ifdef ($Xcid, " "); +fileYcid = ifdef ($Ycid, " "); +stratum_column_id = ifdef ($Scid, 1); + +print ("BEGIN STRATIFIED STATISTICS SCRIPT"); + +print ("Reading the input matrices..."); + +XwithNaNs = read (fileX); +if (fileY != " ") { + YwithNaNs = read (fileY); +} else { + YwithNaNs = XwithNaNs; +} +if (fileS != " ") { + SwithNaNsFull = read (fileS); + SwithNaNs = SwithNaNsFull [, stratum_column_id]; +} else { + SwithNaNs = XwithNaNs [, stratum_column_id]; +} +if (fileXcid != " ") { + Xcols = read (fileXcid); +} else { + Xcols = t(seq (1, ncol (XwithNaNs), 1)); +} +if (fileYcid != " ") { + Ycols = read (fileYcid); +} else { + Ycols = t(seq (1, ncol (YwithNaNs), 1)); +} +tXcols = t(Xcols); +tYcols = t(Ycols); + +num_records = nrow (XwithNaNs); +num_attrs = ncol (XwithNaNs); +num_attrs_X = ncol (Xcols); +num_attrs_Y = ncol (Ycols); +num_attrs_XY = num_attrs_X * num_attrs_Y; + +print ("Preparing the covariates..."); + +XnoNaNs = replace (target = XwithNaNs, pattern = 0.0/0.0, replacement = 0); +YnoNaNs = replace (target = YwithNaNs, pattern = 0.0/0.0, replacement = 0); +XNaNmask = ppred (XwithNaNs, XwithNaNs, "=="); +YNaNmask = ppred (YwithNaNs, YwithNaNs, "=="); +one_to_num_attrs_X = seq (1, num_attrs_X, 1); +one_to_num_attrs_Y = seq (1, num_attrs_Y, 1); +ProjX = matrix (0, rows = num_attrs, cols = num_attrs_X); +ProjY = matrix (0, rows = num_attrs, cols = num_attrs_Y); + +ProjX_ctable = table (tXcols, one_to_num_attrs_X); +ProjX [1 : nrow (ProjX_ctable), ] = ProjX_ctable; + +ProjY_ctable = table (tYcols, one_to_num_attrs_Y); +ProjY [1 : nrow (ProjY_ctable), ] = ProjY_ctable; + +X = XnoNaNs %*% ProjX; +Y = YnoNaNs %*% ProjY; +X_mask = XNaNmask %*% ProjX; +Y_mask = YNaNmask %*% ProjY; + +print ("Preparing the strata..."); + +SnoNaNs = replace (target = SwithNaNs, pattern = 0.0/0.0, replacement = 0); +S = round (SnoNaNs) * ppred (SnoNaNs, 0.0, ">"); +Proj_good_stratumID = diag (ppred (S, 0.0, ">")); +Proj_good_stratumID = removeEmpty (target = Proj_good_stratumID, margin = "rows"); +vector_of_good_stratumIDs = Proj_good_stratumID %*% S; +vector_of_good_stratumIDs = vector_of_good_stratumIDs + (1 - min (vector_of_good_stratumIDs)); +num_records_with_good_stratumID = nrow (Proj_good_stratumID); +one_to_num_records_with_good_stratumID = seq (1, num_records_with_good_stratumID, 1); + +# Create a group-by summation matrix for records over stratum IDs +# "with_empty" means with stratum IDs that never occur in records + +num_strata_with_empty = max (vector_of_good_stratumIDs); +StrataSummator_with_empty = table (vector_of_good_stratumIDs, one_to_num_records_with_good_stratumID); +StrataSummator = removeEmpty (target = 
StrataSummator_with_empty, margin = "rows"); +StrataSummator = StrataSummator %*% Proj_good_stratumID; +num_strata = nrow (StrataSummator); +num_empty_strata = num_strata_with_empty - num_strata; +print ("There are " + num_strata + " nonempty strata and " + num_empty_strata + " empty but positive-ID strata."); + +print ("Computing the global single-variate statistics..."); + +cnt_X_global = colSums (X_mask); +cnt_Y_global = colSums (Y_mask); +avg_X_global = colSums (X) / cnt_X_global; +avg_Y_global = colSums (Y) / cnt_Y_global; +var_sumX_global = colSums (X * X) - cnt_X_global * (avg_X_global * avg_X_global); +var_sumY_global = colSums (Y * Y) - cnt_Y_global * (avg_Y_global * avg_Y_global); + sqrt_failsafe_input_1 = var_sumX_global / (cnt_X_global - 1); +stdev_X_global = sqrt_failsafe (sqrt_failsafe_input_1); + sqrt_failsafe_input_2 = var_sumY_global / (cnt_Y_global - 1); +stdev_Y_global = sqrt_failsafe (sqrt_failsafe_input_2); + +print ("Computing the stratified single-variate statistics..."); + +# Compute per-stratum statistics, prevent div-0 for locally empty (due to NaNs in X or Y) strata + +Cnt_X_per_stratum = StrataSummator %*% X_mask; +Cnt_Y_per_stratum = StrataSummator %*% Y_mask; +Is_none_X_per_stratum = ppred (Cnt_X_per_stratum, 0, "=="); +Is_none_Y_per_stratum = ppred (Cnt_Y_per_stratum, 0, "=="); +One_over_cnt_X_per_stratum = (1 - Is_none_X_per_stratum) / (Cnt_X_per_stratum + Is_none_X_per_stratum); +One_over_cnt_Y_per_stratum = (1 - Is_none_Y_per_stratum) / (Cnt_Y_per_stratum + Is_none_Y_per_stratum); +num_X_nonempty_strata = num_strata - colSums (Is_none_X_per_stratum); +num_Y_nonempty_strata = num_strata - colSums (Is_none_Y_per_stratum); + +Sum_X_per_stratum = StrataSummator %*% X; +Sum_Y_per_stratum = StrataSummator %*% Y; + +# Recompute some global statistics to exclude bad stratum-ID records + +cnt_X_with_good_stratumID = colSums (Cnt_X_per_stratum); +cnt_Y_with_good_stratumID = colSums (Cnt_Y_per_stratum); +sum_X_with_good_stratumID = colSums (Sum_X_per_stratum); +sum_Y_with_good_stratumID = colSums (Sum_Y_per_stratum); +var_sumX_with_good_stratumID = colSums (StrataSummator %*% (X * X)) - (sum_X_with_good_stratumID * sum_X_with_good_stratumID) / cnt_X_with_good_stratumID; +var_sumY_with_good_stratumID = colSums (StrataSummator %*% (Y * Y)) - (sum_Y_with_good_stratumID * sum_Y_with_good_stratumID) / cnt_Y_with_good_stratumID; + +# Compute the stratified statistics + +var_sumX_stratified = colSums (StrataSummator %*% (X * X)) - colSums (One_over_cnt_X_per_stratum * Sum_X_per_stratum * Sum_X_per_stratum); +var_sumY_stratified = colSums (StrataSummator %*% (Y * Y)) - colSums (One_over_cnt_Y_per_stratum * Sum_Y_per_stratum * Sum_Y_per_stratum); + sqrt_failsafe_input_3 = var_sumX_stratified / (cnt_X_with_good_stratumID - num_X_nonempty_strata); +stdev_X_stratified = sqrt_failsafe (sqrt_failsafe_input_3); + sqrt_failsafe_input_4 = var_sumY_stratified / (cnt_Y_with_good_stratumID - num_Y_nonempty_strata); +stdev_Y_stratified = sqrt_failsafe (sqrt_failsafe_input_4); +r_sqr_X_vs_strata = 1 - var_sumX_stratified / var_sumX_with_good_stratumID; +r_sqr_Y_vs_strata = 1 - var_sumY_stratified / var_sumY_with_good_stratumID; +adj_r_sqr_X_vs_strata = 1 - (var_sumX_stratified / (cnt_X_with_good_stratumID - num_X_nonempty_strata)) / (var_sumX_with_good_stratumID / (cnt_X_with_good_stratumID - 1)); +adj_r_sqr_Y_vs_strata = 1 - (var_sumY_stratified / (cnt_Y_with_good_stratumID - num_Y_nonempty_strata)) / (var_sumY_with_good_stratumID / (cnt_Y_with_good_stratumID - 1)); +fStat_X_vs_strata = 
((var_sumX_with_good_stratumID - var_sumX_stratified) / (num_X_nonempty_strata - 1)) / (var_sumX_stratified / (cnt_X_with_good_stratumID - num_X_nonempty_strata)); +fStat_Y_vs_strata = ((var_sumY_with_good_stratumID - var_sumY_stratified) / (num_Y_nonempty_strata - 1)) / (var_sumY_stratified / (cnt_Y_with_good_stratumID - num_Y_nonempty_strata)); +p_val_X_vs_strata = fStat_tailprob (fStat_X_vs_strata, num_X_nonempty_strata - 1, cnt_X_with_good_stratumID - num_X_nonempty_strata); +p_val_Y_vs_strata = fStat_tailprob (fStat_Y_vs_strata, num_Y_nonempty_strata - 1, cnt_Y_with_good_stratumID - num_Y_nonempty_strata); + +print ("Computing the global bivariate statistics..."); + +# Compute the aggregate X vs. Y statistics and map them into proper positions + +cnt_XY_rectangle = t(X_mask) %*% Y_mask; +sum_X_forXY_rectangle = t(X) %*% Y_mask; +sum_XX_forXY_rectangle = t(X * X) %*% Y_mask; +sum_Y_forXY_rectangle = t(X_mask) %*% Y; +sum_YY_forXY_rectangle = t(X_mask) %*% (Y * Y); +sum_XY_rectangle = t(X) %*% Y; +cnt_XY_global = matrix (cnt_XY_rectangle, rows = 1, cols = num_attrs_XY, byrow = TRUE); +sum_X_forXY_global = matrix (sum_X_forXY_rectangle, rows = 1, cols = num_attrs_XY, byrow = TRUE); +sum_XX_forXY_global = matrix (sum_XX_forXY_rectangle, rows = 1, cols = num_attrs_XY, byrow = TRUE); +sum_Y_forXY_global = matrix (sum_Y_forXY_rectangle, rows = 1, cols = num_attrs_XY, byrow = TRUE); +sum_YY_forXY_global = matrix (sum_YY_forXY_rectangle, rows = 1, cols = num_attrs_XY, byrow = TRUE); +sum_XY_global = matrix (sum_XY_rectangle, rows = 1, cols = num_attrs_XY, byrow = TRUE); +ones_XY = matrix (1.0, rows = 1, cols = num_attrs_XY); + +# Compute the global bivariate statistics for output + +cov_sumX_sumY_global = sum_XY_global - sum_X_forXY_global * sum_Y_forXY_global / cnt_XY_global; +var_sumX_forXY_global = sum_XX_forXY_global - sum_X_forXY_global * sum_X_forXY_global / cnt_XY_global; +var_sumY_forXY_global = sum_YY_forXY_global - sum_Y_forXY_global * sum_Y_forXY_global / cnt_XY_global; +slope_XY_global = cov_sumX_sumY_global / var_sumX_forXY_global; + sqrt_failsafe_input_5 = var_sumX_forXY_global * var_sumY_forXY_global; + sqrt_failsafe_output_5 = sqrt_failsafe (sqrt_failsafe_input_5); +corr_XY_global = cov_sumX_sumY_global / sqrt_failsafe_output_5; +r_sqr_X_vs_Y_global = cov_sumX_sumY_global * cov_sumX_sumY_global / (var_sumX_forXY_global * var_sumY_forXY_global); +adj_r_sqr_X_vs_Y_global = 1 - (1 - r_sqr_X_vs_Y_global) * (cnt_XY_global - 1) / (cnt_XY_global - 2); + sqrt_failsafe_input_6 = (1 - r_sqr_X_vs_Y_global) * var_sumY_forXY_global / var_sumX_forXY_global / (cnt_XY_global - 2) +stdev_slope_XY_global = sqrt_failsafe (sqrt_failsafe_input_6); + sqrt_failsafe_input_7 = (1 - r_sqr_X_vs_Y_global) * var_sumY_forXY_global / (cnt_XY_global - 2) +stdev_errY_vs_X_global = sqrt_failsafe (sqrt_failsafe_input_7); +fStat_Y_vs_X_global = (cnt_XY_global - 2) * r_sqr_X_vs_Y_global / (1 - r_sqr_X_vs_Y_global); +p_val_Y_vs_X_global = fStat_tailprob (fStat_Y_vs_X_global, ones_XY, cnt_XY_global - 2); + +print ("Computing the stratified bivariate statistics..."); + +# Create projections to "intermingle" X and Y into attribute pairs + +Proj_X_to_XY = matrix (0.0, rows = num_attrs_X, cols = num_attrs_XY); +Proj_Y_to_XY = matrix (0.0, rows = num_attrs_Y, cols = num_attrs_XY); +ones_Y_col = matrix (1.0, rows = num_attrs_Y, cols = 1); +for (i in 1:num_attrs_X) { + start_cid = (i - 1) * num_attrs_Y + 1; + end_cid = i * num_attrs_Y; + Proj_X_to_XY [i, start_cid:end_cid] = t(ones_Y_col); + Proj_Y_to_XY [ , 
start_cid:end_cid] = diag (ones_Y_col); +} + +# Compute per-stratum statistics, prevent div-0 for locally empty (due to NaNs in X or Y) strata + +Cnt_XY_per_stratum = StrataSummator %*% (( X_mask %*% Proj_X_to_XY) * ( Y_mask %*% Proj_Y_to_XY)); +Sum_X_forXY_per_stratum = StrataSummator %*% (( X %*% Proj_X_to_XY) * ( Y_mask %*% Proj_Y_to_XY)); +Sum_XX_forXY_per_stratum = StrataSummator %*% (((X * X) %*% Proj_X_to_XY) * ( Y_mask %*% Proj_Y_to_XY)); +Sum_Y_forXY_per_stratum = StrataSummator %*% (( X_mask %*% Proj_X_to_XY) * ( Y %*% Proj_Y_to_XY)); +Sum_YY_forXY_per_stratum = StrataSummator %*% (( X_mask %*% Proj_X_to_XY) * ((Y * Y) %*% Proj_Y_to_XY)); +Sum_XY_per_stratum = StrataSummator %*% (( X %*% Proj_X_to_XY) * ( Y %*% Proj_Y_to_XY)); + +Is_none_XY_per_stratum = ppred (Cnt_XY_per_stratum, 0, "=="); +One_over_cnt_XY_per_stratum = (1 - Is_none_XY_per_stratum) / (Cnt_XY_per_stratum + Is_none_XY_per_stratum); +num_XY_nonempty_strata = num_strata - colSums (Is_none_XY_per_stratum); + +# Recompute some global aggregate X vs. Y statistics to exclude bad stratum-ID records + +cnt_XY_with_good_stratumID = colSums (Cnt_XY_per_stratum); +sum_XX_forXY_with_good_stratumID = colSums (Sum_XX_forXY_per_stratum); +sum_YY_forXY_with_good_stratumID = colSums (Sum_YY_forXY_per_stratum); +sum_XY_with_good_stratumID = colSums (Sum_XY_per_stratum); + +# Compute the stratified bivariate statistics + +var_sumX_forXY_stratified = sum_XX_forXY_with_good_stratumID - colSums (Sum_X_forXY_per_stratum * Sum_X_forXY_per_stratum * One_over_cnt_XY_per_stratum); +var_sumY_forXY_stratified = sum_YY_forXY_with_good_stratumID - colSums (Sum_Y_forXY_per_stratum * Sum_Y_forXY_per_stratum * One_over_cnt_XY_per_stratum); +cov_sumX_sumY_stratified = sum_XY_with_good_stratumID - colSums (Sum_X_forXY_per_stratum * Sum_Y_forXY_per_stratum * One_over_cnt_XY_per_stratum); + +slope_XY_stratified = cov_sumX_sumY_stratified / var_sumX_forXY_stratified; + sqrt_failsafe_input_8 = var_sumX_forXY_stratified * var_sumY_forXY_stratified; + sqrt_failsafe_output_8 = sqrt_failsafe (sqrt_failsafe_input_8); +corr_XY_stratified = cov_sumX_sumY_stratified / sqrt_failsafe_output_8; +r_sqr_X_vs_Y_stratified = (cov_sumX_sumY_stratified ^ 2) / (var_sumX_forXY_stratified * var_sumY_forXY_stratified); +temp_X_vs_Y_stratified = (1 - r_sqr_X_vs_Y_stratified) / (cnt_XY_with_good_stratumID - num_XY_nonempty_strata - 1) +adj_r_sqr_X_vs_Y_stratified = 1 - temp_X_vs_Y_stratified * (cnt_XY_with_good_stratumID - num_XY_nonempty_strata); + sqrt_failsafe_input_9 = temp_X_vs_Y_stratified * var_sumY_forXY_stratified; +stdev_errY_vs_X_stratified = sqrt_failsafe (sqrt_failsafe_input_9); + sqrt_failsafe_input_10 = sqrt_failsafe_input_9 / var_sumX_forXY_stratified; +stdev_slope_XY_stratified = sqrt_failsafe (sqrt_failsafe_input_10); +fStat_Y_vs_X_stratified = (cnt_XY_with_good_stratumID - num_XY_nonempty_strata - 1) * r_sqr_X_vs_Y_stratified / (1 - r_sqr_X_vs_Y_stratified); +p_val_Y_vs_X_stratified = fStat_tailprob (fStat_Y_vs_X_stratified, ones_XY, cnt_XY_with_good_stratumID - num_XY_nonempty_strata - 1); + +print ("Preparing the output matrix..."); +OutMtx = matrix (0.0, rows = 40, cols = num_attrs_XY); + +OutMtx [ 1, ] = Xcols %*% Proj_X_to_XY; # 1st covariate column number +OutMtx [ 2, ] = cnt_X_global %*% Proj_X_to_XY; # 1st covariate global presence count +OutMtx [ 3, ] = avg_X_global %*% Proj_X_to_XY; # 1st covariate global mean +OutMtx [ 4, ] = stdev_X_global %*% Proj_X_to_XY; # 1st covariate global standard deviation +OutMtx [ 5, ] = stdev_X_stratified %*% 
Proj_X_to_XY; # 1st covariate stratified standard deviation +OutMtx [ 6, ] = r_sqr_X_vs_strata %*% Proj_X_to_XY; # R-squared, 1st covariate vs. strata +OutMtx [ 7, ] = adj_r_sqr_X_vs_strata %*% Proj_X_to_XY; # adjusted R-squared, 1st covariate vs. strata +OutMtx [ 8, ] = p_val_X_vs_strata %*% Proj_X_to_XY; # P-value, 1st covariate vs. strata +OutMtx [11, ] = Ycols %*% Proj_Y_to_XY; # 2nd covariate column number +OutMtx [12, ] = cnt_Y_global %*% Proj_Y_to_XY; # 2nd covariate global presence count +OutMtx [13, ] = avg_Y_global %*% Proj_Y_to_XY; # 2nd covariate global mean +OutMtx [14, ] = stdev_Y_global %*% Proj_Y_to_XY; # 2nd covariate global standard deviation +OutMtx [15, ] = stdev_Y_stratified %*% Proj_Y_to_XY; # 2nd covariate stratified standard deviation +OutMtx [16, ] = r_sqr_Y_vs_strata %*% Proj_Y_to_XY; # R-squared, 2nd covariate vs. strata +OutMtx [17, ] = adj_r_sqr_Y_vs_strata %*% Proj_Y_to_XY; # adjusted R-squared, 2nd covariate vs. strata +OutMtx [18, ] = p_val_Y_vs_strata %*% Proj_Y_to_XY; # P-value, 2nd covariate vs. strata + +OutMtx [21, ] = cnt_XY_global; # Global 1st & 2nd covariate presence count +OutMtx [22, ] = slope_XY_global; # Global regression slope (2nd vs. 1st covariate) +OutMtx [23, ] = stdev_slope_XY_global; # Global regression slope standard deviation +OutMtx [24, ] = corr_XY_global; # Global correlation = +/- sqrt(R-squared) +OutMtx [25, ] = stdev_errY_vs_X_global; # Global residual standard deviation +OutMtx [26, ] = r_sqr_X_vs_Y_global; # Global R-squared +OutMtx [27, ] = adj_r_sqr_X_vs_Y_global; # Global adjusted R-squared +OutMtx [28, ] = p_val_Y_vs_X_global; # Global P-value for hypothesis "slope = 0" +OutMtx [31, ] = cnt_XY_with_good_stratumID; # Stratified 1st & 2nd covariate presence count +OutMtx [32, ] = slope_XY_stratified; # Stratified regression slope (2nd vs. 1st covariate) +OutMtx [33, ] = stdev_slope_XY_stratified; # Stratified regression slope standard deviation +OutMtx [34, ] = corr_XY_stratified; # Stratified correlation = +/- sqrt(R-squared) +OutMtx [35, ] = stdev_errY_vs_X_stratified; # Stratified residual standard deviation +OutMtx [36, ] = r_sqr_X_vs_Y_stratified; # Stratified R-squared +OutMtx [37, ] = adj_r_sqr_X_vs_Y_stratified; # Stratified adjusted R-squared +OutMtx [38, ] = p_val_Y_vs_X_stratified; # Stratified P-value for hypothesis "slope = 0" +OutMtx [39, ] = colSums (ppred (Cnt_XY_per_stratum, 2, ">=")); # Number of strata with at least two counted points + +OutMtx = t(OutMtx); + +print ("Writing the output matrix..."); +write (OutMtx, fileO, format=fmtO); +print ("END STRATIFIED STATISTICS SCRIPT"); + + +fStat_tailprob = function (Matrix[double] fStat, Matrix[double] df_1, Matrix[double] df_2) return (Matrix[double] tailprob) +{ # TEMPORARY IMPLEMENTATION + tailprob = fStat; + for (i in 1:nrow(fStat)) { + for (j in 1:ncol(fStat)) { + q = castAsScalar (fStat [i, j]); + d1 = castAsScalar (df_1 [i, j]); + d2 = castAsScalar (df_2 [i, j]); + if (d1 >= 1 & d2 >= 1 & q >= 0.0) { + tailprob [i, j] = pf(target = q, df1 = d1, df2 = d2, lower.tail=FALSE); + } else { + tailprob [i, j] = 0/0; + } + } } +} + +sqrt_failsafe = function (Matrix[double] input_A) return (Matrix[double] output_A) +{ + mask_A = ppred (input_A, 0.0, ">="); + prep_A = input_A * mask_A; + mask_A = mask_A * ppred (prep_A, prep_A, "=="); + prep_A = replace (target = prep_A, pattern = 0.0/0.0, replacement = 0); + output_A = sqrt (prep_A) / mask_A; +}
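The sqrt_failsafe and fStat_tailprob helpers at the end of stratstats.dml carry the script's guard against invalid inputs: ppred (A, A, "==") yields 1 exactly where a cell is not NaN, and dividing by a 0/1 mask turns every negative or NaN cell into 0/0 = NaN without any control flow. Below is a minimal sketch of that behavior as a hypothetical standalone demo script (not part of this commit); it reuses only builtins that already appear above (matrix, ppred, replace, sqrt, castAsScalar, print).

sqrt_failsafe = function (Matrix[double] input_A) return (Matrix[double] output_A)
{
    # keep cells that are >= 0 ...
    mask_A = ppred (input_A, 0.0, ">=");
    prep_A = input_A * mask_A;
    # ... and not NaN (NaN != NaN, so ppred (x, x, "==") is 0 at NaN cells)
    mask_A = mask_A * ppred (prep_A, prep_A, "==");
    prep_A = replace (target = prep_A, pattern = 0.0/0.0, replacement = 0);
    # valid cells get their square root; invalid cells become 0/0 = NaN
    output_A = sqrt (prep_A) / mask_A;
}

# hypothetical toy input, not taken from the commit
A = matrix (0.0, rows = 1, cols = 3);
A [1, 1] = 4.0;       # valid cell, expect 2
A [1, 2] = -1.0;      # negative cell, expect NaN
A [1, 3] = 0.0 / 0.0; # NaN cell, expect NaN
B = sqrt_failsafe (A);
print ("sqrt_failsafe demo: " + castAsScalar (B [1, 1]) + " " + castAsScalar (B [1, 2]) + " " + castAsScalar (B [1, 3]));

The same masked-division idea is what lets the main script compute stratified standard deviations and regression slopes as single element-wise expressions even when some strata contribute no usable rows.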
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/scripts/datagen/genCorrelatedData.dml
----------------------------------------------------------------------
diff --git a/scripts/datagen/genCorrelatedData.dml b/scripts/datagen/genCorrelatedData.dml
index e81583b..d3289ce 100644
--- a/scripts/datagen/genCorrelatedData.dml
+++ b/scripts/datagen/genCorrelatedData.dml
@@ -1,46 +1,46 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# generates random correlated data
-# can generate any number of variables/columns
-# used to test univariate stats computation
-# by systemml
-
-# $1 is number of variables/columns
-# $2 is number of samples to create
-# $3 is the location to write out the covariance mat
-# $4 is the location to write out the generated data
-dims = $1
-numSamples = $2
-
-U = Rand(rows=dims, cols=dims, min=-1.0, max=1.0, pdf="uniform", seed=0)
-denoms = sqrt(colSums(U*U))
-parfor(i in 1:dims){
-	U[i,] = U[i,] / denoms
-}
-
-C = t(U)%*%U
-write(C, $3, format="binary")
-
-R = Rand(rows=numSamples, cols=dims, pdf="normal", seed=0)
-Rc = R%*%U
-write(Rc, $4, format="binary")
-
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# generates random correlated data
+# can generate any number of variables/columns
+# used to test univariate stats computation
+# by systemml
+
+# $1 is number of variables/columns
+# $2 is number of samples to create
+# $3 is the location to write out the covariance mat
+# $4 is the location to write out the generated data
+dims = $1
+numSamples = $2
+
+U = Rand(rows=dims, cols=dims, min=-1.0, max=1.0, pdf="uniform", seed=0)
+denoms = sqrt(colSums(U*U))
+parfor(i in 1:dims){
+	U[i,] = U[i,] / denoms
+}
+
+C = t(U)%*%U
+write(C, $3, format="binary")
+
+R = Rand(rows=numSamples, cols=dims, pdf="normal", seed=0)
+Rc = R%*%U
+write(Rc, $4, format="binary")
+

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/scripts/datagen/genRandData4ChisquaredTest.dml
----------------------------------------------------------------------
diff --git a/scripts/datagen/genRandData4ChisquaredTest.dml b/scripts/datagen/genRandData4ChisquaredTest.dml
index 42db9dd..e25adf2 100644
--- a/scripts/datagen/genRandData4ChisquaredTest.dml
+++ b/scripts/datagen/genRandData4ChisquaredTest.dml
@@ -1,87 +1,87 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# -#------------------------------------------------------------- - -# generates a two column matrix of categorical -# variables -# used to test systemml's chi-squared bivariate stat -# computation - -# $1 is number of samples to generate -# $2 is number of categories for 1st categorical variable -# $3 is number of categories for 2nd categorical variable -# $4 is the file to write out the chi-squared statistic to -# $5 is the file to write out the generated data to - -numSamples = $1 -numCategories1 = $2 -numCategories2 = $3 - -o = Rand(rows=numCategories1, cols=numCategories2, min=0.0, max=1.0, pdf="uniform", seed=0) -o = o / sum(o) - -probs1 = rowSums(o) -probs1 = probs1 / sum(probs1) -probs2 = colSums(o) -probs2 = probs2 / sum(probs2) -e = probs1 %*% probs2 - -chisquared = sum((o-e)^2/e) -write(chisquared, $4, format="binary") - -oCDF = Rand(rows=numCategories1, cols=numCategories2, min=0.0, max=0.0, pdf="uniform", seed=0) -for(i in 1:numCategories1){ - for(j in 1:numCategories2){ - if(i==1 & j==1){ - oCDF[i,j] = o[1,1] - } - if(i != 1 & j == 1){ - oCDF[i,j] = oCDF[i-1,numCategories2] + o[i,j] - } - if(j > 1){ - oCDF[i,j] = oCDF[i,j-1] + o[i,j] - } - } -} - -one = Rand(rows=1, cols=1, min=1.0, max=1.0, pdf="uniform", seed=0) -data = Rand(rows=numSamples, cols=2, min=0.0, max=0.0, pdf="uniform", seed=0) -parfor(s in 1:numSamples){ - r_mat = Rand(rows=1, cols=1, min=0.0, max=1.0, pdf="uniform", seed=0) - r = castAsScalar(r_mat) - - cat1 = -1 - cat2 = -1 - continue = 1 - for(i in 1:numCategories1){ - for(j in 1:numCategories2){ - cdf = castAsScalar(oCDF[i,j]) - if(continue == 1 & r <= cdf){ - cat1 = i - cat2 = j - continue = 0 - } - } - } - - data[s,1] = cat1*one - data[s,2] = cat2*one -} -write(data, $5, format="binary") +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +#------------------------------------------------------------- + +# generates a two column matrix of categorical +# variables +# used to test systemml's chi-squared bivariate stat +# computation + +# $1 is number of samples to generate +# $2 is number of categories for 1st categorical variable +# $3 is number of categories for 2nd categorical variable +# $4 is the file to write out the chi-squared statistic to +# $5 is the file to write out the generated data to + +numSamples = $1 +numCategories1 = $2 +numCategories2 = $3 + +o = Rand(rows=numCategories1, cols=numCategories2, min=0.0, max=1.0, pdf="uniform", seed=0) +o = o / sum(o) + +probs1 = rowSums(o) +probs1 = probs1 / sum(probs1) +probs2 = colSums(o) +probs2 = probs2 / sum(probs2) +e = probs1 %*% probs2 + +chisquared = sum((o-e)^2/e) +write(chisquared, $4, format="binary") + +oCDF = Rand(rows=numCategories1, cols=numCategories2, min=0.0, max=0.0, pdf="uniform", seed=0) +for(i in 1:numCategories1){ + for(j in 1:numCategories2){ + if(i==1 & j==1){ + oCDF[i,j] = o[1,1] + } + if(i != 1 & j == 1){ + oCDF[i,j] = oCDF[i-1,numCategories2] + o[i,j] + } + if(j > 1){ + oCDF[i,j] = oCDF[i,j-1] + o[i,j] + } + } +} + +one = Rand(rows=1, cols=1, min=1.0, max=1.0, pdf="uniform", seed=0) +data = Rand(rows=numSamples, cols=2, min=0.0, max=0.0, pdf="uniform", seed=0) +parfor(s in 1:numSamples){ + r_mat = Rand(rows=1, cols=1, min=0.0, max=1.0, pdf="uniform", seed=0) + r = castAsScalar(r_mat) + + cat1 = -1 + cat2 = -1 + continue = 1 + for(i in 1:numCategories1){ + for(j in 1:numCategories2){ + cdf = castAsScalar(oCDF[i,j]) + if(continue == 1 & r <= cdf){ + cat1 = i + cat2 = j + continue = 0 + } + } + } + + data[s,1] = cat1*one + data[s,2] = cat2*one +} +write(data, $5, format="binary") http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/scripts/datagen/genRandData4DecisionTree1.dml ---------------------------------------------------------------------- diff --git a/scripts/datagen/genRandData4DecisionTree1.dml b/scripts/datagen/genRandData4DecisionTree1.dml index 3ef9067..b679783 100644 --- a/scripts/datagen/genRandData4DecisionTree1.dml +++ b/scripts/datagen/genRandData4DecisionTree1.dml @@ -1,39 +1,39 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# -#------------------------------------------------------------- - - -XCatFile = $XCat; -YFile = $Y; -num_records = $num_records; -num_cat_features = $num_cat; -num_class = $num_class; -num_distinct = $num_distinct; -sparsity = $sp; - -# generate class labels -Y = floor (rand (rows = num_records, cols = 1, min = 1, max = num_class + 0.99999999999999)); -Y_bin = table (seq (1, num_records), Y); -write (Y_bin, YFile); - -# generate categorical features -X_cat = floor (rand (rows = num_records, cols = num_cat_features, min = 1, max = num_distinct + 0.99999999999999, sparsity = sparsity)); -write (X_cat, XCatFile, format = "csv"); - +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + + +XCatFile = $XCat; +YFile = $Y; +num_records = $num_records; +num_cat_features = $num_cat; +num_class = $num_class; +num_distinct = $num_distinct; +sparsity = $sp; + +# generate class labels +Y = floor (rand (rows = num_records, cols = 1, min = 1, max = num_class + 0.99999999999999)); +Y_bin = table (seq (1, num_records), Y); +write (Y_bin, YFile); + +# generate categorical features +X_cat = floor (rand (rows = num_records, cols = num_cat_features, min = 1, max = num_distinct + 0.99999999999999, sparsity = sparsity)); +write (X_cat, XCatFile, format = "csv"); + http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/scripts/datagen/genRandData4DecisionTree2.dml ---------------------------------------------------------------------- diff --git a/scripts/datagen/genRandData4DecisionTree2.dml b/scripts/datagen/genRandData4DecisionTree2.dml index 85d3ad0..cc8341c 100644 --- a/scripts/datagen/genRandData4DecisionTree2.dml +++ b/scripts/datagen/genRandData4DecisionTree2.dml @@ -1,40 +1,40 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# -#------------------------------------------------------------- - - -transformPath = $tPath; -transformSpec = $tSpec; -XCatFile = $XCat; -XFile = $X; -num_records = $num_records; -num_scale_features = $num_scale; -sparsity = $sp; -fmt = $fmt; - -# generate scale features -X_scale = rand (rows = num_records, cols = num_scale_features, min = 0, max = 10, sparsity = sparsity); - -# transform categorical features -XCF = read (XCatFile); -X_cat_transformed = transform (target = XCF, transformSpec = transformSpec, transformPath = transformPath); - -X = append (X_scale, X_cat_transformed); -write (X, XFile, format = fmt); +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + + +transformPath = $tPath; +transformSpec = $tSpec; +XCatFile = $XCat; +XFile = $X; +num_records = $num_records; +num_scale_features = $num_scale; +sparsity = $sp; +fmt = $fmt; + +# generate scale features +X_scale = rand (rows = num_records, cols = num_scale_features, min = 0, max = 10, sparsity = sparsity); + +# transform categorical features +XCF = read (XCatFile); +X_cat_transformed = transform (target = XCF, transformSpec = transformSpec, transformPath = transformPath); + +X = append (X_scale, X_cat_transformed); +write (X, XFile, format = fmt); http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/scripts/datagen/genRandData4FTest.dml ---------------------------------------------------------------------- diff --git a/scripts/datagen/genRandData4FTest.dml b/scripts/datagen/genRandData4FTest.dml index 91e7c50..bdd33b9 100644 --- a/scripts/datagen/genRandData4FTest.dml +++ b/scripts/datagen/genRandData4FTest.dml @@ -1,95 +1,95 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# -#------------------------------------------------------------- - -# generates random data for F-test -# -# $1 is number of groups (some of -# which may share a gaussian) -# $2 is number of actual groups -# $3 is number of points -# $4 is mean of the gaussian means -# $5 is mean of the gaussian std. deviations -# $6 is file to store computed f-statistic -# $7 is file to store generated data - -numGroups = $1 -numActualGroups = $2 -numSamples = $3 -meanOfMeans = $4 -meanOfStddevs = $5 - -cntProbs = Rand(rows=numGroups, cols=1, min=0.0, max=1.0, pdf="uniform", seed=0) -cntProbs = cntProbs/sum(cntProbs) -cntArr = round(cntProbs * numSamples) -last_cnt = cntArr[numGroups,1] -cntArr[numGroups,1] = numSamples - (sum(cntArr) - last_cnt) - -permut = Rand(rows=numActualGroups, cols=numGroups, min=0.0, max=0.0, pdf="uniform") -ones = Rand(rows=numActualGroups, cols=1, min=1.0, max=1.0, pdf="uniform") -permut[,1:numActualGroups] = diag(ones) - -one = Rand(rows=1, cols=1, min=1.0, max=1.0, pdf="uniform") -copy_start_index = numActualGroups+1 -parfor(i in copy_start_index:numGroups){ - r = Rand(rows=1, cols=1, min=1.0, max=numActualGroups, pdf="uniform", seed=0) - j = castAsScalar(round(r)) - permut[j,i] = one -} - -means_std = Rand(rows=numActualGroups, cols=1, pdf="normal", seed=0) -abs_means = means_std + meanOfMeans -means = t(t(abs_means) %*% permut) - -stddevs_std = Rand(rows=numActualGroups, cols=1, pdf="normal", seed=0) -abs_stddevs = stddevs_std + meanOfStddevs -stddevs = t(t(abs_stddevs) %*% permut) - -overall_mean = sum(means*cntArr)/numSamples - -explained_variance = sum(cntArr * (means - overall_mean)^2) / (numGroups-1.0) -unexplained_variance = sum(cntArr * stddevs^2) / (numSamples - numGroups) -f = explained_variance / unexplained_variance -write(f, $6, format="binary") - -cntCDFs = cntProbs -for(i in 2:numGroups){ - cntCDFs[i,1] = cntCDFs[i-1,1] + cntProbs[i,1] -} - -data = Rand(rows=numSamples, cols=1, min=0.0, max=0.0, pdf="uniform") -parfor(i in 1:numSamples){ - r_mat = Rand(rows=1, cols=1, min=0.0, max=1.0, pdf="uniform", seed=0) - r1 = castAsScalar(r_mat) - - g = -1 - continue = 1 - for(k in 1:numGroups){ - cdf = castAsScalar(cntCDFs[k,1]) - if(continue==1 & r1<=cdf){ - g = k - continue=0 - } - } - - point = Rand(rows=1, cols=1, pdf="normal", seed=0) - data[i,1] = point*stddevs[g,1] + means[g,1] -} -write(data, $7, format="binary") +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# generates random data for F-test +# +# $1 is number of groups (some of +# which may share a gaussian) +# $2 is number of actual groups +# $3 is number of points +# $4 is mean of the gaussian means +# $5 is mean of the gaussian std. 
deviations +# $6 is file to store computed f-statistic +# $7 is file to store generated data + +numGroups = $1 +numActualGroups = $2 +numSamples = $3 +meanOfMeans = $4 +meanOfStddevs = $5 + +cntProbs = Rand(rows=numGroups, cols=1, min=0.0, max=1.0, pdf="uniform", seed=0) +cntProbs = cntProbs/sum(cntProbs) +cntArr = round(cntProbs * numSamples) +last_cnt = cntArr[numGroups,1] +cntArr[numGroups,1] = numSamples - (sum(cntArr) - last_cnt) + +permut = Rand(rows=numActualGroups, cols=numGroups, min=0.0, max=0.0, pdf="uniform") +ones = Rand(rows=numActualGroups, cols=1, min=1.0, max=1.0, pdf="uniform") +permut[,1:numActualGroups] = diag(ones) + +one = Rand(rows=1, cols=1, min=1.0, max=1.0, pdf="uniform") +copy_start_index = numActualGroups+1 +parfor(i in copy_start_index:numGroups){ + r = Rand(rows=1, cols=1, min=1.0, max=numActualGroups, pdf="uniform", seed=0) + j = castAsScalar(round(r)) + permut[j,i] = one +} + +means_std = Rand(rows=numActualGroups, cols=1, pdf="normal", seed=0) +abs_means = means_std + meanOfMeans +means = t(t(abs_means) %*% permut) + +stddevs_std = Rand(rows=numActualGroups, cols=1, pdf="normal", seed=0) +abs_stddevs = stddevs_std + meanOfStddevs +stddevs = t(t(abs_stddevs) %*% permut) + +overall_mean = sum(means*cntArr)/numSamples + +explained_variance = sum(cntArr * (means - overall_mean)^2) / (numGroups-1.0) +unexplained_variance = sum(cntArr * stddevs^2) / (numSamples - numGroups) +f = explained_variance / unexplained_variance +write(f, $6, format="binary") + +cntCDFs = cntProbs +for(i in 2:numGroups){ + cntCDFs[i,1] = cntCDFs[i-1,1] + cntProbs[i,1] +} + +data = Rand(rows=numSamples, cols=1, min=0.0, max=0.0, pdf="uniform") +parfor(i in 1:numSamples){ + r_mat = Rand(rows=1, cols=1, min=0.0, max=1.0, pdf="uniform", seed=0) + r1 = castAsScalar(r_mat) + + g = -1 + continue = 1 + for(k in 1:numGroups){ + cdf = castAsScalar(cntCDFs[k,1]) + if(continue==1 & r1<=cdf){ + g = k + continue=0 + } + } + + point = Rand(rows=1, cols=1, pdf="normal", seed=0) + data[i,1] = point*stddevs[g,1] + means[g,1] +} +write(data, $7, format="binary") http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/scripts/datagen/genRandData4Kmeans.dml ---------------------------------------------------------------------- diff --git a/scripts/datagen/genRandData4Kmeans.dml b/scripts/datagen/genRandData4Kmeans.dml index 7abcee4..fe50ac5 100644 --- a/scripts/datagen/genRandData4Kmeans.dml +++ b/scripts/datagen/genRandData4Kmeans.dml @@ -1,120 +1,120 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# -#------------------------------------------------------------- - -# -# Generates random Gaussian-mixture data to test k-Means clustering algorithms -# -# INPUT PARAMETERS: -# ---------------------------------------------------------------------------- -# NAME TYPE DEFAULT MEANING -# ---------------------------------------------------------------------------- -# nr Int --- Number of records -# nf Int --- Number of features -# nc Int --- Number of clusters -# dc Double --- St.dev. of cluster "centroid" features from zero mean -# dr Double --- St.dev. of the 1-st feature in a record within cluster -# fbf Double --- Feature bias factor: Stdev(last) / Stdev(1-st) feature -# cbf Double --- Cluster bias factor: Prob[1-st clus] / Prob[k-th clus] -# X String --- Location to write matrix X with generated data records -# C String --- Location to write cluster "centroids" (Gaussian means) -# Y String --- Location to write assignment of records to cluster ids -# YbyC String --- Location to write rec-cluster assigns by min-dist to C -# ---------------------------------------------------------------------------- -# -# Example: -# hadoop jar SystemML.jar -f genRandData4Kmeans.dml -nvargs nr=100000 nf=100 -# nc=10 dc=10.0 dr=1.0 fbf=100.0 cbf=100.0 X=X.mtx C=C.mtx Y=Y.mtx YbyC=YbyC.mtx - -print ("BEGIN K-MEANS GENERATOR SCRIPT"); - -num_records = $nr; -num_features = $nf; -num_centroids = $nc; -dist_per_feature_centroids = $dc; -dist_per_feature_first_record = $dr; -feature_bias_factor = $fbf; -cluster_bias_factor = $cbf; - -fileX = ifdef ($X, "X"); -fileC = ifdef ($C, "C"); -fileY = ifdef ($Y, "Y"); -fileYbyC = ifdef ($YbyC, "YbyC"); -fmt = ifdef ($fmt, "text"); - -print ("Generating cluster distribution (mixture) centroids..."); - -C = Rand (rows = num_centroids, cols = num_features, pdf = "normal"); -C = C * dist_per_feature_centroids; - -print ("Generating record-to-cluster assignments..."); - -# Y is a multinomial in {1, ..., num_centroids} with 1 being more likely -# than "num_centroids" by the factor of "cluster_bias_factor" - -rnd = Rand (rows = num_records, cols = 1, min = 0.0, max = 1.0, pdf = "uniform"); -if (cluster_bias_factor == 1.0) { - Y = round (0.5 + rnd * num_centroids); -} else { - rnd_scaled = rnd * (1 - cluster_bias_factor ^ (- num_centroids / (num_centroids - 1))); - Y = round (0.5 - (num_centroids - 1) * log (1 - rnd_scaled) / log (cluster_bias_factor)); -} - -print ("Generating within-cluster random shifts..."); - -X_shift = Rand (rows = num_records, cols = num_features, pdf = "normal"); -feature_factors = dist_per_feature_first_record * - exp ((seq (1, num_features) - 1) / (num_features - 1) * log (feature_bias_factor)); -X_shift = X_shift %*% diag (feature_factors); - -print ("Generating records by shifting from centroids..."); - -Y_bitmap_raw = table (seq (1, num_records), Y); -Y_bitmap = matrix (0, rows = num_records, cols = num_centroids); -Y_bitmap [, 1 : ncol (Y_bitmap_raw)] = Y_bitmap_raw; -X = Y_bitmap %*% C + X_shift; - -print ("Computing record-to-cluster assignments by minimum centroid distance..."); - -D = t(t(-2 * (X %*% t(C))) + rowSums (C ^ 2)); -P = ppred (D, rowMins (D), "<="); -aggr_P = t(cumsum (t(P))); -Y_by_C = rowSums (ppred (aggr_P, 0, "==")) + 1; - -print ("Computing useful statistics..."); - -sumXsq = sum (X ^ 2); -default_wcss = sumXsq - sum (colSums (X) ^ 2) / num_records; -attained_wcss = sumXsq + sum (rowMins (D)); - -print ("Default (single-cluster) WCSS = " + default_wcss); -print (num_centroids + "-cluster WCSS attained by the mixture centroids 
= " + attained_wcss); - -print ("Writing out the resulting dataset..."); - -write (X, fileX, format = fmt); -write (C, fileC, format = fmt); -write (Y, fileY, format = fmt); -write (Y_by_C, fileYbyC, format = fmt); - -print ("Please run the scoring script to compare " + fileY + " with " + fileYbyC); - -print ("DONE: K-MEANS GENERATOR SCRIPT"); - +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# +# Generates random Gaussian-mixture data to test k-Means clustering algorithms +# +# INPUT PARAMETERS: +# ---------------------------------------------------------------------------- +# NAME TYPE DEFAULT MEANING +# ---------------------------------------------------------------------------- +# nr Int --- Number of records +# nf Int --- Number of features +# nc Int --- Number of clusters +# dc Double --- St.dev. of cluster "centroid" features from zero mean +# dr Double --- St.dev. of the 1-st feature in a record within cluster +# fbf Double --- Feature bias factor: Stdev(last) / Stdev(1-st) feature +# cbf Double --- Cluster bias factor: Prob[1-st clus] / Prob[k-th clus] +# X String --- Location to write matrix X with generated data records +# C String --- Location to write cluster "centroids" (Gaussian means) +# Y String --- Location to write assignment of records to cluster ids +# YbyC String --- Location to write rec-cluster assigns by min-dist to C +# ---------------------------------------------------------------------------- +# +# Example: +# hadoop jar SystemML.jar -f genRandData4Kmeans.dml -nvargs nr=100000 nf=100 +# nc=10 dc=10.0 dr=1.0 fbf=100.0 cbf=100.0 X=X.mtx C=C.mtx Y=Y.mtx YbyC=YbyC.mtx + +print ("BEGIN K-MEANS GENERATOR SCRIPT"); + +num_records = $nr; +num_features = $nf; +num_centroids = $nc; +dist_per_feature_centroids = $dc; +dist_per_feature_first_record = $dr; +feature_bias_factor = $fbf; +cluster_bias_factor = $cbf; + +fileX = ifdef ($X, "X"); +fileC = ifdef ($C, "C"); +fileY = ifdef ($Y, "Y"); +fileYbyC = ifdef ($YbyC, "YbyC"); +fmt = ifdef ($fmt, "text"); + +print ("Generating cluster distribution (mixture) centroids..."); + +C = Rand (rows = num_centroids, cols = num_features, pdf = "normal"); +C = C * dist_per_feature_centroids; + +print ("Generating record-to-cluster assignments..."); + +# Y is a multinomial in {1, ..., num_centroids} with 1 being more likely +# than "num_centroids" by the factor of "cluster_bias_factor" + +rnd = Rand (rows = num_records, cols = 1, min = 0.0, max = 1.0, pdf = "uniform"); +if (cluster_bias_factor == 1.0) { + Y = round (0.5 + rnd * num_centroids); +} else { + rnd_scaled = rnd * (1 - cluster_bias_factor ^ (- num_centroids / (num_centroids - 1))); + Y = round (0.5 - 
(num_centroids - 1) * log (1 - rnd_scaled) / log (cluster_bias_factor)); +} + +print ("Generating within-cluster random shifts..."); + +X_shift = Rand (rows = num_records, cols = num_features, pdf = "normal"); +feature_factors = dist_per_feature_first_record * + exp ((seq (1, num_features) - 1) / (num_features - 1) * log (feature_bias_factor)); +X_shift = X_shift %*% diag (feature_factors); + +print ("Generating records by shifting from centroids..."); + +Y_bitmap_raw = table (seq (1, num_records), Y); +Y_bitmap = matrix (0, rows = num_records, cols = num_centroids); +Y_bitmap [, 1 : ncol (Y_bitmap_raw)] = Y_bitmap_raw; +X = Y_bitmap %*% C + X_shift; + +print ("Computing record-to-cluster assignments by minimum centroid distance..."); + +D = t(t(-2 * (X %*% t(C))) + rowSums (C ^ 2)); +P = ppred (D, rowMins (D), "<="); +aggr_P = t(cumsum (t(P))); +Y_by_C = rowSums (ppred (aggr_P, 0, "==")) + 1; + +print ("Computing useful statistics..."); + +sumXsq = sum (X ^ 2); +default_wcss = sumXsq - sum (colSums (X) ^ 2) / num_records; +attained_wcss = sumXsq + sum (rowMins (D)); + +print ("Default (single-cluster) WCSS = " + default_wcss); +print (num_centroids + "-cluster WCSS attained by the mixture centroids = " + attained_wcss); + +print ("Writing out the resulting dataset..."); + +write (X, fileX, format = fmt); +write (C, fileC, format = fmt); +write (Y, fileY, format = fmt); +write (Y_by_C, fileYbyC, format = fmt); + +print ("Please run the scoring script to compare " + fileY + " with " + fileYbyC); + +print ("DONE: K-MEANS GENERATOR SCRIPT"); + http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/scripts/datagen/genRandData4LinearRegression.dml ---------------------------------------------------------------------- diff --git a/scripts/datagen/genRandData4LinearRegression.dml b/scripts/datagen/genRandData4LinearRegression.dml index f0d214e..b257804 100644 --- a/scripts/datagen/genRandData4LinearRegression.dml +++ b/scripts/datagen/genRandData4LinearRegression.dml @@ -1,61 +1,61 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -# generates data to test linear regression - -# $1 is number of samples -# $2 is number of features (independent variables) -# $3 is maximum feature value (absolute value) -# $4 is maximum weight (absolute value) -# $5 is location to store generated weights -# $6 is location to store generated data -# $7 is location to store generated labels -# $8 is 0/1. 
0 suppresses noise, 1 will add noise to Y -# $9 is b, 0 disables intercept -# $10 controls sparsity in the generated data -# $11 output format - -numSamples = $1 -numFeatures = $2 -maxFeatureValue = $3 -maxWeight = $4 -addNoise = $8 -b = $9 -fmt = $11 - -X = Rand(rows=numSamples, cols=numFeatures, min=-1, max=1, pdf="uniform", seed=0, sparsity=$10) -w = Rand(rows=numFeatures, cols=1, min=-1, max=1, pdf="uniform", seed=0) -X = X * maxFeatureValue -w = w * maxWeight -Y = X %*% w - -if(b!=0) { - b_mat = Rand(rows=1, cols=1, min=b, max=b, pdf="uniform") - w = t(append(t(w), b_mat)) - Y = Y + b -} - -noise = Rand(rows=numSamples, cols=1, pdf="normal", seed=0) -Y = Y + addNoise*noise - -write(w, $5, format=fmt) -write(X, $6, format=fmt) -write(Y, $7, format=fmt) +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# generates data to test linear regression + +# $1 is number of samples +# $2 is number of features (independent variables) +# $3 is maximum feature value (absolute value) +# $4 is maximum weight (absolute value) +# $5 is location to store generated weights +# $6 is location to store generated data +# $7 is location to store generated labels +# $8 is 0/1. 0 suppresses noise, 1 will add noise to Y +# $9 is b, 0 disables intercept +# $10 controls sparsity in the generated data +# $11 output format + +numSamples = $1 +numFeatures = $2 +maxFeatureValue = $3 +maxWeight = $4 +addNoise = $8 +b = $9 +fmt = $11 + +X = Rand(rows=numSamples, cols=numFeatures, min=-1, max=1, pdf="uniform", seed=0, sparsity=$10) +w = Rand(rows=numFeatures, cols=1, min=-1, max=1, pdf="uniform", seed=0) +X = X * maxFeatureValue +w = w * maxWeight +Y = X %*% w + +if(b!=0) { + b_mat = Rand(rows=1, cols=1, min=b, max=b, pdf="uniform") + w = t(append(t(w), b_mat)) + Y = Y + b +} + +noise = Rand(rows=numSamples, cols=1, pdf="normal", seed=0) +Y = Y + addNoise*noise + +write(w, $5, format=fmt) +write(X, $6, format=fmt) +write(Y, $7, format=fmt)
