Repository: systemml
Updated Branches:
  refs/heads/master 48bfc9e30 -> 25be6a686


[MINOR] Cleanup univariate stats algorithm script (list, ifelse, format)

Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/25be6a68
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/25be6a68
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/25be6a68

Branch: refs/heads/master
Commit: 25be6a68628d02ef73a30e1a08394b1495d45c57
Parents: 48bfc9e
Author: Matthias Boehm <[email protected]>
Authored: Sun Jun 17 13:13:55 2018 -0700
Committer: Matthias Boehm <[email protected]>
Committed: Sun Jun 17 13:13:55 2018 -0700

----------------------------------------------------------------------
 scripts/algorithms/Univar-Stats.dml | 209 ++++++++++++-------------------
 1 file changed, 83 insertions(+), 126 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/25be6a68/scripts/algorithms/Univar-Stats.dml
----------------------------------------------------------------------
diff --git a/scripts/algorithms/Univar-Stats.dml b/scripts/algorithms/Univar-Stats.dml
index fd06d68..9a4597f 100644
--- a/scripts/algorithms/Univar-Stats.dml
+++ b/scripts/algorithms/Univar-Stats.dml
@@ -42,143 +42,100 @@
 
 consoleOutput = ifdef($CONSOLE_OUTPUT, FALSE);
 
-A = read($X); # data file
-K = read($TYPES); # attribute kind file
-
-# number of features/attributes
-n = ncol(A);
-
-# number of data records
-m = nrow(A);
-
-# number of statistics
-numBaseStats = 17; # (14 scale stats, 3 categorical stats)
-
+A = read($X);      # data file
+K = read($TYPES);  # attribute kind file
+n = ncol(A);       # number of features/attributes
+m = nrow(A);       # number of data records
+numBaseStats = 17; # number of statistics (14 scale, 3 categorical)
 max_kind = max(K);
 
 # matrices to store computed statistics
 baseStats = matrix(0, rows=numBaseStats, cols=n);
 
 # Compute max domain size among all categorical attributes
-maxs = colMaxs(A);
-maxDomainSize = max( (K > 1) * maxs );
-maxDomain = as.integer(maxDomainSize);
+maxDomain = as.integer(max((K > 1) * colMaxs(A)));
 
 parfor(i in 1:n, check=0) {
-
-       # project out the i^th column
-       F = A[,i];
-
-       kind = as.scalar(K[1,i]);
-
-       if ( kind == 1 ) {
-               #print("[" + i + "] Scale");
-               # compute SCALE statistics on the projected column
-               minimum = min(F);
-               maximum = max(F);
-               rng = maximum - minimum;
-
-               mu = mean(F);
-               m2 = moment(F, 2);
-               m3 = moment(F, 3);
-               m4 = moment(F, 4);
-
-               var = m/(m-1.0)*m2;
-               std_dev = sqrt(var);
-               se = std_dev/sqrt(m);
-               cv = std_dev/mu;
-
-               g1 = m3/(std_dev^3);
-               g2 = m4/(std_dev^4) - 3;
-               #se_g1=sqrt( 6*m*(m-1.0) / ((m-2.0)*(m+1.0)*(m+3.0)) ); 
-               se_g1=sqrt( (6/(m-2.0)) * (m/(m+1.0)) * ((m-1.0)/(m+3.0)) ); 
-
-               #se_g2= sqrt( (4*(m^2-1)*se_g1^2)/((m+5.0)*(m-3.0)) );  
-               se_g2=sqrt( (4/(m+5.0)) * ((m^2-1)/(m-3.0)) * se_g1^2 ); 
-
-               md = median(F); #quantile(F, 0.5);
-               iqm = interQuartileMean(F);
-
-               # place the computed statistics in output matrices
-               baseStats[1,i] = minimum;
-               baseStats[2,i] = maximum;
-               baseStats[3,i] = rng;
-
-               baseStats[4,i] = mu;
-               baseStats[5,i] = var;
-               baseStats[6,i] = std_dev;
-               baseStats[7,i] = se;
-               baseStats[8,i] = cv;
-
-               baseStats[9,i] = g1;
-               baseStats[10,i] = g2;
-               baseStats[11,i] = se_g1;
-               baseStats[12,i] = se_g2;
-
-               baseStats[13,i] = md;
-               baseStats[14,i] = iqm;
-       }
-       else {
-               if (kind == 2 | kind == 3) {
-                       #print("[" + i + "] Categorical");
-                       
-                       # check if the categorical column has valid values
-                       minF = min(F);
-                       if (minF <=0) {
-                               print("ERROR: Categorical attributes can only take values starting from 1. Encountered a value " + minF + " in attribute " + i);
-                       }
-                       else {
-                               # compute CATEGORICAL statistics on the projected column
-                               num_cat = max(F); # number of categories
-                               cat_counts = table(F,1, maxDomain, 1);  # counts for each category
-
-                               mode = rowIndexMax(t(cat_counts));
-                               mx = max(cat_counts)
-                               modeArr =  (cat_counts == mx)
-                               numModes = sum(modeArr);
-
-                               # place the computed statistics in output matrices
-                               baseStats[15,i] = num_cat;
-                               baseStats[16,i] = mode;
-                               baseStats[17,i] = numModes;
-                       }
-               }
-       }
+  # project out the i^th column
+  F = A[,i];
+
+  kind = as.scalar(K[1,i]);
+  minF = min(F);
+  maxF = max(F);
+
+  if ( kind == 1 ) {
+    # compute SCALE statistics on the projected column
+    rng = maxF - minF;
+
+    mu = mean(F);
+    m2 = moment(F, 2);
+    m3 = moment(F, 3);
+    m4 = moment(F, 4);
+
+    var = m/(m-1.0)*m2;
+    std_dev = sqrt(var);
+    se = std_dev/sqrt(m);
+    cv = std_dev/mu;
+
+    g1 = m3/(std_dev^3);
+    g2 = m4/(std_dev^4) - 3;
+    se_g1=sqrt( (6/(m-2.0)) * (m/(m+1.0)) * ((m-1.0)/(m+3.0)) );
+    se_g2=sqrt( (4/(m+5.0)) * ((m^2-1)/(m-3.0)) * se_g1^2 );
+
+    md = median(F);
+    iqm = interQuartileMean(F);
+
+    baseStats[1:14,i] = as.matrix(list(minF, maxF, rng,
+    mu, var, std_dev, se, cv, g1, g2, se_g1, se_g2, md, iqm));
+  }
+  else {
+    if (kind == 2 | kind == 3) {
+      # check if the categorical column has valid values
+      if( minF <= 0 ) {
+        print("ERROR: Categorical attributes can only take values starting from 1. Encountered a value " + minF + " in attribute " + i);
+      }
+      else {
+        # compute CATEGORICAL statistics on the projected column
+        cat_counts = table(F,1, maxDomain, 1);  # counts for each category
+        mode = as.scalar(rowIndexMax(t(cat_counts)));
+        numModes = sum(cat_counts == max(cat_counts));
+
+        baseStats[15:17,i] = as.matrix(list(maxF, mode, numModes));
+      }
+    }
+  }
 }
 
 if (consoleOutput == TRUE) {
-       for(i in 1:n) {
-               print("-------------------------------------------------");
-               kind = as.scalar(K[1,i]);
-               if (kind == 1) {
-                       print("Feature [" + i + "]: Scale");
-                       print(" (01) Minimum             | " + as.scalar(baseStats[1,i]));
-                       print(" (02) Maximum             | " + as.scalar(baseStats[2,i]));
-                       print(" (03) Range               | " + as.scalar(baseStats[3,i]));
-                       print(" (04) Mean                | " + as.scalar(baseStats[4,i]));
-                       print(" (05) Variance            | " + as.scalar(baseStats[5,i]));
-                       print(" (06) Std deviation       | " + as.scalar(baseStats[6,i]));
-                       print(" (07) Std err of mean     | " + as.scalar(baseStats[7,i]));
-                       print(" (08) Coeff of variation  | " + as.scalar(baseStats[8,i]));
-                       print(" (09) Skewness            | " + as.scalar(baseStats[9,i]));
-                       print(" (10) Kurtosis            | " + as.scalar(baseStats[10,i]));
-                       print(" (11) Std err of skewness | " + as.scalar(baseStats[11,i]));
-                       print(" (12) Std err of kurtosis | " + as.scalar(baseStats[12,i]));
-                       print(" (13) Median              | " + as.scalar(baseStats[13,i]));
-                       print(" (14) Interquartile mean  | " + as.scalar(baseStats[14,i]));
-               } else {
-                       if (kind == 2 | kind == 3) {
-                               if (kind == 2) {
-                                       print("Feature [" + i + "]: Categorical (Nominal)");
-                               } else {
-                                       print("Feature [" + i + "]: Categorical (Ordinal)");
-                               }
-                               print(" (15) Num of categories   | " + as.integer(as.scalar(baseStats[15,i])));
-                               print(" (16) Mode                | " + as.integer(as.scalar(baseStats[16,i])));
-                               print(" (17) Num of modes        | " + as.integer(as.scalar(baseStats[17,i])));
-                       }
-               }
-       }
+  for(i in 1:n) {
+    print("-------------------------------------------------");
+    kind = as.scalar(K[1,i]);
+    if (kind == 1) {
+      print("Feature [" + i + "]: Scale");
+      print(" (01) Minimum             | " + as.scalar(baseStats[1,i]));
+      print(" (02) Maximum             | " + as.scalar(baseStats[2,i]));
+      print(" (03) Range               | " + as.scalar(baseStats[3,i]));
+      print(" (04) Mean                | " + as.scalar(baseStats[4,i]));
+      print(" (05) Variance            | " + as.scalar(baseStats[5,i]));
+      print(" (06) Std deviation       | " + as.scalar(baseStats[6,i]));
+      print(" (07) Std err of mean     | " + as.scalar(baseStats[7,i]));
+      print(" (08) Coeff of variation  | " + as.scalar(baseStats[8,i]));
+      print(" (09) Skewness            | " + as.scalar(baseStats[9,i]));
+      print(" (10) Kurtosis            | " + as.scalar(baseStats[10,i]));
+      print(" (11) Std err of skewness | " + as.scalar(baseStats[11,i]));
+      print(" (12) Std err of kurtosis | " + as.scalar(baseStats[12,i]));
+      print(" (13) Median              | " + as.scalar(baseStats[13,i]));
+      print(" (14) Interquartile mean  | " + as.scalar(baseStats[14,i]));
+    }
+    else if (kind == 2 | kind == 3) {
+      print(ifelse(kind == 2,
+        "Feature [" + i + "]: Categorical (Nominal)",
+        "Feature [" + i + "]: Categorical (Ordinal)"));
+      print(" (15) Num of categories   | " + as.integer(as.scalar(baseStats[15,i])));
+      print(" (16) Mode                | " + as.integer(as.scalar(baseStats[16,i])));
+      print(" (17) Num of modes        | " + as.integer(as.scalar(baseStats[17,i])));
+    }
+  }
 }
 
 write(baseStats, $STATS);

Reply via email to