This is an automated email from the ASF dual-hosted git repository.
mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/master by this push:
new 2ab37ae [SYSTEMDS-3155] Add missing glmPredict builtin function
2ab37ae is described below
commit 2ab37aeedcad490085dabde88c10948fb76fa95c
Author: Matthias Boehm <[email protected]>
AuthorDate: Sun Oct 10 21:42:48 2021 +0200
[SYSTEMDS-3155] Add missing glmPredict builtin function
This patch adds the missing glmPredict builtin function (by conversion
from the existing algorithm script), adds a test, and changes the
perftest scripts accordingly.
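A minimal usage sketch of the new builtin (illustrative only; X_test, B,
y_test are placeholder inputs for a binomial probit model, matching the
dfam=2 link=3 configuration used by the perftest scripts):

  M = glmPredict(X=X_test, B=B, ytest=y_test, dfam=2, link=3);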
---
scripts/builtin/glmPredict.dml | 419 +++++++++++++++++++++
scripts/builtin/msvm.dml | 2 +-
scripts/perftest/runAll.sh | 2 +-
scripts/perftest/runGLM_binomial_probit.sh | 2 +-
scripts/perftest/runGLM_gamma_log.sh | 2 +-
scripts/perftest/runGLM_poisson_log.sh | 2 +-
scripts/perftest/runLinearRegCG.sh | 2 +-
scripts/perftest/runLinearRegDS.sh | 2 +-
scripts/perftest/runMultiLogReg.sh | 2 +-
.../perftest/scripts/GLM-predict.dml | 24 +-
.../java/org/apache/sysds/common/Builtins.java | 1 +
src/test/scripts/functions/builtin/lmpredict.dml | 5 +
12 files changed, 451 insertions(+), 14 deletions(-)
diff --git a/scripts/builtin/glmPredict.dml b/scripts/builtin/glmPredict.dml
new file mode 100644
index 0000000..46b193d
--- /dev/null
+++ b/scripts/builtin/glmPredict.dml
@@ -0,0 +1,419 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# THIS SCRIPT APPLIES THE ESTIMATED PARAMETERS OF A GLM-TYPE REGRESSION TO A NEW (TEST) DATASET
+
+# INPUT PARAMETERS:
+# ---------------------------------------------------------------------------------------------
+# NAME    TYPE    DEFAULT  MEANING
+# ---------------------------------------------------------------------------------------------
+# X       Matrix  ---      Matrix X of records (feature vectors)
+# B       Matrix  ---      GLM regression parameters (the betas), with dimensions
+#                            ncol(X)   x k: do not add intercept
+#                            ncol(X)+1 x k: add intercept as given by the last B-row
+#                            if k > 1, use only B[, 1] unless it is Multinomial Logit (dfam=3)
+# ytest   Matrix  " "      Response matrix Y, with the following dimensions:
+#                            nrow(X) x 1  : for all distributions (dfam=1 or 2 or 3)
+#                            nrow(X) x 2  : for Binomial (dfam=2) given by (#pos, #neg) counts
+#                            nrow(X) x k+1: for Multinomial (dfam=3) given by category counts
+# dfam    Int     1        GLM distribution family: 1 = Power, 2 = Binomial, 3 = Multinomial Logit
+# vpow    Double  0.0      Power for Variance defined as (mean)^power (ignored if dfam != 1):
+#                            0.0 = Gaussian, 1.0 = Poisson, 2.0 = Gamma, 3.0 = Inverse Gaussian
+# link    Int     0        Link function code: 0 = canonical (depends on distribution), 1 = Power,
+#                            2 = Logit, 3 = Probit, 4 = Cloglog, 5 = Cauchit; ignored if Multinomial
+# lpow    Double  1.0      Power for Link function defined as (mean)^power (ignored if link != 1):
+#                            -2.0 = 1/mu^2, -1.0 = reciprocal, 0.0 = log, 0.5 = sqrt, 1.0 = identity
+# disp    Double  1.0      Dispersion value, when available
+# verbose Boolean TRUE     Print statistics to stdout
+# ---------------------------------------------------------------------------------------------
+
+# OUTPUTS:
+# ---------------------------------------------------------------------------------------------
+# M       Matrix  " "      Matrix M of predicted means/probabilities:
+#                            nrow(X) x 1  : for Power-type distributions (dfam=1)
+#                            nrow(X) x 2  : for Binomial distribution (dfam=2), column 2 is "No"
+#                            nrow(X) x k+1: for Multinomial Logit (dfam=3), col# k+1 is baseline
+# ---------------------------------------------------------------------------------------------
+# Additional statistics are printed one per line, in the following CSV format:
+# NAME,[COLUMN],[SCALED],VALUE
+# ---
+# NAME   is the string identifier for the statistic, see the table below.
+# COLUMN is an optional integer value that specifies the Y-column for per-column statistics;
+#          note that a Binomial/Multinomial one-column Y input is converted into multi-column.
+# SCALED is an optional Boolean value (TRUE or FALSE) that tells us whether or not the input
+#          dispersion parameter (disp) scaling has been applied to this statistic.
+# VALUE  is the value of the statistic.
+# ---
+# NAME                  COLUMN  SCALED  MEANING
+# ---------------------------------------------------------------------------------------------
+# LOGLHOOD_Z                      +     Log-Likelihood Z-score (in st.dev's from mean)
+# LOGLHOOD_Z_PVAL                 +     Log-Likelihood Z-score p-value
+# PEARSON_X2                      +     Pearson residual X^2 statistic
+# PEARSON_X2_BY_DF                +     Pearson X^2 divided by degrees of freedom
+# PEARSON_X2_PVAL                 +     Pearson X^2 p-value
+# DEVIANCE_G2                     +     Deviance from saturated model G^2 statistic
+# DEVIANCE_G2_BY_DF               +     Deviance G^2 divided by degrees of freedom
+# DEVIANCE_G2_PVAL                +     Deviance G^2 p-value
+# AVG_TOT_Y               +             Average of Y column for a single response value
+# STDEV_TOT_Y             +             St.Dev. of Y column for a single response value
+# AVG_RES_Y               +             Average of column residual, i.e. of Y - mean(Y|X)
+# STDEV_RES_Y             +             St.Dev. of column residual, i.e. of Y - mean(Y|X)
+# PRED_STDEV_RES          +       +     Model-predicted St.Dev. of column residual
+# R2                      +             R^2 of Y column residual with bias included
+# ADJUSTED_R2             +             Adjusted R^2 of Y column residual with bias included
+# R2_NOBIAS               +             R^2 of Y column residual with bias subtracted
+# ADJUSTED_R2_NOBIAS      +             Adjusted R^2 of Y column residual with bias subtracted
+# ---------------------------------------------------------------------------------------------
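+#
+# Example (illustrative only; X_test, B, Y_test are placeholder inputs
+# for a fitted binomial probit model):
+#
+#   M = glmPredict(X=X_test, B=B, ytest=Y_test, dfam=2, link=3);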
+
+
+
+m_glmPredict = function(Matrix[Double] X, Matrix[Double] B, Matrix[Double] ytest=matrix(0,0,0),
+ Boolean intercept = FALSE, Integer dfam=1, Double vpow=0.0, Integer link=0, Double lpow=1.0,
+ Double disp=1.0, Boolean verbose=TRUE)
+ return(Matrix[Double] M)
+{
+ dist_type = dfam;
+ link_type = link;
+ link_power = as.double(lpow);
+ var_power = as.double(vpow);
+ dispersion = as.double(disp);
+
+ if (dist_type == 3)
+ link_type = 2;
+ else if (link_type == 0) { # Canonical Link
+ if (dist_type == 1) {
+ link_type = 1;
+ link_power = 1.0 - var_power;
+ }
+ else if (dist_type == 2)
+ link_type = 2;
+ }
+
+ num_records = nrow (X);
+ num_features = ncol (X);
+ B_full = B;
+ if (dist_type == 3) {
+ beta = B_full [1 : ncol (X), ];
+ intercept = B_full [nrow(B_full), ];
+ } else {
+ beta = B_full [1 : ncol (X), 1];
+ intercept = B_full [nrow(B_full), 1];
+ }
+ if (nrow (B_full) == ncol (X)) {
+ intercept = 0.0 * intercept;
+ is_intercept = FALSE;
+ } else {
+ num_features = num_features + 1;
+ is_intercept = TRUE;
+ }
+
+ ones_rec = matrix (1, rows = num_records, cols = 1);
+ linear_terms = X %*% beta + ones_rec %*% intercept;
+ [means, vars] = glm_means_and_vars (linear_terms, dist_type, var_power, link_type, link_power);
+
+ M = means;
+
+ if (nrow(ytest) > 0)
+ {
+ Y = ytest;
+ ones_ctg = matrix (1, rows = ncol(Y), cols = 1);
+
+ # Statistics To Compute:
+ Z_logl = NaN;
+ Z_logl_pValue = NaN;
+ X2_pearson = NaN;
+ df_pearson = -1;
+ G2_deviance = NaN;
+ df_deviance = -1;
+ X2_pearson_pValue = NaN;
+ G2_deviance_pValue = NaN;
+ Z_logl_scaled = NaN;
+ Z_logl_scaled_pValue = NaN;
+ X2_scaled = NaN;
+ X2_scaled_pValue = NaN;
+ G2_scaled = NaN;
+ G2_scaled_pValue = NaN;
+
+ # set Y_counts to avoid 'Initialization of Y_counts depends on if-else execution' warning
+ Y_counts = matrix(0.0, rows=1, cols=1);
+
+ if (dist_type == 1 & link_type == 1) {
+ # POWER DISTRIBUTIONS (GAUSSIAN, POISSON, GAMMA, ETC.)
+ if (link_power == 0) {
+ is_zero_Y = (Y == 0);
+ lt_saturated = log (Y + is_zero_Y) - is_zero_Y / (1.0 - is_zero_Y);
+ }
+ else
+ lt_saturated = Y ^ link_power;
+ Y_counts = ones_rec;
+
+ X2_pearson = sum ((Y - means) ^ 2 / vars);
+ df_pearson = num_records - num_features;
+
+ log_l_part = glm_partial_loglikelihood_for_power_dist_and_link (linear_terms, Y, var_power, link_power);
+ log_l_part_saturated = glm_partial_loglikelihood_for_power_dist_and_link (lt_saturated, Y, var_power, link_power);
+
+ G2_deviance = 2 * sum (log_l_part_saturated) - 2 * sum (log_l_part);
+ df_deviance = num_records - num_features;
+ }
+ else {
+ if (dist_type >= 2) {
+ # BINOMIAL AND MULTINOMIAL DISTRIBUTIONS
+ if (ncol (Y) == 1) {
+ num_categories = ncol (beta) + 1;
+ if (min (Y) <= 0) {
+ # Category labels "0", "-1" etc. are converted into the baseline label
+ Y = Y + (- Y + num_categories) * (Y <= 0);
+ }
+ Y_size = min (num_categories, max(Y));
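+ # one-hot encode the labels via a contingency table of (row index, category)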
+ Y_unsized = table (seq (1, num_records, 1), Y);
+ Y = matrix (0, rows = num_records, cols = num_categories);
+ Y [, 1 : Y_size] = Y_unsized [, 1 : Y_size];
+ Y_counts = ones_rec;
+ } else {
+ Y_counts = rowSums (Y);
+ }
+
+ P = means;
+ zero_Y = (Y == 0);
+ zero_P = (P == 0);
+ ones_ctg = matrix (1, rows = ncol(Y), cols = 1);
+
+ logl_vec = rowSums (Y * log (P + zero_Y) );
+ ent1_vec = rowSums (P * log (P + zero_P) );
+ ent2_vec = rowSums (P * (log (P + zero_P))^2);
+ E_logl = sum (Y_counts * ent1_vec);
+ V_logl = sum (Y_counts * (ent2_vec - ent1_vec ^ 2));
+ Z_logl = (sum (logl_vec) - E_logl) / sqrt (V_logl);
+
+ means = means * (Y_counts %*% t(ones_ctg));
+ vars = vars * (Y_counts %*% t(ones_ctg));
+
+ frac_below_5 = sum (means < 5) / (nrow (means) * ncol (means));
+ frac_below_1 = sum (means < 1) / (nrow (means) * ncol (means));
+
+ if (frac_below_5 > 0.2 | frac_below_1 > 0)
+ print ("WARNING: residual statistics are inaccurate here due to
low cell means.");
+
+ X2_pearson = sum ((Y - means) ^ 2 / means);
+ df_pearson = (num_records - num_features) * (ncol(Y) - 1);
+ G2_deviance = 2 * sum (Y * log ((Y + zero_Y) / (means + zero_Y)));
+ df_deviance = (num_records - num_features) * (ncol(Y) - 1);
+ }}
+
+ if (Z_logl == Z_logl) {
+ Z_logl_absneg = - abs (Z_logl);
+ Z_logl_pValue = 2.0 * pnorm(target = Z_logl_absneg);
+ }
+ if (X2_pearson == X2_pearson & df_pearson > 0)
+ X2_pearson_pValue = pchisq(target = X2_pearson, df = df_pearson, lower.tail=FALSE);
+ if (G2_deviance == G2_deviance & df_deviance > 0)
+ G2_deviance_pValue = pchisq(target = G2_deviance, df = df_deviance, lower.tail=FALSE);
+
+ Z_logl_scaled = Z_logl / sqrt (dispersion);
+ X2_scaled = X2_pearson / dispersion;
+ G2_scaled = G2_deviance / dispersion;
+
+ if (Z_logl_scaled == Z_logl_scaled) {
+ Z_logl_scaled_absneg = - abs (Z_logl_scaled);
+ Z_logl_scaled_pValue = 2.0 * pnorm(target = Z_logl_scaled_absneg);
+ }
+ if (X2_scaled == X2_scaled & df_pearson > 0)
+ X2_scaled_pValue = pchisq(target = X2_scaled, df = df_pearson, lower.tail=FALSE);
+ if (G2_scaled == G2_scaled & df_deviance > 0)
+ G2_scaled_pValue = pchisq(target = G2_scaled, df = df_deviance, lower.tail=FALSE);
+
+ avg_tot_Y = colSums ( Y ) / sum (Y_counts);
+ avg_res_Y = colSums (Y - means) / sum (Y_counts);
+ ss_avg_tot_Y = colSums (( Y - Y_counts %*% avg_tot_Y) ^ 2);
+ ss_res_Y = colSums ((Y - means) ^ 2);
+ ss_avg_res_Y = colSums ((Y - means - Y_counts %*% avg_res_Y) ^ 2);
+ df_ss_res_Y = sum (Y_counts) - num_features;
+ df_ss_avg_res_Y = ifelse(is_intercept, df_ss_res_Y, df_ss_res_Y - 1);
+
+ var_tot_Y = ss_avg_tot_Y / (sum (Y_counts) - 1);
+ if (df_ss_avg_res_Y > 0)
+ var_res_Y = ss_avg_res_Y / df_ss_avg_res_Y;
+ else
+ var_res_Y = matrix (0.0, rows = 1, cols = ncol (Y)) / 0.0;
+ R2_nobias = 1 - ss_avg_res_Y / ss_avg_tot_Y;
+ adjust_R2_nobias = 1 - var_res_Y / var_tot_Y;
+ R2 = 1 - ss_res_Y / ss_avg_tot_Y;
+ if (df_ss_res_Y > 0)
+ adjust_R2 = 1 - (ss_res_Y / df_ss_res_Y) / var_tot_Y;
+ else
+ adjust_R2 = matrix (0.0, rows = 1, cols = ncol (Y)) / 0.0;
+ predicted_avg_var_res_Y = dispersion * colSums (vars) / sum (Y_counts);
+
+ # PREPARING THE OUTPUT CSV STATISTICS FILE
+
+ str = "LOGLHOOD_Z,,FALSE," + Z_logl;
+ str = append (str, "LOGLHOOD_Z_PVAL,,FALSE," + Z_logl_pValue);
+ str = append (str, "PEARSON_X2,,FALSE," + X2_pearson);
+ str = append (str, "PEARSON_X2_BY_DF,,FALSE," + (X2_pearson / df_pearson));
+ str = append (str, "PEARSON_X2_PVAL,,FALSE," + X2_pearson_pValue);
+ str = append (str, "DEVIANCE_G2,,FALSE," + G2_deviance);
+ str = append (str, "DEVIANCE_G2_BY_DF,,FALSE," + (G2_deviance /
df_deviance));
+ str = append (str, "DEVIANCE_G2_PVAL,,FALSE," + G2_deviance_pValue);
+ str = append (str, "LOGLHOOD_Z,,TRUE," + Z_logl_scaled);
+ str = append (str, "LOGLHOOD_Z_PVAL,,TRUE," + Z_logl_scaled_pValue);
+ str = append (str, "PEARSON_X2,,TRUE," + X2_scaled);
+ str = append (str, "PEARSON_X2_BY_DF,,TRUE," + (X2_scaled / df_pearson));
+ str = append (str, "PEARSON_X2_PVAL,,TRUE," + X2_scaled_pValue);
+ str = append (str, "DEVIANCE_G2,,TRUE," + G2_scaled);
+ str = append (str, "DEVIANCE_G2_BY_DF,,TRUE," + (G2_scaled / df_deviance));
+ str = append (str, "DEVIANCE_G2_PVAL,,TRUE," + G2_scaled_pValue);
+
+ for (i in 1:ncol(Y)) {
+ str = append (str, "AVG_TOT_Y," + i + ",," + as.scalar (avg_tot_Y [1,
i]));
+ str = append (str, "STDEV_TOT_Y," + i + ",," + as.scalar (sqrt
(var_tot_Y [1, i])));
+ str = append (str, "AVG_RES_Y," + i + ",," + as.scalar (avg_res_Y [1,
i]));
+ str = append (str, "STDEV_RES_Y," + i + ",," + as.scalar (sqrt
(var_res_Y [1, i])));
+ str = append (str, "PRED_STDEV_RES," + i + ",TRUE," + as.scalar (sqrt
(predicted_avg_var_res_Y [1, i])));
+ str = append (str, "R2," + i + ",," + as.scalar (R2 [1, i]));
+ str = append (str, "ADJUSTED_R2," + i + ",," + as.scalar (adjust_R2 [1,
i]));
+ str = append (str, "R2_NOBIAS," + i + ",," + as.scalar (R2_nobias [1,
i]));
+ str = append (str, "ADJUSTED_R2_NOBIAS," + i + ",," + as.scalar
(adjust_R2_nobias [1, i]));
+ }
+
+ if( verbose )
+ print(str);
+ }
+}
+
+glm_means_and_vars =
+ function (Matrix[double] linear_terms, int dist_type, double var_power, int link_type, double link_power)
+ return (Matrix[double] means, Matrix[double] vars)
+ # NOTE: "vars" represents the variance without dispersion, i.e. the V(mu) function.
+{
+ num_points = nrow (linear_terms);
+ if (dist_type == 1 & link_type == 1) {
+ # POWER DISTRIBUTION
+ if (link_power == 0)
+ y_mean = exp (linear_terms);
+ else if (link_power == 1.0)
+ y_mean = linear_terms;
+ else if (link_power == -1.0)
+ y_mean = 1.0 / linear_terms;
+ else
+ y_mean = linear_terms ^ (1.0 / link_power);
+ if (var_power == 0)
+ var_function = matrix (1.0, rows = num_points, cols = 1);
+ else if (var_power == 1.0)
+ var_function = y_mean;
+ else
+ var_function = y_mean ^ var_power;
+ means = y_mean;
+ vars = var_function;
+ }
+ else if (dist_type == 2 & link_type >= 1 & link_type <= 5) {
+ # BINOMIAL/BERNOULLI DISTRIBUTION
+ y_prob = matrix (0.0, rows = num_points, cols = 2);
+ if(link_type == 1 & link_power == 0) { # Binomial.log
+ y_prob [, 1] = exp (linear_terms);
+ y_prob [, 2] = 1.0 - y_prob [, 1];
+ } else if (link_type == 1 & link_power != 0) { # Binomial.power_nonlog
+ y_prob [, 1] = linear_terms ^ (1.0 / link_power);
+ y_prob [, 2] = 1.0 - y_prob [, 1];
+ } else if (link_type == 2) { # Binomial.logit
+ elt = exp (linear_terms);
+ y_prob [, 1] = elt / (1.0 + elt);
+ y_prob [, 2] = 1.0 / (1.0 + elt);
+ } else if (link_type == 3) { # Binomial.probit
+ sign_lt = 2 * (linear_terms >= 0) - 1;
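+ # compute Phi(lt) = (1 + erf(lt / sqrt(2))) / 2 via the A&S 7.1.26 polynomial
+ # approximation of erf; sign_lt exploits the symmetry erf(-x) = -erf(x)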
+ t_gp = 1.0 / (1.0 + abs (linear_terms) * 0.231641888); # 0.231641888 = 0.3275911 / sqrt (2.0)
+ erf_corr =
+ t_gp * ( 0.254829592
+ + t_gp * (-0.284496736 # "Handbook of Mathematical Functions", ed. by M. Abramowitz and I.A. Stegun,
+ + t_gp * ( 1.421413741 # U.S. Nat-l Bureau of Standards, 10th print (Dec 1972), Sec. 7.1.26, p. 299
+ + t_gp * (-1.453152027
+ + t_gp * 1.061405429)))) * sign_lt * exp (- (linear_terms ^ 2) / 2.0);
+ y_prob [, 1] = (1 + sign_lt) - erf_corr;
+ y_prob [, 2] = (1 - sign_lt) + erf_corr;
+ y_prob = y_prob / 2;
+ } else if (link_type == 4) { # Binomial.cloglog
+ elt = exp (linear_terms);
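+ # guard against underflow: where elt vanishes at double precision, use the
+ # Taylor expansion 1 - exp(-elt) ~ elt * (1 - elt / 2) instead of the direct formula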
+ is_too_small = ((10000000 + elt) == 10000000);
+ y_prob [, 2] = exp (- elt);
+ y_prob [, 1] = (1 - is_too_small) * (1.0 - y_prob [, 2]) + is_too_small * elt * (1.0 - elt / 2);
+ } else if (link_type == 5) { # Binomial.cauchit
+ atan_linear_terms = atan (linear_terms);
+ y_prob [, 1] = 0.5 + atan_linear_terms / pi;
+ y_prob [, 2] = 0.5 - atan_linear_terms / pi;
+ }
+ means = y_prob;
+ ones_ctg = matrix (1, rows = 2, cols = 1);
+ vars = means * (means %*% (1 - diag (ones_ctg)));
+ } else if (dist_type == 3) {
+ # MULTINOMIAL LOGIT DISTRIBUTION
+ elt = exp (linear_terms);
+ ones_pts = matrix (1, rows = num_points, cols = 1);
+ elt = cbind (elt, ones_pts);
+ ones_ctg = matrix (1, rows = ncol (elt), cols = 1);
+ means = elt / (rowSums (elt) %*% t(ones_ctg));
+ vars = means * (means %*% (1 - diag (ones_ctg)));
+ } else {
+ means = matrix (0.0, rows = num_points, cols = 1);
+ vars = matrix (0.0, rows = num_points, cols = 1);
+ }
+}
+
+glm_partial_loglikelihood_for_power_dist_and_link = # Assumes: dist_type == 1 & link_type == 1
+ function (Matrix[double] linear_terms, Matrix[double] Y, double var_power, double link_power)
+ return (Matrix[double] log_l_part)
+{
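+ # exponential-family partial log-likelihood: l(y; theta) = y * theta - b(theta), with
+ # natural parameter theta and cumulant function b derived per distribution/link below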
+ num_records = nrow (Y);
+ if (var_power == 1.0) { # Poisson
+ if (link_power == 0) { # Poisson.log
+ is_natural_parameter_log_zero = (linear_terms == -Inf);
+ natural_parameters = replace (target = linear_terms, pattern = -Inf, replacement = 0);
+ b_cumulant = exp (linear_terms);
+ } else { # Poisson.power_nonlog
+ is_natural_parameter_log_zero = (linear_terms == 0);
+ natural_parameters = log (linear_terms + is_natural_parameter_log_zero) / link_power;
+ b_cumulant = (linear_terms + is_natural_parameter_log_zero) ^ (1.0 / link_power) - is_natural_parameter_log_zero;
+ }
+ is_minus_infinity = (Y > 0) * is_natural_parameter_log_zero;
+ log_l_part = Y * natural_parameters - b_cumulant - is_minus_infinity / (1 - is_minus_infinity);
+ }
+ else {
+ if (var_power == 2.0 & link_power == 0) { # Gamma.log
+ natural_parameters = - exp (- linear_terms);
+ b_cumulant = linear_terms;
+ }
+ else if (var_power == 2.0) { # Gamma.power_nonlog
+ natural_parameters = - linear_terms ^ (- 1.0 / link_power);
+ b_cumulant = log (linear_terms) / link_power;
+ }
+ else if (link_power == 0) { # PowerDist.log
+ natural_parameters = exp (linear_terms * (1.0 - var_power)) / (1.0 - var_power);
+ b_cumulant = exp (linear_terms * (2.0 - var_power)) / (2.0 - var_power);
+ }
+ else { # PowerDist.power_nonlog
+ power_np = (1.0 - var_power) / link_power;
+ natural_parameters = (linear_terms ^ power_np) / (1.0 - var_power);
+ power_cu = (2.0 - var_power) / link_power;
+ b_cumulant = (linear_terms ^ power_cu) / (2.0 - var_power);
+ }
+ log_l_part = Y * natural_parameters - b_cumulant;
+ }
+}
diff --git a/scripts/builtin/msvm.dml b/scripts/builtin/msvm.dml
index d8dbc37..88f26de 100644
--- a/scripts/builtin/msvm.dml
+++ b/scripts/builtin/msvm.dml
@@ -45,7 +45,7 @@
m_msvm = function(Matrix[Double] X, Matrix[Double] Y, Boolean intercept = FALSE,
Double epsilon = 0.001, Double lambda = 1.0, Integer maxIterations = 100,
-Boolean verbose = FALSE)
+ Boolean verbose = FALSE)
return(Matrix[Double] model)
{
if(min(Y) < 0)
diff --git a/scripts/perftest/runAll.sh b/scripts/perftest/runAll.sh
index 22693b7..9886523 100755
--- a/scripts/perftest/runAll.sh
+++ b/scripts/perftest/runAll.sh
@@ -48,7 +48,7 @@ if [ ! -d results ]; then mkdir -p results ; fi
date >> results/times.txt
### Data Generation
-#echo "-- Generating binomial data: " >> results/times.txt;
+echo "-- Generating binomial data: " >> results/times.txt;
./genBinomialData.sh ${CMD} ${TEMPFOLDER} &>> logs/genBinomialData.out
echo "-- Generating multinomial data." >> results/times.txt;
./genMultinomialData.sh ${CMD} ${TEMPFOLDER} &>> logs/genMultinomialData.out
diff --git a/scripts/perftest/runGLM_binomial_probit.sh b/scripts/perftest/runGLM_binomial_probit.sh
index 5068a57..e37872a 100755
--- a/scripts/perftest/runGLM_binomial_probit.sh
+++ b/scripts/perftest/runGLM_binomial_probit.sh
@@ -41,7 +41,7 @@ for i in 0 1 2; do
#predict
tstart=$(date +%s.%N)
- ${CMD} -f ./algorithms/GLM-predict.dml \
+ ${CMD} -f scripts/GLM-predict.dml \
--config conf/SystemDS-config.xml \
--stats \
--nvargs dfam=2 link=3 fmt=csv X=$1_test B=${BASE}/b Y=$2_test M=${BASE}/m O=${BASE}/out.csv
diff --git a/scripts/perftest/runGLM_gamma_log.sh b/scripts/perftest/runGLM_gamma_log.sh
index 787a6c7..6308a50 100755
--- a/scripts/perftest/runGLM_gamma_log.sh
+++ b/scripts/perftest/runGLM_gamma_log.sh
@@ -41,7 +41,7 @@ for i in 0 1 2; do
#predict
tstart=$(date +%s.%N)
- ${CMD} -f ./algorithms/GLM-predict.dml \
+ ${CMD} -f scripts/GLM-predict.dml \
--config conf/SystemDS-config.xml \
--stats \
--nvargs dfam=1 vpow=2.0 link=1 lpow=0.0 fmt=csv X=$1_test B=${BASE}/b Y=$2_test M=${BASE}/m O=${BASE}/out.csv
diff --git a/scripts/perftest/runGLM_poisson_log.sh b/scripts/perftest/runGLM_poisson_log.sh
index f8e1861..698ca65 100755
--- a/scripts/perftest/runGLM_poisson_log.sh
+++ b/scripts/perftest/runGLM_poisson_log.sh
@@ -41,7 +41,7 @@ for i in 0 1 2; do
#predict
tstart=$(date +%s.%N)
- ${CMD} -f ./algorithms/GLM-predict.dml \
+ ${CMD} -f scripts/GLM-predict.dml \
--config conf/SystemDS-config.xml \
--stats \
--nvargs dfam=1 vpow=1.0 link=1 lpow=0.0 fmt=csv X=$1_test B=${BASE}/b Y=$2_test M=${BASE}/m O=${BASE}/out.csv
diff --git a/scripts/perftest/runLinearRegCG.sh b/scripts/perftest/runLinearRegCG.sh
index ae22a12..e3c36b6 100755
--- a/scripts/perftest/runLinearRegCG.sh
+++ b/scripts/perftest/runLinearRegCG.sh
@@ -42,7 +42,7 @@ do
#predict
tstart=$(date +%s.%N)
- ${CMD} -f ./algorithms/GLM-predict.dml \
+ ${CMD} -f scripts/GLM-predict.dml \
--config conf/SystemDS-config.xml \
--stats \
--nvargs dfam=1 link=1 vpow=0.0 lpow=1.0 fmt=csv X=$1_test B=${BASE}/b Y=$2_test M=${BASE}/m O=${BASE}/out.csv
diff --git a/scripts/perftest/runLinearRegDS.sh b/scripts/perftest/runLinearRegDS.sh
index ad4617b..b285aff 100755
--- a/scripts/perftest/runLinearRegDS.sh
+++ b/scripts/perftest/runLinearRegDS.sh
@@ -42,7 +42,7 @@ do
#predict
tstart=$(date +%s.%N)
- ${CMD} -f ./algorithms/GLM-predict.dml \
+ ${CMD} -f scripts/GLM-predict.dml \
--config conf/SystemDS-config.xml \
--stats \
--nvargs dfam=1 link=1 vpow=0.0 lpow=1.0 fmt=csv X=$1_test B=${BASE}/b Y=$2_test M=${BASE}/m O=${BASE}/out.csv
diff --git a/scripts/perftest/runMultiLogReg.sh b/scripts/perftest/runMultiLogReg.sh
index 9cdbf36..b5503df 100755
--- a/scripts/perftest/runMultiLogReg.sh
+++ b/scripts/perftest/runMultiLogReg.sh
@@ -42,7 +42,7 @@ for i in 0 1 2; do
#predict
tstart=$(date +%s.%N)
- ${CMD} -f ./algorithms/GLM-predict.dml \
+ ${CMD} -f scripts/GLM-predict.dml \
--config conf/SystemDS-config.xml \
--stats \
--nvargs dfam=$DFAM vpow=-1 link=2 lpow=-1 fmt=csv X=$1_test B=${BASE}/b Y=$2_test M=${BASE}/m O=${BASE}/out.csv
diff --git a/src/test/scripts/functions/builtin/lmpredict.dml b/scripts/perftest/scripts/GLM-predict.dml
similarity index 67%
copy from src/test/scripts/functions/builtin/lmpredict.dml
copy to scripts/perftest/scripts/GLM-predict.dml
index d99e7e2..2064204 100644
--- a/src/test/scripts/functions/builtin/lmpredict.dml
+++ b/scripts/perftest/scripts/GLM-predict.dml
@@ -19,9 +19,21 @@
#
#-------------------------------------------------------------
-X = read($1) # Training data
-y = read($2) # response values
-p = read($3) # random data to predict
-w = lmDS(X = X, y = y, icpt = 1, reg = 1e-12)
-p = lmPredict(X = X, B = w, ytest=matrix(0,1,1), icpt = 1)
-write(p, $4)
+X = read($X);
+B = read($B);
+Y = matrix(0,0,0);
+if($Y != " ")
+ Y = read($Y);
+
+dfam = ifdef ($dfam, 1); # $dfam = 1;
+vpow = ifdef ($vpow, 0.0); # $vpow = 0.0;
+link = ifdef ($link, 0); # $link = 0;
+lpow = ifdef ($lpow, 1.0); # $lpow = 1.0;
+disp = ifdef ($disp, 1.0); # $disp = 1.0;
+
+
+[M] = glmPredict(X=X, B=B, ytest=Y,
+ dfam=dfam, vpow=vpow, link=link, lpow=lpow, disp=disp);
+
+if( $M != " " )
+ write(M, $M, format=$fmt);
diff --git a/src/main/java/org/apache/sysds/common/Builtins.java b/src/main/java/org/apache/sysds/common/Builtins.java
index 7788b94..a5ad588 100644
--- a/src/main/java/org/apache/sysds/common/Builtins.java
+++ b/src/main/java/org/apache/sysds/common/Builtins.java
@@ -130,6 +130,7 @@ public enum Builtins {
GAUSSIAN_CLASSIFIER("gaussianClassifier", true),
GET_ACCURACY("getAccuracy", true),
GLM("glm", true),
+ GLM_PREDICT("glmPredict", true),
GMM("gmm", true),
GMM_PREDICT("gmmPredict", true),
GNMF("gnmf", true),
diff --git a/src/test/scripts/functions/builtin/lmpredict.dml b/src/test/scripts/functions/builtin/lmpredict.dml
index d99e7e2..2105f6c 100644
--- a/src/test/scripts/functions/builtin/lmpredict.dml
+++ b/src/test/scripts/functions/builtin/lmpredict.dml
@@ -24,4 +24,9 @@ y = read($2) # response values
p = read($3) # random data to predict
w = lmDS(X = X, y = y, icpt = 1, reg = 1e-12)
p = lmPredict(X = X, B = w, ytest=matrix(0,1,1), icpt = 1)
+p2 = glmPredict(X = X, B = w, dfam=1, link=1, vpow=0.0, lpow=1.0);
+
+if( sum(abs(p2-p) > 1e-8) != 0 )
+ stop("Mismatching lmPredict and glmPredict - no output written");
+
write(p, $4)