Shafaq-Siddiqi commented on a change in pull request #976: URL: https://github.com/apache/systemds/pull/976#discussion_r455923145
########## File path: scripts/builtin/gmm.dml ########## @@ -0,0 +1,418 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# ------------------------------------------ +# Gaussian Mixture Model +# ------------------------------------------ + +# INPUT PARAMETERS: +# --------------------------------------------------------------------------------------------- +# NAME TYPE DEFAULT MEANING +# --------------------------------------------------------------------------------------------- +# X Double --- Matrix X +# n_components Integer 3 Number of n_components in the Gaussian mixture model +# model String "VVV" "VVV": unequal variance (full),each component has its own general covariance matrix +# "EEE": equal variance (tied), all components share the same general covariance matrix +# "VVI": spherical, unequal volume (diag), each component has its own diagonal covariance matrix +# "VII": spherical, equal volume (spherical), each component has its own single variance +# init_param String "kmeans" initialize weights with "kmeans" or "random" +# iterations Integer 100 Number of iterations +# reg_covar Double 1e-6 regularization parameter for covariance matrix +# tol Double 0.000001 tolerance value for convergence +# --------------------------------------------------------------------------------------------- + + +#Output(s) +# --------------------------------------------------------------------------------------------- +# NAME TYPE DEFAULT MEANING +# --------------------------------------------------------------------------------------------- +# weight Double --- A matrix whose [i,k]th entry is the probability that observation i in the test data belongs to the kth class +# labels Double --- Prediction matrix +# df Integer --- Number of estimated parameters +# bic Double --- Bayesian information criterion for best iteration + + + + +m_gmm = function(Matrix[Double] X, Integer n_components = 1, String model = "VVV", String init_params = "kmeans", Integer iter = 100, Double reg_covar = 1e-6, Double tol = 0.000001, Boolean verbose = FALSE ) +return (Matrix[Double] weights, Matrix[Double] labels, Integer df, Double bic) +{ + + # sanity checks + if(model != "VVV" & model != "EEE" & model != "VVI" & model != "VII") + stop("model not supported, should be in VVV, EEE, VVI, VII"); + + [labels, weights, norm] = fit(X, n_components, model, init_params, iter, reg_covar, tol) + df = estimate_free_param(n_components, ncol(X), model) + bic = getBIC(nrow(X),norm,df) +} + +initialize_param = function(Matrix[Double] X, Integer n_components, String init_params, String model, Double reg_covar, Double tol) +return (Matrix[Double] weight, Matrix[Double] mean, List[Unknown] sigma, List[Unknown] precision_chol) +{ + # create responsibility matrix, resp[n_samples, n_components] + resp = matrix(0, nrow(X), n_components) + if(init_params == "kmeans") + { + [C, Y] = kmeans(X, n_components, 10, 10, tol, FALSE, 25) + resp = resp + t(seq(1,n_components)) + resp = resp == Y + } + else if(init_params == "random") + { + resp = Rand(rows = nrow(X), cols=n_components) + resp = resp/rowSums(resp) + } + else stop("invalid parameter value, expected kmeans or random found "+init_params) + + [weight, mean, sigma, precision_chol] = initialize(X, resp, n_components, model, reg_covar) +} + + +# Matrix/Vector Parameters +# input: (X[n_samples, n_features], resp[n_samples, n_components]) +# output: (weight[n_samples, n_components], mean[n_components, n_features], sigma/prec_chol depends on model type) +initialize = function(Matrix[Double] X, Matrix[Double] resp, Integer n_components, String model, Double reg_covar) +return (Matrix[Double] weight, Matrix[Double] mean, List[Unknown] sigma, List[Unknown] precision_chol) +{ + n = nrow(X) + [weight, mean, sigma] = estimate_gaussian_param(X, resp, n_components, model, reg_covar) + weight = weight/n + precision_chol = compute_precision_cholesky(sigma, model) +} + +estimate_gaussian_param = function(Matrix[Double] X, Matrix[Double] resp, Integer n_components, String model, Double reg_covar) +return (Matrix[Double] weight, Matrix[Double] mean, List[Unknown] sigma) +{ + l = list() + n = nrow(X) + # estimate Gaussian parameter + nk = colSums(resp) + 2.220446049250313e-15 + mu = (t(resp) %*% X) / t(nk) + sigma = list() + if(model == "VVV") + sigma = covariances_VVV(X, resp, mu, nk, reg_covar) + else if(model == "EEE") + sigma = covariances_EEE(X, resp, mu, nk, reg_covar) + else if(model == "VVI") + sigma = covariances_VVI(X, resp, mu, nk, reg_covar) + else if (model == "VII") + sigma = covariances_VII(X, resp, mu, nk, reg_covar) + + weight = nk + mean = mu # n_components * n_features +} + +# Matrix/Vector Parameters/List +# input: (X[n_samples, n_features], resp[n_samples, n_components], mu[n_components, n_features], nk[1, n_components]) +# output: (sigma a list of length = n_components where each item in list is a covariance matrix of (n_features * n_features) dimensions) +covariances_VVV = function(Matrix[Double] X, Matrix[Double] resp, Matrix[Double] mu, Matrix[Double] nk, Double reg_covar) +return(List[Unknown] sigma) +{ + sigma = list() + for(k in 1:nrow(mu)) + { + diff = X - mu[k,] + cov = (t(diff * resp[, k]) %*% diff) / as.scalar(nk[1,k]) + cov = cov + diag(matrix(reg_covar, ncol(cov), 1)) + sigma = append(sigma, cov) + } +} + +# Matrix/Vector Parameters/List +# input: (X[n_samples, n_features], resp[n_samples, n_components], mu[n_components, n_features], nk[1, n_components]) +# output: (sigma a list of length = 1 where item in list is a covariance matrix of (n_features * n_features) dimensions) +covariances_EEE = function(Matrix[Double] X, Matrix[Double] resp, Matrix[Double] mu, Matrix[Double] nk, Double reg_covar) +return(List[Unknown] sigma) +{ + sigma = list() + avgX2 = t(X) %*% X + avgMean = (t(mu) * nk) %*% mu + cov = avgX2 - avgMean + cov = cov / sum(nk) + cov = cov + diag(matrix(reg_covar, ncol(cov), 1)) + sigma = append(sigma, cov) +} + +# Matrix/Vector Parameters/List +# input: (X[n_samples, n_features], resp[n_samples, n_components], mu[n_components, n_features], nk[1, n_components]) +# output: (sigma a list of length = 1 where item in list is a covariance matrix of (n_components * n_features) dimensions) +covariances_VVI = function(Matrix[Double] X, Matrix[Double] resp, Matrix[Double] mu, Matrix[Double] nk, Double reg_covar) +return(List[Unknown] sigma) +{ + sigma = list() + avgX2 = (t(resp) %*% (X*X)) / t(nk) + avgMean = mu ^ 2 + avgMean2 = mu * (t(resp) %*% X) / t(nk) + cov = avgX2 - 2 * avgMean + avgMean2 + reg_covar + sigma = append(sigma, cov) +} + +# Matrix/Vector Parameters/List +# input: (X[n_samples, n_features], resp[n_samples, n_components], mu[n_components, n_features], nk[1, n_components]) +# output: (sigma a list of length = 1 where item in list is a variance value for each component (1* n_components) dimensions) +covariances_VII = function(Matrix[Double] X, Matrix[Double] resp, Matrix[Double] mu, Matrix[Double] nk, Double reg_covar) +return(List[Unknown] sigma) +{ + sigma = list() + avgX2 = (t(resp) %*% (X*X)) / t(nk) + avgMean = mu ^ 2 + avgMean2 = mu * (t(resp) %*% X) / t(nk) + cov = avgX2 - 2 * avgMean + avgMean2 + reg_covar + sigma = list(rowMeans(cov)) +} + +compute_precision_cholesky = function(List[Unknown] sigma, String model) +return (List[Unknown] precision_chol) +{ + precision_chol = list() + + if(model == "VVV") { + comp = length(sigma) + for(k in 1:length(sigma)) { + cov = as.matrix(sigma[k]) + isSPD = checkSPD(cov) + if(isSPD) { + cov_chol = cholesky(cov) + pre_chol = t(inv(cov_chol)) + precision_chol = append(precision_chol, pre_chol) + } else + stop("Fitting the mixture model failed because some components have ill-defined empirical covariance (i.e., singleton matrix or non-symmetric )."+ + "\nTry to decrease the number of components, or increase reg_covar") + } + } + else if(model == "EEE") { + cov = as.matrix(sigma[1]) + isSPD = checkSPD(cov) + if(isSPD) { + cov_chol = cholesky(cov) + pre_chol = t(inv(cov_chol)) + precision_chol = append(precision_chol, pre_chol) + } else + stop("Fitting the mixture model failed because some components have ill-defined empirical covariance (i.e., singleton matrix or non-symmetric)."+ + "\nTry to decrease the number of components, or increase reg_covar") + } + else { + cov = as.matrix(sigma[1]) + if(sum(cov <= 0) > 0) + stop("Fitting the mixture model failed because some components have ill-defined empirical covariance (i.e., singleton matrix or non-symmetric)."+ + "\nTry to decrease the number of components, or increase reg_covar") + else { + precision_chol = append(precision_chol, 1.0/sqrt(cov)) + } + } +} + +# Expectation step +e_step = function(Matrix[Double] X, Matrix[Double] w, Matrix[Double] mu, List[Unknown] precisions_cholesky, String model) +return(Double norm, Matrix[Double] log_resp){ + weighted_log_prob = estimate_weighted_log_prob(X, w, mu, precisions_cholesky, model) + log_prob_norm = logsumexp(weighted_log_prob) + log_resp = weighted_log_prob - log_prob_norm + norm = mean(log_prob_norm) +} + +# maximization Step +m_step = function(Matrix[Double] X, Matrix[Double] log_resp, Integer n_components, String model, Double reg_covar) +return (Matrix[Double] weight, Matrix[Double] mean, List[Unknown] sigma, List[Unknown] precision_chol) { + n = nrow(X) + [weight, mean, sigma] = estimate_gaussian_param(X, exp(log_resp), n_components, model, reg_covar) + weight = weight/n + precision_chol = compute_precision_cholesky(sigma, model) +} + +estimate_weighted_log_prob = function(Matrix[Double] X, Matrix[Double] w, Matrix[Double] mu, List[Unknown] precisions_cholesky, String model) +return (Matrix[Double] weight_log_pro) +{ + weight_log_pro = estimate_log_prob(X, mu, precisions_cholesky, model) + estimate_log_weights(w) +} + +estimate_log_weights = function(Matrix[Double] w) +return (Matrix[Double] log_weight) +{ + log_weight = log(w) +} + +estimate_log_prob = function(Matrix[Double] X, Matrix[Double] mu, List[Unknown] precisions_cholesky, String model) +return (Matrix[Double] log_prob) +{ + log_prob = estimate_log_gaussian_prob( + X, mu, precisions_cholesky, model) +} + +compute_log_det_cholesky = function(List[Unknown] mat_chol, String model, Integer d) +return(Matrix[Double] log_det_cholesky) +{ + comp = length(mat_chol) + + if(model == "VVV") + { + log_det_chol = matrix(0, 1, comp) + for(k in 1:comp) + { + mat = as.matrix(mat_chol[k]) + log_det = sum(log(diag(t(mat)))) # have to take the log of diag elements only + log_det_chol[1,k] = log_det + } + } + else if(model == "EEE") + { + mat = as.matrix(mat_chol[1]) + log_det_chol = as.matrix(sum(log(diag(mat)))) + } + else if(model == "VVI") + { + mat = as.matrix(mat_chol[1]) + log_det_chol = t(rowSums(log(mat))) + } + else if (model == "VII") + { + mat = as.matrix(mat_chol[1]) + log_det_chol = t(d * log(mat)) + } + log_det_cholesky = log_det_chol +} + +estimate_log_gaussian_prob = function(Matrix[Double] X, Matrix[Double] mu, List[Unknown] prec_chol, String model) +return(Matrix[Double] es_log_prob ) # nrow(X) * n_components +{ + n = nrow(X) + d = ncol(X) + n_components = nrow(mu) + + log_det = compute_log_det_cholesky(prec_chol, model, d) + if(model == "VVV") + { + log_prob = matrix(0, n, n_components) + for(k in 1:n_components) + { + prec = as.matrix(prec_chol[k]) + y = X %*% prec - mu[k,] %*% prec # changing here t intro: y = X %*% prec - mu[k,] %*% prec + log_prob[, k] = rowSums(y*y) + } + } + else if(model == "EEE") + { + log_prob = matrix(0, n, n_components) + prec = as.matrix(prec_chol[1]) + for(k in 1:n_components) { + y = X %*% prec - mu[k,] %*% prec + log_prob[, k] = rowSums(y*y) # TODO replace y*y with squared built-in + } + } + else if(model == "VVI") + { + prec = as.matrix(prec_chol[1]) + precisions = prec^2 + bc_matrix = matrix(1,nrow(X), nrow(mu)) + log_prob = (bc_matrix*t(rowSums(mu^2 * precisions)) - + 2. * (X %*% t(mu * precisions)) + + X^2 %*% t(precisions)) + } + else if (model == "VII") + { + prec = as.matrix(prec_chol[1]) + precisions = prec^ 2 + bc_matrix = matrix(1,nrow(X), nrow(mu)) + log_prob = (bc_matrix * t(rowSums(mu^2) * precisions) - + 2 * X %*% t(mu * precisions) + + rowSums(X*X) %*% t(precisions) ) # TODO replace rowSums(X*X) with squared rowNorm() built-in + } + if(ncol(log_det) == 1) + log_det = matrix(1, 1, ncol(log_prob)) * log_det + es_log_prob = -.5 * (d * log(2 * pi) + log_prob) + log_det +} + +logsumexp = function(Matrix[Double] M) # TODO replace with a built-in function logsumexp Review comment: I am planning to do it in few days. ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [email protected]
