http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/arima_box-jenkins/arima.dml ---------------------------------------------------------------------- diff --git a/src/test/scripts/applications/arima_box-jenkins/arima.dml b/src/test/scripts/applications/arima_box-jenkins/arima.dml index f9afea9..73052e0 100644 --- a/src/test/scripts/applications/arima_box-jenkins/arima.dml +++ b/src/test/scripts/applications/arima_box-jenkins/arima.dml @@ -1,287 +1,287 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -# Arguments -# 1st arg: X (one column time series) -# 2nd arg: max_func_invoc -# 3rd arg: p (non-seasonal AR order) -# 4th arg: d (non-seasonal differencing order) -# 5th arg: q (non-seasonal MA order) -# 6th arg: P (seasonal AR order) -# 7th arg: D (seasonal differencing order) -# 8th arg: Q (seasonal MA order) -# 9th arg: s (period in terms of number of time-steps) -# 10th arg: 0/1 (1 means include.mean) -# 11th arg: 0 to use CG solver, 1 to use Jacobi's method -# 12th arg: file name to store learnt parameters - -#changing to additive sar since R's arima seems to do that - -arima_css = function(Matrix[Double] w, Matrix[Double] X, Integer pIn, Integer P, Integer qIn, Integer Q, Integer s, Integer useJacobi) return (Double obj){ - b = X[,2:ncol(X)]%*%w - - R = Rand(rows=nrow(X), cols=nrow(X), min=0, max=0) - for(i7 in 1:qIn){ - ma_ind_ns = P+pIn+i7 - err_ind_ns = i7 - ones_ns = Rand(rows=nrow(R)-err_ind_ns, cols=1, min=1, max=1) - d_ns = ones_ns * castAsScalar(w[ma_ind_ns,1]) - R[1+err_ind_ns:nrow(R),1:ncol(R)-err_ind_ns] = R[1+err_ind_ns:nrow(R),1:ncol(R)-err_ind_ns] + diag(d_ns) - } - for(i8 in 1:Q){ - ma_ind_s = P+pIn+qIn+i8 - err_ind_s = s*i8 - ones_s = Rand(rows=nrow(R)-err_ind_s, cols=1, min=1, max=1) - d_s = ones_s * castAsScalar(w[ma_ind_s,1]) - R[1+err_ind_s:nrow(R),1:ncol(R)-err_ind_s] = R[1+err_ind_s:nrow(R),1:ncol(R)-err_ind_s] + diag(d_s) - } - - #checking for strict diagonal dominance - #required for jacobi's method - # - - max_iter = 100 - tol = 0.01 - - y_hat = Rand(rows=nrow(X), cols=1, min=0, max=0) - iter = 0 - - if(useJacobi == 1){ - check = sum(ppred(rowSums(abs(R)), 1, ">=")) - if(check > 0){ - print("R is not diagonal dominant. Suggest switching to an exact solver.") - } - - diff = tol+1.0 - while(iter < max_iter & diff > tol){ - y_hat_new = b - R%*%y_hat - diff = sum((y_hat_new-y_hat)*(y_hat_new-y_hat)) - y_hat = y_hat_new - iter = iter + 1 - } - }else{ - ones = matrix(1, rows=nrow(X), cols=1) - A = R + diag(ones) - Z = t(A)%*%A - y = t(A)%*%b - r = -y - p = -r - norm_r2 = sum(r*r) - continue = 1 - if(norm_r2 == 0){ - continue = 0 - } - while(iter < max_iter & continue == 1){ - q = Z%*%p - alpha = norm_r2 / castAsScalar(t(p) %*% q) - y_hat = y_hat + alpha * p - old_norm_r2 = norm_r2 - r = r + alpha * q - norm_r2 = sum(r * r) - if(norm_r2 < tol){ - continue = 0 - } - beta = norm_r2 / old_norm_r2 - p = -r + beta * p - iter = iter + 1 - } - } - - errs = X[,1] - y_hat - obj = sum(errs*errs) -} - -#input col of time series data -X = read($1) - -max_func_invoc = $2 - -#non-seasonal order -p = $3 -d = $4 -q = $5 - -#seasonal order -P = $6 -D = $7 -Q = $8 - -#length of the season -s = $9 - -include_mean = $10 - -useJacobi = $11 - -num_rows = nrow(X) - -if(num_rows <= d){ - print("non-seasonal differencing order should be larger than length of the time-series") -} - -Y = X -for(i in 1:d){ - n1 = nrow(Y)+0.0 - Y = Y[2:n1,] - Y[1:n1-1,] -} - -num_rows = nrow(Y)+0.0 -if(num_rows <= s*D){ - print("seasonal differencing order should be larger than number of observations divided by length of season") -} - -for(i in 1:D){ - n1 = nrow(Y)+0.0 - Y = Y[s+1:n1,] - Y[1:n1-s,] -} - -n = nrow(Y) - -max_ar_col = P+p -max_ma_col = Q+q -if(max_ar_col > max_ma_col){ - max_arma_col = max_ar_col -}else{ - max_arma_col = max_ma_col -} - -mu = 0 -if(include_mean == 1){ - mu = sum(Y)/nrow(Y) - Y = Y - mu -} - -totcols = 1+p+P+Q+q #target col (X), p-P cols, q-Q cols - -Z = Rand(rows=n, cols=totcols, min=0, max=0) -Z[,1] = Y #target col - -parfor(i1 in 1:p, check=0){ - Z[i1+1:n,1+i1] = Y[1:n-i1,] -} -parfor(i2 in 1:P, check=0){ - Z[s*i2+1:n,1+p+i2] = Y[1:n-s*i2,] -} -parfor(i5 in 1:q, check=0){ - Z[i5+1:n,1+P+p+i5] = Y[1:n-i5,] -} -parfor(i6 in 1:Q, check=0){ - Z[s*i6+1:n,1+P+p+q+i6] = Y[1:n-s*i6,] -} - -one = Rand(rows=1, cols=1, min=1, max=1) - -simplex = Rand(rows=totcols-1, cols=totcols, min=0, max=0) -for(i in 2:ncol(simplex)){ - simplex[i-1,i] = 0.1 -} - -num_func_invoc = 0 - -objvals = Rand(rows=1, cols=ncol(simplex), min=0, max=0) -parfor(i3 in 1:ncol(simplex)){ - arima_css_obj_val = arima_css(simplex[,i3], Z, p, P, q, Q, s, useJacobi) - objvals[1,i3] = arima_css_obj_val -} -num_func_invoc = num_func_invoc + ncol(simplex) - -tol = 1.5 * 10^(-8) * castAsScalar(objvals[1,1]) - -continue = 1 -while(continue == 1 & num_func_invoc <= max_func_invoc) { - best_index = 1 - worst_index = 1 - for(i in 2:ncol(objvals)){ - this = castAsScalar(objvals[1,i]) - that = castAsScalar(objvals[1,best_index]) - if(that > this){ - best_index = i - } - that = castAsScalar(objvals[1,worst_index]) - if(that < this){ - worst_index = i - } - } - - best_obj_val = castAsScalar(objvals[1,best_index]) - worst_obj_val = castAsScalar(objvals[1,worst_index]) - if(worst_obj_val <= best_obj_val + tol){ - continue = 0 - } - - print("#Function calls::" + num_func_invoc + " OBJ: " + best_obj_val) - - c = (rowSums(simplex) - simplex[,worst_index])/(nrow(simplex)) - - x_r = 2*c - simplex[,worst_index] - obj_x_r = arima_css(x_r, Z, p, P, q, Q, s, useJacobi) - num_func_invoc = num_func_invoc + 1 - - if(obj_x_r < best_obj_val){ - x_e = 2*x_r - c - obj_x_e = arima_css(x_e, Z, p, P, q, Q, s, useJacobi) - num_func_invoc = num_func_invoc + 1 - - if(obj_x_r <= obj_x_e){ - simplex[,worst_index] = x_r - objvals[1,worst_index] = obj_x_r - }else{ - simplex[,worst_index] = x_e - objvals[1,worst_index] = obj_x_e - } - }else{ - if(obj_x_r < worst_obj_val){ - simplex[,worst_index] = x_r - objvals[1,worst_index] = obj_x_r - } - - x_c_in = (simplex[,worst_index] + c)/2 - obj_x_c_in = arima_css(x_c_in, Z, p, P, q, Q, s, useJacobi) - num_func_invoc = num_func_invoc + 1 - - if(obj_x_c_in < castAsScalar(objvals[1,worst_index])){ - simplex[,worst_index] = x_c_in - objvals[1,worst_index] = obj_x_c_in - }else{ - if(obj_x_r >= worst_obj_val){ - best_point = simplex[,best_index] - parfor(i4 in 1:ncol(simplex)){ - if(i4 != best_index){ - simplex[,i4] = (simplex[,i4] + best_point)/2 - tmp = arima_css(simplex[,i4], Z, p, P, q, Q, s, useJacobi) - objvals[1,i4] = tmp*one - } - } - num_func_invoc = num_func_invoc + ncol(simplex) - 1 - } - } - } -} - -best_point = simplex[,best_index] -if(include_mean == 1){ - tmp2 = Rand(rows=totcols, cols=1, min=0, max=0) - tmp2[1:nrow(best_point),1] = best_point - tmp2[nrow(tmp2),1] = mu - best_point = tmp2 -} - -write(best_point, $12, format="text") +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# Arguments +# 1st arg: X (one column time series) +# 2nd arg: max_func_invoc +# 3rd arg: p (non-seasonal AR order) +# 4th arg: d (non-seasonal differencing order) +# 5th arg: q (non-seasonal MA order) +# 6th arg: P (seasonal AR order) +# 7th arg: D (seasonal differencing order) +# 8th arg: Q (seasonal MA order) +# 9th arg: s (period in terms of number of time-steps) +# 10th arg: 0/1 (1 means include.mean) +# 11th arg: 0 to use CG solver, 1 to use Jacobi's method +# 12th arg: file name to store learnt parameters + +#changing to additive sar since R's arima seems to do that + +arima_css = function(Matrix[Double] w, Matrix[Double] X, Integer pIn, Integer P, Integer qIn, Integer Q, Integer s, Integer useJacobi) return (Double obj){ + b = X[,2:ncol(X)]%*%w + + R = Rand(rows=nrow(X), cols=nrow(X), min=0, max=0) + for(i7 in 1:qIn){ + ma_ind_ns = P+pIn+i7 + err_ind_ns = i7 + ones_ns = Rand(rows=nrow(R)-err_ind_ns, cols=1, min=1, max=1) + d_ns = ones_ns * castAsScalar(w[ma_ind_ns,1]) + R[1+err_ind_ns:nrow(R),1:ncol(R)-err_ind_ns] = R[1+err_ind_ns:nrow(R),1:ncol(R)-err_ind_ns] + diag(d_ns) + } + for(i8 in 1:Q){ + ma_ind_s = P+pIn+qIn+i8 + err_ind_s = s*i8 + ones_s = Rand(rows=nrow(R)-err_ind_s, cols=1, min=1, max=1) + d_s = ones_s * castAsScalar(w[ma_ind_s,1]) + R[1+err_ind_s:nrow(R),1:ncol(R)-err_ind_s] = R[1+err_ind_s:nrow(R),1:ncol(R)-err_ind_s] + diag(d_s) + } + + #checking for strict diagonal dominance + #required for jacobi's method + # + + max_iter = 100 + tol = 0.01 + + y_hat = Rand(rows=nrow(X), cols=1, min=0, max=0) + iter = 0 + + if(useJacobi == 1){ + check = sum(ppred(rowSums(abs(R)), 1, ">=")) + if(check > 0){ + print("R is not diagonal dominant. Suggest switching to an exact solver.") + } + + diff = tol+1.0 + while(iter < max_iter & diff > tol){ + y_hat_new = b - R%*%y_hat + diff = sum((y_hat_new-y_hat)*(y_hat_new-y_hat)) + y_hat = y_hat_new + iter = iter + 1 + } + }else{ + ones = matrix(1, rows=nrow(X), cols=1) + A = R + diag(ones) + Z = t(A)%*%A + y = t(A)%*%b + r = -y + p = -r + norm_r2 = sum(r*r) + continue = 1 + if(norm_r2 == 0){ + continue = 0 + } + while(iter < max_iter & continue == 1){ + q = Z%*%p + alpha = norm_r2 / castAsScalar(t(p) %*% q) + y_hat = y_hat + alpha * p + old_norm_r2 = norm_r2 + r = r + alpha * q + norm_r2 = sum(r * r) + if(norm_r2 < tol){ + continue = 0 + } + beta = norm_r2 / old_norm_r2 + p = -r + beta * p + iter = iter + 1 + } + } + + errs = X[,1] - y_hat + obj = sum(errs*errs) +} + +#input col of time series data +X = read($1) + +max_func_invoc = $2 + +#non-seasonal order +p = $3 +d = $4 +q = $5 + +#seasonal order +P = $6 +D = $7 +Q = $8 + +#length of the season +s = $9 + +include_mean = $10 + +useJacobi = $11 + +num_rows = nrow(X) + +if(num_rows <= d){ + print("non-seasonal differencing order should be larger than length of the time-series") +} + +Y = X +for(i in 1:d){ + n1 = nrow(Y)+0.0 + Y = Y[2:n1,] - Y[1:n1-1,] +} + +num_rows = nrow(Y)+0.0 +if(num_rows <= s*D){ + print("seasonal differencing order should be larger than number of observations divided by length of season") +} + +for(i in 1:D){ + n1 = nrow(Y)+0.0 + Y = Y[s+1:n1,] - Y[1:n1-s,] +} + +n = nrow(Y) + +max_ar_col = P+p +max_ma_col = Q+q +if(max_ar_col > max_ma_col){ + max_arma_col = max_ar_col +}else{ + max_arma_col = max_ma_col +} + +mu = 0 +if(include_mean == 1){ + mu = sum(Y)/nrow(Y) + Y = Y - mu +} + +totcols = 1+p+P+Q+q #target col (X), p-P cols, q-Q cols + +Z = Rand(rows=n, cols=totcols, min=0, max=0) +Z[,1] = Y #target col + +parfor(i1 in 1:p, check=0){ + Z[i1+1:n,1+i1] = Y[1:n-i1,] +} +parfor(i2 in 1:P, check=0){ + Z[s*i2+1:n,1+p+i2] = Y[1:n-s*i2,] +} +parfor(i5 in 1:q, check=0){ + Z[i5+1:n,1+P+p+i5] = Y[1:n-i5,] +} +parfor(i6 in 1:Q, check=0){ + Z[s*i6+1:n,1+P+p+q+i6] = Y[1:n-s*i6,] +} + +one = Rand(rows=1, cols=1, min=1, max=1) + +simplex = Rand(rows=totcols-1, cols=totcols, min=0, max=0) +for(i in 2:ncol(simplex)){ + simplex[i-1,i] = 0.1 +} + +num_func_invoc = 0 + +objvals = Rand(rows=1, cols=ncol(simplex), min=0, max=0) +parfor(i3 in 1:ncol(simplex)){ + arima_css_obj_val = arima_css(simplex[,i3], Z, p, P, q, Q, s, useJacobi) + objvals[1,i3] = arima_css_obj_val +} +num_func_invoc = num_func_invoc + ncol(simplex) + +tol = 1.5 * 10^(-8) * castAsScalar(objvals[1,1]) + +continue = 1 +while(continue == 1 & num_func_invoc <= max_func_invoc) { + best_index = 1 + worst_index = 1 + for(i in 2:ncol(objvals)){ + this = castAsScalar(objvals[1,i]) + that = castAsScalar(objvals[1,best_index]) + if(that > this){ + best_index = i + } + that = castAsScalar(objvals[1,worst_index]) + if(that < this){ + worst_index = i + } + } + + best_obj_val = castAsScalar(objvals[1,best_index]) + worst_obj_val = castAsScalar(objvals[1,worst_index]) + if(worst_obj_val <= best_obj_val + tol){ + continue = 0 + } + + print("#Function calls::" + num_func_invoc + " OBJ: " + best_obj_val) + + c = (rowSums(simplex) - simplex[,worst_index])/(nrow(simplex)) + + x_r = 2*c - simplex[,worst_index] + obj_x_r = arima_css(x_r, Z, p, P, q, Q, s, useJacobi) + num_func_invoc = num_func_invoc + 1 + + if(obj_x_r < best_obj_val){ + x_e = 2*x_r - c + obj_x_e = arima_css(x_e, Z, p, P, q, Q, s, useJacobi) + num_func_invoc = num_func_invoc + 1 + + if(obj_x_r <= obj_x_e){ + simplex[,worst_index] = x_r + objvals[1,worst_index] = obj_x_r + }else{ + simplex[,worst_index] = x_e + objvals[1,worst_index] = obj_x_e + } + }else{ + if(obj_x_r < worst_obj_val){ + simplex[,worst_index] = x_r + objvals[1,worst_index] = obj_x_r + } + + x_c_in = (simplex[,worst_index] + c)/2 + obj_x_c_in = arima_css(x_c_in, Z, p, P, q, Q, s, useJacobi) + num_func_invoc = num_func_invoc + 1 + + if(obj_x_c_in < castAsScalar(objvals[1,worst_index])){ + simplex[,worst_index] = x_c_in + objvals[1,worst_index] = obj_x_c_in + }else{ + if(obj_x_r >= worst_obj_val){ + best_point = simplex[,best_index] + parfor(i4 in 1:ncol(simplex)){ + if(i4 != best_index){ + simplex[,i4] = (simplex[,i4] + best_point)/2 + tmp = arima_css(simplex[,i4], Z, p, P, q, Q, s, useJacobi) + objvals[1,i4] = tmp*one + } + } + num_func_invoc = num_func_invoc + ncol(simplex) - 1 + } + } + } +} + +best_point = simplex[,best_index] +if(include_mean == 1){ + tmp2 = Rand(rows=totcols, cols=1, min=0, max=0) + tmp2[1:nrow(best_point),1] = best_point + tmp2[nrow(tmp2),1] = mu + best_point = tmp2 +} + +write(best_point, $12, format="text")
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/arima_box-jenkins/arima.pydml ---------------------------------------------------------------------- diff --git a/src/test/scripts/applications/arima_box-jenkins/arima.pydml b/src/test/scripts/applications/arima_box-jenkins/arima.pydml index 0c4be73..9b3387c 100644 --- a/src/test/scripts/applications/arima_box-jenkins/arima.pydml +++ b/src/test/scripts/applications/arima_box-jenkins/arima.pydml @@ -1,258 +1,258 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -# Arguments -# 1st arg: X (one column time series) -# 2nd arg: max_func_invoc -# 3rd arg: p (non-seasonal AR order) -# 4th arg: d (non-seasonal differencing order) -# 5th arg: q (non-seasonal MA order) -# 6th arg: P (seasonal AR order) -# 7th arg: D (seasonal differencing order) -# 8th arg: Q (seasonal MA order) -# 9th arg: s (period in terms of number of time-steps) -# 10th arg: 0/1 (1 means include.mean) -# 11th arg: 0 to use CG solver, 1 to use Jacobi's method -# 12th arg: file name to store learnt parameters - -#changing to additive sar since R's arima seems to do that - -def arima_css(w:matrix[float], X:matrix[float], pIn: int, P: int, qIn: int, Q:int, s:int, useJacobi: int) -> (obj: float): - b = dot(X[,2:ncol(X)], w) - - R = Rand(rows=nrow(X), cols=nrow(X), min=0, max=0) - for(i7 in 1:qIn): - ma_ind_ns = P+pIn+i7 - err_ind_ns = i7 - ones_ns = Rand(rows=nrow(R)-err_ind_ns, cols=1, min=1, max=1) - d_ns = ones_ns * castAsScalar(w[ma_ind_ns,1]) - R[1+err_ind_ns:nrow(R),1:ncol(R)-err_ind_ns] = R[1+err_ind_ns:nrow(R),1:ncol(R)-err_ind_ns] + diag(d_ns) - for(i8 in 1:Q): - ma_ind_s = P+pIn+qIn+i8 - err_ind_s = s*i8 - ones_s = Rand(rows=nrow(R)-err_ind_s, cols=1, min=1, max=1) - d_s = ones_s * castAsScalar(w[ma_ind_s,1]) - R[1+err_ind_s:nrow(R),1:ncol(R)-err_ind_s] = R[1+err_ind_s:nrow(R),1:ncol(R)-err_ind_s] + diag(d_s) - - #checking for strict diagonal dominance - #required for jacobi's method - - max_iter = 100 - tol = 0.01 - - y_hat = Rand(rows=nrow(X), cols=1, min=0, max=0) - iter = 0 - - if(useJacobi == 1): - check = sum(ppred(rowSums(abs(R)), 1, ">=")) - if(check > 0): - print("R is not diagonal dominant. Suggest switching to an exact solver.") - diff = tol+1.0 - while(iter < max_iter & diff > tol): - y_hat_new = b - dot(R, y_hat) - diff = sum((y_hat_new-y_hat)*(y_hat_new-y_hat)) - y_hat = y_hat_new - iter = iter + 1 - else: - ones = full(1, rows=nrow(X), cols=1) - A = R + diag(ones) - transpose_A = transpose(A) - Z = dot(transpose_A, A) - y = dot(transpose_A, b) - r = -y - p = -r - norm_r2 = sum(r*r) - continue = 1 - if(norm_r2 == 0): - continue = 0 - while(iter < max_iter & continue == 1): - q = dot(Z, p) - transpose_p = transpose(p) - alpha = norm_r2 / castAsScalar(dot(transpose_p, q)) - y_hat = y_hat + alpha * p - old_norm_r2 = norm_r2 - r = r + alpha * q - norm_r2 = sum(r * r) - if(norm_r2 < tol): - continue = 0 - beta = norm_r2 / old_norm_r2 - p = -r + beta * p - iter = iter + 1 - errs = X[,1] - y_hat - obj = sum(errs*errs) -# end arima_css function - -#input col of time series data -X = load($1) - -max_func_invoc = $2 - -#non-seasonal order -p = $3 -d = $4 -q = $5 - -#seasonal order -P = $6 -D = $7 -Q = $8 - -#length of the season -s = $9 - -include_mean = $10 - -useJacobi = $11 - -num_rows = nrow(X) - -if(num_rows <= d): - print("non-seasonal differencing order should be larger than length of the time-series") - -Y = X -for(i in 1:d): - n1 = nrow(Y)+0.0 - Y = Y[2:n1,] - Y[1:n1-1,] - -num_rows = nrow(Y)+0.0 -if(num_rows <= s*D): - print("seasonal differencing order should be larger than number of observations divided by length of season") - -for(i in 1:D): - n1 = nrow(Y)+0.0 - Y = Y[s+1:n1,] - Y[1:n1-s,] - -n = nrow(Y) - -max_ar_col = P+p -max_ma_col = Q+q -if(max_ar_col > max_ma_col): - max_arma_col = max_ar_col -else: - max_arma_col = max_ma_col - -mu = 0 -if(include_mean == 1): - mu = sum(Y)/nrow(Y) - Y = Y - mu - -totcols = 1+p+P+Q+q #target col (X), p-P cols, q-Q cols - -Z = Rand(rows=n, cols=totcols, min=0, max=0) -Z[,1] = Y #target col - -parfor(i1 in 1:p, check=0): - Z[i1+1:n,1+i1] = Y[1:n-i1,] - -parfor(i2 in 1:P, check=0): - Z[s*i2+1:n,1+p+i2] = Y[1:n-s*i2,] - -parfor(i5 in 1:q, check=0): - Z[i5+1:n,1+P+p+i5] = Y[1:n-i5,] - -parfor(i6 in 1:Q, check=0): - Z[s*i6+1:n,1+P+p+q+i6] = Y[1:n-s*i6,] - - -one = Rand(rows=1, cols=1, min=1, max=1) - -simplex = Rand(rows=totcols-1, cols=totcols, min=0, max=0) -for(i in 2:ncol(simplex)): - simplex[i-1,i] = 0.1 - -num_func_invoc = 0 - -objvals = Rand(rows=1, cols=ncol(simplex), min=0, max=0) -parfor(i3 in 1:ncol(simplex)): - arima_css_obj_val = arima_css(simplex[,i3], Z, p, P, q, Q, s, useJacobi) - objvals[1,i3] = arima_css_obj_val - -num_func_invoc = num_func_invoc + ncol(simplex) - -tol = 1.5 * (10**-8) * castAsScalar(objvals[1,1]) - -continue = 1 -while(continue == 1 & num_func_invoc <= max_func_invoc): - best_index = 1 - worst_index = 1 - for(i in 2:ncol(objvals)): - this = castAsScalar(objvals[1,i]) - that = castAsScalar(objvals[1,best_index]) - if(that > this): - best_index = i - that = castAsScalar(objvals[1,worst_index]) - if(that < this): - worst_index = i - - best_obj_val = castAsScalar(objvals[1,best_index]) - worst_obj_val = castAsScalar(objvals[1,worst_index]) - if(worst_obj_val <= best_obj_val + tol): - continue = 0 - - print("#Function calls::" + num_func_invoc + " OBJ: " + best_obj_val) - - c = (rowSums(simplex) - simplex[,worst_index])/(nrow(simplex)) - - x_r = 2*c - simplex[,worst_index] - obj_x_r = arima_css(x_r, Z, p, P, q, Q, s, useJacobi) - num_func_invoc = num_func_invoc + 1 - - if(obj_x_r < best_obj_val): - x_e = 2*x_r - c - obj_x_e = arima_css(x_e, Z, p, P, q, Q, s, useJacobi) - num_func_invoc = num_func_invoc + 1 - - if(obj_x_r <= obj_x_e): - simplex[,worst_index] = x_r - objvals[1,worst_index] = obj_x_r - else: - simplex[,worst_index] = x_e - objvals[1,worst_index] = obj_x_e - else: - if(obj_x_r < worst_obj_val): - simplex[,worst_index] = x_r - objvals[1,worst_index] = obj_x_r - - x_c_in = (simplex[,worst_index] + c)/2 - obj_x_c_in = arima_css(x_c_in, Z, p, P, q, Q, s, useJacobi) - num_func_invoc = num_func_invoc + 1 - - if(obj_x_c_in < castAsScalar(objvals[1,worst_index])): - simplex[,worst_index] = x_c_in - objvals[1,worst_index] = obj_x_c_in - else: - if(obj_x_r >= worst_obj_val): - best_point = simplex[,best_index] - parfor(i4 in 1:ncol(simplex)): - if(i4 != best_index): - simplex[,i4] = (simplex[,i4] + best_point)/2 - tmp = arima_css(simplex[,i4], Z, p, P, q, Q, s, useJacobi) - objvals[1,i4] = tmp*one - num_func_invoc = num_func_invoc + ncol(simplex) - 1 - -best_point = simplex[,best_index] -if(include_mean == 1): - tmp2 = Rand(rows=totcols, cols=1, min=0, max=0) - tmp2[1:nrow(best_point),1] = best_point - tmp2[nrow(tmp2),1] = mu - best_point = tmp2 - -save(best_point, $12, format="text") +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# Arguments +# 1st arg: X (one column time series) +# 2nd arg: max_func_invoc +# 3rd arg: p (non-seasonal AR order) +# 4th arg: d (non-seasonal differencing order) +# 5th arg: q (non-seasonal MA order) +# 6th arg: P (seasonal AR order) +# 7th arg: D (seasonal differencing order) +# 8th arg: Q (seasonal MA order) +# 9th arg: s (period in terms of number of time-steps) +# 10th arg: 0/1 (1 means include.mean) +# 11th arg: 0 to use CG solver, 1 to use Jacobi's method +# 12th arg: file name to store learnt parameters + +#changing to additive sar since R's arima seems to do that + +def arima_css(w:matrix[float], X:matrix[float], pIn: int, P: int, qIn: int, Q:int, s:int, useJacobi: int) -> (obj: float): + b = dot(X[,2:ncol(X)], w) + + R = Rand(rows=nrow(X), cols=nrow(X), min=0, max=0) + for(i7 in 1:qIn): + ma_ind_ns = P+pIn+i7 + err_ind_ns = i7 + ones_ns = Rand(rows=nrow(R)-err_ind_ns, cols=1, min=1, max=1) + d_ns = ones_ns * castAsScalar(w[ma_ind_ns,1]) + R[1+err_ind_ns:nrow(R),1:ncol(R)-err_ind_ns] = R[1+err_ind_ns:nrow(R),1:ncol(R)-err_ind_ns] + diag(d_ns) + for(i8 in 1:Q): + ma_ind_s = P+pIn+qIn+i8 + err_ind_s = s*i8 + ones_s = Rand(rows=nrow(R)-err_ind_s, cols=1, min=1, max=1) + d_s = ones_s * castAsScalar(w[ma_ind_s,1]) + R[1+err_ind_s:nrow(R),1:ncol(R)-err_ind_s] = R[1+err_ind_s:nrow(R),1:ncol(R)-err_ind_s] + diag(d_s) + + #checking for strict diagonal dominance + #required for jacobi's method + + max_iter = 100 + tol = 0.01 + + y_hat = Rand(rows=nrow(X), cols=1, min=0, max=0) + iter = 0 + + if(useJacobi == 1): + check = sum(ppred(rowSums(abs(R)), 1, ">=")) + if(check > 0): + print("R is not diagonal dominant. Suggest switching to an exact solver.") + diff = tol+1.0 + while(iter < max_iter & diff > tol): + y_hat_new = b - dot(R, y_hat) + diff = sum((y_hat_new-y_hat)*(y_hat_new-y_hat)) + y_hat = y_hat_new + iter = iter + 1 + else: + ones = full(1, rows=nrow(X), cols=1) + A = R + diag(ones) + transpose_A = transpose(A) + Z = dot(transpose_A, A) + y = dot(transpose_A, b) + r = -y + p = -r + norm_r2 = sum(r*r) + continue = 1 + if(norm_r2 == 0): + continue = 0 + while(iter < max_iter & continue == 1): + q = dot(Z, p) + transpose_p = transpose(p) + alpha = norm_r2 / castAsScalar(dot(transpose_p, q)) + y_hat = y_hat + alpha * p + old_norm_r2 = norm_r2 + r = r + alpha * q + norm_r2 = sum(r * r) + if(norm_r2 < tol): + continue = 0 + beta = norm_r2 / old_norm_r2 + p = -r + beta * p + iter = iter + 1 + errs = X[,1] - y_hat + obj = sum(errs*errs) +# end arima_css function + +#input col of time series data +X = load($1) + +max_func_invoc = $2 + +#non-seasonal order +p = $3 +d = $4 +q = $5 + +#seasonal order +P = $6 +D = $7 +Q = $8 + +#length of the season +s = $9 + +include_mean = $10 + +useJacobi = $11 + +num_rows = nrow(X) + +if(num_rows <= d): + print("non-seasonal differencing order should be larger than length of the time-series") + +Y = X +for(i in 1:d): + n1 = nrow(Y)+0.0 + Y = Y[2:n1,] - Y[1:n1-1,] + +num_rows = nrow(Y)+0.0 +if(num_rows <= s*D): + print("seasonal differencing order should be larger than number of observations divided by length of season") + +for(i in 1:D): + n1 = nrow(Y)+0.0 + Y = Y[s+1:n1,] - Y[1:n1-s,] + +n = nrow(Y) + +max_ar_col = P+p +max_ma_col = Q+q +if(max_ar_col > max_ma_col): + max_arma_col = max_ar_col +else: + max_arma_col = max_ma_col + +mu = 0 +if(include_mean == 1): + mu = sum(Y)/nrow(Y) + Y = Y - mu + +totcols = 1+p+P+Q+q #target col (X), p-P cols, q-Q cols + +Z = Rand(rows=n, cols=totcols, min=0, max=0) +Z[,1] = Y #target col + +parfor(i1 in 1:p, check=0): + Z[i1+1:n,1+i1] = Y[1:n-i1,] + +parfor(i2 in 1:P, check=0): + Z[s*i2+1:n,1+p+i2] = Y[1:n-s*i2,] + +parfor(i5 in 1:q, check=0): + Z[i5+1:n,1+P+p+i5] = Y[1:n-i5,] + +parfor(i6 in 1:Q, check=0): + Z[s*i6+1:n,1+P+p+q+i6] = Y[1:n-s*i6,] + + +one = Rand(rows=1, cols=1, min=1, max=1) + +simplex = Rand(rows=totcols-1, cols=totcols, min=0, max=0) +for(i in 2:ncol(simplex)): + simplex[i-1,i] = 0.1 + +num_func_invoc = 0 + +objvals = Rand(rows=1, cols=ncol(simplex), min=0, max=0) +parfor(i3 in 1:ncol(simplex)): + arima_css_obj_val = arima_css(simplex[,i3], Z, p, P, q, Q, s, useJacobi) + objvals[1,i3] = arima_css_obj_val + +num_func_invoc = num_func_invoc + ncol(simplex) + +tol = 1.5 * (10**-8) * castAsScalar(objvals[1,1]) + +continue = 1 +while(continue == 1 & num_func_invoc <= max_func_invoc): + best_index = 1 + worst_index = 1 + for(i in 2:ncol(objvals)): + this = castAsScalar(objvals[1,i]) + that = castAsScalar(objvals[1,best_index]) + if(that > this): + best_index = i + that = castAsScalar(objvals[1,worst_index]) + if(that < this): + worst_index = i + + best_obj_val = castAsScalar(objvals[1,best_index]) + worst_obj_val = castAsScalar(objvals[1,worst_index]) + if(worst_obj_val <= best_obj_val + tol): + continue = 0 + + print("#Function calls::" + num_func_invoc + " OBJ: " + best_obj_val) + + c = (rowSums(simplex) - simplex[,worst_index])/(nrow(simplex)) + + x_r = 2*c - simplex[,worst_index] + obj_x_r = arima_css(x_r, Z, p, P, q, Q, s, useJacobi) + num_func_invoc = num_func_invoc + 1 + + if(obj_x_r < best_obj_val): + x_e = 2*x_r - c + obj_x_e = arima_css(x_e, Z, p, P, q, Q, s, useJacobi) + num_func_invoc = num_func_invoc + 1 + + if(obj_x_r <= obj_x_e): + simplex[,worst_index] = x_r + objvals[1,worst_index] = obj_x_r + else: + simplex[,worst_index] = x_e + objvals[1,worst_index] = obj_x_e + else: + if(obj_x_r < worst_obj_val): + simplex[,worst_index] = x_r + objvals[1,worst_index] = obj_x_r + + x_c_in = (simplex[,worst_index] + c)/2 + obj_x_c_in = arima_css(x_c_in, Z, p, P, q, Q, s, useJacobi) + num_func_invoc = num_func_invoc + 1 + + if(obj_x_c_in < castAsScalar(objvals[1,worst_index])): + simplex[,worst_index] = x_c_in + objvals[1,worst_index] = obj_x_c_in + else: + if(obj_x_r >= worst_obj_val): + best_point = simplex[,best_index] + parfor(i4 in 1:ncol(simplex)): + if(i4 != best_index): + simplex[,i4] = (simplex[,i4] + best_point)/2 + tmp = arima_css(simplex[,i4], Z, p, P, q, Q, s, useJacobi) + objvals[1,i4] = tmp*one + num_func_invoc = num_func_invoc + ncol(simplex) - 1 + +best_point = simplex[,best_index] +if(include_mean == 1): + tmp2 = Rand(rows=totcols, cols=1, min=0, max=0) + tmp2[1:nrow(best_point),1] = best_point + tmp2[nrow(tmp2),1] = mu + best_point = tmp2 + +save(best_point, $12, format="text") http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/ctableStats/Binomial.dml ---------------------------------------------------------------------- diff --git a/src/test/scripts/applications/ctableStats/Binomial.dml b/src/test/scripts/applications/ctableStats/Binomial.dml index 9d2f7f4..a3cbf5a 100644 --- a/src/test/scripts/applications/ctableStats/Binomial.dml +++ b/src/test/scripts/applications/ctableStats/Binomial.dml @@ -1,171 +1,171 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -# COMMON FUNCTIONS TO SOLVE BINOMIAL DISTRIBUTION PROBLEMS -# WORK OVER VECTORS (IN PARALLEL) TO SAVE COMPUTATION TIME - -# Computes binomial parameter p (the biased-coin probability) -# such that Prob [Binom(n, p) <= m] = alpha -# Use it for "exact" confidence intervals over p given m, n: -# For example, for 95%-confidence intervals, use [p1, p2] -# such that Prob [Binom(n, p1) <= m-1] = 0.975 -# and Prob [Binom(n, p2) <= m ] = 0.025 -binomQuantile = - function (Matrix[double] n_vector, Matrix[double] m_vector, Matrix[double] alpha_vector) - return (Matrix[double] p_vector) -{ - num_rows = nrow (n_vector); - p_min = Rand (rows = num_rows, cols = 1, min = 0.0, max = 0.0); - alpha_p_min = Rand (rows = num_rows, cols = 1, min = 1.0, max = 1.0); - p_max = Rand (rows = num_rows, cols = 1, min = 1.0, max = 1.0); - alpha_p_max = Rand (rows = num_rows, cols = 1, min = 0.0, max = 0.0); - - for (i in 1:27) { # Uses "division by half" method to solve equations - p_new = (p_min + p_max) / 2.0; - [alpha_p_new] = binomProb (n_vector, m_vector, p_new); - move_new_to_max = ppred (alpha_p_new, alpha_vector, "<"); - p_max = (1 - move_new_to_max) * p_max + move_new_to_max * p_new; - p_min = (1 - move_new_to_max) * p_new + move_new_to_max * p_min; - alpha_p_max = (1 - move_new_to_max) * alpha_p_max + move_new_to_max * alpha_p_new; - alpha_p_min = (1 - move_new_to_max) * alpha_p_new + move_new_to_max * alpha_p_min; - } - p_vector = (p_min + p_max) / 2.0; -} - - -# Computes the cumulative distribution fuction of the binomial distribution, -# that is, Prob [Binom(n, p) <= m], using the incomplete Beta function -# approximated via a continued fraction, see "Handbook of Mathematical Functions" -# edited by M. Abramowitz and I.A. Stegun, U.S. Nat-l Bureau of Standards, -# 10th print (Dec 1972), Sec. 26.5.8-26.5.9, p. 944 -binomProb = - function (Matrix[double] n_vector, Matrix[double] m_vector, Matrix[double] p_vector) - return (Matrix[double] result) -{ - num_rows = nrow (n_vector); - num_iterations = 100; - - mean_vector = p_vector * n_vector; - is_opposite = ppred (mean_vector, m_vector, "<"); - l_vector = is_opposite * (n_vector - (m_vector + 1)) + (1 - is_opposite) * m_vector; - q_vector = is_opposite * (1.0 - p_vector) + (1 - is_opposite) * p_vector; - n_minus_l_vector = n_vector - l_vector; - - is_result_zero1 = ppred (l_vector, - 0.0000000001, "<"); - is_result_one1 = ppred (n_minus_l_vector, 0.0000000001, "<"); - is_result_zero2 = ppred (q_vector, 0.9999999999, ">"); - is_result_one2 = ppred (q_vector, 0.0000000001, "<"); - - is_result_zero = is_result_zero1 + (1 - is_result_zero1) * is_result_zero2 * (1 - is_result_one1); - is_result_one = (is_result_one1 + (1 - is_result_one1) * is_result_one2) * (1 - is_result_zero); - - result = Rand (rows = num_rows, cols = 1, min = 0.0, max = 0.0); - result = result + is_result_one; - is_already_done = is_result_zero + is_result_one; - still_iterating = 1 - is_already_done; - - n_vector = (1 - is_already_done) * n_vector + is_already_done * 2; - l_vector = (1 - is_already_done) * l_vector + is_already_done; - n_minus_l_vector = (1 - is_already_done) * n_minus_l_vector + is_already_done; - q_vector = (1 - is_already_done) * q_vector + is_already_done * 0.8; - - numer_old = q_vector; - denom_old = Rand (rows = num_rows, cols = 1, min = 0.0, max = 0.0); - numer = Rand (rows = num_rows, cols = 1, min = 0.0, max = 0.0); - denom = 1.0 - q_vector; - - is_i_even = 1; - - for (i in 1:num_iterations) # The continued fraction iterations - { - is_i_even = 1 - is_i_even; - e_term = Rand (rows = num_rows, cols = 1, min = 1.0, max = 1.0); - if (i > 1) { - if (is_i_even == 1) { - e_term = - (2 * n_minus_l_vector + (i - 2)) * (2 * l_vector - (i - 2)); - } - if (is_i_even == 0) { - e_term = (i - 1) * (2 * n_vector + (i - 1)); - } - e_term = e_term / (n_minus_l_vector + (i - 2)) / (n_minus_l_vector + (i - 1)); - e_term = e_term * 0.25; - } - numer_new = still_iterating * (q_vector * numer + (1.0 - q_vector) * e_term * numer_old) + (1.0 - still_iterating); - denom_new = still_iterating * (q_vector * denom + (1.0 - q_vector) * e_term * denom_old) + (1.0 - still_iterating); - numer_old = still_iterating * (q_vector * numer) + (1.0 - still_iterating); - denom_old = still_iterating * (q_vector * denom) + (1.0 - still_iterating); - numer = numer_new; - denom = denom_new; - - abs_denom = abs (denom); - denom_too_big = ppred (abs_denom, 10000000000.0, ">"); - denom_too_small = ppred (abs_denom, 0.0000000001, "<"); - denom_normal = 1.0 - denom_too_big - denom_too_small; - rescale_vector = denom_too_big * 0.0000000001 + denom_too_small * 10000000000.0 + denom_normal; - numer_old = numer_old * rescale_vector; - denom_old = denom_old * rescale_vector; - numer = numer * rescale_vector; - denom = denom * rescale_vector; - - convergence_check_left = abs (numer * denom_old - numer_old * denom); - convergence_check_right = abs (numer * denom_old) * 0.000000001; - has_converged = ppred (convergence_check_left, convergence_check_right, "<="); - has_converged = still_iterating * has_converged; - still_iterating = still_iterating - has_converged; - result = result + has_converged * numer / denom; - } - - result = result + still_iterating * numer / denom; - - n_vector_not_already_done = (1 - is_already_done) * n_vector; - l_vector_not_already_done = (1 - is_already_done) * l_vector; - n_minus_l_vector_not_already_done = (1 - is_already_done) * n_minus_l_vector; - q_vector_not_already_done = (1 - is_already_done) * q_vector + is_already_done; - one_minus_q_vector_not_already_done = (1 - is_already_done) * (1.0 - q_vector) + is_already_done; - - [n_logfact] = logFactorial (n_vector_not_already_done); - [l_logfact] = logFactorial (l_vector_not_already_done); - [n_minus_l_logfact] = logFactorial (n_minus_l_vector_not_already_done); - - log_update_factor = n_logfact - l_logfact - n_minus_l_logfact + l_vector * log (q_vector_not_already_done) - + n_minus_l_vector * log (one_minus_q_vector_not_already_done); - updated_result = result * (is_already_done + (1 - is_already_done) * exp (log_update_factor)); - result = is_opposite + (1 - 2 * is_opposite) * updated_result; -} - - -# Computes the logarithm of the factorial of x >= 0 via the Gamma function -# From paper: C. Lanczos "A Precision Approximation of the Gamma Function", -# Journal of the SIAM: Numerical Analysis, Series B, Vol. 1, 1964, pp. 86-96 -logFactorial = function (Matrix[double] x) return (Matrix[double] logfact) -{ - y = 1.000000000178; - y = y + 76.180091729406 / (x + 1); - y = y - 86.505320327112 / (x + 2); - y = y + 24.014098222230 / (x + 3); - y = y - 1.231739516140 / (x + 4); - y = y + 0.001208580030 / (x + 5); - y = y - 0.000005363820 / (x + 6); - logfact = log(y) + (x + 0.5) * log(x + 5.5) - (x + 5.5) + 0.91893853320467; # log(sqrt(2 * PI)); -} - - - +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# COMMON FUNCTIONS TO SOLVE BINOMIAL DISTRIBUTION PROBLEMS +# WORK OVER VECTORS (IN PARALLEL) TO SAVE COMPUTATION TIME + +# Computes binomial parameter p (the biased-coin probability) +# such that Prob [Binom(n, p) <= m] = alpha +# Use it for "exact" confidence intervals over p given m, n: +# For example, for 95%-confidence intervals, use [p1, p2] +# such that Prob [Binom(n, p1) <= m-1] = 0.975 +# and Prob [Binom(n, p2) <= m ] = 0.025 +binomQuantile = + function (Matrix[double] n_vector, Matrix[double] m_vector, Matrix[double] alpha_vector) + return (Matrix[double] p_vector) +{ + num_rows = nrow (n_vector); + p_min = Rand (rows = num_rows, cols = 1, min = 0.0, max = 0.0); + alpha_p_min = Rand (rows = num_rows, cols = 1, min = 1.0, max = 1.0); + p_max = Rand (rows = num_rows, cols = 1, min = 1.0, max = 1.0); + alpha_p_max = Rand (rows = num_rows, cols = 1, min = 0.0, max = 0.0); + + for (i in 1:27) { # Uses "division by half" method to solve equations + p_new = (p_min + p_max) / 2.0; + [alpha_p_new] = binomProb (n_vector, m_vector, p_new); + move_new_to_max = ppred (alpha_p_new, alpha_vector, "<"); + p_max = (1 - move_new_to_max) * p_max + move_new_to_max * p_new; + p_min = (1 - move_new_to_max) * p_new + move_new_to_max * p_min; + alpha_p_max = (1 - move_new_to_max) * alpha_p_max + move_new_to_max * alpha_p_new; + alpha_p_min = (1 - move_new_to_max) * alpha_p_new + move_new_to_max * alpha_p_min; + } + p_vector = (p_min + p_max) / 2.0; +} + + +# Computes the cumulative distribution fuction of the binomial distribution, +# that is, Prob [Binom(n, p) <= m], using the incomplete Beta function +# approximated via a continued fraction, see "Handbook of Mathematical Functions" +# edited by M. Abramowitz and I.A. Stegun, U.S. Nat-l Bureau of Standards, +# 10th print (Dec 1972), Sec. 26.5.8-26.5.9, p. 944 +binomProb = + function (Matrix[double] n_vector, Matrix[double] m_vector, Matrix[double] p_vector) + return (Matrix[double] result) +{ + num_rows = nrow (n_vector); + num_iterations = 100; + + mean_vector = p_vector * n_vector; + is_opposite = ppred (mean_vector, m_vector, "<"); + l_vector = is_opposite * (n_vector - (m_vector + 1)) + (1 - is_opposite) * m_vector; + q_vector = is_opposite * (1.0 - p_vector) + (1 - is_opposite) * p_vector; + n_minus_l_vector = n_vector - l_vector; + + is_result_zero1 = ppred (l_vector, - 0.0000000001, "<"); + is_result_one1 = ppred (n_minus_l_vector, 0.0000000001, "<"); + is_result_zero2 = ppred (q_vector, 0.9999999999, ">"); + is_result_one2 = ppred (q_vector, 0.0000000001, "<"); + + is_result_zero = is_result_zero1 + (1 - is_result_zero1) * is_result_zero2 * (1 - is_result_one1); + is_result_one = (is_result_one1 + (1 - is_result_one1) * is_result_one2) * (1 - is_result_zero); + + result = Rand (rows = num_rows, cols = 1, min = 0.0, max = 0.0); + result = result + is_result_one; + is_already_done = is_result_zero + is_result_one; + still_iterating = 1 - is_already_done; + + n_vector = (1 - is_already_done) * n_vector + is_already_done * 2; + l_vector = (1 - is_already_done) * l_vector + is_already_done; + n_minus_l_vector = (1 - is_already_done) * n_minus_l_vector + is_already_done; + q_vector = (1 - is_already_done) * q_vector + is_already_done * 0.8; + + numer_old = q_vector; + denom_old = Rand (rows = num_rows, cols = 1, min = 0.0, max = 0.0); + numer = Rand (rows = num_rows, cols = 1, min = 0.0, max = 0.0); + denom = 1.0 - q_vector; + + is_i_even = 1; + + for (i in 1:num_iterations) # The continued fraction iterations + { + is_i_even = 1 - is_i_even; + e_term = Rand (rows = num_rows, cols = 1, min = 1.0, max = 1.0); + if (i > 1) { + if (is_i_even == 1) { + e_term = - (2 * n_minus_l_vector + (i - 2)) * (2 * l_vector - (i - 2)); + } + if (is_i_even == 0) { + e_term = (i - 1) * (2 * n_vector + (i - 1)); + } + e_term = e_term / (n_minus_l_vector + (i - 2)) / (n_minus_l_vector + (i - 1)); + e_term = e_term * 0.25; + } + numer_new = still_iterating * (q_vector * numer + (1.0 - q_vector) * e_term * numer_old) + (1.0 - still_iterating); + denom_new = still_iterating * (q_vector * denom + (1.0 - q_vector) * e_term * denom_old) + (1.0 - still_iterating); + numer_old = still_iterating * (q_vector * numer) + (1.0 - still_iterating); + denom_old = still_iterating * (q_vector * denom) + (1.0 - still_iterating); + numer = numer_new; + denom = denom_new; + + abs_denom = abs (denom); + denom_too_big = ppred (abs_denom, 10000000000.0, ">"); + denom_too_small = ppred (abs_denom, 0.0000000001, "<"); + denom_normal = 1.0 - denom_too_big - denom_too_small; + rescale_vector = denom_too_big * 0.0000000001 + denom_too_small * 10000000000.0 + denom_normal; + numer_old = numer_old * rescale_vector; + denom_old = denom_old * rescale_vector; + numer = numer * rescale_vector; + denom = denom * rescale_vector; + + convergence_check_left = abs (numer * denom_old - numer_old * denom); + convergence_check_right = abs (numer * denom_old) * 0.000000001; + has_converged = ppred (convergence_check_left, convergence_check_right, "<="); + has_converged = still_iterating * has_converged; + still_iterating = still_iterating - has_converged; + result = result + has_converged * numer / denom; + } + + result = result + still_iterating * numer / denom; + + n_vector_not_already_done = (1 - is_already_done) * n_vector; + l_vector_not_already_done = (1 - is_already_done) * l_vector; + n_minus_l_vector_not_already_done = (1 - is_already_done) * n_minus_l_vector; + q_vector_not_already_done = (1 - is_already_done) * q_vector + is_already_done; + one_minus_q_vector_not_already_done = (1 - is_already_done) * (1.0 - q_vector) + is_already_done; + + [n_logfact] = logFactorial (n_vector_not_already_done); + [l_logfact] = logFactorial (l_vector_not_already_done); + [n_minus_l_logfact] = logFactorial (n_minus_l_vector_not_already_done); + + log_update_factor = n_logfact - l_logfact - n_minus_l_logfact + l_vector * log (q_vector_not_already_done) + + n_minus_l_vector * log (one_minus_q_vector_not_already_done); + updated_result = result * (is_already_done + (1 - is_already_done) * exp (log_update_factor)); + result = is_opposite + (1 - 2 * is_opposite) * updated_result; +} + + +# Computes the logarithm of the factorial of x >= 0 via the Gamma function +# From paper: C. Lanczos "A Precision Approximation of the Gamma Function", +# Journal of the SIAM: Numerical Analysis, Series B, Vol. 1, 1964, pp. 86-96 +logFactorial = function (Matrix[double] x) return (Matrix[double] logfact) +{ + y = 1.000000000178; + y = y + 76.180091729406 / (x + 1); + y = y - 86.505320327112 / (x + 2); + y = y + 24.014098222230 / (x + 3); + y = y - 1.231739516140 / (x + 4); + y = y + 0.001208580030 / (x + 5); + y = y - 0.000005363820 / (x + 6); + logfact = log(y) + (x + 0.5) * log(x + 5.5) - (x + 5.5) + 0.91893853320467; # log(sqrt(2 * PI)); +} + + + http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/ctableStats/ctci.dml ---------------------------------------------------------------------- diff --git a/src/test/scripts/applications/ctableStats/ctci.dml b/src/test/scripts/applications/ctableStats/ctci.dml index 22d3044..4aa3ae3 100644 --- a/src/test/scripts/applications/ctableStats/ctci.dml +++ b/src/test/scripts/applications/ctableStats/ctci.dml @@ -1,145 +1,145 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -# CTCI.DML: TWO-ATTRIBUTE CONTINGENCY TABLE CONFIDENCE INTERVAL ANALYSIS for categorical data -# Computes 95% confidence intervals for binomial ratios using both Wilson and Exact Scores -# INPUT 1: Dataset as an (N x 2) matrix, input file path/name -# Rows: Individual data points -# Col 1: Partition attribute (e.g. US State code), must be positive integer -# Col 2: Label attribute (e.g. positive/negative/neutral), must be positive integer -# INPUT 2: Number of data points N (i.e. input matrix size, rows) -# INPUT 3: "Null" label code, 0 if there is no "null" label -# INPUT 4: Head Matrix output file path/name -# INPUT 5: Body Matrix output file path/name -# OUTPUT 1: Head Matrix with per-label information: -# Rows: One row per each distinct code of the label attribute (1, 2, 3, ...) -# Col 1: First column index of the Body Matrix block for this label, or 0 if "null" -# Col 2: Overall number of data points with this label -# Col 3: Percentage (out of 100) of data points to have this label -# OUTPUT 2: Body Matrix with per-partition statistics: -# Rows: One row per each distinct code of the partition attribute (1, 2, 3, ...) -# Columns: Arranged in blocks with the same schema, one block per each non-null label, -# with the first column index specified in the Head Matrix -# Block Col 0: Number of points, i.e. the count, with this label in the given partition -# Block Col 1: Percentage (out of 100) of points to have the label vs. all in the partition -# Block Col 2: Small side, 95% confid. int-l (of above percentage), Wilson Score -# Block Col 3: Large side, 95% confid. int-l (of above percentage), Wilson Score -# Block Col 4: Small side, 95% confid. int-l (of above percentage), Exact Binomial Score -# Block Col 5: Large side, 95% confid. int-l (of above percentage), Exact Binomial Score -# Block Col 6: Percentage (out of 100) of points to lie in the partition vs. all with the label -# Block Col 7: Small side, 95% confid. int-l (of above percentage), Wilson Score -# Block Col 8: Large side, 95% confid. int-l (of above percentage), Wilson Score -# Block Col 9: Small side, 95% confid. int-l (of above percentage), Exact Binomial Score -# Block Col 10: Large side, 95% confid. int-l (of above percentage), Exact Binomial Score -# Block Col 11-99: RESERVED and set to zero -# -# EXAMPLE: -# hadoop jar SystemML.jar -f PATH/ctci.dml -args PATH/ctci_test.mtx 5602 2 PATH/ctci_test_head.mtx PATH/ctci_test_body.mtx - -setwd ("test/scripts/applications/ctableStats"); # SET TO THE SCRIPT FOLDER -source ("Binomial.dml"); # THIS SCRIPT SHOULD BE THERE TOO -powerOfTen = 10000; # CONSTANT FOR ROUNDING THE RESULTS - -print ("BEGIN CTABLE ANALYSIS SCRIPT"); -print ("Reading the input matrix..."); -InData = read($1, rows = $2, cols = 2, format = "text"); -print ("Computing the contingency table..."); -CT = table (InData [, 1], InData [, 2]); -# DEBUG LINE ONLY: write (CT, "test/scripts/applications/ctableStats/ctci_test_CT.mtx", format="text"); -print ("Preparing for the output tables..."); -nullLabel = $3; -numPartitions = nrow (CT); -numLabels = ncol (CT); -cntPartitions = rowSums (CT); -cntLabels = t(colSums (CT)); -numBodyBlocks = numLabels; -for (iLabel in 1:numLabels) { - if (iLabel == nullLabel) { - numBodyBlocks = numBodyBlocks - 1; -} } -numBodyCols = numBodyBlocks * 100; -HeadMtx = Rand (rows = numLabels, cols = 3, min = 0, max = 0); -HeadMtx [, 2] = cntLabels; -HeadMtx [, 3] = 100.0 * cntLabels / sum (cntLabels); -BodyMtx = Rand (rows = numPartitions, cols = numBodyCols, min = 0, max = 0); -zeros = Rand (rows = numPartitions, cols = 1, min = 0, max = 0); -zero = Rand (rows = 1, cols = 1, min = 0, max = 0); -big_alpha = 0.975 + zeros; -small_alpha = 0.025 + zeros; -iBlock = 0; -for (iLabel in 1:numLabels) -{ - if (iLabel != nullLabel) { - if (1==1) { - print ("Processing label " + iLabel + ":"); - } - fCol = 1 + iBlock * 100; - HeadMtx [iLabel, 1] = fCol + zero; - cntPartitionsWithLabel = CT [, iLabel]; - BodyMtx [, fCol] = cntPartitionsWithLabel; - - print (" (partition & label) / (all partition) ratios..."); - - cntPartitionsWithLabel_minus_1 = cntPartitionsWithLabel - 1; - [ratio1, left_conf_wilson1, right_conf_wilson1] = - wilson_confidence (cntPartitions, cntPartitionsWithLabel); - [left_conf_exact1] = binomQuantile (cntPartitions, cntPartitionsWithLabel_minus_1, big_alpha); - [right_conf_exact1] = binomQuantile (cntPartitions, cntPartitionsWithLabel, small_alpha); - - BodyMtx [, fCol + 1] = round (ratio1 * 100.0 * powerOfTen) / powerOfTen; - BodyMtx [, fCol + 2] = round (left_conf_wilson1 * 100.0 * powerOfTen) / powerOfTen; - BodyMtx [, fCol + 3] = round (right_conf_wilson1 * 100.0 * powerOfTen) / powerOfTen; - BodyMtx [, fCol + 4] = round (left_conf_exact1 * 100.0 * powerOfTen) / powerOfTen; - BodyMtx [, fCol + 5] = round (right_conf_exact1 * 100.0 * powerOfTen) / powerOfTen; - - print (" (partition & label) / (all label) ratios..."); - - cntThisLabel = zeros + castAsScalar (cntLabels [iLabel, 1]); - [ratio2, left_conf_wilson2, right_conf_wilson2] = - wilson_confidence (cntThisLabel, cntPartitionsWithLabel); - [left_conf_exact2] = binomQuantile (cntThisLabel, cntPartitionsWithLabel_minus_1, big_alpha); - [right_conf_exact2] = binomQuantile (cntThisLabel, cntPartitionsWithLabel, small_alpha); - - BodyMtx [, fCol + 6] = round (ratio2 * 100.0 * powerOfTen) / powerOfTen; - BodyMtx [, fCol + 7] = round (left_conf_wilson2 * 100.0 * powerOfTen) / powerOfTen; - BodyMtx [, fCol + 8] = round (right_conf_wilson2 * 100.0 * powerOfTen) / powerOfTen; - BodyMtx [, fCol + 9] = round (left_conf_exact2 * 100.0 * powerOfTen) / powerOfTen; - BodyMtx [, fCol +10] = round (right_conf_exact2 * 100.0 * powerOfTen) / powerOfTen; - - iBlock = iBlock + 1; -} } -print ("Writing the output matrices..."); -write (HeadMtx, $4, format="text"); -write (BodyMtx, $5, format="text"); -print ("END CTABLE ANALYSIS SCRIPT"); - -wilson_confidence = function (Matrix[double] n, Matrix[double] m) -return (Matrix[double] ratio, Matrix[double] conf_left, Matrix[double] conf_right) -{ - z = 1.96; # 97.5% normal percentile, for 95% confidence interval - z_sq_n = z * z * n; - qroot = sqrt (z_sq_n * (m * (n - m) + z_sq_n / 4)); - midpt = n * m + z_sq_n / 2; - denom = n * n + z_sq_n; - ratio = m / n; - conf_left = (midpt - qroot) / denom; - conf_right = (midpt + qroot) / denom; -} +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# CTCI.DML: TWO-ATTRIBUTE CONTINGENCY TABLE CONFIDENCE INTERVAL ANALYSIS for categorical data +# Computes 95% confidence intervals for binomial ratios using both Wilson and Exact Scores +# INPUT 1: Dataset as an (N x 2) matrix, input file path/name +# Rows: Individual data points +# Col 1: Partition attribute (e.g. US State code), must be positive integer +# Col 2: Label attribute (e.g. positive/negative/neutral), must be positive integer +# INPUT 2: Number of data points N (i.e. input matrix size, rows) +# INPUT 3: "Null" label code, 0 if there is no "null" label +# INPUT 4: Head Matrix output file path/name +# INPUT 5: Body Matrix output file path/name +# OUTPUT 1: Head Matrix with per-label information: +# Rows: One row per each distinct code of the label attribute (1, 2, 3, ...) +# Col 1: First column index of the Body Matrix block for this label, or 0 if "null" +# Col 2: Overall number of data points with this label +# Col 3: Percentage (out of 100) of data points to have this label +# OUTPUT 2: Body Matrix with per-partition statistics: +# Rows: One row per each distinct code of the partition attribute (1, 2, 3, ...) +# Columns: Arranged in blocks with the same schema, one block per each non-null label, +# with the first column index specified in the Head Matrix +# Block Col 0: Number of points, i.e. the count, with this label in the given partition +# Block Col 1: Percentage (out of 100) of points to have the label vs. all in the partition +# Block Col 2: Small side, 95% confid. int-l (of above percentage), Wilson Score +# Block Col 3: Large side, 95% confid. int-l (of above percentage), Wilson Score +# Block Col 4: Small side, 95% confid. int-l (of above percentage), Exact Binomial Score +# Block Col 5: Large side, 95% confid. int-l (of above percentage), Exact Binomial Score +# Block Col 6: Percentage (out of 100) of points to lie in the partition vs. all with the label +# Block Col 7: Small side, 95% confid. int-l (of above percentage), Wilson Score +# Block Col 8: Large side, 95% confid. int-l (of above percentage), Wilson Score +# Block Col 9: Small side, 95% confid. int-l (of above percentage), Exact Binomial Score +# Block Col 10: Large side, 95% confid. int-l (of above percentage), Exact Binomial Score +# Block Col 11-99: RESERVED and set to zero +# +# EXAMPLE: +# hadoop jar SystemML.jar -f PATH/ctci.dml -args PATH/ctci_test.mtx 5602 2 PATH/ctci_test_head.mtx PATH/ctci_test_body.mtx + +setwd ("test/scripts/applications/ctableStats"); # SET TO THE SCRIPT FOLDER +source ("Binomial.dml"); # THIS SCRIPT SHOULD BE THERE TOO +powerOfTen = 10000; # CONSTANT FOR ROUNDING THE RESULTS + +print ("BEGIN CTABLE ANALYSIS SCRIPT"); +print ("Reading the input matrix..."); +InData = read($1, rows = $2, cols = 2, format = "text"); +print ("Computing the contingency table..."); +CT = table (InData [, 1], InData [, 2]); +# DEBUG LINE ONLY: write (CT, "test/scripts/applications/ctableStats/ctci_test_CT.mtx", format="text"); +print ("Preparing for the output tables..."); +nullLabel = $3; +numPartitions = nrow (CT); +numLabels = ncol (CT); +cntPartitions = rowSums (CT); +cntLabels = t(colSums (CT)); +numBodyBlocks = numLabels; +for (iLabel in 1:numLabels) { + if (iLabel == nullLabel) { + numBodyBlocks = numBodyBlocks - 1; +} } +numBodyCols = numBodyBlocks * 100; +HeadMtx = Rand (rows = numLabels, cols = 3, min = 0, max = 0); +HeadMtx [, 2] = cntLabels; +HeadMtx [, 3] = 100.0 * cntLabels / sum (cntLabels); +BodyMtx = Rand (rows = numPartitions, cols = numBodyCols, min = 0, max = 0); +zeros = Rand (rows = numPartitions, cols = 1, min = 0, max = 0); +zero = Rand (rows = 1, cols = 1, min = 0, max = 0); +big_alpha = 0.975 + zeros; +small_alpha = 0.025 + zeros; +iBlock = 0; +for (iLabel in 1:numLabels) +{ + if (iLabel != nullLabel) { + if (1==1) { + print ("Processing label " + iLabel + ":"); + } + fCol = 1 + iBlock * 100; + HeadMtx [iLabel, 1] = fCol + zero; + cntPartitionsWithLabel = CT [, iLabel]; + BodyMtx [, fCol] = cntPartitionsWithLabel; + + print (" (partition & label) / (all partition) ratios..."); + + cntPartitionsWithLabel_minus_1 = cntPartitionsWithLabel - 1; + [ratio1, left_conf_wilson1, right_conf_wilson1] = + wilson_confidence (cntPartitions, cntPartitionsWithLabel); + [left_conf_exact1] = binomQuantile (cntPartitions, cntPartitionsWithLabel_minus_1, big_alpha); + [right_conf_exact1] = binomQuantile (cntPartitions, cntPartitionsWithLabel, small_alpha); + + BodyMtx [, fCol + 1] = round (ratio1 * 100.0 * powerOfTen) / powerOfTen; + BodyMtx [, fCol + 2] = round (left_conf_wilson1 * 100.0 * powerOfTen) / powerOfTen; + BodyMtx [, fCol + 3] = round (right_conf_wilson1 * 100.0 * powerOfTen) / powerOfTen; + BodyMtx [, fCol + 4] = round (left_conf_exact1 * 100.0 * powerOfTen) / powerOfTen; + BodyMtx [, fCol + 5] = round (right_conf_exact1 * 100.0 * powerOfTen) / powerOfTen; + + print (" (partition & label) / (all label) ratios..."); + + cntThisLabel = zeros + castAsScalar (cntLabels [iLabel, 1]); + [ratio2, left_conf_wilson2, right_conf_wilson2] = + wilson_confidence (cntThisLabel, cntPartitionsWithLabel); + [left_conf_exact2] = binomQuantile (cntThisLabel, cntPartitionsWithLabel_minus_1, big_alpha); + [right_conf_exact2] = binomQuantile (cntThisLabel, cntPartitionsWithLabel, small_alpha); + + BodyMtx [, fCol + 6] = round (ratio2 * 100.0 * powerOfTen) / powerOfTen; + BodyMtx [, fCol + 7] = round (left_conf_wilson2 * 100.0 * powerOfTen) / powerOfTen; + BodyMtx [, fCol + 8] = round (right_conf_wilson2 * 100.0 * powerOfTen) / powerOfTen; + BodyMtx [, fCol + 9] = round (left_conf_exact2 * 100.0 * powerOfTen) / powerOfTen; + BodyMtx [, fCol +10] = round (right_conf_exact2 * 100.0 * powerOfTen) / powerOfTen; + + iBlock = iBlock + 1; +} } +print ("Writing the output matrices..."); +write (HeadMtx, $4, format="text"); +write (BodyMtx, $5, format="text"); +print ("END CTABLE ANALYSIS SCRIPT"); + +wilson_confidence = function (Matrix[double] n, Matrix[double] m) +return (Matrix[double] ratio, Matrix[double] conf_left, Matrix[double] conf_right) +{ + z = 1.96; # 97.5% normal percentile, for 95% confidence interval + z_sq_n = z * z * n; + qroot = sqrt (z_sq_n * (m * (n - m) + z_sq_n / 4)); + midpt = n * m + z_sq_n / 2; + denom = n * n + z_sq_n; + ratio = m / n; + conf_left = (midpt - qroot) / denom; + conf_right = (midpt + qroot) / denom; +} http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/ctableStats/ctci_odds.dml ---------------------------------------------------------------------- diff --git a/src/test/scripts/applications/ctableStats/ctci_odds.dml b/src/test/scripts/applications/ctableStats/ctci_odds.dml index 856b24c..3d97489 100644 --- a/src/test/scripts/applications/ctableStats/ctci_odds.dml +++ b/src/test/scripts/applications/ctableStats/ctci_odds.dml @@ -1,178 +1,178 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -# CTCI_ODDS.DML: TWO-ATTRIBUTE CONTINGENCY TABLE ODDS-RATIO CONFIDENCE INTERVAL ANALYSIS -# Computes 95% confidence intervals for odds ratios using a Gaussian approximation for log-odds -# INPUT 1: Dataset as an (N x 2) matrix, input file path/name -# Rows: Individual data points -# Col 1: Partition attribute (e.g. US State code), must be positive integer -# Col 2: Label attribute (e.g. positive/negative/neutral), must be positive integer -# INPUT 2: Number of data points N (i.e. input matrix size, rows) -# INPUT 3: "Null" label code, 0 if there is no "null" label -# INPUT 4: Output Matrix file path/name -# OUTPUT 1: Output Matrix with the following information: -# Rows: One row per each distinct pair (partition, label) excluding "null" label -# Col 1: Partition attribute value -# Col 2: Label attribute value -# Col 3: Number of data points with this (partition, label) pair -# Col 4: Number of data points with the same partition, but a different label -# Col 5: Number of data points with a different partition, but the same label -# Col 6: Number of data points with a different partition and a different label -# Col 7: The odds ratio -# Col 8: Small side of 95%-confidence interval for the odds ratio -# Col 9: Large side of 95%-confidence interval for the odds ratio -# Col 10: How many sigmas away the log-odds ratio is from zero -# Col 11: Chi-squared statistic -# Col 12: Cramer's V * 100% -# Col 13: Log-odds ratio P-value * 100% -# Col 14: Chi-squared P-value * 100% -# Col 15: Percentage (out of 100) of data points in this paritition to have this label -# Col 16: Small side of 95%-confid. int-l of above percentage, Wilson Score -# Col 17: Large side of 95%-confid. int-l of above percentage, Wilson Score -# Col 18: Percentage (out of 100) of data points overall to have this label -# Col 19: Small side of 95%-confid. int-l of above percentage, Wilson Score -# Col 20: Large side of 95%-confid. int-l of above percentage, Wilson Score -# Col 21: Percentage (out of 100) of data points overall to lie in this partition -# Col 22: Small side of 95%-confid. int-l of above percentage, Wilson Score -# Col 23: Large side of 95%-confid. int-l of above percentage, Wilson Score -# -# EXAMPLE: -# hadoop jar SystemML.jar -f PATH/ctci_odds.dml -args PATH/ctci_test.mtx 5602 2 PATH/ctci_odds_test_output.mtx - -powerOfTen = 10000; # CONSTANT FOR ROUNDING THE RESULTS -print ("BEGIN CTABLE ANALYSIS SCRIPT"); -print ("Reading the input matrix..."); -InData = read ($1, rows = $2, cols = 2, format = "text"); -numPoints = $2; -print ("Computing the contingency table..."); -CT = table (InData [, 1], InData [, 2]); -# DEBUG LINE ONLY: write (CT, "test/scripts/applications/ctableStats/ctci_test_CT.mtx", format="text"); -print ("Preparing for the output tables..."); -nullLabel = $3; -numPartitions = nrow (CT); -numLabels = ncol (CT); -numOutRows = numPartitions * numLabels; -if (nullLabel > 0 & nullLabel <= numLabels) { - numOutRows = numOutRows - numPartitions; -} -cntPartitions = rowSums (CT); -cntLabels = t(colSums (CT)); -OutMtx = Rand (rows = numOutRows, cols = 23, min = 0, max = 0); -idx = 0; -zero = Rand (rows = 1, cols = 1, min = 0, max = 0); -for (iLabel in 1:numLabels) -{ - if (iLabel != nullLabel) - { - if (1==1) { - print ("Processing label " + iLabel + ":"); - } - for (iPartition in 1:numPartitions) - { - idx = idx + 1; - OutMtx [idx, 1] = iPartition + zero; - OutMtx [idx, 2] = iLabel + zero; - - n11 = CT [iPartition, iLabel]; - n01 = cntPartitions [iPartition, 1] - CT [iPartition, iLabel]; - n10 = cntLabels [iLabel, 1] - CT [iPartition, iLabel]; - n00 = numPoints - cntPartitions [iPartition, 1] - cntLabels [iLabel, 1] + CT [iPartition, iLabel]; - odds_ratio = n11 * n00 / (n01 * n10); - sigma_log_odds_ratio = sqrt (1.0 / n00 + 1.0 / n01 + 1.0 / n10 + 1.0 / n11); - odds_ratio_interval_small = odds_ratio / exp (1.96 * sigma_log_odds_ratio); - odds_ratio_interval_large = odds_ratio * exp (1.96 * sigma_log_odds_ratio); - num_sigmas_away = abs (log (odds_ratio) / sigma_log_odds_ratio); - chi_diff = n00 * n11 - n01 * n10; - chi_denom = (n00 + n01) * (n10 + n11) * (n00 + n10) * (n01 + n11); - chi_square = (n00 + n01 + n10 + n11) * chi_diff * chi_diff / chi_denom; - cramers_V = sqrt (chi_square / (n00 + n01 + n10 + n11)); - - OutMtx [idx, 3] = n11; - OutMtx [idx, 4] = n01; - OutMtx [idx, 5] = n10; - OutMtx [idx, 6] = n00; - OutMtx [idx, 7] = round (odds_ratio * powerOfTen) / powerOfTen; - OutMtx [idx, 8] = round (odds_ratio_interval_small * powerOfTen) / powerOfTen; - OutMtx [idx, 9] = round (odds_ratio_interval_large * powerOfTen) / powerOfTen; - OutMtx [idx, 10] = round (num_sigmas_away * powerOfTen) / powerOfTen; - OutMtx [idx, 11] = round (chi_square * powerOfTen) / powerOfTen; - OutMtx [idx, 12] = round (100.0 * cramers_V * powerOfTen) / powerOfTen; - - gauss_pts = Rand (rows = 2, cols = 1, min = 0, max = 0); - gauss_pts [1, 1] = - num_sigmas_away; - gauss_pts [2, 1] = - sqrt (chi_square); - gauss_probs = gaussian_probability (gauss_pts); - pval_odds = gauss_probs [1, 1] * 2.0; - pval_chi2 = gauss_probs [2, 1] * 2.0; - - OutMtx [idx, 13] = round (100.0 * pval_odds * powerOfTen) / powerOfTen; - OutMtx [idx, 14] = round (100.0 * pval_chi2 * powerOfTen) / powerOfTen; - - m_cnt = Rand (rows = 3, cols = 1, min = 0, max = 0); - n_cnt = Rand (rows = 3, cols = 1, min = 0, max = 0); - m_cnt [1, 1] = CT [iPartition, iLabel]; - n_cnt [1, 1] = cntPartitions [iPartition, 1]; - m_cnt [2, 1] = cntLabels [iLabel, 1]; - n_cnt [2, 1] = numPoints + zero; - m_cnt [3, 1] = cntPartitions [iPartition, 1]; - n_cnt [3, 1] = numPoints + zero; - [ratios, conf_interval_small, conf_interval_large] = wilson_confidence (n_cnt, m_cnt); - OutMtx [idx, 15] = round (100.0 * ratios [1, 1] * powerOfTen) / powerOfTen; - OutMtx [idx, 16] = round (100.0 * conf_interval_small [1, 1] * powerOfTen) / powerOfTen; - OutMtx [idx, 17] = round (100.0 * conf_interval_large [1, 1] * powerOfTen) / powerOfTen; - OutMtx [idx, 18] = round (100.0 * ratios [2, 1] * powerOfTen) / powerOfTen; - OutMtx [idx, 19] = round (100.0 * conf_interval_small [2, 1] * powerOfTen) / powerOfTen; - OutMtx [idx, 20] = round (100.0 * conf_interval_large [2, 1] * powerOfTen) / powerOfTen; - OutMtx [idx, 21] = round (100.0 * ratios [3, 1] * powerOfTen) / powerOfTen; - OutMtx [idx, 22] = round (100.0 * conf_interval_small [3, 1] * powerOfTen) / powerOfTen; - OutMtx [idx, 23] = round (100.0 * conf_interval_large [3, 1] * powerOfTen) / powerOfTen; -} } } - -print ("Writing the output matrix..."); -write (OutMtx, $4, format="text"); -print ("END CTABLE ANALYSIS SCRIPT"); - -wilson_confidence = function (Matrix[double] n, Matrix[double] m) -return (Matrix[double] ratio, Matrix[double] conf_left, Matrix[double] conf_right) -{ - z = 1.96; # 97.5% normal percentile, for 95% confidence interval - z_sq_n = z * z * n; - qroot = sqrt (z_sq_n * (m * (n - m) + z_sq_n / 4)); - midpt = n * m + z_sq_n / 2; - denom = n * n + z_sq_n; - ratio = m / n; - conf_left = (midpt - qroot) / denom; - conf_right = (midpt + qroot) / denom; -} - -gaussian_probability = function (Matrix[double] vector_of_points) - return (Matrix[double] vector_of_probabilities) -{ - t_gp = 1.0 / (1.0 + abs (vector_of_points) * 0.231641888); # 0.231641888 = 0.3275911 / sqrt (2.0) - erf_gp = 1.0 - t_gp * ( 0.254829592 - + t_gp * (-0.284496736 # "Handbook of Mathematical Functions", ed. by M. Abramowitz and I.A. Stegun, - + t_gp * ( 1.421413741 # U.S. Nat-l Bureau of Standards, 10th print (Dec 1972), Sec. 7.1.26, p. 299 - + t_gp * (-1.453152027 - + t_gp * 1.061405429)))) * exp (- vector_of_points * vector_of_points / 2.0); - erf_gp = erf_gp * 2.0 * (ppred (vector_of_points, 0.0, ">") - 0.5); - vector_of_probabilities = 0.5 + 0.5 * erf_gp; -} - +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# CTCI_ODDS.DML: TWO-ATTRIBUTE CONTINGENCY TABLE ODDS-RATIO CONFIDENCE INTERVAL ANALYSIS +# Computes 95% confidence intervals for odds ratios using a Gaussian approximation for log-odds +# INPUT 1: Dataset as an (N x 2) matrix, input file path/name +# Rows: Individual data points +# Col 1: Partition attribute (e.g. US State code), must be positive integer +# Col 2: Label attribute (e.g. positive/negative/neutral), must be positive integer +# INPUT 2: Number of data points N (i.e. input matrix size, rows) +# INPUT 3: "Null" label code, 0 if there is no "null" label +# INPUT 4: Output Matrix file path/name +# OUTPUT 1: Output Matrix with the following information: +# Rows: One row per each distinct pair (partition, label) excluding "null" label +# Col 1: Partition attribute value +# Col 2: Label attribute value +# Col 3: Number of data points with this (partition, label) pair +# Col 4: Number of data points with the same partition, but a different label +# Col 5: Number of data points with a different partition, but the same label +# Col 6: Number of data points with a different partition and a different label +# Col 7: The odds ratio +# Col 8: Small side of 95%-confidence interval for the odds ratio +# Col 9: Large side of 95%-confidence interval for the odds ratio +# Col 10: How many sigmas away the log-odds ratio is from zero +# Col 11: Chi-squared statistic +# Col 12: Cramer's V * 100% +# Col 13: Log-odds ratio P-value * 100% +# Col 14: Chi-squared P-value * 100% +# Col 15: Percentage (out of 100) of data points in this paritition to have this label +# Col 16: Small side of 95%-confid. int-l of above percentage, Wilson Score +# Col 17: Large side of 95%-confid. int-l of above percentage, Wilson Score +# Col 18: Percentage (out of 100) of data points overall to have this label +# Col 19: Small side of 95%-confid. int-l of above percentage, Wilson Score +# Col 20: Large side of 95%-confid. int-l of above percentage, Wilson Score +# Col 21: Percentage (out of 100) of data points overall to lie in this partition +# Col 22: Small side of 95%-confid. int-l of above percentage, Wilson Score +# Col 23: Large side of 95%-confid. int-l of above percentage, Wilson Score +# +# EXAMPLE: +# hadoop jar SystemML.jar -f PATH/ctci_odds.dml -args PATH/ctci_test.mtx 5602 2 PATH/ctci_odds_test_output.mtx + +powerOfTen = 10000; # CONSTANT FOR ROUNDING THE RESULTS +print ("BEGIN CTABLE ANALYSIS SCRIPT"); +print ("Reading the input matrix..."); +InData = read ($1, rows = $2, cols = 2, format = "text"); +numPoints = $2; +print ("Computing the contingency table..."); +CT = table (InData [, 1], InData [, 2]); +# DEBUG LINE ONLY: write (CT, "test/scripts/applications/ctableStats/ctci_test_CT.mtx", format="text"); +print ("Preparing for the output tables..."); +nullLabel = $3; +numPartitions = nrow (CT); +numLabels = ncol (CT); +numOutRows = numPartitions * numLabels; +if (nullLabel > 0 & nullLabel <= numLabels) { + numOutRows = numOutRows - numPartitions; +} +cntPartitions = rowSums (CT); +cntLabels = t(colSums (CT)); +OutMtx = Rand (rows = numOutRows, cols = 23, min = 0, max = 0); +idx = 0; +zero = Rand (rows = 1, cols = 1, min = 0, max = 0); +for (iLabel in 1:numLabels) +{ + if (iLabel != nullLabel) + { + if (1==1) { + print ("Processing label " + iLabel + ":"); + } + for (iPartition in 1:numPartitions) + { + idx = idx + 1; + OutMtx [idx, 1] = iPartition + zero; + OutMtx [idx, 2] = iLabel + zero; + + n11 = CT [iPartition, iLabel]; + n01 = cntPartitions [iPartition, 1] - CT [iPartition, iLabel]; + n10 = cntLabels [iLabel, 1] - CT [iPartition, iLabel]; + n00 = numPoints - cntPartitions [iPartition, 1] - cntLabels [iLabel, 1] + CT [iPartition, iLabel]; + odds_ratio = n11 * n00 / (n01 * n10); + sigma_log_odds_ratio = sqrt (1.0 / n00 + 1.0 / n01 + 1.0 / n10 + 1.0 / n11); + odds_ratio_interval_small = odds_ratio / exp (1.96 * sigma_log_odds_ratio); + odds_ratio_interval_large = odds_ratio * exp (1.96 * sigma_log_odds_ratio); + num_sigmas_away = abs (log (odds_ratio) / sigma_log_odds_ratio); + chi_diff = n00 * n11 - n01 * n10; + chi_denom = (n00 + n01) * (n10 + n11) * (n00 + n10) * (n01 + n11); + chi_square = (n00 + n01 + n10 + n11) * chi_diff * chi_diff / chi_denom; + cramers_V = sqrt (chi_square / (n00 + n01 + n10 + n11)); + + OutMtx [idx, 3] = n11; + OutMtx [idx, 4] = n01; + OutMtx [idx, 5] = n10; + OutMtx [idx, 6] = n00; + OutMtx [idx, 7] = round (odds_ratio * powerOfTen) / powerOfTen; + OutMtx [idx, 8] = round (odds_ratio_interval_small * powerOfTen) / powerOfTen; + OutMtx [idx, 9] = round (odds_ratio_interval_large * powerOfTen) / powerOfTen; + OutMtx [idx, 10] = round (num_sigmas_away * powerOfTen) / powerOfTen; + OutMtx [idx, 11] = round (chi_square * powerOfTen) / powerOfTen; + OutMtx [idx, 12] = round (100.0 * cramers_V * powerOfTen) / powerOfTen; + + gauss_pts = Rand (rows = 2, cols = 1, min = 0, max = 0); + gauss_pts [1, 1] = - num_sigmas_away; + gauss_pts [2, 1] = - sqrt (chi_square); + gauss_probs = gaussian_probability (gauss_pts); + pval_odds = gauss_probs [1, 1] * 2.0; + pval_chi2 = gauss_probs [2, 1] * 2.0; + + OutMtx [idx, 13] = round (100.0 * pval_odds * powerOfTen) / powerOfTen; + OutMtx [idx, 14] = round (100.0 * pval_chi2 * powerOfTen) / powerOfTen; + + m_cnt = Rand (rows = 3, cols = 1, min = 0, max = 0); + n_cnt = Rand (rows = 3, cols = 1, min = 0, max = 0); + m_cnt [1, 1] = CT [iPartition, iLabel]; + n_cnt [1, 1] = cntPartitions [iPartition, 1]; + m_cnt [2, 1] = cntLabels [iLabel, 1]; + n_cnt [2, 1] = numPoints + zero; + m_cnt [3, 1] = cntPartitions [iPartition, 1]; + n_cnt [3, 1] = numPoints + zero; + [ratios, conf_interval_small, conf_interval_large] = wilson_confidence (n_cnt, m_cnt); + OutMtx [idx, 15] = round (100.0 * ratios [1, 1] * powerOfTen) / powerOfTen; + OutMtx [idx, 16] = round (100.0 * conf_interval_small [1, 1] * powerOfTen) / powerOfTen; + OutMtx [idx, 17] = round (100.0 * conf_interval_large [1, 1] * powerOfTen) / powerOfTen; + OutMtx [idx, 18] = round (100.0 * ratios [2, 1] * powerOfTen) / powerOfTen; + OutMtx [idx, 19] = round (100.0 * conf_interval_small [2, 1] * powerOfTen) / powerOfTen; + OutMtx [idx, 20] = round (100.0 * conf_interval_large [2, 1] * powerOfTen) / powerOfTen; + OutMtx [idx, 21] = round (100.0 * ratios [3, 1] * powerOfTen) / powerOfTen; + OutMtx [idx, 22] = round (100.0 * conf_interval_small [3, 1] * powerOfTen) / powerOfTen; + OutMtx [idx, 23] = round (100.0 * conf_interval_large [3, 1] * powerOfTen) / powerOfTen; +} } } + +print ("Writing the output matrix..."); +write (OutMtx, $4, format="text"); +print ("END CTABLE ANALYSIS SCRIPT"); + +wilson_confidence = function (Matrix[double] n, Matrix[double] m) +return (Matrix[double] ratio, Matrix[double] conf_left, Matrix[double] conf_right) +{ + z = 1.96; # 97.5% normal percentile, for 95% confidence interval + z_sq_n = z * z * n; + qroot = sqrt (z_sq_n * (m * (n - m) + z_sq_n / 4)); + midpt = n * m + z_sq_n / 2; + denom = n * n + z_sq_n; + ratio = m / n; + conf_left = (midpt - qroot) / denom; + conf_right = (midpt + qroot) / denom; +} + +gaussian_probability = function (Matrix[double] vector_of_points) + return (Matrix[double] vector_of_probabilities) +{ + t_gp = 1.0 / (1.0 + abs (vector_of_points) * 0.231641888); # 0.231641888 = 0.3275911 / sqrt (2.0) + erf_gp = 1.0 - t_gp * ( 0.254829592 + + t_gp * (-0.284496736 # "Handbook of Mathematical Functions", ed. by M. Abramowitz and I.A. Stegun, + + t_gp * ( 1.421413741 # U.S. Nat-l Bureau of Standards, 10th print (Dec 1972), Sec. 7.1.26, p. 299 + + t_gp * (-1.453152027 + + t_gp * 1.061405429)))) * exp (- vector_of_points * vector_of_points / 2.0); + erf_gp = erf_gp * 2.0 * (ppred (vector_of_points, 0.0, ">") - 0.5); + vector_of_probabilities = 0.5 + 0.5 * erf_gp; +} +
