http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/scripts/datagen/genRandData4LogReg_LTstats.dml ---------------------------------------------------------------------- diff --git a/scripts/datagen/genRandData4LogReg_LTstats.dml b/scripts/datagen/genRandData4LogReg_LTstats.dml index 6742f0c..2ec5aef 100644 --- a/scripts/datagen/genRandData4LogReg_LTstats.dml +++ b/scripts/datagen/genRandData4LogReg_LTstats.dml @@ -1,233 +1,233 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -# -# generates random data to test bi- and multinomial logistic regression - -# $N = number of training samples -# $Nt = number of test samples (or 0 if none) -# $nf = number of features (independent variables) -# $nc = number of categories; = 1 if "binomial" with +1/-1 labels -# $Xmin = minimum feature value -# $Xmax = maximum feature value -# $spars = controls sparsity in the generated data -# $avgLTmin = average linear term (X %*% beta + intercept), minimum value -# $avgLTmax = average linear term (X %*% beta + intercept), maximum value -# $stdLT = requested standard deviation for the linear terms -# $iceptmin = intercept, minimum value (0.0 disables intercept) -# $iceptmax = intercept, maximum value (0.0 disables intercept) -# $B = location to store generated regression parameters -# $X = location to store generated training data -# $Y = location to store generated training category labels -# $Xt = location to store generated test data -# $Yt = location to store generated test category labels -# -# Example: -# hadoop jar SystemML.jar -f genRandData4LogReg_LTstats.dml -nvargs -# N=1000000 Nt=1000 nf=20 nc=3 Xmin=0.0 Xmax=1.0 spars=1.0 avgLTmin=3.0 avgLTmax=5.0 stdLT=1.25 -# iceptmin=1.0 iceptmax=1.0 B=./B123 X=./X123 Y=./Y123 Xt=./Xt123 Yt=./Yt123 - -numTrainingSamples = $N; -numTestSamples = $Nt; -numFeatures = $nf; -numCategories = $nc; -minIntercept = $iceptmin; -maxIntercept = $iceptmax; -minXentry = $Xmin; -maxXentry = $Xmax; -minAvgLT = $avgLTmin; -maxAvgLT = $avgLTmax; -sparsityLevel = $spars; -stdevLT = $stdLT; -fileB = ifdef ($B, "B"); -fileX = ifdef ($X, "X"); -fileY = ifdef ($Y, "Y"); -fileXt = ifdef ($Xt, "Xt"); -fileYt = ifdef ($Yt, "Yt"); - - -numSamples = numTrainingSamples + numTestSamples; - -isBinomialPMOne = FALSE; -if (numCategories == 1) { - numCategories = 2; - isBinomialPMOne = TRUE; -} -do_we_output_intercept = 1; -if (minIntercept == 0.0 & maxIntercept == 0.0) { - do_we_output_intercept = 0; -} - -X = Rand (rows = numSamples, cols = numFeatures, min = minXentry, max = maxXentry, pdf = "uniform", sparsity = sparsityLevel); - -meanLT = Rand (rows = 1, cols = numCategories - 1, min = minAvgLT, max = maxAvgLT, pdf = "uniform"); -sigmaLT = matrix (stdevLT, rows = 1, cols = numCategories - 1); -b_intercept = Rand (rows = 1, cols = numCategories - 1, min = minIntercept, max = maxIntercept, pdf = "uniform"); - -meanLT_minus_intercept = meanLT - b_intercept; -[B, new_sigmaLT] = generateWeights (X, meanLT_minus_intercept, sigmaLT); - -ones = matrix (1.0, rows = numSamples, cols = 1); -LT = X %*% B + ones %*% b_intercept; -actual_meanLT = colSums (LT) / numSamples; -actual_sigmaLT = sqrt (colSums ((LT - ones %*% actual_meanLT)^2) / numSamples); - -for (i in 1:(numCategories - 1)) { - if (castAsScalar (new_sigmaLT [1, i]) == castAsScalar (sigmaLT [1, i])) { - print ("Category " + i + ": Intercept = " + castAsScalar (b_intercept [1, i])); - } else { - print ("Category " + i + ": Intercept = " + castAsScalar (b_intercept [1, i]) + ", st.dev.(LT) relaxed from " + castAsScalar (sigmaLT [1, i])); - } - print (" Wanted LT mean = " + castAsScalar (meanLT [1, i]) + ", st.dev. = " + castAsScalar (new_sigmaLT [1, i])); - print (" Actual LT mean = " + castAsScalar (actual_meanLT [1, i]) + ", st.dev. = " + castAsScalar (actual_sigmaLT [1, i])); -} - - -ones = matrix (1.0, rows = 1, cols = numCategories - 1); -Prob = exp (LT); -Prob = Prob / ((1.0 + rowSums (Prob)) %*% ones); -Prob = t(cumsum (t(Prob))); - -r = Rand (rows = numSamples, cols = 1, min = 0, max = 1, pdf = "uniform", seed = 0); -R = r %*% ones; -Y = 1 + rowSums (ppred (Prob, R, "<")); -if (isBinomialPMOne) { - Y = 3 - 2 * Y; -} - - -/* USE FOR LINEAR REGRESSION - -r = Rand (rows = numSamples, cols = 1, pdf = "normal"); -Y = LT [, 1] + r; - -*/ - - -if (do_we_output_intercept == 1) { - new_B = matrix (0.0, rows = nrow(B) + 1, cols = ncol(B)); - new_B [1:nrow(B), 1:ncol(B)] = B; - new_B [nrow(B)+1, 1:ncol(B)] = b_intercept; - write (new_B, fileB, format="mm"); -} else { - write (B, fileB, format="mm"); -} - -if (numTestSamples > 0) { - X_train = X [1:numTrainingSamples,]; - Y_train = Y [1:numTrainingSamples,]; - X_test = X [(numTrainingSamples+1):numSamples,]; - Y_test = Y [(numTrainingSamples+1):numSamples,]; - write (X_train, fileX, format="mm"); - write (Y_train, fileY, format="mm"); - write (X_test, fileXt, format="mm"); - write (Y_test, fileYt, format="mm"); -} else { - write (X, fileX, format="mm"); - write (Y, fileY, format="mm"); -} - - - - - - -# Generates weight vectors to ensure the desired statistics for Linear Terms = X %*% W -# To be used for data generation in the testing of GLM, Logistic Regression, etc. -# INPUT: meanLT and sigmaLT are row vectors, meanLT[1, i] and sigmaLT[1, i] are -# the desired mean and standard deviation for X %*% W[, i] -# OUTPUT: "W" is the matrix of generated (column) weight vectors W[, i] -# new_sigmaLT[1, i] == sigmaLT[1, i] if the std.dev is successfully enforced, -# new_sigmaLT[1, i] > sigmaLT[1, i] if we had to relax this constraint. -generateWeights = - function (Matrix[double] X, Matrix[double] meanLT, Matrix[double] sigmaLT) - return (Matrix[double] W, Matrix[double] new_sigmaLT) -{ - num_w = ncol (meanLT); # Number of output weight vectors - dim_w = ncol (X); # Number of features / dimensions in a weight vector - w_X = t(colSums(X)); # "Prohibited" weight shift direction that changes meanLT - # (all orthogonal shift directions do not affect meanLT) - - # Compute "w_1" with meanLT = 1 and with the smallest possible sigmaLT - - w_1 = straightenX (X); - r_1 = (X %*% w_1) - 1.0; - norm_r_1_sq = sum (r_1 ^ 2); - - # For each W[, i] generate uniformly random directions to shift away from "w_1" - - DW_raw = Rand (rows = dim_w, cols = num_w, pdf = "normal"); - DW = DW_raw - (w_X %*% t(w_X) %*% DW_raw) / sum (w_X ^ 2); # Orthogonal to w_X - XDW = X %*% DW; - - # Determine how far to shift in the chosen directions to satisfy the constraints - # Use the positive root of the quadratic equation; relax sigmaLT where needed - - a_qe = colSums (XDW ^ 2); - b_qe = 2.0 * meanLT * (t(r_1) %*% XDW); - c_qe = meanLT^2 * norm_r_1_sq - sigmaLT^2 * nrow(X); - - is_sigmaLT_OK = ppred (c_qe, 0.0, "<="); - new_sigmaLT = is_sigmaLT_OK * sigmaLT + (1 - is_sigmaLT_OK) * abs (meanLT) * sqrt (norm_r_1_sq / nrow(X)); - c_qe = is_sigmaLT_OK * c_qe; - x_qe = (- b_qe + sqrt (b_qe * b_qe - 4.0 * a_qe * c_qe)) / (2.0 * a_qe); - - # Scale and shift "w_1" in the "DW" directions to produce the result: - - ones = matrix (1.0, rows = dim_w, cols = 1); - W = w_1 %*% meanLT + DW * (ones %*% x_qe); -} - -# Computes vector w such that ||X %*% w - 1|| -> MIN given avg(X %*% w) = 1 -# We find z_LS such that ||X %*% z_LS - 1|| -> MIN unconditionally, then scale -# it to compute w = c * z_LS such that sum(X %*% w) = nrow(X). -straightenX = - function (Matrix[double] X) - return (Matrix[double] w) -{ - w_X = t(colSums(X)); - lambda_LS = 0.000001 * sum(X ^ 2) / ncol(X); - eps = 0.000000001 * nrow(X); - - # BEGIN LEAST SQUARES - - r_LS = - w_X; - z_LS = matrix (0.0, rows = ncol(X), cols = 1); - p_LS = - r_LS; - norm_r2_LS = sum (r_LS ^ 2); - i_LS = 0; - while (i_LS < 50 & i_LS < ncol(X) & norm_r2_LS >= eps) - { - temp_LS = X %*% p_LS; - q_LS = (t(X) %*% temp_LS) + lambda_LS * p_LS; - alpha_LS = norm_r2_LS / sum (p_LS * q_LS); - z_LS = z_LS + alpha_LS * p_LS; - old_norm_r2_LS = norm_r2_LS; - r_LS = r_LS + alpha_LS * q_LS; - norm_r2_LS = sum (r_LS ^ 2); - p_LS = -r_LS + (norm_r2_LS / old_norm_r2_LS) * p_LS; - i_LS = i_LS + 1; - } - - # END LEAST SQUARES - - w = (nrow(X) / sum (w_X * z_LS)) * z_LS; -} +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# +# generates random data to test bi- and multinomial logistic regression + +# $N = number of training samples +# $Nt = number of test samples (or 0 if none) +# $nf = number of features (independent variables) +# $nc = number of categories; = 1 if "binomial" with +1/-1 labels +# $Xmin = minimum feature value +# $Xmax = maximum feature value +# $spars = controls sparsity in the generated data +# $avgLTmin = average linear term (X %*% beta + intercept), minimum value +# $avgLTmax = average linear term (X %*% beta + intercept), maximum value +# $stdLT = requested standard deviation for the linear terms +# $iceptmin = intercept, minimum value (0.0 disables intercept) +# $iceptmax = intercept, maximum value (0.0 disables intercept) +# $B = location to store generated regression parameters +# $X = location to store generated training data +# $Y = location to store generated training category labels +# $Xt = location to store generated test data +# $Yt = location to store generated test category labels +# +# Example: +# hadoop jar SystemML.jar -f genRandData4LogReg_LTstats.dml -nvargs +# N=1000000 Nt=1000 nf=20 nc=3 Xmin=0.0 Xmax=1.0 spars=1.0 avgLTmin=3.0 avgLTmax=5.0 stdLT=1.25 +# iceptmin=1.0 iceptmax=1.0 B=./B123 X=./X123 Y=./Y123 Xt=./Xt123 Yt=./Yt123 + +numTrainingSamples = $N; +numTestSamples = $Nt; +numFeatures = $nf; +numCategories = $nc; +minIntercept = $iceptmin; +maxIntercept = $iceptmax; +minXentry = $Xmin; +maxXentry = $Xmax; +minAvgLT = $avgLTmin; +maxAvgLT = $avgLTmax; +sparsityLevel = $spars; +stdevLT = $stdLT; +fileB = ifdef ($B, "B"); +fileX = ifdef ($X, "X"); +fileY = ifdef ($Y, "Y"); +fileXt = ifdef ($Xt, "Xt"); +fileYt = ifdef ($Yt, "Yt"); + + +numSamples = numTrainingSamples + numTestSamples; + +isBinomialPMOne = FALSE; +if (numCategories == 1) { + numCategories = 2; + isBinomialPMOne = TRUE; +} +do_we_output_intercept = 1; +if (minIntercept == 0.0 & maxIntercept == 0.0) { + do_we_output_intercept = 0; +} + +X = Rand (rows = numSamples, cols = numFeatures, min = minXentry, max = maxXentry, pdf = "uniform", sparsity = sparsityLevel); + +meanLT = Rand (rows = 1, cols = numCategories - 1, min = minAvgLT, max = maxAvgLT, pdf = "uniform"); +sigmaLT = matrix (stdevLT, rows = 1, cols = numCategories - 1); +b_intercept = Rand (rows = 1, cols = numCategories - 1, min = minIntercept, max = maxIntercept, pdf = "uniform"); + +meanLT_minus_intercept = meanLT - b_intercept; +[B, new_sigmaLT] = generateWeights (X, meanLT_minus_intercept, sigmaLT); + +ones = matrix (1.0, rows = numSamples, cols = 1); +LT = X %*% B + ones %*% b_intercept; +actual_meanLT = colSums (LT) / numSamples; +actual_sigmaLT = sqrt (colSums ((LT - ones %*% actual_meanLT)^2) / numSamples); + +for (i in 1:(numCategories - 1)) { + if (castAsScalar (new_sigmaLT [1, i]) == castAsScalar (sigmaLT [1, i])) { + print ("Category " + i + ": Intercept = " + castAsScalar (b_intercept [1, i])); + } else { + print ("Category " + i + ": Intercept = " + castAsScalar (b_intercept [1, i]) + ", st.dev.(LT) relaxed from " + castAsScalar (sigmaLT [1, i])); + } + print (" Wanted LT mean = " + castAsScalar (meanLT [1, i]) + ", st.dev. = " + castAsScalar (new_sigmaLT [1, i])); + print (" Actual LT mean = " + castAsScalar (actual_meanLT [1, i]) + ", st.dev. = " + castAsScalar (actual_sigmaLT [1, i])); +} + + +ones = matrix (1.0, rows = 1, cols = numCategories - 1); +Prob = exp (LT); +Prob = Prob / ((1.0 + rowSums (Prob)) %*% ones); +Prob = t(cumsum (t(Prob))); + +r = Rand (rows = numSamples, cols = 1, min = 0, max = 1, pdf = "uniform", seed = 0); +R = r %*% ones; +Y = 1 + rowSums (ppred (Prob, R, "<")); +if (isBinomialPMOne) { + Y = 3 - 2 * Y; +} + + +/* USE FOR LINEAR REGRESSION + +r = Rand (rows = numSamples, cols = 1, pdf = "normal"); +Y = LT [, 1] + r; + +*/ + + +if (do_we_output_intercept == 1) { + new_B = matrix (0.0, rows = nrow(B) + 1, cols = ncol(B)); + new_B [1:nrow(B), 1:ncol(B)] = B; + new_B [nrow(B)+1, 1:ncol(B)] = b_intercept; + write (new_B, fileB, format="mm"); +} else { + write (B, fileB, format="mm"); +} + +if (numTestSamples > 0) { + X_train = X [1:numTrainingSamples,]; + Y_train = Y [1:numTrainingSamples,]; + X_test = X [(numTrainingSamples+1):numSamples,]; + Y_test = Y [(numTrainingSamples+1):numSamples,]; + write (X_train, fileX, format="mm"); + write (Y_train, fileY, format="mm"); + write (X_test, fileXt, format="mm"); + write (Y_test, fileYt, format="mm"); +} else { + write (X, fileX, format="mm"); + write (Y, fileY, format="mm"); +} + + + + + + +# Generates weight vectors to ensure the desired statistics for Linear Terms = X %*% W +# To be used for data generation in the testing of GLM, Logistic Regression, etc. +# INPUT: meanLT and sigmaLT are row vectors, meanLT[1, i] and sigmaLT[1, i] are +# the desired mean and standard deviation for X %*% W[, i] +# OUTPUT: "W" is the matrix of generated (column) weight vectors W[, i] +# new_sigmaLT[1, i] == sigmaLT[1, i] if the std.dev is successfully enforced, +# new_sigmaLT[1, i] > sigmaLT[1, i] if we had to relax this constraint. +generateWeights = + function (Matrix[double] X, Matrix[double] meanLT, Matrix[double] sigmaLT) + return (Matrix[double] W, Matrix[double] new_sigmaLT) +{ + num_w = ncol (meanLT); # Number of output weight vectors + dim_w = ncol (X); # Number of features / dimensions in a weight vector + w_X = t(colSums(X)); # "Prohibited" weight shift direction that changes meanLT + # (all orthogonal shift directions do not affect meanLT) + + # Compute "w_1" with meanLT = 1 and with the smallest possible sigmaLT + + w_1 = straightenX (X); + r_1 = (X %*% w_1) - 1.0; + norm_r_1_sq = sum (r_1 ^ 2); + + # For each W[, i] generate uniformly random directions to shift away from "w_1" + + DW_raw = Rand (rows = dim_w, cols = num_w, pdf = "normal"); + DW = DW_raw - (w_X %*% t(w_X) %*% DW_raw) / sum (w_X ^ 2); # Orthogonal to w_X + XDW = X %*% DW; + + # Determine how far to shift in the chosen directions to satisfy the constraints + # Use the positive root of the quadratic equation; relax sigmaLT where needed + + a_qe = colSums (XDW ^ 2); + b_qe = 2.0 * meanLT * (t(r_1) %*% XDW); + c_qe = meanLT^2 * norm_r_1_sq - sigmaLT^2 * nrow(X); + + is_sigmaLT_OK = ppred (c_qe, 0.0, "<="); + new_sigmaLT = is_sigmaLT_OK * sigmaLT + (1 - is_sigmaLT_OK) * abs (meanLT) * sqrt (norm_r_1_sq / nrow(X)); + c_qe = is_sigmaLT_OK * c_qe; + x_qe = (- b_qe + sqrt (b_qe * b_qe - 4.0 * a_qe * c_qe)) / (2.0 * a_qe); + + # Scale and shift "w_1" in the "DW" directions to produce the result: + + ones = matrix (1.0, rows = dim_w, cols = 1); + W = w_1 %*% meanLT + DW * (ones %*% x_qe); +} + +# Computes vector w such that ||X %*% w - 1|| -> MIN given avg(X %*% w) = 1 +# We find z_LS such that ||X %*% z_LS - 1|| -> MIN unconditionally, then scale +# it to compute w = c * z_LS such that sum(X %*% w) = nrow(X). +straightenX = + function (Matrix[double] X) + return (Matrix[double] w) +{ + w_X = t(colSums(X)); + lambda_LS = 0.000001 * sum(X ^ 2) / ncol(X); + eps = 0.000000001 * nrow(X); + + # BEGIN LEAST SQUARES + + r_LS = - w_X; + z_LS = matrix (0.0, rows = ncol(X), cols = 1); + p_LS = - r_LS; + norm_r2_LS = sum (r_LS ^ 2); + i_LS = 0; + while (i_LS < 50 & i_LS < ncol(X) & norm_r2_LS >= eps) + { + temp_LS = X %*% p_LS; + q_LS = (t(X) %*% temp_LS) + lambda_LS * p_LS; + alpha_LS = norm_r2_LS / sum (p_LS * q_LS); + z_LS = z_LS + alpha_LS * p_LS; + old_norm_r2_LS = norm_r2_LS; + r_LS = r_LS + alpha_LS * q_LS; + norm_r2_LS = sum (r_LS ^ 2); + p_LS = -r_LS + (norm_r2_LS / old_norm_r2_LS) * p_LS; + i_LS = i_LS + 1; + } + + # END LEAST SQUARES + + w = (nrow(X) / sum (w_X * z_LS)) * z_LS; +}
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/scripts/datagen/genRandData4MultiClassSVM.dml ---------------------------------------------------------------------- diff --git a/scripts/datagen/genRandData4MultiClassSVM.dml b/scripts/datagen/genRandData4MultiClassSVM.dml index 65ee1d4..5d9fbcb 100644 --- a/scripts/datagen/genRandData4MultiClassSVM.dml +++ b/scripts/datagen/genRandData4MultiClassSVM.dml @@ -1,68 +1,68 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -# generates random data to test linear logistic regression - -# $1 is number of samples -# $2 is number of features (independent variables) -# $3 is maximum feature value (absolute value) -# $4 is maximum weight (absolute value) -# $5 is location to store generated weights -# $6 is location to store generated data -# $7 is location to store generated labels -# $8 addNoise. if 0 then no noise is added, to add noise set this to 1 -# $9 is b, 0 disables intercept -# $10 controls sparsity in the generated data - -numSamples = $1 -numFeatures = $2 -maxFeatureValue = $3 -maxWeight = $4 -addNoise = $8 -b = $9 - -X = Rand(rows=numSamples, cols=numFeatures, min=-1, max=1, pdf="uniform", seed=0, sparsity=$10) -X = X * maxFeatureValue - -w = Rand(rows=numFeatures, cols=1, min=-1, max=1, pdf="uniform", seed=0) -w = w * maxWeight - -ot = X%*%w -if(b!=0) { - b_mat = Rand(rows=1, cols=1, min=b, max=b, pdf="uniform") - w = t(append(t(w), b_mat)) - ot = ot + b -} - -prob = 1/(1+exp(-ot)) -if(addNoise == 1){ - r = Rand(rows=numSamples, cols=1, min=0, max=1, pdf="uniform", seed=0) -}else{ - print("this data generator generates the same dataset for both noise=0 and noise=1") - r = Rand(rows=numSamples, cols=1, min=0, max=1, pdf="uniform", seed=0) - #r = Rand(rows=numSamples, cols=1, min=0.5, max=0.5, pdf="uniform") -} -Y = 1 - 2*ppred(prob, r, "<") -Y = (Y+3)/2 - -write(w, $5, format="binary") -write(X, $6, format="binary") -write(Y, $7, format="binary") +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# generates random data to test linear logistic regression + +# $1 is number of samples +# $2 is number of features (independent variables) +# $3 is maximum feature value (absolute value) +# $4 is maximum weight (absolute value) +# $5 is location to store generated weights +# $6 is location to store generated data +# $7 is location to store generated labels +# $8 addNoise. if 0 then no noise is added, to add noise set this to 1 +# $9 is b, 0 disables intercept +# $10 controls sparsity in the generated data + +numSamples = $1 +numFeatures = $2 +maxFeatureValue = $3 +maxWeight = $4 +addNoise = $8 +b = $9 + +X = Rand(rows=numSamples, cols=numFeatures, min=-1, max=1, pdf="uniform", seed=0, sparsity=$10) +X = X * maxFeatureValue + +w = Rand(rows=numFeatures, cols=1, min=-1, max=1, pdf="uniform", seed=0) +w = w * maxWeight + +ot = X%*%w +if(b!=0) { + b_mat = Rand(rows=1, cols=1, min=b, max=b, pdf="uniform") + w = t(append(t(w), b_mat)) + ot = ot + b +} + +prob = 1/(1+exp(-ot)) +if(addNoise == 1){ + r = Rand(rows=numSamples, cols=1, min=0, max=1, pdf="uniform", seed=0) +}else{ + print("this data generator generates the same dataset for both noise=0 and noise=1") + r = Rand(rows=numSamples, cols=1, min=0, max=1, pdf="uniform", seed=0) + #r = Rand(rows=numSamples, cols=1, min=0.5, max=0.5, pdf="uniform") +} +Y = 1 - 2*ppred(prob, r, "<") +Y = (Y+3)/2 + +write(w, $5, format="binary") +write(X, $6, format="binary") +write(Y, $7, format="binary") http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/scripts/datagen/genRandData4NMF.dml ---------------------------------------------------------------------- diff --git a/scripts/datagen/genRandData4NMF.dml b/scripts/datagen/genRandData4NMF.dml index 5988d48..cf18430 100644 --- a/scripts/datagen/genRandData4NMF.dml +++ b/scripts/datagen/genRandData4NMF.dml @@ -1,129 +1,129 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -# generates random data for non-negative -# matrix factorization -# -# follows lda's generative model -# see Blei, Ng & Jordan, JMLR'03 paper -# titled Latent Dirichlet Allocation -# -# $1 is number of samples -# $2 is number of features -# $3 is number of latent factors -# $4 is number of features per sample -# (may overlap). use this to vary -# sparsity. -# $5 is file to store sample mixtures -# $6 is file to store factors -# $7 is file to store generated data - -numDocuments = $1 -numFeatures = $2 -numTopics = $3 -numWordsPerDoc = $4 - -docTopicMixtures = Rand(rows=numDocuments, cols=numTopics, min=0.0, max=1.0, pdf="uniform", seed=0, sparsity=0.75) -denomsTM = rowSums(docTopicMixtures) -zerosInDenomsTM = ppred(denomsTM, 0, "==") -denomsTM = 0.1*zerosInDenomsTM + (1-zerosInDenomsTM)*denomsTM -parfor(i in 1:numTopics){ - docTopicMixtures[,i] = docTopicMixtures[,i]/denomsTM -} -write(docTopicMixtures, $5, format="binary") -for(j in 2:numTopics){ - docTopicMixtures[,j] = docTopicMixtures[,j-1] + docTopicMixtures[,j] -} - -topicDistributions = Rand(rows=numTopics, cols=numFeatures, min=0.0, max=1.0, pdf="uniform", seed=0, sparsity=0.75) -parfor(i in 1:numTopics){ - topicDist = topicDistributions[i,] - - denom2 = sum(topicDist) - if(denom2 == 0){ - denom2 = denom2 + 0.1 - } - - topicDistributions[i,] = topicDist / denom2 -} -write(topicDistributions, $6, format="binary") -for(j in 2:numFeatures){ - topicDistributions[,j] = topicDistributions[,j-1] + topicDistributions[,j] -} - -data = Rand(rows=numDocuments, cols=numFeatures, min=0, max=0, pdf="uniform") - -parfor(i in 1:numDocuments){ - docTopic = docTopicMixtures[i,] - - ldata = Rand(rows=1, cols=numFeatures, min=0, max=0, pdf="uniform"); - - r_z = Rand(rows=numWordsPerDoc, cols=1, min=0, max=1, pdf="uniform", seed=0) - r_w = Rand(rows=numWordsPerDoc, cols=1, min=0, max=1, pdf="uniform", seed=0) - - for(j in 1:numWordsPerDoc){ - rz = castAsScalar(r_z[j,1]) - continue = 1 - - z = -1 - #this is a workaround - #z=1 - - for(k1 in 1:numTopics){ - prob = castAsScalar(docTopic[1,k1]) - if(continue==1 & rz <= prob){ - z=k1 - continue=0 - } - } - - if(z==-1){ - print("z is unassigned: " + z) - z = numTopics - } - - rw = castAsScalar(r_w[j,1]) - continue = 1 - - w = -1 - #this is a workaround - #w = 1 - - for(k2 in 1:numFeatures){ - prob = castAsScalar(topicDistributions[z,k2]) - if(continue == 1 & rw <= prob){ - w = k2 - continue = 0 - } - } - - if(w==-1){ - print("w is unassigned: " + w) - w = numFeatures - } - - ldata[1,w] = ldata[1,w] + 1 - } - - data[i,] = ldata; -} - -write(data, $7, format="binary") +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# generates random data for non-negative +# matrix factorization +# +# follows lda's generative model +# see Blei, Ng & Jordan, JMLR'03 paper +# titled Latent Dirichlet Allocation +# +# $1 is number of samples +# $2 is number of features +# $3 is number of latent factors +# $4 is number of features per sample +# (may overlap). use this to vary +# sparsity. +# $5 is file to store sample mixtures +# $6 is file to store factors +# $7 is file to store generated data + +numDocuments = $1 +numFeatures = $2 +numTopics = $3 +numWordsPerDoc = $4 + +docTopicMixtures = Rand(rows=numDocuments, cols=numTopics, min=0.0, max=1.0, pdf="uniform", seed=0, sparsity=0.75) +denomsTM = rowSums(docTopicMixtures) +zerosInDenomsTM = ppred(denomsTM, 0, "==") +denomsTM = 0.1*zerosInDenomsTM + (1-zerosInDenomsTM)*denomsTM +parfor(i in 1:numTopics){ + docTopicMixtures[,i] = docTopicMixtures[,i]/denomsTM +} +write(docTopicMixtures, $5, format="binary") +for(j in 2:numTopics){ + docTopicMixtures[,j] = docTopicMixtures[,j-1] + docTopicMixtures[,j] +} + +topicDistributions = Rand(rows=numTopics, cols=numFeatures, min=0.0, max=1.0, pdf="uniform", seed=0, sparsity=0.75) +parfor(i in 1:numTopics){ + topicDist = topicDistributions[i,] + + denom2 = sum(topicDist) + if(denom2 == 0){ + denom2 = denom2 + 0.1 + } + + topicDistributions[i,] = topicDist / denom2 +} +write(topicDistributions, $6, format="binary") +for(j in 2:numFeatures){ + topicDistributions[,j] = topicDistributions[,j-1] + topicDistributions[,j] +} + +data = Rand(rows=numDocuments, cols=numFeatures, min=0, max=0, pdf="uniform") + +parfor(i in 1:numDocuments){ + docTopic = docTopicMixtures[i,] + + ldata = Rand(rows=1, cols=numFeatures, min=0, max=0, pdf="uniform"); + + r_z = Rand(rows=numWordsPerDoc, cols=1, min=0, max=1, pdf="uniform", seed=0) + r_w = Rand(rows=numWordsPerDoc, cols=1, min=0, max=1, pdf="uniform", seed=0) + + for(j in 1:numWordsPerDoc){ + rz = castAsScalar(r_z[j,1]) + continue = 1 + + z = -1 + #this is a workaround + #z=1 + + for(k1 in 1:numTopics){ + prob = castAsScalar(docTopic[1,k1]) + if(continue==1 & rz <= prob){ + z=k1 + continue=0 + } + } + + if(z==-1){ + print("z is unassigned: " + z) + z = numTopics + } + + rw = castAsScalar(r_w[j,1]) + continue = 1 + + w = -1 + #this is a workaround + #w = 1 + + for(k2 in 1:numFeatures){ + prob = castAsScalar(topicDistributions[z,k2]) + if(continue == 1 & rw <= prob){ + w = k2 + continue = 0 + } + } + + if(w==-1){ + print("w is unassigned: " + w) + w = numFeatures + } + + ldata[1,w] = ldata[1,w] + 1 + } + + data[i,] = ldata; +} + +write(data, $7, format="binary") http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/scripts/datagen/genRandData4NMFBlockwise.dml ---------------------------------------------------------------------- diff --git a/scripts/datagen/genRandData4NMFBlockwise.dml b/scripts/datagen/genRandData4NMFBlockwise.dml index 63133da..e3fd67f 100644 --- a/scripts/datagen/genRandData4NMFBlockwise.dml +++ b/scripts/datagen/genRandData4NMFBlockwise.dml @@ -1,138 +1,138 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -# generates random data for non-negative -# matrix factorization -# -# follows lda's generative model -# see Blei, Ng & Jordan, JMLR'03 paper -# titled Latent Dirichlet Allocation -# -# $1 is number of samples -# $2 is number of features -# $3 is number of latent factors -# $4 is number of features per sample -# (may overlap). use this to vary -# sparsity. -# $5 is file to store sample mixtures -# $6 is file to store factors -# $7 is file to store generated data -# -# $8 is the blocksize, i.e., number of rows per block -# (should be set such that $8x$2 fits in mem budget) - -numDocuments = $1 -numFeatures = $2 -numTopics = $3 -numWordsPerDoc = $4 -blocksize = $8 - -docTopicMixtures = Rand(rows=numDocuments, cols=numTopics, min=0.0, max=1.0, pdf="uniform", seed=0, sparsity=0.75) -denomsTM = rowSums(docTopicMixtures) -zerosInDenomsTM = ppred(denomsTM, 0, "==") -denomsTM = 0.1*zerosInDenomsTM + (1-zerosInDenomsTM)*denomsTM -parfor(i in 1:numTopics){ - docTopicMixtures[,i] = docTopicMixtures[,i]/denomsTM -} -write(docTopicMixtures, $5, format="binary") -for(j in 2:numTopics){ - docTopicMixtures[,j] = docTopicMixtures[,j-1] + docTopicMixtures[,j] -} - -topicDistributions = Rand(rows=numTopics, cols=numFeatures, min=0.0, max=1.0, pdf="uniform", seed=0, sparsity=0.75) -parfor(i in 1:numTopics){ - topicDist = topicDistributions[i,] - - denom2 = sum(topicDist) - if(denom2 == 0){ - denom2 = denom2 + 0.1 - } - - topicDistributions[i,] = topicDist / denom2 -} -write(topicDistributions, $6, format="binary") -for(j in 2:numFeatures){ - topicDistributions[,j] = topicDistributions[,j-1] + topicDistributions[,j] -} - -data0 = Rand(rows=numDocuments, cols=numFeatures, min=0, max=0, pdf="uniform") - -#outer-loop for blockwise computation -for( k in seq(1,numDocuments,blocksize) ) -{ - len = min(blocksize,numDocuments-k); #block length - data = data0[k:(k+len),]; #obtain block - - parfor(i in 1:len){ - docTopic = docTopicMixtures[i,] - - r_z = Rand(rows=numWordsPerDoc, cols=1, min=0, max=1, pdf="uniform", seed=0) - r_w = Rand(rows=numWordsPerDoc, cols=1, min=0, max=1, pdf="uniform", seed=0) - - for(j in 1:numWordsPerDoc){ - rz = castAsScalar(r_z[j,1]) - continue = 1 - - z = -1 - #this is a workaround - #z=1 - - for(k1 in 1:numTopics){ - prob = castAsScalar(docTopic[1,k1]) - if(continue==1 & rz <= prob){ - z=k1 - continue=0 - } - } - - if(z==-1){ - print("z is unassigned: " + z) - z = numTopics - } - - rw = castAsScalar(r_w[j,1]) - continue = 1 - - w = -1 - #this is a workaround - #w = 1 - - for(k2 in 1:numFeatures){ - prob = castAsScalar(topicDistributions[z,k2]) - if(continue == 1 & rw <= prob){ - w = k2 - continue = 0 - } - } - - if(w==-1){ - print("w is unassigned: " + w) - w = numFeatures - } - - data[i,w] = data[i,w] + 1 - } - } - - data0[k:(k+len),] = data; # write block back -} - -write(data0, $7, format="binary") +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# generates random data for non-negative +# matrix factorization +# +# follows lda's generative model +# see Blei, Ng & Jordan, JMLR'03 paper +# titled Latent Dirichlet Allocation +# +# $1 is number of samples +# $2 is number of features +# $3 is number of latent factors +# $4 is number of features per sample +# (may overlap). use this to vary +# sparsity. +# $5 is file to store sample mixtures +# $6 is file to store factors +# $7 is file to store generated data +# +# $8 is the blocksize, i.e., number of rows per block +# (should be set such that $8x$2 fits in mem budget) + +numDocuments = $1 +numFeatures = $2 +numTopics = $3 +numWordsPerDoc = $4 +blocksize = $8 + +docTopicMixtures = Rand(rows=numDocuments, cols=numTopics, min=0.0, max=1.0, pdf="uniform", seed=0, sparsity=0.75) +denomsTM = rowSums(docTopicMixtures) +zerosInDenomsTM = ppred(denomsTM, 0, "==") +denomsTM = 0.1*zerosInDenomsTM + (1-zerosInDenomsTM)*denomsTM +parfor(i in 1:numTopics){ + docTopicMixtures[,i] = docTopicMixtures[,i]/denomsTM +} +write(docTopicMixtures, $5, format="binary") +for(j in 2:numTopics){ + docTopicMixtures[,j] = docTopicMixtures[,j-1] + docTopicMixtures[,j] +} + +topicDistributions = Rand(rows=numTopics, cols=numFeatures, min=0.0, max=1.0, pdf="uniform", seed=0, sparsity=0.75) +parfor(i in 1:numTopics){ + topicDist = topicDistributions[i,] + + denom2 = sum(topicDist) + if(denom2 == 0){ + denom2 = denom2 + 0.1 + } + + topicDistributions[i,] = topicDist / denom2 +} +write(topicDistributions, $6, format="binary") +for(j in 2:numFeatures){ + topicDistributions[,j] = topicDistributions[,j-1] + topicDistributions[,j] +} + +data0 = Rand(rows=numDocuments, cols=numFeatures, min=0, max=0, pdf="uniform") + +#outer-loop for blockwise computation +for( k in seq(1,numDocuments,blocksize) ) +{ + len = min(blocksize,numDocuments-k); #block length + data = data0[k:(k+len),]; #obtain block + + parfor(i in 1:len){ + docTopic = docTopicMixtures[i,] + + r_z = Rand(rows=numWordsPerDoc, cols=1, min=0, max=1, pdf="uniform", seed=0) + r_w = Rand(rows=numWordsPerDoc, cols=1, min=0, max=1, pdf="uniform", seed=0) + + for(j in 1:numWordsPerDoc){ + rz = castAsScalar(r_z[j,1]) + continue = 1 + + z = -1 + #this is a workaround + #z=1 + + for(k1 in 1:numTopics){ + prob = castAsScalar(docTopic[1,k1]) + if(continue==1 & rz <= prob){ + z=k1 + continue=0 + } + } + + if(z==-1){ + print("z is unassigned: " + z) + z = numTopics + } + + rw = castAsScalar(r_w[j,1]) + continue = 1 + + w = -1 + #this is a workaround + #w = 1 + + for(k2 in 1:numFeatures){ + prob = castAsScalar(topicDistributions[z,k2]) + if(continue == 1 & rw <= prob){ + w = k2 + continue = 0 + } + } + + if(w==-1){ + print("w is unassigned: " + w) + w = numFeatures + } + + data[i,w] = data[i,w] + 1 + } + } + + data0[k:(k+len),] = data; # write block back +} + +write(data0, $7, format="binary") http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/scripts/datagen/genRandData4SurvAnalysis.dml ---------------------------------------------------------------------- diff --git a/scripts/datagen/genRandData4SurvAnalysis.dml b/scripts/datagen/genRandData4SurvAnalysis.dml index 7ac235b..da94a22 100644 --- a/scripts/datagen/genRandData4SurvAnalysis.dml +++ b/scripts/datagen/genRandData4SurvAnalysis.dml @@ -1,133 +1,133 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -# -# THIS SCRIPT GENERATED RANDOM DATA FOR KAPLAN-MEIER AND COX PROPORTIONAL HAZARD MODELS -# ASSUMPTION: BASELINE HAZARD HAS WEIBULL DISTIRUTION WITH PARAMETERS LAMBDA AND V -# -# INPUT PARAMETERS: -# --------------------------------------------------------------------------------------------- -# NAME TYPE DEFAULT MEANING -# --------------------------------------------------------------------------------------------- -# type Sting --- The type of model for which the data is being generated: "kaplan-meier" or "cox" -# n Int Number of records -# lambda Double 2.0 Scale parameter of the Weibull distribution used for generating timestamps -# v Double 1.5 Shape parameter of the Weibull distribution used for generating timestamps -# p Double 0.8 1 - probability of a record being censored -# g Int 2 If type=kaplan-meier the number of categorical features used for grouping -# s Int 1 If type=kaplan-meier the number of categorical features used for stratifying -# f Int 10 If type=kaplan-meier maximum number of levels (i.e., distinct values) of g+s categorical features -# m Int 100 If type=cox the number of features in the model -# sp Double 1.0 If type=cox the sparsity of the feature matrix -# O String --- Location to write the output matrix containing random data for the kaplan-meier or the cox model -# B String --- If type=cox location to write the output matrix containing the coefficients for the cox model -# TE String --- Location to store column indices of X corresponding to timestamp (first row) and event information (second row) -# F String --- Location to store column indices of X which are to be used for fitting the Cox model -# fmt String "text" The output format of results of the kaplan-meier analysis, such as "text" or "csv" -# --------------------------------------------------------------------------------------------- -# OUTPUTS: -# 1- If type=kaplan-meier an n x (2+g+s) matrix O with -# - column 1 contains timestamps generated randomly from a Weibull distribution with parameters lambda and v -# - column 2 contains the information whether an event occurred (1) or data is censored (0) -# - columns 3:2+g contain categorical features used for grouping -# - columns 3+g:2+g+s contain categorical features used for stratifying -# if type=cox an n x (2+m) matrix O with -# - column 1 contains timestamps generated randomly from a Weibull distribution with parameters lambda and v -# - column 2 contains the information whether an event occurred (1) or data is censored (0) -# - columns 3:2+m contain scale features -# 2- If type=cox a coefficient matrix B -# 3- A colum matrix TE containing the column indices of X corresponding to timestamp (first row) and event information (second row) -# 4- A column matrix F containing the column indices of X which are to be used for KM analysis or fitting the Cox model - -type = $type; # either "kaplan-meier" or "cox" -num_records = $n; -lambda = ifdef ($l, 2.0); -p_event = ifdef ($p, 0.8); # 1 - prob. of a record being censored -# parameters related to the kaplan-meier model -n_groups = ifdef ($g, 2); -n_strata = ifdef ($s, 1); -max_level = ifdef ($f, 10); -# parameters related to the cox model -num_features = ifdef ($m, 1000); -sparsity = ifdef ($sp, 1.0); -fileO = $O; -fileB = $B; -fileTE = $TE; -fileF = $F; -fmtO = ifdef ($fmt, "text"); # $fmt="text" -p_censor = 1 - p_event; # prob. that record is censored - -if (type == "kaplan-meier") { - - v = ifdef ($v, 1.5); - # generate categorical features used for grouping and stratifying - X = ceil (rand (rows = num_records, cols = n_groups + n_strata, min = 0.000000001, max = max_level - 0.000000001, pdf = "uniform")); - - # generate timestamps - U = rand (rows = num_records, cols = 1, min = 0.000000001, max = 1); - T = (-log (U) / lambda) ^ (1/v); - -} else if (type == "cox") { - - v = ifdef ($v, 50); - # generate feature matrix - X = rand (rows = num_records, cols = num_features, min = 1, max = 5, pdf = "uniform", sparsity = sparsity); - - # generate coefficients - B = rand (rows = num_features, cols = 1, min = -1.0, max = 1.0, pdf = "uniform", sparsity = 1.0); # * beta_range; - - # generate timestamps - U = rand (rows = num_records, cols = 1, min = 0.000000001, max = 1); - T = (-log (U) / (lambda * exp (X %*% B)) ) ^ (1/v); - -} else { - stop ("Wrong model type!"); -} - -Y = matrix (0, rows = num_records, cols = 2); -event = floor (rand (rows = num_records, cols = 1, min = (1 - p_censor), max = (1 + p_event))); -n_time = sum (event); -Y[,2] = event; - -# binning of event times -min_T = min (T); -max_T = max (T); -# T = T - min_T; -len = max_T - min_T; -num_bins = len / n_time; -T = ceil (T / num_bins); - -# print ("min(T) " + min(T) + " max(T) " + max(T)); -Y[,1] = T; - -O = append (Y, X); -write (O, fileO, format = fmtO); - -if (type == "cox") { - write (B, fileB, format = fmtO); - -} - -TE = matrix ("1 2", rows = 2, cols = 1); -F = seq (1, num_features); -write (TE, fileTE, format = fmtO); -write (F, fileF, format = fmtO); - +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# +# THIS SCRIPT GENERATED RANDOM DATA FOR KAPLAN-MEIER AND COX PROPORTIONAL HAZARD MODELS +# ASSUMPTION: BASELINE HAZARD HAS WEIBULL DISTIRUTION WITH PARAMETERS LAMBDA AND V +# +# INPUT PARAMETERS: +# --------------------------------------------------------------------------------------------- +# NAME TYPE DEFAULT MEANING +# --------------------------------------------------------------------------------------------- +# type Sting --- The type of model for which the data is being generated: "kaplan-meier" or "cox" +# n Int Number of records +# lambda Double 2.0 Scale parameter of the Weibull distribution used for generating timestamps +# v Double 1.5 Shape parameter of the Weibull distribution used for generating timestamps +# p Double 0.8 1 - probability of a record being censored +# g Int 2 If type=kaplan-meier the number of categorical features used for grouping +# s Int 1 If type=kaplan-meier the number of categorical features used for stratifying +# f Int 10 If type=kaplan-meier maximum number of levels (i.e., distinct values) of g+s categorical features +# m Int 100 If type=cox the number of features in the model +# sp Double 1.0 If type=cox the sparsity of the feature matrix +# O String --- Location to write the output matrix containing random data for the kaplan-meier or the cox model +# B String --- If type=cox location to write the output matrix containing the coefficients for the cox model +# TE String --- Location to store column indices of X corresponding to timestamp (first row) and event information (second row) +# F String --- Location to store column indices of X which are to be used for fitting the Cox model +# fmt String "text" The output format of results of the kaplan-meier analysis, such as "text" or "csv" +# --------------------------------------------------------------------------------------------- +# OUTPUTS: +# 1- If type=kaplan-meier an n x (2+g+s) matrix O with +# - column 1 contains timestamps generated randomly from a Weibull distribution with parameters lambda and v +# - column 2 contains the information whether an event occurred (1) or data is censored (0) +# - columns 3:2+g contain categorical features used for grouping +# - columns 3+g:2+g+s contain categorical features used for stratifying +# if type=cox an n x (2+m) matrix O with +# - column 1 contains timestamps generated randomly from a Weibull distribution with parameters lambda and v +# - column 2 contains the information whether an event occurred (1) or data is censored (0) +# - columns 3:2+m contain scale features +# 2- If type=cox a coefficient matrix B +# 3- A colum matrix TE containing the column indices of X corresponding to timestamp (first row) and event information (second row) +# 4- A column matrix F containing the column indices of X which are to be used for KM analysis or fitting the Cox model + +type = $type; # either "kaplan-meier" or "cox" +num_records = $n; +lambda = ifdef ($l, 2.0); +p_event = ifdef ($p, 0.8); # 1 - prob. of a record being censored +# parameters related to the kaplan-meier model +n_groups = ifdef ($g, 2); +n_strata = ifdef ($s, 1); +max_level = ifdef ($f, 10); +# parameters related to the cox model +num_features = ifdef ($m, 1000); +sparsity = ifdef ($sp, 1.0); +fileO = $O; +fileB = $B; +fileTE = $TE; +fileF = $F; +fmtO = ifdef ($fmt, "text"); # $fmt="text" +p_censor = 1 - p_event; # prob. that record is censored + +if (type == "kaplan-meier") { + + v = ifdef ($v, 1.5); + # generate categorical features used for grouping and stratifying + X = ceil (rand (rows = num_records, cols = n_groups + n_strata, min = 0.000000001, max = max_level - 0.000000001, pdf = "uniform")); + + # generate timestamps + U = rand (rows = num_records, cols = 1, min = 0.000000001, max = 1); + T = (-log (U) / lambda) ^ (1/v); + +} else if (type == "cox") { + + v = ifdef ($v, 50); + # generate feature matrix + X = rand (rows = num_records, cols = num_features, min = 1, max = 5, pdf = "uniform", sparsity = sparsity); + + # generate coefficients + B = rand (rows = num_features, cols = 1, min = -1.0, max = 1.0, pdf = "uniform", sparsity = 1.0); # * beta_range; + + # generate timestamps + U = rand (rows = num_records, cols = 1, min = 0.000000001, max = 1); + T = (-log (U) / (lambda * exp (X %*% B)) ) ^ (1/v); + +} else { + stop ("Wrong model type!"); +} + +Y = matrix (0, rows = num_records, cols = 2); +event = floor (rand (rows = num_records, cols = 1, min = (1 - p_censor), max = (1 + p_event))); +n_time = sum (event); +Y[,2] = event; + +# binning of event times +min_T = min (T); +max_T = max (T); +# T = T - min_T; +len = max_T - min_T; +num_bins = len / n_time; +T = ceil (T / num_bins); + +# print ("min(T) " + min(T) + " max(T) " + max(T)); +Y[,1] = T; + +O = append (Y, X); +write (O, fileO, format = fmtO); + +if (type == "cox") { + write (B, fileB, format = fmtO); + +} + +TE = matrix ("1 2", rows = 2, cols = 1); +F = seq (1, num_features); +write (TE, fileTE, format = fmtO); +write (F, fileF, format = fmtO); + http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/scripts/datagen/genRandData4Transform.dml ---------------------------------------------------------------------- diff --git a/scripts/datagen/genRandData4Transform.dml b/scripts/datagen/genRandData4Transform.dml index b207629..bc799d6 100644 --- a/scripts/datagen/genRandData4Transform.dml +++ b/scripts/datagen/genRandData4Transform.dml @@ -1,96 +1,96 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -# -# Generates random data to test transform with -# -# rows, cols: dimensions of the data matrix to be generated -# prob_categorical: percentage of the generated cols to be categorical -# min_domain, max_domain: provide a range for domain sizes of the generated categorical cols -# prob_missing: percentage of the generated (scale) cols to have missing values -# prob_missing_cell: probability of a cell to have a missing value -# out_X, out_missing, out_categorical: output file names -# - -#params for size of data -num_rows = ifdef($rows, 1000) -num_cols = ifdef($cols, 25) - -#params for kind of cols -prob_categorical = ifdef($prob_cat, 0.1) -min_domain_size = ifdef($min_domain, 1) -max_domain_size = ifdef($max_domain, 10) - -#params for missing value cols -prob_missing_col = ifdef($prob_missing, 0.1) -prob_missing_val = ifdef($prob_missing_cell, 0.2) - -num_scalar_cols = as.double(num_cols) -num_categorical_cols = 0.0 -scalar_ind = matrix(1, rows=num_scalar_cols, cols=1) -if(prob_categorical > 0){ - categorical_ind = Rand(rows=num_cols, cols=1, min=0, max=1, pdf="uniform") - categorical_ind = ppred(categorical_ind, prob_categorical, "<") - categorical_col_ids = removeEmpty(target=seq(1, num_cols, 1)*categorical_ind, margin="rows") - num_categorical_cols = sum(categorical_ind) - write(categorical_col_ids, $out_categorical, format="csv") - - domain_sizes = Rand(rows=num_categorical_cols, cols=1, min=0, max=1, pdf="uniform") - domain_sizes = round(min_domain_size + (max_domain_size - min_domain_size)*domain_sizes) - - categorical_X = Rand(rows=num_rows, cols=num_categorical_cols, min=0, max=1, pdf="uniform") - categorical_X = t(round(1 + t(categorical_X)*(domain_sizes - 1))) - - scalar_ind = 1-categorical_ind -} - -scalar_col_ids = removeEmpty(target=seq(1, num_cols, 1)*scalar_ind, margin="rows") -num_scalar_cols = sum(scalar_ind) -scalar_X = Rand(rows=num_rows, cols=num_scalar_cols, min=0, max=1, pdf="uniform") - -if(num_categorical_cols > 0 & num_scalar_cols > 0){ - X = append(scalar_X, categorical_X) - permut_mat = table(seq(1, num_scalar_cols, 1), scalar_col_ids, num_scalar_cols, num_cols) - fill_in = matrix(0, rows=num_cols-num_scalar_cols, cols=num_cols) - permut_mat = t(append(t(permut_mat), t(fill_in))) - X = X %*% permut_mat -}else{ - if(num_categorical_cols > 0) X = categorical_X - else{ - if(num_scalar_cols > 0) X = scalar_X - else print("somehow, we've managed to compute that precisely 0 cols should be categorical and 0 cols should be scale") - } -} - -if(prob_missing_col > 0){ - missing_col_ind = Rand(rows=num_cols, cols=1, min=0, max=1, pdf="uniform") - missing_col_ind = ppred(missing_col_ind, prob_missing_col, "<") - #currently only support missing value imputation for scale cols - missing_col_ind = missing_col_ind * scalar_ind - missing_col_ids = removeEmpty(target=seq(1, num_cols, 1)*missing_col_ind, margin="rows") - missing_values = Rand(rows=num_rows, cols=nrow(missing_col_ids), min=0, max=1, pdf="uniform") - missing_values = ppred(missing_values, prob_missing_val, "<") - X = append(X, missing_values) - - write(missing_col_ids, $out_missing, format="csv") -} - -write(X, $out_X, format="csv") +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# +# Generates random data to test transform with +# +# rows, cols: dimensions of the data matrix to be generated +# prob_categorical: percentage of the generated cols to be categorical +# min_domain, max_domain: provide a range for domain sizes of the generated categorical cols +# prob_missing: percentage of the generated (scale) cols to have missing values +# prob_missing_cell: probability of a cell to have a missing value +# out_X, out_missing, out_categorical: output file names +# + +#params for size of data +num_rows = ifdef($rows, 1000) +num_cols = ifdef($cols, 25) + +#params for kind of cols +prob_categorical = ifdef($prob_cat, 0.1) +min_domain_size = ifdef($min_domain, 1) +max_domain_size = ifdef($max_domain, 10) + +#params for missing value cols +prob_missing_col = ifdef($prob_missing, 0.1) +prob_missing_val = ifdef($prob_missing_cell, 0.2) + +num_scalar_cols = as.double(num_cols) +num_categorical_cols = 0.0 +scalar_ind = matrix(1, rows=num_scalar_cols, cols=1) +if(prob_categorical > 0){ + categorical_ind = Rand(rows=num_cols, cols=1, min=0, max=1, pdf="uniform") + categorical_ind = ppred(categorical_ind, prob_categorical, "<") + categorical_col_ids = removeEmpty(target=seq(1, num_cols, 1)*categorical_ind, margin="rows") + num_categorical_cols = sum(categorical_ind) + write(categorical_col_ids, $out_categorical, format="csv") + + domain_sizes = Rand(rows=num_categorical_cols, cols=1, min=0, max=1, pdf="uniform") + domain_sizes = round(min_domain_size + (max_domain_size - min_domain_size)*domain_sizes) + + categorical_X = Rand(rows=num_rows, cols=num_categorical_cols, min=0, max=1, pdf="uniform") + categorical_X = t(round(1 + t(categorical_X)*(domain_sizes - 1))) + + scalar_ind = 1-categorical_ind +} + +scalar_col_ids = removeEmpty(target=seq(1, num_cols, 1)*scalar_ind, margin="rows") +num_scalar_cols = sum(scalar_ind) +scalar_X = Rand(rows=num_rows, cols=num_scalar_cols, min=0, max=1, pdf="uniform") + +if(num_categorical_cols > 0 & num_scalar_cols > 0){ + X = append(scalar_X, categorical_X) + permut_mat = table(seq(1, num_scalar_cols, 1), scalar_col_ids, num_scalar_cols, num_cols) + fill_in = matrix(0, rows=num_cols-num_scalar_cols, cols=num_cols) + permut_mat = t(append(t(permut_mat), t(fill_in))) + X = X %*% permut_mat +}else{ + if(num_categorical_cols > 0) X = categorical_X + else{ + if(num_scalar_cols > 0) X = scalar_X + else print("somehow, we've managed to compute that precisely 0 cols should be categorical and 0 cols should be scale") + } +} + +if(prob_missing_col > 0){ + missing_col_ind = Rand(rows=num_cols, cols=1, min=0, max=1, pdf="uniform") + missing_col_ind = ppred(missing_col_ind, prob_missing_col, "<") + #currently only support missing value imputation for scale cols + missing_col_ind = missing_col_ind * scalar_ind + missing_col_ids = removeEmpty(target=seq(1, num_cols, 1)*missing_col_ind, margin="rows") + missing_values = Rand(rows=num_rows, cols=nrow(missing_col_ids), min=0, max=1, pdf="uniform") + missing_values = ppred(missing_values, prob_missing_val, "<") + X = append(X, missing_values) + + write(missing_col_ids, $out_missing, format="csv") +} + +write(X, $out_X, format="csv") http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/scripts/datagen/genRandData4Univariate.dml ---------------------------------------------------------------------- diff --git a/scripts/datagen/genRandData4Univariate.dml b/scripts/datagen/genRandData4Univariate.dml index d3c842c..bcbd528 100644 --- a/scripts/datagen/genRandData4Univariate.dml +++ b/scripts/datagen/genRandData4Univariate.dml @@ -1,61 +1,61 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -# generates random numbers from a distribution -# with specified mean, standard deviation, -# skewness, kurtosis -# mean and standard deviation are taken in as -# arguments by this script -# a,b,c,d are coefficients computed by some -# equation solver determined from the specified -# skewness and kurtosis using power method -# polynomials -# -# for more details see: -# Statistical Simulation: Power Method Polynomials -# and Other Transformations -# Author: Todd C. Headrick -# Chapman & Hall/CRC, Boca Raton, FL, 2010. -# ISBN 978-1-4200-6490-2 - -# $1 is the number of random points to be sampled -# $2 is specified mean -# $3 is specified standard deviation -# $4-$7 are a,b,c,d obtained by solving a system -# of equations using specified kurtosis and skewness -# $8 is the file to write out the generated data to - -numSamples = $1 -mu = $2 -sigma = $3 -a = $4 -b = $5 -c = $6 -d = $7 - +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# generates random numbers from a distribution +# with specified mean, standard deviation, +# skewness, kurtosis +# mean and standard deviation are taken in as +# arguments by this script +# a,b,c,d are coefficients computed by some +# equation solver determined from the specified +# skewness and kurtosis using power method +# polynomials +# +# for more details see: +# Statistical Simulation: Power Method Polynomials +# and Other Transformations +# Author: Todd C. Headrick +# Chapman & Hall/CRC, Boca Raton, FL, 2010. +# ISBN 978-1-4200-6490-2 + +# $1 is the number of random points to be sampled +# $2 is specified mean +# $3 is specified standard deviation +# $4-$7 are a,b,c,d obtained by solving a system +# of equations using specified kurtosis and skewness +# $8 is the file to write out the generated data to + +numSamples = $1 +mu = $2 +sigma = $3 +a = $4 +b = $5 +c = $6 +d = $7 + print("a=" + a + " b=" + b + " c=" + c + " d=" + d) -X = Rand(rows=numSamples, cols=1, pdf="normal", seed=0) -Y = a + b*X + c*X^2 + d*X^3 - -Z = Y*sigma + mu -write(Z, $8, format="binary") +X = Rand(rows=numSamples, cols=1, pdf="normal", seed=0) +Y = a + b*X + c*X^2 + d*X^3 + +Z = Y*sigma + mu +write(Z, $8, format="binary") http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/scripts/staging/PPCA.dml ---------------------------------------------------------------------- diff --git a/scripts/staging/PPCA.dml b/scripts/staging/PPCA.dml index 667c709..abfcdb1 100644 --- a/scripts/staging/PPCA.dml +++ b/scripts/staging/PPCA.dml @@ -1,160 +1,160 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -# This script performs Probabilistic Principal Component Analysis (PCA) on the given input data. -# It is based on paper: sPCA: Scalable Principal Component Analysis for Big Data on Distributed -# Platforms. Tarek Elgamal et.al. - -# INPUT PARAMETERS: -# --------------------------------------------------------------------------------------------- -# NAME TYPE DEFAULT MEANING -# --------------------------------------------------------------------------------------------- -# X String --- location to read the matrix X input matrix -# k Int --- indicates dimension of the new vector space constructed from eigen vectors -# tolobj Int 0.00001 objective function tolerance value to stop ppca algorithm -# tolrecerr Int 0.02 reconstruction error tolerance value to stop the algorithm -# iter Int 10 maximum number of iterations -# fmt String 'text' output format of results PPCA such as "text" or "csv" -# hadoop jar SystemML.jar -f PPCA.dml -nvargs X=/INPUT_DIR/X C=/OUTPUT_DIR/C V=/OUTPUT_DIR/V k=2 tol=0.2 iter=100 -# --------------------------------------------------------------------------------------------- -# OUTPUT PARAMETERS: -# --------------------------------------------------------------------------------------------- -# NAME TYPE DEFAULT MEANING -# --------------------------------------------------------------------------------------------- -# C Matrix --- principal components -# V Matrix --- eigenvalues / eigenvalues of principal components -# - -X = read($X); - -fileC = $C; -fileV = $V; - -k = ifdef($k, ncol(X)); -iter = ifdef($iter, 10); -tolobj = ifdef($tolobj, 0.00001); -tolrecerr = ifdef($tolrecerr, 0.02); -fmt0 = ifdef($fmt, "text"); - -n = nrow(X); -m = ncol(X); - -#initializing principal components matrix -C = rand(rows=m, cols=k, pdf="normal"); -ss = rand(rows=1, cols=1, pdf="normal"); -ss = as.scalar(ss); -ssPrev = ss; - -# best selected principle components - with the lowest reconstruction error -PC = C; - -# initilizing reconstruction error -RE = tolrecerr+1; -REBest = RE; - -Z = matrix(0,rows=1,cols=1); - -#Objective function value -ObjRelChng = tolobj+1; - -# mean centered input matrix - dim -> [n,m] -Xm = X - colMeans(X); - -#I -> k x k -ITMP = matrix(1,rows=k,cols=1); -I = diag(ITMP); - -i = 0; -while (i < iter & ObjRelChng > tolobj & RE > tolrecerr){ - #Estimation step - Covariance matrix - #M -> k x k - M = t(C) %*% C + I*ss; - - #Auxilary matrix with n latent variables - # Z -> n x k - Z = Xm %*% (C %*% inv(M)); - - #ZtZ -> k x k - ZtZ = t(Z) %*% Z + inv(M)*ss; - - #XtZ -> m x k - XtZ = t(Xm) %*% Z; - - #Maximization step - #C -> m x k - ZtZ_sum = sum(ZtZ); #+n*inv(M)); - C = XtZ/ZtZ_sum; - - #ss2 -> 1 x 1 - ss2 = trace(ZtZ * (t(C) %*% C)); - - #ss3 -> 1 x 1 - ss3 = sum((Z %*% t(C)) %*% t(Xm)); - - #Frobenius norm of reconstruction error -> Euclidean norm - #Fn -> 1 x 1 - Fn = sum(Xm*Xm); - - #ss -> 1 x 1 - ss = (Fn + ss2 - 2*ss3)/(n*m); - - #calculating objective function relative change - ObjRelChng = abs(1 - ss/ssPrev); - #print("Objective Relative Change: " + ObjRelChng + ", Objective: " + ss); - - #Reconstruction error - R = ((Z %*% t(C)) - Xm); - - #calculate the error - #TODO rethink calculation of reconstruction error .... - #1-Norm of reconstruction error - a big dense matrix - #RE -> n x m - RE = abs(sum(R)/sum(Xm)); - if (RE < REBest){ - PC = C; - REBest = RE; - } - #print("ss: " + ss +" = Fn( "+ Fn +" ) + ss2( " + ss2 +" ) - 2*ss3( " + ss3 + " ), Reconstruction Error: " + RE); - - ssPrev = ss; - i = i+1; -} -print("Objective Relative Change: " + ObjRelChng); -print ("Number of iterations: " + i + ", Reconstruction Err: " + REBest); - -# reconstructs data -# RD -> n x k -RD = X %*% PC; - -# calculate eigenvalues - principle component variance -RDMean = colMeans(RD); -V = t(colMeans(RD*RD) - (RDMean*RDMean)); - -# sorting eigenvalues and eigenvectors in decreasing order -V_decr_idx = order(target=V,by=1,decreasing=TRUE,index.return=TRUE); -VF_decr = table(seq(1,nrow(V)),V_decr_idx); -V = VF_decr %*% V; -PC = PC %*% VF_decr; - -# writing principal components -write(PC, fileC, format=fmt0); -# writing eigen values/pc variance -write(V, fileV, format=fmt0); +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# This script performs Probabilistic Principal Component Analysis (PCA) on the given input data. +# It is based on paper: sPCA: Scalable Principal Component Analysis for Big Data on Distributed +# Platforms. Tarek Elgamal et.al. + +# INPUT PARAMETERS: +# --------------------------------------------------------------------------------------------- +# NAME TYPE DEFAULT MEANING +# --------------------------------------------------------------------------------------------- +# X String --- location to read the matrix X input matrix +# k Int --- indicates dimension of the new vector space constructed from eigen vectors +# tolobj Int 0.00001 objective function tolerance value to stop ppca algorithm +# tolrecerr Int 0.02 reconstruction error tolerance value to stop the algorithm +# iter Int 10 maximum number of iterations +# fmt String 'text' output format of results PPCA such as "text" or "csv" +# hadoop jar SystemML.jar -f PPCA.dml -nvargs X=/INPUT_DIR/X C=/OUTPUT_DIR/C V=/OUTPUT_DIR/V k=2 tol=0.2 iter=100 +# --------------------------------------------------------------------------------------------- +# OUTPUT PARAMETERS: +# --------------------------------------------------------------------------------------------- +# NAME TYPE DEFAULT MEANING +# --------------------------------------------------------------------------------------------- +# C Matrix --- principal components +# V Matrix --- eigenvalues / eigenvalues of principal components +# + +X = read($X); + +fileC = $C; +fileV = $V; + +k = ifdef($k, ncol(X)); +iter = ifdef($iter, 10); +tolobj = ifdef($tolobj, 0.00001); +tolrecerr = ifdef($tolrecerr, 0.02); +fmt0 = ifdef($fmt, "text"); + +n = nrow(X); +m = ncol(X); + +#initializing principal components matrix +C = rand(rows=m, cols=k, pdf="normal"); +ss = rand(rows=1, cols=1, pdf="normal"); +ss = as.scalar(ss); +ssPrev = ss; + +# best selected principle components - with the lowest reconstruction error +PC = C; + +# initilizing reconstruction error +RE = tolrecerr+1; +REBest = RE; + +Z = matrix(0,rows=1,cols=1); + +#Objective function value +ObjRelChng = tolobj+1; + +# mean centered input matrix - dim -> [n,m] +Xm = X - colMeans(X); + +#I -> k x k +ITMP = matrix(1,rows=k,cols=1); +I = diag(ITMP); + +i = 0; +while (i < iter & ObjRelChng > tolobj & RE > tolrecerr){ + #Estimation step - Covariance matrix + #M -> k x k + M = t(C) %*% C + I*ss; + + #Auxilary matrix with n latent variables + # Z -> n x k + Z = Xm %*% (C %*% inv(M)); + + #ZtZ -> k x k + ZtZ = t(Z) %*% Z + inv(M)*ss; + + #XtZ -> m x k + XtZ = t(Xm) %*% Z; + + #Maximization step + #C -> m x k + ZtZ_sum = sum(ZtZ); #+n*inv(M)); + C = XtZ/ZtZ_sum; + + #ss2 -> 1 x 1 + ss2 = trace(ZtZ * (t(C) %*% C)); + + #ss3 -> 1 x 1 + ss3 = sum((Z %*% t(C)) %*% t(Xm)); + + #Frobenius norm of reconstruction error -> Euclidean norm + #Fn -> 1 x 1 + Fn = sum(Xm*Xm); + + #ss -> 1 x 1 + ss = (Fn + ss2 - 2*ss3)/(n*m); + + #calculating objective function relative change + ObjRelChng = abs(1 - ss/ssPrev); + #print("Objective Relative Change: " + ObjRelChng + ", Objective: " + ss); + + #Reconstruction error + R = ((Z %*% t(C)) - Xm); + + #calculate the error + #TODO rethink calculation of reconstruction error .... + #1-Norm of reconstruction error - a big dense matrix + #RE -> n x m + RE = abs(sum(R)/sum(Xm)); + if (RE < REBest){ + PC = C; + REBest = RE; + } + #print("ss: " + ss +" = Fn( "+ Fn +" ) + ss2( " + ss2 +" ) - 2*ss3( " + ss3 + " ), Reconstruction Error: " + RE); + + ssPrev = ss; + i = i+1; +} +print("Objective Relative Change: " + ObjRelChng); +print ("Number of iterations: " + i + ", Reconstruction Err: " + REBest); + +# reconstructs data +# RD -> n x k +RD = X %*% PC; + +# calculate eigenvalues - principle component variance +RDMean = colMeans(RD); +V = t(colMeans(RD*RD) - (RDMean*RDMean)); + +# sorting eigenvalues and eigenvectors in decreasing order +V_decr_idx = order(target=V,by=1,decreasing=TRUE,index.return=TRUE); +VF_decr = table(seq(1,nrow(V)),V_decr_idx); +V = VF_decr %*% V; +PC = PC %*% VF_decr; + +# writing principal components +write(PC, fileC, format=fmt0); +# writing eigen values/pc variance +write(V, fileV, format=fmt0); http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/scripts/staging/regression/lasso/lasso.dml ---------------------------------------------------------------------- diff --git a/scripts/staging/regression/lasso/lasso.dml b/scripts/staging/regression/lasso/lasso.dml index fb520df..1da88d3 100644 --- a/scripts/staging/regression/lasso/lasso.dml +++ b/scripts/staging/regression/lasso/lasso.dml @@ -1,113 +1,113 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -#uses the sparsa algorithm to perform lasso regression - -X = read($X) -y = read($Y) -n = nrow(X) -m = ncol(X) - -#params -tol = 10^(-15) -M = 5 -tau = 1 -maxiter = 1000 - -#constants -eta = 2 -sigma = 0.01 -alpha_min = 10^(-30) -alpha_max = 10^(30) - -#init -alpha = 1 -w = Rand(rows=m, cols=1, min=0, max=1, pdf="uniform") -history = -1*10^30 * matrix(1, rows=M, cols=1) - -r = X %*% w - y -g = t(X) %*% r -obj = 0.5 * sum(r*r) + tau*sum(abs(w)) - -print("Initial OBJ=" + obj) - -history[M,1] = obj - -inactive_set = matrix(1, rows=m, cols=1) -iter = 0 -continue = TRUE -while(iter < maxiter & continue) { - dw = matrix(0, rows=m, cols=1) - dg = matrix(0, rows=m, cols=1) - relChangeObj = -1.0 - - inner_iter = 0 - inner_continue = TRUE - inner_maxiter = 100 - while(inner_iter < inner_maxiter & inner_continue) { - u = w - g/alpha - lambda = tau/alpha - - wnew = sign(u) * (abs(u) - lambda) * ppred(abs(u) - lambda, 0, ">") - dw = wnew - w - dw2 = sum(dw*dw) - - r = X %*% wnew - y - gnew = t(X) %*% r - objnew = 0.5 * sum(r*r) + tau*sum(abs(wnew)) - obj_threshold = max(history) - 0.5*sigma*alpha*dw2 - - if(objnew <= obj_threshold) { - w = wnew - dg = gnew - g - g = gnew - inner_continue = FALSE - - history[1:(M-1),] = history[2:M,] - history[M,1] = objnew - relChangeObj = abs(objnew - obj)/obj - obj = objnew - } - else - alpha = eta*alpha - - inner_iter = inner_iter + 1 - } - - if(inner_continue) - print("Inner loop did not converge") - - alphanew = sum(dw*dg)/sum(dw*dw) - alpha = max(alpha_min, min(alpha_max, alphanew)) - - old_inactive_set = inactive_set - inactive_set = ppred(w, 0, "!=") - diff = sum(abs(old_inactive_set - inactive_set)) - - if(diff == 0 & relChangeObj < tol) - continue = FALSE - - num_inactive = sum(ppred(w, 0, "!=")) - print("ITER=" + iter + " OBJ=" + obj + " relative change=" + relChangeObj + " num_inactive=" + num_inactive) - iter = iter + 1 -} - -write(w, $model) +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +#uses the sparsa algorithm to perform lasso regression + +X = read($X) +y = read($Y) +n = nrow(X) +m = ncol(X) + +#params +tol = 10^(-15) +M = 5 +tau = 1 +maxiter = 1000 + +#constants +eta = 2 +sigma = 0.01 +alpha_min = 10^(-30) +alpha_max = 10^(30) + +#init +alpha = 1 +w = Rand(rows=m, cols=1, min=0, max=1, pdf="uniform") +history = -1*10^30 * matrix(1, rows=M, cols=1) + +r = X %*% w - y +g = t(X) %*% r +obj = 0.5 * sum(r*r) + tau*sum(abs(w)) + +print("Initial OBJ=" + obj) + +history[M,1] = obj + +inactive_set = matrix(1, rows=m, cols=1) +iter = 0 +continue = TRUE +while(iter < maxiter & continue) { + dw = matrix(0, rows=m, cols=1) + dg = matrix(0, rows=m, cols=1) + relChangeObj = -1.0 + + inner_iter = 0 + inner_continue = TRUE + inner_maxiter = 100 + while(inner_iter < inner_maxiter & inner_continue) { + u = w - g/alpha + lambda = tau/alpha + + wnew = sign(u) * (abs(u) - lambda) * ppred(abs(u) - lambda, 0, ">") + dw = wnew - w + dw2 = sum(dw*dw) + + r = X %*% wnew - y + gnew = t(X) %*% r + objnew = 0.5 * sum(r*r) + tau*sum(abs(wnew)) + obj_threshold = max(history) - 0.5*sigma*alpha*dw2 + + if(objnew <= obj_threshold) { + w = wnew + dg = gnew - g + g = gnew + inner_continue = FALSE + + history[1:(M-1),] = history[2:M,] + history[M,1] = objnew + relChangeObj = abs(objnew - obj)/obj + obj = objnew + } + else + alpha = eta*alpha + + inner_iter = inner_iter + 1 + } + + if(inner_continue) + print("Inner loop did not converge") + + alphanew = sum(dw*dg)/sum(dw*dw) + alpha = max(alpha_min, min(alpha_max, alphanew)) + + old_inactive_set = inactive_set + inactive_set = ppred(w, 0, "!=") + diff = sum(abs(old_inactive_set - inactive_set)) + + if(diff == 0 & relChangeObj < tol) + continue = FALSE + + num_inactive = sum(ppred(w, 0, "!=")) + print("ITER=" + iter + " OBJ=" + obj + " relative change=" + relChangeObj + " num_inactive=" + num_inactive) + iter = iter + 1 +} + +write(w, $model)
