http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/scripts/algorithms/Kmeans.dml ---------------------------------------------------------------------- diff --git a/scripts/algorithms/Kmeans.dml b/scripts/algorithms/Kmeans.dml index 3ec47a8..2887baa 100644 --- a/scripts/algorithms/Kmeans.dml +++ b/scripts/algorithms/Kmeans.dml @@ -1,282 +1,282 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -# -# Implements the k-Means clustering algorithm -# -# INPUT PARAMETERS: -# ---------------------------------------------------------------------------- -# NAME TYPE DEFAULT MEANING -# ---------------------------------------------------------------------------- -# X String --- Location to read matrix X with the input data records -# k Int --- Number of centroids -# runs Int 10 Number of runs (with different initial centroids) -# maxi Int 1000 Maximum number of iterations per run -# tol Double 0.000001 Tolerance (epsilon) for WCSS change ratio -# samp Int 50 Average number of records per centroid in data samples -# C String "C.mtx" Location to store the output matrix with the centroids -# isY Int 0 0 = do not write Y, 1 = write Y -# Y String "Y.mtx" Location to store the mapping of records to centroids -# fmt String "text" Matrix output format, usually "text" or "csv" -# verb Int 0 0 = do not print per-iteration stats, 1 = print them -# ---------------------------------------------------------------------------- -# -# Example: -# hadoop jar SystemML.jar -f Kmeans.dml -nvargs X=X.mtx k=5 C=centroids.mtx -# hadoop jar SystemML.jar -f Kmeans.dml -nvargs X=X.mtx k=5 runs=100 maxi=5000 tol=0.00000001 samp=20 C=centroids.mtx isY=1 Y=clusters.mtx verb=1 - -fileX = $X; -fileY = ifdef ($Y, "Y.mtx"); -fileC = ifdef ($C, "C.mtx"); - -num_centroids = $k; -num_runs = ifdef ($runs, 10); # $runs=10; -max_iter = ifdef ($maxi, 1000); # $maxi=1000; -eps = ifdef ($tol, 0.000001); # $tol=0.000001; -is_write_Y = ifdef ($isY, 0); # $isY=0; -is_verbose = ifdef ($verb, 0); # $verb=0; -fmtCY = ifdef ($fmt, "text"); # $fmt="text"; -avg_sample_size_per_centroid = ifdef ($samp, 50); # $samp=50; - - -print ("BEGIN K-MEANS SCRIPT"); -print ("Reading X..."); - -# X : matrix of data points as rows -X = read (fileX); -num_records = nrow (X); -num_features = ncol (X); - -sumXsq = sum (X ^ 2); -# Remark - A useful rewrite: sum (A %*% B) = sum (t(colSums(A)) * rowSums(B)) - -# STEP 1: INITIALIZE CENTROIDS FOR ALL RUNS FROM DATA SAMPLES: - -print ("Taking data samples for initialization..."); - -[sample_maps, samples_vs_runs_map, sample_block_size] = - get_sample_maps (num_records, num_runs, num_centroids * avg_sample_size_per_centroid); - 
-is_row_in_samples = rowSums (sample_maps); -X_samples = sample_maps %*% X; -X_samples_sq_norms = rowSums (X_samples ^ 2); - -print ("Initializing the centroids for all runs..."); -All_Centroids = matrix (0, rows = (num_runs * num_centroids), cols = num_features); - -# We select centroids according to the k-Means++ heuristic applied to a sample of X -# Loop invariant: min_distances ~ sq.distances from X_sample rows to nearest centroids, -# with the out-of-range X_sample positions in min_distances set to 0.0 - -min_distances = is_row_in_samples; # Pick the 1-st centroids uniformly at random - -for (i in 1 : num_centroids) -{ - # "Matricize" and prefix-sum to compute the cumulative distribution function: - min_distances_matrix_form = - matrix (min_distances, rows = sample_block_size, cols = num_runs, byrow = FALSE); - cdf_min_distances = cumsum (min_distances_matrix_form); - - # Select the i-th centroid in each sample as a random sample row id with - # probability ~ min_distances: - random_row = Rand (rows = 1, cols = num_runs, min = 0.0, max = 1.0); - threshold_matrix = random_row * cdf_min_distances [sample_block_size, ]; - centroid_ids = t(colSums (ppred (cdf_min_distances, threshold_matrix, "<"))) + 1; - - # Place the selected centroids together, one per run, into a matrix: - centroid_placer = matrix (0, rows = num_runs, cols = (sample_block_size * num_runs)); - centroid_placer_raw = - table (seq (1, num_runs, 1), sample_block_size * seq (0, num_runs - 1, 1) + centroid_ids); - centroid_placer [, 1 : ncol (centroid_placer_raw)] = centroid_placer_raw; - centroids = centroid_placer %*% X_samples; - - # Place the selected centroids into their appropriate slots in All_Centroids: - centroid_placer = matrix (0, rows = nrow (All_Centroids), cols = num_runs); - centroid_placer_raw = - table (seq (i, num_centroids * (num_runs - 1) + i, num_centroids), seq (1, num_runs, 1)); - centroid_placer [1 : nrow (centroid_placer_raw), ] = centroid_placer_raw; - All_Centroids = All_Centroids + centroid_placer %*% centroids; - - # Update min_distances to preserve the loop invariant: - distances = X_samples_sq_norms + samples_vs_runs_map %*% rowSums (centroids ^ 2) - - 2 * rowSums (X_samples * (samples_vs_runs_map %*% centroids)); - if (i == 1) { - min_distances = is_row_in_samples * distances; - } else { - min_distances = min (min_distances, distances); -} } - -# STEP 2: PERFORM K-MEANS ITERATIONS FOR ALL RUNS: - -termination_code = matrix (0, rows = num_runs, cols = 1); -final_wcss = matrix (0, rows = num_runs, cols = 1); -num_iterations = matrix (0, rows = num_runs, cols = 1); - -print ("Performing k-means iterations for all runs..."); - -parfor (run_index in 1 : num_runs, check = 0) -{ - C = All_Centroids [(num_centroids * (run_index - 1) + 1) : (num_centroids * run_index), ]; - C_old = C; - iter_count = 0; - term_code = 0; - wcss = 0; - - while (term_code == 0) - { - # Compute Euclidean squared distances from records (X rows) to centroids (C rows) - # without the C-independent term, then take the minimum for each record - D = -2 * (X %*% t(C)) + t(rowSums (C ^ 2)); - minD = rowMins (D); - # Compute the current centroid-based within-cluster sum of squares (WCSS) - wcss_old = wcss; - wcss = sumXsq + sum (minD); - if (is_verbose == 1) { - if (iter_count == 0) { - print ("Run " + run_index + ", At Start-Up: Centroid WCSS = " + wcss); - } else { - print ("Run " + run_index + ", Iteration " + iter_count + ": Centroid WCSS = " + wcss - + "; Centroid change (avg.sq.dist.) 
= " + (sum ((C - C_old) ^ 2) / num_centroids)); - } } - # Check if convergence or maximum iteration has been reached - if (wcss_old - wcss < eps * wcss & iter_count > 0) { - term_code = 1; # Convergence is reached - } else { - if (iter_count >= max_iter) { - term_code = 2; # Maximum iteration is reached - } else { - iter_count = iter_count + 1; - # Find the closest centroid for each record - P = ppred (D, minD, "<="); - # If some records belong to multiple centroids, share them equally - P = P / rowSums (P); - # Compute the column normalization factor for P - P_denom = colSums (P); - if (sum (ppred (P_denom, 0.0, "<=")) > 0) { - term_code = 3; # There is a "runaway" centroid with 0.0 denominator - } else { - C_old = C; - # Compute new centroids as weighted averages over the records - C = (t(P) %*% X) / t(P_denom); - } } } } - print ("Run " + run_index + ", Iteration " + iter_count + ": Terminated with code = " + term_code + ", Centroid WCSS = " + wcss); - All_Centroids [(num_centroids * (run_index - 1) + 1) : (num_centroids * run_index), ] = C; - final_wcss [run_index, 1] = wcss; - termination_code [run_index, 1] = term_code; - num_iterations [run_index, 1] = iter_count; -} - -# STEP 3: SELECT THE RUN WITH BEST CENTROID-WCSS AND OUTPUT ITS CENTROIDS: - -termination_bitmap = matrix (0, rows = num_runs, cols = 3); -termination_bitmap_raw = table (seq (1, num_runs, 1), termination_code); -termination_bitmap [, 1 : ncol(termination_bitmap_raw)] = termination_bitmap_raw; -termination_stats = colSums (termination_bitmap); -print ("Number of successful runs = " + as.integer (castAsScalar (termination_stats [1, 1]))); -print ("Number of incomplete runs = " + as.integer (castAsScalar (termination_stats [1, 2]))); -print ("Number of failed runs (with lost centroids) = " + as.integer (castAsScalar (termination_stats [1, 3]))); - -num_successful_runs = castAsScalar (termination_stats [1, 1]); -if (num_successful_runs > 0) { - final_wcss_successful = final_wcss * termination_bitmap [, 1]; - worst_wcss = max (final_wcss_successful); - best_wcss = min (final_wcss_successful + (10 * worst_wcss + 10) * (1 - termination_bitmap [, 1])); - avg_wcss = sum (final_wcss_successful) / num_successful_runs; - best_index_vector = ppred (final_wcss_successful, best_wcss, "=="); - aggr_best_index_vector = cumsum (best_index_vector); - best_index = as.integer (sum (ppred (aggr_best_index_vector, 0, "==")) + 1); - print ("Successful runs: Best run is " + best_index + " with Centroid WCSS = " + best_wcss - + "; Avg WCSS = " + avg_wcss + "; Worst WCSS = " + worst_wcss); - C = All_Centroids [(num_centroids * (best_index - 1) + 1) : (num_centroids * best_index), ]; - print ("Writing out the best-WCSS centroids..."); - write (C, fileC, format=fmtCY); - if (is_write_Y == 1) { - print ("Writing out the best-WCSS cluster labels..."); - D = -2 * (X %*% t(C)) + t(rowSums (C ^ 2)); - P = ppred (D, rowMins (D), "<="); - aggr_P = t(cumsum (t(P))); - Y = rowSums (ppred (aggr_P, 0, "==")) + 1 - write (Y, fileY, format=fmtCY); - } - print ("DONE."); -} else { - stop ("No output is produced. 
Try increasing the number of iterations and/or runs."); -} - - - -get_sample_maps = function (int num_records, int num_samples, int approx_sample_size) - return (Matrix[double] sample_maps, Matrix[double] sample_col_map, int sample_block_size) -{ - if (approx_sample_size < num_records) { - # Input value "approx_sample_size" is the average sample size; increase it by ~10 std.dev's - # to get the sample block size (to allocate space): - sample_block_size = as.integer (approx_sample_size + round (10 * sqrt (approx_sample_size))); - num_rows = sample_block_size * num_samples; - - # Generate all samples in parallel by converting uniform random values into random - # integer skip-ahead intervals and prefix-summing them: - sample_rec_ids = Rand (rows = sample_block_size, cols = num_samples, min = 0.0, max = 1.0); - sample_rec_ids = round (log (sample_rec_ids) / log (1.0 - approx_sample_size / num_records) + 0.5); - # Prob [k-1 < log(uniform)/log(1-p) < k] = p*(1-p)^(k-1) = Prob [k-1 zeros before a one] - sample_rec_ids = cumsum (sample_rec_ids); # (skip to next one) --> (skip to i-th one) - - # Replace all sample record ids over "num_records" (i.e. out of range) by "num_records + 1": - is_sample_rec_id_within_range = ppred (sample_rec_ids, num_records, "<="); - sample_rec_ids = sample_rec_ids * is_sample_rec_id_within_range - + (num_records + 1) * (1 - is_sample_rec_id_within_range); - - # Rearrange all samples (and their out-of-range indicators) into one column-vector: - sample_rec_ids = - matrix (sample_rec_ids, rows = num_rows, cols = 1, byrow = FALSE); - is_row_in_samples = - matrix (is_sample_rec_id_within_range, rows = num_rows, cols = 1, byrow = FALSE); - - # Use contingency table to create the "sample_maps" matrix that is a vertical concatenation - # of 0-1-matrices, one per sample, each with 1s at (i, sample_record[i]) and 0s elsewhere: - sample_maps_raw = table (seq (1, num_rows), sample_rec_ids); - max_rec_id = ncol (sample_maps_raw); - if (max_rec_id >= num_records) { - sample_maps = sample_maps_raw [, 1 : num_records]; - } else { - sample_maps = matrix (0, rows = num_rows, cols = num_records); - sample_maps [, 1 : max_rec_id] = sample_maps_raw; - } - - # Create a 0-1-matrix that maps each sample column ID into all row positions of the - # corresponding sample; map out-of-sample-range positions to row id = num_rows + 1: - sample_positions = (num_rows + 1) - is_row_in_samples * seq (num_rows, 1, -1); - # Column ID positions = 1, 1, ..., 1, 2, 2, ..., 2, . . . , n_c, n_c, ..., n_c: - col_positions = round (0.5 + seq (0, num_rows - 1, 1) / sample_block_size); - sample_col_map = table (sample_positions, col_positions); - # Remove the out-of-sample-range positions by cutting off the last row: - sample_col_map = sample_col_map [1 : (num_rows), ]; - - } else { - one_per_record = matrix (1, rows = num_records, cols = 1); - sample_block_size = num_records; - sample_maps = matrix (0, rows = (num_records * num_samples), cols = num_records); - sample_col_map = matrix (0, rows = (num_records * num_samples), cols = num_samples); - for (i in 1:num_samples) { - sample_maps [(num_records * (i - 1) + 1) : (num_records * i), ] = diag (one_per_record); - sample_col_map [(num_records * (i - 1) + 1) : (num_records * i), i] = one_per_record; -} } } - +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# +# Implements the k-Means clustering algorithm +# +# INPUT PARAMETERS: +# ---------------------------------------------------------------------------- +# NAME TYPE DEFAULT MEANING +# ---------------------------------------------------------------------------- +# X String --- Location to read matrix X with the input data records +# k Int --- Number of centroids +# runs Int 10 Number of runs (with different initial centroids) +# maxi Int 1000 Maximum number of iterations per run +# tol Double 0.000001 Tolerance (epsilon) for WCSS change ratio +# samp Int 50 Average number of records per centroid in data samples +# C String "C.mtx" Location to store the output matrix with the centroids +# isY Int 0 0 = do not write Y, 1 = write Y +# Y String "Y.mtx" Location to store the mapping of records to centroids +# fmt String "text" Matrix output format, usually "text" or "csv" +# verb Int 0 0 = do not print per-iteration stats, 1 = print them +# ---------------------------------------------------------------------------- +# +# Example: +# hadoop jar SystemML.jar -f Kmeans.dml -nvargs X=X.mtx k=5 C=centroids.mtx +# hadoop jar SystemML.jar -f Kmeans.dml -nvargs X=X.mtx k=5 runs=100 maxi=5000 tol=0.00000001 samp=20 C=centroids.mtx isY=1 Y=clusters.mtx verb=1 + +fileX = $X; +fileY = ifdef ($Y, "Y.mtx"); +fileC = ifdef ($C, "C.mtx"); + +num_centroids = $k; +num_runs = ifdef ($runs, 10); # $runs=10; +max_iter = ifdef ($maxi, 1000); # $maxi=1000; +eps = ifdef ($tol, 0.000001); # $tol=0.000001; +is_write_Y = ifdef ($isY, 0); # $isY=0; +is_verbose = ifdef ($verb, 0); # $verb=0; +fmtCY = ifdef ($fmt, "text"); # $fmt="text"; +avg_sample_size_per_centroid = ifdef ($samp, 50); # $samp=50; + + +print ("BEGIN K-MEANS SCRIPT"); +print ("Reading X..."); + +# X : matrix of data points as rows +X = read (fileX); +num_records = nrow (X); +num_features = ncol (X); + +sumXsq = sum (X ^ 2); +# Remark - A useful rewrite: sum (A %*% B) = sum (t(colSums(A)) * rowSums(B)) + +# STEP 1: INITIALIZE CENTROIDS FOR ALL RUNS FROM DATA SAMPLES: + +print ("Taking data samples for initialization..."); + +[sample_maps, samples_vs_runs_map, sample_block_size] = + get_sample_maps (num_records, num_runs, num_centroids * avg_sample_size_per_centroid); + +is_row_in_samples = rowSums (sample_maps); +X_samples = sample_maps %*% X; +X_samples_sq_norms = rowSums (X_samples ^ 2); + +print ("Initializing the centroids for all runs..."); +All_Centroids = matrix (0, rows = (num_runs * num_centroids), cols = num_features); + +# We select centroids according to the k-Means++ heuristic applied to a sample of X +# Loop invariant: min_distances ~ sq.distances from X_sample rows to nearest centroids, +# with the out-of-range X_sample positions in min_distances set to 0.0 + +min_distances = is_row_in_samples; # 
Pick the 1-st centroids uniformly at random + +for (i in 1 : num_centroids) +{ + # "Matricize" and prefix-sum to compute the cumulative distribution function: + min_distances_matrix_form = + matrix (min_distances, rows = sample_block_size, cols = num_runs, byrow = FALSE); + cdf_min_distances = cumsum (min_distances_matrix_form); + + # Select the i-th centroid in each sample as a random sample row id with + # probability ~ min_distances: + random_row = Rand (rows = 1, cols = num_runs, min = 0.0, max = 1.0); + threshold_matrix = random_row * cdf_min_distances [sample_block_size, ]; + centroid_ids = t(colSums (ppred (cdf_min_distances, threshold_matrix, "<"))) + 1; + + # Place the selected centroids together, one per run, into a matrix: + centroid_placer = matrix (0, rows = num_runs, cols = (sample_block_size * num_runs)); + centroid_placer_raw = + table (seq (1, num_runs, 1), sample_block_size * seq (0, num_runs - 1, 1) + centroid_ids); + centroid_placer [, 1 : ncol (centroid_placer_raw)] = centroid_placer_raw; + centroids = centroid_placer %*% X_samples; + + # Place the selected centroids into their appropriate slots in All_Centroids: + centroid_placer = matrix (0, rows = nrow (All_Centroids), cols = num_runs); + centroid_placer_raw = + table (seq (i, num_centroids * (num_runs - 1) + i, num_centroids), seq (1, num_runs, 1)); + centroid_placer [1 : nrow (centroid_placer_raw), ] = centroid_placer_raw; + All_Centroids = All_Centroids + centroid_placer %*% centroids; + + # Update min_distances to preserve the loop invariant: + distances = X_samples_sq_norms + samples_vs_runs_map %*% rowSums (centroids ^ 2) + - 2 * rowSums (X_samples * (samples_vs_runs_map %*% centroids)); + if (i == 1) { + min_distances = is_row_in_samples * distances; + } else { + min_distances = min (min_distances, distances); +} } + +# STEP 2: PERFORM K-MEANS ITERATIONS FOR ALL RUNS: + +termination_code = matrix (0, rows = num_runs, cols = 1); +final_wcss = matrix (0, rows = num_runs, cols = 1); +num_iterations = matrix (0, rows = num_runs, cols = 1); + +print ("Performing k-means iterations for all runs..."); + +parfor (run_index in 1 : num_runs, check = 0) +{ + C = All_Centroids [(num_centroids * (run_index - 1) + 1) : (num_centroids * run_index), ]; + C_old = C; + iter_count = 0; + term_code = 0; + wcss = 0; + + while (term_code == 0) + { + # Compute Euclidean squared distances from records (X rows) to centroids (C rows) + # without the C-independent term, then take the minimum for each record + D = -2 * (X %*% t(C)) + t(rowSums (C ^ 2)); + minD = rowMins (D); + # Compute the current centroid-based within-cluster sum of squares (WCSS) + wcss_old = wcss; + wcss = sumXsq + sum (minD); + if (is_verbose == 1) { + if (iter_count == 0) { + print ("Run " + run_index + ", At Start-Up: Centroid WCSS = " + wcss); + } else { + print ("Run " + run_index + ", Iteration " + iter_count + ": Centroid WCSS = " + wcss + + "; Centroid change (avg.sq.dist.) 
= " + (sum ((C - C_old) ^ 2) / num_centroids)); + } } + # Check if convergence or maximum iteration has been reached + if (wcss_old - wcss < eps * wcss & iter_count > 0) { + term_code = 1; # Convergence is reached + } else { + if (iter_count >= max_iter) { + term_code = 2; # Maximum iteration is reached + } else { + iter_count = iter_count + 1; + # Find the closest centroid for each record + P = ppred (D, minD, "<="); + # If some records belong to multiple centroids, share them equally + P = P / rowSums (P); + # Compute the column normalization factor for P + P_denom = colSums (P); + if (sum (ppred (P_denom, 0.0, "<=")) > 0) { + term_code = 3; # There is a "runaway" centroid with 0.0 denominator + } else { + C_old = C; + # Compute new centroids as weighted averages over the records + C = (t(P) %*% X) / t(P_denom); + } } } } + print ("Run " + run_index + ", Iteration " + iter_count + ": Terminated with code = " + term_code + ", Centroid WCSS = " + wcss); + All_Centroids [(num_centroids * (run_index - 1) + 1) : (num_centroids * run_index), ] = C; + final_wcss [run_index, 1] = wcss; + termination_code [run_index, 1] = term_code; + num_iterations [run_index, 1] = iter_count; +} + +# STEP 3: SELECT THE RUN WITH BEST CENTROID-WCSS AND OUTPUT ITS CENTROIDS: + +termination_bitmap = matrix (0, rows = num_runs, cols = 3); +termination_bitmap_raw = table (seq (1, num_runs, 1), termination_code); +termination_bitmap [, 1 : ncol(termination_bitmap_raw)] = termination_bitmap_raw; +termination_stats = colSums (termination_bitmap); +print ("Number of successful runs = " + as.integer (castAsScalar (termination_stats [1, 1]))); +print ("Number of incomplete runs = " + as.integer (castAsScalar (termination_stats [1, 2]))); +print ("Number of failed runs (with lost centroids) = " + as.integer (castAsScalar (termination_stats [1, 3]))); + +num_successful_runs = castAsScalar (termination_stats [1, 1]); +if (num_successful_runs > 0) { + final_wcss_successful = final_wcss * termination_bitmap [, 1]; + worst_wcss = max (final_wcss_successful); + best_wcss = min (final_wcss_successful + (10 * worst_wcss + 10) * (1 - termination_bitmap [, 1])); + avg_wcss = sum (final_wcss_successful) / num_successful_runs; + best_index_vector = ppred (final_wcss_successful, best_wcss, "=="); + aggr_best_index_vector = cumsum (best_index_vector); + best_index = as.integer (sum (ppred (aggr_best_index_vector, 0, "==")) + 1); + print ("Successful runs: Best run is " + best_index + " with Centroid WCSS = " + best_wcss + + "; Avg WCSS = " + avg_wcss + "; Worst WCSS = " + worst_wcss); + C = All_Centroids [(num_centroids * (best_index - 1) + 1) : (num_centroids * best_index), ]; + print ("Writing out the best-WCSS centroids..."); + write (C, fileC, format=fmtCY); + if (is_write_Y == 1) { + print ("Writing out the best-WCSS cluster labels..."); + D = -2 * (X %*% t(C)) + t(rowSums (C ^ 2)); + P = ppred (D, rowMins (D), "<="); + aggr_P = t(cumsum (t(P))); + Y = rowSums (ppred (aggr_P, 0, "==")) + 1 + write (Y, fileY, format=fmtCY); + } + print ("DONE."); +} else { + stop ("No output is produced. 
Try increasing the number of iterations and/or runs."); +} + + + +get_sample_maps = function (int num_records, int num_samples, int approx_sample_size) + return (Matrix[double] sample_maps, Matrix[double] sample_col_map, int sample_block_size) +{ + if (approx_sample_size < num_records) { + # Input value "approx_sample_size" is the average sample size; increase it by ~10 std.dev's + # to get the sample block size (to allocate space): + sample_block_size = as.integer (approx_sample_size + round (10 * sqrt (approx_sample_size))); + num_rows = sample_block_size * num_samples; + + # Generate all samples in parallel by converting uniform random values into random + # integer skip-ahead intervals and prefix-summing them: + sample_rec_ids = Rand (rows = sample_block_size, cols = num_samples, min = 0.0, max = 1.0); + sample_rec_ids = round (log (sample_rec_ids) / log (1.0 - approx_sample_size / num_records) + 0.5); + # Prob [k-1 < log(uniform)/log(1-p) < k] = p*(1-p)^(k-1) = Prob [k-1 zeros before a one] + sample_rec_ids = cumsum (sample_rec_ids); # (skip to next one) --> (skip to i-th one) + + # Replace all sample record ids over "num_records" (i.e. out of range) by "num_records + 1": + is_sample_rec_id_within_range = ppred (sample_rec_ids, num_records, "<="); + sample_rec_ids = sample_rec_ids * is_sample_rec_id_within_range + + (num_records + 1) * (1 - is_sample_rec_id_within_range); + + # Rearrange all samples (and their out-of-range indicators) into one column-vector: + sample_rec_ids = + matrix (sample_rec_ids, rows = num_rows, cols = 1, byrow = FALSE); + is_row_in_samples = + matrix (is_sample_rec_id_within_range, rows = num_rows, cols = 1, byrow = FALSE); + + # Use contingency table to create the "sample_maps" matrix that is a vertical concatenation + # of 0-1-matrices, one per sample, each with 1s at (i, sample_record[i]) and 0s elsewhere: + sample_maps_raw = table (seq (1, num_rows), sample_rec_ids); + max_rec_id = ncol (sample_maps_raw); + if (max_rec_id >= num_records) { + sample_maps = sample_maps_raw [, 1 : num_records]; + } else { + sample_maps = matrix (0, rows = num_rows, cols = num_records); + sample_maps [, 1 : max_rec_id] = sample_maps_raw; + } + + # Create a 0-1-matrix that maps each sample column ID into all row positions of the + # corresponding sample; map out-of-sample-range positions to row id = num_rows + 1: + sample_positions = (num_rows + 1) - is_row_in_samples * seq (num_rows, 1, -1); + # Column ID positions = 1, 1, ..., 1, 2, 2, ..., 2, . . . , n_c, n_c, ..., n_c: + col_positions = round (0.5 + seq (0, num_rows - 1, 1) / sample_block_size); + sample_col_map = table (sample_positions, col_positions); + # Remove the out-of-sample-range positions by cutting off the last row: + sample_col_map = sample_col_map [1 : (num_rows), ]; + + } else { + one_per_record = matrix (1, rows = num_records, cols = 1); + sample_block_size = num_records; + sample_maps = matrix (0, rows = (num_records * num_samples), cols = num_records); + sample_col_map = matrix (0, rows = (num_records * num_samples), cols = num_samples); + for (i in 1:num_samples) { + sample_maps [(num_records * (i - 1) + 1) : (num_records * i), ] = diag (one_per_record); + sample_col_map [(num_records * (i - 1) + 1) : (num_records * i), i] = one_per_record; +} } } +
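For readers tracing the algorithm outside SystemML, below is a minimal NumPy sketch (mine, not part of this commit) of one k-means run as the DML above implements it: the distance matrix D drops the centroid-independent sum(X^2) term, records tied between several nearest centroids are shared equally, and a run aborts with a "runaway" code when a centroid loses all of its records. The helper names (kmeanspp_init, kmeans_run) are illustrative, and the sketch runs k-means++ on the full data, whereas the script applies it to per-run data samples.

import numpy as np

def kmeanspp_init(X, k, rng):
    # k-means++ seeding; the DML applies the same idea to per-run samples of X
    C = X[rng.integers(len(X))][None, :]
    for _ in range(1, k):
        dist = ((X[:, None, :] - C[None, :, :]) ** 2).sum(-1).min(axis=1)
        C = np.vstack([C, X[rng.choice(len(X), p=dist / dist.sum())]])
    return C

def kmeans_run(X, C, max_iter=1000, tol=1e-6):
    # One run of the Lloyd iteration from the script; returns (C, wcss, term_code)
    # term_code: 1 = converged, 2 = max_iter reached, 3 = "runaway" centroid
    sum_x_sq = np.sum(X ** 2)                         # C-independent part of the WCSS
    wcss, it = 0.0, 0
    while True:
        D = -2.0 * X @ C.T + np.sum(C ** 2, axis=1)   # -2 * X %*% t(C) + t(rowSums(C^2))
        min_d = D.min(axis=1)
        wcss_old, wcss = wcss, sum_x_sq + min_d.sum()
        if it > 0 and wcss_old - wcss < tol * wcss:
            return C, wcss, 1                         # convergence is reached
        if it >= max_iter:
            return C, wcss, 2                         # maximum iteration is reached
        it += 1
        P = (D <= min_d[:, None]).astype(float)       # closest centroid(s) per record
        P /= P.sum(axis=1, keepdims=True)             # share tied records equally
        p_denom = P.sum(axis=0)
        if np.any(p_denom <= 0.0):
            return C, wcss, 3                         # a centroid lost all its records
        C = (P.T @ X) / p_denom[:, None]              # weighted-average update

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 2))
C, wcss, code = kmeans_run(X, kmeanspp_init(X, 3, rng))
print("term_code =", code, "WCSS =", round(wcss, 3))

Dropping sum(X^2) from D is safe because that term does not depend on C, so the per-record minima are unaffected; the constant is added back only when reporting the WCSS, exactly as in the DML.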
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/scripts/algorithms/LinearRegCG.dml ---------------------------------------------------------------------- diff --git a/scripts/algorithms/LinearRegCG.dml b/scripts/algorithms/LinearRegCG.dml index a485d83..ebfa5a4 100644 --- a/scripts/algorithms/LinearRegCG.dml +++ b/scripts/algorithms/LinearRegCG.dml @@ -1,286 +1,286 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -# -# THIS SCRIPT SOLVES LINEAR REGRESSION USING THE CONJUGATE GRADIENT ALGORITHM -# -# INPUT PARAMETERS: -# -------------------------------------------------------------------------------------------- -# NAME TYPE DEFAULT MEANING -# -------------------------------------------------------------------------------------------- -# X String --- Location (on HDFS) to read the matrix X of feature vectors -# Y String --- Location (on HDFS) to read the 1-column matrix Y of response values -# B String --- Location to store estimated regression parameters (the betas) -# O String " " Location to write the printed statistics; by default it is standard output -# Log String " " Location to write per-iteration variables for log/debugging purposes -# icpt Int 0 Intercept presence, shifting and rescaling the columns of X: -# 0 = no intercept, no shifting, no rescaling; -# 1 = add intercept, but neither shift nor rescale X; -# 2 = add intercept, shift & rescale X columns to mean = 0, variance = 1 -# reg Double 0.000001 Regularization constant (lambda) for L2-regularization; set to nonzero -# for highly dependent/sparse/numerous features -# tol Double 0.000001 Tolerance (epsilon); conjugate gradient procedure terminates early if -# L2 norm of the beta-residual is less than tolerance * its initial norm -# maxi Int 0 Maximum number of conjugate gradient iterations, 0 = no maximum -# fmt String "text" Matrix output format for B (the betas) only, usually "text" or "csv" -# -------------------------------------------------------------------------------------------- -# OUTPUT: Matrix of regression parameters (the betas); its size depends on the icpt input value: -# OUTPUT SIZE: OUTPUT CONTENTS: HOW TO PREDICT Y FROM X AND B: -# icpt=0: ncol(X) x 1 Betas for X only Y ~ X %*% B[1:ncol(X), 1], or just X %*% B -# icpt=1: ncol(X)+1 x 1 Betas for X and intercept Y ~ X %*% B[1:ncol(X), 1] + B[ncol(X)+1, 1] -# icpt=2: ncol(X)+1 x 2 Col.1: betas for X & intercept Y ~ X %*% B[1:ncol(X), 1] + B[ncol(X)+1, 1] -# Col.2: betas for shifted/rescaled X and intercept -# -# In addition, some regression statistics are provided in CSV format, one comma-separated -# name-value pair per line, as follows: -# -# NAME MEANING -# 
------------------------------------------------------------------------------------- -# AVG_TOT_Y Average of the response value Y -# STDEV_TOT_Y Standard Deviation of the response value Y -# AVG_RES_Y Average of the residual Y - pred(Y|X), i.e. residual bias -# STDEV_RES_Y Standard Deviation of the residual Y - pred(Y|X) -# DISPERSION GLM-style dispersion, i.e. residual sum of squares / # deg. fr. -# PLAIN_R2 Plain R^2 of residual with bias included vs. total average -# ADJUSTED_R2 Adjusted R^2 of residual with bias included vs. total average -# PLAIN_R2_NOBIAS Plain R^2 of residual with bias subtracted vs. total average -# ADJUSTED_R2_NOBIAS Adjusted R^2 of residual with bias subtracted vs. total average -# PLAIN_R2_VS_0 * Plain R^2 of residual with bias included vs. zero constant -# ADJUSTED_R2_VS_0 * Adjusted R^2 of residual with bias included vs. zero constant -# ------------------------------------------------------------------------------------- -# * The last two statistics are only printed if there is no intercept (icpt=0) -# -# The Log file, when requested, contains the following per-iteration variables in CSV -# format, each line containing triple (NAME, ITERATION, VALUE) with ITERATION = 0 for -# initial values: -# -# NAME MEANING -# ------------------------------------------------------------------------------------- -# CG_RESIDUAL_NORM L2-norm of Conj.Grad.residual, which is A %*% beta - t(X) %*% y -# where A = t(X) %*% X + diag (lambda), or a similar quantity -# CG_RESIDUAL_RATIO Ratio of current L2-norm of Conj.Grad.residual over the initial -# ------------------------------------------------------------------------------------- -# -# HOW TO INVOKE THIS SCRIPT - EXAMPLE: -# hadoop jar SystemML.jar -f LinearRegCG.dml -nvargs X=INPUT_DIR/X Y=INPUT_DIR/Y B=OUTPUT_DIR/B -# O=OUTPUT_DIR/Out icpt=2 reg=1.0 tol=0.001 maxi=100 fmt=csv Log=OUTPUT_DIR/log - -fileX = $X; -fileY = $Y; -fileB = $B; -fileO = ifdef ($O, " "); -fileLog = ifdef ($Log, " "); -fmtB = ifdef ($fmt, "text"); - -intercept_status = ifdef ($icpt, 0); # $icpt=0; -tolerance = ifdef ($tol, 0.000001); # $tol=0.000001; -max_iteration = ifdef ($maxi, 0); # $maxi=0; -regularization = ifdef ($reg, 0.000001); # $reg=0.000001; - -print ("BEGIN LINEAR REGRESSION SCRIPT"); -print ("Reading X and Y..."); -X = read (fileX); -y = read (fileY); - -n = nrow (X); -m = ncol (X); -ones_n = matrix (1, rows = n, cols = 1); -zero_cell = matrix (0, rows = 1, cols = 1); - -# Introduce the intercept, shift and rescale the columns of X if needed - -m_ext = m; -if (intercept_status == 1 | intercept_status == 2) # add the intercept column -{ - X = append (X, ones_n); - m_ext = ncol (X); -} - -scale_lambda = matrix (1, rows = m_ext, cols = 1); -if (intercept_status == 1 | intercept_status == 2) -{ - scale_lambda [m_ext, 1] = 0; -} - -if (intercept_status == 2) # scale-&-shift X columns to mean 0, variance 1 -{ # Important assumption: X [, m_ext] = ones_n - avg_X_cols = t(colSums(X)) / n; - var_X_cols = (t(colSums (X ^ 2)) - n * (avg_X_cols ^ 2)) / (n - 1); - is_unsafe = ppred (var_X_cols, 0.0, "<="); - scale_X = 1.0 / sqrt (var_X_cols * (1 - is_unsafe) + is_unsafe); - scale_X [m_ext, 1] = 1; - shift_X = - avg_X_cols * scale_X; - shift_X [m_ext, 1] = 0; -} else { - scale_X = matrix (1, rows = m_ext, cols = 1); - shift_X = matrix (0, rows = m_ext, cols = 1); -} - -# Henceforth, if intercept_status == 2, we use "X %*% (SHIFT/SCALE TRANSFORM)" -# instead of "X". 
However, in order to preserve the sparsity of X, -# we apply the transform associatively to some other part of the expression -# in which it occurs. To avoid materializing a large matrix, we rewrite it: -# -# ssX_A = (SHIFT/SCALE TRANSFORM) %*% A --- is rewritten as: -# ssX_A = diag (scale_X) %*% A; -# ssX_A [m_ext, ] = ssX_A [m_ext, ] + t(shift_X) %*% A; -# -# tssX_A = t(SHIFT/SCALE TRANSFORM) %*% A --- is rewritten as: -# tssX_A = diag (scale_X) %*% A + shift_X %*% A [m_ext, ]; - -lambda = scale_lambda * regularization; -beta_unscaled = matrix (0, rows = m_ext, cols = 1); - -if (max_iteration == 0) { - max_iteration = m_ext; -} -i = 0; - -# BEGIN THE CONJUGATE GRADIENT ALGORITHM -print ("Running the CG algorithm..."); - -r = - t(X) %*% y; - -if (intercept_status == 2) { - r = scale_X * r + shift_X %*% r [m_ext, ]; -} - -p = - r; -norm_r2 = sum (r ^ 2); -norm_r2_initial = norm_r2; -norm_r2_target = norm_r2_initial * tolerance ^ 2; -print ("||r|| initial value = " + sqrt (norm_r2_initial) + ", target value = " + sqrt (norm_r2_target)); -log_str = "CG_RESIDUAL_NORM,0," + sqrt (norm_r2_initial); -log_str = append (log_str, "CG_RESIDUAL_RATIO,0,1.0"); - -while (i < max_iteration & norm_r2 > norm_r2_target) -{ - if (intercept_status == 2) { - ssX_p = scale_X * p; - ssX_p [m_ext, ] = ssX_p [m_ext, ] + t(shift_X) %*% p; - } else { - ssX_p = p; - } - - q = t(X) %*% (X %*% ssX_p); - - if (intercept_status == 2) { - q = scale_X * q + shift_X %*% q [m_ext, ]; - } - - q = q + lambda * p; - a = norm_r2 / sum (p * q); - beta_unscaled = beta_unscaled + a * p; - r = r + a * q; - old_norm_r2 = norm_r2; - norm_r2 = sum (r ^ 2); - p = -r + (norm_r2 / old_norm_r2) * p; - i = i + 1; - print ("Iteration " + i + ": ||r|| / ||r init|| = " + sqrt (norm_r2 / norm_r2_initial)); - log_str = append (log_str, "CG_RESIDUAL_NORM," + i + "," + sqrt (norm_r2)); - log_str = append (log_str, "CG_RESIDUAL_RATIO," + i + "," + sqrt (norm_r2 / norm_r2_initial)); -} - -if (i >= max_iteration) { - print ("Warning: the maximum number of iterations has been reached."); -} -print ("The CG algorithm is done."); -# END THE CONJUGATE GRADIENT ALGORITHM - -if (intercept_status == 2) { - beta = scale_X * beta_unscaled; - beta [m_ext, ] = beta [m_ext, ] + t(shift_X) %*% beta_unscaled; -} else { - beta = beta_unscaled; -} - -print ("Computing the statistics..."); - -avg_tot = sum (y) / n; -ss_tot = sum (y ^ 2); -ss_avg_tot = ss_tot - n * avg_tot ^ 2; -var_tot = ss_avg_tot / (n - 1); -y_residual = y - X %*% beta; -avg_res = sum (y_residual) / n; -ss_res = sum (y_residual ^ 2); -ss_avg_res = ss_res - n * avg_res ^ 2; - -plain_R2 = 1 - ss_res / ss_avg_tot; -if (n > m_ext) { - dispersion = ss_res / (n - m_ext); - adjusted_R2 = 1 - dispersion / (ss_avg_tot / (n - 1)); -} else { - dispersion = 0.0 / 0.0; - adjusted_R2 = 0.0 / 0.0; -} - -plain_R2_nobias = 1 - ss_avg_res / ss_avg_tot; -deg_freedom = n - m - 1; -if (deg_freedom > 0) { - var_res = ss_avg_res / deg_freedom; - adjusted_R2_nobias = 1 - var_res / (ss_avg_tot / (n - 1)); -} else { - var_res = 0.0 / 0.0; - adjusted_R2_nobias = 0.0 / 0.0; - print ("Warning: zero or negative number of degrees of freedom."); -} - -plain_R2_vs_0 = 1 - ss_res / ss_tot; -if (n > m) { - adjusted_R2_vs_0 = 1 - (ss_res / (n - m)) / (ss_tot / n); -} else { - adjusted_R2_vs_0 = 0.0 / 0.0; -} - -str = "AVG_TOT_Y," + avg_tot; # Average of the response value Y -str = append (str, "STDEV_TOT_Y," + sqrt (var_tot)); # Standard Deviation of the response value Y -str = append (str, "AVG_RES_Y," + avg_res); # Average of the 
residual Y - pred(Y|X), i.e. residual bias -str = append (str, "STDEV_RES_Y," + sqrt (var_res)); # Standard Deviation of the residual Y - pred(Y|X) -str = append (str, "DISPERSION," + dispersion); # GLM-style dispersion, i.e. residual sum of squares / # d.f. -str = append (str, "PLAIN_R2," + plain_R2); # Plain R^2 of residual with bias included vs. total average -str = append (str, "ADJUSTED_R2," + adjusted_R2); # Adjusted R^2 of residual with bias included vs. total average -str = append (str, "PLAIN_R2_NOBIAS," + plain_R2_nobias); # Plain R^2 of residual with bias subtracted vs. total average -str = append (str, "ADJUSTED_R2_NOBIAS," + adjusted_R2_nobias); # Adjusted R^2 of residual with bias subtracted vs. total average -if (intercept_status == 0) { - str = append (str, "PLAIN_R2_VS_0," + plain_R2_vs_0); # Plain R^2 of residual with bias included vs. zero constant - str = append (str, "ADJUSTED_R2_VS_0," + adjusted_R2_vs_0); # Adjusted R^2 of residual with bias included vs. zero constant -} - -if (fileO != " ") { - write (str, fileO); -} else { - print (str); -} - -# Prepare the output matrix -print ("Writing the output matrix..."); - -if (intercept_status == 2) { - beta_out = append (beta, beta_unscaled); -} else { - beta_out = beta; -} -write (beta_out, fileB, format=fmtB); - -if (fileLog != " ") { - write (log_str, fileLog); -} -print ("END LINEAR REGRESSION SCRIPT"); +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +#------------------------------------------------------------- + +# +# THIS SCRIPT SOLVES LINEAR REGRESSION USING THE CONJUGATE GRADIENT ALGORITHM +# +# INPUT PARAMETERS: +# -------------------------------------------------------------------------------------------- +# NAME TYPE DEFAULT MEANING +# -------------------------------------------------------------------------------------------- +# X String --- Location (on HDFS) to read the matrix X of feature vectors +# Y String --- Location (on HDFS) to read the 1-column matrix Y of response values +# B String --- Location to store estimated regression parameters (the betas) +# O String " " Location to write the printed statistics; by default it is standard output +# Log String " " Location to write per-iteration variables for log/debugging purposes +# icpt Int 0 Intercept presence, shifting and rescaling the columns of X: +# 0 = no intercept, no shifting, no rescaling; +# 1 = add intercept, but neither shift nor rescale X; +# 2 = add intercept, shift & rescale X columns to mean = 0, variance = 1 +# reg Double 0.000001 Regularization constant (lambda) for L2-regularization; set to nonzero +# for highly dependent/sparse/numerous features +# tol Double 0.000001 Tolerance (epsilon); conjugate gradient procedure terminates early if +# L2 norm of the beta-residual is less than tolerance * its initial norm +# maxi Int 0 Maximum number of conjugate gradient iterations, 0 = no maximum +# fmt String "text" Matrix output format for B (the betas) only, usually "text" or "csv" +# -------------------------------------------------------------------------------------------- +# OUTPUT: Matrix of regression parameters (the betas); its size depends on the icpt input value: +# OUTPUT SIZE: OUTPUT CONTENTS: HOW TO PREDICT Y FROM X AND B: +# icpt=0: ncol(X) x 1 Betas for X only Y ~ X %*% B[1:ncol(X), 1], or just X %*% B +# icpt=1: ncol(X)+1 x 1 Betas for X and intercept Y ~ X %*% B[1:ncol(X), 1] + B[ncol(X)+1, 1] +# icpt=2: ncol(X)+1 x 2 Col.1: betas for X & intercept Y ~ X %*% B[1:ncol(X), 1] + B[ncol(X)+1, 1] +# Col.2: betas for shifted/rescaled X and intercept +# +# In addition, some regression statistics are provided in CSV format, one comma-separated +# name-value pair per line, as follows: +# +# NAME MEANING +# ------------------------------------------------------------------------------------- +# AVG_TOT_Y Average of the response value Y +# STDEV_TOT_Y Standard Deviation of the response value Y +# AVG_RES_Y Average of the residual Y - pred(Y|X), i.e. residual bias +# STDEV_RES_Y Standard Deviation of the residual Y - pred(Y|X) +# DISPERSION GLM-style dispersion, i.e. residual sum of squares / # deg. fr. +# PLAIN_R2 Plain R^2 of residual with bias included vs. total average +# ADJUSTED_R2 Adjusted R^2 of residual with bias included vs. total average +# PLAIN_R2_NOBIAS Plain R^2 of residual with bias subtracted vs. total average +# ADJUSTED_R2_NOBIAS Adjusted R^2 of residual with bias subtracted vs. total average +# PLAIN_R2_VS_0 * Plain R^2 of residual with bias included vs. zero constant +# ADJUSTED_R2_VS_0 * Adjusted R^2 of residual with bias included vs. 
zero constant +# ------------------------------------------------------------------------------------- +# * The last two statistics are only printed if there is no intercept (icpt=0) +# +# The Log file, when requested, contains the following per-iteration variables in CSV +# format, each line containing triple (NAME, ITERATION, VALUE) with ITERATION = 0 for +# initial values: +# +# NAME MEANING +# ------------------------------------------------------------------------------------- +# CG_RESIDUAL_NORM L2-norm of Conj.Grad.residual, which is A %*% beta - t(X) %*% y +# where A = t(X) %*% X + diag (lambda), or a similar quantity +# CG_RESIDUAL_RATIO Ratio of current L2-norm of Conj.Grad.residual over the initial +# ------------------------------------------------------------------------------------- +# +# HOW TO INVOKE THIS SCRIPT - EXAMPLE: +# hadoop jar SystemML.jar -f LinearRegCG.dml -nvargs X=INPUT_DIR/X Y=INPUT_DIR/Y B=OUTPUT_DIR/B +# O=OUTPUT_DIR/Out icpt=2 reg=1.0 tol=0.001 maxi=100 fmt=csv Log=OUTPUT_DIR/log + +fileX = $X; +fileY = $Y; +fileB = $B; +fileO = ifdef ($O, " "); +fileLog = ifdef ($Log, " "); +fmtB = ifdef ($fmt, "text"); + +intercept_status = ifdef ($icpt, 0); # $icpt=0; +tolerance = ifdef ($tol, 0.000001); # $tol=0.000001; +max_iteration = ifdef ($maxi, 0); # $maxi=0; +regularization = ifdef ($reg, 0.000001); # $reg=0.000001; + +print ("BEGIN LINEAR REGRESSION SCRIPT"); +print ("Reading X and Y..."); +X = read (fileX); +y = read (fileY); + +n = nrow (X); +m = ncol (X); +ones_n = matrix (1, rows = n, cols = 1); +zero_cell = matrix (0, rows = 1, cols = 1); + +# Introduce the intercept, shift and rescale the columns of X if needed + +m_ext = m; +if (intercept_status == 1 | intercept_status == 2) # add the intercept column +{ + X = append (X, ones_n); + m_ext = ncol (X); +} + +scale_lambda = matrix (1, rows = m_ext, cols = 1); +if (intercept_status == 1 | intercept_status == 2) +{ + scale_lambda [m_ext, 1] = 0; +} + +if (intercept_status == 2) # scale-&-shift X columns to mean 0, variance 1 +{ # Important assumption: X [, m_ext] = ones_n + avg_X_cols = t(colSums(X)) / n; + var_X_cols = (t(colSums (X ^ 2)) - n * (avg_X_cols ^ 2)) / (n - 1); + is_unsafe = ppred (var_X_cols, 0.0, "<="); + scale_X = 1.0 / sqrt (var_X_cols * (1 - is_unsafe) + is_unsafe); + scale_X [m_ext, 1] = 1; + shift_X = - avg_X_cols * scale_X; + shift_X [m_ext, 1] = 0; +} else { + scale_X = matrix (1, rows = m_ext, cols = 1); + shift_X = matrix (0, rows = m_ext, cols = 1); +} + +# Henceforth, if intercept_status == 2, we use "X %*% (SHIFT/SCALE TRANSFORM)" +# instead of "X". However, in order to preserve the sparsity of X, +# we apply the transform associatively to some other part of the expression +# in which it occurs. 
To avoid materializing a large matrix, we rewrite it: +# +# ssX_A = (SHIFT/SCALE TRANSFORM) %*% A --- is rewritten as: +# ssX_A = diag (scale_X) %*% A; +# ssX_A [m_ext, ] = ssX_A [m_ext, ] + t(shift_X) %*% A; +# +# tssX_A = t(SHIFT/SCALE TRANSFORM) %*% A --- is rewritten as: +# tssX_A = diag (scale_X) %*% A + shift_X %*% A [m_ext, ]; + +lambda = scale_lambda * regularization; +beta_unscaled = matrix (0, rows = m_ext, cols = 1); + +if (max_iteration == 0) { + max_iteration = m_ext; +} +i = 0; + +# BEGIN THE CONJUGATE GRADIENT ALGORITHM +print ("Running the CG algorithm..."); + +r = - t(X) %*% y; + +if (intercept_status == 2) { + r = scale_X * r + shift_X %*% r [m_ext, ]; +} + +p = - r; +norm_r2 = sum (r ^ 2); +norm_r2_initial = norm_r2; +norm_r2_target = norm_r2_initial * tolerance ^ 2; +print ("||r|| initial value = " + sqrt (norm_r2_initial) + ", target value = " + sqrt (norm_r2_target)); +log_str = "CG_RESIDUAL_NORM,0," + sqrt (norm_r2_initial); +log_str = append (log_str, "CG_RESIDUAL_RATIO,0,1.0"); + +while (i < max_iteration & norm_r2 > norm_r2_target) +{ + if (intercept_status == 2) { + ssX_p = scale_X * p; + ssX_p [m_ext, ] = ssX_p [m_ext, ] + t(shift_X) %*% p; + } else { + ssX_p = p; + } + + q = t(X) %*% (X %*% ssX_p); + + if (intercept_status == 2) { + q = scale_X * q + shift_X %*% q [m_ext, ]; + } + + q = q + lambda * p; + a = norm_r2 / sum (p * q); + beta_unscaled = beta_unscaled + a * p; + r = r + a * q; + old_norm_r2 = norm_r2; + norm_r2 = sum (r ^ 2); + p = -r + (norm_r2 / old_norm_r2) * p; + i = i + 1; + print ("Iteration " + i + ": ||r|| / ||r init|| = " + sqrt (norm_r2 / norm_r2_initial)); + log_str = append (log_str, "CG_RESIDUAL_NORM," + i + "," + sqrt (norm_r2)); + log_str = append (log_str, "CG_RESIDUAL_RATIO," + i + "," + sqrt (norm_r2 / norm_r2_initial)); +} + +if (i >= max_iteration) { + print ("Warning: the maximum number of iterations has been reached."); +} +print ("The CG algorithm is done."); +# END THE CONJUGATE GRADIENT ALGORITHM + +if (intercept_status == 2) { + beta = scale_X * beta_unscaled; + beta [m_ext, ] = beta [m_ext, ] + t(shift_X) %*% beta_unscaled; +} else { + beta = beta_unscaled; +} + +print ("Computing the statistics..."); + +avg_tot = sum (y) / n; +ss_tot = sum (y ^ 2); +ss_avg_tot = ss_tot - n * avg_tot ^ 2; +var_tot = ss_avg_tot / (n - 1); +y_residual = y - X %*% beta; +avg_res = sum (y_residual) / n; +ss_res = sum (y_residual ^ 2); +ss_avg_res = ss_res - n * avg_res ^ 2; + +plain_R2 = 1 - ss_res / ss_avg_tot; +if (n > m_ext) { + dispersion = ss_res / (n - m_ext); + adjusted_R2 = 1 - dispersion / (ss_avg_tot / (n - 1)); +} else { + dispersion = 0.0 / 0.0; + adjusted_R2 = 0.0 / 0.0; +} + +plain_R2_nobias = 1 - ss_avg_res / ss_avg_tot; +deg_freedom = n - m - 1; +if (deg_freedom > 0) { + var_res = ss_avg_res / deg_freedom; + adjusted_R2_nobias = 1 - var_res / (ss_avg_tot / (n - 1)); +} else { + var_res = 0.0 / 0.0; + adjusted_R2_nobias = 0.0 / 0.0; + print ("Warning: zero or negative number of degrees of freedom."); +} + +plain_R2_vs_0 = 1 - ss_res / ss_tot; +if (n > m) { + adjusted_R2_vs_0 = 1 - (ss_res / (n - m)) / (ss_tot / n); +} else { + adjusted_R2_vs_0 = 0.0 / 0.0; +} + +str = "AVG_TOT_Y," + avg_tot; # Average of the response value Y +str = append (str, "STDEV_TOT_Y," + sqrt (var_tot)); # Standard Deviation of the response value Y +str = append (str, "AVG_RES_Y," + avg_res); # Average of the residual Y - pred(Y|X), i.e. 
residual bias +str = append (str, "STDEV_RES_Y," + sqrt (var_res)); # Standard Deviation of the residual Y - pred(Y|X) +str = append (str, "DISPERSION," + dispersion); # GLM-style dispersion, i.e. residual sum of squares / # d.f. +str = append (str, "PLAIN_R2," + plain_R2); # Plain R^2 of residual with bias included vs. total average +str = append (str, "ADJUSTED_R2," + adjusted_R2); # Adjusted R^2 of residual with bias included vs. total average +str = append (str, "PLAIN_R2_NOBIAS," + plain_R2_nobias); # Plain R^2 of residual with bias subtracted vs. total average +str = append (str, "ADJUSTED_R2_NOBIAS," + adjusted_R2_nobias); # Adjusted R^2 of residual with bias subtracted vs. total average +if (intercept_status == 0) { + str = append (str, "PLAIN_R2_VS_0," + plain_R2_vs_0); # Plain R^2 of residual with bias included vs. zero constant + str = append (str, "ADJUSTED_R2_VS_0," + adjusted_R2_vs_0); # Adjusted R^2 of residual with bias included vs. zero constant +} + +if (fileO != " ") { + write (str, fileO); +} else { + print (str); +} + +# Prepare the output matrix +print ("Writing the output matrix..."); + +if (intercept_status == 2) { + beta_out = append (beta, beta_unscaled); +} else { + beta_out = beta; +} +write (beta_out, fileB, format=fmtB); + +if (fileLog != " ") { + write (log_str, fileLog); +} +print ("END LINEAR REGRESSION SCRIPT"); http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/scripts/algorithms/LinearRegDS.dml ---------------------------------------------------------------------- diff --git a/scripts/algorithms/LinearRegDS.dml b/scripts/algorithms/LinearRegDS.dml index 501acc8..0ec663a 100644 --- a/scripts/algorithms/LinearRegDS.dml +++ b/scripts/algorithms/LinearRegDS.dml @@ -1,224 +1,224 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# -#------------------------------------------------------------- - -# -# THIS SCRIPT SOLVES LINEAR REGRESSION USING A DIRECT SOLVER FOR (X^T X + diag(lambda)) beta = X^T y -# -# INPUT PARAMETERS: -# -------------------------------------------------------------------------------------------- -# NAME TYPE DEFAULT MEANING -# -------------------------------------------------------------------------------------------- -# X String --- Location (on HDFS) to read the matrix X of feature vectors -# Y String --- Location (on HDFS) to read the 1-column matrix Y of response values -# B String --- Location to store estimated regression parameters (the betas) -# O String " " Location to write the printed statistics; by default it is standard output -# icpt Int 0 Intercept presence, shifting and rescaling the columns of X: -# 0 = no intercept, no shifting, no rescaling; -# 1 = add intercept, but neither shift nor rescale X; -# 2 = add intercept, shift & rescale X columns to mean = 0, variance = 1 -# reg Double 0.000001 Regularization constant (lambda) for L2-regularization; set to nonzero -# for highly dependent/sparse/numerous features -# fmt String "text" Matrix output format for B (the betas) only, usually "text" or "csv" -# -------------------------------------------------------------------------------------------- -# OUTPUT: Matrix of regression parameters (the betas); its size depends on the icpt input value: -# OUTPUT SIZE: OUTPUT CONTENTS: HOW TO PREDICT Y FROM X AND B: -# icpt=0: ncol(X) x 1 Betas for X only Y ~ X %*% B[1:ncol(X), 1], or just X %*% B -# icpt=1: ncol(X)+1 x 1 Betas for X and intercept Y ~ X %*% B[1:ncol(X), 1] + B[ncol(X)+1, 1] -# icpt=2: ncol(X)+1 x 2 Col.1: betas for X & intercept Y ~ X %*% B[1:ncol(X), 1] + B[ncol(X)+1, 1] -# Col.2: betas for shifted/rescaled X and intercept -# -# In addition, some regression statistics are provided in CSV format, one comma-separated -# name-value pair per line, as follows: -# -# NAME MEANING -# ------------------------------------------------------------------------------------- -# AVG_TOT_Y Average of the response value Y -# STDEV_TOT_Y Standard Deviation of the response value Y -# AVG_RES_Y Average of the residual Y - pred(Y|X), i.e. residual bias -# STDEV_RES_Y Standard Deviation of the residual Y - pred(Y|X) -# DISPERSION GLM-style dispersion, i.e. residual sum of squares / # deg. fr. -# PLAIN_R2 Plain R^2 of residual with bias included vs. total average -# ADJUSTED_R2 Adjusted R^2 of residual with bias included vs. total average -# PLAIN_R2_NOBIAS Plain R^2 of residual with bias subtracted vs. total average -# ADJUSTED_R2_NOBIAS Adjusted R^2 of residual with bias subtracted vs. total average -# PLAIN_R2_VS_0 * Plain R^2 of residual with bias included vs. zero constant -# ADJUSTED_R2_VS_0 * Adjusted R^2 of residual with bias included vs. 
zero constant -# ------------------------------------------------------------------------------------- -# * The last two statistics are only printed if there is no intercept (icpt=0) -# -# HOW TO INVOKE THIS SCRIPT - EXAMPLE: -# hadoop jar SystemML.jar -f LinearRegDS.dml -nvargs X=INPUT_DIR/X Y=INPUT_DIR/Y B=OUTPUT_DIR/B -# O=OUTPUT_DIR/Out icpt=2 reg=1.0 fmt=csv - -fileX = $X; -fileY = $Y; -fileB = $B; -fileO = ifdef ($O, " "); -fmtB = ifdef ($fmt, "text"); - -intercept_status = ifdef ($icpt, 0); # $icpt=0; -regularization = ifdef ($reg, 0.000001); # $reg=0.000001; - -print ("BEGIN LINEAR REGRESSION SCRIPT"); -print ("Reading X and Y..."); -X = read (fileX); -y = read (fileY); - -n = nrow (X); -m = ncol (X); -ones_n = matrix (1, rows = n, cols = 1); -zero_cell = matrix (0, rows = 1, cols = 1); - -# Introduce the intercept, shift and rescale the columns of X if needed - -m_ext = m; -if (intercept_status == 1 | intercept_status == 2) # add the intercept column -{ - X = append (X, ones_n); - m_ext = ncol (X); -} - -scale_lambda = matrix (1, rows = m_ext, cols = 1); -if (intercept_status == 1 | intercept_status == 2) -{ - scale_lambda [m_ext, 1] = 0; -} - -if (intercept_status == 2) # scale-&-shift X columns to mean 0, variance 1 -{ # Important assumption: X [, m_ext] = ones_n - avg_X_cols = t(colSums(X)) / n; - var_X_cols = (t(colSums (X ^ 2)) - n * (avg_X_cols ^ 2)) / (n - 1); - is_unsafe = ppred (var_X_cols, 0.0, "<="); - scale_X = 1.0 / sqrt (var_X_cols * (1 - is_unsafe) + is_unsafe); - scale_X [m_ext, 1] = 1; - shift_X = - avg_X_cols * scale_X; - shift_X [m_ext, 1] = 0; -} else { - scale_X = matrix (1, rows = m_ext, cols = 1); - shift_X = matrix (0, rows = m_ext, cols = 1); -} - -# Henceforth, if intercept_status == 2, we use "X %*% (SHIFT/SCALE TRANSFORM)" -# instead of "X". However, in order to preserve the sparsity of X, -# we apply the transform associatively to some other part of the expression -# in which it occurs. 
To avoid materializing a large matrix, we rewrite it: -# -# ssX_A = (SHIFT/SCALE TRANSFORM) %*% A --- is rewritten as: -# ssX_A = diag (scale_X) %*% A; -# ssX_A [m_ext, ] = ssX_A [m_ext, ] + t(shift_X) %*% A; -# -# tssX_A = t(SHIFT/SCALE TRANSFORM) %*% A --- is rewritten as: -# tssX_A = diag (scale_X) %*% A + shift_X %*% A [m_ext, ]; - -lambda = scale_lambda * regularization; - -# BEGIN THE DIRECT SOLVE ALGORITHM (EXTERNAL CALL) - -A = t(X) %*% X; -b = t(X) %*% y; -if (intercept_status == 2) { - A = t(diag (scale_X) %*% A + shift_X %*% A [m_ext, ]); - A = diag (scale_X) %*% A + shift_X %*% A [m_ext, ]; - b = diag (scale_X) %*% b + shift_X %*% b [m_ext, ]; -} -A = A + diag (lambda); - -print ("Calling the Direct Solver..."); - -beta_unscaled = solve (A, b); - -# END THE DIRECT SOLVE ALGORITHM - -if (intercept_status == 2) { - beta = scale_X * beta_unscaled; - beta [m_ext, ] = beta [m_ext, ] + t(shift_X) %*% beta_unscaled; -} else { - beta = beta_unscaled; -} - -print ("Computing the statistics..."); - -avg_tot = sum (y) / n; -ss_tot = sum (y ^ 2); -ss_avg_tot = ss_tot - n * avg_tot ^ 2; -var_tot = ss_avg_tot / (n - 1); -y_residual = y - X %*% beta; -avg_res = sum (y_residual) / n; -ss_res = sum (y_residual ^ 2); -ss_avg_res = ss_res - n * avg_res ^ 2; - -plain_R2 = 1 - ss_res / ss_avg_tot; -if (n > m_ext) { - dispersion = ss_res / (n - m_ext); - adjusted_R2 = 1 - dispersion / (ss_avg_tot / (n - 1)); -} else { - dispersion = 0.0 / 0.0; - adjusted_R2 = 0.0 / 0.0; -} - -plain_R2_nobias = 1 - ss_avg_res / ss_avg_tot; -deg_freedom = n - m - 1; -if (deg_freedom > 0) { - var_res = ss_avg_res / deg_freedom; - adjusted_R2_nobias = 1 - var_res / (ss_avg_tot / (n - 1)); -} else { - var_res = 0.0 / 0.0; - adjusted_R2_nobias = 0.0 / 0.0; - print ("Warning: zero or negative number of degrees of freedom."); -} - -plain_R2_vs_0 = 1 - ss_res / ss_tot; -if (n > m) { - adjusted_R2_vs_0 = 1 - (ss_res / (n - m)) / (ss_tot / n); -} else { - adjusted_R2_vs_0 = 0.0 / 0.0; -} - -str = "AVG_TOT_Y," + avg_tot; # Average of the response value Y -str = append (str, "STDEV_TOT_Y," + sqrt (var_tot)); # Standard Deviation of the response value Y -str = append (str, "AVG_RES_Y," + avg_res); # Average of the residual Y - pred(Y|X), i.e. residual bias -str = append (str, "STDEV_RES_Y," + sqrt (var_res)); # Standard Deviation of the residual Y - pred(Y|X) -str = append (str, "DISPERSION," + dispersion); # GLM-style dispersion, i.e. residual sum of squares / # d.f. -str = append (str, "PLAIN_R2," + plain_R2); # Plain R^2 of residual with bias included vs. total average -str = append (str, "ADJUSTED_R2," + adjusted_R2); # Adjusted R^2 of residual with bias included vs. total average -str = append (str, "PLAIN_R2_NOBIAS," + plain_R2_nobias); # Plain R^2 of residual with bias subtracted vs. total average -str = append (str, "ADJUSTED_R2_NOBIAS," + adjusted_R2_nobias); # Adjusted R^2 of residual with bias subtracted vs. total average -if (intercept_status == 0) { - str = append (str, "PLAIN_R2_VS_0," + plain_R2_vs_0); # Plain R^2 of residual with bias included vs. zero constant - str = append (str, "ADJUSTED_R2_VS_0," + adjusted_R2_vs_0); # Adjusted R^2 of residual with bias included vs. 
-}
-
-if (fileO != " ") {
-    write (str, fileO);
-} else {
-    print (str);
-}
-
-# Prepare the output matrix
-print ("Writing the output matrix...");
-
-if (intercept_status == 2) {
-    beta_out = append (beta, beta_unscaled);
-} else {
-    beta_out = beta;
-}
-write (beta_out, fileB, format=fmtB);
-print ("END LINEAR REGRESSION SCRIPT");
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+#
+# THIS SCRIPT SOLVES LINEAR REGRESSION USING A DIRECT SOLVER FOR (X^T X + lambda) beta = X^T y
+#
+# INPUT PARAMETERS:
+# --------------------------------------------------------------------------------------------
+# NAME  TYPE    DEFAULT   MEANING
+# --------------------------------------------------------------------------------------------
+# X     String  ---       Location (on HDFS) to read the matrix X of feature vectors
+# Y     String  ---       Location (on HDFS) to read the 1-column matrix Y of response values
+# B     String  ---       Location to store estimated regression parameters (the betas)
+# O     String  " "       Location to write the printed statistics; by default it is standard output
+# icpt  Int     0         Intercept presence, shifting and rescaling the columns of X:
+#                         0 = no intercept, no shifting, no rescaling;
+#                         1 = add intercept, but neither shift nor rescale X;
+#                         2 = add intercept, shift & rescale X columns to mean = 0, variance = 1
+# reg   Double  0.000001  Regularization constant (lambda) for L2-regularization; set to nonzero
+#                         for highly dependent/sparse/numerous features
+# fmt   String  "text"    Matrix output format for B (the betas) only, usually "text" or "csv"
+# --------------------------------------------------------------------------------------------
+# OUTPUT: Matrix of regression parameters (the betas); its contents and size depend on the icpt input value:
+#         OUTPUT SIZE:    OUTPUT CONTENTS:                HOW TO PREDICT Y FROM X AND B:
+# icpt=0: ncol(X) x 1     Betas for X only                Y ~ X %*% B[1:ncol(X), 1], or just X %*% B
+# icpt=1: ncol(X)+1 x 1   Betas for X and intercept       Y ~ X %*% B[1:ncol(X), 1] + B[ncol(X)+1, 1]
+# icpt=2: ncol(X)+1 x 2   Col.1: betas for X & intercept  Y ~ X %*% B[1:ncol(X), 1] + B[ncol(X)+1, 1]
+#                         Col.2: betas for shifted/rescaled X and intercept
+#
+# In addition, some regression statistics are provided in CSV format, one comma-separated
+# name-value pair per line, as follows:
+#
+# NAME                  MEANING
+# -------------------------------------------------------------------------------------
+# AVG_TOT_Y             Average of the response value Y
+# STDEV_TOT_Y           Standard Deviation of the response value Y
+# AVG_RES_Y             Average of the residual Y - pred(Y|X), i.e. residual bias
+# STDEV_RES_Y           Standard Deviation of the residual Y - pred(Y|X)
+# DISPERSION            GLM-style dispersion, i.e. residual sum of squares / # deg. fr.
+# PLAIN_R2              Plain R^2 of residual with bias included vs. total average
+# ADJUSTED_R2           Adjusted R^2 of residual with bias included vs. total average
+# PLAIN_R2_NOBIAS       Plain R^2 of residual with bias subtracted vs. total average
+# ADJUSTED_R2_NOBIAS    Adjusted R^2 of residual with bias subtracted vs. total average
+# PLAIN_R2_VS_0 *       Plain R^2 of residual with bias included vs. zero constant
+# ADJUSTED_R2_VS_0 *    Adjusted R^2 of residual with bias included vs. zero constant
+# -------------------------------------------------------------------------------------
+# * The last two statistics are only printed if there is no intercept (icpt=0)
+#
+# HOW TO INVOKE THIS SCRIPT - EXAMPLE:
+# hadoop jar SystemML.jar -f LinearRegDS.dml -nvargs X=INPUT_DIR/X Y=INPUT_DIR/Y B=OUTPUT_DIR/B
+#     O=OUTPUT_DIR/Out icpt=2 reg=1.0 fmt=csv
+
+fileX = $X;
+fileY = $Y;
+fileB = $B;
+fileO = ifdef ($O, " ");
+fmtB = ifdef ($fmt, "text");
+
+intercept_status = ifdef ($icpt, 0);      # $icpt=0;
+regularization = ifdef ($reg, 0.000001);  # $reg=0.000001;
+
+print ("BEGIN LINEAR REGRESSION SCRIPT");
+print ("Reading X and Y...");
+X = read (fileX);
+y = read (fileY);
+
+n = nrow (X);
+m = ncol (X);
+ones_n = matrix (1, rows = n, cols = 1);
+zero_cell = matrix (0, rows = 1, cols = 1);
+
+# Introduce the intercept, shift and rescale the columns of X if needed
+
+m_ext = m;
+if (intercept_status == 1 | intercept_status == 2)  # add the intercept column
+{
+    X = append (X, ones_n);
+    m_ext = ncol (X);
+}
+
+scale_lambda = matrix (1, rows = m_ext, cols = 1);
+if (intercept_status == 1 | intercept_status == 2)
+{
+    scale_lambda [m_ext, 1] = 0;
+}
+
+if (intercept_status == 2)  # scale-&-shift X columns to mean 0, variance 1
+{                           # Important assumption: X [, m_ext] = ones_n
+    avg_X_cols = t(colSums(X)) / n;
+    var_X_cols = (t(colSums (X ^ 2)) - n * (avg_X_cols ^ 2)) / (n - 1);
+    is_unsafe = ppred (var_X_cols, 0.0, "<=");
+    scale_X = 1.0 / sqrt (var_X_cols * (1 - is_unsafe) + is_unsafe);
+    scale_X [m_ext, 1] = 1;
+    shift_X = - avg_X_cols * scale_X;
+    shift_X [m_ext, 1] = 0;
+} else {
+    scale_X = matrix (1, rows = m_ext, cols = 1);
+    shift_X = matrix (0, rows = m_ext, cols = 1);
+}
+
+# Henceforth, if intercept_status == 2, we use "X %*% (SHIFT/SCALE TRANSFORM)"
+# instead of "X".  However, in order to preserve the sparsity of X,
+# we apply the transform associatively to some other part of the expression
+# in which it occurs.  To avoid materializing a large matrix, we rewrite it:
+#
+# ssX_A = (SHIFT/SCALE TRANSFORM) %*% A  --- is rewritten as:
+# ssX_A = diag (scale_X) %*% A;
+# ssX_A [m_ext, ] = ssX_A [m_ext, ] + t(shift_X) %*% A;
+#
+# tssX_A = t(SHIFT/SCALE TRANSFORM) %*% A  --- is rewritten as:
+# tssX_A = diag (scale_X) %*% A + shift_X %*% A [m_ext, ];
+
+lambda = scale_lambda * regularization;
+
+# BEGIN THE DIRECT SOLVE ALGORITHM (EXTERNAL CALL)
+
+A = t(X) %*% X;
+b = t(X) %*% y;
+if (intercept_status == 2) {
+    A = t(diag (scale_X) %*% A + shift_X %*% A [m_ext, ]);
+    A = diag (scale_X) %*% A + shift_X %*% A [m_ext, ];
+    b = diag (scale_X) %*% b + shift_X %*% b [m_ext, ];
+}
+A = A + diag (lambda);
+
+print ("Calling the Direct Solver...");
+
+beta_unscaled = solve (A, b);
+
+# END THE DIRECT SOLVE ALGORITHM
+
+if (intercept_status == 2) {
+    beta = scale_X * beta_unscaled;
+    beta [m_ext, ] = beta [m_ext, ] + t(shift_X) %*% beta_unscaled;
+} else {
+    beta = beta_unscaled;
+}
+
+print ("Computing the statistics...");
+
+avg_tot = sum (y) / n;
+ss_tot = sum (y ^ 2);
+ss_avg_tot = ss_tot - n * avg_tot ^ 2;
+var_tot = ss_avg_tot / (n - 1);
+y_residual = y - X %*% beta;
+avg_res = sum (y_residual) / n;
+ss_res = sum (y_residual ^ 2);
+ss_avg_res = ss_res - n * avg_res ^ 2;
+
+plain_R2 = 1 - ss_res / ss_avg_tot;
+if (n > m_ext) {
+    dispersion = ss_res / (n - m_ext);
+    adjusted_R2 = 1 - dispersion / (ss_avg_tot / (n - 1));
+} else {
+    dispersion = 0.0 / 0.0;
+    adjusted_R2 = 0.0 / 0.0;
+}
+
+plain_R2_nobias = 1 - ss_avg_res / ss_avg_tot;
+deg_freedom = n - m - 1;
+if (deg_freedom > 0) {
+    var_res = ss_avg_res / deg_freedom;
+    adjusted_R2_nobias = 1 - var_res / (ss_avg_tot / (n - 1));
+} else {
+    var_res = 0.0 / 0.0;
+    adjusted_R2_nobias = 0.0 / 0.0;
+    print ("Warning: zero or negative number of degrees of freedom.");
+}
+
+plain_R2_vs_0 = 1 - ss_res / ss_tot;
+if (n > m) {
+    adjusted_R2_vs_0 = 1 - (ss_res / (n - m)) / (ss_tot / n);
+} else {
+    adjusted_R2_vs_0 = 0.0 / 0.0;
+}
+
+str = "AVG_TOT_Y," + avg_tot;  # Average of the response value Y
+str = append (str, "STDEV_TOT_Y," + sqrt (var_tot));  # Standard Deviation of the response value Y
+str = append (str, "AVG_RES_Y," + avg_res);  # Average of the residual Y - pred(Y|X), i.e. residual bias
+str = append (str, "STDEV_RES_Y," + sqrt (var_res));  # Standard Deviation of the residual Y - pred(Y|X)
+str = append (str, "DISPERSION," + dispersion);  # GLM-style dispersion, i.e. residual sum of squares / # d.f.
+str = append (str, "PLAIN_R2," + plain_R2);  # Plain R^2 of residual with bias included vs. total average
+str = append (str, "ADJUSTED_R2," + adjusted_R2);  # Adjusted R^2 of residual with bias included vs. total average
+str = append (str, "PLAIN_R2_NOBIAS," + plain_R2_nobias);  # Plain R^2 of residual with bias subtracted vs. total average
+str = append (str, "ADJUSTED_R2_NOBIAS," + adjusted_R2_nobias);  # Adjusted R^2 of residual with bias subtracted vs. total average
+if (intercept_status == 0) {
+    str = append (str, "PLAIN_R2_VS_0," + plain_R2_vs_0);  # Plain R^2 of residual with bias included vs. zero constant
+    str = append (str, "ADJUSTED_R2_VS_0," + adjusted_R2_vs_0);  # Adjusted R^2 of residual with bias included vs. zero constant
+}
+
+if (fileO != " ") {
+    write (str, fileO);
+} else {
+    print (str);
+}
+
+# Prepare the output matrix
+print ("Writing the output matrix...");
+
+if (intercept_status == 2) {
+    beta_out = append (beta, beta_unscaled);
+} else {
+    beta_out = beta;
+}
+write (beta_out, fileB, format=fmtB);
+print ("END LINEAR REGRESSION SCRIPT");
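Note for readers: the DIRECT SOLVE step in LinearRegDS.dml above forms the ridge-regularized normal equations A = t(X) %*% X + diag(lambda) and b = t(X) %*% y, then solves A beta = b, with scale_lambda zeroing the penalty on the intercept column. The numpy sketch below re-derives that solve and the PLAIN_R2 / DISPERSION / ADJUSTED_R2 statistics for the icpt=1 case; it is an illustration only, with synthetic data and illustrative names, not SystemML/DML code and not part of this commit.

import numpy as np

rng = np.random.default_rng(0)
n, m = 200, 5
X = rng.normal(size=(n, m))                              # synthetic feature matrix
y = X @ rng.normal(size=m) + 0.1 * rng.normal(size=n)    # synthetic response

X_ext = np.hstack([X, np.ones((n, 1))])                  # icpt=1: append the intercept column
m_ext = m + 1
lam = np.full(m_ext, 1e-6)                               # regularization constant per column
lam[m] = 0.0                                             # scale_lambda zeroes the intercept penalty

A = X_ext.T @ X_ext + np.diag(lam)                       # A = t(X) %*% X + diag (lambda)
b = X_ext.T @ y                                          # b = t(X) %*% y
beta = np.linalg.solve(A, b)                             # beta_unscaled = solve (A, b)

beta_ls, *_ = np.linalg.lstsq(X_ext, y, rcond=None)      # unregularized least-squares cross-check
assert np.allclose(beta, beta_ls, atol=1e-4)             # tiny lambda barely perturbs the solution

y_res = y - X_ext @ beta                                 # y_residual
ss_res = np.sum(y_res ** 2)
ss_avg_tot = np.sum(y ** 2) - n * y.mean() ** 2
plain_R2 = 1 - ss_res / ss_avg_tot                       # PLAIN_R2
dispersion = ss_res / (n - m_ext)                        # DISPERSION
adjusted_R2 = 1 - dispersion / (ss_avg_tot / (n - 1))    # ADJUSTED_R2
print(plain_R2, adjusted_R2)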

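Similarly, the icpt=2 path never materializes the dense SHIFT/SCALE TRANSFORM: per the ssX_A / tssX_A rewrites documented in the script's header comment, it applies diag (scale_X) plus a rank-1 shift correction to A and b instead. The following numpy check (again synthetic and illustrative only) builds the dense transform S explicitly and confirms that the script's two-step rewrite of A equals t(S) %*% A %*% S:

import numpy as np

rng = np.random.default_rng(1)
n, m = 100, 4
m_ext = m + 1
X = np.hstack([rng.normal(size=(n, m)), np.ones((n, 1))])   # last column plays the role of ones_n

last = m_ext - 1                                            # 0-based index of the intercept column
avg = X.sum(axis=0) / n                                     # avg_X_cols
var = (np.sum(X ** 2, axis=0) - n * avg ** 2) / (n - 1)     # var_X_cols
scale = 1.0 / np.sqrt(np.where(var <= 0.0, 1.0, var))       # is_unsafe guard for zero variance
scale[last] = 1.0
shift = -avg * scale
shift[last] = 0.0

S = np.diag(scale)                                          # dense transform, built only for this check
S[last, :] += shift                                         # X @ S == shifted/rescaled X

A = X.T @ X
step1 = np.diag(scale) @ A + np.outer(shift, A[last, :])    # tssX_A rewrite: t(S) %*% A
A1 = step1.T                                                # script's intermediate A = t(...)
A_rw = np.diag(scale) @ A1 + np.outer(shift, A1[last, :])   # second application of the rewrite

assert np.allclose(A_rw, S.T @ A @ S)                       # two-step rewrite == direct triple product
print("shift/scale rewrite verified")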