http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/scripts/algorithms/Kmeans.dml ---------------------------------------------------------------------- diff --git a/scripts/algorithms/Kmeans.dml b/scripts/algorithms/Kmeans.dml index 3ec47a8..2887baa 100644 --- a/scripts/algorithms/Kmeans.dml +++ b/scripts/algorithms/Kmeans.dml @@ -1,282 +1,282 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -# -# Implements the k-Means clustering algorithm -# -# INPUT PARAMETERS: -# ---------------------------------------------------------------------------- -# NAME TYPE DEFAULT MEANING -# ---------------------------------------------------------------------------- -# X String --- Location to read matrix X with the input data records -# k Int --- Number of centroids -# runs Int 10 Number of runs (with different initial centroids) -# maxi Int 1000 Maximum number of iterations per run -# tol Double 0.000001 Tolerance (epsilon) for WCSS change ratio -# samp Int 50 Average number of records per centroid in data samples -# C String "C.mtx" Location to store the output matrix with the centroids -# isY Int 0 0 = do not write Y, 1 = write Y -# Y String "Y.mtx" Location to store the mapping of records to centroids -# fmt String "text" Matrix output format, usually "text" or "csv" -# verb Int 0 0 = do not print per-iteration stats, 1 = print them -# ---------------------------------------------------------------------------- -# -# Example: -# hadoop jar SystemML.jar -f Kmeans.dml -nvargs X=X.mtx k=5 C=centroids.mtx -# hadoop jar SystemML.jar -f Kmeans.dml -nvargs X=X.mtx k=5 runs=100 maxi=5000 tol=0.00000001 samp=20 C=centroids.mtx isY=1 Y=clusters.mtx verb=1 - -fileX = $X; -fileY = ifdef ($Y, "Y.mtx"); -fileC = ifdef ($C, "C.mtx"); - -num_centroids = $k; -num_runs = ifdef ($runs, 10); # $runs=10; -max_iter = ifdef ($maxi, 1000); # $maxi=1000; -eps = ifdef ($tol, 0.000001); # $tol=0.000001; -is_write_Y = ifdef ($isY, 0); # $isY=0; -is_verbose = ifdef ($verb, 0); # $verb=0; -fmtCY = ifdef ($fmt, "text"); # $fmt="text"; -avg_sample_size_per_centroid = ifdef ($samp, 50); # $samp=50; - - -print ("BEGIN K-MEANS SCRIPT"); -print ("Reading X..."); - -# X : matrix of data points as rows -X = read (fileX); -num_records = nrow (X); -num_features = ncol (X); - -sumXsq = sum (X ^ 2); -# Remark - A useful rewrite: sum (A %*% B) = sum (t(colSums(A)) * rowSums(B)) - -# STEP 1: INITIALIZE CENTROIDS FOR ALL RUNS FROM DATA SAMPLES: - -print ("Taking data samples for initialization..."); - -[sample_maps, samples_vs_runs_map, sample_block_size] = - get_sample_maps (num_records, num_runs, num_centroids * avg_sample_size_per_centroid); - 
-is_row_in_samples = rowSums (sample_maps); -X_samples = sample_maps %*% X; -X_samples_sq_norms = rowSums (X_samples ^ 2); - -print ("Initializing the centroids for all runs..."); -All_Centroids = matrix (0, rows = (num_runs * num_centroids), cols = num_features); - -# We select centroids according to the k-Means++ heuristic applied to a sample of X -# Loop invariant: min_distances ~ sq.distances from X_sample rows to nearest centroids, -# with the out-of-range X_sample positions in min_distances set to 0.0 - -min_distances = is_row_in_samples; # Pick the 1-st centroids uniformly at random - -for (i in 1 : num_centroids) -{ - # "Matricize" and prefix-sum to compute the cumulative distribution function: - min_distances_matrix_form = - matrix (min_distances, rows = sample_block_size, cols = num_runs, byrow = FALSE); - cdf_min_distances = cumsum (min_distances_matrix_form); - - # Select the i-th centroid in each sample as a random sample row id with - # probability ~ min_distances: - random_row = Rand (rows = 1, cols = num_runs, min = 0.0, max = 1.0); - threshold_matrix = random_row * cdf_min_distances [sample_block_size, ]; - centroid_ids = t(colSums (ppred (cdf_min_distances, threshold_matrix, "<"))) + 1; - - # Place the selected centroids together, one per run, into a matrix: - centroid_placer = matrix (0, rows = num_runs, cols = (sample_block_size * num_runs)); - centroid_placer_raw = - table (seq (1, num_runs, 1), sample_block_size * seq (0, num_runs - 1, 1) + centroid_ids); - centroid_placer [, 1 : ncol (centroid_placer_raw)] = centroid_placer_raw; - centroids = centroid_placer %*% X_samples; - - # Place the selected centroids into their appropriate slots in All_Centroids: - centroid_placer = matrix (0, rows = nrow (All_Centroids), cols = num_runs); - centroid_placer_raw = - table (seq (i, num_centroids * (num_runs - 1) + i, num_centroids), seq (1, num_runs, 1)); - centroid_placer [1 : nrow (centroid_placer_raw), ] = centroid_placer_raw; - All_Centroids = All_Centroids + centroid_placer %*% centroids; - - # Update min_distances to preserve the loop invariant: - distances = X_samples_sq_norms + samples_vs_runs_map %*% rowSums (centroids ^ 2) - - 2 * rowSums (X_samples * (samples_vs_runs_map %*% centroids)); - if (i == 1) { - min_distances = is_row_in_samples * distances; - } else { - min_distances = min (min_distances, distances); -} } - -# STEP 2: PERFORM K-MEANS ITERATIONS FOR ALL RUNS: - -termination_code = matrix (0, rows = num_runs, cols = 1); -final_wcss = matrix (0, rows = num_runs, cols = 1); -num_iterations = matrix (0, rows = num_runs, cols = 1); - -print ("Performing k-means iterations for all runs..."); - -parfor (run_index in 1 : num_runs, check = 0) -{ - C = All_Centroids [(num_centroids * (run_index - 1) + 1) : (num_centroids * run_index), ]; - C_old = C; - iter_count = 0; - term_code = 0; - wcss = 0; - - while (term_code == 0) - { - # Compute Euclidean squared distances from records (X rows) to centroids (C rows) - # without the C-independent term, then take the minimum for each record - D = -2 * (X %*% t(C)) + t(rowSums (C ^ 2)); - minD = rowMins (D); - # Compute the current centroid-based within-cluster sum of squares (WCSS) - wcss_old = wcss; - wcss = sumXsq + sum (minD); - if (is_verbose == 1) { - if (iter_count == 0) { - print ("Run " + run_index + ", At Start-Up: Centroid WCSS = " + wcss); - } else { - print ("Run " + run_index + ", Iteration " + iter_count + ": Centroid WCSS = " + wcss - + "; Centroid change (avg.sq.dist.) 
= " + (sum ((C - C_old) ^ 2) / num_centroids)); - } } - # Check if convergence or maximum iteration has been reached - if (wcss_old - wcss < eps * wcss & iter_count > 0) { - term_code = 1; # Convergence is reached - } else { - if (iter_count >= max_iter) { - term_code = 2; # Maximum iteration is reached - } else { - iter_count = iter_count + 1; - # Find the closest centroid for each record - P = ppred (D, minD, "<="); - # If some records belong to multiple centroids, share them equally - P = P / rowSums (P); - # Compute the column normalization factor for P - P_denom = colSums (P); - if (sum (ppred (P_denom, 0.0, "<=")) > 0) { - term_code = 3; # There is a "runaway" centroid with 0.0 denominator - } else { - C_old = C; - # Compute new centroids as weighted averages over the records - C = (t(P) %*% X) / t(P_denom); - } } } } - print ("Run " + run_index + ", Iteration " + iter_count + ": Terminated with code = " + term_code + ", Centroid WCSS = " + wcss); - All_Centroids [(num_centroids * (run_index - 1) + 1) : (num_centroids * run_index), ] = C; - final_wcss [run_index, 1] = wcss; - termination_code [run_index, 1] = term_code; - num_iterations [run_index, 1] = iter_count; -} - -# STEP 3: SELECT THE RUN WITH BEST CENTROID-WCSS AND OUTPUT ITS CENTROIDS: - -termination_bitmap = matrix (0, rows = num_runs, cols = 3); -termination_bitmap_raw = table (seq (1, num_runs, 1), termination_code); -termination_bitmap [, 1 : ncol(termination_bitmap_raw)] = termination_bitmap_raw; -termination_stats = colSums (termination_bitmap); -print ("Number of successful runs = " + as.integer (castAsScalar (termination_stats [1, 1]))); -print ("Number of incomplete runs = " + as.integer (castAsScalar (termination_stats [1, 2]))); -print ("Number of failed runs (with lost centroids) = " + as.integer (castAsScalar (termination_stats [1, 3]))); - -num_successful_runs = castAsScalar (termination_stats [1, 1]); -if (num_successful_runs > 0) { - final_wcss_successful = final_wcss * termination_bitmap [, 1]; - worst_wcss = max (final_wcss_successful); - best_wcss = min (final_wcss_successful + (10 * worst_wcss + 10) * (1 - termination_bitmap [, 1])); - avg_wcss = sum (final_wcss_successful) / num_successful_runs; - best_index_vector = ppred (final_wcss_successful, best_wcss, "=="); - aggr_best_index_vector = cumsum (best_index_vector); - best_index = as.integer (sum (ppred (aggr_best_index_vector, 0, "==")) + 1); - print ("Successful runs: Best run is " + best_index + " with Centroid WCSS = " + best_wcss - + "; Avg WCSS = " + avg_wcss + "; Worst WCSS = " + worst_wcss); - C = All_Centroids [(num_centroids * (best_index - 1) + 1) : (num_centroids * best_index), ]; - print ("Writing out the best-WCSS centroids..."); - write (C, fileC, format=fmtCY); - if (is_write_Y == 1) { - print ("Writing out the best-WCSS cluster labels..."); - D = -2 * (X %*% t(C)) + t(rowSums (C ^ 2)); - P = ppred (D, rowMins (D), "<="); - aggr_P = t(cumsum (t(P))); - Y = rowSums (ppred (aggr_P, 0, "==")) + 1 - write (Y, fileY, format=fmtCY); - } - print ("DONE."); -} else { - stop ("No output is produced. 
Try increasing the number of iterations and/or runs."); -} - - - -get_sample_maps = function (int num_records, int num_samples, int approx_sample_size) - return (Matrix[double] sample_maps, Matrix[double] sample_col_map, int sample_block_size) -{ - if (approx_sample_size < num_records) { - # Input value "approx_sample_size" is the average sample size; increase it by ~10 std.dev's - # to get the sample block size (to allocate space): - sample_block_size = as.integer (approx_sample_size + round (10 * sqrt (approx_sample_size))); - num_rows = sample_block_size * num_samples; - - # Generate all samples in parallel by converting uniform random values into random - # integer skip-ahead intervals and prefix-summing them: - sample_rec_ids = Rand (rows = sample_block_size, cols = num_samples, min = 0.0, max = 1.0); - sample_rec_ids = round (log (sample_rec_ids) / log (1.0 - approx_sample_size / num_records) + 0.5); - # Prob [k-1 < log(uniform)/log(1-p) < k] = p*(1-p)^(k-1) = Prob [k-1 zeros before a one] - sample_rec_ids = cumsum (sample_rec_ids); # (skip to next one) --> (skip to i-th one) - - # Replace all sample record ids over "num_records" (i.e. out of range) by "num_records + 1": - is_sample_rec_id_within_range = ppred (sample_rec_ids, num_records, "<="); - sample_rec_ids = sample_rec_ids * is_sample_rec_id_within_range - + (num_records + 1) * (1 - is_sample_rec_id_within_range); - - # Rearrange all samples (and their out-of-range indicators) into one column-vector: - sample_rec_ids = - matrix (sample_rec_ids, rows = num_rows, cols = 1, byrow = FALSE); - is_row_in_samples = - matrix (is_sample_rec_id_within_range, rows = num_rows, cols = 1, byrow = FALSE); - - # Use contingency table to create the "sample_maps" matrix that is a vertical concatenation - # of 0-1-matrices, one per sample, each with 1s at (i, sample_record[i]) and 0s elsewhere: - sample_maps_raw = table (seq (1, num_rows), sample_rec_ids); - max_rec_id = ncol (sample_maps_raw); - if (max_rec_id >= num_records) { - sample_maps = sample_maps_raw [, 1 : num_records]; - } else { - sample_maps = matrix (0, rows = num_rows, cols = num_records); - sample_maps [, 1 : max_rec_id] = sample_maps_raw; - } - - # Create a 0-1-matrix that maps each sample column ID into all row positions of the - # corresponding sample; map out-of-sample-range positions to row id = num_rows + 1: - sample_positions = (num_rows + 1) - is_row_in_samples * seq (num_rows, 1, -1); - # Column ID positions = 1, 1, ..., 1, 2, 2, ..., 2, . . . , n_c, n_c, ..., n_c: - col_positions = round (0.5 + seq (0, num_rows - 1, 1) / sample_block_size); - sample_col_map = table (sample_positions, col_positions); - # Remove the out-of-sample-range positions by cutting off the last row: - sample_col_map = sample_col_map [1 : (num_rows), ]; - - } else { - one_per_record = matrix (1, rows = num_records, cols = 1); - sample_block_size = num_records; - sample_maps = matrix (0, rows = (num_records * num_samples), cols = num_records); - sample_col_map = matrix (0, rows = (num_records * num_samples), cols = num_samples); - for (i in 1:num_samples) { - sample_maps [(num_records * (i - 1) + 1) : (num_records * i), ] = diag (one_per_record); - sample_col_map [(num_records * (i - 1) + 1) : (num_records * i), i] = one_per_record; -} } } - +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# +# Implements the k-Means clustering algorithm +# +# INPUT PARAMETERS: +# ---------------------------------------------------------------------------- +# NAME TYPE DEFAULT MEANING +# ---------------------------------------------------------------------------- +# X String --- Location to read matrix X with the input data records +# k Int --- Number of centroids +# runs Int 10 Number of runs (with different initial centroids) +# maxi Int 1000 Maximum number of iterations per run +# tol Double 0.000001 Tolerance (epsilon) for WCSS change ratio +# samp Int 50 Average number of records per centroid in data samples +# C String "C.mtx" Location to store the output matrix with the centroids +# isY Int 0 0 = do not write Y, 1 = write Y +# Y String "Y.mtx" Location to store the mapping of records to centroids +# fmt String "text" Matrix output format, usually "text" or "csv" +# verb Int 0 0 = do not print per-iteration stats, 1 = print them +# ---------------------------------------------------------------------------- +# +# Example: +# hadoop jar SystemML.jar -f Kmeans.dml -nvargs X=X.mtx k=5 C=centroids.mtx +# hadoop jar SystemML.jar -f Kmeans.dml -nvargs X=X.mtx k=5 runs=100 maxi=5000 tol=0.00000001 samp=20 C=centroids.mtx isY=1 Y=clusters.mtx verb=1 + +fileX = $X; +fileY = ifdef ($Y, "Y.mtx"); +fileC = ifdef ($C, "C.mtx"); + +num_centroids = $k; +num_runs = ifdef ($runs, 10); # $runs=10; +max_iter = ifdef ($maxi, 1000); # $maxi=1000; +eps = ifdef ($tol, 0.000001); # $tol=0.000001; +is_write_Y = ifdef ($isY, 0); # $isY=0; +is_verbose = ifdef ($verb, 0); # $verb=0; +fmtCY = ifdef ($fmt, "text"); # $fmt="text"; +avg_sample_size_per_centroid = ifdef ($samp, 50); # $samp=50; + + +print ("BEGIN K-MEANS SCRIPT"); +print ("Reading X..."); + +# X : matrix of data points as rows +X = read (fileX); +num_records = nrow (X); +num_features = ncol (X); + +sumXsq = sum (X ^ 2); +# Remark - A useful rewrite: sum (A %*% B) = sum (t(colSums(A)) * rowSums(B)) + +# STEP 1: INITIALIZE CENTROIDS FOR ALL RUNS FROM DATA SAMPLES: + +print ("Taking data samples for initialization..."); + +[sample_maps, samples_vs_runs_map, sample_block_size] = + get_sample_maps (num_records, num_runs, num_centroids * avg_sample_size_per_centroid); + +is_row_in_samples = rowSums (sample_maps); +X_samples = sample_maps %*% X; +X_samples_sq_norms = rowSums (X_samples ^ 2); + +print ("Initializing the centroids for all runs..."); +All_Centroids = matrix (0, rows = (num_runs * num_centroids), cols = num_features); + +# We select centroids according to the k-Means++ heuristic applied to a sample of X +# Loop invariant: min_distances ~ sq.distances from X_sample rows to nearest centroids, +# with the out-of-range X_sample positions in min_distances set to 0.0 + +min_distances = is_row_in_samples; # 
Pick the 1-st centroids uniformly at random + +for (i in 1 : num_centroids) +{ + # "Matricize" and prefix-sum to compute the cumulative distribution function: + min_distances_matrix_form = + matrix (min_distances, rows = sample_block_size, cols = num_runs, byrow = FALSE); + cdf_min_distances = cumsum (min_distances_matrix_form); + + # Select the i-th centroid in each sample as a random sample row id with + # probability ~ min_distances: + random_row = Rand (rows = 1, cols = num_runs, min = 0.0, max = 1.0); + threshold_matrix = random_row * cdf_min_distances [sample_block_size, ]; + centroid_ids = t(colSums (ppred (cdf_min_distances, threshold_matrix, "<"))) + 1; + + # Place the selected centroids together, one per run, into a matrix: + centroid_placer = matrix (0, rows = num_runs, cols = (sample_block_size * num_runs)); + centroid_placer_raw = + table (seq (1, num_runs, 1), sample_block_size * seq (0, num_runs - 1, 1) + centroid_ids); + centroid_placer [, 1 : ncol (centroid_placer_raw)] = centroid_placer_raw; + centroids = centroid_placer %*% X_samples; + + # Place the selected centroids into their appropriate slots in All_Centroids: + centroid_placer = matrix (0, rows = nrow (All_Centroids), cols = num_runs); + centroid_placer_raw = + table (seq (i, num_centroids * (num_runs - 1) + i, num_centroids), seq (1, num_runs, 1)); + centroid_placer [1 : nrow (centroid_placer_raw), ] = centroid_placer_raw; + All_Centroids = All_Centroids + centroid_placer %*% centroids; + + # Update min_distances to preserve the loop invariant: + distances = X_samples_sq_norms + samples_vs_runs_map %*% rowSums (centroids ^ 2) + - 2 * rowSums (X_samples * (samples_vs_runs_map %*% centroids)); + if (i == 1) { + min_distances = is_row_in_samples * distances; + } else { + min_distances = min (min_distances, distances); +} } + +# STEP 2: PERFORM K-MEANS ITERATIONS FOR ALL RUNS: + +termination_code = matrix (0, rows = num_runs, cols = 1); +final_wcss = matrix (0, rows = num_runs, cols = 1); +num_iterations = matrix (0, rows = num_runs, cols = 1); + +print ("Performing k-means iterations for all runs..."); + +parfor (run_index in 1 : num_runs, check = 0) +{ + C = All_Centroids [(num_centroids * (run_index - 1) + 1) : (num_centroids * run_index), ]; + C_old = C; + iter_count = 0; + term_code = 0; + wcss = 0; + + while (term_code == 0) + { + # Compute Euclidean squared distances from records (X rows) to centroids (C rows) + # without the C-independent term, then take the minimum for each record + D = -2 * (X %*% t(C)) + t(rowSums (C ^ 2)); + minD = rowMins (D); + # Compute the current centroid-based within-cluster sum of squares (WCSS) + wcss_old = wcss; + wcss = sumXsq + sum (minD); + if (is_verbose == 1) { + if (iter_count == 0) { + print ("Run " + run_index + ", At Start-Up: Centroid WCSS = " + wcss); + } else { + print ("Run " + run_index + ", Iteration " + iter_count + ": Centroid WCSS = " + wcss + + "; Centroid change (avg.sq.dist.) 
= " + (sum ((C - C_old) ^ 2) / num_centroids)); + } } + # Check if convergence or maximum iteration has been reached + if (wcss_old - wcss < eps * wcss & iter_count > 0) { + term_code = 1; # Convergence is reached + } else { + if (iter_count >= max_iter) { + term_code = 2; # Maximum iteration is reached + } else { + iter_count = iter_count + 1; + # Find the closest centroid for each record + P = ppred (D, minD, "<="); + # If some records belong to multiple centroids, share them equally + P = P / rowSums (P); + # Compute the column normalization factor for P + P_denom = colSums (P); + if (sum (ppred (P_denom, 0.0, "<=")) > 0) { + term_code = 3; # There is a "runaway" centroid with 0.0 denominator + } else { + C_old = C; + # Compute new centroids as weighted averages over the records + C = (t(P) %*% X) / t(P_denom); + } } } } + print ("Run " + run_index + ", Iteration " + iter_count + ": Terminated with code = " + term_code + ", Centroid WCSS = " + wcss); + All_Centroids [(num_centroids * (run_index - 1) + 1) : (num_centroids * run_index), ] = C; + final_wcss [run_index, 1] = wcss; + termination_code [run_index, 1] = term_code; + num_iterations [run_index, 1] = iter_count; +} + +# STEP 3: SELECT THE RUN WITH BEST CENTROID-WCSS AND OUTPUT ITS CENTROIDS: + +termination_bitmap = matrix (0, rows = num_runs, cols = 3); +termination_bitmap_raw = table (seq (1, num_runs, 1), termination_code); +termination_bitmap [, 1 : ncol(termination_bitmap_raw)] = termination_bitmap_raw; +termination_stats = colSums (termination_bitmap); +print ("Number of successful runs = " + as.integer (castAsScalar (termination_stats [1, 1]))); +print ("Number of incomplete runs = " + as.integer (castAsScalar (termination_stats [1, 2]))); +print ("Number of failed runs (with lost centroids) = " + as.integer (castAsScalar (termination_stats [1, 3]))); + +num_successful_runs = castAsScalar (termination_stats [1, 1]); +if (num_successful_runs > 0) { + final_wcss_successful = final_wcss * termination_bitmap [, 1]; + worst_wcss = max (final_wcss_successful); + best_wcss = min (final_wcss_successful + (10 * worst_wcss + 10) * (1 - termination_bitmap [, 1])); + avg_wcss = sum (final_wcss_successful) / num_successful_runs; + best_index_vector = ppred (final_wcss_successful, best_wcss, "=="); + aggr_best_index_vector = cumsum (best_index_vector); + best_index = as.integer (sum (ppred (aggr_best_index_vector, 0, "==")) + 1); + print ("Successful runs: Best run is " + best_index + " with Centroid WCSS = " + best_wcss + + "; Avg WCSS = " + avg_wcss + "; Worst WCSS = " + worst_wcss); + C = All_Centroids [(num_centroids * (best_index - 1) + 1) : (num_centroids * best_index), ]; + print ("Writing out the best-WCSS centroids..."); + write (C, fileC, format=fmtCY); + if (is_write_Y == 1) { + print ("Writing out the best-WCSS cluster labels..."); + D = -2 * (X %*% t(C)) + t(rowSums (C ^ 2)); + P = ppred (D, rowMins (D), "<="); + aggr_P = t(cumsum (t(P))); + Y = rowSums (ppred (aggr_P, 0, "==")) + 1 + write (Y, fileY, format=fmtCY); + } + print ("DONE."); +} else { + stop ("No output is produced. 
Try increasing the number of iterations and/or runs."); +} + + + +get_sample_maps = function (int num_records, int num_samples, int approx_sample_size) + return (Matrix[double] sample_maps, Matrix[double] sample_col_map, int sample_block_size) +{ + if (approx_sample_size < num_records) { + # Input value "approx_sample_size" is the average sample size; increase it by ~10 std.dev's + # to get the sample block size (to allocate space): + sample_block_size = as.integer (approx_sample_size + round (10 * sqrt (approx_sample_size))); + num_rows = sample_block_size * num_samples; + + # Generate all samples in parallel by converting uniform random values into random + # integer skip-ahead intervals and prefix-summing them: + sample_rec_ids = Rand (rows = sample_block_size, cols = num_samples, min = 0.0, max = 1.0); + sample_rec_ids = round (log (sample_rec_ids) / log (1.0 - approx_sample_size / num_records) + 0.5); + # Prob [k-1 < log(uniform)/log(1-p) < k] = p*(1-p)^(k-1) = Prob [k-1 zeros before a one] + sample_rec_ids = cumsum (sample_rec_ids); # (skip to next one) --> (skip to i-th one) + + # Replace all sample record ids over "num_records" (i.e. out of range) by "num_records + 1": + is_sample_rec_id_within_range = ppred (sample_rec_ids, num_records, "<="); + sample_rec_ids = sample_rec_ids * is_sample_rec_id_within_range + + (num_records + 1) * (1 - is_sample_rec_id_within_range); + + # Rearrange all samples (and their out-of-range indicators) into one column-vector: + sample_rec_ids = + matrix (sample_rec_ids, rows = num_rows, cols = 1, byrow = FALSE); + is_row_in_samples = + matrix (is_sample_rec_id_within_range, rows = num_rows, cols = 1, byrow = FALSE); + + # Use contingency table to create the "sample_maps" matrix that is a vertical concatenation + # of 0-1-matrices, one per sample, each with 1s at (i, sample_record[i]) and 0s elsewhere: + sample_maps_raw = table (seq (1, num_rows), sample_rec_ids); + max_rec_id = ncol (sample_maps_raw); + if (max_rec_id >= num_records) { + sample_maps = sample_maps_raw [, 1 : num_records]; + } else { + sample_maps = matrix (0, rows = num_rows, cols = num_records); + sample_maps [, 1 : max_rec_id] = sample_maps_raw; + } + + # Create a 0-1-matrix that maps each sample column ID into all row positions of the + # corresponding sample; map out-of-sample-range positions to row id = num_rows + 1: + sample_positions = (num_rows + 1) - is_row_in_samples * seq (num_rows, 1, -1); + # Column ID positions = 1, 1, ..., 1, 2, 2, ..., 2, . . . , n_c, n_c, ..., n_c: + col_positions = round (0.5 + seq (0, num_rows - 1, 1) / sample_block_size); + sample_col_map = table (sample_positions, col_positions); + # Remove the out-of-sample-range positions by cutting off the last row: + sample_col_map = sample_col_map [1 : (num_rows), ]; + + } else { + one_per_record = matrix (1, rows = num_records, cols = 1); + sample_block_size = num_records; + sample_maps = matrix (0, rows = (num_records * num_samples), cols = num_records); + sample_col_map = matrix (0, rows = (num_records * num_samples), cols = num_samples); + for (i in 1:num_samples) { + sample_maps [(num_records * (i - 1) + 1) : (num_records * i), ] = diag (one_per_record); + sample_col_map [(num_records * (i - 1) + 1) : (num_records * i), i] = one_per_record; +} } } +
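For readers tracing the algorithm outside SystemML, below is a minimal NumPy sketch (mine, not part of this commit) of one k-means run as the DML above implements it: the distance matrix D drops the centroid-independent sum(X^2) term, records tied between several nearest centroids are shared equally, and a run aborts with a "runaway" code when a centroid loses all of its records. The helper names (kmeanspp_init, kmeans_run) are illustrative, and the sketch runs k-means++ on the full data, whereas the script applies it to per-run data samples.

import numpy as np

def kmeanspp_init(X, k, rng):
    # k-means++ seeding; the DML applies the same idea to per-run samples of X
    C = X[rng.integers(len(X))][None, :]
    for _ in range(1, k):
        dist = ((X[:, None, :] - C[None, :, :]) ** 2).sum(-1).min(axis=1)
        C = np.vstack([C, X[rng.choice(len(X), p=dist / dist.sum())]])
    return C

def kmeans_run(X, C, max_iter=1000, tol=1e-6):
    # One run of the Lloyd iteration from the script; returns (C, wcss, term_code)
    # term_code: 1 = converged, 2 = max_iter reached, 3 = "runaway" centroid
    sum_x_sq = np.sum(X ** 2)                         # C-independent part of the WCSS
    wcss, it = 0.0, 0
    while True:
        D = -2.0 * X @ C.T + np.sum(C ** 2, axis=1)   # -2 * X %*% t(C) + t(rowSums(C^2))
        min_d = D.min(axis=1)
        wcss_old, wcss = wcss, sum_x_sq + min_d.sum()
        if it > 0 and wcss_old - wcss < tol * wcss:
            return C, wcss, 1                         # convergence is reached
        if it >= max_iter:
            return C, wcss, 2                         # maximum iteration is reached
        it += 1
        P = (D <= min_d[:, None]).astype(float)       # closest centroid(s) per record
        P /= P.sum(axis=1, keepdims=True)             # share tied records equally
        p_denom = P.sum(axis=0)
        if np.any(p_denom <= 0.0):
            return C, wcss, 3                         # a centroid lost all its records
        C = (P.T @ X) / p_denom[:, None]              # weighted-average update

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 2))
C, wcss, code = kmeans_run(X, kmeanspp_init(X, 3, rng))
print("term_code =", code, "WCSS =", round(wcss, 3))

Dropping sum(X^2) from D is safe because that term does not depend on C, so the per-record minima are unaffected; the constant is added back only when reporting the WCSS, exactly as in the DML.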
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/scripts/algorithms/LinearRegCG.dml ---------------------------------------------------------------------- diff --git a/scripts/algorithms/LinearRegCG.dml b/scripts/algorithms/LinearRegCG.dml index a485d83..ebfa5a4 100644 --- a/scripts/algorithms/LinearRegCG.dml +++ b/scripts/algorithms/LinearRegCG.dml @@ -1,286 +1,286 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -# -# THIS SCRIPT SOLVES LINEAR REGRESSION USING THE CONJUGATE GRADIENT ALGORITHM -# -# INPUT PARAMETERS: -# -------------------------------------------------------------------------------------------- -# NAME TYPE DEFAULT MEANING -# -------------------------------------------------------------------------------------------- -# X String --- Location (on HDFS) to read the matrix X of feature vectors -# Y String --- Location (on HDFS) to read the 1-column matrix Y of response values -# B String --- Location to store estimated regression parameters (the betas) -# O String " " Location to write the printed statistics; by default it is standard output -# Log String " " Location to write per-iteration variables for log/debugging purposes -# icpt Int 0 Intercept presence, shifting and rescaling the columns of X: -# 0 = no intercept, no shifting, no rescaling; -# 1 = add intercept, but neither shift nor rescale X; -# 2 = add intercept, shift & rescale X columns to mean = 0, variance = 1 -# reg Double 0.000001 Regularization constant (lambda) for L2-regularization; set to nonzero -# for highly dependent/sparse/numerous features -# tol Double 0.000001 Tolerance (epsilon); conjugate gradient procedure terminates early if -# L2 norm of the beta-residual is less than tolerance * its initial norm -# maxi Int 0 Maximum number of conjugate gradient iterations, 0 = no maximum -# fmt String "text" Matrix output format for B (the betas) only, usually "text" or "csv" -# -------------------------------------------------------------------------------------------- -# OUTPUT: Matrix of regression parameters (the betas); its size depends on the icpt input value: -# OUTPUT SIZE: OUTPUT CONTENTS: HOW TO PREDICT Y FROM X AND B: -# icpt=0: ncol(X) x 1 Betas for X only Y ~ X %*% B[1:ncol(X), 1], or just X %*% B -# icpt=1: ncol(X)+1 x 1 Betas for X and intercept Y ~ X %*% B[1:ncol(X), 1] + B[ncol(X)+1, 1] -# icpt=2: ncol(X)+1 x 2 Col.1: betas for X & intercept Y ~ X %*% B[1:ncol(X), 1] + B[ncol(X)+1, 1] -# Col.2: betas for shifted/rescaled X and intercept -# -# In addition, some regression statistics are provided in CSV format, one comma-separated -# name-value pair per line, as follows: -# -# NAME MEANING -# 
------------------------------------------------------------------------------------- -# AVG_TOT_Y Average of the response value Y -# STDEV_TOT_Y Standard Deviation of the response value Y -# AVG_RES_Y Average of the residual Y - pred(Y|X), i.e. residual bias -# STDEV_RES_Y Standard Deviation of the residual Y - pred(Y|X) -# DISPERSION GLM-style dispersion, i.e. residual sum of squares / # deg. fr. -# PLAIN_R2 Plain R^2 of residual with bias included vs. total average -# ADJUSTED_R2 Adjusted R^2 of residual with bias included vs. total average -# PLAIN_R2_NOBIAS Plain R^2 of residual with bias subtracted vs. total average -# ADJUSTED_R2_NOBIAS Adjusted R^2 of residual with bias subtracted vs. total average -# PLAIN_R2_VS_0 * Plain R^2 of residual with bias included vs. zero constant -# ADJUSTED_R2_VS_0 * Adjusted R^2 of residual with bias included vs. zero constant -# ------------------------------------------------------------------------------------- -# * The last two statistics are only printed if there is no intercept (icpt=0) -# -# The Log file, when requested, contains the following per-iteration variables in CSV -# format, each line containing triple (NAME, ITERATION, VALUE) with ITERATION = 0 for -# initial values: -# -# NAME MEANING -# ------------------------------------------------------------------------------------- -# CG_RESIDUAL_NORM L2-norm of Conj.Grad.residual, which is A %*% beta - t(X) %*% y -# where A = t(X) %*% X + diag (lambda), or a similar quantity -# CG_RESIDUAL_RATIO Ratio of current L2-norm of Conj.Grad.residual over the initial -# ------------------------------------------------------------------------------------- -# -# HOW TO INVOKE THIS SCRIPT - EXAMPLE: -# hadoop jar SystemML.jar -f LinearRegCG.dml -nvargs X=INPUT_DIR/X Y=INPUT_DIR/Y B=OUTPUT_DIR/B -# O=OUTPUT_DIR/Out icpt=2 reg=1.0 tol=0.001 maxi=100 fmt=csv Log=OUTPUT_DIR/log - -fileX = $X; -fileY = $Y; -fileB = $B; -fileO = ifdef ($O, " "); -fileLog = ifdef ($Log, " "); -fmtB = ifdef ($fmt, "text"); - -intercept_status = ifdef ($icpt, 0); # $icpt=0; -tolerance = ifdef ($tol, 0.000001); # $tol=0.000001; -max_iteration = ifdef ($maxi, 0); # $maxi=0; -regularization = ifdef ($reg, 0.000001); # $reg=0.000001; - -print ("BEGIN LINEAR REGRESSION SCRIPT"); -print ("Reading X and Y..."); -X = read (fileX); -y = read (fileY); - -n = nrow (X); -m = ncol (X); -ones_n = matrix (1, rows = n, cols = 1); -zero_cell = matrix (0, rows = 1, cols = 1); - -# Introduce the intercept, shift and rescale the columns of X if needed - -m_ext = m; -if (intercept_status == 1 | intercept_status == 2) # add the intercept column -{ - X = append (X, ones_n); - m_ext = ncol (X); -} - -scale_lambda = matrix (1, rows = m_ext, cols = 1); -if (intercept_status == 1 | intercept_status == 2) -{ - scale_lambda [m_ext, 1] = 0; -} - -if (intercept_status == 2) # scale-&-shift X columns to mean 0, variance 1 -{ # Important assumption: X [, m_ext] = ones_n - avg_X_cols = t(colSums(X)) / n; - var_X_cols = (t(colSums (X ^ 2)) - n * (avg_X_cols ^ 2)) / (n - 1); - is_unsafe = ppred (var_X_cols, 0.0, "<="); - scale_X = 1.0 / sqrt (var_X_cols * (1 - is_unsafe) + is_unsafe); - scale_X [m_ext, 1] = 1; - shift_X = - avg_X_cols * scale_X; - shift_X [m_ext, 1] = 0; -} else { - scale_X = matrix (1, rows = m_ext, cols = 1); - shift_X = matrix (0, rows = m_ext, cols = 1); -} - -# Henceforth, if intercept_status == 2, we use "X %*% (SHIFT/SCALE TRANSFORM)" -# instead of "X". 
However, in order to preserve the sparsity of X, -# we apply the transform associatively to some other part of the expression -# in which it occurs. To avoid materializing a large matrix, we rewrite it: -# -# ssX_A = (SHIFT/SCALE TRANSFORM) %*% A --- is rewritten as: -# ssX_A = diag (scale_X) %*% A; -# ssX_A [m_ext, ] = ssX_A [m_ext, ] + t(shift_X) %*% A; -# -# tssX_A = t(SHIFT/SCALE TRANSFORM) %*% A --- is rewritten as: -# tssX_A = diag (scale_X) %*% A + shift_X %*% A [m_ext, ]; - -lambda = scale_lambda * regularization; -beta_unscaled = matrix (0, rows = m_ext, cols = 1); - -if (max_iteration == 0) { - max_iteration = m_ext; -} -i = 0; - -# BEGIN THE CONJUGATE GRADIENT ALGORITHM -print ("Running the CG algorithm..."); - -r = - t(X) %*% y; - -if (intercept_status == 2) { - r = scale_X * r + shift_X %*% r [m_ext, ]; -} - -p = - r; -norm_r2 = sum (r ^ 2); -norm_r2_initial = norm_r2; -norm_r2_target = norm_r2_initial * tolerance ^ 2; -print ("||r|| initial value = " + sqrt (norm_r2_initial) + ", target value = " + sqrt (norm_r2_target)); -log_str = "CG_RESIDUAL_NORM,0," + sqrt (norm_r2_initial); -log_str = append (log_str, "CG_RESIDUAL_RATIO,0,1.0"); - -while (i < max_iteration & norm_r2 > norm_r2_target) -{ - if (intercept_status == 2) { - ssX_p = scale_X * p; - ssX_p [m_ext, ] = ssX_p [m_ext, ] + t(shift_X) %*% p; - } else { - ssX_p = p; - } - - q = t(X) %*% (X %*% ssX_p); - - if (intercept_status == 2) { - q = scale_X * q + shift_X %*% q [m_ext, ]; - } - - q = q + lambda * p; - a = norm_r2 / sum (p * q); - beta_unscaled = beta_unscaled + a * p; - r = r + a * q; - old_norm_r2 = norm_r2; - norm_r2 = sum (r ^ 2); - p = -r + (norm_r2 / old_norm_r2) * p; - i = i + 1; - print ("Iteration " + i + ": ||r|| / ||r init|| = " + sqrt (norm_r2 / norm_r2_initial)); - log_str = append (log_str, "CG_RESIDUAL_NORM," + i + "," + sqrt (norm_r2)); - log_str = append (log_str, "CG_RESIDUAL_RATIO," + i + "," + sqrt (norm_r2 / norm_r2_initial)); -} - -if (i >= max_iteration) { - print ("Warning: the maximum number of iterations has been reached."); -} -print ("The CG algorithm is done."); -# END THE CONJUGATE GRADIENT ALGORITHM - -if (intercept_status == 2) { - beta = scale_X * beta_unscaled; - beta [m_ext, ] = beta [m_ext, ] + t(shift_X) %*% beta_unscaled; -} else { - beta = beta_unscaled; -} - -print ("Computing the statistics..."); - -avg_tot = sum (y) / n; -ss_tot = sum (y ^ 2); -ss_avg_tot = ss_tot - n * avg_tot ^ 2; -var_tot = ss_avg_tot / (n - 1); -y_residual = y - X %*% beta; -avg_res = sum (y_residual) / n; -ss_res = sum (y_residual ^ 2); -ss_avg_res = ss_res - n * avg_res ^ 2; - -plain_R2 = 1 - ss_res / ss_avg_tot; -if (n > m_ext) { - dispersion = ss_res / (n - m_ext); - adjusted_R2 = 1 - dispersion / (ss_avg_tot / (n - 1)); -} else { - dispersion = 0.0 / 0.0; - adjusted_R2 = 0.0 / 0.0; -} - -plain_R2_nobias = 1 - ss_avg_res / ss_avg_tot; -deg_freedom = n - m - 1; -if (deg_freedom > 0) { - var_res = ss_avg_res / deg_freedom; - adjusted_R2_nobias = 1 - var_res / (ss_avg_tot / (n - 1)); -} else { - var_res = 0.0 / 0.0; - adjusted_R2_nobias = 0.0 / 0.0; - print ("Warning: zero or negative number of degrees of freedom."); -} - -plain_R2_vs_0 = 1 - ss_res / ss_tot; -if (n > m) { - adjusted_R2_vs_0 = 1 - (ss_res / (n - m)) / (ss_tot / n); -} else { - adjusted_R2_vs_0 = 0.0 / 0.0; -} - -str = "AVG_TOT_Y," + avg_tot; # Average of the response value Y -str = append (str, "STDEV_TOT_Y," + sqrt (var_tot)); # Standard Deviation of the response value Y -str = append (str, "AVG_RES_Y," + avg_res); # Average of the 
residual Y - pred(Y|X), i.e. residual bias -str = append (str, "STDEV_RES_Y," + sqrt (var_res)); # Standard Deviation of the residual Y - pred(Y|X) -str = append (str, "DISPERSION," + dispersion); # GLM-style dispersion, i.e. residual sum of squares / # d.f. -str = append (str, "PLAIN_R2," + plain_R2); # Plain R^2 of residual with bias included vs. total average -str = append (str, "ADJUSTED_R2," + adjusted_R2); # Adjusted R^2 of residual with bias included vs. total average -str = append (str, "PLAIN_R2_NOBIAS," + plain_R2_nobias); # Plain R^2 of residual with bias subtracted vs. total average -str = append (str, "ADJUSTED_R2_NOBIAS," + adjusted_R2_nobias); # Adjusted R^2 of residual with bias subtracted vs. total average -if (intercept_status == 0) { - str = append (str, "PLAIN_R2_VS_0," + plain_R2_vs_0); # Plain R^2 of residual with bias included vs. zero constant - str = append (str, "ADJUSTED_R2_VS_0," + adjusted_R2_vs_0); # Adjusted R^2 of residual with bias included vs. zero constant -} - -if (fileO != " ") { - write (str, fileO); -} else { - print (str); -} - -# Prepare the output matrix -print ("Writing the output matrix..."); - -if (intercept_status == 2) { - beta_out = append (beta, beta_unscaled); -} else { - beta_out = beta; -} -write (beta_out, fileB, format=fmtB); - -if (fileLog != " ") { - write (log_str, fileLog); -} -print ("END LINEAR REGRESSION SCRIPT"); +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +#------------------------------------------------------------- + +# +# THIS SCRIPT SOLVES LINEAR REGRESSION USING THE CONJUGATE GRADIENT ALGORITHM +# +# INPUT PARAMETERS: +# -------------------------------------------------------------------------------------------- +# NAME TYPE DEFAULT MEANING +# -------------------------------------------------------------------------------------------- +# X String --- Location (on HDFS) to read the matrix X of feature vectors +# Y String --- Location (on HDFS) to read the 1-column matrix Y of response values +# B String --- Location to store estimated regression parameters (the betas) +# O String " " Location to write the printed statistics; by default it is standard output +# Log String " " Location to write per-iteration variables for log/debugging purposes +# icpt Int 0 Intercept presence, shifting and rescaling the columns of X: +# 0 = no intercept, no shifting, no rescaling; +# 1 = add intercept, but neither shift nor rescale X; +# 2 = add intercept, shift & rescale X columns to mean = 0, variance = 1 +# reg Double 0.000001 Regularization constant (lambda) for L2-regularization; set to nonzero +# for highly dependent/sparse/numerous features +# tol Double 0.000001 Tolerance (epsilon); conjugate gradient procedure terminates early if +# L2 norm of the beta-residual is less than tolerance * its initial norm +# maxi Int 0 Maximum number of conjugate gradient iterations, 0 = no maximum +# fmt String "text" Matrix output format for B (the betas) only, usually "text" or "csv" +# -------------------------------------------------------------------------------------------- +# OUTPUT: Matrix of regression parameters (the betas); its size depends on the icpt input value: +# OUTPUT SIZE: OUTPUT CONTENTS: HOW TO PREDICT Y FROM X AND B: +# icpt=0: ncol(X) x 1 Betas for X only Y ~ X %*% B[1:ncol(X), 1], or just X %*% B +# icpt=1: ncol(X)+1 x 1 Betas for X and intercept Y ~ X %*% B[1:ncol(X), 1] + B[ncol(X)+1, 1] +# icpt=2: ncol(X)+1 x 2 Col.1: betas for X & intercept Y ~ X %*% B[1:ncol(X), 1] + B[ncol(X)+1, 1] +# Col.2: betas for shifted/rescaled X and intercept +# +# In addition, some regression statistics are provided in CSV format, one comma-separated +# name-value pair per line, as follows: +# +# NAME MEANING +# ------------------------------------------------------------------------------------- +# AVG_TOT_Y Average of the response value Y +# STDEV_TOT_Y Standard Deviation of the response value Y +# AVG_RES_Y Average of the residual Y - pred(Y|X), i.e. residual bias +# STDEV_RES_Y Standard Deviation of the residual Y - pred(Y|X) +# DISPERSION GLM-style dispersion, i.e. residual sum of squares / # deg. fr. +# PLAIN_R2 Plain R^2 of residual with bias included vs. total average +# ADJUSTED_R2 Adjusted R^2 of residual with bias included vs. total average +# PLAIN_R2_NOBIAS Plain R^2 of residual with bias subtracted vs. total average +# ADJUSTED_R2_NOBIAS Adjusted R^2 of residual with bias subtracted vs. total average +# PLAIN_R2_VS_0 * Plain R^2 of residual with bias included vs. zero constant +# ADJUSTED_R2_VS_0 * Adjusted R^2 of residual with bias included vs. 
zero constant +# ------------------------------------------------------------------------------------- +# * The last two statistics are only printed if there is no intercept (icpt=0) +# +# The Log file, when requested, contains the following per-iteration variables in CSV +# format, each line containing triple (NAME, ITERATION, VALUE) with ITERATION = 0 for +# initial values: +# +# NAME MEANING +# ------------------------------------------------------------------------------------- +# CG_RESIDUAL_NORM L2-norm of Conj.Grad.residual, which is A %*% beta - t(X) %*% y +# where A = t(X) %*% X + diag (lambda), or a similar quantity +# CG_RESIDUAL_RATIO Ratio of current L2-norm of Conj.Grad.residual over the initial +# ------------------------------------------------------------------------------------- +# +# HOW TO INVOKE THIS SCRIPT - EXAMPLE: +# hadoop jar SystemML.jar -f LinearRegCG.dml -nvargs X=INPUT_DIR/X Y=INPUT_DIR/Y B=OUTPUT_DIR/B +# O=OUTPUT_DIR/Out icpt=2 reg=1.0 tol=0.001 maxi=100 fmt=csv Log=OUTPUT_DIR/log + +fileX = $X; +fileY = $Y; +fileB = $B; +fileO = ifdef ($O, " "); +fileLog = ifdef ($Log, " "); +fmtB = ifdef ($fmt, "text"); + +intercept_status = ifdef ($icpt, 0); # $icpt=0; +tolerance = ifdef ($tol, 0.000001); # $tol=0.000001; +max_iteration = ifdef ($maxi, 0); # $maxi=0; +regularization = ifdef ($reg, 0.000001); # $reg=0.000001; + +print ("BEGIN LINEAR REGRESSION SCRIPT"); +print ("Reading X and Y..."); +X = read (fileX); +y = read (fileY); + +n = nrow (X); +m = ncol (X); +ones_n = matrix (1, rows = n, cols = 1); +zero_cell = matrix (0, rows = 1, cols = 1); + +# Introduce the intercept, shift and rescale the columns of X if needed + +m_ext = m; +if (intercept_status == 1 | intercept_status == 2) # add the intercept column +{ + X = append (X, ones_n); + m_ext = ncol (X); +} + +scale_lambda = matrix (1, rows = m_ext, cols = 1); +if (intercept_status == 1 | intercept_status == 2) +{ + scale_lambda [m_ext, 1] = 0; +} + +if (intercept_status == 2) # scale-&-shift X columns to mean 0, variance 1 +{ # Important assumption: X [, m_ext] = ones_n + avg_X_cols = t(colSums(X)) / n; + var_X_cols = (t(colSums (X ^ 2)) - n * (avg_X_cols ^ 2)) / (n - 1); + is_unsafe = ppred (var_X_cols, 0.0, "<="); + scale_X = 1.0 / sqrt (var_X_cols * (1 - is_unsafe) + is_unsafe); + scale_X [m_ext, 1] = 1; + shift_X = - avg_X_cols * scale_X; + shift_X [m_ext, 1] = 0; +} else { + scale_X = matrix (1, rows = m_ext, cols = 1); + shift_X = matrix (0, rows = m_ext, cols = 1); +} + +# Henceforth, if intercept_status == 2, we use "X %*% (SHIFT/SCALE TRANSFORM)" +# instead of "X". However, in order to preserve the sparsity of X, +# we apply the transform associatively to some other part of the expression +# in which it occurs. 
To avoid materializing a large matrix, we rewrite it: +# +# ssX_A = (SHIFT/SCALE TRANSFORM) %*% A --- is rewritten as: +# ssX_A = diag (scale_X) %*% A; +# ssX_A [m_ext, ] = ssX_A [m_ext, ] + t(shift_X) %*% A; +# +# tssX_A = t(SHIFT/SCALE TRANSFORM) %*% A --- is rewritten as: +# tssX_A = diag (scale_X) %*% A + shift_X %*% A [m_ext, ]; + +lambda = scale_lambda * regularization; +beta_unscaled = matrix (0, rows = m_ext, cols = 1); + +if (max_iteration == 0) { + max_iteration = m_ext; +} +i = 0; + +# BEGIN THE CONJUGATE GRADIENT ALGORITHM +print ("Running the CG algorithm..."); + +r = - t(X) %*% y; + +if (intercept_status == 2) { + r = scale_X * r + shift_X %*% r [m_ext, ]; +} + +p = - r; +norm_r2 = sum (r ^ 2); +norm_r2_initial = norm_r2; +norm_r2_target = norm_r2_initial * tolerance ^ 2; +print ("||r|| initial value = " + sqrt (norm_r2_initial) + ", target value = " + sqrt (norm_r2_target)); +log_str = "CG_RESIDUAL_NORM,0," + sqrt (norm_r2_initial); +log_str = append (log_str, "CG_RESIDUAL_RATIO,0,1.0"); + +while (i < max_iteration & norm_r2 > norm_r2_target) +{ + if (intercept_status == 2) { + ssX_p = scale_X * p; + ssX_p [m_ext, ] = ssX_p [m_ext, ] + t(shift_X) %*% p; + } else { + ssX_p = p; + } + + q = t(X) %*% (X %*% ssX_p); + + if (intercept_status == 2) { + q = scale_X * q + shift_X %*% q [m_ext, ]; + } + + q = q + lambda * p; + a = norm_r2 / sum (p * q); + beta_unscaled = beta_unscaled + a * p; + r = r + a * q; + old_norm_r2 = norm_r2; + norm_r2 = sum (r ^ 2); + p = -r + (norm_r2 / old_norm_r2) * p; + i = i + 1; + print ("Iteration " + i + ": ||r|| / ||r init|| = " + sqrt (norm_r2 / norm_r2_initial)); + log_str = append (log_str, "CG_RESIDUAL_NORM," + i + "," + sqrt (norm_r2)); + log_str = append (log_str, "CG_RESIDUAL_RATIO," + i + "," + sqrt (norm_r2 / norm_r2_initial)); +} + +if (i >= max_iteration) { + print ("Warning: the maximum number of iterations has been reached."); +} +print ("The CG algorithm is done."); +# END THE CONJUGATE GRADIENT ALGORITHM + +if (intercept_status == 2) { + beta = scale_X * beta_unscaled; + beta [m_ext, ] = beta [m_ext, ] + t(shift_X) %*% beta_unscaled; +} else { + beta = beta_unscaled; +} + +print ("Computing the statistics..."); + +avg_tot = sum (y) / n; +ss_tot = sum (y ^ 2); +ss_avg_tot = ss_tot - n * avg_tot ^ 2; +var_tot = ss_avg_tot / (n - 1); +y_residual = y - X %*% beta; +avg_res = sum (y_residual) / n; +ss_res = sum (y_residual ^ 2); +ss_avg_res = ss_res - n * avg_res ^ 2; + +plain_R2 = 1 - ss_res / ss_avg_tot; +if (n > m_ext) { + dispersion = ss_res / (n - m_ext); + adjusted_R2 = 1 - dispersion / (ss_avg_tot / (n - 1)); +} else { + dispersion = 0.0 / 0.0; + adjusted_R2 = 0.0 / 0.0; +} + +plain_R2_nobias = 1 - ss_avg_res / ss_avg_tot; +deg_freedom = n - m - 1; +if (deg_freedom > 0) { + var_res = ss_avg_res / deg_freedom; + adjusted_R2_nobias = 1 - var_res / (ss_avg_tot / (n - 1)); +} else { + var_res = 0.0 / 0.0; + adjusted_R2_nobias = 0.0 / 0.0; + print ("Warning: zero or negative number of degrees of freedom."); +} + +plain_R2_vs_0 = 1 - ss_res / ss_tot; +if (n > m) { + adjusted_R2_vs_0 = 1 - (ss_res / (n - m)) / (ss_tot / n); +} else { + adjusted_R2_vs_0 = 0.0 / 0.0; +} + +str = "AVG_TOT_Y," + avg_tot; # Average of the response value Y +str = append (str, "STDEV_TOT_Y," + sqrt (var_tot)); # Standard Deviation of the response value Y +str = append (str, "AVG_RES_Y," + avg_res); # Average of the residual Y - pred(Y|X), i.e. 
residual bias +str = append (str, "STDEV_RES_Y," + sqrt (var_res)); # Standard Deviation of the residual Y - pred(Y|X) +str = append (str, "DISPERSION," + dispersion); # GLM-style dispersion, i.e. residual sum of squares / # d.f. +str = append (str, "PLAIN_R2," + plain_R2); # Plain R^2 of residual with bias included vs. total average +str = append (str, "ADJUSTED_R2," + adjusted_R2); # Adjusted R^2 of residual with bias included vs. total average +str = append (str, "PLAIN_R2_NOBIAS," + plain_R2_nobias); # Plain R^2 of residual with bias subtracted vs. total average +str = append (str, "ADJUSTED_R2_NOBIAS," + adjusted_R2_nobias); # Adjusted R^2 of residual with bias subtracted vs. total average +if (intercept_status == 0) { + str = append (str, "PLAIN_R2_VS_0," + plain_R2_vs_0); # Plain R^2 of residual with bias included vs. zero constant + str = append (str, "ADJUSTED_R2_VS_0," + adjusted_R2_vs_0); # Adjusted R^2 of residual with bias included vs. zero constant +} + +if (fileO != " ") { + write (str, fileO); +} else { + print (str); +} + +# Prepare the output matrix +print ("Writing the output matrix..."); + +if (intercept_status == 2) { + beta_out = append (beta, beta_unscaled); +} else { + beta_out = beta; +} +write (beta_out, fileB, format=fmtB); + +if (fileLog != " ") { + write (log_str, fileLog); +} +print ("END LINEAR REGRESSION SCRIPT"); http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/scripts/algorithms/LinearRegDS.dml ---------------------------------------------------------------------- diff --git a/scripts/algorithms/LinearRegDS.dml b/scripts/algorithms/LinearRegDS.dml index 501acc8..0ec663a 100644 --- a/scripts/algorithms/LinearRegDS.dml +++ b/scripts/algorithms/LinearRegDS.dml @@ -1,224 +1,224 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# -#------------------------------------------------------------- - -# -# THIS SCRIPT SOLVES LINEAR REGRESSION USING A DIRECT SOLVER FOR (X^T X + diag(lambda)) beta = X^T y -# -# INPUT PARAMETERS: -# -------------------------------------------------------------------------------------------- -# NAME TYPE DEFAULT MEANING -# -------------------------------------------------------------------------------------------- -# X String --- Location (on HDFS) to read the matrix X of feature vectors -# Y String --- Location (on HDFS) to read the 1-column matrix Y of response values -# B String --- Location to store estimated regression parameters (the betas) -# O String " " Location to write the printed statistics; by default it is standard output -# icpt Int 0 Intercept presence, shifting and rescaling the columns of X: -# 0 = no intercept, no shifting, no rescaling; -# 1 = add intercept, but neither shift nor rescale X; -# 2 = add intercept, shift & rescale X columns to mean = 0, variance = 1 -# reg Double 0.000001 Regularization constant (lambda) for L2-regularization; set to nonzero -# for highly dependent/sparse/numerous features -# fmt String "text" Matrix output format for B (the betas) only, usually "text" or "csv" -# -------------------------------------------------------------------------------------------- -# OUTPUT: Matrix of regression parameters (the betas); its size depends on the icpt input value: -# OUTPUT SIZE: OUTPUT CONTENTS: HOW TO PREDICT Y FROM X AND B: -# icpt=0: ncol(X) x 1 Betas for X only Y ~ X %*% B[1:ncol(X), 1], or just X %*% B -# icpt=1: ncol(X)+1 x 1 Betas for X and intercept Y ~ X %*% B[1:ncol(X), 1] + B[ncol(X)+1, 1] -# icpt=2: ncol(X)+1 x 2 Col.1: betas for X & intercept Y ~ X %*% B[1:ncol(X), 1] + B[ncol(X)+1, 1] -# Col.2: betas for shifted/rescaled X and intercept -# -# In addition, some regression statistics are provided in CSV format, one comma-separated -# name-value pair per line, as follows: -# -# NAME MEANING -# ------------------------------------------------------------------------------------- -# AVG_TOT_Y Average of the response value Y -# STDEV_TOT_Y Standard Deviation of the response value Y -# AVG_RES_Y Average of the residual Y - pred(Y|X), i.e. residual bias -# STDEV_RES_Y Standard Deviation of the residual Y - pred(Y|X) -# DISPERSION GLM-style dispersion, i.e. residual sum of squares / # deg. fr. -# PLAIN_R2 Plain R^2 of residual with bias included vs. total average -# ADJUSTED_R2 Adjusted R^2 of residual with bias included vs. total average -# PLAIN_R2_NOBIAS Plain R^2 of residual with bias subtracted vs. total average -# ADJUSTED_R2_NOBIAS Adjusted R^2 of residual with bias subtracted vs. total average -# PLAIN_R2_VS_0 * Plain R^2 of residual with bias included vs. zero constant -# ADJUSTED_R2_VS_0 * Adjusted R^2 of residual with bias included vs. 
zero constant -# ------------------------------------------------------------------------------------- -# * The last two statistics are only printed if there is no intercept (icpt=0) -# -# HOW TO INVOKE THIS SCRIPT - EXAMPLE: -# hadoop jar SystemML.jar -f LinearRegDS.dml -nvargs X=INPUT_DIR/X Y=INPUT_DIR/Y B=OUTPUT_DIR/B -# O=OUTPUT_DIR/Out icpt=2 reg=1.0 fmt=csv - -fileX = $X; -fileY = $Y; -fileB = $B; -fileO = ifdef ($O, " "); -fmtB = ifdef ($fmt, "text"); - -intercept_status = ifdef ($icpt, 0); # $icpt=0; -regularization = ifdef ($reg, 0.000001); # $reg=0.000001; - -print ("BEGIN LINEAR REGRESSION SCRIPT"); -print ("Reading X and Y..."); -X = read (fileX); -y = read (fileY); - -n = nrow (X); -m = ncol (X); -ones_n = matrix (1, rows = n, cols = 1); -zero_cell = matrix (0, rows = 1, cols = 1); - -# Introduce the intercept, shift and rescale the columns of X if needed - -m_ext = m; -if (intercept_status == 1 | intercept_status == 2) # add the intercept column -{ - X = append (X, ones_n); - m_ext = ncol (X); -} - -scale_lambda = matrix (1, rows = m_ext, cols = 1); -if (intercept_status == 1 | intercept_status == 2) -{ - scale_lambda [m_ext, 1] = 0; -} - -if (intercept_status == 2) # scale-&-shift X columns to mean 0, variance 1 -{ # Important assumption: X [, m_ext] = ones_n - avg_X_cols = t(colSums(X)) / n; - var_X_cols = (t(colSums (X ^ 2)) - n * (avg_X_cols ^ 2)) / (n - 1); - is_unsafe = ppred (var_X_cols, 0.0, "<="); - scale_X = 1.0 / sqrt (var_X_cols * (1 - is_unsafe) + is_unsafe); - scale_X [m_ext, 1] = 1; - shift_X = - avg_X_cols * scale_X; - shift_X [m_ext, 1] = 0; -} else { - scale_X = matrix (1, rows = m_ext, cols = 1); - shift_X = matrix (0, rows = m_ext, cols = 1); -} - -# Henceforth, if intercept_status == 2, we use "X %*% (SHIFT/SCALE TRANSFORM)" -# instead of "X". However, in order to preserve the sparsity of X, -# we apply the transform associatively to some other part of the expression -# in which it occurs. 
To avoid materializing a large matrix, we rewrite it: -# -# ssX_A = (SHIFT/SCALE TRANSFORM) %*% A --- is rewritten as: -# ssX_A = diag (scale_X) %*% A; -# ssX_A [m_ext, ] = ssX_A [m_ext, ] + t(shift_X) %*% A; -# -# tssX_A = t(SHIFT/SCALE TRANSFORM) %*% A --- is rewritten as: -# tssX_A = diag (scale_X) %*% A + shift_X %*% A [m_ext, ]; - -lambda = scale_lambda * regularization; - -# BEGIN THE DIRECT SOLVE ALGORITHM (EXTERNAL CALL) - -A = t(X) %*% X; -b = t(X) %*% y; -if (intercept_status == 2) { - A = t(diag (scale_X) %*% A + shift_X %*% A [m_ext, ]); - A = diag (scale_X) %*% A + shift_X %*% A [m_ext, ]; - b = diag (scale_X) %*% b + shift_X %*% b [m_ext, ]; -} -A = A + diag (lambda); - -print ("Calling the Direct Solver..."); - -beta_unscaled = solve (A, b); - -# END THE DIRECT SOLVE ALGORITHM - -if (intercept_status == 2) { - beta = scale_X * beta_unscaled; - beta [m_ext, ] = beta [m_ext, ] + t(shift_X) %*% beta_unscaled; -} else { - beta = beta_unscaled; -} - -print ("Computing the statistics..."); - -avg_tot = sum (y) / n; -ss_tot = sum (y ^ 2); -ss_avg_tot = ss_tot - n * avg_tot ^ 2; -var_tot = ss_avg_tot / (n - 1); -y_residual = y - X %*% beta; -avg_res = sum (y_residual) / n; -ss_res = sum (y_residual ^ 2); -ss_avg_res = ss_res - n * avg_res ^ 2; - -plain_R2 = 1 - ss_res / ss_avg_tot; -if (n > m_ext) { - dispersion = ss_res / (n - m_ext); - adjusted_R2 = 1 - dispersion / (ss_avg_tot / (n - 1)); -} else { - dispersion = 0.0 / 0.0; - adjusted_R2 = 0.0 / 0.0; -} - -plain_R2_nobias = 1 - ss_avg_res / ss_avg_tot; -deg_freedom = n - m - 1; -if (deg_freedom > 0) { - var_res = ss_avg_res / deg_freedom; - adjusted_R2_nobias = 1 - var_res / (ss_avg_tot / (n - 1)); -} else { - var_res = 0.0 / 0.0; - adjusted_R2_nobias = 0.0 / 0.0; - print ("Warning: zero or negative number of degrees of freedom."); -} - -plain_R2_vs_0 = 1 - ss_res / ss_tot; -if (n > m) { - adjusted_R2_vs_0 = 1 - (ss_res / (n - m)) / (ss_tot / n); -} else { - adjusted_R2_vs_0 = 0.0 / 0.0; -} - -str = "AVG_TOT_Y," + avg_tot; # Average of the response value Y -str = append (str, "STDEV_TOT_Y," + sqrt (var_tot)); # Standard Deviation of the response value Y -str = append (str, "AVG_RES_Y," + avg_res); # Average of the residual Y - pred(Y|X), i.e. residual bias -str = append (str, "STDEV_RES_Y," + sqrt (var_res)); # Standard Deviation of the residual Y - pred(Y|X) -str = append (str, "DISPERSION," + dispersion); # GLM-style dispersion, i.e. residual sum of squares / # d.f. -str = append (str, "PLAIN_R2," + plain_R2); # Plain R^2 of residual with bias included vs. total average -str = append (str, "ADJUSTED_R2," + adjusted_R2); # Adjusted R^2 of residual with bias included vs. total average -str = append (str, "PLAIN_R2_NOBIAS," + plain_R2_nobias); # Plain R^2 of residual with bias subtracted vs. total average -str = append (str, "ADJUSTED_R2_NOBIAS," + adjusted_R2_nobias); # Adjusted R^2 of residual with bias subtracted vs. total average -if (intercept_status == 0) { - str = append (str, "PLAIN_R2_VS_0," + plain_R2_vs_0); # Plain R^2 of residual with bias included vs. zero constant - str = append (str, "ADJUSTED_R2_VS_0," + adjusted_R2_vs_0); # Adjusted R^2 of residual with bias included vs. 
-}
-
-if (fileO != " ") {
-    write (str, fileO);
-} else {
-    print (str);
-}
-
-# Prepare the output matrix
-print ("Writing the output matrix...");
-
-if (intercept_status == 2) {
-    beta_out = append (beta, beta_unscaled);
-} else {
-    beta_out = beta;
-}
-write (beta_out, fileB, format=fmtB);
-print ("END LINEAR REGRESSION SCRIPT");
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+#
+# THIS SCRIPT SOLVES LINEAR REGRESSION USING A DIRECT SOLVER FOR (X^T X + lambda) beta = X^T y
+#
+# INPUT PARAMETERS:
+# --------------------------------------------------------------------------------------------
+# NAME  TYPE    DEFAULT   MEANING
+# --------------------------------------------------------------------------------------------
+# X     String  ---       Location (on HDFS) to read the matrix X of feature vectors
+# Y     String  ---       Location (on HDFS) to read the 1-column matrix Y of response values
+# B     String  ---       Location to store estimated regression parameters (the betas)
+# O     String  " "       Location to write the printed statistics; by default it is standard output
+# icpt  Int     0         Intercept presence, shifting and rescaling the columns of X:
+#                         0 = no intercept, no shifting, no rescaling;
+#                         1 = add intercept, but neither shift nor rescale X;
+#                         2 = add intercept, shift & rescale X columns to mean = 0, variance = 1
+# reg   Double  0.000001  Regularization constant (lambda) for L2-regularization; set to nonzero
+#                         for highly dependent/sparse/numerous features
+# fmt   String  "text"    Matrix output format for B (the betas) only, usually "text" or "csv"
+# --------------------------------------------------------------------------------------------
+# OUTPUT: Matrix of regression parameters (the betas); its contents and size depend on the icpt input value:
+#         OUTPUT SIZE:    OUTPUT CONTENTS:                HOW TO PREDICT Y FROM X AND B:
+# icpt=0: ncol(X) x 1     Betas for X only                Y ~ X %*% B[1:ncol(X), 1], or just X %*% B
+# icpt=1: ncol(X)+1 x 1   Betas for X and intercept       Y ~ X %*% B[1:ncol(X), 1] + B[ncol(X)+1, 1]
+# icpt=2: ncol(X)+1 x 2   Col.1: betas for X & intercept  Y ~ X %*% B[1:ncol(X), 1] + B[ncol(X)+1, 1]
+#                         Col.2: betas for shifted/rescaled X and intercept
+#
+# In addition, some regression statistics are provided in CSV format, one comma-separated
+# name-value pair per line, as follows:
+#
+# NAME                  MEANING
+# -------------------------------------------------------------------------------------
+# AVG_TOT_Y             Average of the response value Y
+# STDEV_TOT_Y           Standard Deviation of the response value Y
+# AVG_RES_Y             Average of the residual Y - pred(Y|X), i.e. residual bias
+# STDEV_RES_Y           Standard Deviation of the residual Y - pred(Y|X)
+# DISPERSION            GLM-style dispersion, i.e. residual sum of squares / # deg. fr.
+# PLAIN_R2              Plain R^2 of residual with bias included vs. total average
+# ADJUSTED_R2           Adjusted R^2 of residual with bias included vs. total average
+# PLAIN_R2_NOBIAS       Plain R^2 of residual with bias subtracted vs. total average
+# ADJUSTED_R2_NOBIAS    Adjusted R^2 of residual with bias subtracted vs. total average
+# PLAIN_R2_VS_0 *       Plain R^2 of residual with bias included vs. zero constant
+# ADJUSTED_R2_VS_0 *    Adjusted R^2 of residual with bias included vs. zero constant
+# -------------------------------------------------------------------------------------
+# * The last two statistics are only printed if there is no intercept (icpt=0)
+#
+# HOW TO INVOKE THIS SCRIPT - EXAMPLE:
+# hadoop jar SystemML.jar -f LinearRegDS.dml -nvargs X=INPUT_DIR/X Y=INPUT_DIR/Y B=OUTPUT_DIR/B
+#     O=OUTPUT_DIR/Out icpt=2 reg=1.0 fmt=csv
+
+fileX = $X;
+fileY = $Y;
+fileB = $B;
+fileO = ifdef ($O, " ");
+fmtB = ifdef ($fmt, "text");
+
+intercept_status = ifdef ($icpt, 0);      # $icpt=0;
+regularization = ifdef ($reg, 0.000001);  # $reg=0.000001;
+
+print ("BEGIN LINEAR REGRESSION SCRIPT");
+print ("Reading X and Y...");
+X = read (fileX);
+y = read (fileY);
+
+n = nrow (X);
+m = ncol (X);
+ones_n = matrix (1, rows = n, cols = 1);
+zero_cell = matrix (0, rows = 1, cols = 1);
+
+# Introduce the intercept, shift and rescale the columns of X if needed
+
+m_ext = m;
+if (intercept_status == 1 | intercept_status == 2)  # add the intercept column
+{
+    X = append (X, ones_n);
+    m_ext = ncol (X);
+}
+
+scale_lambda = matrix (1, rows = m_ext, cols = 1);
+if (intercept_status == 1 | intercept_status == 2)
+{
+    scale_lambda [m_ext, 1] = 0;
+}
+
+if (intercept_status == 2)  # scale-&-shift X columns to mean 0, variance 1
+{                           # Important assumption: X [, m_ext] = ones_n
+    avg_X_cols = t(colSums(X)) / n;
+    var_X_cols = (t(colSums (X ^ 2)) - n * (avg_X_cols ^ 2)) / (n - 1);
+    is_unsafe = ppred (var_X_cols, 0.0, "<=");
+    scale_X = 1.0 / sqrt (var_X_cols * (1 - is_unsafe) + is_unsafe);
+    scale_X [m_ext, 1] = 1;
+    shift_X = - avg_X_cols * scale_X;
+    shift_X [m_ext, 1] = 0;
+} else {
+    scale_X = matrix (1, rows = m_ext, cols = 1);
+    shift_X = matrix (0, rows = m_ext, cols = 1);
+}
+
+# Henceforth, if intercept_status == 2, we use "X %*% (SHIFT/SCALE TRANSFORM)"
+# instead of "X".  However, in order to preserve the sparsity of X,
+# we apply the transform associatively to some other part of the expression
+# in which it occurs.  To avoid materializing a large matrix, we rewrite it:
+#
+# ssX_A = (SHIFT/SCALE TRANSFORM) %*% A  --- is rewritten as:
+# ssX_A = diag (scale_X) %*% A;
+# ssX_A [m_ext, ] = ssX_A [m_ext, ] + t(shift_X) %*% A;
+#
+# tssX_A = t(SHIFT/SCALE TRANSFORM) %*% A  --- is rewritten as:
+# tssX_A = diag (scale_X) %*% A + shift_X %*% A [m_ext, ];
+
+lambda = scale_lambda * regularization;
+
+# BEGIN THE DIRECT SOLVE ALGORITHM (EXTERNAL CALL)
+
+A = t(X) %*% X;
+b = t(X) %*% y;
+if (intercept_status == 2) {
+    A = t(diag (scale_X) %*% A + shift_X %*% A [m_ext, ]);
+    A = diag (scale_X) %*% A + shift_X %*% A [m_ext, ];
+    b = diag (scale_X) %*% b + shift_X %*% b [m_ext, ];
+}
+A = A + diag (lambda);
+
+print ("Calling the Direct Solver...");
+
+beta_unscaled = solve (A, b);
+
+# END THE DIRECT SOLVE ALGORITHM
+
+if (intercept_status == 2) {
+    beta = scale_X * beta_unscaled;
+    beta [m_ext, ] = beta [m_ext, ] + t(shift_X) %*% beta_unscaled;
+} else {
+    beta = beta_unscaled;
+}
+
+print ("Computing the statistics...");
+
+avg_tot = sum (y) / n;
+ss_tot = sum (y ^ 2);
+ss_avg_tot = ss_tot - n * avg_tot ^ 2;
+var_tot = ss_avg_tot / (n - 1);
+y_residual = y - X %*% beta;
+avg_res = sum (y_residual) / n;
+ss_res = sum (y_residual ^ 2);
+ss_avg_res = ss_res - n * avg_res ^ 2;
+
+plain_R2 = 1 - ss_res / ss_avg_tot;
+if (n > m_ext) {
+    dispersion = ss_res / (n - m_ext);
+    adjusted_R2 = 1 - dispersion / (ss_avg_tot / (n - 1));
+} else {
+    dispersion = 0.0 / 0.0;
+    adjusted_R2 = 0.0 / 0.0;
+}
+
+plain_R2_nobias = 1 - ss_avg_res / ss_avg_tot;
+deg_freedom = n - m - 1;
+if (deg_freedom > 0) {
+    var_res = ss_avg_res / deg_freedom;
+    adjusted_R2_nobias = 1 - var_res / (ss_avg_tot / (n - 1));
+} else {
+    var_res = 0.0 / 0.0;
+    adjusted_R2_nobias = 0.0 / 0.0;
+    print ("Warning: zero or negative number of degrees of freedom.");
+}
+
+plain_R2_vs_0 = 1 - ss_res / ss_tot;
+if (n > m) {
+    adjusted_R2_vs_0 = 1 - (ss_res / (n - m)) / (ss_tot / n);
+} else {
+    adjusted_R2_vs_0 = 0.0 / 0.0;
+}
+
+str = "AVG_TOT_Y," + avg_tot;  # Average of the response value Y
+str = append (str, "STDEV_TOT_Y," + sqrt (var_tot));  # Standard Deviation of the response value Y
+str = append (str, "AVG_RES_Y," + avg_res);  # Average of the residual Y - pred(Y|X), i.e. residual bias
+str = append (str, "STDEV_RES_Y," + sqrt (var_res));  # Standard Deviation of the residual Y - pred(Y|X)
+str = append (str, "DISPERSION," + dispersion);  # GLM-style dispersion, i.e. residual sum of squares / # d.f.
+str = append (str, "PLAIN_R2," + plain_R2);  # Plain R^2 of residual with bias included vs. total average
+str = append (str, "ADJUSTED_R2," + adjusted_R2);  # Adjusted R^2 of residual with bias included vs. total average
+str = append (str, "PLAIN_R2_NOBIAS," + plain_R2_nobias);  # Plain R^2 of residual with bias subtracted vs. total average
+str = append (str, "ADJUSTED_R2_NOBIAS," + adjusted_R2_nobias);  # Adjusted R^2 of residual with bias subtracted vs. total average
+if (intercept_status == 0) {
+    str = append (str, "PLAIN_R2_VS_0," + plain_R2_vs_0);  # Plain R^2 of residual with bias included vs. zero constant
+    str = append (str, "ADJUSTED_R2_VS_0," + adjusted_R2_vs_0);  # Adjusted R^2 of residual with bias included vs. zero constant
+}
+
+if (fileO != " ") {
+    write (str, fileO);
+} else {
+    print (str);
+}
+
+# Prepare the output matrix
+print ("Writing the output matrix...");
+
+if (intercept_status == 2) {
+    beta_out = append (beta, beta_unscaled);
+} else {
+    beta_out = beta;
+}
+write (beta_out, fileB, format=fmtB);
+print ("END LINEAR REGRESSION SCRIPT");
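Note for readers: the DIRECT SOLVE step in LinearRegDS.dml above forms the ridge-regularized normal equations A = t(X) %*% X + diag(lambda) and b = t(X) %*% y, then solves A beta = b, with scale_lambda zeroing the penalty on the intercept column. The numpy sketch below re-derives that solve and the PLAIN_R2 / DISPERSION / ADJUSTED_R2 statistics for the icpt=1 case; it is an illustration only, with synthetic data and illustrative names, not SystemML/DML code and not part of this commit.

import numpy as np

rng = np.random.default_rng(0)
n, m = 200, 5
X = rng.normal(size=(n, m))                              # synthetic feature matrix
y = X @ rng.normal(size=m) + 0.1 * rng.normal(size=n)    # synthetic response

X_ext = np.hstack([X, np.ones((n, 1))])                  # icpt=1: append the intercept column
m_ext = m + 1
lam = np.full(m_ext, 1e-6)                               # regularization constant per column
lam[m] = 0.0                                             # scale_lambda zeroes the intercept penalty

A = X_ext.T @ X_ext + np.diag(lam)                       # A = t(X) %*% X + diag (lambda)
b = X_ext.T @ y                                          # b = t(X) %*% y
beta = np.linalg.solve(A, b)                             # beta_unscaled = solve (A, b)

beta_ls, *_ = np.linalg.lstsq(X_ext, y, rcond=None)      # unregularized least-squares cross-check
assert np.allclose(beta, beta_ls, atol=1e-4)             # tiny lambda barely perturbs the solution

y_res = y - X_ext @ beta                                 # y_residual
ss_res = np.sum(y_res ** 2)
ss_avg_tot = np.sum(y ** 2) - n * y.mean() ** 2
plain_R2 = 1 - ss_res / ss_avg_tot                       # PLAIN_R2
dispersion = ss_res / (n - m_ext)                        # DISPERSION
adjusted_R2 = 1 - dispersion / (ss_avg_tot / (n - 1))    # ADJUSTED_R2
print(plain_R2, adjusted_R2)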

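Similarly, the icpt=2 path never materializes the dense SHIFT/SCALE TRANSFORM: per the ssX_A / tssX_A rewrites documented in the script's header comment, it applies diag (scale_X) plus a rank-1 shift correction to A and b instead. The following numpy check (again synthetic and illustrative only) builds the dense transform S explicitly and confirms that the script's two-step rewrite of A equals t(S) %*% A %*% S:

import numpy as np

rng = np.random.default_rng(1)
n, m = 100, 4
m_ext = m + 1
X = np.hstack([rng.normal(size=(n, m)), np.ones((n, 1))])   # last column plays the role of ones_n

last = m_ext - 1                                            # 0-based index of the intercept column
avg = X.sum(axis=0) / n                                     # avg_X_cols
var = (np.sum(X ** 2, axis=0) - n * avg ** 2) / (n - 1)     # var_X_cols
scale = 1.0 / np.sqrt(np.where(var <= 0.0, 1.0, var))       # is_unsafe guard for zero variance
scale[last] = 1.0
shift = -avg * scale
shift[last] = 0.0

S = np.diag(scale)                                          # dense transform, built only for this check
S[last, :] += shift                                         # X @ S == shifted/rescaled X

A = X.T @ X
step1 = np.diag(scale) @ A + np.outer(shift, A[last, :])    # tssX_A rewrite: t(S) %*% A
A1 = step1.T                                                # script's intermediate A = t(...)
A_rw = np.diag(scale) @ A1 + np.outer(shift, A1[last, :])   # second application of the rewrite

assert np.allclose(A_rw, S.T @ A @ S)                       # two-step rewrite == direct triple product
print("shift/scale rewrite verified")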