http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/358cfc9f/Algorithms
 Reference/SystemML_Algorithms_Reference.tex
----------------------------------------------------------------------
diff --git a/Algorithms Reference/SystemML_Algorithms_Reference.tex 
b/Algorithms Reference/SystemML_Algorithms_Reference.tex
deleted file mode 100644
index 75308c9..0000000
--- a/Algorithms Reference/SystemML_Algorithms_Reference.tex    
+++ /dev/null
@@ -1,174 +0,0 @@
-\begin{comment}
-
- Licensed to the Apache Software Foundation (ASF) under one
- or more contributor license agreements.  See the NOTICE file
- distributed with this work for additional information
- regarding copyright ownership.  The ASF licenses this file
- to you under the Apache License, Version 2.0 (the
- "License"); you may not use this file except in compliance
- with the License.  You may obtain a copy of the License at
-
-   http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing,
- software distributed under the License is distributed on an
- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- KIND, either express or implied.  See the License for the
- specific language governing permissions and limitations
- under the License.
-
-\end{comment}
-
-\documentclass[letter]{article}
-\usepackage{graphicx,amsmath,amssymb,amsthm,subfigure,color,url,multirow,rotating,comment}
-\usepackage{tikz}
-\usepackage[normalem]{ulem}
-\usepackage[np,autolanguage]{numprint}
-\usepackage{tabularx}
-
-\usepackage[pdftex]{hyperref}
-\hypersetup{
-    unicode=false,          % non-Latin characters in Acrobat’s bookmarks
-    pdftoolbar=true,        % show Acrobat’s toolbar?
-    pdfmenubar=true,        % show Acrobat’s menu?
-    pdffitwindow=true,      % window fit to page when opened
-    pdfstartview={FitV},    % fits the width of the page to the window
-    pdftitle={SystemML Algorithms Reference},    % title
-    pdfauthor={SystemML Team}, % author
-    pdfsubject={Documentation},   % subject of the document
-    pdfkeywords={},         % list of keywords
-    pdfnewwindow=true,      % links in new window
-    bookmarksnumbered=true, % put section numbers in bookmarks
-    bookmarksopen=true,     % open up bookmark tree
-    bookmarksopenlevel=1,   % \maxdimen level to which bookmarks are open
-    colorlinks=true,        % false: boxed links; true: colored links
-    linkcolor=black,        % color of internal links  
-    citecolor=blue,         % color of links to bibliography
-    filecolor=black,        % color of file links
-    urlcolor=black          % color of external links
-}
-
-
-\newtheorem{definition}{Definition}
-\newtheorem{example}{Example}
-
-\newcommand{\Paragraph}[1]{\vspace*{1ex} \noindent {\bf #1} \hspace*{1ex}}
-\newenvironment{Itemize}{\vspace{-0.5ex}\begin{itemize}\setlength{\itemsep}{-0.2ex}
-}{\end{itemize}\vspace{-0.5ex}}
-\newenvironment{Enumerate}{\vspace{-0.5ex}\begin{enumerate}\setlength{\itemsep}{-0.2ex}
-}{\end{enumerate}\vspace{-0.5ex}}
-\newenvironment{Description}{\vspace{-0.5ex}\begin{description}\setlength{\itemsep}{-0.2ex}
-}{\end{description}\vspace{-0.5ex}}
-
-
-\newcommand{\SystemML}{\texttt{SystemML} }
-\newcommand{\hml}{\texttt{hadoop jar SystemML.jar} }
-\newcommand{\pxp}{\mathbin{\texttt{\%\textasteriskcentered\%}}}
-\newcommand{\todo}[1]{{{\color{red}TODO: #1}}}
-\newcommand{\Normal}{\ensuremath{\mathop{\mathrm{Normal}}\nolimits}}
-\newcommand{\Prob}{\ensuremath{\mathop{\mathrm{Prob}\hspace{0.5pt}}\nolimits}}
-\newcommand{\E}{\ensuremath{\mathop{\mathrm{E}}\nolimits}}
-\newcommand{\mean}{\ensuremath{\mathop{\mathrm{mean}}\nolimits}}
-\newcommand{\Var}{\ensuremath{\mathop{\mathrm{Var}}\nolimits}}
-\newcommand{\Cov}{\ensuremath{\mathop{\mathrm{Cov}}\nolimits}}
-\newcommand{\stdev}{\ensuremath{\mathop{\mathrm{st.dev}}\nolimits}}
-\newcommand{\atan}{\ensuremath{\mathop{\mathrm{arctan}}\nolimits}}
-\newcommand{\diag}{\ensuremath{\mathop{\mathrm{diag}}\nolimits}}
-\newcommand{\const}{\ensuremath{\mathop{\mathrm{const}}\nolimits}}
-\newcommand{\eps}{\varepsilon}
-
-\sloppy
-
-%%%%%%%%%%%%%%%%%%%%% 
-% header
-%%%%%%%%%%%%%%%%%%%%%
-
-\title{\LARGE{{\SystemML Algorithms Reference}}} 
-\date{\today}
-
-%%%%%%%%%%%%%%%%%%%%%
-% document start
-%%%%%%%%%%%%%%%%%%%%%
-\begin{document}       
-
-%\pagenumbering{roman}
-\maketitle
-
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-\section{Descriptive Statistics}
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-
-\input{DescriptiveStats}
-
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-\section{Classification}
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-
-\input{LogReg}
-
-\subsection{Support Vector Machines}
-
-\input{BinarySVM}
-
-\input{MultiSVM}
-
-\input{NaiveBayes}
-
-\input{DecisionTrees}
-
-\input{RandomForest}
-
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-\section{Clustering}
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-
-\input{Kmeans}
-
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-\section{Regression}
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-
-\input{LinReg}
-
-\input{StepLinRegDS}
-
-\input{GLM}
-
-\input{StepGLM}
-
-\input{GLMpredict.tex}
-
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-\section{Matrix Factorization}
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-
-\input{pca}
-
-\input{ALS.tex}
-
-%%{\color{red}\subsection{GNMF}}
-
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-%%{\color{red}\section{Sequence Mining}}
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-
-
-
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-\section{Survival Analysis}
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-\input{KaplanMeier}
-
-\input{Cox}
-
-\bibliographystyle{abbrv}
-
-\bibliography{SystemML_ALgorithms_Reference}
-
-       
-%%%%%%%%%%%%%%%%%%%%%
-% document end
-%%%%%%%%%%%%%%%%%%%%%
-\end{document}
-
-

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/358cfc9f/Language
 Reference/PyDML Language Reference.doc
----------------------------------------------------------------------
diff --git a/Language Reference/PyDML Language Reference.doc b/Language 
Reference/PyDML Language Reference.doc
deleted file mode 100644
index b43b6db..0000000
Binary files a/Language Reference/PyDML Language Reference.doc and /dev/null 
differ

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/358cfc9f/Language
 Reference/Python syntax for DML.doc
----------------------------------------------------------------------
diff --git a/Language Reference/Python syntax for DML.doc b/Language 
Reference/Python syntax for DML.doc
deleted file mode 100644
index ee43a6b..0000000
Binary files a/Language Reference/Python syntax for DML.doc and /dev/null differ

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/358cfc9f/Language
 Reference/README_HADOOP_CONFIG.txt
----------------------------------------------------------------------
diff --git a/Language Reference/README_HADOOP_CONFIG.txt b/Language 
Reference/README_HADOOP_CONFIG.txt
deleted file mode 100644
index e34d4f3..0000000
--- a/Language Reference/README_HADOOP_CONFIG.txt       
+++ /dev/null
@@ -1,83 +0,0 @@
-Usage
------
-The machine learning algorithms described in SystemML_Algorithms_Reference.pdf
-can be invoked from the hadoop command line using the algorithm-specific
-parameters described there.
-
-Generic command-line arguments are described by the help command below.
-
-   hadoop jar SystemML.jar -? or -help 
-
-
-Recommended configurations
---------------------------
-1) JVM Heap Sizes: 
-We recommend an equal-sized JVM configuration for clients, mappers, and 
reducers. For the client
-process this can be done via
-
-   export HADOOP_CLIENT_OPTS="-Xmx2048m -Xms2048m -Xmn256m" 
-   
-where Xmx specifies the maximum heap size, Xms the initial heap size, and Xmn
-the size of the young generation. For Xmn values equal to or less than 15% of
-the max heap size, we guarantee the memory budget.
-
-For mapper or reducer JVM configurations, the following properties can be 
specified in mapred-site.xml,
-where 'child' refers to both mapper and reducer. If map and reduce are 
specified individually, they take 
-precedence over the generic property.
-
-  <property>
-    <name>mapreduce.child.java.opts</name> <!-- synonym: 
mapred.child.java.opts -->
-    <value>-Xmx2048m -Xms2048m -Xmn256m</value>
-  </property>
-  <property>
-    <name>mapreduce.map.java.opts</name> <!-- synonym: mapred.map.java.opts -->
-    <value>-Xmx2048m -Xms2048m -Xmn256m</value>
-  </property>
-  <property>
-    <name>mapreduce.reduce.java.opts</name> <!-- synonym: 
mapred.reduce.java.opts -->
-    <value>-Xmx2048m -Xms2048m -Xmn256m</value>
-  </property>
- 
-
-2) CP Memory Limitation:
-There are size limitations for in-memory matrices. Dense in-memory matrices are
-limited to 16GB independent of their dimensions. Sparse in-memory matrices are
-limited to 2G rows and 2G columns, but the overall matrix can be larger. These
-limitations apply only to in-memory matrices, NOT to matrices stored in HDFS or
-involved in MR computations. Setting HADOOP_CLIENT_OPTS below those limitations
-prevents runtime errors.
-
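As a rough illustration of the dense limit above (an editorial sketch, not part of
SystemML, assuming 8 bytes per double-precision cell):

   # Python sketch: estimate the in-memory size of a dense matrix of doubles
   def dense_matrix_gb(rows, cols):
       return rows * cols * 8 / (1024.0 ** 3)

   dense_matrix_gb(40000, 40000)   # ~11.9 GB, still below the 16GB dense limit
   dense_matrix_gb(50000, 50000)   # ~18.6 GB, exceeds the 16GB dense limit
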
-3) Transparent Huge Pages (on Red Hat Enterprise Linux 6):
-Hadoop workloads might show very high system CPU utilization if THP is enabled.
-In case of such behavior, we recommend disabling THP with
-   
-   echo never > /sys/kernel/mm/redhat_transparent_hugepage/enabled
-   
-4) JVM Reuse:
-Performance benefits from JVM reuse because data sets that fit into the mapper
-memory budget are reused across tasks per slot. However, in Hadoop 1.0.3 JVM
-reuse is incompatible with security (when using the LinuxTaskController); the
-workaround is to use the DefaultTaskController. SystemML provides a configuration
-property in SystemML-config.xml to enable JVM reuse on a per-job level without
-changing the global cluster configuration.
-   
-   <jvmreuse>false</jvmreuse> 
-   
-5) Number of Reducers:
-The number of reducers can have significant impact on performance. SystemML 
provides a configuration
-property to set the default number of reducers per job without changing the 
global cluster configuration.
-In general, we recommend a setting of twice the number of nodes. Smaller numbers
-create fewer intermediate files; larger numbers increase the degree of
-parallelism for compute and parallel write. In SystemML-config.xml, set:
-   
-   <!-- default number of reduce tasks per MR job, default: 2 x number of 
nodes -->
-   <numreducers>12</numreducers> 
-
-6) SystemML temporary directories:
-SystemML uses temporary directories in two different locations: (1) on the local
-file system for temporary files of the client process, and (2) on HDFS for
-intermediate results between different MR jobs and between MR jobs and in-memory
-operations. The locations of these directories can be configured in
-SystemML-config.xml with the following properties:
-
-   <!-- local fs tmp working directory-->
-   <localtmpdir>/tmp/systemml</localtmpdir>
-
-   <!-- hdfs tmp working directory--> 
-   <scratch>scratch_space</scratch> 
- 
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/358cfc9f/_config.yml
----------------------------------------------------------------------
diff --git a/_config.yml b/_config.yml
index ba1a808..15e0852 100644
--- a/_config.yml
+++ b/_config.yml
@@ -10,6 +10,10 @@ include:
   - _static
   - _modules
 
+exclude:
+  - alg-ref
+  - lang-ref
+
 # These allow the documentation to be updated with newer releases
 SYSTEMML_VERSION: Latest
 

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/358cfc9f/alg-ref/ALS.tex
----------------------------------------------------------------------
diff --git a/alg-ref/ALS.tex b/alg-ref/ALS.tex
new file mode 100644
index 0000000..c2a5e3a
--- /dev/null
+++ b/alg-ref/ALS.tex
@@ -0,0 +1,298 @@
+\begin{comment}
+
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied.  See the License for the
+ specific language governing permissions and limitations
+ under the License.
+
+\end{comment}
+
+\subsection{Matrix Completion via Alternating Minimizations}
+\label{matrix_completion}
+
+\noindent{\bf Description}
+\smallskip
+
+Low-rank matrix completion is an effective technique for statistical data analysis that is widely used in data mining and machine learning applications.
+Matrix completion is a variant of low-rank matrix factorization with the goal of recovering a partially observed and potentially noisy matrix from a subset of its revealed entries.
+Perhaps the most popular application of matrix completion is collaborative filtering in recommender systems.
+In this setting, the rows of the data matrix correspond to users,
+the columns to items such as movies, and the entries to feedback provided by users for items.
+The goal is to predict the missing entries of the rating matrix.
+This implementation uses the alternating least-squares (ALS) technique to solve large-scale matrix completion problems.\\
+
+
+\smallskip
+\noindent{\bf Usage}
+\smallskip
+
+{\hangindent=\parindent\noindent\it%
+       {\tt{}-f }path/\/{\tt{}ALS.dml}
+       {\tt{} -nvargs}
+       {\tt{} V=}path/file
+       {\tt{} L=}path/file
+       {\tt{} R=}path/file
+%      {\tt{} VO=}path/file
+       {\tt{} rank=}int
+       {\tt{} reg=}L2$\mid$wL2%regularization
+       {\tt{} lambda=}double
+       {\tt{} fmt=}format
+       
+}
+
+
+\smallskip
+\noindent{\bf Arguments}
+\begin{Description}
+       \item[{\tt V}:]
+       Location (on HDFS) to read the input (user-item) matrix $V$ to be 
factorized
+       \item[{\tt L}:]
+       Location (on HDFS) to write the left (user) factor matrix $L$
+       \item[{\tt R}:]
+       Location (on HDFS) to write the right (item) factor matrix $R$
+%      \item[{\tt VO}:]
+%      Location (on HDFS) to write the input matrix $VO$ with empty rows and 
columns removed (if there are any)
+       \item[{\tt rank}:] (default:\mbox{ }{\tt 10})
+       Rank of the factorization
+       \item[{\tt reg}] (default:\mbox{ }{\tt L2})
+       Regularization:\\
+       {\tt L2} = L2 regularization;\\
+       {\tt wL2} = weighted L2 regularization;\\
+       if {\tt reg} is not provided no regularization will be performed. 
+       \item[{\tt lambda}:] (default:\mbox{ }{\tt 0.000001})
+       Regularization parameter
+       \item[{\tt maxi}:] (default:\mbox{ }{\tt 50})
+        Maximum number of iterations
+       \item[{\tt check}:] (default:\mbox{ }{\tt FALSE})
+       Check for convergence after every iteration, i.e., updating $L$ and $R$ 
once
+       \item[{\tt thr}:] (default:\mbox{ }{\tt 0.0001})
+       Assuming {\tt check=TRUE}, the algorithm stops and convergence is 
declared 
+       if the decrease in loss in any two consecutive iterations falls below 
threshold {\tt thr}; 
+       if {\tt check=FALSE} parameter {\tt thr} is ignored.
+       \item[{\tt fmt}:] (default:\mbox{ }{\tt "text"})
+       Matrix file output format, such as {\tt text}, {\tt mm}, or {\tt csv}
+\end{Description}
+ 
+ \smallskip
+ \noindent{\bf Usage: ALS Prediction/Top-K Prediction}
+ \smallskip
+ 
+ {\hangindent=\parindent\noindent\it%
+       {\tt{}-f }path/\/{\tt{}ALS\_predict.dml}
+       {\tt{} -nvargs}
+       {\tt{} X=}path/file
+       {\tt{} Y=}path/file
+       {\tt{} L=}path/file
+       {\tt{} R=}path/file
+       {\tt{} Vrows=}int
+       {\tt{} Vcols=}int
+       {\tt{} fmt=}format
+       
+ }\smallskip
+ 
+ 
+  \smallskip  
+  {\hangindent=\parindent\noindent\it%
+       {\tt{}-f }path/\/{\tt{}ALS\_topk\_predict.dml}
+       {\tt{} -nvargs}
+       {\tt{} X=}path/file
+       {\tt{} Y=}path/file
+       {\tt{} L=}path/file
+       {\tt{} R=}path/file
+       {\tt{} V=}path/file
+       {\tt{} K=}int
+       {\tt{} fmt=}format
+       
+  }\smallskip
+ 
+%   \noindent{\bf Arguments --- Prediction}
+%   \begin{Description}
+%      \item[{\tt X}:]
+%      Location (on HDFS) to read the input matrix $X$ containing user-ids 
(first column) and item-ids (second column) 
+%      \item[{\tt L}:]
+%      Location (on HDFS) to read the left (user) factor matrix $L$
+%      \item[{\tt R}:]
+%      Location (on HDFS) to read the right (item) factor matrix $R$
+%      \item[{\tt Y}:]
+%      Location (on HDFS) to write the output matrix $Y$ containing user-ids 
(first column), item-ids (second column) and predicted ratings (third column)
+%      \item[{\tt Vrows}:] 
+%      Number of rows of the user-item matrix $V$
+%      \item[{\tt Vcols}] 
+%      Number of columns of the user-item matrix $V$ 
+%      \item[{\tt fmt}:] (default:\mbox{ }{\tt "text"})
+%      Matrix file output format, such as {\tt text}, {\tt mm}, or {\tt csv}
+%   \end{Description}
+   
+
+  \noindent{\bf Arguments --- Prediction/Top-K Prediction}
+  \begin{Description}
+       \item[{\tt V}:]
+       Location (on HDFS) to read the user-item matrix $V$ 
+       \item[{\tt X}:]
+	Location (on HDFS) to read the input matrix $X$ with the following format:
+       \begin{itemize}
+               \item for {ALS\_predict.dml}: a 2-column matrix that contains 
the user-ids (first column) and the item-ids (second column),
+               \item for {ALS\_topk\_predict.dml}: a 1-column matrix that 
contains the user-ids.
+       \end{itemize} 
+       \item[{\tt Y}:]
+       Location (on HDFS) to write the output of prediction with the following 
format:
+       \begin{itemize}
+               \item for {ALS\_predict.dml}: a 3-column matrix that contains 
the user-ids (first column), the item-ids (second column) and the predicted 
ratings (third column),
+               \item for {ALS\_topk\_predict.dml}: a ($K+1$)-column matrix 
that contains the user-ids in the first column and the top-K item-ids in the 
remaining $K$ columns will be stored at {\tt Y}.
+               Additionally, a matrix with the same dimensions that contains 
the corresponding actual top-K ratings will be stored at {\tt Y.ratings}; see 
below for details. 
+       \end{itemize}
+%      Note the following output format in predicting top-K items. 
+%      For a user with no available ratings in $V$ no 
+%      top-K items will be provided, i.e., the corresponding row in $Y$ will 
contains 0s.   
+%      Moreover, $K'<K$ items with highest predicted ratings will be provided 
for a user $i$ 
+%      if the number of missing ratings $K'$ (i.e., those with 0 value in $V$) 
for $i$ is less than $K$.
+       \item[{\tt L}:]
+       Location (on HDFS) to read the left (user) factor matrix $L$
+       \item[{\tt R}:]
+	Location (on HDFS) to read the right (item) factor matrix $R$
+       \item[{\tt Vrows}:] 
+       Number of rows of $V$ (i.e., number of users)
+       \item[{\tt Vcols}] 
+       Number of columns of $V$ (i.e., number of items) 
+       \item[{\tt K}:] (default:\mbox{ }{\tt 5})
+       Number of top-K items for top-K prediction
+       \item[{\tt fmt}:] (default:\mbox{ }{\tt "text"})
+       Matrix file output format, such as {\tt text}, {\tt mm}, or {\tt csv}
+  \end{Description}
+  
+ \noindent{\bf Details}
+ \smallskip
+ 
+ Given an $m \times n$ input matrix $V$ and a rank parameter $r \ll 
\min{(m,n)}$, low-rank matrix factorization seeks to find an $m \times r$ 
matrix $L$ and an $r \times n$ matrix $R$ such that $V \approx LR$, i.e., we 
aim to approximate $V$ by the low-rank matrix $LR$.
+ The quality of the approximation is determined by an application-dependent 
loss function $\mathcal{L}$. We aim at finding the loss-minimizing factor 
matrices, i.e., 
+ \begin{equation}\label{eq:problem}
+ (L^*, R^*) = \textrm{argmin}_{L,R}{\mathcal{L}(V,L,R)}.
+ \end{equation} 
+ In the context of collaborative filtering in recommender systems it is often the case that the input matrix $V$ contains many missing entries.
+ Such entries are coded with the value 0, and the loss function is computed based only on the nonzero entries in $V$, i.e.,
+ \begin{equation*} %\label{eq:loss}
+ \mathcal{L}=\sum_{(i,j)\in\Omega}l(V_{ij},L_{i*},R_{*j}),
+ \end{equation*} 
+ where $L_{i*}$ and $R_{*j}$, respectively, denote the $i$th row of $L$ and 
the $j$th column of $R$, $\Omega=\{\omega_1,\dots,\omega_N\}$ denotes the 
training set containing the observed (nonzero) entries in $V$, and $l$ is some 
local loss function.  
+ %for some training set $\Omega$ that contains the observed (nonzero) entries 
in $V$ and some local loss function $l$. In the above formula, 
+ 
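As an aside, the nonzero squared loss above can be sketched in a few lines of NumPy
(an illustrative editorial sketch with hypothetical names, not the DML implementation):
\begin{verbatim}
# Sketch of the nonzero squared loss L_Nzsl over the observed entries,
# with 0 coding unobserved cells, as defined above.
import numpy as np

def nzsl(V, L, R):
    mask = (V != 0)                  # Omega: observed (nonzero) entries
    err = (V - L @ R) * mask
    return np.sum(err ** 2)
\end{verbatim}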
+ ALS is an optimization technique that can be used to solve quadratic 
problems. 
+ For matrix completion, the algorithm repeatedly keeps one of the unknown 
matrices ($L$ or $R$) fixed and optimizes the other one. In particular, ALS 
alternates between recomputing the rows of $L$ in one step and the columns of 
$R$ in the subsequent step.  
+ Our implementation of the ALS algorithm supports the loss functions 
summarized in Table~\ref{tab:loss_functions} commonly used for matrix 
completion~\cite{ZhouWSP08:als}. 
+ %
+ \begin{table}[t]
+       \centering
+       \begin{tabular}{|ll|} \hline
+               Loss & Definition \\ \hline
+%              $\mathcal{L}_\text{Sl}$ & $\sum_{i,j} (V_{ij} - [LR]_{ij})^2$ \\
+%              $\mathcal{L}_\text{Sl+L2}$ & $\mathcal{L}_\text{Sl} + \lambda 
\Bigl( \sum_{ik} L_{ik}^2 + \sum_{kj} R_{kj}^2 \Bigr)$ \\
+               $\mathcal{L}_\text{Nzsl}$ & $\sum_{i,j:V_{ij}\neq 0} (V_{ij} - 
[LR]_{ij})^2$ \\
+               $\mathcal{L}_\text{Nzsl+L2}$ & $\mathcal{L}_\text{Nzsl} + 
\lambda \Bigl( \sum_{ik} L_{ik}^2 + \sum_{kj} R_{kj}^2 \Bigr)$ \\
+               $\mathcal{L}_\text{Nzsl+wL2}$ & $\mathcal{L}_\text{Nzsl} + 
\lambda \Bigl(\sum_{ik}N_{i*} L_{ik}^2 + \sum_{kj}N_{*j} R_{kj}^2 \Bigr)$ \\ 
\hline 
+       \end{tabular}
+	\caption{Popular loss functions supported by our ALS implementation; $N_{i*}$ and $N_{*j}$, respectively, denote the number of nonzero entries in row $i$ and column $j$ of $V$.}
+	\label{tab:loss_functions}
+ \end{table}
+ 
+ Note that the matrix completion problem as defined in (\ref{eq:problem}) is a 
non-convex problem for all loss functions from Table~\ref{tab:loss_functions}. 
+ However, when fixing one of the matrices $L$ or $R$, we get a least-squares 
problem with a globally optimal solution.  
+ For example, for the case of $\mathcal{L}_\text{Nzsl+wL2}$ we have the following closed-form solutions:
+  \begin{align*}
+  L^\top_{n+1,i*} &\leftarrow (R^{(i)}_n {[R^{(i)}_n]}^\top + \lambda N_2 
I)^{-1} R_n V^\top_{i*}, \\
+  R_{n+1,*j} &\leftarrow ({[L^{(j)}_{n+1}]}^\top L^{(j)}_{n+1} + \lambda N_1 
I)^{-1} L^\top_{n+1} V_{*j}, 
+  \end{align*}
+ where $L_{n+1,i*}$ (resp. $R_{n+1,*j}$) denotes the $i$th row of $L_{n+1}$ 
(resp. $j$th column of $R_{n+1}$), $\lambda$ denotes 
+ the regularization parameter, $I$ is the identity matrix of appropriate 
dimensionality, 
+ $V_{i*}$ (resp. $V_{*j}$) denotes the revealed entries in row $i$ (column 
$j$), 
+ $R^{(i)}_n$ (resp. $L^{(j)}_{n+1}$) refers to the corresponding columns of 
$R_n$ (rows of $L_{n+1}$), 
+ and $N_1$ (resp. $N_2$) denotes a diagonal matrix that contains the number of 
nonzero entries in row $i$ (column $j$) of $V$.   
+ 
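For intuition, one alternating sweep with one common weighting (by the number of
observed entries in the row or column being updated) can be sketched in NumPy as
follows. This is an illustrative editorial sketch only; ALS.dml differs in details
such as blocking, convergence checks, and distributed execution.
\begin{verbatim}
import numpy as np

def als_sweep(V, L, R, lam):
    """One alternating sweep: update rows of L, then columns of R.
    V: m x n ratings (0 = unobserved), L: m x r, R: r x n."""
    m, n = V.shape
    r = L.shape[1]
    for i in range(m):                      # recompute each row of L
        omega = np.nonzero(V[i, :])[0]      # items rated by user i
        if omega.size == 0:
            continue                        # unrated users keep a zero row
        Ri = R[:, omega]                    # r x |omega|
        A = Ri @ Ri.T + lam * omega.size * np.eye(r)   # weighted L2 term
        L[i, :] = np.linalg.solve(A, Ri @ V[i, omega])
    for j in range(n):                      # recompute each column of R
        omega = np.nonzero(V[:, j])[0]      # users who rated item j
        if omega.size == 0:
            continue
        Lj = L[omega, :]                    # |omega| x r
        A = Lj.T @ Lj + lam * omega.size * np.eye(r)
        R[:, j] = np.linalg.solve(A, Lj.T @ V[omega, j])
    return L, R
\end{verbatim}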
+% For example, for the case of $\mathcal{L}_\text{Sl-L2}$ we have the 
following closed form solutions
+% \begin{align*}
+% L^\top_{n+1,i*} &\leftarrow (R_n {[R_n]}^\top + \lambda I)^{-1} R_n 
V^\top_{i*}, \\
+% R_{n+1,*j} &\leftarrow ({[L_{n+1}]}^\top L_{n+1} + \lambda I)^{-1} 
L^\top_{n+1} V_{*j}, 
+% \end{align*}
+% where $L_{n+1,i*}$ (resp. $R_{n+1,*j}$) denotes the $i$th row of $L_{n+1}$ 
(resp. $j$th column of $R_{n+1}$), $\lambda$ denotes 
+% the regularization parameter and $I$ is the identity matrix of appropriate 
dimensionality. 
+% For the case of $\mathcal{L}_\text{Nzsl}$ we need to remove the equation 
that correspond to zero entries of $V$ from the least-squares problems. 
+% With wL2 we get the following equations
+% \begin{align*}
+% L^\top_{n+1,i*} &\leftarrow (R^{(i)}_n {[R^{(i)}_n]}^\top + \lambda N_2 
I)^{-1} R_n V^\top_{i*}, \\
+% R_{n+1,*j} &\leftarrow ({[L^{(j)}_{n+1}]}^\top L^{(j)}_{n+1} + \lambda N_1 
I)^{-1} L^\top_{n+1} V_{*j}, 
+% \end{align*}
+% where $V_{i*}$ (resp. $V_{*j}$) denotes the revealed entries in row $i$ 
(column $j$), 
+% $R^{(i)}_n$ (resp. $L^{(j)}_{n+1}$) refers to the corresponding columns of 
$R_n$ (rows of $L_{n+1}$), 
+% and $N_1$ (resp. $N_2$) denotes a diagonal matrix that contains the number 
of nonzero entries in row $i$ (column $j$) of $V$.
+ 
+ \textbf{Prediction.} 
+ Based on the factor matrices computed by ALS we provide two prediction 
scripts:   
+ \begin{Enumerate}
+       \item {\tt ALS\_predict.dml} computes the predicted ratings for a given 
list of users and items;
+	\item {\tt ALS\_topk\_predict.dml} computes, for a given list of users, the top-$K$ items (where $K$ is given as input) with the highest predicted ratings together with their corresponding ratings; a conceptual sketch of both predictions is given below.
+ \end{Enumerate} 
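Conceptually, predictions follow directly from the factor matrices (an illustrative
editorial sketch; the actual scripts add file I/O and skip users without any
observed ratings):
\begin{verbatim}
import numpy as np

def predict_rating(L, R, i, j):
    return L[i, :] @ R[:, j]            # predicted rating of item j by user i

def topk_items(L, R, i, K=5):
    scores = L[i, :] @ R                # predicted ratings for all items
    return np.argsort(-scores)[:K]      # indices of the K highest ratings
\end{verbatim}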
+  
+ \smallskip
+ \noindent{\bf Returns}
+ \smallskip
+ 
+ We output the factor matrices $L$ and $R$ after the algorithm has converged. The algorithm is declared converged if one of two criteria is met:
+ (1) the decrease in the value of the loss function falls below the threshold {\tt thr}
+ given as an input parameter (if parameter {\tt check=TRUE}), or (2) the maximum number of iterations (defined by parameter {\tt maxi}) is reached.
+ Note that for a given user $i$ prediction is possible only if user $i$ has rated at least one item, i.e., row $i$ of matrix $V$ has at least one nonzero entry.
+ In case some users have not rated any items, the corresponding rows of $L$ will contain only 0s.
+ Similarly, if some items have not been rated at all, the corresponding columns of $R$ will contain only 0s.
+ Our prediction scripts output the predicted ratings for a given list of users and items, as well as the top-$K$ items with the highest predicted ratings together with the predicted ratings for a given list of users. Note that predictions are only provided for users who have rated at least one item, i.e., whose corresponding rows contain at least one nonzero entry.
+% Moreover in the case of top-K prediction, if the number of predicted 
ratings---i.e., missing entries--- for some user $i$ is less than the input 
parameter $K$, all the predicted ratings for user $i$ will be provided.
+
+ 
+
+ 
+ 
+  
+ \smallskip
+ \noindent{\bf Examples}
+ \smallskip
+  
+% {\hangindent=\parindent\noindent\tt
+%      \hml -f ALS.dml -nvargs V=/user/biadmin/V L=/user/biadmin/L 
R=/user/biadmin/R rank=10 reg="L2" lambda=0.0001 fmt=csv 
+%              
+% }
+  
+ {\hangindent=\parindent\noindent\tt
+       \hml -f ALS.dml -nvargs V=/user/biadmin/V L=/user/biadmin/L 
R=/user/biadmin/R rank=10 reg="wL2" lambda=0.0001 maxi=50 check=TRUE thr=0.001 
fmt=csv      
+       
+ }
+ 
+ \noindent To compute predicted ratings for a given list of users and items:
+ 
+ {\hangindent=\parindent\noindent\tt
+       \hml -f ALS-predict.dml -nvargs X=/user/biadmin/X Y=/user/biadmin/Y 
L=/user/biadmin/L R=/user/biadmin/R  Vrows=100000 Vcols=10000 fmt=csv       
+       
+ }
+  
+ \noindent To compute top-K items with highest predicted ratings together with 
the predicted ratings for a given list of users:
+ 
+ {\hangindent=\parindent\noindent\tt
+       \hml -f ALS-top-predict.dml -nvargs X=/user/biadmin/X Y=/user/biadmin/Y 
L=/user/biadmin/L R=/user/biadmin/R V=/user/biadmin/V K=10 fmt=csv      
+       
+ }
+
+
+%
+%\begin{itemize}
+%      \item Y. Zhou, D. K. Wilkinson, R. Schreiber, and R. Pan. 
\newblock{Large-scale parallel collaborative flitering for the Netflix prize}. 
In Proceedings of the International
+%      Conference on Algorithmic Aspects in Information and Management (AAIM), 
2008, 337-348.
+%\end{itemize}
+ 
+ 
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/358cfc9f/alg-ref/BinarySVM.tex
----------------------------------------------------------------------
diff --git a/alg-ref/BinarySVM.tex b/alg-ref/BinarySVM.tex
new file mode 100644
index 0000000..7ff5b06
--- /dev/null
+++ b/alg-ref/BinarySVM.tex
@@ -0,0 +1,175 @@
+\begin{comment}
+
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied.  See the License for the
+ specific language governing permissions and limitations
+ under the License.
+
+\end{comment}
+
+\subsubsection{Binary-class Support Vector Machines}
+\label{l2svm}
+
+\noindent{\bf Description}
+
+Support Vector Machines are used to model the relationship between a 
categorical 
+dependent variable y and one or more explanatory variables denoted X. This 
+implementation learns (and predicts with) a binary class support vector 
machine 
+(y with domain size 2).
+\\
+
+\noindent{\bf Usage}
+
+\begin{tabbing}
+\texttt{-f} \textit{path}/\texttt{l2-svm.dml -nvargs} 
+\=\texttt{X=}\textit{path}/\textit{file} 
+  \texttt{Y=}\textit{path}/\textit{file} 
+  \texttt{icpt=}\textit{int} 
+  \texttt{tol=}\textit{double}\\
+\>\texttt{reg=}\textit{double} 
+  \texttt{maxiter=}\textit{int} 
+  \texttt{model=}\textit{path}/\textit{file}\\
+\>\texttt{Log=}\textit{path}/\textit{file}
+  \texttt{fmt=}\textit{csv}$\vert$\textit{text}
+\end{tabbing}
+
+\begin{tabbing}
+\texttt{-f} \textit{path}/\texttt{l2-svm-predict.dml -nvargs} 
+\=\texttt{X=}\textit{path}/\textit{file} 
+  \texttt{Y=}\textit{path}/\textit{file} 
+  \texttt{icpt=}\textit{int} 
+  \texttt{model=}\textit{path}/\textit{file}\\
+\>\texttt{scores=}\textit{path}/\textit{file}
+  \texttt{accuracy=}\textit{path}/\textit{file}\\
+\>\texttt{confusion=}\textit{path}/\textit{file}
+  \texttt{fmt=}\textit{csv}$\vert$\textit{text}
+\end{tabbing}
+
+%%\begin{verbatim}
+%%-f path/l2-svm.dml -nvargs X=path/file Y=path/file icpt=int tol=double
+%%                      reg=double maxiter=int model=path/file
+%%\end{verbatim}
+
+\noindent{\bf Arguments}
+
+\begin{itemize}
+\item X: Location (on HDFS) to read the matrix of feature vectors; 
+each row constitutes one feature vector.
+\item Y: Location to read the one-column matrix of (categorical)
+labels that correspond to feature vectors in X. Binary class labels
+can be expressed in one of two ways: as $\pm 1$ or as $1/2$. Note that
+this argument is optional for prediction.
+\item icpt (default: {\tt 0}): If set to 1 then a constant bias column is 
+added to X. 
+\item tol (default: {\tt 0.001}): Procedure terminates early if the reduction
+in objective function value is less than tolerance times the initial objective
+function value.
+\item reg (default: {\tt 1}): Regularization constant. See details to find 
+out where lambda appears in the objective function. If one were interested 
+in drawing an analogy with the C parameter in C-SVM, then C = 2/lambda. 
+Usually, cross validation is employed to determine the optimum value of 
+lambda.
+\item maxiter (default: {\tt 100}): The maximum number of iterations.
+\item model: Location (on HDFS) that contains the learnt weights.
+\item Log: Location (on HDFS) to collect various metrics (e.g., objective 
+function value etc.) that depict progress across iterations while training.
+\item fmt (default: {\tt text}): Specifies the output format. Choice of 
+comma-separated values (csv) or as a sparse-matrix (text).
+\item scores: Location (on HDFS) to store scores for a held-out test set.
+Note that this is an optional argument.
+\item accuracy: Location (on HDFS) to store the accuracy computed on a
+held-out test set. Note that this is an optional argument.
+\item confusion: Location (on HDFS) to store the confusion matrix
+computed using a held-out test set. Note that this is an optional
+argument.
+\end{itemize}
+
+\noindent{\bf Details}
+
+Support vector machines learn a classification function by solving the
+following optimization problem ($L_2$-SVM):
+\begin{eqnarray*}
+&\textrm{argmin}_w& \frac{\lambda}{2} ||w||_2^2 + \sum_i \xi_i^2\\
+&\textrm{subject to:}& y_i w^{\top} x_i \geq 1 - \xi_i ~ \forall i
+\end{eqnarray*}
+where $x_i$ is an example from the training set with its label given by $y_i$, 
+$w$ is the vector of parameters and $\lambda$ is the regularization constant 
+specified by the user.
+
+To account for the missing bias term, one may augment the data with a column
+of constants, which is achieved by setting the icpt argument to 1 (C-J Hsieh
+et al, 2008).
+
+This implementation optimizes the primal directly (Chapelle, 2007). It uses
+nonlinear conjugate gradient descent to minimize the objective function,
+choosing step sizes by performing one-dimensional Newton minimization in the
+direction of the gradient.
+\\
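In unconstrained form the objective above is
$\frac{\lambda}{2}||w||_2^2 + \sum_i \max(0, 1 - y_i w^\top x_i)^2$, obtained by
substituting the optimal slack values. A small NumPy sketch of this objective and
its gradient follows (illustrative editorial example; l2-svm.dml itself implements
the nonlinear conjugate gradient solver, which is not reproduced here):
\begin{verbatim}
import numpy as np

def l2svm_objective(w, X, y, lam):
    # lambda/2 * ||w||^2 + sum_i max(0, 1 - y_i * w'x_i)^2
    xi = np.maximum(1.0 - y * (X @ w), 0.0)
    return 0.5 * lam * (w @ w) + np.sum(xi ** 2)

def l2svm_gradient(w, X, y, lam):
    m = 1.0 - y * (X @ w)
    a = m > 0                          # margin-violating examples
    return lam * w - 2.0 * X[a].T @ (y[a] * m[a])
\end{verbatim}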
+
+\noindent{\bf Returns}
+
+The learnt weights produced by l2-svm.dml are populated into a single column 
matrix 
+and written to file on HDFS (see model in section Arguments). The number of 
rows in 
+this matrix is ncol(X) if intercept was set to 0 during invocation and ncol(X) 
+ 1 
+otherwise. The bias term, if used, is placed in the last row. Depending on 
what arguments
+are provided during invocation, l2-svm-predict.dml may compute one or more of 
scores, 
+accuracy and confusion matrix in the output format specified. 
+\\
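For intuition, the scores and accuracy can be read as follows (a hedged editorial
sketch assuming the $\pm 1$ label coding and the bias stored in the last row of the
model, as described above; the actual l2-svm-predict.dml handles both label codings
and all file I/O):
\begin{verbatim}
import numpy as np

def svm_scores(X, w, icpt=0):
    b = w[-1] if icpt == 1 else 0.0    # bias term stored in the last row
    v = w[:-1] if icpt == 1 else w
    return X @ v + b

def svm_accuracy(scores, y):
    pred = np.where(scores >= 0, 1.0, -1.0)
    return np.mean(pred == y)
\end{verbatim}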
+
+%%\noindent{\bf See Also}
+%%
+%%In case of multi-class classification problems (y with domain size greater 
than 2), 
+%%please consider using a multi-class classifier learning algorithm, e.g., 
multi-class
+%%support vector machines (see Section \ref{msvm}). To model the relationship 
between 
+%%a scalar dependent variable y and one or more explanatory variables X, 
consider 
+%%Linear Regression instead (see Section \ref{linreg-solver} or Section 
+%%\ref{linreg-iterative}).
+%%\\
+%%
+\noindent{\bf Examples}
+
+\begin{verbatim}
+hadoop jar SystemML.jar -f l2-svm.dml -nvargs X=/user/biadmin/X.mtx 
+                                              Y=/user/biadmin/y.mtx 
+                                              icpt=0 tol=0.001 fmt=csv
+                                              reg=1.0 maxiter=100 
+                                              model=/user/biadmin/weights.csv
+                                              Log=/user/biadmin/Log.csv
+\end{verbatim}
+
+\begin{verbatim}
+hadoop jar SystemML.jar -f l2-svm-predict.dml -nvargs X=/user/biadmin/X.mtx 
+                                                      Y=/user/biadmin/y.mtx 
+                                                      icpt=0 fmt=csv
+                                                      
model=/user/biadmin/weights.csv
+                                                      
scores=/user/biadmin/scores.csv
+                                                      
accuracy=/user/biadmin/accuracy.csv
+                                                      
confusion=/user/biadmin/confusion.csv
+\end{verbatim}
+
+\noindent{\bf References}
+
+\begin{itemize}
+\item W. T. Vetterling and B. P. Flannery. \newblock{\em Conjugate Gradient
+Methods in Multidimensions, in Numerical Recipes in C: The Art of Scientific
+Computing}. \newblock W. H. Press and S. A. Teukolsky (eds.), Cambridge
+University Press, 1992.
+\item J. Nocedal and  S. J. Wright. Numerical Optimization, Springer-Verlag, 
1999.
+\item C-J Hsieh, K-W Chang, C-J Lin, S. S. Keerthi and S. Sundararajan. 
\newblock{\em A Dual Coordinate 
+Descent Method for Large-scale Linear SVM.} \newblock International Conference 
of Machine Learning
+(ICML), 2008.
+\item Olivier Chapelle. \newblock{\em Training a Support Vector Machine in the 
Primal}. \newblock Neural 
+Computation, 2007.
+\end{itemize}
+

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/358cfc9f/alg-ref/Cox.tex
----------------------------------------------------------------------
diff --git a/alg-ref/Cox.tex b/alg-ref/Cox.tex
new file mode 100644
index 0000000..a355df7
--- /dev/null
+++ b/alg-ref/Cox.tex
@@ -0,0 +1,340 @@
+\begin{comment}
+
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied.  See the License for the
+ specific language governing permissions and limitations
+ under the License.
+
+\end{comment}
+
+\subsection{Cox Proportional Hazard Regression Model}
+
+\noindent{\bf Description}
+\smallskip
+
+
+The Cox proportional hazard (PH) regression model is a semi-parametric statistical approach commonly used for analyzing survival data.
+Unlike non-parametric approaches, e.g., the Kaplan-Meier estimates (Section \ref{sec:kaplan-meier}), which can be used to analyze a single sample of survival data or to compare between groups of survival times, the Cox PH model captures the dependency of the survival times on the values of {\it explanatory variables} (i.e., covariates) recorded for each individual at the time origin. Our focus is on covariates that do not change value over time, i.e., time-independent covariates, and that may be categorical (ordinal or nominal) as well as continuous-valued. \\
+
+
+\smallskip
+\noindent{\bf Usage}
+\smallskip
+
+{\hangindent=\parindent\noindent\it%
+{\tt{}-f }path/\/{\tt{}Cox.dml}
+{\tt{} -nvargs}
+{\tt{} X=}path/file
+{\tt{} TE=}path/file
+{\tt{} F=}path/file
+{\tt{} R=}path/file
+{\tt{} M=}path/file
+{\tt{} S=}path/file
+{\tt{} T=}path/file
+{\tt{} COV=}path/file
+{\tt{} RT=}path/file
+{\tt{} XO=}path/file
+{\tt{} MF=}path/file
+{\tt{} alpha=}double
+{\tt{} fmt=}format
+
+}
+
+\smallskip
+\noindent{\bf Arguments --- Model Fitting/Prediction}
+\begin{Description}
+\item[{\tt X}:]
+Location (on HDFS) to read the input matrix of the survival data containing: 
+\begin{Itemize}
+       \item timestamps,
+       \item whether event occurred (1) or data is censored (0),
+       \item feature vectors
+\end{Itemize}
+\item[{\tt Y}:]
+Location (on HDFS) to read the matrix used for prediction
+\item[{\tt TE}:]
+Location (on HDFS) to read the 1-column matrix $TE$ that contains the column 
indices of the input matrix $X$ corresponding to timestamps (first entry) and 
event information (second entry)
+\item[{\tt F}:]
+Location (on HDFS) to read the 1-column matrix $F$ that contains the column 
indices of the input matrix $X$ corresponding to the features to be used for 
fitting the Cox model
+\item[{\tt R}:] (default:\mbox{ }{\tt " "})
+If factors (i.e., categorical features) are available in the input matrix $X$, 
location (on HDFS) to read matrix $R$ containing the start (first column) and 
end (second column) indices of each factor in $X$;
+alternatively, the user can specify the indices of the baseline level of each factor, which needs to be removed from $X$. If $R$ is not provided, by default all variables are considered to be continuous-valued.
+\item[{\tt M}:]                                                        
+Location (on HDFS) to store the results of Cox regression analysis including 
regression coefficients $\beta_j$s, their standard errors, confidence 
intervals, and $P$-values  
+\item[{\tt S}:] (default:\mbox{ }{\tt " "})
+Location (on HDFS) to store a summary of some statistics of the fitted model 
including number of records, number of events, log-likelihood, AIC, Rsquare 
(Cox \& Snell), and maximum possible Rsquare 
+\item[{\tt T}:] (default:\mbox{ }{\tt " "})
+Location (on HDFS) to store the results of Likelihood ratio test, Wald test, 
and Score (log-rank) test of the fitted model
+\item[{\tt COV}:]
+Location (on HDFS) to store the variance-covariance matrix of $\beta_j$s; note that parameter {\tt COV} needs to be provided as input for prediction.
+\item[{\tt RT}:]
+Location (on HDFS) to store matrix $RT$ containing the order-preserving recoded timestamps from $X$; note that parameter {\tt RT} needs to be provided as input for prediction.
+\item[{\tt XO}:]
+Location (on HDFS) to store the input matrix $X$ ordered by the timestamps; note that parameter {\tt XO} needs to be provided as input for prediction.
+\item[{\tt MF}:]
+Location (on HDFS) to store the column indices of $X$ excluding the baseline factors if available; note that parameter {\tt MF} needs to be provided as input for prediction.
+\item[{\tt P}] 
+Location (on HDFS) to store matrix $P$ containing the results of prediction
+\item[{\tt alpha}](default:\mbox{ }{\tt 0.05})
+Parameter to compute a $100(1-\alpha)\%$ confidence interval for $\beta_j$s 
+\item[{\tt tol}](default:\mbox{ }{\tt 0.000001})
+Tolerance (epsilon) used in the convergence criterion
+\item[{\tt moi}:] (default:\mbox{ }{\tt 100})
+Maximum number of outer (Fisher scoring) iterations
+\item[{\tt mii}:] (default:\mbox{ }{\tt 0})
+Maximum number of inner (conjugate gradient) iterations, or~0 if no maximum
+limit provided
+\item[{\tt fmt}:] (default:\mbox{ }{\tt "text"})
+Matrix file output format, such as {\tt text}, {\tt mm}, or {\tt csv};
+see read/write functions in SystemML Language Reference for details.
+\end{Description}
+
+
+ \smallskip
+ \noindent{\bf Usage: Cox Prediction}
+ \smallskip
+ 
+ {\hangindent=\parindent\noindent\it%
+       {\tt{}-f }path/\/{\tt{}Cox-predict.dml}
+       {\tt{} -nvargs}
+       {\tt{} X=}path/file
+       {\tt{} RT=}path/file
+       {\tt{} M=}path/file
+       {\tt{} Y=}path/file
+       {\tt{} COV=}path/file
+       {\tt{} MF=}path/file
+       {\tt{} P=}path/file
+       {\tt{} fmt=}format
+       
+ }\smallskip
+ 
+% \noindent{\bf Arguments --- Prediction}
+% \begin{Description}
+%      \item[{\tt X}:]
+%      Location (on HDFS) to read the input matrix of the survival data sorted 
by the timestamps including: 
+%      \begin{Itemize}
+%              \item timestamps,
+%              \item whether event occurred (1) or data is censored (0),
+%              \item feature vectors
+%      \end{Itemize}
+%      \item[{\tt RT}:]
+%      Location to read column matrix $RT$ containing the (order preserving) 
recoded timestamps from X (output by {\tt Cox.dml})
+%      \item[{\tt M}:]
+%      Location to read matrix $M$ containing the fitted Cox model (see below 
for the schema) 
+%      \item[{\tt Y}:]
+%      Location to the read matrix used for prediction    
+%      \item[{\tt COV}:] 
+%      Location to read the variance-covariance matrix of the regression 
coefficients (output by {\tt Cox.dml})
+%      \item[{\tt MF}] 
+%      Location to store column indices of $X$ excluding the baseline factors 
if available (output by {\tt Cox.dml})
+%      \item[{\tt P}] 
+%      Location to store matrix $P$ containing the results of prediction
+%      \item[{\tt fmt}:] (default:\mbox{ }{\tt "text"})
+%      Matrix file output format, such as {\tt text}, {\tt mm}, or {\tt csv}
+% \end{Description}
+ 
+
+
+\noindent{\bf Details}
+\smallskip
+
+ 
+In the Cox PH regression model the relationship between the hazard function---i.e., the probability of event occurrence at a given time---and the covariates is described as
+\begin{equation}
+h_i(t)=h_0(t)\exp\Bigl\{ \sum_{j=1}^{p} \beta_jx_{ij} \Bigr\}, \label{eq:coxph}
+\end{equation} 
+where the hazard function for the $i$th individual ($i\in\{1,2,\ldots,n\}$) 
depends on a set of $p$ covariates $x_i=(x_{i1},x_{i2},\ldots,x_{ip})$, whose 
importance is measured by the magnitude of the corresponding coefficients 
+$\beta=(\beta_1,\beta_2,\ldots,\beta_p)$. The term $h_0(t)$ is the baseline hazard and corresponds to the hazard value when all covariates equal 0.
+In the Cox PH model the hazard functions of the individuals may vary over time; however, the baseline hazard is estimated non-parametrically and can take any form.
+Note that re-writing~(\ref{eq:coxph}) we have 
+\begin{equation*}
+\log\biggl\{ \frac{h_i(t)}{h_0(t)} \biggr\} = \sum_{j=1}^{p} \beta_jx_{ij}.
+\end{equation*}
+Thus, the Cox PH model is essentially a linear model for the logarithm of the 
hazard ratio and the hazard of event for any individual is a constant multiple 
of the hazard of any other. 
+%Consequently, the Cox model is a proportional hazard model.
+We follow similar notation and methodology as 
in~\cite[Sec.~3]{collett2003:kaplanmeier}.
+For completeness we briefly discuss the equations used in our implementation.
+
+
+\textbf{Factors in the model.} 
+Note that if some of the feature variables are factors they need to be {\it dummy coded} as follows.
+Let $\alpha$ be such a variable (i.e., a factor) with $a$ levels.
+We introduce $a-1$ indicator (or dummy coded) variables $X_2,X_3,\ldots,X_a$ with $X_j=1$ if $\alpha=j$ and 0 otherwise, for $j\in\{ 2,3,\ldots,a\}$.
+In particular, one of the $a$ levels of $\alpha$ is considered the baseline and is not included in the model.
+In our implementation, the user can specify a baseline level for each factor (as the choice of baseline level for each factor is arbitrary).
+On the other hand, if for a given factor $\alpha$ no baseline is specified by the user, the most frequent level of $\alpha$ will be considered as the baseline.
+
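The dummy coding described above can be sketched as follows (illustrative editorial
Python; the actual handling of factors and baselines is done inside Cox.dml):
\begin{verbatim}
import numpy as np

def dummy_code(factor, baseline=None):
    levels = np.unique(factor)
    if baseline is None:               # default: most frequent level
        counts = [(factor == l).sum() for l in levels]
        baseline = levels[int(np.argmax(counts))]
    keep = [l for l in levels if l != baseline]
    return np.column_stack([(factor == l).astype(float) for l in keep])
\end{verbatim}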
+
+\textbf{Fitting the model.}
+We estimate the coefficients of the Cox model by minimizing the negative log-likelihood.
+In particular, the Cox PH model is fitted using a trust-region Newton method with conjugate gradient~\cite{Nocedal2006:Optimization}.
+%The likelihood for the PH hazard model is given by
+%\begin{equation*}
+%\prod_{i=1}^{n} {\Bigg\{ \frac{\exp(\vec{\beta}^\top\vec{x_i})}{\sum_{l\in 
%R(t_i)\exp(\vec{\beta}\vec{x}_l)}} \Biggr\}}^\delta_i,
+%\end{equation*}
+%where $\delta_i$ is an event indicator, which is 0 if the $i$th survival time 
is censored or 1 otherwise, and $R(t_i)$ is the risk set defined as the set of 
individuals who die at time $t_i$ or later.
+Define the risk set $R(t_j)$ at time $t_j$ to be the set of individuals who die at time $t_j$ or later.
+The PH model assumes that survival times are distinct. In order to handle tied 
observations
+we use the \emph{Breslow} approximation of the likelihood function
+\begin{equation*}
+\mathcal{L}=\prod_{j=1}^{r} \frac{\exp(\beta^\top s_j)}{{\bigg\{ \sum_{l\in 
R(t_j)} \exp(\beta^\top x_l) \biggr\}}^{d_j}},
+\end{equation*}
+where $d_j$ is the number of individuals who die at time $t_j$ and $s_j$ denotes the
+element-wise sum of the covariates for those individuals who die at time $t_j$,
$j=1,2,\ldots,r$, i.e.,
+the $h$th element of $s_j$ is given by $s_{hj}=\sum_{k=1}^{d_j}x_{hjk}$, where 
$x_{hjk}$ is the value of $h$th variable ($h\in \{1,2,\ldots,p\}$) for the 
$k$th of the $d_j$ individuals ($k\in\{ 1,2,\ldots,d_j \}$) who die at the 
$j$th death time ($j\in\{ 1,2,\ldots,r \}$).  
+
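The negative logarithm of the Breslow-approximated likelihood above can be written
compactly as follows (an illustrative editorial NumPy sketch; Cox.dml minimizes it
with the trust-region Newton method, which is not shown here):
\begin{verbatim}
import numpy as np

def neg_log_breslow(beta, X, time, event):
    # X: n x p covariates, time: survival times, event: 1=event, 0=censored
    eta = X @ beta
    nll = 0.0
    for t in np.unique(time[event == 1]):    # distinct death times t_j
        died = (time == t) & (event == 1)
        at_risk = time >= t                  # risk set R(t_j)
        d_j = died.sum()
        s_j = X[died].sum(axis=0)            # element-wise covariate sum
        nll -= s_j @ beta - d_j * np.logaddexp.reduce(eta[at_risk])
    return nll
\end{verbatim}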
+\textbf{Standard error and confidence interval for coefficients.}
+Note that the variance-covariance matrix of the estimated coefficients 
$\hat{\beta}$ can be approximated by the inverse of the Hessian evaluated at 
$\hat{\beta}$. The square root of the diagonal elements of this matrix are the 
standard errors of estimated coefficients.  
+Once the standard errors of the coefficients $se(\hat{\beta})$ are obtained we can compute a $100(1-\alpha)\%$ confidence interval using $\hat{\beta}\pm z_{\alpha/2}se(\hat{\beta})$, where $z_{\alpha/2}$ is the upper $\alpha/2$-point of the standard normal distribution.
+In {\tt Cox.dml}, we utilize the built-in function {\tt inv()} to compute the inverse of the Hessian. Note that this built-in function can be used only if the Hessian fits in the main memory of a single machine.
+
+
+\textbf{Wald test, likelihood ratio test, and log-rank test.}
+In order to test the {\it null hypothesis} that all of the coefficients $\beta_j$s are 0, our implementation provides three statistical tests: the {\it Wald test}, the {\it likelihood ratio test}, and the {\it log-rank test} (also known as the {\it score test}).
+Let $p$ be the number of coefficients.
+The Wald test is based on the test statistic 
${\hat{\beta}}^2/{se(\hat{\beta})}^2$, which is compared to percentage points 
of the Chi-squared distribution to obtain the $P$-value.
+The likelihood ratio test relies on the test statistic $-2\log\{ 
{L}(\textbf{0})/{L}(\hat{\beta}) \}$ ($\textbf{0}$ denotes a zero vector of 
size $p$ ) which has an approximate Chi-squared distribution with $p$ degrees 
of freedom under the null hypothesis that all $\beta_j$s are 0.
+The Log-rank test is based on the test statistic 
+$l=\nabla^\top L(\textbf{0}) {\mathcal{H}}^{-1}(\textbf{0}) \nabla 
L(\textbf{0})$, 
+where $\nabla L(\textbf{0})$ is the gradient of $L$ and 
$\mathcal{H}(\textbf{0})$ is the Hessian of $L$ evaluated at \textbf{0}. Under 
the null hypothesis that $\beta=\textbf{0}$, $l$ has a Chi-squared distribution 
on $p$ degrees of freedom.
+
+
+% Scoring
+\textbf{Prediction.}
+Once the parameters of the model are fitted, we compute the following 
predictions together with their standard errors
+\begin{itemize}
+       \item linear predictors,
+       \item risk, and
+       \item estimated cumulative hazard. 
+\end{itemize}
+Given feature vector $X_i$ for individual $i$, we obtain the above predictions 
at time $t$ as follows.
+The linear predictors (denoted as $\mathcal{LP}$) as well as the risk (denoted 
as $\mathcal{R}$) are computed relative to a baseline whose feature values are 
the mean of the values in the corresponding features.
+Let $X_i^\text{rel} = X_i - \mu$, where $\mu$ is a row vector that contains 
the mean values for each feature.  
+We have  $\mathcal{LP}=X_i^\text{rel} \hat{\beta}$ and $\mathcal{R}=\exp\{ 
X_i^\text{rel}\hat{\beta} \}$.
+The standard errors of the linear predictors $se\{\mathcal{LP} \}$ are 
computed as the square root of ${(X_i^\text{rel})}^\top V(\hat{\beta}) 
X_i^\text{rel}$ and the standard error of the risk $se\{ \mathcal{R} \}$ are 
given by the square root of 
+${(X_i^\text{rel} \odot \mathcal{R})}^\top V(\hat{\beta}) (X_i^\text{rel} 
\odot \mathcal{R})$, where $V(\hat{\beta})$ is the variance-covariance matrix 
of the coefficients and $\odot$ is the element-wise multiplication.     
+
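The linear predictors and risk above reduce to a few lines (an illustrative
editorial sketch; the standard errors involve the variance-covariance matrix as
described above and are omitted here):
\begin{verbatim}
import numpy as np

def linear_predictor_and_risk(X, beta_hat):
    mu = X.mean(axis=0)                # baseline: per-feature means
    lp = (X - mu) @ beta_hat           # linear predictors
    return lp, np.exp(lp)              # risk = exp(LP)
\end{verbatim}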
+We estimate the cumulative hazard function for individual $i$ by
+\begin{equation*}
+\hat{H}_i(t) = \exp(\hat{\beta}^\top X_i) \hat{H}_0(t), 
+\end{equation*}
+where $\hat{H}_0(t)$ is the \emph{Breslow estimate} of the cumulative baseline 
hazard given by
+\begin{equation*}
+\hat{H}_0(t) = \sum_{j=1}^{k} \frac{d_j}{\sum_{l\in R(t_{(j)})} 
\exp(\hat{\beta}^\top X_l)}.
+\end{equation*}
+In the equation above, as before, $d_j$ is the number of deaths, and 
$R(t_{(j)})$ is the risk set at time $t_{(j)}$, for $t_{(k)} \leq t \leq 
t_{(k+1)}$, $k=1,2,\ldots,r-1$.
+The standard error of $\hat{H}_i(t)$ is obtained using the estimation
+\begin{equation*}
+se\{ \hat{H}_i(t) \} = \sum_{j=1}^{k} \frac{d_j}{ {\left[ \sum_{l\in 
R(t_{(j)})} \exp(X_l\hat{\beta}) \right]}^2 } + J_i^\top(t) V(\hat{\beta}) 
J_i(t),
+\end{equation*}
+where 
+\begin{equation*}
+J_i(t) = \sum_{j=1}^{k} d_j \frac{\sum_{l\in R(t_{(j)})} (X_l-X_i)\exp \{ 
(X_l-X_i)\hat{\beta} \}}{ {\left[ \sum_{l\in R(t_{(j)})} 
\exp\{(X_l-X_i)\hat{\beta}\} \right]}^2  },
+\end{equation*}
+for $t_{(k)} \leq t \leq t_{(k+1)}$, $k=1,2,\ldots,r-1$. 
+
+
+\smallskip
+\noindent{\bf Returns}
+\smallskip
+
+  
+Below we list the results of fitting a Cox regression model stored in matrix
+{\tt M} with the following schema:
+\begin{itemize}
+       \item Column 1: estimated regression coefficients $\hat{\beta}$
+       \item Column 2: $\exp(\hat{\beta})$
+       \item Column 3: standard error of the estimated coefficients 
$se\{\hat{\beta}\}$
+       \item Column 4: ratio of $\hat{\beta}$ to $se\{\hat{\beta}\}$ denoted 
by $Z$  
+       \item Column 5: $P$-value of $Z$ 
+       \item Column 6: lower bound of $100(1-\alpha)\%$ confidence interval 
for $\hat{\beta}$
+       \item Column 7: upper bound of $100(1-\alpha)\%$ confidence interval 
for $\hat{\beta}$.
+\end{itemize}
+Note that above $Z$ is the Wald test statistic which is asymptotically 
standard normal under the hypothesis that $\beta=\textbf{0}$.
+
+Moreover, {\tt Cox.dml} outputs two log files {\tt S} and {\tt T} containing summary statistics of the fitted model as follows.
+File {\tt S} stores the following information 
+\begin{itemize}
+       \item Line 1: total number of observations
+       \item Line 2: total number of events
+       \item Line 3: log-likelihood (of the fitted model)
+       \item Line 4: AIC
+       \item Line 5: Cox \& Snell Rsquare
+       \item Line 6: maximum possible Rsquare. 
+\end{itemize}
+Above, the AIC is computed as in (\ref{eq:AIC}),
+the Cox \& Snell Rsquare is equal to $1-\exp\{ -l/n \}$, where $l$ is the log-rank test statistic as discussed above and $n$ is the total number of observations,
+and the maximum possible Rsquare is computed as $1-\exp\{ -2 L(\textbf{0})/n \}$, where $L(\textbf{0})$ denotes the initial likelihood.
+
+
+File {\tt T} contains the following information
+\begin{itemize}
+       \item Line 1: Likelihood ratio test statistic, degree of freedom of the 
corresponding Chi-squared distribution, $P$-value
+       \item Line 2: Wald test statistic, degree of freedom of the 
corresponding Chi-squared distribution, $P$-value
+       \item Line 3: Score (log-rank) test statistic, degree of freedom of the 
corresponding Chi-squared distribution, $P$-value.
+\end{itemize}
+
+Additionally, the following matrices will be stored. Note that these matrices 
are required for prediction.
+\begin{itemize}
+        \item Order-preserving recoded timestamps $RT$, i.e., contiguously 
numbered from 1 $\ldots$ \#timestamps
+        \item Feature matrix ordered by the timestamps $XO$
+        \item Variance-covariance matrix of the coefficients $COV$
+        \item Column indices of the feature matrix with baseline factors 
removed (if available) $MF$.  
+\end{itemize}
+
+
+\textbf{Prediction.}
+Finally, the results of prediction are stored in matrix $P$ with the following schema:
+\begin{itemize}
+       \item Column 1: linear predictors
+       \item Column 2: standard error of the linear predictors
+       \item Column 3: risk
+       \item Column 4: standard error of the risk
+       \item Column 5: estimated cumulative hazard
+       \item Column 6: standard error of the estimated cumulative hazard.
+\end{itemize}
+
+
+
+
+\smallskip
+\noindent{\bf Examples}
+\smallskip
+
+{\hangindent=\parindent\noindent\tt
+       \hml -f Cox.dml -nvargs X=/user/biadmin/X.mtx TE=/user/biadmin/TE
+       F=/user/biadmin/F R=/user/biadmin/R M=/user/biadmin/model.csv
+       T=/user/biadmin/test.csv COV=/user/biadmin/var-covar.csv 
XO=/user/biadmin/X-sorted.mtx fmt=csv
+       
+}\smallskip
+
+{\hangindent=\parindent\noindent\tt
+       \hml -f Cox.dml -nvargs X=/user/biadmin/X.mtx TE=/user/biadmin/TE
+       F=/user/biadmin/F R=/user/biadmin/R M=/user/biadmin/model.csv
+       T=/user/biadmin/test.csv COV=/user/biadmin/var-covar.csv 
+       RT=/user/biadmin/recoded-timestamps.csv XO=/user/biadmin/X-sorted.csv 
+       MF=/user/biadmin/baseline.csv alpha=0.01 tol=0.000001 moi=100 mii=20 
fmt=csv
+       
+}\smallskip
+
+\noindent To compute predictions:
+
+{\hangindent=\parindent\noindent\tt
+       \hml -f Cox-predict.dml -nvargs X=/user/biadmin/X-sorted.mtx 
+       RT=/user/biadmin/recoded-timestamps.csv
+       M=/user/biadmin/model.csv Y=/user/biadmin/Y.mtx 
COV=/user/biadmin/var-covar.csv 
+       MF=/user/biadmin/baseline.csv P=/user/biadmin/predictions.csv fmt=csv
+       
+}
+
+

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/358cfc9f/alg-ref/DecisionTrees.tex
----------------------------------------------------------------------
diff --git a/alg-ref/DecisionTrees.tex b/alg-ref/DecisionTrees.tex
new file mode 100644
index 0000000..cea26a4
--- /dev/null
+++ b/alg-ref/DecisionTrees.tex
@@ -0,0 +1,312 @@
+\begin{comment}
+
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied.  See the License for the
+ specific language governing permissions and limitations
+ under the License.
+
+\end{comment}
+
+\subsection{Decision Trees}
+\label{sec:decision_trees}
+
+\noindent{\bf Description}
+\smallskip
+
+
+A decision tree (for classification) is a classifier that is considered
+more interpretable than other statistical classifiers. This implementation
+is well suited to large-scale data and builds a (binary) decision
+tree in parallel.\\
+
+\smallskip
+\noindent{\bf Usage}
+\smallskip
+
+{\hangindent=\parindent\noindent\it%
+       {\tt{}-f }path/\/{\tt{}decision-tree.dml}
+       {\tt{} -nvargs}
+       {\tt{} X=}path/file
+       {\tt{} Y=}path/file
+       {\tt{} R=}path/file
+       {\tt{} bins=}integer
+       {\tt{} depth=}integer
+       {\tt{} num\_leaf=}integer
+       {\tt{} num\_samples=}integer
+       {\tt{} impurity=}Gini$\mid$entropy
+       {\tt{} M=}path/file
+       {\tt{} O=}path/file
+       {\tt{} S\_map=}path/file
+       {\tt{} C\_map=}path/file
+       {\tt{} fmt=}format
+       
+}
+
+ \smallskip
+ \noindent{\bf Usage: Prediction}
+ \smallskip
+ 
+ {\hangindent=\parindent\noindent\it%
+       {\tt{}-f }path/\/{\tt{}decision-tree-predict.dml}
+       {\tt{} -nvargs}
+       {\tt{} X=}path/file
+       {\tt{} Y=}path/file
+       {\tt{} R=}path/file
+       {\tt{} M=}path/file
+       {\tt{} P=}path/file
+       {\tt{} A=}path/file
+       {\tt{} CM=}path/file
+       {\tt{} fmt=}format
+       
+ }\smallskip
+ 
+ 
+\noindent{\bf Arguments}
+\begin{Description}
+       \item[{\tt X}:]
+       Location (on HDFS) to read the matrix of feature vectors; 
+       each row constitutes one feature vector. Note that categorical features 
in $X$ need to be both recoded and dummy coded.
+       \item[{\tt Y}:]
+       Location (on HDFS) to read the matrix of (categorical) 
+       labels that correspond to feature vectors in $X$. Note that class 
labels are assumed to be both recoded and dummy coded. 
+       This argument is optional for prediction. 
+       \item[{\tt R}:] (default:\mbox{ }{\tt " "})
+       Location (on HDFS) to read matrix $R$ which for each feature in $X$ 
contains column-ids (first column), start indices (second column), and end 
indices (third column).
+       If $R$ is not provided, all features are by default assumed to be continuous-valued.
+       \item[{\tt bins}:] (default:\mbox{ }{\tt 20})
+       Number of thresholds to choose for each continuous-valued feature 
(determined by equi-height binning). 
+       \item[{\tt depth}:] (default:\mbox{ }{\tt 25})
+       Maximum depth of the learned tree
+       \item[{\tt num\_leaf}:] (default:\mbox{ }{\tt 10})
+       Parameter that controls pruning. The tree
+       is not expanded if a node receives less than {\tt num\_leaf} training 
examples.
+       \item[{\tt num\_samples}:] (default:\mbox{ }{\tt 3000})
+       Parameter that decides when to switch to in-memory building of 
subtrees. If a node $v$ receives less than {\tt num\_samples}
+       training examples then this implementation switches to an in-memory 
subtree
+       building procedure to build the subtree under $v$ in its entirety.
+       \item[{\tt impurity}:] (default:\mbox{ }{\tt "Gini"})
+       Impurity measure used at internal nodes of the tree for selecting which feature to split on. Possible values are {\tt entropy} or {\tt Gini}.
+       \item[{\tt M}:] 
+       Location (on HDFS) to write matrix $M$ containing the learned decision 
tree (see below for the schema) 
+       \item[{\tt O}:] (default:\mbox{ }{\tt " "})
+       Location (on HDFS) to store the training accuracy (\%). Note that this 
argument is optional.
+       \item[{\tt A}:] (default:\mbox{ }{\tt " "})
+       Location (on HDFS) to store the testing accuracy (\%) from a 
+       held-out test set during prediction. Note that this argument is 
optional.
+       \item[{\tt P}:] 
+       Location (on HDFS) to store predictions for a held-out test set
+       \item[{\tt CM}:] (default:\mbox{ }{\tt " "})
+       Location (on HDFS) to store the confusion matrix computed using a 
held-out test set. Note that this argument is optional.
+       \item[{\tt S\_map}:] (default:\mbox{ }{\tt " "})
+       Location (on HDFS) to write the mappings from the continuous-valued 
feature-ids to the global feature-ids in $X$ (see below for details). Note that 
this argument is optional.
+       \item[{\tt C\_map}:] (default:\mbox{ }{\tt " "})
+       Location (on HDFS) to write the mappings from the categorical 
feature-ids to the global feature-ids in $X$ (see below for details). Note that 
this argument is optional.
+       \item[{\tt fmt}:] (default:\mbox{ }{\tt "text"})
+       Matrix file output format, such as {\tt text}, {\tt mm}, or {\tt csv};
+       see read/write functions in SystemML Language Reference for details.
+\end{Description}
+
+
+ \noindent{\bf Details}
+ \smallskip
+
+ 
+Decision trees~\cite{BreimanFOS84:dtree} are simple models of
+classification that, due to their structure, are easy to
+interpret. Given an example feature vector, each node in the learned
+tree runs a simple test on it. Based on the result of the test, the
+example is diverted either to the left subtree or to the right
+subtree. Once the example reaches a leaf, the label stored at the
+leaf is returned as the prediction for the example.
+
+
+Building a decision tree from a fully labeled training set entails
+choosing an appropriate splitting test for each internal node in the tree, and this is usually performed in a top-down manner.
+The splitting test (denoted by $s$) requires
+first choosing a feature $j$ and, depending on the type of $j$, either
+a threshold $\sigma$, in case $j$ is continuous-valued, or a subset of
+values $S \subseteq \text{Dom}(j)$, where $\text{Dom}(j)$ denotes the
+domain of $j$, in case it is categorical. For continuous-valued
+features the test is thus of the form $x_j < \sigma$, and for categorical
+features it is of the form $x_j \in S$, where $x_j$ denotes the $j$th
+feature value of feature vector $x$. One way to determine which test
+to include is to compare the impurities of the tree nodes induced by the test.
+The {\it node impurity} measures the homogeneity of the labels at the node. 
This implementation supports two commonly used impurity measures (denoted by 
$\mathcal{I}$): {\it Entropy} $\mathcal{E}=\sum_{i=1}^{C}-f_i \log f_i$, as 
well as {\it Gini impurity} $\mathcal{G}=\sum_{i=1}^{C}f_i (1-f_i)$, where $C$ 
denotes the number of unique labels and $f_i$ is the frequency of label $i$.
+Once the impurities at the tree nodes have been obtained, the {\it best split} is chosen as the split, among the set of possible splits, that maximizes the {\it information gain} at the node, i.e., $\arg\max_{s}\mathcal{IG}(X,s)$, where $\mathcal{IG}(X,s)$ denotes the information gain when the splitting test $s$ partitions the feature matrix $X$.
+Assuming that $s$ partitions $X$, which contains $N$ feature vectors, into $X_\text{left}$ and $X_\text{right}$ containing $N_\text{left}$ and $N_\text{right}$ feature vectors, respectively, $\mathcal{IG}(X,s)$ is given by
+\begin{equation*}
+\mathcal{IG}(X,s)=\mathcal{I}(X)-\frac{N_\text{left}}{N}\mathcal{I}(X_\text{left})-\frac{N_\text{right}}{N}\mathcal{I}(X_\text{right}),
+\end{equation*}
+where $\mathcal{I}\in\{\mathcal{E},\mathcal{G}\}$.
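+As a small worked example with made-up counts: suppose a node holds $N=100$ examples of two classes with label frequencies $f_1=0.6$ and $f_2=0.4$, so that $\mathcal{G}(X)=0.6\cdot 0.4+0.4\cdot 0.6=0.48$. If a candidate split $s$ sends $N_\text{left}=50$ examples with label frequencies $(0.9,0.1)$ to the left and $N_\text{right}=50$ examples with label frequencies $(0.3,0.7)$ to the right, then $\mathcal{G}(X_\text{left})=0.18$, $\mathcal{G}(X_\text{right})=0.42$, and
+\begin{equation*}
+\mathcal{IG}(X,s)=0.48-\tfrac{50}{100}\cdot 0.18-\tfrac{50}{100}\cdot 0.42=0.18.
+\end{equation*}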
+In the following we discuss the implementation details specific to {\tt 
decision-tree.dml}. 
+
+
+\textbf{Input format.} 
+In general, implementations of the decision tree algorithm do not require categorical features to be dummy coded. For improved efficiency and reduced training time, however, our implementation assumes dummy-coded categorical features and dummy-coded class labels.
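+For instance (a purely illustrative example), a categorical feature with domain $\{${\tt red}, {\tt green}, {\tt blue}$\}$ would first be recoded to the values $\{1,2,3\}$ and then dummy coded into three binary columns, so that the value {\tt green} is represented by the row $(0,1,0)$; class labels are prepared analogously.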
+
+
+\textbf{Tree construction.}
+Learning a decision tree on large-scale data has received some
+attention in the literature. The current implementation includes logic
+for choosing tests for multiple nodes that belong to the same level in
+the decision tree in parallel (breadth-first expansion) and for
+building entire subtrees under multiple nodes in parallel (depth-first
+subtree building). Empirically it has been demonstrated that it is
+advantageous to perform breadth-first expansion for the nodes
+belonging to the top levels of the tree and to perform depth-first
+subtree building for nodes belonging to the lower levels of the 
tree~\cite{PandaHBB09:dtree}. The parameter {\tt num\_samples} controls when we
+switch to depth-first subtree building. For any node in the decision tree
+that receives $\leq$ {\tt num\_samples} training examples, the subtree
+under it is built in its entirety in one shot.
+
+
+\textbf{Stopping rule and pruning.} 
+The splitting of data at the internal nodes stops when at least one of the following criteria is satisfied:
+\begin{itemize}
+       \item the depth of the internal node reaches the input parameter {\tt 
depth} controlling the maximum depth of the learned tree, or
+       \item no candidate split achieves information gain.
+\end{itemize}
+This implementation also allows for some automated pruning via the argument 
{\tt num\_leaf}. If
+a node receives $\leq$ {\tt num\_leaf} training examples, then a leaf
+is built in its place.
+
+
+\textbf{Continuous-valued features.}
+For a continuous-valued feature
+$j$ the number of candidate thresholds $\sigma$ to choose from is of
+the order of the number of examples present in the training set. Since
+for large-scale data this can result in a large number of candidate
+thresholds, the user can limit this number via the argument {\tt bins}, which controls the number of candidate thresholds considered
+for each continuous-valued feature. For each continuous-valued
+feature, the implementation computes an equi-height histogram to
+generate one candidate threshold per equi-height bin.
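+For intuition only, the following Python/NumPy sketch (not the DML implementation; the exact binning strategy of {\tt decision-tree.dml} may differ in detail) shows how equi-height bin boundaries yield candidate thresholds:
+\begin{verbatim}
+import numpy as np
+
+def candidate_thresholds(values, bins=20):
+    # boundaries of `bins` equal-frequency bins, i.e., the
+    # 1/bins, 2/bins, ..., (bins-1)/bins empirical quantiles
+    qs = np.linspace(0.0, 1.0, bins + 1)[1:-1]
+    return np.unique(np.quantile(values, qs))
+
+x = np.random.default_rng(0).normal(size=1000)
+print(candidate_thresholds(x, bins=20))  # at most 19 thresholds
+\end{verbatim}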
+
+
+\textbf{Categorical features.}
+In order to determine the best value subset to split on in the case of 
categorical features, this implementation greedily includes values from the 
feature's domain until the information gain stops improving.
+In particular, for a categorical feature $j$ the $|Dom(j)|$ feature values are sorted by impurity, and the resulting $|Dom(j)|-1$ split candidates are examined; the sequence of feature values which results in the maximum information gain is then selected.
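+The following Python sketch (illustrative only; it uses Gini impurity and a two-class frequency ordering, whereas the DML implementation works on the dummy-coded representation and may order values differently) outlines this search over value subsets:
+\begin{verbatim}
+import numpy as np
+
+def gini(labels):
+    _, counts = np.unique(labels, return_counts=True)
+    f = counts / counts.sum()
+    return float(np.sum(f * (1.0 - f)))
+
+def best_value_subset(cat, labels):
+    # sort category values by their class-1 frequency (a proxy for
+    # per-value impurity), then examine the |Dom(j)|-1 prefix splits
+    values = np.unique(cat)
+    order = sorted(values, key=lambda v: np.mean(labels[cat == v] == 1))
+    parent = gini(labels)
+    best_gain, best_subset = -np.inf, None
+    for k in range(1, len(order)):
+        subset = set(int(v) for v in order[:k])
+        left = np.isin(cat, list(subset))
+        gain = (parent - left.mean() * gini(labels[left])
+                       - (~left).mean() * gini(labels[~left]))
+        if gain > best_gain:
+            best_gain, best_subset = gain, subset
+    return best_subset, best_gain
+
+cat    = np.array([1, 1, 2, 2, 3, 3, 3, 4])
+labels = np.array([1, 1, 1, 2, 2, 2, 2, 2])
+print(best_value_subset(cat, labels))   # ({3, 4}, 0.28125)
+\end{verbatim}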
+
+
+\textbf{Description of the model.} 
+The learned decision tree is represented in a matrix $M$ that
+contains at least 6 rows. Each column in the matrix contains the parameters 
relevant to a single node in the tree. 
+Note that for building the tree model, our implementation splits the feature 
matrix $X$ into $X_\text{cont}$ containing continuous-valued features and 
$X_\text{cat}$ containing categorical features. In the following, the 
continuous-valued (resp. categorical) feature-ids correspond to the indices of 
the features in $X_\text{cont}$ (resp. $X_\text{cat}$). 
+Moreover, we refer to an internal node as a continuous-valued (categorical) node if the feature that this node looks at is continuous-valued (categorical).
+Below is a description of what each row in the matrix contains.
+\begin{itemize}
+\item Row 1: stores the node-ids. These ids correspond to the node-ids in a 
complete binary tree.
+\item Row 2: for internal nodes, stores the offset (in number of columns) in $M$ to the left child; otherwise 0.
+\item Row 3: stores the feature index of the feature (id of a 
continuous-valued feature in $X_\text{cont}$ if the feature is 
continuous-valued or id of a categorical feature in $X_\text{cat}$ if the 
feature is categorical) that this node looks at if the node is an internal 
node, otherwise 0. 
+\item Row 4: stores the type of the feature that this node looks at if the node is an internal node: 1 for continuous-valued and 2 for categorical features;
+otherwise, the label this leaf node is supposed to predict.
+\item Row 5: for the internal nodes, contains 1 if the feature chosen for the node is continuous-valued, or the size of the subset of values used for splitting at the node (stored in rows 6,7,$\ldots$) if the feature chosen for the node is categorical. For the leaf nodes, Row 5 contains the number of misclassified training examples reaching this node.
+\item Rows 6,7,$\ldots$: for the internal nodes, row 6 stores the threshold to which the example's feature value is compared if the feature chosen for this node is continuous-valued; otherwise, if the feature chosen for this node is categorical, rows 6,7,$\ldots$ store the value subset chosen for the node.
+For the leaf nodes, row 6 contains 1 if the node is impure and the number of training examples at the node is greater than {\tt num\_leaf}, otherwise 0.
+\end{itemize}
+As an example, Figure~\ref{dtree} shows a decision tree with $5$ nodes and its 
matrix
+representation.
+
+\begin{figure}
+\begin{minipage}{0.3\linewidth}
+\begin{center}
+\begin{tikzpicture}
+\node (labelleft) [draw,shape=circle,minimum size=16pt] at (2,0) {$2$};
+\node (labelright) [draw,shape=circle,minimum size=16pt] at (3.25,0) {$1$};
+
+\node (rootleft) [draw,shape=rectangle,minimum size=16pt] at (2.5,1) {$x_5 \in 
\{2,3\}$};
+\node (rootlabel) [draw,shape=circle,minimum size=16pt] at (0.9,1) {$1$};
+\node (root) [draw,shape=rectangle,minimum size=16pt] at (1.75,2) {$x_3 < 
0.45$};
+
+\draw[-latex] (root) -- (rootleft);
+\draw[-latex] (root) -- (rootlabel);
+\draw[-latex] (rootleft) -- (labelleft);
+\draw[-latex] (rootleft) -- (labelright);
+
+\end{tikzpicture}
+\end{center}
+\begin{center}
+(a)
+\end{center}
+\end{minipage}
+\hfill
+\begin{minipage}{0.65\linewidth}
+\begin{center}
+\begin{tabular}{c|c|c|c|c|c|}
+& Col 1 & Col 2 & Col 3 & Col 4 & Col 5\\
+\hline
+Row 1 & 1 & 2 & 3 & 6 & 7 \\
+\hline
+Row 2 & 1 & 0 & 1 & 0 & 0 \\
+\hline
+Row 3 & 3 & 5 & 0 & 0 & 0 \\
+\hline
+Row 4 & 1 & 1 & 2 & 2 & 1 \\
+\hline
+Row 5 & 1 & 0 & 2 & 0 & 0 \\
+\hline
+Row 6 & 0.45 & 0 & 2 & 0 & 0 \\
+\hline
+Row 7 &  &  & 3 &  & \\
+\hline
+\end{tabular}
+\end{center}
+\begin{center}
+(b)
+\end{center}
+\end{minipage}
+\caption{(a) An example tree and its (b) matrix representation. $x$ denotes an 
example and $x_j$ is the value of the $j$th continuous-valued (resp. 
categorical) feature in $X_\text{cont}$ (resp. $X_\text{cat}$). In this example 
all leaf nodes are pure and no training example is misclassified.}
+\label{dtree}
+\end{figure}
+
+
+\smallskip
+\noindent{\bf Returns}
+\smallskip
+
+
+The matrix corresponding to the learned model, as well as the training accuracy (if requested), is written to a file in the specified format. See the {\bf Details} section above, where the structure of the model matrix is described.
+Recall that in our implementation $X$ is split into $X_\text{cont}$ and 
$X_\text{cat}$. If requested, the mappings of the continuous-valued feature-ids 
in $X_\text{cont}$ (stored at {\tt S\_map}) and the categorical feature-ids in 
$X_\text{cat}$ (stored at {\tt C\_map}) to the global feature-ids in $X$ will 
be provided. 
+Depending on what arguments are provided during invocation, the {\tt decision-tree-predict.dml} script may compute one or more of the predictions, accuracy, and confusion matrix in the requested output format.
+
+\smallskip
+\noindent{\bf Examples}
+\smallskip
+
+{\hangindent=\parindent\noindent\tt
+       \hml -f decision-tree.dml -nvargs X=/user/biadmin/X.mtx 
Y=/user/biadmin/Y.mtx
+       R=/user/biadmin/R.csv M=/user/biadmin/model.csv
+       bins=20 depth=25 num\_leaf=10 num\_samples=3000 impurity=Gini fmt=csv
+       
+}\smallskip
+
+
+\noindent To compute predictions:
+
+{\hangindent=\parindent\noindent\tt
+       \hml -f decision-tree-predict.dml -nvargs X=/user/biadmin/X.mtx 
Y=/user/biadmin/Y.mtx R=/user/biadmin/R.csv
+       M=/user/biadmin/model.csv  P=/user/biadmin/predictions.csv
+       A=/user/biadmin/accuracy.csv CM=/user/biadmin/confusion.csv fmt=csv
+       
+}\smallskip
+
+
+%\noindent{\bf References}
+%
+%\begin{itemize}
+%\item B. Panda, J. Herbach, S. Basu, and R. Bayardo. \newblock{PLANET: 
massively parallel learning of tree ensembles with MapReduce}. In Proceedings 
of the VLDB Endowment, 2009.
+%\item L. Breiman, J. Friedman, R. Olshen, and C. Stone. 
\newblock{Classification and Regression Trees}. Wadsworth and Brooks, 1984.
+%\end{itemize}
