Added: systemml/site/docs/1.1.0/algorithms-descriptive-statistics.html URL: http://svn.apache.org/viewvc/systemml/site/docs/1.1.0/algorithms-descriptive-statistics.html?rev=1828046&view=auto ============================================================================== --- systemml/site/docs/1.1.0/algorithms-descriptive-statistics.html (added) +++ systemml/site/docs/1.1.0/algorithms-descriptive-statistics.html Fri Mar 30 04:31:05 2018 @@ -0,0 +1,1802 @@ +<!DOCTYPE html> +<!--[if lt IE 7]> <html class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]--> +<!--[if IE 7]> <html class="no-js lt-ie9 lt-ie8"> <![endif]--> +<!--[if IE 8]> <html class="no-js lt-ie9"> <![endif]--> +<!--[if gt IE 8]><!--> <html class="no-js"> <!--<![endif]--> + <head> + <title>SystemML Algorithms Reference - Descriptive Statistics - SystemML 1.1.0</title> + <meta charset="utf-8"> + <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1"> + + <meta name="viewport" content="width=device-width"> + <link rel="stylesheet" href="css/bootstrap.min.css"> + <link rel="stylesheet" href="css/main.css"> + <link rel="stylesheet" href="css/pygments-default.css"> + <link rel="shortcut icon" href="img/favicon.png"> + </head> + <body> + <!--[if lt IE 7]> + <p class="chromeframe">You are using an outdated browser. <a href="http://browsehappy.com/">Upgrade your browser today</a> or <a href="http://www.google.com/chromeframe/?redirect=true">install Google Chrome Frame</a> to better experience this site.</p> + <![endif]--> + + <header class="navbar navbar-default navbar-fixed-top" id="topbar"> + <div class="container"> + <div class="navbar-header"> + <div class="navbar-brand brand projectlogo"> + <a href="http://systemml.apache.org/"><img class="logo" src="img/systemml-logo.png" alt="Apache SystemML" title="Apache SystemML"/></a> + </div> + <div class="navbar-brand brand projecttitle"> + <a href="http://systemml.apache.org/">Apache SystemML<sup id="trademark">â¢</sup></a><br/> + <span class="version">1.1.0</span> + </div> + <button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target=".navbar-collapse"> + <span class="sr-only">Toggle navigation</span> + <span class="icon-bar"></span> + <span class="icon-bar"></span> + <span class="icon-bar"></span> + </button> + </div> + <nav class="navbar-collapse collapse"> + <ul class="nav navbar-nav navbar-right"> + <li><a href="index.html">Overview</a></li> + <li><a href="https://github.com/apache/systemml">GitHub</a></li> + <li class="dropdown"> + <a href="#" class="dropdown-toggle" data-toggle="dropdown">Documentation<b class="caret"></b></a> + <ul class="dropdown-menu" role="menu"> + <li><b>Running SystemML:</b></li> + <li><a href="https://github.com/apache/systemml">SystemML GitHub README</a></li> + <li><a href="spark-mlcontext-programming-guide.html">Spark MLContext</a></li> + <li><a href="spark-batch-mode.html">Spark Batch Mode</a> + <li><a href="hadoop-batch-mode.html">Hadoop Batch Mode</a> + <li><a href="standalone-guide.html">Standalone Guide</a></li> + <li><a href="jmlc.html">Java Machine Learning Connector (JMLC)</a> + <li class="divider"></li> + <li><b>Language Guides:</b></li> + <li><a href="dml-language-reference.html">DML Language Reference</a></li> + <li><a href="beginners-guide-to-dml-and-pydml.html">Beginner's Guide to DML and PyDML</a></li> + <li><a href="beginners-guide-python.html">Beginner's Guide for Python Users</a></li> + <li><a href="python-reference.html">Reference Guide for Python Users</a></li> + <li class="divider"></li> + <li><b>ML 
Algorithms:</b></li>
+                <li><a href="algorithms-reference.html">Algorithms Reference</a></li>
+                <li class="divider"></li>
+                <li><b>Tools:</b></li>
+                <li><a href="debugger-guide.html">Debugger Guide</a></li>
+                <li><a href="developer-tools-systemml.html">IDE Guide</a></li>
+                <li class="divider"></li>
+                <li><b>Other:</b></li>
+                <li><a href="contributing-to-systemml.html">Contributing to SystemML</a></li>
+                <li><a href="engine-dev-guide.html">Engine Developer Guide</a></li>
+                <li><a href="troubleshooting-guide.html">Troubleshooting Guide</a></li>
+                <li><a href="release-process.html">Release Process</a></li>
+              </ul>
+            </li>
+
+            <li class="dropdown">
+              <a href="#" class="dropdown-toggle" data-toggle="dropdown">API Docs<b class="caret"></b></a>
+              <ul class="dropdown-menu" role="menu">
+                <li><a href="./api/java/index.html">Java</a></li>
+                <li><a href="./api/python/index.html">Python</a></li>
+              </ul>
+            </li>
+
+            <li class="dropdown">
+              <a href="#" class="dropdown-toggle" data-toggle="dropdown">Issues<b class="caret"></b></a>
+              <ul class="dropdown-menu" role="menu">
+                <li><b>JIRA:</b></li>
+                <li><a href="https://issues.apache.org/jira/browse/SYSTEMML">SystemML JIRA</a></li>
+
+              </ul>
+            </li>
+          </ul>
+        </nav>
+      </div>
+    </header>
+
+    <div class="container" id="content">
+
+      <h1 class="title"><a href="algorithms-reference.html">SystemML Algorithms Reference</a></h1>
+
+      <!--
+
+-->
+
+<h1 id="descriptive-statistics">1. Descriptive Statistics</h1>
+
+<p>Descriptive statistics are used to quantitatively describe the main
+characteristics of the data. They provide meaningful summaries computed
+over different observations or data records collected in a study. These
+summaries typically form the basis of the initial data exploration as
+part of a more extensive statistical analysis. Such a quantitative
+analysis assumes that every variable (also known as an attribute, feature,
+or column) in the data has a specific <em>level of
+measurement</em> <a href="algorithms-bibliography.html">[Stevens1946]</a>.</p>
+
+<p>The measurement level of a variable, often called the <strong>variable
+type</strong>, can either be <em>scale</em> or <em>categorical</em>. A <em>scale</em>
+variable represents the data measured on an interval scale or ratio
+scale. Examples of scale variables include “Height”, “Weight”, “Salary”,
+and “Temperature”. Scale variables are also referred to as
+<em>quantitative</em> or <em>continuous</em> variables. In contrast, a <em>categorical</em>
+variable has a fixed limited number of distinct values or categories.
+Examples of categorical variables include “Gender”, “Region”, “Hair
+color”, “Zipcode”, and “Level of Satisfaction”. Categorical variables
+can further be classified into two types, <em>nominal</em> and <em>ordinal</em>,
+depending on whether the categories in the variable can be ordered via
+an intrinsic ranking. For example, there is no meaningful ranking among
+distinct values in the “Hair color” variable, while the categories in “Level
+of Satisfaction” can be ranked from highly dissatisfied to highly
+satisfied.</p>
+
+<p>The input dataset for descriptive statistics is provided in the form of
+a matrix, whose rows are the records (data points) and whose columns are
+the features (i.e. variables). Some scripts allow this matrix to be
+vertically split into two or three matrices. Descriptive statistics are
+computed over the specified features (columns) in the matrix. Which
+statistics are computed depends on the types of the features.
It is
+important to keep in mind the following caveats and restrictions:</p>
+
+<ol>
+  <li>
+    <p>Given a finite set of data records, i.e. a <em>sample</em>, we take their
+feature values and compute their <em>sample statistics</em>. These statistics
+will vary from sample to sample even if the underlying distribution of
+feature values remains the same. Sample statistics are accurate for the
+given sample only. If the goal is to estimate the <em>distribution
+statistics</em> that are parameters of the (hypothesized) underlying
+distribution of the features, the corresponding sample statistics may
+sometimes be used as approximations, but their accuracy will vary.</p>
+  </li>
+  <li>
+    <p>In particular, the accuracy of the estimated distribution statistics
+will be low if the number of values in the sample is small. That is, for
+small samples, the computed statistics may depend on the randomness of
+the individual sample values more than on the underlying distribution of
+the features.</p>
+  </li>
+  <li>
+    <p>The accuracy will also be low if the sample records cannot be assumed
+mutually independent and identically distributed (i.i.d.), that is,
+sampled at random from the same underlying distribution. In practice,
+feature values in one record often depend on other features and other
+records, including unknown ones.</p>
+  </li>
+  <li>
+    <p>Most of the computed statistics will have low estimation accuracy in the
+presence of extreme values (outliers) or if the underlying distribution
+has heavy tails, for example, obeys a power law. However, a few of the
+computed statistics, such as the median and Spearman’s rank
+correlation coefficient, are <em>robust</em> to outliers.</p>
+  </li>
+  <li>
+    <p>Some sample statistics are reported with their <em>sample standard errors</em>
+in an attempt to quantify their accuracy as distribution parameter
+estimators. But these sample standard errors, in turn, only estimate the
+underlying distribution’s standard errors and will have low accuracy for
+small samples, outliers in samples, or heavy-tailed distributions.</p>
+  </li>
+  <li>
+    <p>We assume that the quantitative (scale) feature columns do not contain
+missing values, infinite values, <code>NaN</code>s, or coded non-numeric values,
+unless otherwise specified. We assume that each categorical feature
+column contains positive integers from 1 to the number of categories;
+for ordinal features, the natural order on the integers should coincide
+with the order on the categories.</p>
+  </li>
+</ol>
+
+<hr />
+
+<h2 id="univariate-statistics">1.1. Univariate Statistics</h2>
+
+<h3 id="description">Description</h3>
+
+<p><em>Univariate statistics</em> are the simplest form of descriptive statistics
+in data analysis. They are used to quantitatively describe the main
+characteristics of each feature in the data. For a given dataset matrix,
+script <code>Univar-Stats.dml</code> computes certain univariate
+statistics for each feature column in the matrix. The feature type
+governs the exact set of statistics computed for that feature. For
+example, the statistic <em>mean</em> can only be computed on a quantitative
+(scale) feature like “Height” and “Temperature”.
It does not make sense +to compute the mean of a categorical attribute like âHair Colorâ.</p> + +<h3 id="usage">Usage</h3> + +<div class="codetabs"> +<div data-lang="Hadoop"> + <pre><code>hadoop jar SystemML.jar -f Univar-Stats.dml + -nvargs X=<file> + TYPES=<file> + STATS=<file> +</code></pre> + </div> +<div data-lang="Spark"> + <pre><code>$SPARK_HOME/bin/spark-submit --master yarn + --deploy-mode cluster + --conf spark.driver.maxResultSize=0 + SystemML.jar + -f Univar-Stats.dml + -config SystemML-config.xml + -exec hybrid_spark + -nvargs X=<file> + TYPES=<file> + STATS=<file> +</code></pre> + </div> +</div> + +<h3 id="arguments">Arguments</h3> + +<p><strong>X</strong>: Location (on HDFS) to read the data matrix $X$ whose columns we want to +analyze as the features.</p> + +<p><strong>TYPES</strong>: Location (on HDFS) to read the single-row matrix whose $i^{\textrm{th}}$ +column-cell contains the type of the $i^{\textrm{th}}$ feature column +<code>X[,i]</code> in the data matrix. Feature types must be encoded by integer +numbers: 1 = scale, 2 = nominal, 3 = ordinal.</p> + +<p><strong>STATS</strong>: Location (on HDFS) where the output matrix of computed statistics will +be stored. The format of the output matrix is defined by +<a href="algorithms-descriptive-statistics.html#table1"><strong>Table 1</strong></a>.</p> + +<h3 id="examples">Examples</h3> + +<div class="codetabs"> +<div data-lang="Hadoop"> + <pre><code>hadoop jar SystemML.jar -f Univar-Stats.dml + -nvargs X=/user/ml/X.mtx + TYPES=/user/ml/types.mtx + STATS=/user/ml/stats.mtx +</code></pre> + </div> +<div data-lang="Spark"> + <pre><code>$SPARK_HOME/bin/spark-submit --master yarn + --deploy-mode cluster + --conf spark.driver.maxResultSize=0 + SystemML.jar + -f Univar-Stats.dml + -config SystemML-config.xml + -exec hybrid_spark + -nvargs X=/user/ml/X.mtx + TYPES=/user/ml/types.mtx + STATS=/user/ml/stats.mtx +</code></pre> + </div> +</div> + +<hr /> + +<p><a name="table1"></a> +<strong>Table 1</strong>: The output matrix of <code>Univar-Stats.dml</code> has one row per +each univariate statistic and one column per input feature. This table +lists the meaning of each row. 
Signs “+” show applicability to scale +or/and to categorical features.</p> + +<table> + <thead> + <tr> + <th>Row</th> + <th>Name of Statistic</th> + <th style="text-align: center">Scale</th> + <th style="text-align: center">Category</th> + </tr> + </thead> + <tbody> + <tr> + <td>1</td> + <td>Minimum</td> + <td style="text-align: center">+</td> + <td style="text-align: center"> </td> + </tr> + <tr> + <td>2</td> + <td>Maximum</td> + <td style="text-align: center">+</td> + <td style="text-align: center"> </td> + </tr> + <tr> + <td>3</td> + <td>Range</td> + <td style="text-align: center">+</td> + <td style="text-align: center"> </td> + </tr> + <tr> + <td>4</td> + <td>Mean</td> + <td style="text-align: center">+</td> + <td style="text-align: center"> </td> + </tr> + <tr> + <td>5</td> + <td>Variance</td> + <td style="text-align: center">+</td> + <td style="text-align: center"> </td> + </tr> + <tr> + <td>6</td> + <td>Standard deviation</td> + <td style="text-align: center">+</td> + <td style="text-align: center"> </td> + </tr> + <tr> + <td>7</td> + <td>Standard error of mean</td> + <td style="text-align: center">+</td> + <td style="text-align: center"> </td> + </tr> + <tr> + <td>8</td> + <td>Coefficient of variation</td> + <td style="text-align: center">+</td> + <td style="text-align: center"> </td> + </tr> + <tr> + <td>9</td> + <td>Skewness</td> + <td style="text-align: center">+</td> + <td style="text-align: center"> </td> + </tr> + <tr> + <td>10</td> + <td>Kurtosis</td> + <td style="text-align: center">+</td> + <td style="text-align: center"> </td> + </tr> + <tr> + <td>11</td> + <td>Standard error of skewness</td> + <td style="text-align: center">+</td> + <td style="text-align: center"> </td> + </tr> + <tr> + <td>12</td> + <td>Standard error of kurtosis</td> + <td style="text-align: center">+</td> + <td style="text-align: center"> </td> + </tr> + <tr> + <td>13</td> + <td>Median</td> + <td style="text-align: center">+</td> + <td style="text-align: center"> </td> + </tr> + <tr> + <td>14</td> + <td>Interquartile mean</td> + <td style="text-align: center">+</td> + <td style="text-align: center"> </td> + </tr> + <tr> + <td>15</td> + <td>Number of categories</td> + <td style="text-align: center"> </td> + <td style="text-align: center">+</td> + </tr> + <tr> + <td>16</td> + <td>Mode</td> + <td style="text-align: center"> </td> + <td style="text-align: center">+</td> + </tr> + <tr> + <td>17</td> + <td>Number of modes</td> + <td style="text-align: center"> </td> + <td style="text-align: center">+</td> + </tr> + </tbody> +</table> + +<hr /> + +<h3 id="details">Details</h3> + +<p>Given an input matrix <code>X</code>, this script computes the set of all relevant +univariate statistics for each feature column <code>X[,i]</code> in <code>X</code>. The list +of statistics to be computed depends on the <em>type</em>, or <em>measurement +level</em>, of each column. The command-line argument points to a vector +containing the types of all columns. The types must be provided as per +the following convention: 1 = scale, 2 = nominal, +3 = ordinal.</p> + +<p>Below we list all univariate statistics computed by script +<code>Univar-Stats.dml</code>. The statistics are collected by +relevance into several groups, namely: central tendency, dispersion, +shape, and categorical measures. 
The first three groups contain
+statistics computed for a quantitative (also known as: numerical, scale,
+or continuous) feature; the last group contains the statistics for a
+categorical (either nominal or ordinal) feature.</p>
+
+<p>Let $n$ be the number of data records (rows) with feature values. In
+what follows we fix a column index <code>idx</code> and consider sample statistics
+of feature column <code>X[</code>$\,$<code>,</code>$\,$<code>idx]</code>. Let
+$v = (v_1, v_2, \ldots, v_n)$ be the values of <code>X[</code>$\,$<code>,</code>$\,$<code>idx]</code> in
+their original unsorted order:
+$v_i = \texttt{X[}i\texttt{,}\,\texttt{idx]}$. Let
+$v^s = (v^s_1, v^s_2, \ldots, v^s_n)$ be the same values in the sorted
+order, preserving duplicates: $v^s_1 \leq v^s_2 \leq \ldots \leq v^s_n$.</p>
+
+<hr />
+
+<p><a name="figure1"></a>
+<strong>Figure 1</strong>: The computation of quartiles, median, and interquartile mean from the
+empirical distribution function of the 10-point
+sample {2.2, 3.2, 3.7, 4.4, 5.3, 5.7, 6.1, 6.4, 7.2, 7.8}. Each vertical step in
+the graph has height $1{/}n = 0.1$. Values <script type="math/tex">q_{25\%}</script>, <script type="math/tex">q_{50\%}</script>, and <script type="math/tex">q_{75\%}</script> denote
+the $1^{\textrm{st}}$, $2^{\textrm{nd}}$, and $3^{\textrm{rd}}$ quartiles, respectively;
+value $\mu$ denotes the median. Values $\phi_1$ and $\phi_2$ show the partial contribution
+of border points (quartiles) $v_3=3.7$ and $v_8=6.4$ into the interquartile mean.
+<img src="img/algorithms-reference/computation-of-quartiles-median-and-interquartile-mean.png" alt="Figure 1" title="Figure 1" /></p>
+
+<hr />
+
+<h4 id="central-tendency-measures">Central Tendency Measures</h4>
+
+<p>Sample statistics that describe the location of the quantitative (scale)
+feature distribution, representing it with a single value.</p>
+
+<p><strong>Mean</strong> (output row 4): The arithmetic average over a sample of a quantitative
+feature. Computed as the ratio between the sum of values and the number
+of values: $\left(\sum_{i=1}^n v_i\right)/n$. Example: the mean of
+sample {2.2, 3.2, 3.7, 4.4, 5.3, 5.7, 6.1, 6.4, 7.2, 7.8}
+equals 5.2.</p>
+
+<p>Note that the mean is significantly affected by extreme values in the
+sample and may be misleading as a central tendency measure if the
+feature varies on exponential scale. For example, the mean of {0.01,
+0.1, 1.0, 10.0, 100.0} is 22.222, greater than all the sample values
+except the largest.</p>
+
+<p><strong>Median</strong> (output row 13): The “middle” value that separates the higher half of the
+sample values (in a sorted order) from the lower half. To compute the
+median, we sort the sample in increasing order, preserving
+duplicates: $v^s_1 \leq v^s_2 \leq \ldots \leq v^s_n$. If $n$ is odd,
+the median equals $v^s_i$ where $i = (n\,{+}\,1)\,{/}\,2$, same as the
+$50^{\textrm{th}}$ percentile of the sample. If $n$ is even, there are
+two “middle” values <script type="math/tex">v^s_{n/2}</script> and <script type="math/tex">v^s_{n/2\,+\,1}</script>, so we compute the
+median as the mean of these two values. (For even $n$ we compute the
+$50^{\textrm{th}}$ percentile as $v^s_{n/2}$, not as the median.)
+Example: the median of sample {2.2, 3.2, 3.7, 4.4, 5.3, 5.7, 6.1,
+6.4, 7.2, 7.8} equals $(5.3\,{+}\,5.7)\,{/}\,2$ ${=}$ 5.5, see
+<a href="algorithms-descriptive-statistics.html#figure1"><strong>Figure 1</strong></a>.</p>
+
+<p>Unlike the mean, the median is not sensitive to extreme values in the
+sample, i.e.
it is robust to outliers. It works better as a measure of
+central tendency for heavy-tailed distributions and features that vary
+on exponential scale. However, the median is sensitive to small sample
+size.</p>
+
+<p><strong>Interquartile mean</strong> (output row 14): For a sample of a quantitative feature, this is
+the mean of the values greater than or equal to the $1^{\textrm{st}}$
+quartile and less than or equal to the $3^{\textrm{rd}}$ quartile. In other
+words, it is a “truncated mean” where the lowest 25$\%$ and the highest
+25$\%$ of the sorted values are omitted in its computation. The two
+“border values”, i.e. the $1^{\textrm{st}}$ and the $3^{\textrm{rd}}$
+quartiles themselves, contribute to this mean only partially. This
+measure is occasionally used as the “robust” version of the mean that is
+less sensitive to the extreme values.</p>
+
+<p>To compute the measure, we sort the sample in increasing order,
+preserving duplicates: $v^s_1 \leq v^s_2 \leq \ldots \leq v^s_n$. We set
+$j = \lceil n{/}4 \rceil$ for the $1^{\textrm{st}}$ quartile index and
+$k = \lceil 3n{/}4 \rceil$ for the $3^{\textrm{rd}}$ quartile index,
+then compute the following weighted mean:</p>
+
+<script type="math/tex; mode=display">% <![CDATA[
+\frac{1}{3{/}4 - 1{/}4} \left[
+\left(\frac{j}{n} - \frac{1}{4}\right) v^s_j \,\,+
+\sum_{j<i<k} \left(\frac{i}{n} - \frac{i\,{-}\,1}{n}\right) v^s_i
+\,\,+\,\, \left(\frac{3}{4} - \frac{k\,{-}\,1}{n}\right) v^s_k\right] %]]></script>
+
+<p>In other words, all sample values between the $1^{\textrm{st}}$ and the
+$3^{\textrm{rd}}$ quartile enter the sum with weights $2{/}n$, times
+their number of duplicates, while the two quartiles themselves enter the
+sum with reduced weights. The weights are proportional to the vertical
+steps in the empirical distribution function of the sample, see
+<a href="algorithms-descriptive-statistics.html#figure1"><strong>Figure 1</strong></a> for an illustration. Example: the
+interquartile mean of sample {2.2, 3.2, 3.7, 4.4, 5.3, 5.7, 6.1, 6.4,
+7.2, 7.8} equals the sum
+$0.1 (3.7\,{+}\,6.4) + 0.2 (4.4\,{+}\,5.3\,{+}\,5.7\,{+}\,6.1)$, which
+equals 5.31.</p>
+
+<h4 id="dispersion-measures">Dispersion Measures</h4>
+
+<p>Statistics that describe the amount of variation or spread in a
+quantitative (scale) data feature.</p>
+
+<p><strong>Variance</strong> (output row 5): A measure of dispersion, or spread, of sample values
+around their mean, expressed in units that are the square of those of
+the feature itself. Computed as the sum of squared differences between
+the values in the sample and their mean, divided by one less than the
+number of values: <script type="math/tex">\sum_{i=1}^n (v_i - \bar{v})^2\,/\,(n\,{-}\,1)</script> where
+$\bar{v}=\left(\sum_{i=1}^n v_i\right)/n$. Example: the variance of
+sample {2.2, 3.2, 3.7, 4.4, 5.3, 5.7, 6.1, 6.4, 7.2, 7.8}
+equals 3.24. Note that at least two values ($n\geq 2$) are required to
+avoid division by zero. Sample variance is sensitive to outliers, even
+more than the mean.</p>
+
+<p><strong>Standard deviation</strong> (output row 6): A measure of dispersion around the mean, the
+square root of variance. Computed by taking the square root of the
+sample variance; see <em>Variance</em> above on computing the variance.
+Example: the standard deviation of sample {2.2, 3.2, 3.7, 4.4, 5.3,
+5.7, 6.1, 6.4, 7.2, 7.8} equals 1.8. At least two values are required
+to avoid division by zero.
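+</p>
+
+<p>As a concrete illustration of the measures discussed above, the following
+minimal NumPy sketch (illustrative only; it is not the
+<code>Univar-Stats.dml</code> implementation) reproduces the values quoted for the
+10-point example sample, using the weighted interquartile-mean formula given
+earlier.</p>
+
+<pre><code># Illustrative sketch (assumes NumPy); reproduces the example values above.
+import numpy as np
+
+v  = np.array([2.2, 3.2, 3.7, 4.4, 5.3, 5.7, 6.1, 6.4, 7.2, 7.8])
+n  = v.size
+vs = np.sort(v)                               # sorted copy, duplicates preserved
+
+mean   = vs.sum() / n                         # 5.2
+median = np.median(vs)                        # 5.5  (mean of the two middle values)
+var    = ((vs - mean) ** 2).sum() / (n - 1)   # 3.24 (divided by n - 1)
+std    = var ** 0.5                           # 1.8
+
+# Interquartile mean via the weighted formula; j, k are 1-based quartile indices.
+j = int(np.ceil(n / 4.0))                     # 3
+k = int(np.ceil(3 * n / 4.0))                 # 8
+iqm = 2.0 * ((j / n - 0.25) * vs[j - 1]
+             + vs[j:k - 1].sum() / n
+             + (0.75 - (k - 1) / n) * vs[k - 1])   # 5.31
+</code></pre>
+
+<p>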
Note that standard deviation is sensitive to +outliers.</p> + +<p>Standard deviation is used in conjunction with the mean to determine an +interval containing a given percentage of the feature values, assuming +the normal distribution. In a large sample from a normal distribution, +around 68% of the cases fall within one standard deviation and around +95% of cases fall within two standard deviations of the mean. For +example, if the mean age is 45 with a standard deviation of 10, around +95% of the cases would be between 25 and 65 in a normal distribution.</p> + +<p><strong>Coefficient of variation</strong> (output row 8): The ratio of the standard deviation to the +mean, i.e. the <em>relative</em> standard deviation, of a quantitative feature +sample. Computed by dividing the sample <em>standard deviation</em> by the +sample <em>mean</em>, see above for their computation details. Example: the +coefficient of variation for sample {2.2, 3.2, 3.7, 4.4, 5.3, 5.7, +6.1, 6.4, 7.2, 7.8} equals 1.8$\,{/}\,$5.2 ${\approx}$ 0.346.</p> + +<p>This metric is used primarily with non-negative features such as +financial or population data. It is sensitive to outliers. Note: zero +mean causes division by zero, returning infinity or <code>NaN</code>. At least two +values (records) are required to compute the standard deviation.</p> + +<p><strong>Minimum</strong> (output row 1): The smallest value of a quantitative sample, computed as +$\min v = v^s_1$. Example: the minimum of sample {2.2, 3.2, 3.7, 4.4, +5.3, 5.7, 6.1, 6.4, 7.2, 7.8} equals 2.2.</p> + +<p><strong>Maximum</strong> (output row 2): The largest value of a quantitative sample, computed as +$\max v = v^s_n$. Example: the maximum of sample {2.2, 3.2, 3.7, 4.4, +5.3, 5.7, 6.1, 6.4, 7.2, 7.8} equals 7.8.</p> + +<p><strong>Range</strong> (output row 3): The difference between the largest and the smallest value of +a quantitative sample, computed as $\max v - \min v = v^s_n - v^s_1$. It +provides information about the overall spread of the sample values. +Example: the range of sample {2.2, 3.2, 3.7, 4.4, 5.3, 5.7, 6.1, 6.4, +7.2, 7.8} equals 7.8$\,{-}\,$2.2 ${=}$ 5.6.</p> + +<p><strong>Standard error of the mean</strong> (output row 7): A measure of how much the value of the +sample mean may vary from sample to sample taken from the same +(hypothesized) distribution of the feature. It helps to roughly bound +the distribution mean, i.e.the limit of the sample mean as the sample +size tends to infinity. Under certain assumptions (e.g. normality and +large sample), the difference between the distribution mean and the +sample mean is unlikely to exceed 2 standard errors.</p> + +<p>The measure is computed by dividing the sample standard deviation by the +square root of the number of values $n$; see <em>standard deviation</em> for +its computation details. Ensure $n\,{\geq}\,2$ to avoid division by 0. +Example: for sample {2.2, 3.2, 3.7, 4.4, 5.3, 5.7, 6.1, 6.4, 7.2, +7.8} with the mean of 5.2 the standard error of the mean equals +1.8$\,{/}\sqrt{10}$ ${\approx}$ 0.569.</p> + +<p>Note that the standard error itself is subject to sample randomness. 
Its +accuracy as an error estimator may be low if the sample size is small or +non-i.i.d., if there are outliers, or if the distribution has heavy tails.</p> + +<h4 id="shape-measures">Shape Measures</h4> + +<p>Statistics that describe the shape and symmetry of the quantitative +(scale) feature distribution estimated from a sample of its values.</p> + +<p><strong>Skewness</strong> (output row 9): It measures how symmetrically the values of a feature are +spread out around the mean. A significant positive skewness implies a +longer (or fatter) right tail, i.e. feature values tend to lie farther +away from the mean on the right side. A significant negative skewness +implies a longer (or fatter) left tail. The normal distribution is +symmetric and has a skewness value of 0; however, its sample skewness is +likely to be nonzero, just close to zero. As a guideline, a skewness +value more than twice its standard error is taken to indicate a +departure from symmetry.</p> + +<p>Skewness is computed as the $3^{\textrm{rd}}$ central moment divided by +the cube of the standard deviation. We estimate the +$3^{\textrm{rd}}$ central moment as the sum of cubed differences between +the values in the feature column and their sample mean, divided by the +number of values: <script type="math/tex">\sum_{i=1}^n (v_i - \bar{v})^3 / n</script> where +<script type="math/tex">\bar{v}=\left(\sum_{i=1}^n v_i\right)/n</script>. The standard deviation is +computed as described above in <em>standard deviation</em>. To avoid division +by 0, at least two different sample values are required. Example: for +sample {2.2, 3.2, 3.7, 4.4, 5.3, 5.7, 6.1, 6.4, 7.2, 7.8} with the +mean of 5.2 and the standard deviation of 1.8 skewness is estimated as +$-1.0728\,{/}\,1.8^3 \approx -0.184$. Note: skewness is sensitive to +outliers.</p> + +<p><strong>Standard error in skewness</strong> (output row 11): A measure of how much the sample +skewness may vary from sample to sample, assuming that the feature is +normally distributed, which makes its distribution skewness equal 0. +Given the number $n$ of sample values, the standard error is computed as</p> + +<script type="math/tex; mode=display">\sqrt{\frac{6n\,(n-1)}{(n-2)(n+1)(n+3)}}</script> + +<p>This measure can tell us, for example:</p> + +<ul> + <li>If the sample skewness lands within two standard errors from 0, its +positive or negative sign is non-significant, may just be accidental.</li> + <li>If the sample skewness lands outside this interval, the feature is +unlikely to be normally distributed.</li> +</ul> + +<p>At least 3 values ($n\geq 3$) are required to avoid arithmetic failure. +Note that the standard error is inaccurate if the feature distribution +is far from normal or if the number of samples is small.</p> + +<p><strong>Kurtosis</strong> (output row 10): As a distribution parameter, kurtosis is a measure of the +extent to which feature values cluster around a central point. 
In other +words, it quantifies “peakedness” of the distribution: how tall and +sharp the central peak is relative to a standard bell curve.</p> + +<p>Positive kurtosis (<em>leptokurtic</em> distribution) indicates that, relative +to a normal distribution:</p> + +<ul> + <li>Observations cluster more about the center (peak-shaped)</li> + <li>The tails are thinner at non-extreme values</li> + <li>The tails are thicker at extreme values</li> +</ul> + +<p>Negative kurtosis (<em>platykurtic</em> distribution) indicates that, relative +to a normal distribution:</p> + +<ul> + <li>Observations cluster less about the center (box-shaped)</li> + <li>The tails are thicker at non-extreme values</li> + <li>The tails are thinner at extreme values</li> +</ul> + +<p>Kurtosis of a normal distribution is zero; however, the sample kurtosis +(computed here) is likely to deviate from zero.</p> + +<p>Sample kurtosis is computed as the $4^{\textrm{th}}$ central moment +divided by the $4^{\textrm{th}}$ power of the standard deviation, +minus 3. We estimate the $4^{\textrm{th}}$ central moment as the sum of +the $4^{\textrm{th}}$ powers of differences between the values in the +feature column and their sample mean, divided by the number of values: +<script type="math/tex">\sum_{i=1}^n (v_i - \bar{v})^4 / n</script> where +$\bar{v}=\left(\sum_{i=1}^n v_i\right)/n$. The standard deviation is +computed as described above, see <em>standard deviation</em>.</p> + +<p>Note that kurtosis is sensitive to outliers, and requires at least two +different sample values. Example: for sample {2.2, 3.2, 3.7, 4.4, +5.3, 5.7, 6.1, 6.4, 7.2, 7.8} with the mean of 5.2 and the standard +deviation of 1.8, sample kurtosis equals +$16.6962\,{/}\,1.8^4 - 3 \approx -1.41$.</p> + +<p><strong>Standard error in kurtosis</strong> (output row 12): A measure of how much the sample +kurtosis may vary from sample to sample, assuming that the feature is +normally distributed, which makes its distribution kurtosis equal 0. +Given the number $n$ of sample values, the standard error is computed as</p> + +<script type="math/tex; mode=display">\sqrt{\frac{24n\,(n-1)^2}{(n-3)(n-2)(n+3)(n+5)}}</script> + +<p>This measure can tell us, for example:</p> + +<ul> + <li>If the sample kurtosis lands within two standard errors from 0, its +positive or negative sign is non-significant, may just be accidental.</li> + <li>If the sample kurtosis lands outside this interval, the feature is +unlikely to be normally distributed.</li> +</ul> + +<p>At least 4 values ($n\geq 4$) are required to avoid arithmetic failure. +Note that the standard error is inaccurate if the feature distribution +is far from normal or if the number of samples is small.</p> + +<h4 id="categorical-measures">Categorical Measures</h4> + +<p>Statistics that describe the sample of a categorical feature, either +nominal or ordinal. We represent all categories by integers from 1 to +the number of categories; we call these integers <em>category IDs</em>.</p> + +<p><strong>Number of categories</strong> (output row 15): The maximum category ID that occurs in the +sample. Note that some categories with IDs <em>smaller</em> than this +maximum ID may have no occurrences in the sample, without reducing the +number of categories. However, any categories with IDs <em>larger</em> than the +maximum ID with no occurrences in the sample will not be counted. +Example: in sample {1, 3, 3, 3, 3, 4, 4, 5, 7, 7, 7, 7, 8, 8, 8} +the number of categories is reported as 8. 
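+</p>
+
+<p>The three categorical measures in rows 15-17 of
+<a href="algorithms-descriptive-statistics.html#table1"><strong>Table 1</strong></a> (number of categories, mode, and
+number of modes, described in the remainder of this section) can be sketched in
+NumPy as follows; this is an illustration only, not the <code>Univar-Stats.dml</code>
+implementation.</p>
+
+<pre><code># Illustrative sketch (assumes NumPy); uses the example sample from the text.
+import numpy as np
+
+s = np.array([1, 3, 3, 3, 3, 4, 4, 5, 7, 7, 7, 7, 8, 8, 8])
+
+num_categories = int(s.max())                              # 8: the largest category ID seen
+counts = np.bincount(s, minlength=num_categories + 1)[1:]  # occurrences of IDs 1..8
+mode = int(np.argmax(counts)) + 1                          # 3: smallest ID with the maximum count
+num_modes = int((counts == counts.max()).sum())            # 2: IDs 3 and 7 both occur 4 times
+</code></pre>
+
+<p>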
Category IDs 2 and 6, which
+have zero occurrences, are still counted; but if there is a category
+with ID${}=9$ and zero occurrences, it is not counted.</p>
+
+<p><strong>Mode</strong> (output row 16): The most frequently occurring category value. If several
+values share the greatest frequency of occurrence, then each of them is
+a mode; but here we report only the smallest of these modes. Example: in
+sample {1, 3, 3, 3, 3, 4, 4, 5, 7, 7, 7, 7, 8, 8, 8} the modes are
+3 and 7, with 3 reported.</p>
+
+<p>Computed by counting the number of occurrences for each category, then
+taking the smallest category ID that has the maximum count. Note that
+the sample modes may be different from the distribution modes, i.e. the
+categories whose (hypothesized) underlying probability is the maximum
+over all categories.</p>
+
+<p><strong>Number of modes</strong> (output row 17): The number of category values that each have the
+largest frequency count in the sample. Example: in sample {1, 3, 3,
+3, 3, 4, 4, 5, 7, 7, 7, 7, 8, 8, 8} there are two category IDs (3
+and 7) that occur the maximum count of 4 times; hence, we return 2.</p>
+
+<p>Computed by counting the number of occurrences for each category, then
+counting how many categories have the maximum count. Note that the
+sample modes may be different from the distribution modes, i.e. the
+categories whose (hypothesized) underlying probability is the maximum
+over all categories.</p>
+
+<h3 id="returns">Returns</h3>
+
+<p>The output matrix containing all computed statistics has
+$17$ rows and as many columns as the input matrix <code>X</code>. Each row
+corresponds to a particular statistic, according to the convention
+specified in <a href="algorithms-descriptive-statistics.html#table1"><strong>Table 1</strong></a>. The first $14$ statistics are
+applicable for <em>scale</em> columns, and the last $3$ statistics are
+applicable for categorical, i.e. nominal and ordinal, columns.</p>
+
+<hr />
+
+<h2 id="bivariate-statistics">1.2. Bivariate Statistics</h2>
+
+<h3 id="description-1">Description</h3>
+
+<p>Bivariate statistics are used to quantitatively describe the association
+between two features in a sample, for example to test their statistical
+(in-)dependence or to measure how accurately one data feature predicts
+the other. The <code>bivar-stats.dml</code> script computes common
+bivariate statistics, such as Pearson’s correlation
+coefficient and Pearson’s $\chi^2$, in parallel for
+many pairs of data features. For a given dataset matrix, script
+<code>bivar-stats.dml</code> computes certain bivariate statistics for
+the given feature (column) pairs in the matrix. The feature types govern
+the exact set of statistics computed for that pair. For example,
+Pearson’s correlation coefficient can only be computed on
+two quantitative (scale) features like “Height” and “Temperature”.
It +does not make sense to compute the linear correlation of two categorical +attributes like âHair Colorâ.</p> + +<h3 id="usage-1">Usage</h3> + +<div class="codetabs"> +<div data-lang="Hadoop"> + <pre><code>hadoop jar SystemML.jar -f bivar-stats.dml + -nvargs X=<file> + index1=<file> + index2=<file> + types1=<file> + types2=<file> + OUTDIR=<directory> +</code></pre> + </div> +<div data-lang="Spark"> + <pre><code>$SPARK_HOME/bin/spark-submit --master yarn + --deploy-mode cluster + --conf spark.driver.maxResultSize=0 + SystemML.jar + -f bivar-stats.dml + -config SystemML-config.xml + -exec hybrid_spark + -nvargs X=<file> + index1=<file> + index2=<file> + types1=<file> + types2=<file> + OUTDIR=<directory> +</code></pre> + </div> +</div> + +<h3 id="arguments-1">Arguments</h3> + +<p><strong>X</strong>: Location (on HDFS) to read the data matrix $X$ whose columns are the +features that we want to compare and correlate with bivariate +statistics.</p> + +<p><strong>index1</strong>: Location (on HDFS) to read the single-row matrix that lists the column +indices of the <em>first-argument</em> features in pairwise statistics. Its +$i^{\textrm{th}}$ entry (i.e. $i^{\textrm{th}}$ column-cell) contains +the index $k$ of column <code>X[,k]</code> in the data matrix whose bivariate +statistics need to be computed.</p> + +<p><strong>index2</strong>: Location (on HDFS) to read the single-row matrix that lists the column +indices of the <em>second-argument</em> features in pairwise statistics. Its +$j^{\textrm{th}}$ entry (i.e. $j^{\textrm{th}}$ column-cell) contains +the index $l$ of column <code>X[,l]</code> in the data matrix whose bivariate +statistics need to be computed.</p> + +<p><strong>types1</strong>: Location (on HDFS) to read the single-row matrix that lists the <em>types</em> +of the <em>first-argument</em> features in pairwise statistics. Its +$i^{\textrm{th}}$ entry (i.e. $i^{\textrm{th}}$ column-cell) contains +the type of column <code>X[,k]</code> in the data matrix, where $k$ is the +$i^{\textrm{th}}$ entry in the index1 matrix. Feature types +must be encoded by integer numbers:1 = scale, 2 = nominal, +3 = ordinal.</p> + +<p><strong>types2</strong>: Location (on HDFS) to read the single-row matrix that lists the <em>types</em> +of the <em>second-argument</em> features in pairwise statistics. Its +$j^{\textrm{th}}$ entry (i.e. $j^{\textrm{th}}$ column-cell) contains +the type of column <code>X[,l]</code> in the data matrix, where $l$ is the +$j^{\textrm{th}}$ entry in the index2 matrix. Feature types +must be encoded by integer numbers: 1 = scale, 2 = nominal, +3 = ordinal.</p> + +<p><strong>OUTDIR</strong>: Location path (on HDFS) where the output matrices with computed +bivariate statistics will be stored. 
The matricesâ file names and format +are defined in <a href="algorithms-descriptive-statistics.html#table2"><strong>Table 2</strong></a>.</p> + +<h3 id="examples-1">Examples</h3> + +<div class="codetabs"> +<div data-lang="Hadoop"> + <pre><code>hadoop jar SystemML.jar -f bivar-stats.dml + -nvargs X=/user/ml/X.mtx + index1=/user/ml/S1.mtx + index2=/user/ml/S2.mtx + types1=/user/ml/K1.mtx + types2=/user/ml/K2.mtx + OUTDIR=/user/ml/stats.mtx +</code></pre> + </div> +<div data-lang="Spark"> + <pre><code>$SPARK_HOME/bin/spark-submit --master yarn + --deploy-mode cluster + --conf spark.driver.maxResultSize=0 + SystemML.jar + -f bivar-stats.dml + -config SystemML-config.xml + -exec hybrid_spark + -nvargs X=/user/ml/X.mtx + index1=/user/ml/S1.mtx + index2=/user/ml/S2.mtx + types1=/user/ml/K1.mtx + types2=/user/ml/K2.mtx + OUTDIR=/user/ml/stats.mtx +</code></pre> + </div> +</div> + +<hr /> + +<p><a name="table2"></a> +<strong>Table 2</strong>: The output matrices of <code>bivar-stats.dml</code> have one row per one bivariate +statistic and one column per one pair of input features. This table lists +the meaning of each matrix and each row.</p> + +<table> + <thead> + <tr> + <th>Output File / Matrix</th> + <th>Row</th> + <th>Name of Statistic</th> + </tr> + </thead> + <tbody> + <tr> + <td>All Files</td> + <td>1</td> + <td>1-st feature column</td> + </tr> + <tr> + <td>”</td> + <td>2</td> + <td>2-nd feature column</td> + </tr> + <tr> + <td>bivar.scale.scale.stats</td> + <td>3</td> + <td>Pearsonâs correlation coefficient</td> + </tr> + <tr> + <td>bivar.nominal.nominal.stats</td> + <td>3</td> + <td>Pearsonâs $\chi^2$</td> + </tr> + <tr> + <td>”</td> + <td>4</td> + <td>Degrees of freedom</td> + </tr> + <tr> + <td>”</td> + <td>5</td> + <td>$P\textrm{-}$value of Pearsonâs $\chi^2$</td> + </tr> + <tr> + <td>”</td> + <td>6</td> + <td>Cramérâs $V$</td> + </tr> + <tr> + <td>bivar.nominal.scale.stats</td> + <td>3</td> + <td>Eta statistic</td> + </tr> + <tr> + <td>”</td> + <td>4</td> + <td>$F$ statistic</td> + </tr> + <tr> + <td>bivar.ordinal.ordinal.stats</td> + <td>3</td> + <td>Spearmanâs rank correlation coefficient</td> + </tr> + </tbody> +</table> + +<hr /> + +<h3 id="details-1">Details</h3> + +<p>Script <code>bivar-stats.dml</code> takes an input matrix <code>X</code> whose +columns represent the features and whose rows represent the records of a +data sample. Given <code>X</code>, the script computes certain relevant bivariate +statistics for specified pairs of feature columns <code>X[,i]</code> and +<code>X[,j]</code>. Command-line parameters <code>index1</code> and <code>index2</code> specify the +files with column pairs of interest to the user. Namely, the file given +by <code>index1</code> contains the vector of the 1st-attribute column indices and +the file given by <code>index2</code> has the vector of the 2nd-attribute column +indices, with “1st” and “2nd” referring to their places in bivariate +statistics. Note that both <code>index1</code> and <code>index2</code> files should contain a +1-row matrix of positive integers.</p> + +<p>The bivariate statistics to be computed depend on the <em>types</em>, or +<em>measurement levels</em>, of the two columns. The types for each pair are +provided in the files whose locations are specified by <code>types1</code> and +<code>types2</code> command-line parameters. These files are also 1-row matrices, +i.e. 
vectors, that list the 1st-attribute and the 2nd-attribute column +types in the same order as their indices in the <code>index1</code> and <code>index2</code> +files. The types must be provided as per the following convention: +1 = scale, 2 = nominal, 3 = ordinal.</p> + +<p>The script organizes its results into (potentially) four output +matrices, one per each type combination. The types of bivariate +statistics are defined using the types of the columns that were used for +their arguments, with “ordinal” sometimes retrogressing to “nominal.” +<a href="algorithms-descriptive-statistics.html#table2"><strong>Table 2</strong></a> +describes what each column in each output matrix +contains. In particular, the script includes the following statistics:</p> + +<ul> + <li>For a pair of scale (quantitative) columns, Pearsonâs correlation coefficient.</li> + <li>For a pair of nominal columns (with finite-sized, fixed, unordered +domains), the Pearsonâs $\chi^2$ and its p-value.</li> + <li>For a pair of one scale column and one nominal column, $F$ statistic.</li> + <li>For a pair of ordinal columns (ordered domains depicting ranks), +Spearmanâs rank correlation coefficient.</li> +</ul> + +<p>Note that, as shown in <a href="algorithms-descriptive-statistics.html#table2"><strong>Table 2</strong></a>, the output matrices +contain the column indices of the features involved in each statistic. +Moreover, if the output matrix does not contain a value in a certain +cell then it should be interpreted as a $0$ (sparse matrix +representation).</p> + +<p>Below we list all bivariate statistics computed by script +<code>bivar-stats.dml</code>. The statistics are collected into +several groups by the type of their input features. We refer to the two +input features as $v_1$ and $v_2$ unless specified otherwise; the value +pairs are <script type="math/tex">(v_{1,i}, v_{2,i})</script> for $i=1,\ldots,n$, where $n$ is the +number of rows in <code>X</code>, i.e. the sample size.</p> + +<h4 id="scale-vs-scale-statistics">Scale-vs-Scale Statistics</h4> + +<p>Sample statistics that describe association between two quantitative +(scale) features. A scale feature has numerical values, with the natural +ordering relation.</p> + +<p><em>Pearsonâs correlation coefficient</em>: A measure of linear +dependence between two numerical features:</p> + +<script type="math/tex; mode=display">r += \frac{Cov(v_1, v_2)}{\sqrt{Var v_1 Var v_2}} += \frac{\sum_{i=1}^n (v_{1,i} - \bar{v}_1) (v_{2,i} - \bar{v}_2)}{\sqrt{\sum_{i=1}^n (v_{1,i} - \bar{v}_1)^{2\mathstrut} \cdot \sum_{i=1}^n (v_{2,i} - \bar{v}_2)^{2\mathstrut}}}</script> + +<p>Commonly denoted by $r$, correlation ranges between $-1$ and $+1$, +reaching ${\pm}1$ when all value pairs <script type="math/tex">(v_{1,i}, v_{2,i})</script> lie on the +same line. Correlation near 0 means that a line is not a good way to +represent the dependence between the two features; however, this does +not imply independence. The sign indicates direction of the linear +association: $r > 0$ ($r < 0$) if one feature tends to linearly increase +(decrease) when the other feature increases. Nonlinear association, if +present, may disobey this sign. 
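+</p>
+
+<p>As a quick illustration of the formula above, the following NumPy sketch
+(not the <code>bivar-stats.dml</code> implementation; the two feature columns are
+hypothetical) computes $r$ directly from the definition.</p>
+
+<pre><code># Illustrative sketch (assumes NumPy); v1, v2 are hypothetical scale feature columns.
+import numpy as np
+
+v1 = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
+v2 = np.array([1.2, 1.9, 3.2, 3.8, 5.1])
+
+d1 = v1 - v1.mean()
+d2 = v2 - v2.mean()
+r = (d1 * d2).sum() / np.sqrt((d1 ** 2).sum() * (d2 ** 2).sum())
+# The same value is returned by np.corrcoef(v1, v2)[0, 1].
+</code></pre>
+
+<p>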
Pearsonâs correlation +coefficient is symmetric: $r(v_1, v_2) = r(v_2, v_1)$; it does +not change if we transform $v_1$ and $v_2$ to $a + b v_1$ and +$c + d v_2$ where $a, b, c, d$ are constants and $b, d > 0$.</p> + +<p>Suppose that we use simple linear regression to represent one feature +given the other, say represent <script type="math/tex">v_{2,i} \approx \alpha + \beta v_{1,i}</script> +by selecting $\alpha$ and $\beta$ to minimize the least-squares error +<script type="math/tex">\sum_{i=1}^n (v_{2,i} - \alpha - \beta v_{1,i})^2</script>. Then the best error +equals</p> + +<script type="math/tex; mode=display">\min_{\alpha, \beta} \,\,\sum_{i=1}^n \big(v_{2,i} - \alpha - \beta v_{1,i}\big)^2 \,\,=\,\, +(1 - r^2) \,\sum_{i=1}^n \big(v_{2,i} - \bar{v}_2\big)^2</script> + +<p>In other words, $1\,{-}\,r^2$ is the ratio of the residual sum of squares to the +total sum of squares. Hence, $r^2$ is an accuracy measure of the linear +regression.</p> + +<h4 id="nominal-vs-nominal-statistics">Nominal-vs-Nominal Statistics</h4> + +<p>Sample statistics that describe association between two nominal +categorical features. Both featuresâ value domains are encoded with +positive integers in arbitrary order: nominal features do not order +their value domains.</p> + +<p><em>Pearsonâs $\chi^2$</em>: A measure of how much the +frequencies of value pairs of two categorical features deviate from +statistical independence. Under independence, the probability of every +value pair must equal the product of probabilities of each value in the +pair: $Prob[a, b] - Prob[a]Prob[b] = 0$. +But we do not know these (hypothesized) probabilities; we only know the +sample frequency counts. Let $n_{a,b}$ be the frequency count of pair +$(a, b)$, let $n_a$ and $n_b$ be the frequency counts of $a$ alone and +of $b$ alone. Under independence, difference +<script type="math/tex">n_{a,b}{/}n - (n_a{/}n)(n_b{/}n)</script> is unlikely to be exactly 0 due to +sample randomness, yet it is unlikely to be too far from 0. For some +pairs $(a,b)$ it may deviate from 0 farther than for other pairs. +Pearsonâs $\chi^2$ is an aggregate measure that combines +squares of these differences across all value pairs:</p> + +<script type="math/tex; mode=display">\chi^2 \,\,=\,\, \sum_{a,\,b} \Big(\frac{n_a n_b}{n}\Big)^{-1} \Big(n_{a,b} - \frac{n_a n_b}{n}\Big)^2 +\,=\,\, \sum_{a,\,b} \frac{(O_{a,b} - E_{a,b})^2}{E_{a,b}}</script> + +<p>where <script type="math/tex">O_{a,b} = n_{a,b}</script> are the <em>observed</em> frequencies and +$E_{a,b} = (n_a n_b){/}n$ are the <em>expected</em> frequencies for all +pairs $(a,b)$. Under independence (plus other standard assumptions) the +sample $\chi^2$ closely follows a well-known distribution, making it a +basis for statistical tests for independence, +see <em>$P\textrm{-}$value of Pearsonâs $\chi^2$</em> for details. +Note that Pearsonâs $\chi^2$ does <em>not</em> measure the +strength of dependence: even very weak dependence may result in a +significant deviation from independence if the counts are large enough. +Use Cramérâs $V$ instead to measure the strength of +dependence.</p> + +<p><em>Degrees of freedom</em>: An integer parameter required for the +interpretation of Pearsonâs $\chi^2$ measure. Under +independence (plus other standard assumptions) the sample $\chi^2$ +statistic is approximately distributed as the sum of $d$ squares of +independent normal random variables with mean 0 and variance 1, where +$d$ is this integer parameter. 
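+</p>
+
+<p>A minimal NumPy sketch of the $\chi^2$ computation from observed and expected
+counts is shown below (illustrative only; the contingency table is hypothetical,
+and this is not the <code>bivar-stats.dml</code> implementation). The degrees of
+freedom $d$ that accompany $\chi^2$ are defined next.</p>
+
+<pre><code># Illustrative sketch (assumes NumPy); O is a hypothetical table of pair counts n_ab.
+import numpy as np
+
+O = np.array([[30.0, 10.0],
+              [20.0, 40.0]])       # rows: categories of feature 1; columns: feature 2
+n = O.sum()
+E = np.outer(O.sum(axis=1), O.sum(axis=0)) / n     # expected counts E_ab = n_a * n_b / n
+chi2 = ((O - E) ** 2 / E).sum()                    # Pearson's chi-square
+dof = (O.shape[0] - 1) * (O.shape[1] - 1)          # degrees of freedom, defined below
+</code></pre>
+
+<p>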
For a pair of categorical features such +that the $1^{\textrm{st}}$ feature has $k_1$ categories and the +$2^{\textrm{nd}}$ feature has $k_2$ categories, the number of degrees of +freedom is $d = (k_1 - 1)(k_2 - 1)$.</p> + +<p><em>$P\textrm{-}$value of Pearsonâs $\chi^2$</em>: A measure of +how likely we would observe the current frequencies of value pairs of +two categorical features assuming their statistical independence. More +precisely, it computes the probability that the sum of $d$ squares of +independent normal random variables with mean 0 and variance 1 (called +the $\chi^2$ distribution with $d$ degrees of freedom) generates a value +at least as large as the current sample Pearsonâs $\chi^2$. +The $d$ parameter is <em>degrees of freedom</em>, see above. Under independence +(plus other standard assumptions) the sample +Pearsonâs $\chi^2$ closely follows the +$\chi^2$ distribution and is unlikely to land very far into its tail. On +the other hand, if the two features are dependent, their sample +Pearsonâs $\chi^2$ becomes arbitrarily large as +$n\to\infty$ and lands extremely far into the tail of the +$\chi^2$ distribution given a large enough data sample. +$P\textrm{-}$value of Pearsonâs $\chi^2$ returns the tail +“weight” on the right-hand side of Pearsonâs $\chi^2$:</p> + +<script type="math/tex; mode=display">P = Prob\big[r \geq \textrm{Pearsonâs $\chi^2$} \big|\,\, r \sim \textrm{the $\chi^2$ distribution}\big]</script> + +<p>As any probability, $P$ ranges between 0 and 1. If $P\leq 0.05$, the +dependence between the two features may be considered statistically +significant (i.e. their independence is considered statistically ruled +out). For highly dependent features, it is not unusual to have +$P\leq 10^{-20}$ or less, in which case our script will simply return +$P = 0$. Independent features should have their $P\geq 0.05$ in about +95% cases.</p> + +<p><em>Cramérâs $V$</em>: A measure for the strength of +association, i.e. of statistical dependence, between two categorical +features, conceptually similar to Pearsonâs correlation +coefficient. It divides the +observed Pearsonâs $\chi^2$ by the maximum +possible $\chi^2_{\textrm{max}}$ given $n$ and the number $k_1, k_2$ of +categories in each feature, then takes the square root. Thus, +Cramérâs $V$ ranges from 0 to 1, where 0 implies no +association and 1 implies the maximum possible association (one-to-one +correspondence) between the two features. See +<em>Pearsonâs $\chi^2$</em> for the computation of $\chi^2$; its +maximum = $n\cdot\min\{k_1\,{-}\,1, k_2\,{-}\,1\}$ where the +$1^{\textrm{st}}$ feature has $k_1$ categories and the +$2^{\textrm{nd}}$ feature has $k_2$ +categories +<a href="algorithms-bibliography.html">[AcockStavig1979]</a>, so</p> + +<script type="math/tex; mode=display">\textrm{Cramérâs $V$} \,\,=\,\, \sqrt{\frac{\textrm{Pearsonâs $\chi^2$}}{n\cdot\min\{k_1\,{-}\,1, k_2\,{-}\,1\}}}</script> + +<p>As opposed to $P\textrm{-}$value of Pearsonâs $\chi^2$, +which goes to 0 (rapidly) as the featuresâ dependence increases, +Cramérâs $V$ goes towards 1 (slowly) as the dependence +increases. Both Pearsonâs $\chi^2$ and +$P\textrm{-}$value of Pearsonâs $\chi^2$ are very sensitive +to $n$, but in Cramérâs $V$ this is mitigated by taking the +ratio.</p> + +<h4 id="nominal-vs-scale-statistics">Nominal-vs-Scale Statistics</h4> + +<p>Sample statistics that describe association between a categorical +feature (order ignored) and a quantitative (scale) feature. 
The values +of the categorical feature must be coded as positive integers.</p> + +<p><em>Eta statistic</em>: A measure for the strength of +association (statistical dependence) between a nominal feature and a +scale feature, conceptually similar to Pearsonâs correlation +coefficient. Ranges from 0 to 1, approaching 0 when there is no +association and approaching 1 when there is a strong association. The +nominal feature, treated as the independent variable, is assumed to have +relatively few possible values, all with large frequency counts. The +scale feature is treated as the dependent variable. Denoting the nominal +feature by $x$ and the scale feature by $y$, we have:</p> + +<script type="math/tex; mode=display">% <![CDATA[ +\eta^2 \,=\, 1 - \frac{\sum_{i=1}^{n} \big(y_i - \hat{y}[x_i]\big)^2}{\sum_{i=1}^{n} (y_i - \bar{y})^2}, +\,\,\,\,\textrm{where}\,\,\,\, +\hat{y}[x] = \frac{1}{\mathop{\mathrm{freq}}(x)}\sum_{i=1}^n +\,\left\{\!\!\begin{array}{rl} y_i & \textrm{if $x_i = x$}\\ 0 & \textrm{otherwise}\end{array}\right.\!\!\! %]]></script> + +<p>and <script type="math/tex">\bar{y} = (1{/}n)\sum_{i=1}^n y_i</script> is the mean. Value $\hat{y}[x]$ +is the average of $y_i$ among all records where $x_i = x$; it can also +be viewed as the “predictor” of $y$ given $x$. Then +<script type="math/tex">\sum_{i=1}^{n} (y_i - \hat{y}[x_i])^2</script> is the residual error +sum-of-squares and $\sum_{i=1}^{n} (y_i - \bar{y})^2$ is the total +sum-of-squares for $y$. Hence, $\eta^2$ measures the accuracy of +predicting $y$ with $x$, just like the “R-squared” statistic measures +the accuracy of linear regression. Our output $\eta$ is the square root +of $\eta^2$.</p> + +<p><em>$F$ statistic</em>: A measure of how much the values of the +scale feature, denoted here by $y$, deviate from statistical +independence on the nominal feature, denoted by $x$. The same measure +appears in the one-way analysis of variance (ANOVA). Like +Pearsonâs $\chi^2$, $F$ statistic is used to +test the hypothesis that $y$ is independent from $x$, given the +following assumptions:</p> + +<ul> + <li>The scale feature $y$ has approximately normal distribution whose mean +may depend only on $x$ and variance is the same for all $x$.</li> + <li>The nominal feature $x$ has relatively small value domain with large +frequency counts, the $x_i$-values are treated as fixed (non-random).</li> + <li>All records are sampled independently of each other.</li> +</ul> + +<p>To compute $F$ statistic, we first compute $\hat{y}[x]$ as +the average of $y_i$ among all records where $x_i = x$. These +$\hat{y}[x]$ can be viewed as “predictors” of $y$ given $x$; if $y$ is +independent on $x$, they should “predict” only the global +mean $\bar{y}$. Then we form two sums-of-squares:</p> + +<ul> + <li><em>Residual</em> sum-of-squares of the “predictor” accuracy: +$y_i - \hat{y}[x_i]$.</li> + <li><em>Explained</em> sum-of-squares of the “predictor” variability: +$\hat{y}[x_i] - \bar{y}$.</li> +</ul> + +<p>$F$ statistic is the ratio of the explained sum-of-squares +to the residual sum-of-squares, each divided by their corresponding +degrees of freedom:</p> + +<script type="math/tex; mode=display">F \,\,=\,\, +\frac{\sum_{x}\, \mathop{\mathrm{freq}}(x) \, \big(\hat{y}[x] - \bar{y}\big)^2 \,\big/\,\, (k\,{-}\,1)}{\sum_{i=1}^{n} \big(y_i - \hat{y}[x_i]\big)^2 \,\big/\,\, (n\,{-}\,k)} \,\,=\,\, +\frac{n\,{-}\,k}{k\,{-}\,1} \cdot \frac{\eta^2}{1 - \eta^2}</script> + +<p>Here $k$ +is the domain size of the nominal feature $x$. 
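+</p>
+
+<p>The following NumPy sketch computes $\eta$ and the $F$ statistic from these
+sums-of-squares (illustrative only; the nominal column $x$ and scale column $y$
+are hypothetical, and the category IDs are assumed to be the consecutive
+integers $1,\ldots,k$).</p>
+
+<pre><code># Illustrative sketch (assumes NumPy); x, y are hypothetical feature columns.
+import numpy as np
+
+x = np.array([1, 1, 1, 2, 2, 2, 3, 3, 3])                      # nominal, IDs 1..k
+y = np.array([2.0, 2.5, 2.1, 4.0, 3.6, 4.4, 6.1, 5.8, 6.3])    # scale
+
+group_means = np.array([y[x == c].mean() for c in np.unique(x)])
+y_hat = group_means[x - 1]                    # "predictor" of y given x
+ss_res = ((y - y_hat) ** 2).sum()             # residual sum-of-squares
+ss_tot = ((y - y.mean()) ** 2).sum()          # total sum-of-squares
+eta = np.sqrt(1.0 - ss_res / ss_tot)
+
+n, k = y.size, np.unique(x).size
+F = (n - k) / (k - 1.0) * (ss_tot - ss_res) / ss_res   # = (n-k)/(k-1) * eta^2/(1-eta^2)
+</code></pre>
+
+<p>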
The $k$ “predictors” lose +1 freedom due to their linear dependence with $\bar{y}$; similarly, the +$n$ $y_i$-s lose $k$ freedoms due to the “predictors”.</p> + +<p>The statistic can test if the independence hypothesis of $y$ from $x$ is +reasonable; more generally (with relaxed normality assumptions) it can +test the hypothesis that <em>the mean</em> of $y$ among records with a +given $x$ is the same for all $x$. Under this hypothesis +$F$ statistic has, or approximates, the +$F(k\,{-}\,1, n\,{-}\,k)$-distribution. But if the mean of $y$ given $x$ +depends on $x$, $F$ statistic becomes arbitrarily large as +$n\to\infty$ (with $k$ fixed) and lands extremely far into the tail of +the $F(k\,{-}\,1, n\,{-}\,k)$-distribution given a large enough data +sample.</p> + +<h4 id="ordinal-vs-ordinal-statistics">Ordinal-vs-Ordinal Statistics</h4> + +<p>Sample statistics that describe association between two ordinal +categorical features. Both featuresâ value domains are encoded with +positive integers, so that the natural order of the integers coincides +with the order in each value domain.</p> + +<p><em>Spearmanâs rank correlation coefficient</em>: A measure for +the strength of association (statistical dependence) between two ordinal +features, conceptually similar to Pearsonâs correlation +coefficient. Specifically, it is Pearsonâs correlation +coefficient applied to the feature vectors in which all values +are replaced by their ranks, i.e. their positions if the vector is +sorted. The ranks of identical (duplicate) values are replaced with +their average rank. For example, in vector $(15, 11, 26, 15, 8)$ the +value “15” occurs twice with ranks 3 and 4 per the sorted order +$(8_1, 11_2, 15_3, 15_4, 26_5)$; so, both values are assigned their +average rank of $3.5 = (3\,{+}\,4)\,{/}\,2$ and the vector is replaced +by $(3.5,\, 2,\, 5,\, 3.5,\, 1)$.</p> + +<p>Our implementation of Spearmanâs rank correlation +coefficient is geared towards features having small value domains +and large counts for the values. Given the two input vectors, we form a +contingency table $T$ of pairwise frequency counts, as well as a vector +of frequency counts for each feature: $f_1$ and $f_2$. Here in +<script type="math/tex">T_{i,j}</script>, <script type="math/tex">f_{1,i}</script>, <script type="math/tex">f_{2,j}</script> indices $i$ and $j$ refer to the +order-preserving integer encoding of the feature values. We use prefix +sums over $f_1$ and $f_2$ to compute the valuesâ average ranks: +<script type="math/tex">r_{1,i} = \sum_{j=1}^{i-1} f_{1,j} + (f_{1,i}\,{+}\,1){/}2</script>, and +analogously for $r_2$. Finally, we compute rank variances for $r_1, r_2$ +weighted by counts $f_1, f_2$ and their covariance weighted by $T$, +before applying the standard formula for Pearsonâs correlation +coefficient:</p> + +<script type="math/tex; mode=display">\rho \,\,=\,\, \frac{Cov_T(r_1, r_2)}{\sqrt{Var_{f_1}(r_1)Var_{f_2}(r_2)}} +\,\,=\,\, \frac{\sum_{i,j} T_{i,j} (r_{1,i} - \bar{r}_1) (r_{2,j} - \bar{r}_2)}{\sqrt{\sum_i f_{1,i} (r_{1,i} - \bar{r}_1)^{2\mathstrut} \cdot \sum_j f_{2,j} (r_{2,j} - \bar{r}_2)^{2\mathstrut}}}</script> + +<p>where <script type="math/tex">\bar{r_1} = \sum_i r_{1,i} f_{1,i}{/}n</script>, analogously +for $\bar{r}_2$. The value of $\rho$ lies between $-1$ and $+1$, with +sign indicating the prevalent direction of the association: $\rho > 0$ +($\rho < 0$) means that one feature tends to increase (decrease) when +the other feature increases. 
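+</p>
+
+<p>For small examples, Spearman’s rank correlation coefficient can also be
+computed directly from its definition, as in the following sketch (illustrative
+only; it uses plain average ranks rather than the count-based implementation
+described above, and the second ordinal column is hypothetical).</p>
+
+<pre><code># Illustrative sketch (assumes NumPy and SciPy); b is a hypothetical ordinal column.
+import numpy as np
+from scipy.stats import rankdata
+
+a = np.array([15, 11, 26, 15, 8])     # example vector from the text
+b = np.array([1, 2, 2, 3, 1])
+
+ra = rankdata(a)                      # (3.5, 2, 5, 3.5, 1): ties receive their average rank
+rb = rankdata(b)
+
+da, db = ra - ra.mean(), rb - rb.mean()
+rho = (da * db).sum() / np.sqrt((da ** 2).sum() * (db ** 2).sum())
+# The same value is returned by scipy.stats.spearmanr(a, b).correlation.
+</code></pre>
+
+<p>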
<h3 id="returns-1">Returns</h3> + +
<p>A collection of (potentially) 4 matrices. Each matrix contains bivariate +statistics that resulted from a different combination of feature types. +There is one matrix for scale-scale statistics (which includes +Pearson’s correlation coefficient), one for nominal-nominal +statistics (includes Pearson’s $\chi^2$), one for +nominal-scale statistics (includes the $F$ statistic) and one +for ordinal-ordinal statistics (includes Spearman’s rank +correlation coefficient). If any of these matrices is not +produced, then no pair of columns required the corresponding type +combination. See +<a href="algorithms-descriptive-statistics.html#table2"><strong>Table 2</strong></a> +for the matrix naming and format +details.</p> + +
<hr /> + +
<h2 id="stratified-bivariate-statistics">1.3. Stratified Bivariate Statistics</h2> + +
<h3 id="description-2">Description</h3> + +
<p>The <code>stratstats.dml</code> script computes common bivariate +statistics, such as correlation, slope, and their p-values, in parallel +for many pairs of input variables in the presence of a confounding +categorical variable. The values of this confounding variable group the +records into strata (subpopulations), in which all bivariate pairs are +assumed free of confounding. The script uses the same data model as in +one-way analysis of covariance (ANCOVA), with strata representing +population samples. It also outputs univariate stratified and bivariate +unstratified statistics.</p> + +
<p>To see how data stratification mitigates confounding, consider an +(artificial) example in +<a href="algorithms-descriptive-statistics.html#table3"><strong>Table 3</strong></a>. A highly seasonal +retail item was marketed with and without a promotion over the final +3 months of the year. In each month a sale was more likely with the +promotion than without it. But during the peak holiday season, when +shoppers came in greater numbers and bought the item more often, the +promotion was less frequently used. As a result, if the 4-th quarter +data is pooled together, the promotion’s effect becomes reversed and +magnified. Stratifying by month restores the positive correlation.</p> + +
<p>The script computes its statistics in parallel over all possible pairs +from two specified sets of covariates. The 1-st covariate is a column in +input matrix $X$ and the 2-nd covariate is a column in input matrix $Y$; +matrices $X$ and $Y$ may be the same or different. The columns of +interest are given by their index numbers in special matrices. The +stratum column, specified in its own matrix, is the same for all +covariate pairs.</p> + +
<p>Both covariates in each pair must be numerical, with the 2-nd covariate +normally distributed given the 1-st covariate (see Details). Missing +covariate values or strata are represented by “NaN”. Records with NaNs +are selectively omitted wherever their NaNs are material to the output +statistic.</p>
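<p>The reversal illustrated in <a href="algorithms-descriptive-statistics.html#table3"><strong>Table 3</strong></a> below can be verified directly from the numbers in the table. The short Python sketch that follows (illustrative only, with the table values hard-coded) recomputes the per-month effect of $+0.1$ and the pooled effect of $-0.5$, using the customer counts as weights:</p>

<pre><code># Recompute the promotion effect from Table 3: tuples are (no promotion, promotion)
customers = {"Oct": (0.6, 1.4), "Nov": (1.4, 0.6), "Dec": (3.0, 1.0)}  # millions
avg_sales = {"Oct": (0.4, 0.5), "Nov": (0.9, 1.0), "Dec": (2.5, 2.6)}  # per 1000

# Within each month (stratum), the promotion adds +0.1 to average sales
per_month_effect = {m: avg_sales[m][1] - avg_sales[m][0] for m in avg_sales}

# Pooled over Oct-Dec, each month weighted by its customer count
def pooled(group):                       # group: 0 = no promotion, 1 = promotion
    w = sum(customers[m][group] for m in customers)
    return sum(customers[m][group] * avg_sales[m][group] for m in customers) / w

pooled_effect = pooled(1) - pooled(0)    # 1.3 - 1.8 = -0.5: sign reversed
</code></pre>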
<hr /> + +
<p><a name="table3"></a> +<strong>Table 3</strong>: Stratification example: the effect of the promotion on average sales +becomes reversed and amplified (from $+0.1$ to $-0.5$) if we ignore the months.</p> + +
<table> + <thead> + <tr> + <th>Month</th> + <th colspan="2">Oct</th> + <th colspan="2">Nov</th> + <th colspan="2">Dec</th> + <th colspan="2">Oct - Dec</th> + </tr> + </thead> + <tbody> + <tr> + <td>Customers (millions)</td> + <td>0.6</td> + <td>1.4</td> + <td>1.4</td> + <td>0.6</td> + <td>3.0</td> + <td>1.0</td> + <td>5.0</td> + <td>3.0</td> + </tr> + <tr> + <td>Promotions (0 or 1)</td> + <td>0</td> + <td>1</td> + <td>0</td> + <td>1</td> + <td>0</td> + <td>1</td> + <td>0</td> + <td>1</td> + </tr> + <tr> + <td>Avg sales per 1000</td> + <td>0.4</td> + <td>0.5</td> + <td>0.9</td> + <td>1.0</td> + <td>2.5</td> + <td>2.6</td> + <td>1.8</td> + <td>1.3</td> + </tr> + </tbody> +</table> + +
<hr /> + +
<h3 id="usage-2">Usage</h3> + +
<div class="codetabs"> +<div data-lang="Hadoop"> + <pre><code>hadoop jar SystemML.jar -f stratstats.dml + -nvargs X=<file> + Xcid=[file] + Y=[file] + Ycid=[file] + S=[file] + Scid=[int] + O=<file> + fmt=[format] +</code></pre> + </div> +<div data-lang="Spark"> + <pre><code>$SPARK_HOME/bin/spark-submit --master yarn + --deploy-mode cluster + --conf spark.driver.maxResultSize=0 + SystemML.jar + -f stratstats.dml + -config SystemML-config.xml + -exec hybrid_spark + -nvargs X=<file> + Xcid=[file] + Y=[file] + Ycid=[file] + S=[file] + Scid=[int] + O=<file> + fmt=[format] +</code></pre> + </div> +</div> + +
<h3 id="arguments-2">Arguments</h3> + +
<p><strong>X</strong>: Location (on HDFS) to read matrix $X$ whose columns we want to use as +the 1-st covariate (i.e. as the feature variable)</p> + +
<p><strong>Xcid</strong>: (default: <code>" "</code>) Location to read the single-row matrix that lists all index +numbers of the $X$-columns used as the 1-st covariate; the default value +means “use all $X$-columns”</p> + +
<p><strong>Y</strong>: (default: <code>" "</code>) Location to read matrix $Y$ whose columns we want to use as +the 2-nd covariate (i.e. as the response variable); the default value +means “use $X$ in place of $Y$”</p> + +
<p><strong>Ycid</strong>: (default: <code>" "</code>) Location to read the single-row matrix that lists all index +numbers of the $Y$-columns used as the 2-nd covariate; the default value +means “use all $Y$-columns”</p> + +
<p><strong>S</strong>: (default: <code>" "</code>) Location to read matrix $S$ that has the stratum column. +Note: the stratum column must contain small positive integers; all +fractional values are rounded; stratum IDs of value ${\leq}\,0$ or NaN +are treated as missing.
The default value for S means “use +$X$ in place of $S$”</p> + +<p><strong>Scid</strong>: (default: <code>1</code>) The index number of the stratum column in $S$</p> + +<p><strong>O</strong>: Location to store the output matrix defined in +<a href="algorithms-descriptive-statistics.html#table4"><strong>Table 4</strong></a>.</p> + +<p><strong>fmt</strong>: (default: <code>"text"</code>) Matrix file output format, such as <code>text</code>, +<code>mm</code>, or <code>csv</code>; see read/write functions in +SystemML Language Reference for details.</p> + +<hr /> + +<p><a name="table4"></a> +<strong>Table 4</strong>: The <code>stratstats.dml</code> output matrix has one row per each distinct pair of 1-st and 2-nd covariates, and 40 columns with the statistics described here.</p> + +<table> + <thead> + <tr> + <th> </th> + <th>Col</th> + <th>Meaning</th> + <th> </th> + <th>Col</th> + <th>Meaning</th> + </tr> + </thead> + <tbody> + <tr> + <td rowspan="9">1-st covariate</td> + <td>01</td> + <td>$X$-column number</td> + <td rowspan="9">2-nd covariate</td> + <td>11</td> + <td>$Y$-column number</td> + </tr> + <tr> + <td>02</td> + <td>presence count for $x$</td> + <td>12</td> + <td>presence count for $y$</td> + </tr> + <tr> + <td>03</td> + <td>global mean $(x)$</td> + <td>13</td> + <td>global mean $(y)$</td> + </tr> + <tr> + <td>04</td> + <td>global std. dev. $(x)$</td> + <td>14</td> + <td>global std. dev. $(y)$</td> + </tr> + <tr> + <td>05</td> + <td>stratified std. dev. $(x)$</td> + <td>15</td> + <td>stratified std. dev. $(y)$</td> + </tr> + <tr> + <td>06</td> + <td>$R^2$ for $x \sim $ strata</td> + <td>16</td> + <td>$R^2$ for $y \sim $ strata</td> + </tr> + <tr> + <td>07</td> + <td>adjusted $R^2$ for $x \sim $ strata</td> + <td>17</td> + <td>adjusted $R^2$ for $y \sim $ strata</td> + </tr> + <tr> + <td>08</td> + <td>p-value, $x \sim $ strata</td> + <td>18</td> + <td>p-value, $y \sim $ strata</td> + </tr> + <tr> + <td>09 - 10</td> + <td>reserved</td> + <td>19 - 20</td> + <td>reserved</td> + </tr> + <tr> + <td rowspan="10">$y \sim x$, NO strata</td> + <td>21</td> + <td>presence count $(x, y)$</td> + <td rowspan="10">$y \sim x$ AND strata</td> + <td>31</td> + <td>presence count $(x, y, s)$</td> + </tr> + <tr> + <td>22</td> + <td>regression slope</td> + <td>32</td> + <td>regression slope</td> + </tr> + <tr> + <td>23</td> + <td>regres. slope std. dev.</td> + <td>33</td> + <td>regres. slope std. dev.</td> + </tr> + <tr> + <td>24</td> + <td>correlation $= \pm\sqrt{R^2}$</td> + <td>34</td> + <td>correlation $= \pm\sqrt{R^2}$</td> + </tr> + <tr> + <td>25</td> + <td>residual std. dev.</td> + <td>35</td> + <td>residual std. 
dev.</td> + </tr> + <tr> + <td>26</td> + <td>$R^2$ in $y$ due to $x$</td> + <td>36</td> + <td>$R^2$ in $y$ due to $x$</td> + </tr> + <tr> + <td>27</td> + <td>adjusted $R^2$ in $y$ due to $x$</td> + <td>37</td> + <td>adjusted $R^2$ in $y$ due to $x$</td> + </tr> + <tr> + <td>28</td> + <td>p-value for "slope = 0"</td> + <td>38</td> + <td>p-value for "slope = 0"</td> + </tr> + <tr> + <td>29</td> + <td>reserved</td> + <td>39</td> + <td># strata with ${\geq}\,2$ count</td> + </tr> + <tr> + <td>30</td> + <td>reserved</td> + <td>40</td> + <td>reserved</td> + </tr> + </tbody> +</table> + +<hr /> + +<h3 id="examples-2">Examples</h3> + +<div class="codetabs"> +<div data-lang="Hadoop"> + <pre><code>hadoop jar SystemML.jar -f stratstats.dml + -nvargs X=/user/ml/X.mtx + Xcid=/user/ml/Xcid.mtx + Y=/user/ml/Y.mtx + Ycid=/user/ml/Ycid.mtx + S=/user/ml/S.mtx + Scid=2 + O=/user/ml/Out.mtx + fmt=csv +</code></pre> + </div> +<div data-lang="Spark"> + <pre><code>$SPARK_HOME/bin/spark-submit --master yarn + --deploy-mode cluster + --conf spark.driver.maxResultSize=0 + SystemML.jar + -f stratstats.dml + -config SystemML-config.xml + -exec hybrid_spark + -nvargs X=/user/ml/X.mtx + Xcid=/user/ml/Xcid.mtx + Y=/user/ml/Y.mtx + Ycid=/user/ml/Ycid.mtx + S=/user/ml/S.mtx + Scid=2 + O=/user/ml/Out.mtx + fmt=csv +</code></pre> + </div> +</div> + +<div class="codetabs"> +<div data-lang="Hadoop"> + <pre><code>hadoop jar SystemML.jar -f stratstats.dml + -nvargs X=/user/ml/Data.mtx + Xcid=/user/ml/Xcid.mtx + Ycid=/user/ml/Ycid.mtx + Scid=7 + O=/user/ml/Out.mtx +</code></pre> + </div> +<div data-lang="Spark"> + <pre><code>$SPARK_HOME/bin/spark-submit --master yarn + --deploy-mode cluster + --conf spark.driver.maxResultSize=0 + SystemML.jar + -f stratstats.dml + -config SystemML-config.xml + -exec hybrid_spark + -nvargs X=/user/ml/Data.mtx + Xcid=/user/ml/Xcid.mtx + Ycid=/user/ml/Ycid.mtx + Scid=7 + O=/user/ml/Out.mtx +</code></pre> + </div> +</div> + +<h3 id="details-2">Details</h3> + +<p>Suppose we have $n$ records of format $(i, x, y)$, where +<script type="math/tex">i\in\{1,\ldots, k\}</script> is a stratum number and $(x, y)$ are two numerical +covariates. We want to analyze conditional linear relationship between +$y$ and $x$ conditioned by $i$. Note that $x$, but not $y$, may +represent a categorical variable if we assign a numerical value to each +category, for example 0 and 1 for two categories.</p> + +<p>We assume a linear regression model for $y$:</p> + +<script type="math/tex; mode=display">\begin{equation} +y_{i,j} \,=\, \alpha_i + \beta x_{i,j} + {\varepsilon}_{i,j}\,, \quad\textrm{where}\,\,\,\, +{\varepsilon}_{i,j} \sim Normal(0, \sigma^2) +\end{equation}</script> + +<p>Here $i = 1\ldots k$ is a stratum number and +$j = 1\ldots n_i$ is a record number in stratum $i$; by $n_i$ we denote +the number of records available in stratum $i$. The noise +term <script type="math/tex">\varepsilon_{i,j}</script> is assumed to have the same variance in all +strata. 
When $n_i\,{>}\,0$, we can estimate the means of <script type="math/tex">x_{i, j}</script> and +<script type="math/tex">y_{i, j}</script> in stratum $i$ as</p> + +<script type="math/tex; mode=display">\bar{x}_i \,= \Big(\sum\nolimits_{j=1}^{n_i} \,x_{i, j}\Big) / n_i\,;\quad +\bar{y}_i \,= \Big(\sum\nolimits_{j=1}^{n_i} \,y_{i, j}\Big) / n_i</script> + +<p>If +$\beta$ is known, the best estimate for $\alpha_i$ is +$\bar{y}_i - \beta \bar{x}_i$, which gives the prediction error +sum-of-squares of</p> + +<script type="math/tex; mode=display">\begin{equation} +\sum\nolimits_{i=1}^k \sum\nolimits_{j=1}^{n_i} \big(y_{i,j} - \beta x_{i,j} - (\bar{y}_i - \beta \bar{x}_i)\big)^2 +\,\,=\,\, \beta^{2\,}V_x \,-\, 2\beta \,V_{x,y} \,+\, V_y +\label{eqn:stratsumsq} +\end{equation}</script> + +<p>where $V_x$, $V_y$, and $V_{x, y}$ are, +correspondingly, the “stratified” sample estimates of variance +$Var(x)$ and +$Var(y)$ and covariance +$Cov(x,y)$ computed as</p> + +<script type="math/tex; mode=display">% <![CDATA[ +\begin{aligned} +V_x \,&=\, \sum\nolimits_{i=1}^k \sum\nolimits_{j=1}^{n_i} \big(x_{i,j} - \bar{x}_i\big)^2; \quad +V_y \,=\, \sum\nolimits_{i=1}^k \sum\nolimits_{j=1}^{n_i} \big(y_{i,j} - \bar{y}_i\big)^2;\\ +V_{x,y} \,&=\, \sum\nolimits_{i=1}^k \sum\nolimits_{j=1}^{n_i} \big(x_{i,j} - \bar{x}_i\big)\big(y_{i,j} - \bar{y}_i\big)\end{aligned} %]]></script> + +<p>They are stratified because we compute the sample (co-)variances in each +stratum $i$ separately, then combine by summation. The stratified +estimates for $Var(X)$ and +$Var(Y)$ tend to be smaller +than the non-stratified ones (with the global mean instead of +$\bar{x_i}$ and $\bar{y_i}$) since $\bar{x_i}$ and $\bar{y_i}$ fit +closer to <script type="math/tex">x_{i,j}</script> and <script type="math/tex">y_{i,j}</script> than the global means. The stratified +variance estimates the uncertainty in <script type="math/tex">x_{i,j}</script> and <script type="math/tex">y_{i,j}</script> given +their stratum $i$.</p> + +<p>Minimizing over $\beta$ the error sum-of-squares (2) +gives us the regression slope estimate $\hat{\beta} = V_{x,y} / V_x$, +with (2) becoming the residual sum-of-squares (RSS):</p> + +<script type="math/tex; mode=display">\mathrm{RSS} \,\,=\, \, +\sum\nolimits_{i=1}^k \sum\nolimits_{j=1}^{n_i} \big(y_{i,j} - +\hat{\beta} x_{i,j} - (\bar{y}_i - \hat{\beta} \bar{x}_i)\big)^2 +\,\,=\,\, V_y \,\big(1 \,-\, V_{x,y}^2 / (V_x V_y)\big)</script> + +<p>The quantity +<script type="math/tex">\hat{R}^2 = V_{x,y}^2 / (V_x V_y)</script>, called <em>$R$-squared</em>, estimates the +fraction of stratified variance in <script type="math/tex">y_{i,j}</script> explained by covariate +<script type="math/tex">x_{i, j}</script> in the linear regression model (1). We +define <em>stratified correlation</em> as the square root of $\hat{R}^2$ taken +with the sign of $V_{x,y}$. 
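<p>For concreteness, here is a minimal Python/NumPy sketch of these stratified estimates (an illustration of the formulas above, not the <code>stratstats.dml</code> implementation; the function name is ours):</p>

<pre><code># Illustrative sketch: stratified variance, covariance, slope and correlation.
import numpy as np

def stratified_estimates(strata, x, y):
    strata = np.asarray(strata)              # positive integer stratum IDs
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    Vx = Vy = Vxy = 0.0
    for s in np.unique(strata):
        sel = strata == s
        dx = x[sel] - x[sel].mean()          # deviations from the stratum mean
        dy = y[sel] - y[sel].mean()
        Vx += np.sum(dx * dx)
        Vy += np.sum(dy * dy)
        Vxy += np.sum(dx * dy)
    beta_hat = Vxy / Vx                      # stratified regression slope
    rss = Vy * (1.0 - Vxy ** 2 / (Vx * Vy))  # residual sum-of-squares
    r_hat = np.sign(Vxy) * np.sqrt(Vxy ** 2 / (Vx * Vy))  # stratified correlation
    return beta_hat, rss, r_hat
</code></pre>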
We also use RSS to estimate the residual +standard deviation $\sigma$ in (1) that models the +prediction error of <script type="math/tex">y_{i,j}</script> given <script type="math/tex">x_{i,j}</script> and the stratum:</p> + +<script type="math/tex; mode=display">\hat{\beta}\, =\, \frac{V_{x,y}}{V_x}; \,\,\,\, \hat{R} \,=\, \frac{V_{x,y}}{\sqrt{V_x V_y}}; +\,\,\,\, \hat{R}^2 \,=\, \frac{V_{x,y}^2}{V_x V_y}; +\,\,\,\, \hat{\sigma} \,=\, \sqrt{\frac{\mathrm{RSS}}{n - k - 1}}\,\,\,\, +\Big(n = \sum_{i=1}^k n_i\Big)</script> + +<p>The $t$-test and the $F$-test for the null-hypothesis of “$\beta = 0$” +are obtained by considering the effect of $\hat{\beta}$ on the residual +sum-of-squares, measured by the decrease from $V_y$ to RSS. The +$F$-statistic is the ratio of the “explained” sum-of-squares to the +residual sum-of-squares, divided by their corresponding degrees of +freedom. There are $n\,{-}\,k$ degrees of freedom for $V_y$, parameter +$\beta$ reduces that to $n\,{-}\,k\,{-}\,1$ for RSS, and their +difference $V_y - {}$RSS has just 1 degree of freedom:</p> + +<script type="math/tex; mode=display">F \,=\, \frac{(V_y - \mathrm{RSS})/1}{\mathrm{RSS}/(n\,{-}\,k\,{-}\,1)} +\,=\, \frac{\hat{R}^2\,(n\,{-}\,k\,{-}\,1)}{1-\hat{R}^2}; \quad +t \,=\, \hat{R}\, \sqrt{\frac{n\,{-}\,k\,{-}\,1}{1-\hat{R}^2}}.</script> + +<p>The +$t$-statistic is simply the square root of the $F$-statistic with the +appropriate choice of sign. If the null hypothesis and the linear model +are both true, the $t$-statistic has Student $t$-distribution with +$n\,{-}\,k\,{-}\,1$ degrees of freedom. We can also compute it if we +divide $\hat{\beta}$ by its estimated standard deviation:</p> + +<script type="math/tex; mode=display">st.dev(\hat{\beta})_{\mathrm{est}} \,=\, \hat{\sigma}\,/\sqrt{V_x} \quad\Longrightarrow\quad +t \,=\, \hat{R}\sqrt{V_y} \,/\, \hat{\sigma} \,=\, \beta \,/\, st.dev(\hat{\beta})_{\mathrm{est}}</script> + +<p>The standard deviation estimate for $\beta$ is included in +<code>stratstats.dml</code> output.</p> + +<h3 id="returns-2">Returns</h3> + +<p>The output matrix format is defined in +<a href="algorithms-descriptive-statistics.html#table4"><strong>Table 4</strong></a>.</p> + + + + </div> <!-- /container --> + + + + <script src="js/vendor/jquery-1.12.0.min.js"></script> + <script src="js/vendor/bootstrap.min.js"></script> + <script src="js/vendor/anchor.min.js"></script> + <script src="js/main.js"></script> + + + + + + <!-- Analytics --> + <script> + (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){ + (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o), + m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m) + })(window,document,'script','//www.google-analytics.com/analytics.js','ga'); + ga('create', 'UA-71553733-1', 'auto'); + ga('send', 'pageview'); + </script> + + + + <!-- MathJax Section --> + <script type="text/x-mathjax-config"> + MathJax.Hub.Config({ + TeX: { equationNumbers: { autoNumber: "AMS" } } + }); + </script> + <script> + // Note that we load MathJax this way to work with local file (file://), HTTP and HTTPS. + // We could use "//cdn.mathjax...", but that won't support "file://". 
+ (function(d, script) { + script = d.createElement('script'); + script.type = 'text/javascript'; + script.async = true; + script.onload = function(){ + MathJax.Hub.Config({ + tex2jax: { + inlineMath: [ ["$", "$"], ["\\\\(","\\\\)"] ], + displayMath: [ ["$$","$$"], ["\\[", "\\]"] ], + processEscapes: true, + skipTags: ['script', 'noscript', 'style', 'textarea', 'pre'] + } + }); + }; + script.src = ('https:' == document.location.protocol ? 'https://' : 'http://') + + 'cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML'; + d.getElementsByTagName('head')[0].appendChild(script); + }(document)); + </script> + </body> +</html>
