This is an automated email from the ASF dual-hosted git repository.

srowen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 9a30e23  [SPARK-26351][MLLIB] Update doc and minor correction in the mllib evaluation metrics
9a30e23 is described below

commit 9a30e23211e165a44acc0dbe19693950f7a7cc73
Author: Shahid <shahidk...@gmail.com>
AuthorDate: Sun Jan 20 18:11:14 2019 -0600

    [SPARK-26351][MLLIB] Update doc and minor correction in the mllib evaluation metrics
    
    ## What changes were proposed in this pull request?
    Currently, there are some minor inconsistencies between the docs and the code. This PR corrects those inconsistencies:
    1) Links related to the evaluation metrics in the docs are not working
    2) Minor corrections to the evaluation metrics formulas in the docs
    
    ## How was this patch tested?
    
    NA
    
    Closes #23589 from shahidki31/docCorrection.
    
    Authored-by: Shahid <shahidk...@gmail.com>
    Signed-off-by: Sean Owen <sean.o...@databricks.com>
---
 docs/mllib-evaluation-metrics.md                   | 22 +++++++++++-----------
 .../spark/mllib/evaluation/RankingMetrics.scala    |  2 ++
 2 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/docs/mllib-evaluation-metrics.md b/docs/mllib-evaluation-metrics.md
index c65ecdc..896d95b 100644
--- a/docs/mllib-evaluation-metrics.md
+++ b/docs/mllib-evaluation-metrics.md
@@ -413,13 +413,13 @@ A ranking system usually deals with a set of $M$ users
 
 $$U = \left\{u_0, u_1, ..., u_{M-1}\right\}$$
 
-Each user ($u_i$) having a set of $N$ ground truth relevant documents
+Each user ($u_i$) having a set of $N_i$ ground truth relevant documents
 
-$$D_i = \left\{d_0, d_1, ..., d_{N-1}\right\}$$
+$$D_i = \left\{d_0, d_1, ..., d_{N_i-1}\right\}$$
 
-And a list of $Q$ recommended documents, in order of decreasing relevance
+And a list of $Q_i$ recommended documents, in order of decreasing relevance
 
-$$R_i = \left[r_0, r_1, ..., r_{Q-1}\right]$$
+$$R_i = \left[r_0, r_1, ..., r_{Q_i-1}\right]$$
 
 The goal of the ranking system is to produce the most relevant set of documents for each user. The relevance of the
 sets and the effectiveness of the algorithms can be measured using the metrics listed below.
@@ -439,10 +439,10 @@ $$rel_D(r) = \begin{cases}1 & \text{if $r \in D$}, \\ 0 & \text{otherwise}.\end{
         Precision at k
       </td>
       <td>
-        $p(k)=\frac{1}{M} \sum_{i=0}^{M-1} {\frac{1}{k} \sum_{j=0}^{\text{min}(\left|D\right|, k) - 1} rel_{D_i}(R_i(j))}$
+        $p(k)=\frac{1}{M} \sum_{i=0}^{M-1} {\frac{1}{k} \sum_{j=0}^{\text{min}(Q_i, k) - 1} rel_{D_i}(R_i(j))}$
       </td>
       <td>
-        <a href="https://en.wikipedia.org/wiki/Information_retrieval#Precision_at_K">Precision at k</a> is a measure of
+        <a href="https://en.wikipedia.org/wiki/Evaluation_measures_(information_retrieval)#Precision_at_K">Precision at k</a> is a measure of
          how many of the first k recommended documents are in the set of true relevant documents averaged across all
          users. In this metric, the order of the recommendations is not taken into account.
       </td>
@@ -450,10 +450,10 @@ $$rel_D(r) = \begin{cases}1 & \text{if $r \in D$}, \\ 0 & \text{otherwise}.\end{
     <tr>
       <td>Mean Average Precision</td>
       <td>
-        $MAP=\frac{1}{M} \sum_{i=0}^{M-1} {\frac{1}{\left|D_i\right|} \sum_{j=0}^{Q-1} \frac{rel_{D_i}(R_i(j))}{j + 1}}$
+        $MAP=\frac{1}{M} \sum_{i=0}^{M-1} {\frac{1}{N_i} \sum_{j=0}^{Q_i-1} \frac{rel_{D_i}(R_i(j))}{j + 1}}$
       </td>
       <td>
-        <a href="https://en.wikipedia.org/wiki/Information_retrieval#Mean_average_precision">MAP</a> is a measure of how
+        <a href="https://en.wikipedia.org/wiki/Evaluation_measures_(information_retrieval)#Mean_average_precision">MAP</a> is a measure of how
          many of the recommended documents are in the set of true relevant documents, where the
         order of the recommendations is taken into account (i.e. penalty for highly relevant documents is higher).
       </td>
@@ -462,10 +462,10 @@ $$rel_D(r) = \begin{cases}1 & \text{if $r \in D$}, \\ 0 & \text{otherwise}.\end{
       <td>Normalized Discounted Cumulative Gain</td>
       <td>
         $NDCG(k)=\frac{1}{M} \sum_{i=0}^{M-1} {\frac{1}{IDCG(D_i, k)}\sum_{j=0}^{n-1}
-          \frac{rel_{D_i}(R_i(j))}{\text{ln}(j+2)}} \\
+          \frac{rel_{D_i}(R_i(j))}{\text{log}(j+2)}} \\
         \text{Where} \\
-        \hspace{5 mm} n = \text{min}\left(\text{max}\left(|R_i|,|D_i|\right),k\right) \\
-        \hspace{5 mm} IDCG(D, k) = \sum_{j=0}^{\text{min}(\left|D\right|, k) - 1} \frac{1}{\text{ln}(j+2)}$
+        \hspace{5 mm} n = \text{min}\left(\text{max}\left(Q_i, N_i\right),k\right) \\
+        \hspace{5 mm} IDCG(D, k) = \sum_{j=0}^{\text{min}(\left|D\right|, k) - 1} \frac{1}{\text{log}(j+2)}$
       </td>
       <td>
         <a href="https://en.wikipedia.org/wiki/Discounted_cumulative_gain#Normalized_DCG">NDCG at k</a> is a
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RankingMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RankingMetrics.scala
index b98aa05..4935d11 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RankingMetrics.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/RankingMetrics.scala
@@ -138,6 +138,8 @@ class RankingMetrics[T: ClassTag](predictionAndLabels: RDD[(Array[T], Array[T])]
         var dcg = 0.0
         var i = 0
         while (i < n) {
+          // Base of the log doesn't matter for calculating NDCG,
+          // if the relevance value is binary.
           val gain = 1.0 / math.log(i + 2)
           if (i < pred.length && labSet.contains(pred(i))) {
             dcg += gain
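
The point of the new comment can be sanity-checked with a small standalone sketch (not the RankingMetrics implementation itself; the relevance vector is made up and, for brevity, this version truncates at min(|R|, k) rather than the doc's min(max(Q_i, N_i), k)). With binary relevance, changing the log base rescales DCG and IDCG by the same 1/ln(base) factor, so NDCG comes out the same:

// Simplified NDCG at k for one ranked list with binary relevance and a configurable log base.
def ndcgAt(rel: Seq[Double], k: Int, base: Double): Double = {
  def logB(x: Double): Double = math.log(x) / math.log(base)
  val n = math.min(rel.length, k)
  val dcg = (0 until n).map(j => rel(j) / logB(j + 2)).sum
  val idcg = (0 until math.min(rel.count(_ > 0.0), k)).map(j => 1.0 / logB(j + 2)).sum
  if (idcg == 0.0) 0.0 else dcg / idcg
}

val rel = Seq(1.0, 0.0, 1.0, 1.0, 0.0)     // made-up binary relevance judgments
val natural = ndcgAt(rel, 3, math.E)       // natural log, as used in RankingMetrics
val base2   = ndcgAt(rel, 3, 2.0)          // log base 2, as in many references
println(math.abs(natural - base2) < 1e-12) // true: the base cancels between DCG and IDCG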

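For reference, a minimal sketch of computing the three metrics documented above through the public RankingMetrics API (assumes a running SparkContext sc; the user/recommendation data below is invented):

import org.apache.spark.mllib.evaluation.RankingMetrics

// One (R_i, D_i) pair per user: ranked recommendations and the ground-truth relevant documents.
val predictionAndLabels = sc.parallelize(Seq(
  (Array(1, 6, 2, 7, 8), Array(1, 2, 3, 4, 5)),
  (Array(4, 1, 5, 6, 2), Array(1, 2, 3)),
  (Array(7, 8, 9),       Array(1, 2))
))

val metrics = new RankingMetrics(predictionAndLabels)
println(metrics.precisionAt(5))        // p(5), averaged over all users
println(metrics.meanAveragePrecision)  // MAP
println(metrics.ndcgAt(5))             // NDCG(5)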

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org
