Chelsyx has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/346461 )

Change subject: Implement the wiki/language selector in more search dashboards
......................................................................

Implement the wiki/language selector in more search dashboards

Three new dashboards are added:
- CTR by Language/Project
- Events by Language/Project
- PaulScore by Language/Project

Bug: T150410
Change-Id: Ie04762d747a9dcbec1564d8945f8949ed8c52adc
---
M server.R
A tab_documentation/desktop_events_langproj.md
A tab_documentation/kpi_ctr_langproj.md
A tab_documentation/paulscore_langproj.html
M ui.R
M utils.R
6 files changed, 520 insertions(+), 5 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/wikimedia/discovery/rainbow 
refs/changes/61/346461/1

diff --git a/server.R b/server.R
index 5ec500e..79a7846 100644
--- a/server.R
+++ b/server.R
@@ -26,6 +26,7 @@
     read_failures(existing_date)
     progress$set(message = "Downloading engagement data", value = 0.7)
     read_augmented_clickthrough()
+    read_augmented_clickthrough_langproj()
     progress$set(message = "Downloading survival data", value = 0.8)
     read_lethal_dose()
     progress$set(message = "Downloading PaulScore data", value = 0.9)
@@ -877,6 +878,191 @@
       dyEvent(as.Date("2017-01-01"), "R (reportupdater)", labelLoc = "bottom")
   })
 
+  output$ctr_language_selector_container <- renderUI({
+    if (input$ctr_language_order == "alphabet") {
+      languages_to_display <- as.list(sort(available_languages_ctr$language))
+      names(languages_to_display) <- 
available_languages_ctr$label[order(available_languages_ctr$language)]
+    } else {
+      languages_to_display <- available_languages_ctr$language
+      names(languages_to_display) <- available_languages_ctr$label
+    }
+
+    # e.g. if user sorts projects alphabetically and the selected project is 
"10th Anniversary of Wikipedia"
+    #      then automatically select the language "(None)" to avoid giving 
user an error. This also works if
+    #      the user selects a project that is not multilingual, so this 
automatically chooses the "(None)"
+    #      option for the user.
+    if (any(input$ctr_project_selector %in% 
projects_db$project[!projects_db$multilingual])) {
+      if (any(input$ctr_project_selector %in% 
projects_db$project[projects_db$multilingual])) {
+        if (!is.null(input$ctr_language_selector)) {
+          selected_language <- union("(None)", input$ctr_language_selector)
+        } else {
+          selected_language <- c("(None)", languages_to_display[[1]])
+        }
+      } else {
+        selected_language <- "(None)"
+      }
+    } else {
+      if (!is.null(input$ctr_language_selector)) {
+        selected_language <- input$ctr_language_selector
+      } else {
+        selected_language <- languages_to_display[[1]]
+      }
+    }
+    return(selectInput("ctr_language_selector", "Language", multiple = 
TRUE,selectize = FALSE, size = 19,
+                       choices = languages_to_display, selected = 
selected_language))
+  })
+
+  output$ctr_project_selector_container <- renderUI({
+    if (input$ctr_project_order == "alphabet") {
+      projects_to_display <- as.list(sort(available_projects_ctr$project))
+      names(projects_to_display) <- 
available_projects_ctr$label[order(available_projects_ctr$project)]
+    } else {
+      projects_to_display <- available_projects_ctr$project
+      names(projects_to_display) <- available_projects_ctr$label
+    }
+    return(selectInput("ctr_project_selector", "Project", multiple = 
TRUE,selectize = FALSE, size = 19,
+                       choices = projects_to_display, selected = 
projects_to_display[[1]]))
+  })
+
+  output$kpi_ctr_langproj_plot <- renderDygraph({
+    augmented_clickthroughs_langproj %>%
+      kpi_ctr_aggregate_wikis(input$ctr_language_selector, 
input$ctr_project_selector) %>%
+      dplyr::select_(.dots=c("date", "wiki", 
paste0("`",input$ctr_metrics,"`"))) %>%
+      tidyr::spread_(., key_col="wiki", value_col=input$ctr_metrics, fill=0) 
%>%
+      polloi::smoother(smooth_level = 
polloi::smooth_switch(input$smoothing_global, 
input$smoothing_kpi_ctr_langproj)) %>%
+      polloi::make_dygraph(xlab = "Date", ylab = input$ctr_metrics, title = 
paste0(input$ctr_metrics, ", by day")) %>%
+      dyAxis("y", axisLabelFormatter = "function(x) { return x + '%'; }", 
valueFormatter = "function(x) { return x + '%'; }") %>%
+      dyLegend(show = "always", width = 400, labelsDiv = 
"kpi_ctr_langproj_legend") %>%
+      dyAxis("x", axisLabelFormatter = polloi::custom_axis_formatter) %>%
+      dyRangeSelector(fillColor = "")
+  })
+
+  output$desktop_events_language_selector_container <- renderUI({
+    if (input$desktop_events_language_order == "alphabet") {
+      languages_to_display <- 
as.list(sort(available_languages_desktop$language))
+      names(languages_to_display) <- 
available_languages_desktop$label[order(available_languages_desktop$language)]
+    } else {
+      languages_to_display <- available_languages_desktop$language
+      names(languages_to_display) <- available_languages_desktop$label
+    }
+
+    # e.g. if user sorts projects alphabetically and the selected project is 
"10th Anniversary of Wikipedia"
+    #      then automatically select the language "(None)" to avoid giving 
user an error. This also works if
+    #      the user selects a project that is not multilingual, so this 
automatically chooses the "(None)"
+    #      option for the user.
+    if (any(input$desktop_events_project_selector %in% 
projects_db$project[!projects_db$multilingual])) {
+      if (any(input$desktop_events_project_selector %in% 
projects_db$project[projects_db$multilingual])) {
+        if (!is.null(input$desktop_events_language_selector)) {
+          selected_language <- union("(None)", 
input$desktop_events_language_selector)
+        } else {
+          selected_language <- c("(None)", languages_to_display[[1]])
+        }
+      } else {
+        selected_language <- "(None)"
+      }
+    } else {
+      if (!is.null(input$desktop_events_language_selector)) {
+        selected_language <- input$desktop_events_language_selector
+      } else {
+        selected_language <- languages_to_display[[1]]
+      }
+    }
+    return(selectInput("desktop_events_language_selector", "Language", 
multiple = TRUE,selectize = FALSE, size = 19,
+                       choices = languages_to_display, selected = 
selected_language))
+  })
+
+  output$desktop_events_project_selector_container <- renderUI({
+    if (input$desktop_events_project_order == "alphabet") {
+      projects_to_display <- as.list(sort(available_projects_desktop$project))
+      names(projects_to_display) <- 
available_projects_desktop$label[order(available_projects_desktop$project)]
+    } else {
+      projects_to_display <- available_projects_desktop$project
+      names(projects_to_display) <- available_projects_desktop$label
+    }
+    return(selectInput("desktop_events_project_selector", "Project", multiple 
= TRUE,selectize = FALSE, size = 19,
+                       choices = projects_to_display, selected = 
projects_to_display[[1]]))
+  })
+
+  output$desktop_events_langproj_plot <- renderDygraph({
+    desktop_langproj_dygraph_set %>%
+      desktop_events_aggregate_wikis(input$desktop_events_language_selector, 
input$desktop_events_project_selector) %>%
+      dplyr::select_(.dots=c("date", "wiki", 
paste0("`",input$desktop_event_type,"`"))) %>%
+      tidyr::spread_(., key_col="wiki", value_col=input$desktop_event_type, 
fill=0) %>%
+      polloi::smoother(smooth_level = 
polloi::smooth_switch(input$smoothing_global, 
input$smoothing_desktop_events_langproj)) %>%
+      polloi::make_dygraph(xlab = "Date", ylab = 
capitalize_first_letter(input$desktop_event_type), title = 
paste0(capitalize_first_letter(input$desktop_event_type), ", by day")) %>%
+      dyLegend(show = "always", width = 400, labelsDiv = 
"desktop_events_langproj_legend") %>%
+      dyAxis("x", axisLabelFormatter = polloi::custom_axis_formatter) %>%
+      dyRangeSelector(fillColor = "")
+  })
+
+  output$paulscore_language_selector_container <- renderUI({
+    if (input$paulscore_language_order == "alphabet") {
+      languages_to_display <- 
as.list(sort(available_languages_paulscore$language))
+      names(languages_to_display) <- 
available_languages_paulscore$label[order(available_languages_paulscore$language)]
+    } else {
+      languages_to_display <- available_languages_paulscore$language
+      names(languages_to_display) <- available_languages_paulscore$label
+    }
+
+    # e.g. if user sorts projects alphabetically and the selected project is 
"10th Anniversary of Wikipedia"
+    #      then automatically select the language "(None)" to avoid giving 
user an error. This also works if
+    #      the user selects a project that is not multilingual, so this 
automatically chooses the "(None)"
+    #      option for the user.
+    if (any(input$paulscore_project_selector %in% 
projects_db$project[!projects_db$multilingual])) {
+      if (any(input$paulscore_project_selector %in% 
projects_db$project[projects_db$multilingual])) {
+        if (!is.null(input$paulscore_language_selector)) {
+          selected_language <- union("(None)", 
input$paulscore_language_selector)
+        } else {
+          selected_language <- c("(None)", languages_to_display[[1]])
+        }
+      } else {
+        selected_language <- "(None)"
+      }
+    } else {
+      if (!is.null(input$paulscore_language_selector)) {
+        selected_language <- input$paulscore_language_selector
+      } else {
+        selected_language <- languages_to_display[[1]]
+      }
+    }
+    return(selectInput("paulscore_language_selector", "Language", multiple = 
TRUE,selectize = FALSE, size = 19,
+                       choices = languages_to_display, selected = 
selected_language))
+  })
+
+  output$paulscore_project_selector_container <- renderUI({
+    if (input$paulscore_project_order == "alphabet") {
+      projects_to_display <- 
as.list(sort(available_projects_paulscore$project))
+      names(projects_to_display) <- 
available_projects_paulscore$label[order(available_projects_paulscore$project)]
+    } else {
+      projects_to_display <- available_projects_paulscore$project
+      names(projects_to_display) <- available_projects_paulscore$label
+    }
+    return(selectInput("paulscore_project_selector", "Project", multiple = 
TRUE,selectize = FALSE, size = 19,
+                       choices = projects_to_display, selected = 
projects_to_display[[1]]))
+  })
+
+  output$paulscore_langproj_plot <- renderDygraph({
+    temp <- paulscore_fulltext_langproj
+    if (input$paulscore_relative_langproj) {
+      temp$`F = 0.1` <- temp$`F = 0.1` / (1/(1-0.1))
+      temp$`F = 0.5` <- temp$`F = 0.5` / (1/(1-0.5))
+      temp$`F = 0.9` <- temp$`F = 0.9` / (1/(1-0.9))
+    }
+    dyOut <- temp %>%
+      paulscore_aggregate_wikis(input$paulscore_language_selector, 
input$paulscore_project_selector) %>%
+      dplyr::select_(.dots=c("date", "wiki", 
paste0("`",input$paulscore_factor,"`"))) %>%
+      tidyr::spread_(., key_col="wiki", value_col=input$paulscore_factor, 
fill=0) %>%
+      polloi::smoother(smooth_level = 
polloi::smooth_switch(input$smoothing_global, 
input$smoothing_paulscore_langproj)) %>%
+      polloi::make_dygraph(xlab = "Date", ylab = "PaulScore", title = 
paste0("PaulScore for fulltext searches, ", input$paulscore_factor)) %>%
+      dyLegend(show = "always", width = 400, labelsDiv = 
"paulscore_langproj_legend") %>%
+      dyAxis("x", axisLabelFormatter = polloi::custom_axis_formatter) %>%
+      dyRangeSelector(fillColor = "")
+    if (input$paulscore_relative_langproj) {
+      dyOut <- dyAxis(dyOut, "y", axisLabelFormatter = "function(x) { return 
Math.round(100*x, 2) + '%'; }", valueFormatter = "function(x) { return 
Math.round(100*x, 2) + '%'; }")
+    }
+    return(dyOut)
+  })
+
   output$monthly_metrics_tbl <- DT::renderDataTable({
     temp <- data.frame(
       KPI = c("Load time", "Zero results rate", "API Usage", "User 
engagement"),
diff --git a/tab_documentation/desktop_events_langproj.md 
b/tab_documentation/desktop_events_langproj.md
new file mode 100644
index 0000000..1f33770
--- /dev/null
+++ b/tab_documentation/desktop_events_langproj.md
@@ -0,0 +1,31 @@
+Desktop Full-Text Search by Languages and Projects
+=======
+
+User actions that we track around search on the desktop website generally fall 
into three categories:
+
+1. The start of a user's search session;
+2. The presentation of a results page to the user; and
+3. A user clicking through to an article on the results page.
+
+These three things are tracked via the [EventLogging 'TestSearchSatisfaction2' 
schema](https://meta.wikimedia.org/wiki/Schema:TestSearchSatisfaction2) and 
stored to
+a database. The results are then aggregated and anonymised, and split out by 
language (e.g. English vs Russian) and project (e.g. Wikipedia vs Wiktionary) 
as presented on this page. For performance/privacy reasons we randomly sample 
what we store, so the actual numbers are a vast understatement of how many user 
actions our servers receive - what's more interesting is how they change over 
time. In the case of desktop search, this sampling rate is **0.1%**.
+
+Notes/Tips
+------
+* The percentages next to the language and project names represent the 
proportion of the total volume.
+* You can select multiple projects and multiple languages to compare 
simultaneously. (Hold down Ctrl on Windows or Command on Mac.)
+* The language picker will automatically choose "(None)" if you select a 
non-multilingual project such as Wikidata.
+* If you're interested in the overall metric for a multilingual project such 
as Wikipedia, make sure only "(None)" is selected in the languages picker.
+* Due to the high number of language-project combinations, we have restricted 
ourselves to only storing the last 30 days of data.
+
+Questions, bug reports, and feature suggestions
+------
+For technical, non-bug questions, [email 
Mikhail](mailto:mpo...@wikimedia.org?subject=Dashboard%20Question). If you 
experience a bug or notice something wrong or have a suggestion, [open a ticket 
in 
Phabricator](https://phabricator.wikimedia.org/maniphest/task/create/?projects=Discovery)
 in the Discovery board or [email 
Deb](mailto:d...@wikimedia.org?subject=Dashboard%20Question).
+
+<hr style="border-color: gray;">
+<p style="font-size: small; color: gray;">
+  <strong>Link to this dashboard:</strong>
+  <a href="http://discovery.wmflabs.org/metrics/#desktop_events_langproj">
+    http://discovery.wmflabs.org/metrics/#desktop_events_langproj
+  </a>
+</p>
diff --git a/tab_documentation/kpi_ctr_langproj.md 
b/tab_documentation/kpi_ctr_langproj.md
new file mode 100644
index 0000000..2094a7b
--- /dev/null
+++ b/tab_documentation/kpi_ctr_langproj.md
@@ -0,0 +1,24 @@
+User Engagement (Augmented Clickthroughs) by Languages and Projects
+=======
+
+This metric combines the clickthrough rate and the proportion of users' 
session dwell times exceeding the threshold of 10s, split out by language (e.g. 
English vs Russian) and project (e.g. Wikipedia vs Wiktionary).
+
+Notes/Tips
+------
+* The percentages next to the language and project names represent the 
proportion of the total volume.
+* You can select multiple projects and multiple languages to compare 
simultaneously. (Hold down Ctrl on Windows or Command on Mac.)
+* The language picker will automatically choose "(None)" if you select a 
non-multilingual project such as Wikidata.
+* If you're interested in the overall metric for a multilingual project such 
as Wikipedia, make sure only "(None)" is selected in the languages picker.
+* Due to the high number of language-project combinations, we have restricted 
ourselves to only storing the last 30 days of data.
+
+Questions, bug reports, and feature suggestions
+------
+For technical, non-bug questions, [email 
Mikhail](mailto:mpo...@wikimedia.org?subject=Dashboard%20Question). If you 
experience a bug or notice something wrong or have a suggestion, [open a ticket 
in 
Phabricator](https://phabricator.wikimedia.org/maniphest/task/create/?projects=Discovery)
 in the Discovery board or [email 
Deb](mailto:d...@wikimedia.org?subject=Dashboard%20Question).
+
+<hr style="border-color: gray;">
+<p style="font-size: small; color: gray;">
+  <strong>Link to this dashboard:</strong>
+  <a href="http://discovery.wmflabs.org/metrics/#kpi_ctr_langproj">
+    http://discovery.wmflabs.org/metrics/#kpi_ctr_langproj
+  </a>
+</p>
diff --git a/tab_documentation/paulscore_langproj.html 
b/tab_documentation/paulscore_langproj.html
new file mode 100644
index 0000000..f9db40d
--- /dev/null
+++ b/tab_documentation/paulscore_langproj.html
@@ -0,0 +1,43 @@
+<h1>PaulScore Approximations by Languages and Projects</h1>
+
+<div>
+<div style="float: left; width: 47%; padding: 0 3% 0 1%;">
+<p>"PaulScore" is the name we've given to a metric proposed by Paul Nelson in 
a talk he gave at <a 
href="https://www.youtube.com/watch?v=YJ_amC9gZmk&t=16m38s" title="Paul 
Nelson's talk at Elasticon">Elasticon</a>. We use PaulScore to evaluate the 
quality of results provided by CirrusSearch or proposed modifications to 
CirrusSearch, based on historical click data. A big advantage of the PaulScore 
is that it relies on user click history to award points, so it is easy to 
compute.</p>
+<p>This dashboard shows the PaulScore approximation for 3 values of $F$: 0.1, 
0.5, and 0.9, split out by language (e.g. English vs Russian) and project (e.g. 
Wikipedia vs Wiktionary). The maximum score possible for each value of $F$ is 
$1/(1-F)$, so the dashboard has the option of looking at relative PaulScores, 
which is the computed value divided by the maximum possible value for the given $F$.</p>
+<p>For more details, please see <a 
href="https://www.mediawiki.org/wiki/Wikimedia_Discovery/Search/Glossary#PaulScore"
 title="Definition of PaulScore">Discovery's Search glossary</a>.</p>
+</div>
+<div style="float: left; width: 46%; padding: 0 1% 0 3%;">
+<p>PaulScore is computed via the following steps:</p>
+<ol>
+<li>Pick scoring factor $0 < F < 1$.</li>
+<li>For $i$-th search session $S_i$ $(i=1, \ldots, n)$ containing $m$ queries 
$Q_1, \ldots, Q_m$ and search result sets $\mathbf{R}_1, \ldots, \mathbf{R}_m$: 
<ol>
+    <li>For each $j$-th search query $Q_j$ with result set $\mathbf{R}_j$, let 
$\nu_j$ be the query score: $$\nu_j=\sum_{k~\in~\{\text{0-based positions of 
clicked results in}~\mathbf{R}_j\}} F^k.$$</li>
+    <li>Let user's average query score $\bar{\nu}_{(i)}$ be 
$$\bar{\nu}_{(i)}=\frac{1}{m} \sum_{j=1}^m \nu_j.$$</li>
+  </ol></li>
+<li>Then the PaulScore is the average of all users' average query scores: 
$$\text{PaulScore}~=~\frac{1}{n} \sum_{i=1}^n \bar{\nu}_{(i)}.$$</li>
+</ol>
+</div>
+</div>
+<div style="clear: both;"></div>
+
+<h2>Notes/Tips</h2>
+
+<ul>
+  <li>The percentages next to the language and project names represent the 
proportion of the total volume.</li>
+  <li>You can select multiple projects and multiple languages to compare 
simultaneously. (Hold down Ctrl on Windows or Command on Mac.)</li>
+  <li>The language picker will automatically choose "(None)" if you select a 
non-multilingual project such as Wikidata.</li>
+  <li>If you're interested in the overall PaulScore for a multilingual project 
such as Wikipedia, make sure only "(None)" is selected in the languages 
picker.</li>
+  <li>Due to the high number of language-project combinations, we have 
restricted ourselves to only storing the last 30 days of data.</li>
+</ul>
+
+<h2>Questions, bug reports, and feature suggestions</h2>
+
+<p>For technical, non-bug questions, <a 
href="mailto:mpo...@wikimedia.org?subject=Dashboard%20Question">email 
Mikhail</a>. If you experience a bug or notice something wrong or have a 
suggestion, <a 
href="https://phabricator.wikimedia.org/maniphest/task/create/?projects=Discovery">open
 a ticket in Phabricator</a> in the Discovery board or <a 
href="mailto:d...@wikimedia.org?subject=Dashboard%20Question">email Deb</a>.</p>
+
+<hr style="border-color: gray;">
+<p style="font-size: small; color: gray;">
+  <strong>Link to this dashboard:</strong>
+  <a href="http://discovery.wmflabs.org/metrics/#paulscore_langproj">
+    http://discovery.wmflabs.org/metrics/#paulscore_langproj
+  </a>
+</p>
diff --git a/ui.R b/ui.R
index ba99936..f22db39 100644
--- a/ui.R
+++ b/ui.R
@@ -39,11 +39,14 @@
                            menuSubItem(text = "Zero results", tabName = 
"kpi_zero_results"),
                            menuSubItem(text = "API usage", tabName = 
"kpi_api_usage"),
                            menuSubItem(text = "Augmented Clickthrough", 
tabName = "kpi_augmented_clickthroughs"),
+                           menuSubItem(text = "CTR by Language/Project", 
tabName = "kpi_ctr_langproj"),
                            icon = icon("star", lib = "glyphicon")),
                   menuItem(text = "Desktop",
                            menuSubItem(text = "Events", tabName = 
"desktop_events"),
                            menuSubItem(text = "Load times", tabName = 
"desktop_load"),
-                           menuSubItem(text = "PaulScore", tabName = 
"paulscore_approx")),
+                           menuSubItem(text = "PaulScore", tabName = 
"paulscore_approx"),
+                           menuSubItem(text = "Events by Language/Project", 
tabName = "desktop_events_langproj"),
+                           menuSubItem(text = "PaulScore by Language/Project", 
tabName = "paulscore_langproj")),
                   menuItem(text = "Mobile Web",
                            menuSubItem(text = "Events", tabName = 
"mobile_events"),
                            menuSubItem(text = "Load times", tabName = 
"mobile_load")),
@@ -154,6 +157,23 @@
                   column(div(id = 
"kpi_augmented_clickthroughs_series_legend"), width = 8)),
                 dygraphOutput("kpi_augmented_clickthroughs_series"),
                 
includeMarkdown("./tab_documentation/kpi_augmented_clickthroughs.md")),
+        tabItem(tabName = "kpi_ctr_langproj",
+                
fluidRow(column(polloi::smooth_select("smoothing_kpi_ctr_langproj"), width = 6),
+                         column(selectInput("ctr_metrics", "Metrics",
+                                            list("User engagement" = "User 
engagement", "Threshold-passing %" = "Threshold-passing %", "Clickthrough rate" 
= "Clickthrough rate"),
+                                            selected = "User engagement"), 
width = 6)),
+                fluidRow(column(selectInput("ctr_project_order", "Sort 
projects by",
+                                            list("Alphabetical order" = 
"alphabet", "Volume of requests" = "volume"),
+                                            selected = "volume"),
+                                uiOutput("ctr_project_selector_container"), 
width = 2),
+                         column(selectInput("ctr_language_order", "Sort 
languages by",
+                                            list("Alphabetical order" = 
"alphabet", "Volume of requests" = "volume"),
+                                            selected = "volume"),
+                                uiOutput("ctr_language_selector_container"), 
width = 2),
+                         column(dygraphOutput("kpi_ctr_langproj_plot"),
+                                div(id = "kpi_ctr_langproj_legend", style = 
"margin-top:30px;"), width = 8)),
+                includeMarkdown("./tab_documentation/kpi_ctr_langproj.md")
+        ),
         tabItem(tabName = "desktop_events",
                 fluidRow(
                   valueBoxOutput("desktop_event_searches"),
@@ -176,6 +196,42 @@
                 div(id = "paulscore_approx_legend", style = "text-align: 
center;"),
                 dygraphOutput("paulscore_approx_plot_autocomplete"),
                 includeHTML("./tab_documentation/paulscore_approx.html")),
+        tabItem(tabName = "desktop_events_langproj",
+                
fluidRow(column(polloi::smooth_select("smoothing_desktop_events_langproj"), 
width = 6),
+                         column(selectInput("desktop_event_type", "Events 
Type",
+                                            list("Clickthroughs" = 
"clickthroughs", "Result pages opened" = "Result pages opened", "Search 
sessions" = "search sessions"),
+                                            selected = "clickthroughs"), width 
= 6)),
+                fluidRow(column(selectInput("desktop_events_project_order", 
"Sort projects by",
+                                            list("Alphabetical order" = 
"alphabet", "Volume of requests" = "volume"),
+                                            selected = "volume"),
+                                
uiOutput("desktop_events_project_selector_container"), width = 2),
+                         column(selectInput("desktop_events_language_order", 
"Sort languages by",
+                                            list("Alphabetical order" = 
"alphabet", "Volume of requests" = "volume"),
+                                            selected = "volume"),
+                                
uiOutput("desktop_events_language_selector_container"), width = 2),
+                         column(dygraphOutput("desktop_events_langproj_plot"),
+                                div(id = "desktop_events_langproj_legend", 
style = "margin-top:30px;"), width = 8)),
+                
includeMarkdown("./tab_documentation/desktop_events_langproj.md")
+        ),
+        tabItem(tabName = "paulscore_langproj",
+                
fluidRow(column(polloi::smooth_select("smoothing_paulscore_langproj"), width = 
6),
+                         column(selectInput("paulscore_factor", "Scoring 
Factor",
+                                            list("F = 0.1" = "F = 0.1", "F = 
0.5" = "F = 0.5", "F = 0.9" = "F = 0.9"),
+                                            selected = "F = 0.5"), width = 6)),
+                checkboxInput("paulscore_relative_langproj", "Use relative 
PaulScores", FALSE),
+                helpText("Divides PaulScore by the maximum possible score for 
each F"),
+                fluidRow(column(selectInput("paulscore_project_order", "Sort 
projects by",
+                                            list("Alphabetical order" = 
"alphabet", "Volume of requests" = "volume"),
+                                            selected = "volume"),
+                                
uiOutput("paulscore_project_selector_container"), width = 2),
+                         column(selectInput("paulscore_language_order", "Sort 
languages by",
+                                            list("Alphabetical order" = 
"alphabet", "Volume of requests" = "volume"),
+                                            selected = "volume"),
+                                
uiOutput("paulscore_language_selector_container"), width = 2),
+                         column(dygraphOutput("paulscore_langproj_plot"),
+                                div(id = "paulscore_langproj_legend", style = 
"margin-top:30px;"), width = 8)),
+                includeHTML("./tab_documentation/paulscore_langproj.html")
+        ),
         tabItem(tabName = "mobile_events",
                 fluidRow(
                   valueBoxOutput("mobile_event_searches"),
diff --git a/utils.R b/utils.R
index 98fdb4e..a5b0a1a 100644
--- a/utils.R
+++ b/utils.R
@@ -1,5 +1,10 @@
 library(magrittr)
 
+capitalize_first_letter <- function(x) {
+  s <- strsplit(x, " ")[[1]]
+  return(paste(toupper(substring(s, 1,1)), substring(s, 2), sep = "", collapse 
= " "))
+}
+
 ## Read in desktop data and generate means for the value boxes, along with a 
time-series appropriate form for
 ## dygraphs.
 read_desktop <- function() {
@@ -9,6 +14,27 @@
   desktop_dygraph_means <<- round(colMeans(desktop_dygraph_set[, 2:5]))
   desktop_load_data <<- 
polloi::read_dataset("discovery/search/desktop_load_times.tsv", col_types = 
"Dddd") %>%
     dplyr::filter(!is.na(Median))
+  # Broken down by language-project pair
+  desktop_langproj_dygraph_set <<- 
polloi::read_dataset("discovery/search/desktop_event_counts_langproj_breakdown.tsv",
 col_types = "Dccci") %>%
+    dplyr::filter(!is.na(action), !is.na(events)) %>%
+    dplyr::mutate(language = ifelse(is.na(language), "(None)", language)) %>%
+    tidyr::spread(action, events, fill = 0)
+  ## Summaries for sorting (search sessions)
+  available_languages_desktop <<- desktop_langproj_dygraph_set %>%
+    dplyr::group_by(language) %>%
+    dplyr::summarize(volume = sum(as.numeric(`search sessions`), na.rm = 
TRUE)) %>%
+    dplyr::filter(volume > 0) %>%
+    dplyr::arrange(desc(volume)) %>%
+    dplyr::mutate(prop = volume/sum(volume),
+                  label = sprintf("%s (%.3f%%)", language, 100*prop))
+  available_projects_desktop <<- desktop_langproj_dygraph_set %>%
+    dplyr::group_by(project) %>%
+    dplyr::summarize(volume = sum(as.numeric(`search sessions`), na.rm = 
TRUE)) %>%
+    dplyr::filter(volume > 0) %>%
+    dplyr::arrange(desc(volume)) %>%
+    dplyr::mutate(prop = volume/sum(volume),
+                  label = sprintf("%s (%.3f%%)", project, 100*prop))
+  # projects_db <<- readr::read_csv(system.file("extdata/projects.csv", 
package = "polloi"), col_types = "cclc")[, c('project', 'multilingual')]
 }
 
 read_web <- function() {
@@ -148,22 +174,22 @@
   ### With automata
   langproj_with_automata <<- 
polloi::read_dataset("discovery/search/cirrus_langproj_breakdown_with_automata.tsv",
 na = "~", col_types = "Dccii") %>%
     dplyr::filter(!is.na(zero_results), !is.na(total)) %>%
-    dplyr::mutate(language = sub("NA", "(None)", language))
+    dplyr::mutate(language = ifelse(is.na(language), "(None)", language))
   ### Without automata
   langproj_no_automata <<- 
polloi::read_dataset("discovery/search/cirrus_langproj_breakdown_no_automata.tsv",
 na = "~", col_types = "Dccii") %>%
     dplyr::filter(!is.na(zero_results), !is.na(total)) %>%
-    dplyr::mutate(language = sub("NA", "(None)", language))
+    dplyr::mutate(language = ifelse(is.na(language), "(None)", language))
   ### Summaries for sorting
   available_languages <<- langproj_with_automata %>%
     dplyr::group_by(language) %>%
-    dplyr::summarize(volume = sum(as.numeric(total))) %>%
+    dplyr::summarize(volume = sum(as.numeric(total), na.rm = TRUE)) %>%
     dplyr::filter(volume > 0) %>%
     dplyr::arrange(desc(volume)) %>%
     dplyr::mutate(prop = volume/sum(volume),
                   label = sprintf("%s (%.3f%%)", language, 100*prop))
   available_projects <<- langproj_with_automata %>%
     dplyr::group_by(project) %>%
-    dplyr::summarize(volume = sum(as.numeric(total))) %>%
+    dplyr::summarize(volume = sum(as.numeric(total), na.rm = TRUE)) %>%
     dplyr::filter(volume > 0) %>%
     dplyr::arrange(desc(volume)) %>%
     dplyr::mutate(prop = volume/sum(volume),
@@ -193,6 +219,68 @@
     )
 }
 
+# Loads the per-language/per-project engagement datasets and publishes three
+# globals (via `<<-`) used by the CTR-by-language/project dashboard:
+#   - augmented_clickthroughs_langproj: one row per date/language/project with
+#     SERP counts, threshold-passing %, clickthrough rate and user engagement
+#   - available_languages_ctr / available_projects_ctr: volume summaries used
+#     for sorting and labelling the selectors
+# Takes no arguments. It reads `desktop_langproj_dygraph_set`, which is not
+# defined in this function (presumably populated by another loader that must
+# run first -- TODO confirm the call order in server.R).
+read_augmented_clickthrough_langproj <- function() {
+  # Read data
+  # Threshold-pass data: drop rows without a pass rate, scale the rate by 100
+  # (it is reported below as `Threshold-passing %`) and label missing
+  # languages as "(None)".
+  threshold_data <- 
polloi::read_dataset("discovery/search/search_threshold_pass_rate_langproj_breakdown.tsv",
 col_types = "Dccdi") %>%
+    dplyr::filter(!is.na(threshold_pass)) %>%
+    dplyr::mutate(threshold_pass = 100 * threshold_pass, language = 
ifelse(is.na(language), "(None)", language))
+  # Mobile events: drop incomplete rows, then turn each `action` value into
+  # its own column holding the `events` count (0 where an action is absent).
+  mobile_langproj <- 
polloi::read_dataset("discovery/search/mobile_event_counts_langproj_breakdown.tsv",
 col_types = "Dccci") %>%
+    dplyr::mutate(language = ifelse(is.na(language), "(None)", language)) %>%
+    dplyr::filter(!is.na(action), !is.na(events), !is.na(project)) %>%
+    tidyr::spread(action, events, fill = 0)
+  # App events: every row's project is overwritten with "Wikipedia", and only
+  # the first row per date/platform/language/project/action is kept so the
+  # spread() calls below see unique keys.
+  app_langproj <- 
polloi::read_dataset("discovery/search/app_event_counts_langproj_breakdown.tsv",
 col_types = "Dccci") %>%
+    dplyr::mutate(language = ifelse(is.na(language), "(None)", language)) %>%
+    dplyr::mutate(project = "Wikipedia") %>%
+    dplyr::filter(!is.na(action), !is.na(events)) %>%
+    dplyr::distinct(date, platform, language, project, action, .keep_all = 
TRUE)
+  # Split the app data by platform and widen it the same way as mobile.
+  ios_langproj <- app_langproj %>%
+    dplyr::filter(platform == "iOS") %>%
+    dplyr::select(-platform) %>%
+    tidyr::spread(action, events, fill = 0)
+  android_langproj <- app_langproj %>%
+    dplyr::filter(platform == "Android") %>%
+    dplyr::select(-platform) %>%
+    tidyr::spread(action, events, fill = 0)
+  # Augmented clickthroughs
+  # Stack the four platforms, then total clickthroughs and SERPs per
+  # date/language/project across platforms.
+  augmented_clickthroughs_langproj <<- list(
+    desktop = dplyr::select(desktop_langproj_dygraph_set, c(date, language, 
project, clickthroughs, `Result pages opened`)),
+    mobile = dplyr::select(mobile_langproj, c(date, language, project, 
clickthroughs, `Result pages opened`)),
+    ios = dplyr::select(ios_langproj, c(date, language, project, 
clickthroughs, `Result pages opened`)),
+    android = dplyr::select(android_langproj, c(date, language, project, 
clickthroughs, `Result pages opened`))
+  ) %>%
+    dplyr::bind_rows(.id = "platform") %>%
+    dplyr::group_by(date, language, project) %>%
+    dplyr::summarize(clickthroughs = sum(clickthroughs), serps = sum(`Result 
pages opened`)) %>%
+    # right_join keeps every threshold_data row; rows without matching event
+    # counts get NA clickthroughs/serps.
+    dplyr::right_join(threshold_data, by = c("date", "language", "project")) 
%>%
+    dplyr::ungroup() %>%
+    # `User engagement` is the mean of the threshold-passing % and the
+    # clickthrough rate computed just above it in the same transmute().
+    dplyr::transmute(
+      date = date,
+      language = language,
+      project = project,
+      `Result pages opened` = serps,
+      search_sessions_threshold = search_sessions,
+      `Threshold-passing %` = round(threshold_pass, 2),
+      `Clickthrough rate` = round(100 * clickthroughs/serps, 2),
+      `User engagement` = round((threshold_pass + `Clickthrough rate`)/2, 2)
+    )
+  # Summaries for sorting (SERP)
+  # Per-language and per-project totals of result pages opened; zero-volume
+  # entries are dropped, rows are sorted by descending volume, and `label`
+  # is formatted as "<name> (<share with 3 decimals>%)".
+  available_languages_ctr <<- augmented_clickthroughs_langproj %>%
+    dplyr::group_by(language) %>%
+    dplyr::summarize(volume = sum(as.numeric(`Result pages opened`), na.rm = 
TRUE)) %>%
+    dplyr::filter(volume > 0) %>%
+    dplyr::arrange(desc(volume)) %>%
+    dplyr::mutate(prop = volume/sum(volume),
+                  label = sprintf("%s (%.3f%%)", language, 100*prop))
+  available_projects_ctr <<- augmented_clickthroughs_langproj %>%
+    dplyr::group_by(project) %>%
+    dplyr::summarize(volume = sum(as.numeric(`Result pages opened`), na.rm = 
TRUE)) %>%
+    dplyr::filter(volume > 0) %>%
+    dplyr::arrange(desc(volume)) %>%
+    dplyr::mutate(prop = volume/sum(volume),
+                  label = sprintf("%s (%.3f%%)", project, 100*prop))
+  # projects_db <<- readr::read_csv(system.file("extdata/projects.csv", 
package = "polloi"), col_types = "cclc")[, c('project', 'multilingual')]
+}
+
 read_lethal_dose <- function() {
   user_page_visit_dataset <<- 
polloi::read_dataset("discovery/search/sample_page_visit_ld.tsv", col_types = 
"Dddddddd") %>%
     dplyr::filter(!is.na(LD10)) %>%
@@ -205,6 +293,93 @@
     dplyr::select(c(date, event_source, `F = 0.1` = pow_1, `F = 0.5` = pow_5, 
`F = 0.9` = pow_9))
   paulscore_autocomplete <<- dplyr::filter(paulscore, event_source == 
"autocomplete") %>% dplyr::select(-event_source)
   paulscore_fulltext <<- dplyr::filter(paulscore, event_source == "fulltext") 
%>% dplyr::select(-event_source)
+  # Broken down by language-project pair
+  paulscore_fulltext_langproj <<- 
polloi::read_dataset("discovery/search/paulscore_approximations_fulltext_langproj_breakdown.tsv",
 col_types = "Dcciddddddddd") %>%
+    dplyr::mutate(language = ifelse(is.na(language), "(None)", language)) %>%
+    dplyr::filter(!is.na(project)) %>%
+    dplyr::select(c(date, language, project, `search sessions` = 
search_sessions, `F = 0.1` = pow_1, `F = 0.5` = pow_5, `F = 0.9` = pow_9))
+  ## Summaries for sorting (search sessions)
+  available_languages_paulscore <<- paulscore_fulltext_langproj %>%
+    dplyr::group_by(language) %>%
+    dplyr::summarize(volume = sum(as.numeric(`search sessions`), na.rm = 
TRUE)) %>%
+    dplyr::filter(volume > 0) %>%
+    dplyr::arrange(desc(volume)) %>%
+    dplyr::mutate(prop = volume/sum(volume),
+                  label = sprintf("%s (%.3f%%)", language, 100*prop))
+  available_projects_paulscore <<- paulscore_fulltext_langproj %>%
+    dplyr::group_by(project) %>%
+    dplyr::summarize(volume = sum(as.numeric(`search sessions`), na.rm = 
TRUE)) %>%
+    dplyr::filter(volume > 0) %>%
+    dplyr::arrange(desc(volume)) %>%
+    dplyr::mutate(prop = volume/sum(volume),
+                  label = sprintf("%s (%.3f%%)", project, 100*prop))
+  # projects_db <<- readr::read_csv(system.file("extdata/projects.csv", 
package = "polloi"), col_types = "cclc")[, c('project', 'multilingual')]
+}
+
+# Restricts the CTR / engagement data to the selected wikis and adds a `wiki`
+# column identifying each series.
+#
+# data:      data frame with date, language, project, `Result pages opened`,
+#            search_sessions_threshold, `Threshold-passing %`,
+#            `Clickthrough rate` and `User engagement` columns.
+# languages: selected languages; a trailing " (xx.xxx%)" share label is
+#            stripped if present.
+# projects:  selected projects; same label handling as `languages`.
+#
+# Returns: when the only selected language is "(None)", one row per
+# date/project with the metrics aggregated over all languages (weighted by
+# search sessions / result pages opened). Otherwise the matching rows are
+# returned unaggregated, with language and project united into `wiki`.
+kpi_ctr_aggregate_wikis <- function(data, languages, projects) {
+  # Labels are built with sprintf("%s (%.3f%%)", ...), so the integer part can
+  # have up to three digits ("100.000%") when one entry holds all the volume.
+  languages <- sub(" \\([0-9]{1,3}\\.[0-9]{1,3}%\\)", "", languages)
+  projects <- sub(" \\([0-9]{1,3}\\.[0-9]{1,3}%\\)", "", projects)
+  if (length(languages) == 1 && languages[1] == "(None)") {
+    temp <- data %>%
+      dplyr::filter_(~project %in% projects) %>%
+      dplyr::rename(wiki = project) %>%
+      dplyr::group_by(date, wiki) %>%
+      # The first two summaries are weighted means; `User engagement` is then
+      # computed from those freshly aggregated values.
+      dplyr::summarize(
+        `Threshold-passing %` = round(
+          sum(`Threshold-passing %` * search_sessions_threshold) /
+            sum(search_sessions_threshold), 2),
+        `Clickthrough rate` = round(
+          sum(`Clickthrough rate` * `Result pages opened`) /
+            sum(`Result pages opened`), 2),
+        `User engagement` = round(
+          (`Threshold-passing %` + `Clickthrough rate`) / 2, 2)
+      ) %>%
+      dplyr::ungroup()
+  } else {
+    temp <- data %>%
+      dplyr::filter_(~language %in% languages & project %in% projects) %>%
+      tidyr::unite(wiki, language, project, sep = " ") %>%
+      # Language-less projects should display as just the project name.
+      dplyr::mutate(wiki = sub("(None) ", "", wiki, fixed = TRUE))
+  }
+  return(temp)
+}
+
+# Restricts the desktop event counts to the selected wikis and adds a `wiki`
+# column identifying each series.
+#
+# data:      data frame with date, language, project, clickthroughs,
+#            `Result pages opened` and `search sessions` columns.
+# languages: selected languages; a trailing " (xx.xxx%)" share label is
+#            stripped if present.
+# projects:  selected projects; same label handling as `languages`.
+#
+# Returns: when the only selected language is "(None)", one row per
+# date/project with the counts summed over all languages. Otherwise the
+# matching rows are returned unaggregated, with language and project united
+# into `wiki`.
+desktop_events_aggregate_wikis <- function(data, languages, projects) {
+  # Labels are built with sprintf("%s (%.3f%%)", ...), so the integer part can
+  # have up to three digits ("100.000%") when one entry holds all the volume.
+  languages <- sub(" \\([0-9]{1,3}\\.[0-9]{1,3}%\\)", "", languages)
+  projects <- sub(" \\([0-9]{1,3}\\.[0-9]{1,3}%\\)", "", projects)
+  if (length(languages) == 1 && languages[1] == "(None)") {
+    temp <- data %>%
+      dplyr::filter_(~project %in% projects) %>%
+      dplyr::rename(wiki = project) %>%
+      dplyr::group_by(date, wiki) %>%
+      # Counts are converted with as.numeric() before summing.
+      dplyr::summarize(
+        clickthroughs = round(sum(as.numeric(clickthroughs)), 2),
+        `Result pages opened` = round(
+          sum(as.numeric(`Result pages opened`)), 2),
+        `search sessions` = round(sum(as.numeric(`search sessions`)), 2)
+      ) %>%
+      dplyr::ungroup()
+  } else {
+    temp <- data %>%
+      dplyr::filter_(~language %in% languages & project %in% projects) %>%
+      tidyr::unite(wiki, language, project, sep = " ") %>%
+      # Language-less projects should display as just the project name.
+      dplyr::mutate(wiki = sub("(None) ", "", wiki, fixed = TRUE))
+  }
+  return(temp)
+}
+
+# Restricts the full-text PaulScore data to the selected wikis and adds a
+# `wiki` column identifying each series.
+#
+# data:      data frame with date, language, project, `search sessions`,
+#            `F = 0.1`, `F = 0.5` and `F = 0.9` columns.
+# languages: selected languages; a trailing " (xx.xxx%)" share label is
+#            stripped if present.
+# projects:  selected projects; same label handling as `languages`.
+#
+# Returns: when the only selected language is "(None)", one row per
+# date/project with each score averaged over all languages, weighted by
+# search sessions. Otherwise the matching rows are returned unaggregated,
+# with language and project united into `wiki`.
+paulscore_aggregate_wikis <- function(data, languages, projects) {
+  # Labels are built with sprintf("%s (%.3f%%)", ...), so the integer part can
+  # have up to three digits ("100.000%") when one entry holds all the volume.
+  languages <- sub(" \\([0-9]{1,3}\\.[0-9]{1,3}%\\)", "", languages)
+  projects <- sub(" \\([0-9]{1,3}\\.[0-9]{1,3}%\\)", "", projects)
+  if (length(languages) == 1 && languages[1] == "(None)") {
+    temp <- data %>%
+      dplyr::filter_(~project %in% projects) %>%
+      dplyr::rename(wiki = project) %>%
+      dplyr::group_by(date, wiki) %>%
+      # Session-weighted mean of each PaulScore approximation.
+      dplyr::summarize(
+        `F = 0.1` = round(
+          sum(`F = 0.1` * `search sessions`) / sum(`search sessions`), 2),
+        `F = 0.5` = round(
+          sum(`F = 0.5` * `search sessions`) / sum(`search sessions`), 2),
+        `F = 0.9` = round(
+          sum(`F = 0.9` * `search sessions`) / sum(`search sessions`), 2)
+      ) %>%
+      dplyr::ungroup()
+  } else {
+    temp <- data %>%
+      dplyr::filter_(~language %in% languages & project %in% projects) %>%
+      tidyr::unite(wiki, language, project, sep = " ") %>%
+      # Language-less projects should display as just the project name.
+      dplyr::mutate(wiki = sub("(None) ", "", wiki, fixed = TRUE))
+  }
+  return(temp)
 }
 
 aggregate_wikis <- function(data, languages, projects) {

-- 
To view, visit https://gerrit.wikimedia.org/r/346461
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Ie04762d747a9dcbec1564d8945f8949ed8c52adc
Gerrit-PatchSet: 1
Gerrit-Project: wikimedia/discovery/rainbow
Gerrit-Branch: master
Gerrit-Owner: Chelsyx <c...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to