Chelsyx has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/346461 )
Change subject: Implement the wiki/language selector in more search dashboards ...................................................................... Implement the wiki/language selector in more search dashboards Three new dashboards are added: - CTR by Language/Project - Events by Language/Project - PaulScore by Language/Project Bug: T150410 Change-Id: Ie04762d747a9dcbec1564d8945f8949ed8c52adc --- M server.R A tab_documentation/desktop_events_langproj.md A tab_documentation/kpi_ctr_langproj.md A tab_documentation/paulscore_langproj.html M ui.R M utils.R 6 files changed, 520 insertions(+), 5 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/wikimedia/discovery/rainbow refs/changes/61/346461/1 diff --git a/server.R b/server.R index 5ec500e..79a7846 100644 --- a/server.R +++ b/server.R @@ -26,6 +26,7 @@ read_failures(existing_date) progress$set(message = "Downloading engagement data", value = 0.7) read_augmented_clickthrough() + read_augmented_clickthrough_langproj() progress$set(message = "Downloading survival data", value = 0.8) read_lethal_dose() progress$set(message = "Downloading PaulScore data", value = 0.9) @@ -877,6 +878,191 @@ dyEvent(as.Date("2017-01-01"), "R (reportupdater)", labelLoc = "bottom") }) + output$ctr_language_selector_container <- renderUI({ + if (input$ctr_language_order == "alphabet") { + languages_to_display <- as.list(sort(available_languages_ctr$language)) + names(languages_to_display) <- available_languages_ctr$label[order(available_languages_ctr$language)] + } else { + languages_to_display <- available_languages_ctr$language + names(languages_to_display) <- available_languages_ctr$label + } + + # e.g. if user sorts projects alphabetically and the selected project is "10th Anniversary of Wikipedia" + # then automatically select the language "(None)" to avoid giving user an error. This also works if + # the user selects a project that is not multilingual, so this automatically chooses the "(None)" + # option for the user. 
+ if (any(input$ctr_project_selector %in% projects_db$project[!projects_db$multilingual])) { + if (any(input$ctr_project_selector %in% projects_db$project[projects_db$multilingual])) { + if (!is.null(input$ctr_language_selector)) { + selected_language <- union("(None)", input$ctr_language_selector) + } else { + selected_language <- c("(None)", languages_to_display[[1]]) + } + } else { + selected_language <- "(None)" + } + } else { + if (!is.null(input$ctr_language_selector)) { + selected_language <- input$ctr_language_selector + } else { + selected_language <- languages_to_display[[1]] + } + } + return(selectInput("ctr_language_selector", "Language", multiple = TRUE,selectize = FALSE, size = 19, + choices = languages_to_display, selected = selected_language)) + }) + + output$ctr_project_selector_container <- renderUI({ + if (input$ctr_project_order == "alphabet") { + projects_to_display <- as.list(sort(available_projects_ctr$project)) + names(projects_to_display) <- available_projects_ctr$label[order(available_projects_ctr$project)] + } else { + projects_to_display <- available_projects_ctr$project + names(projects_to_display) <- available_projects_ctr$label + } + return(selectInput("ctr_project_selector", "Project", multiple = TRUE,selectize = FALSE, size = 19, + choices = projects_to_display, selected = projects_to_display[[1]])) + }) + + output$kpi_ctr_langproj_plot <- renderDygraph({ + augmented_clickthroughs_langproj %>% + kpi_ctr_aggregate_wikis(input$ctr_language_selector, input$ctr_project_selector) %>% + dplyr::select_(.dots=c("date", "wiki", paste0("`",input$ctr_metrics,"`"))) %>% + tidyr::spread_(., key_col="wiki", value_col=input$ctr_metrics, fill=0) %>% + polloi::smoother(smooth_level = polloi::smooth_switch(input$smoothing_global, input$smoothing_kpi_ctr_langproj)) %>% + polloi::make_dygraph(xlab = "Date", ylab = input$ctr_metrics, title = paste0(input$ctr_metrics, ", by day")) %>% + dyAxis("y", axisLabelFormatter = "function(x) { return x + '%'; }", 
valueFormatter = "function(x) { return x + '%'; }") %>% + dyLegend(show = "always", width = 400, labelsDiv = "kpi_ctr_langproj_legend") %>% + dyAxis("x", axisLabelFormatter = polloi::custom_axis_formatter) %>% + dyRangeSelector(fillColor = "") + }) + + output$desktop_events_language_selector_container <- renderUI({ + if (input$desktop_events_language_order == "alphabet") { + languages_to_display <- as.list(sort(available_languages_desktop$language)) + names(languages_to_display) <- available_languages_desktop$label[order(available_languages_desktop$language)] + } else { + languages_to_display <- available_languages_desktop$language + names(languages_to_display) <- available_languages_desktop$label + } + + # e.g. if user sorts projects alphabetically and the selected project is "10th Anniversary of Wikipedia" + # then automatically select the language "(None)" to avoid giving user an error. This also works if + # the user selects a project that is not multilingual, so this automatically chooses the "(None)" + # option for the user. 
+ if (any(input$desktop_events_project_selector %in% projects_db$project[!projects_db$multilingual])) { + if (any(input$desktop_events_project_selector %in% projects_db$project[projects_db$multilingual])) { + if (!is.null(input$desktop_events_language_selector)) { + selected_language <- union("(None)", input$desktop_events_language_selector) + } else { + selected_language <- c("(None)", languages_to_display[[1]]) + } + } else { + selected_language <- "(None)" + } + } else { + if (!is.null(input$desktop_events_language_selector)) { + selected_language <- input$desktop_events_language_selector + } else { + selected_language <- languages_to_display[[1]] + } + } + return(selectInput("desktop_events_language_selector", "Language", multiple = TRUE,selectize = FALSE, size = 19, + choices = languages_to_display, selected = selected_language)) + }) + + output$desktop_events_project_selector_container <- renderUI({ + if (input$desktop_events_project_order == "alphabet") { + projects_to_display <- as.list(sort(available_projects_desktop$project)) + names(projects_to_display) <- available_projects_desktop$label[order(available_projects_desktop$project)] + } else { + projects_to_display <- available_projects_desktop$project + names(projects_to_display) <- available_projects_desktop$label + } + return(selectInput("desktop_events_project_selector", "Project", multiple = TRUE,selectize = FALSE, size = 19, + choices = projects_to_display, selected = projects_to_display[[1]])) + }) + + output$desktop_events_langproj_plot <- renderDygraph({ + desktop_langproj_dygraph_set %>% + desktop_events_aggregate_wikis(input$desktop_events_language_selector, input$desktop_events_project_selector) %>% + dplyr::select_(.dots=c("date", "wiki", paste0("`",input$desktop_event_type,"`"))) %>% + tidyr::spread_(., key_col="wiki", value_col=input$desktop_event_type, fill=0) %>% + polloi::smoother(smooth_level = polloi::smooth_switch(input$smoothing_global, input$smoothing_desktop_events_langproj)) %>% + 
polloi::make_dygraph(xlab = "Date", ylab = capitalize_first_letter(input$desktop_event_type), title = paste0(capitalize_first_letter(input$desktop_event_type), ", by day")) %>% + dyLegend(show = "always", width = 400, labelsDiv = "desktop_events_langproj_legend") %>% + dyAxis("x", axisLabelFormatter = polloi::custom_axis_formatter) %>% + dyRangeSelector(fillColor = "") + }) + + output$paulscore_language_selector_container <- renderUI({ + if (input$paulscore_language_order == "alphabet") { + languages_to_display <- as.list(sort(available_languages_paulscore$language)) + names(languages_to_display) <- available_languages_paulscore$label[order(available_languages_paulscore$language)] + } else { + languages_to_display <- available_languages_paulscore$language + names(languages_to_display) <- available_languages_paulscore$label + } + + # e.g. if user sorts projects alphabetically and the selected project is "10th Anniversary of Wikipedia" + # then automatically select the language "(None)" to avoid giving user an error. This also works if + # the user selects a project that is not multilingual, so this automatically chooses the "(None)" + # option for the user. 
+ if (any(input$paulscore_project_selector %in% projects_db$project[!projects_db$multilingual])) { + if (any(input$paulscore_project_selector %in% projects_db$project[projects_db$multilingual])) { + if (!is.null(input$paulscore_language_selector)) { + selected_language <- union("(None)", input$paulscore_language_selector) + } else { + selected_language <- c("(None)", languages_to_display[[1]]) + } + } else { + selected_language <- "(None)" + } + } else { + if (!is.null(input$paulscore_language_selector)) { + selected_language <- input$paulscore_language_selector + } else { + selected_language <- languages_to_display[[1]] + } + } + return(selectInput("paulscore_language_selector", "Language", multiple = TRUE,selectize = FALSE, size = 19, + choices = languages_to_display, selected = selected_language)) + }) + + output$paulscore_project_selector_container <- renderUI({ + if (input$paulscore_project_order == "alphabet") { + projects_to_display <- as.list(sort(available_projects_paulscore$project)) + names(projects_to_display) <- available_projects_paulscore$label[order(available_projects_paulscore$project)] + } else { + projects_to_display <- available_projects_paulscore$project + names(projects_to_display) <- available_projects_paulscore$label + } + return(selectInput("paulscore_project_selector", "Project", multiple = TRUE,selectize = FALSE, size = 19, + choices = projects_to_display, selected = projects_to_display[[1]])) + }) + + output$paulscore_langproj_plot <- renderDygraph({ + temp <- paulscore_fulltext_langproj + if (input$paulscore_relative_langproj) { + temp$`F = 0.1` <- temp$`F = 0.1` / (1/(1-0.1)) + temp$`F = 0.5` <- temp$`F = 0.5` / (1/(1-0.5)) + temp$`F = 0.9` <- temp$`F = 0.9` / (1/(1-0.9)) + } + dyOut <- temp %>% + paulscore_aggregate_wikis(input$paulscore_language_selector, input$paulscore_project_selector) %>% + dplyr::select_(.dots=c("date", "wiki", paste0("`",input$paulscore_factor,"`"))) %>% + tidyr::spread_(., key_col="wiki", 
value_col=input$paulscore_factor, fill=0) %>% + polloi::smoother(smooth_level = polloi::smooth_switch(input$smoothing_global, input$smoothing_paulscore_langproj)) %>% + polloi::make_dygraph(xlab = "Date", ylab = "PaulScore", title = paste0("PaulScore for fulltext searches, ", input$paulscore_factor)) %>% + dyLegend(show = "always", width = 400, labelsDiv = "paulscore_langproj_legend") %>% + dyAxis("x", axisLabelFormatter = polloi::custom_axis_formatter) %>% + dyRangeSelector(fillColor = "") + if (input$paulscore_relative_langproj) { + dyOut <- dyAxis(dyOut, "y", axisLabelFormatter = "function(x) { return Math.round(100*x, 2) + '%'; }", valueFormatter = "function(x) { return Math.round(100*x, 2) + '%'; }") + } + return(dyOut) + }) + output$monthly_metrics_tbl <- DT::renderDataTable({ temp <- data.frame( KPI = c("Load time", "Zero results rate", "API Usage", "User engagement"), diff --git a/tab_documentation/desktop_events_langproj.md b/tab_documentation/desktop_events_langproj.md new file mode 100644 index 0000000..1f33770 --- /dev/null +++ b/tab_documentation/desktop_events_langproj.md @@ -0,0 +1,31 @@ +Desktop Full-Text Search by Languages and Projects +======= + +User actions that we track around search on the desktop website generally fall into three categories: + +1. The start of a user's search session; +2. The presentation of the user with a results page, and; +3. A user clicking through to an article in the results page. + +These three things are tracked via the [EventLogging 'TestSearchSatisfaction2' schema](https://meta.wikimedia.org/wiki/Schema:TestSearchSatisfaction2) and stored to +a database. The results are then aggregated and anonymised, and split out by language (e.g. English vs Russian) and project (e.g. Wikipedia vs Wiktionary) as presented on this page. 
For performance/privacy reasons we randomly sample what we store, so the actual numbers are a vast understatement of how many user actions our servers receive - what's more interesting is how they change over time. In the case of desktop search, this sampling rate is **0.1%**. + +Notes/Tips +------ +* The percentages next to the language and project names represent the proportion of the total volume. +* You can select multiple projects and multiple languages to compare simultaneously. (Hold down Ctrl on Windows or Command on Mac.) +* The language picker will automatically choose "(None)" if you select a non-multilingual project such as Wikidata. +* If you're interested in the overall metric for a multilingual project such as Wikipedia, make sure only "(None)" is selected in the languages picker. +* Due to the high number of language-project combinations, we have restricted ourselves to only storing the last 30 days of data. + +Questions, bug reports, and feature suggestions +------ +For technical, non-bug questions, [email Mikhail](mailto:mpo...@wikimedia.org?subject=Dashboard%20Question). If you experience a bug or notice something wrong or have a suggestion, [open a ticket in Phabricator](https://phabricator.wikimedia.org/maniphest/task/create/?projects=Discovery) in the Discovery board or [email Deb](mailto:d...@wikimedia.org?subject=Dashboard%20Question). 
+ +<hr style="border-color: gray;"> +<p style="font-size: small; color: gray;"> + <strong>Link to this dashboard:</strong> + <a href="http://discovery.wmflabs.org/metrics/#desktop_events_langproj"> + http://discovery.wmflabs.org/metrics/#desktop_events_langproj + </a> +</p> diff --git a/tab_documentation/kpi_ctr_langproj.md b/tab_documentation/kpi_ctr_langproj.md new file mode 100644 index 0000000..2094a7b --- /dev/null +++ b/tab_documentation/kpi_ctr_langproj.md @@ -0,0 +1,24 @@ +User Engagement (Augmented Clickthroughs) by Languages and Projects +======= + +This metric combines the clickthrough rate and the proportion of users' session dwell times exceeding the threshold of 10s, split out by language (e.g. English vs Russian) and project (e.g. Wikipedia vs Wiktionary). + +Notes/Tips +------ +* The percentages next to the language and project names represent the proportion of the total volume. +* You can select multiple projects and multiple languages to compare simultaneously. (Hold down Ctrl on Windows or Command on Mac.) +* The language picker will automatically choose "(None)" if you select a non-multilingual project such as Wikidata. +* If you're interested in the overall metric for a multilingual project such as Wikipedia, make sure only "(None)" is selected in the languages picker. +* Due to the high number of language-project combinations, we have restricted ourselves to only storing the last 30 days of data. + +Questions, bug reports, and feature suggestions +------ +For technical, non-bug questions, [email Mikhail](mailto:mpo...@wikimedia.org?subject=Dashboard%20Question). If you experience a bug or notice something wrong or have a suggestion, [open a ticket in Phabricator](https://phabricator.wikimedia.org/maniphest/task/create/?projects=Discovery) in the Discovery board or [email Deb](mailto:d...@wikimedia.org?subject=Dashboard%20Question). 
+ +<hr style="border-color: gray;"> +<p style="font-size: small; color: gray;"> + <strong>Link to this dashboard:</strong> + <a href="http://discovery.wmflabs.org/metrics/#kpi_ctr_langproj"> + http://discovery.wmflabs.org/metrics/#kpi_ctr_langproj + </a> +</p> diff --git a/tab_documentation/paulscore_langproj.html b/tab_documentation/paulscore_langproj.html new file mode 100644 index 0000000..f9db40d --- /dev/null +++ b/tab_documentation/paulscore_langproj.html @@ -0,0 +1,43 @@ +<h1>PaulScore Approximations by Languages and Projects</h1> + +<div> +<div style="float: left; width: 47%; padding: 0 3% 0 1%;"> +<p>"PaulScore" is the name we've given to a metric proposed by Paul Nelson in a talk he gave at <a href="https://www.youtube.com/watch?v=YJ_amC9gZmk&t=16m38s" title="Paul Nelson's talk at Elasticon">Elasticon</a>. We use PaulScore to evaluate the quality of results provided by CirrusSearch or proposed modifications to CirrusSearch, based on historical click data. A big advantage of the PaulScore is that it relies on user click history to award points, so it is easy to compute.</p> +<p>This dashboard shows the PaulScore approximation for 3 values of $F$: 0.1, 0.5, and 0.9, split out by language (e.g. English vs Russian) and project (e.g. Wikipedia vs Wiktionary). 
The maximum score possible for each value of $F$ is $1/(1-F)$, so the dashboard has the option of looking at relative PaulScores, where a relative PaulScore is the computed value divided by the maximum possible value for the given $F$.</p> +<p>For more details, please see <a href="https://www.mediawiki.org/wiki/Wikimedia_Discovery/Search/Glossary#PaulScore" title="Definition of PaulScore">Discovery's Search glossary</a>.</p> +</div> +<div style="float: left; width: 46%; padding: 0 1% 0 3%;"> +<p>PaulScore is computed via the following steps:</p> +<ol> +<li>Pick a scoring factor $0 < F < 1$.</li> +<li>For the $i$-th search session $S_i$ $(i=1, \ldots, n)$ containing $m$ queries $Q_1, \ldots, Q_m$ and search result sets $\mathbf{R}_1, \ldots, \mathbf{R}_m$: <ol> + <li>For each $j$-th search query $Q_j$ with result set $\mathbf{R}_j$, let $\nu_j$ be the query score: $$\nu_j=\sum_{k~\in~\{\text{0-based positions of clicked results in}~\mathbf{R}_j\}} F^k.$$</li> + <li>Let the user's average query score $\bar{\nu}_{(i)}$ be $$\bar{\nu}_{(i)}=\frac{1}{m} \sum_{j=1}^m \nu_j.$$</li> + </ol></li> +<li>Then the PaulScore is the average of all users' average query scores: $$\text{PaulScore}~=~\frac{1}{n} \sum_{i=1}^n \bar{\nu}_{(i)}.$$</li> +</ol> +</div> +</div> +<div style="clear: both;"></div> + +<h2>Notes/Tips</h2> + +<ul> + <li>The percentages next to the language and project names represent the proportion of the total volume.</li> + <li>You can select multiple projects and multiple languages to compare simultaneously. 
(Hold down Ctrl on Windows or Command on Mac.)</li> + <li>The language picker will automatically choose "(None)" if you select a non-multilingual project such as Wikidata.</li> + <li>If you're interested in the overall PaulScore for a multilingual project such as Wikipedia, make sure only "(None)" is selected in the languages picker.</li> + <li>Due to the high number of language-project combinations, we have restricted ourselves to only storing the last 30 days of data.</li> +</ul> + +<h2>Questions, bug reports, and feature suggestions</h2> + +<p>For technical, non-bug questions, <a href="mailto:mpo...@wikimedia.org?subject=Dashboard%20Question">email Mikhail</a>. If you experience a bug or notice something wrong or have a suggestion, <a href="https://phabricator.wikimedia.org/maniphest/task/create/?projects=Discovery">open a ticket in Phabricator</a> in the Discovery board or <a href="mailto:d...@wikimedia.org?subject=Dashboard%20Question">email Deb</a>.</p> + +<hr style="border-color: gray;"> +<p style="font-size: small; color: gray;"> + <strong>Link to this dashboard:</strong> + <a href="http://discovery.wmflabs.org/metrics/#paulscore_langproj"> + http://discovery.wmflabs.org/metrics/#paulscore_langproj + </a> +</p> diff --git a/ui.R b/ui.R index ba99936..f22db39 100644 --- a/ui.R +++ b/ui.R @@ -39,11 +39,14 @@ menuSubItem(text = "Zero results", tabName = "kpi_zero_results"), menuSubItem(text = "API usage", tabName = "kpi_api_usage"), menuSubItem(text = "Augmented Clickthrough", tabName = "kpi_augmented_clickthroughs"), + menuSubItem(text = "CTR by Language/Project", tabName = "kpi_ctr_langproj"), icon = icon("star", lib = "glyphicon")), menuItem(text = "Desktop", menuSubItem(text = "Events", tabName = "desktop_events"), menuSubItem(text = "Load times", tabName = "desktop_load"), - menuSubItem(text = "PaulScore", tabName = "paulscore_approx")), + menuSubItem(text = "PaulScore", tabName = "paulscore_approx"), + menuSubItem(text = "Events by Language/Project", 
tabName = "desktop_events_langproj"), + menuSubItem(text = "PaulScore by Language/Project", tabName = "paulscore_langproj")), menuItem(text = "Mobile Web", menuSubItem(text = "Events", tabName = "mobile_events"), menuSubItem(text = "Load times", tabName = "mobile_load")), @@ -154,6 +157,23 @@ column(div(id = "kpi_augmented_clickthroughs_series_legend"), width = 8)), dygraphOutput("kpi_augmented_clickthroughs_series"), includeMarkdown("./tab_documentation/kpi_augmented_clickthroughs.md")), + tabItem(tabName = "kpi_ctr_langproj", + fluidRow(column(polloi::smooth_select("smoothing_kpi_ctr_langproj"), width = 6), + column(selectInput("ctr_metrics", "Metrics", + list("User engagement" = "User engagement", "Threshold-passing %" = "Threshold-passing %", "Clickthrough rate" = "Clickthrough rate"), + selected = "User engagement"), width = 6)), + fluidRow(column(selectInput("ctr_project_order", "Sort projects by", + list("Alphabetical order" = "alphabet", "Volume of requests" = "volume"), + selected = "volume"), + uiOutput("ctr_project_selector_container"), width = 2), + column(selectInput("ctr_language_order", "Sort languages by", + list("Alphabetical order" = "alphabet", "Volume of requests" = "volume"), + selected = "volume"), + uiOutput("ctr_language_selector_container"), width = 2), + column(dygraphOutput("kpi_ctr_langproj_plot"), + div(id = "kpi_ctr_langproj_legend", style = "margin-top:30px;"), width = 8)), + includeMarkdown("./tab_documentation/kpi_ctr_langproj.md") + ), tabItem(tabName = "desktop_events", fluidRow( valueBoxOutput("desktop_event_searches"), @@ -176,6 +196,42 @@ div(id = "paulscore_approx_legend", style = "text-align: center;"), dygraphOutput("paulscore_approx_plot_autocomplete"), includeHTML("./tab_documentation/paulscore_approx.html")), + tabItem(tabName = "desktop_events_langproj", + fluidRow(column(polloi::smooth_select("smoothing_desktop_events_langproj"), width = 6), + column(selectInput("desktop_event_type", "Events Type", + 
list("Clickthroughs" = "clickthroughs", "Result pages opened" = "Result pages opened", "Search sessions" = "search sessions"), + selected = "clickthroughs"), width = 6)), + fluidRow(column(selectInput("desktop_events_project_order", "Sort projects by", + list("Alphabetical order" = "alphabet", "Volume of requests" = "volume"), + selected = "volume"), + uiOutput("desktop_events_project_selector_container"), width = 2), + column(selectInput("desktop_events_language_order", "Sort languages by", + list("Alphabetical order" = "alphabet", "Volume of requests" = "volume"), + selected = "volume"), + uiOutput("desktop_events_language_selector_container"), width = 2), + column(dygraphOutput("desktop_events_langproj_plot"), + div(id = "desktop_events_langproj_legend", style = "margin-top:30px;"), width = 8)), + includeMarkdown("./tab_documentation/desktop_events_langproj.md") + ), + tabItem(tabName = "paulscore_langproj", + fluidRow(column(polloi::smooth_select("smoothing_paulscore_langproj"), width = 6), + column(selectInput("paulscore_factor", "Scoring Factor", + list("F = 0.1" = "F = 0.1", "F = 0.5" = "F = 0.5", "F = 0.9" = "F = 0.9"), + selected = "F = 0.5"), width = 6)), + checkboxInput("paulscore_relative_langproj", "Use relative PaulScores", FALSE), + helpText("Divides PaulScore by the maximum possible score for each F"), + fluidRow(column(selectInput("paulscore_project_order", "Sort projects by", + list("Alphabetical order" = "alphabet", "Volume of requests" = "volume"), + selected = "volume"), + uiOutput("paulscore_project_selector_container"), width = 2), + column(selectInput("paulscore_language_order", "Sort languages by", + list("Alphabetical order" = "alphabet", "Volume of requests" = "volume"), + selected = "volume"), + uiOutput("paulscore_language_selector_container"), width = 2), + column(dygraphOutput("paulscore_langproj_plot"), + div(id = "paulscore_langproj_legend", style = "margin-top:30px;"), width = 8)), + 
includeHTML("./tab_documentation/paulscore_langproj.html") + ), tabItem(tabName = "mobile_events", fluidRow( valueBoxOutput("mobile_event_searches"), diff --git a/utils.R b/utils.R index 98fdb4e..a5b0a1a 100644 --- a/utils.R +++ b/utils.R @@ -1,5 +1,10 @@ library(magrittr) +capitalize_first_letter <- function(x) { + s <- strsplit(x, " ")[[1]] + return(paste(toupper(substring(s, 1,1)), substring(s, 2), sep = "", collapse = " ")) +} + ## Read in desktop data and generate means for the value boxes, along with a time-series appropriate form for ## dygraphs. read_desktop <- function() { @@ -9,6 +14,27 @@ desktop_dygraph_means <<- round(colMeans(desktop_dygraph_set[, 2:5])) desktop_load_data <<- polloi::read_dataset("discovery/search/desktop_load_times.tsv", col_types = "Dddd") %>% dplyr::filter(!is.na(Median)) + # Broken down by language-project pair + desktop_langproj_dygraph_set <<- polloi::read_dataset("discovery/search/desktop_event_counts_langproj_breakdown.tsv", col_types = "Dccci") %>% + dplyr::filter(!is.na(action), !is.na(events)) %>% + dplyr::mutate(language = ifelse(is.na(language), "(None)", language)) %>% + tidyr::spread(action, events, fill = 0) + ## Summaries for sorting (search sessions) + available_languages_desktop <<- desktop_langproj_dygraph_set %>% + dplyr::group_by(language) %>% + dplyr::summarize(volume = sum(as.numeric(`search sessions`), na.rm = TRUE)) %>% + dplyr::filter(volume > 0) %>% + dplyr::arrange(desc(volume)) %>% + dplyr::mutate(prop = volume/sum(volume), + label = sprintf("%s (%.3f%%)", language, 100*prop)) + available_projects_desktop <<- desktop_langproj_dygraph_set %>% + dplyr::group_by(project) %>% + dplyr::summarize(volume = sum(as.numeric(`search sessions`), na.rm = TRUE)) %>% + dplyr::filter(volume > 0) %>% + dplyr::arrange(desc(volume)) %>% + dplyr::mutate(prop = volume/sum(volume), + label = sprintf("%s (%.3f%%)", project, 100*prop)) + # projects_db <<- readr::read_csv(system.file("extdata/projects.csv", package = "polloi"), 
col_types = "cclc")[, c('project', 'multilingual')] } read_web <- function() { @@ -148,22 +174,22 @@ ### With automata langproj_with_automata <<- polloi::read_dataset("discovery/search/cirrus_langproj_breakdown_with_automata.tsv", na = "~", col_types = "Dccii") %>% dplyr::filter(!is.na(zero_results), !is.na(total)) %>% - dplyr::mutate(language = sub("NA", "(None)", language)) + dplyr::mutate(language = ifelse(is.na(language), "(None)", language)) ### Without automata langproj_no_automata <<- polloi::read_dataset("discovery/search/cirrus_langproj_breakdown_no_automata.tsv", na = "~", col_types = "Dccii") %>% dplyr::filter(!is.na(zero_results), !is.na(total)) %>% - dplyr::mutate(language = sub("NA", "(None)", language)) + dplyr::mutate(language = ifelse(is.na(language), "(None)", language)) ### Summaries for sorting available_languages <<- langproj_with_automata %>% dplyr::group_by(language) %>% - dplyr::summarize(volume = sum(as.numeric(total))) %>% + dplyr::summarize(volume = sum(as.numeric(total), na.rm = TRUE)) %>% dplyr::filter(volume > 0) %>% dplyr::arrange(desc(volume)) %>% dplyr::mutate(prop = volume/sum(volume), label = sprintf("%s (%.3f%%)", language, 100*prop)) available_projects <<- langproj_with_automata %>% dplyr::group_by(project) %>% - dplyr::summarize(volume = sum(as.numeric(total))) %>% + dplyr::summarize(volume = sum(as.numeric(total), na.rm = TRUE)) %>% dplyr::filter(volume > 0) %>% dplyr::arrange(desc(volume)) %>% dplyr::mutate(prop = volume/sum(volume), @@ -193,6 +219,68 @@ ) } +read_augmented_clickthrough_langproj <- function() { + # Read data + threshold_data <- polloi::read_dataset("discovery/search/search_threshold_pass_rate_langproj_breakdown.tsv", col_types = "Dccdi") %>% + dplyr::filter(!is.na(threshold_pass)) %>% + dplyr::mutate(threshold_pass = 100 * threshold_pass, language = ifelse(is.na(language), "(None)", language)) + mobile_langproj <- polloi::read_dataset("discovery/search/mobile_event_counts_langproj_breakdown.tsv", col_types = 
"Dccci") %>% + dplyr::mutate(language = ifelse(is.na(language), "(None)", language)) %>% + dplyr::filter(!is.na(action), !is.na(events), !is.na(project)) %>% + tidyr::spread(action, events, fill = 0) + app_langproj <- polloi::read_dataset("discovery/search/app_event_counts_langproj_breakdown.tsv", col_types = "Dccci") %>% + dplyr::mutate(language = ifelse(is.na(language), "(None)", language)) %>% + dplyr::mutate(project = "Wikipedia") %>% + dplyr::filter(!is.na(action), !is.na(events)) %>% + dplyr::distinct(date, platform, language, project, action, .keep_all = TRUE) + ios_langproj <- app_langproj %>% + dplyr::filter(platform == "iOS") %>% + dplyr::select(-platform) %>% + tidyr::spread(action, events, fill = 0) + android_langproj <- app_langproj %>% + dplyr::filter(platform == "Android") %>% + dplyr::select(-platform) %>% + tidyr::spread(action, events, fill = 0) + # Augmented clickthroughs + augmented_clickthroughs_langproj <<- list( + desktop = dplyr::select(desktop_langproj_dygraph_set, c(date, language, project, clickthroughs, `Result pages opened`)), + mobile = dplyr::select(mobile_langproj, c(date, language, project, clickthroughs, `Result pages opened`)), + ios = dplyr::select(ios_langproj, c(date, language, project, clickthroughs, `Result pages opened`)), + android = dplyr::select(android_langproj, c(date, language, project, clickthroughs, `Result pages opened`)) + ) %>% + dplyr::bind_rows(.id = "platform") %>% + dplyr::group_by(date, language, project) %>% + dplyr::summarize(clickthroughs = sum(clickthroughs), serps = sum(`Result pages opened`)) %>% + dplyr::right_join(threshold_data, by = c("date", "language", "project")) %>% + dplyr::ungroup() %>% + dplyr::transmute( + date = date, + language = language, + project = project, + `Result pages opened` = serps, + search_sessions_threshold = search_sessions, + `Threshold-passing %` = round(threshold_pass, 2), + `Clickthrough rate` = round(100 * clickthroughs/serps, 2), + `User engagement` = 
round((threshold_pass + `Clickthrough rate`)/2, 2) + ) + # Summaries for sorting (SERP) + available_languages_ctr <<- augmented_clickthroughs_langproj %>% + dplyr::group_by(language) %>% + dplyr::summarize(volume = sum(as.numeric(`Result pages opened`), na.rm = TRUE)) %>% + dplyr::filter(volume > 0) %>% + dplyr::arrange(desc(volume)) %>% + dplyr::mutate(prop = volume/sum(volume), + label = sprintf("%s (%.3f%%)", language, 100*prop)) + available_projects_ctr <<- augmented_clickthroughs_langproj %>% + dplyr::group_by(project) %>% + dplyr::summarize(volume = sum(as.numeric(`Result pages opened`), na.rm = TRUE)) %>% + dplyr::filter(volume > 0) %>% + dplyr::arrange(desc(volume)) %>% + dplyr::mutate(prop = volume/sum(volume), + label = sprintf("%s (%.3f%%)", project, 100*prop)) + # projects_db <<- readr::read_csv(system.file("extdata/projects.csv", package = "polloi"), col_types = "cclc")[, c('project', 'multilingual')] +} + read_lethal_dose <- function() { user_page_visit_dataset <<- polloi::read_dataset("discovery/search/sample_page_visit_ld.tsv", col_types = "Dddddddd") %>% dplyr::filter(!is.na(LD10)) %>% @@ -205,6 +293,93 @@ dplyr::select(c(date, event_source, `F = 0.1` = pow_1, `F = 0.5` = pow_5, `F = 0.9` = pow_9)) paulscore_autocomplete <<- dplyr::filter(paulscore, event_source == "autocomplete") %>% dplyr::select(-event_source) paulscore_fulltext <<- dplyr::filter(paulscore, event_source == "fulltext") %>% dplyr::select(-event_source) + # Broken down by language-project pair + paulscore_fulltext_langproj <<- polloi::read_dataset("discovery/search/paulscore_approximations_fulltext_langproj_breakdown.tsv", col_types = "Dcciddddddddd") %>% + dplyr::mutate(language = ifelse(is.na(language), "(None)", language)) %>% + dplyr::filter(!is.na(project)) %>% + dplyr::select(c(date, language, project, `search sessions` = search_sessions, `F = 0.1` = pow_1, `F = 0.5` = pow_5, `F = 0.9` = pow_9)) + ## Summaries for sorting (search sessions) + available_languages_paulscore <<- 
paulscore_fulltext_langproj %>%
+    dplyr::group_by(language) %>%
+    dplyr::summarize(volume = sum(as.numeric(`search sessions`), na.rm = TRUE)) %>%
+    dplyr::filter(volume > 0) %>%
+    dplyr::arrange(desc(volume)) %>%
+    dplyr::mutate(prop = volume/sum(volume),
+                  label = sprintf("%s (%.3f%%)", language, 100*prop))
+  available_projects_paulscore <<- paulscore_fulltext_langproj %>%
+    dplyr::group_by(project) %>%
+    dplyr::summarize(volume = sum(as.numeric(`search sessions`), na.rm = TRUE)) %>%
+    dplyr::filter(volume > 0) %>%
+    dplyr::arrange(desc(volume)) %>%
+    dplyr::mutate(prop = volume/sum(volume),
+                  label = sprintf("%s (%.3f%%)", project, 100*prop))
+  # projects_db <<- readr::read_csv(system.file("extdata/projects.csv", package = "polloi"), col_types = "cclc")[, c('project', 'multilingual')]
+}
+
+# Removes a trailing " (12.345%)" share suffix, i.e. the format produced by the
+# `label` columns built with sprintf("%s (%.3f%%)", ...) in the readers above,
+# so "English (12.345%)" becomes "English". Up to three integer digits are
+# accepted so that a lone entry labelled "(100.000%)" is stripped as well, and
+# the pattern is anchored so that only a suffix is ever removed.
+strip_share_label <- function(labels) {
+  sub(" \\([0-9]{1,3}\\.[0-9]{1,3}%\\)$", "", labels)
+}
+
+# TRUE when "(None)" is the only selected language; the *_aggregate_wikis
+# functions below then aggregate per project instead of per language-project
+# pair. An NA selection yields FALSE rather than an error in `if`.
+only_none_language <- function(languages) {
+  length(languages) == 1 && !is.na(languages[1]) && languages[1] == "(None)"
+}
+
+# Keeps the rows of the selected projects, renames `project` to `wiki` and
+# groups by date and wiki, ready for a per-project summarize().
+group_by_project <- function(data, projects) {
+  data %>%
+    dplyr::filter(project %in% projects) %>%
+    dplyr::rename(wiki = project) %>%
+    dplyr::group_by(date, wiki)
+}
+
+# Keeps the selected language-project pairs and merges both columns into one
+# `wiki` label ("<language> <project>"); the "(None) " prefix is dropped for
+# rows whose language is the "(None)" placeholder.
+label_language_projects <- function(data, languages, projects) {
+  data %>%
+    dplyr::filter(language %in% languages & project %in% projects) %>%
+    tidyr::unite(wiki, language, project, sep = " ") %>%
+    dplyr::mutate(wiki = sub("(None) ", "", wiki, fixed = TRUE))
+}
+
+# Clickthrough KPIs restricted to the selected wikis.
+#
+# data:      data frame with date, language, project, `Threshold-passing %`,
+#            `Clickthrough rate`, `Result pages opened` and
+#            search_sessions_threshold columns.
+# languages: selected languages, with or without the share suffix.
+# projects:  selected projects, with or without the share suffix.
+#
+# Returns the matching rows with language and project merged into a `wiki`
+# column. When "(None)" is the only selected language, rows are instead
+# aggregated per date and project as volume-weighted means rounded to 2 digits.
+kpi_ctr_aggregate_wikis <- function(data, languages, projects) {
+  languages <- strip_share_label(languages)
+  projects <- strip_share_label(projects)
+  if (!only_none_language(languages)) {
+    return(label_language_projects(data, languages, projects))
+  }
+  group_by_project(data, projects) %>%
+    dplyr::summarize(
+      `Threshold-passing %` = round(
+        sum(`Threshold-passing %` * search_sessions_threshold) /
+          sum(search_sessions_threshold),
+        2
+      ),
+      `Clickthrough rate` = round(
+        sum(`Clickthrough rate` * `Result pages opened`) /
+          sum(`Result pages opened`),
+        2
+      ),
+      # Built from the two aggregated (already rounded) values just above.
+      `User engagement` = round(
+        (`Threshold-passing %` + `Clickthrough rate`) / 2,
+        2
+      )
+    ) %>%
+    dplyr::ungroup()
+}
+
+# Desktop event counts restricted to the selected wikis.
+#
+# data:      data frame with date, language, project, clickthroughs,
+#            `Result pages opened` and `search sessions` columns.
+# languages: selected languages, with or without the share suffix.
+# projects:  selected projects, with or without the share suffix.
+#
+# Returns the matching rows with language and project merged into a `wiki`
+# column. When "(None)" is the only selected language, the counts are instead
+# summed per date and project.
+desktop_events_aggregate_wikis <- function(data, languages, projects) {
+  languages <- strip_share_label(languages)
+  projects <- strip_share_label(projects)
+  if (!only_none_language(languages)) {
+    return(label_language_projects(data, languages, projects))
+  }
+  # as.numeric() makes the sums run in double precision (presumably to avoid
+  # integer overflow on large counts).
+  group_by_project(data, projects) %>%
+    dplyr::summarize(
+      clickthroughs = round(sum(as.numeric(clickthroughs)), 2),
+      `Result pages opened` = round(sum(as.numeric(`Result pages opened`)), 2),
+      `search sessions` = round(sum(as.numeric(`search sessions`)), 2)
+    ) %>%
+    dplyr::ungroup()
+}
+
+# PaulScore approximations restricted to the selected wikis.
+#
+# data:      data frame with date, language, project, `search sessions`,
+#            `F = 0.1`, `F = 0.5` and `F = 0.9` columns.
+# languages: selected languages, with or without the share suffix.
+# projects:  selected projects, with or without the share suffix.
+#
+# Returns the matching rows with language and project merged into a `wiki`
+# column. When "(None)" is the only selected language, each score is instead
+# averaged per date and project, weighted by `search sessions` and rounded to
+# 2 digits.
+paulscore_aggregate_wikis <- function(data, languages, projects) {
+  languages <- strip_share_label(languages)
+  projects <- strip_share_label(projects)
+  if (!only_none_language(languages)) {
+    return(label_language_projects(data, languages, projects))
+  }
+  group_by_project(data, projects) %>%
+    dplyr::summarize(
+      `F = 0.1` = round(
+        sum(`F = 0.1` * `search sessions`) / sum(`search sessions`), 2
+      ),
+      `F = 0.5` = round(
+        sum(`F = 0.5` * `search sessions`) / sum(`search sessions`), 2
+      ),
+      `F = 0.9` = round(
+        sum(`F = 0.9` * `search sessions`) / sum(`search sessions`), 2
+      )
+    ) %>%
+    dplyr::ungroup()
 }
 
 aggregate_wikis <- function(data, languages, projects) {
-- To view, visit https://gerrit.wikimedia.org/r/346461 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: Ie04762d747a9dcbec1564d8945f8949ed8c52adc Gerrit-PatchSet: 1 Gerrit-Project: wikimedia/discovery/rainbow Gerrit-Branch: master Gerrit-Owner: Chelsyx
<c...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits