Bearloga has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/346461 )
Change subject: Implement the wiki/language selector in Search Metrics ...................................................................... Implement the wiki/language selector in Search Metrics Bug: T150410 Change-Id: Ie04762d747a9dcbec1564d8945f8949ed8c52adc --- M CHANGELOG.md M server.R D tab_documentation/failure_langproj.md A tab_documentation/langproj_breakdown.md M ui.R M utils.R 6 files changed, 310 insertions(+), 114 deletions(-) Approvals: Bearloga: Verified; Looks good to me, approved diff --git a/CHANGELOG.md b/CHANGELOG.md index f01c6ea..ab4260b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,9 @@ All notable changes to this project will be documented in this file. +## 2017/05/01 +- Added a language-project breakdown of additional metrics ([T150410](https://phabricator.wikimedia.org/T150410)) + ## 2017/02/02 - Updated to work with new datasets generated by Reportupdater-based golden ([T150915](https://phabricator.wikimedia.org/T150915)) diff --git a/server.R b/server.R index 2c0bd92..2225127 100644 --- a/server.R +++ b/server.R @@ -23,9 +23,11 @@ progress$set(message = "Downloading API usage data", value = 0.4) read_api() progress$set(message = "Downloading zero results data", value = 0.5) - read_failures(existing_date) - progress$set(message = "Downloading engagement data", value = 0.7) + read_failures() + progress$set(message = "Downloading engagement data", value = 0.6) read_augmented_clickthrough() + progress$set(message = "Downloading language-project engagement data", value = 0.7) + read_augmented_clickthrough_langproj() progress$set(message = "Downloading survival data", value = 0.8) read_lethal_dose() progress$set(message = "Downloading PaulScore data", value = 0.9) @@ -354,65 +356,6 @@ dyRangeSelector(fillColor = "") %>% dyEvent(as.Date("2016-02-01"), "A (format switch)", labelLoc = "bottom") %>% dyEvent(as.Date("2016-03-16"), "Completion Suggester Deployed", labelLoc = "bottom") %>% - dyEvent(as.Date("2017-01-01"), "R (reportupdater)", labelLoc = "bottom") - }) - - output$language_selector_container <- renderUI({ - if (input$language_order == "alphabet") { - languages_to_display <- as.list(sort(available_languages$language)) - names(languages_to_display) <- available_languages$label[order(available_languages$language)] - } else { - languages_to_display <- available_languages$language - names(languages_to_display) <- available_languages$label - } - - # e.g. if user sorts projects alphabetically and the selected project is "10th Anniversary of Wikipeda" - # then automatically select the language "(None)" to avoid giving user an error. This also works if - # the user selects a project that is not multilingual, so this automatically chooses the "(None)" - # option for the user. - if (any(input$project_selector %in% projects_db$project[!projects_db$multilingual])) { - if (any(input$project_selector %in% projects_db$project[projects_db$multilingual])) { - if (!is.null(input$language_selector)) { - selected_language <- union("(None)", input$language_selector) - } else { - selected_language <- c("(None)", languages_to_display[[1]]) - } - } else { - selected_language <- "(None)" - } - } else { - if (!is.null(input$language_selector)) { - selected_language <- input$language_selector - } else { - selected_language <- languages_to_display[[1]] - } - } - return(selectInput("language_selector", "Language", multiple = TRUE,selectize = FALSE, size = 19, - choices = languages_to_display, selected = selected_language)) - }) - - output$project_selector_container <- renderUI({ - if (input$project_order == "alphabet") { - projects_to_display <- as.list(sort(available_projects$project)) - names(projects_to_display) <- available_projects$label[order(available_projects$project)] - } else { - projects_to_display <- available_projects$project - names(projects_to_display) <- available_projects$label - } - return(selectInput("project_selector", "Project", multiple = TRUE,selectize = FALSE, size = 19, - choices = projects_to_display, selected = projects_to_display[[1]])) - }) - - output$failure_langproj_plot <- renderDygraph({ - input$failure_langproj_automata %>% - polloi::data_select(langproj_with_automata, langproj_no_automata) %>% - aggregate_wikis(input$language_selector, input$project_selector) %>% - polloi::smoother(smooth_level = polloi::smooth_switch(input$smoothing_global, input$smoothing_failure_langproj)) %>% - polloi::make_dygraph(xlab = "", ylab = "Zero Results Rate", title = "Zero result rate by language and project") %>% - dyAxis("y", axisLabelFormatter = "function(x) { return x + '%'; }", valueFormatter = "function(x) { return Math.round(x, 3) + '%'; }") %>% - dyLegend(show = "always", width = 400, labelsDiv = "failure_langproj_legend") %>% - dyAxis("x", axisLabelFormatter = polloi::custom_axis_formatter) %>% - dyRangeSelector(fillColor = "") %>% dyEvent(as.Date("2017-01-01"), "R (reportupdater)", labelLoc = "bottom") }) @@ -878,6 +821,116 @@ dyEvent(as.Date("2017-01-01"), "R (reportupdater)", labelLoc = "bottom") }) + output$language_selector_container <- renderUI({ + if (input$langproj_metrics %in% c("User engagement", "Threshold-passing %", "Clickthrough rate")){ + temp_language <- available_languages_ctr + } else if (input$langproj_metrics %in% c("clickthroughs", "Result pages opened", "search sessions")){ + temp_language <- available_languages_desktop + } else if (input$langproj_metrics %in% c("F = 0.1", "F = 0.5", "F = 0.9")){ + temp_language <- available_languages_paulscore + } else{ + temp_language <- available_languages + } + + if (input$language_order == "alphabet") { + languages_to_display <- as.list(sort(temp_language$language)) + names(languages_to_display) <- temp_language$label[order(temp_language$language)] + } else { + languages_to_display <- temp_language$language + names(languages_to_display) <- temp_language$label + } + + # e.g. if user sorts projects alphabetically and the selected project is "10th Anniversary of Wikipeda" + # then automatically select the language "(None)" to avoid giving user an error. This also works if + # the user selects a project that is not multilingual, so this automatically chooses the "(None)" + # option for the user. + if (any(input$project_selector %in% projects_db$project[!projects_db$multilingual])) { + if (any(input$project_selector %in% projects_db$project[projects_db$multilingual])) { + if (!is.null(input$language_selector)) { + selected_language <- union("(None)", input$language_selector) + } else { + selected_language <- c("(None)", languages_to_display[[1]]) + } + } else { + selected_language <- "(None)" + } + } else { + if (!is.null(input$language_selector)) { + selected_language <- input$language_selector + } else { + selected_language <- languages_to_display[[1]] + } + } + return(selectInput("language_selector", "Language", multiple = TRUE, selectize = FALSE, size = 19, + choices = languages_to_display, selected = selected_language)) + }) + + output$project_selector_container <- renderUI({ + if (input$langproj_metrics %in% c("User engagement", "Threshold-passing %", "Clickthrough rate")){ + temp_project <- available_projects_ctr + } else if (input$langproj_metrics %in% c("clickthroughs", "Result pages opened", "search sessions")){ + temp_project <- available_projects_desktop + } else if (input$langproj_metrics %in% c("F = 0.1", "F = 0.5", "F = 0.9")){ + temp_project <- available_projects_paulscore + } else{ + temp_project <- available_projects + } + + if (input$project_order == "alphabet") { + projects_to_display <- as.list(sort(temp_project$project)) + names(projects_to_display) <- temp_project$label[order(temp_project$project)] + } else { + projects_to_display <- temp_project$project + names(projects_to_display) <- temp_project$label + } + + if (!is.null(input$project_selector)) { + selected_project <- input$project_selector + } else { + selected_project <- projects_to_display[[1]] + } + return(selectInput("project_selector", "Project", multiple = TRUE,selectize = FALSE, size = 19, + choices = projects_to_display, selected = selected_project)) + }) + + output$langproj_breakdown_plot <- renderDygraph({ + # Select data + if (input$langproj_metrics %in% c("User engagement", "Threshold-passing %", "Clickthrough rate")){ + temp <- augmented_clickthroughs_langproj + } else if (input$langproj_metrics %in% c("clickthroughs", "Result pages opened", "search sessions")){ + temp <- desktop_langproj_dygraph_set + } else if (input$langproj_metrics %in% c("F = 0.1", "F = 0.5", "F = 0.9")){ + temp <- paulscore_fulltext_langproj + if (input$paulscore_relative_langproj) { + temp$`F = 0.1` <- temp$`F = 0.1` / (1/(1-0.1)) + temp$`F = 0.5` <- temp$`F = 0.5` / (1/(1-0.5)) + temp$`F = 0.9` <- temp$`F = 0.9` / (1/(1-0.9)) + } + } else{ + temp <- input$failure_langproj_automata %>% + polloi::data_select(langproj_with_automata, langproj_no_automata) + } + # Plot + dyOut <- temp %>% + aggregate_wikis(input$language_selector, input$project_selector, input$langproj_metrics) %>% + polloi::smoother(smooth_level = polloi::smooth_switch(input$smoothing_global, input$smoothing_langproj_breakdown)) %>% + polloi::make_dygraph(xlab = "Date", + ylab = ifelse(grepl("^F = ", input$langproj_metrics), paste0("PaulScore, ", input$langproj_metrics), capitalize_first_letter(input$langproj_metrics)), + title = ifelse(grepl("^F = ", input$langproj_metrics), paste0("PaulScore for fulltext searches, ", input$langproj_metrics), paste0(capitalize_first_letter(input$langproj_metrics), ", by language and project"))) %>% + dyLegend(show = "always", width = 400, labelsDiv = "langproj_breakdown_legend") %>% + dyAxis("x", axisLabelFormatter = polloi::custom_axis_formatter) %>% + dyRangeSelector(fillColor = "") %>% + dyEvent(as.Date("2017-01-01"), "R (reportupdater)", labelLoc = "bottom") %>% + dyEvent(as.Date("2017-03-29"), "M (Eventlogging Maintenance)", labelLoc = "bottom") + if (input$paulscore_relative_langproj) { + dyOut <- dyAxis(dyOut, "y", axisLabelFormatter = "function(x) { return Math.round(100*x, 2) + '%'; }", valueFormatter = "function(x) { return Math.round(100*x, 2) + '%'; }") + } + if (input$langproj_metrics %in% c("User engagement", "Threshold-passing %", "Clickthrough rate", "Zero result rate")){ + dyOut <- dyAxis(dyOut, "y", axisLabelFormatter = "function(x) { return x + '%'; }", valueFormatter = "function(x) { return Math.round(x * 1000)/1000 + '%'; }") + } + return(dyOut) + }) + output$monthly_metrics_tbl <- DT::renderDataTable({ temp <- data.frame( KPI = c("Load time", "Zero results rate", "API Usage", "User engagement"), diff --git a/tab_documentation/failure_langproj.md b/tab_documentation/failure_langproj.md deleted file mode 100644 index 42de070..0000000 --- a/tab_documentation/failure_langproj.md +++ /dev/null @@ -1,31 +0,0 @@ -Usage and Zero Results Rate by Languages and Projects -======= - -Sometimes, searches return zero results. What we're visualising here is the percentage of the time -a search query returns zero results, split out by language (e.g. English vs Russian) and project (e.g. Wikipedia vs Wiktionary). - -Notes/Tips ------- -* The percentages next to the language and project names represent the proportion of the total volume. -* You can select multiple projects and multiple languages to compare simultaneously. (Hold down Ctrl on Windows or Command on Mac.) -* For each arbitrary combination, the zero results rate is the overall rate (full-text AND prefix, web AND api). -* The language picker will automatically choose "(None)" if you select a non-multilingual project such as Wikidata. -* If you're interested in the overall ZRR for a multilingual project such as Wikipedia, make sure only "(None)" is selected in the languages picker. -* Due to the high number of language-project combinations, we have restricted ourselves to only storing the last 30 days of data. - -Outages and inaccuracies ------- -* On 15 January 2016 there was an [issue](https://phabricator.wikimedia.org/T123541) with Avro serialization that prevented data from entering the Hadoop cluster. A [patch](https://gerrit.wikimedia.org/r/#/c/264989/) was deployed on 19 January 2016. As a result, there are no recorded zero results rates for 01/15-01/19. The values you may see on those dates are estimates computed with [statistical models](https://github.com/bearloga/branch/blob/master/zero%20results%20rate%20estimation/report.pdf). -* '__R__': on 2017-01-01 we started calculating all of Discovery's metrics using a new version of [our data retrieval and processing codebase](https://phabricator.wikimedia.org/diffusion/WDGO/) that we migrated to [Wikimedia Analytics](https://www.mediawiki.org/wiki/Analytics)' [Reportupdater infrastructure](https://wikitech.wikimedia.org/wiki/Analytics/Reportupdater). See [T150915](https://phabricator.wikimedia.org/T150915) for more details. - -Questions, bug reports, and feature suggestions ------- -For technical, non-bug questions, [email Mikhail](mailto:[email protected]?subject=Dashboard%20Question). If you experience a bug or notice something wrong or have a suggestion, [open a ticket in Phabricator](https://phabricator.wikimedia.org/maniphest/task/create/?projects=Discovery) in the Discovery board or [email Deb](mailto:[email protected]?subject=Dashboard%20Question). - -<hr style="border-color: gray;"> -<p style="font-size: small; color: gray;"> - <strong>Link to this dashboard:</strong> - <a href="http://discovery.wmflabs.org/metrics/#failure_breakdown"> - http://discovery.wmflabs.org/metrics/#failure_breakdown - </a> -</p> diff --git a/tab_documentation/langproj_breakdown.md b/tab_documentation/langproj_breakdown.md new file mode 100644 index 0000000..f999268 --- /dev/null +++ b/tab_documentation/langproj_breakdown.md @@ -0,0 +1,30 @@ +Breakdown by Languages and Projects +======= + +On this page, we split out several metrics by language (e.g. English vs Russian) and project (e.g. Wikipedia vs Wiktionary) to help us understand the differences between wikis. See the following pages for more details on how we compute these metrics: + + - [Augmented Clickthrough](http://discovery.wmflabs.org/metrics/#kpi_augmented_clickthroughs) + - [Desktop Events](http://discovery.wmflabs.org/metrics/#desktop_events) + - [Paulscore](http://discovery.wmflabs.org/metrics/#paulscore_approx) + - [Zero Results](http://discovery.wmflabs.org/metrics/#failure_rate) + +Notes/Tips +------ +* The percentages next to the language and project names represent the proportion of the total volume. +* You can select multiple projects and multiple languages to compare simultaneously. (Hold down Ctrl on Windows or Command on Mac.) +* For each arbitrary combination, the zero results rate is the overall rate (full-text AND prefix, web AND api). +* The language picker will automatically choose "(None)" if you select a non-multilingual project such as Wikidata. +* If you're interested in the overall metric for a multilingual project such as Wikipedia, make sure only "(None)" is selected in the languages picker. +* Due to the high number of language-project combinations, we have restricted ourselves to only storing the last 30 days of data. + +Questions, bug reports, and feature suggestions +------ +For technical, non-bug questions, [email Mikhail](mailto:[email protected]?subject=Dashboard%20Question). If you experience a bug or notice something wrong or have a suggestion, [open a ticket in Phabricator](https://phabricator.wikimedia.org/maniphest/task/create/?projects=Discovery) in the Discovery board or [email Deb](mailto:[email protected]?subject=Dashboard%20Question). + +<hr style="border-color: gray;"> +<p style="font-size: small; color: gray;"> + <strong>Link to this dashboard:</strong> + <a href="http://discovery.wmflabs.org/metrics/#langproj_breakdown"> + http://discovery.wmflabs.org/metrics/#langproj_breakdown + </a> +</p> diff --git a/ui.R b/ui.R index 9a25944..9684c54 100644 --- a/ui.R +++ b/ui.R @@ -62,9 +62,9 @@ menuItem(text = "Zero Results", menuSubItem(text = "Summary", tabName = "failure_rate"), menuSubItem(text = "Search Type Breakdown", tabName = "failure_breakdown"), - menuSubItem(text = "Search Suggestions", tabName = "failure_suggestions"), - menuSubItem(text = "Language/Project Breakdown", tabName = "failure_langproj")), + menuSubItem(text = "Search Suggestions", tabName = "failure_suggestions")), menuItem(text = "Page Visit Times", tabName = "survival"), + menuItem(text = "Language/Project Breakdown", tabName = "langproj_breakdown"), menuItem(text = "Global Settings", selectInput(inputId = "smoothing_global", label = "Smoothing", selectize = TRUE, selected = "day", choices = c("No Smoothing" = "day", "Weekly Median" = "week", @@ -271,9 +271,20 @@ dygraphOutput("suggestion_dygraph_plot"), includeMarkdown("./tab_documentation/failure_suggests.md") ), - tabItem(tabName = "failure_langproj", - polloi::smooth_select("smoothing_failure_langproj"), - polloi::automata_select(input_id = "failure_langproj_automata"), + tabItem(tabName = "langproj_breakdown", + fluidRow(column(polloi::smooth_select("smoothing_langproj_breakdown"), width = 4), + column(selectInput("langproj_metrics", "Metrics", + choices = list( + `Augmented Clickthrough` = c(`User engagement` = "User engagement", `Threshold-passing %` = "Threshold-passing %", `Clickthrough rate` = "Clickthrough rate"), + `Desktop Events` = c(`Clickthroughs` = "clickthroughs", `Result pages opened` = "Result pages opened", `Search sessions` = "search sessions"), + `Paulscore` = c(`F = 0.1` = "F = 0.1", `F = 0.5` = "F = 0.5", `F = 0.9` = "F = 0.9"), + `Zero Results` = c(`Zero result rate` = "Zero result rate") + ), + selected = "User engagement", selectize = FALSE), width = 4), + column(conditionalPanel("input.langproj_metrics == 'Zero result rate'", polloi::automata_select(input_id = "failure_langproj_automata")), width = 4), + column(conditionalPanel("input.langproj_metrics == 'F = 0.1' || input.langproj_metrics == 'F = 0.5' || input.langproj_metrics == 'F = 0.9'", + checkboxInput("paulscore_relative_langproj", "Use Relative PaulScores", FALSE), + helpText("Divides PaulScore by the maximum possible score for each F")), width = 4)), fluidRow(column(selectInput("project_order", "Sort projects by", list("Alphabetical order" = "alphabet", "Volume of requests" = "volume"), selected = "volume"), @@ -282,9 +293,9 @@ list("Alphabetical order" = "alphabet", "Volume of requests" = "volume"), selected = "volume"), uiOutput("language_selector_container"), width = 2), - column(dygraphOutput("failure_langproj_plot"), - div(id = "failure_langproj_legend", style = "margin-top:30px;"), width = 8)), - includeMarkdown("./tab_documentation/failure_langproj.md") + column(dygraphOutput("langproj_breakdown_plot"), + div(id = "langproj_breakdown_legend", style = "margin-top:30px;"), width = 8)), + includeMarkdown("./tab_documentation/langproj_breakdown.md") ), tabItem(tabName = "survival", polloi::smooth_select("smoothing_lethal_dose_plot"), diff --git a/utils.R b/utils.R index 98fdb4e..81b1593 100644 --- a/utils.R +++ b/utils.R @@ -1,5 +1,10 @@ library(magrittr) +capitalize_first_letter <- function(x) { + s <- strsplit(x, " ")[[1]] + return(paste(toupper(substring(s, 1,1)), substring(s, 2), sep = "", collapse = " ")) +} + ## Read in desktop data and generate means for the value boxes, along with a time-series appropriate form for ## dygraphs. read_desktop <- function() { @@ -9,6 +14,26 @@ desktop_dygraph_means <<- round(colMeans(desktop_dygraph_set[, 2:5])) desktop_load_data <<- polloi::read_dataset("discovery/search/desktop_load_times.tsv", col_types = "Dddd") %>% dplyr::filter(!is.na(Median)) + # Broken down by language-project pair + desktop_langproj_dygraph_set <<- polloi::read_dataset("discovery/search/desktop_event_counts_langproj_breakdown.tsv", col_types = "Dccci") %>% + dplyr::filter(!is.na(action), !is.na(events)) %>% + dplyr::mutate(language = ifelse(is.na(language), "(None)", language)) %>% + tidyr::spread(action, events, fill = 0) + ## Summaries for sorting (search sessions) + available_languages_desktop <<- desktop_langproj_dygraph_set %>% + dplyr::group_by(language) %>% + dplyr::summarize(volume = sum(as.numeric(`search sessions`), na.rm = TRUE)) %>% + dplyr::filter(volume > 0) %>% + dplyr::arrange(desc(volume)) %>% + dplyr::mutate(prop = volume/sum(volume), + label = sprintf("%s (%.3f%%)", language, 100*prop)) + available_projects_desktop <<- desktop_langproj_dygraph_set %>% + dplyr::group_by(project) %>% + dplyr::summarize(volume = sum(as.numeric(`search sessions`), na.rm = TRUE)) %>% + dplyr::filter(volume > 0) %>% + dplyr::arrange(desc(volume)) %>% + dplyr::mutate(prop = volume/sum(volume), + label = sprintf("%s (%.3f%%)", project, 100*prop)) } read_web <- function() { @@ -75,7 +100,7 @@ lapply(dplyr::select_, .dots = list(quote(-api))) } -read_failures <- function(date) { +read_failures <- function() { ## Zero results rate ### With automata failure_data_with_automata <<- polloi::read_dataset("discovery/search/cirrus_query_aggregates_with_automata.tsv", col_types = "Dd") %>% @@ -148,22 +173,22 @@ ### With automata langproj_with_automata <<- polloi::read_dataset("discovery/search/cirrus_langproj_breakdown_with_automata.tsv", na = "~", col_types = "Dccii") %>% dplyr::filter(!is.na(zero_results), !is.na(total)) %>% - dplyr::mutate(language = sub("NA", "(None)", language)) + dplyr::mutate(language = ifelse(is.na(language) | language == "NA", "(None)", language)) ### Without automata langproj_no_automata <<- polloi::read_dataset("discovery/search/cirrus_langproj_breakdown_no_automata.tsv", na = "~", col_types = "Dccii") %>% dplyr::filter(!is.na(zero_results), !is.na(total)) %>% - dplyr::mutate(language = sub("NA", "(None)", language)) + dplyr::mutate(language = ifelse(is.na(language) | language == "NA", "(None)", language)) ### Summaries for sorting available_languages <<- langproj_with_automata %>% dplyr::group_by(language) %>% - dplyr::summarize(volume = sum(as.numeric(total))) %>% + dplyr::summarize(volume = sum(as.numeric(total), na.rm = TRUE)) %>% dplyr::filter(volume > 0) %>% dplyr::arrange(desc(volume)) %>% dplyr::mutate(prop = volume/sum(volume), label = sprintf("%s (%.3f%%)", language, 100*prop)) available_projects <<- langproj_with_automata %>% dplyr::group_by(project) %>% - dplyr::summarize(volume = sum(as.numeric(total))) %>% + dplyr::summarize(volume = sum(as.numeric(total), na.rm = TRUE)) %>% dplyr::filter(volume > 0) %>% dplyr::arrange(desc(volume)) %>% dplyr::mutate(prop = volume/sum(volume), @@ -193,6 +218,67 @@ ) } +read_augmented_clickthrough_langproj <- function() { + # Read data + threshold_data <- polloi::read_dataset("discovery/search/search_threshold_pass_rate_langproj_breakdown.tsv", col_types = "Dccdi") %>% + dplyr::filter(!is.na(threshold_pass)) %>% + dplyr::mutate(threshold_pass = 100 * threshold_pass, language = ifelse(is.na(language), "(None)", language)) + mobile_langproj <- polloi::read_dataset("discovery/search/mobile_event_counts_langproj_breakdown.tsv", col_types = "Dccci") %>% + dplyr::mutate(language = ifelse(is.na(language), "(None)", language)) %>% + dplyr::filter(!is.na(action), !is.na(events), !is.na(project)) %>% + tidyr::spread(action, events, fill = 0) + app_langproj <- polloi::read_dataset("discovery/search/app_event_counts_langproj_breakdown.tsv", col_types = "Dccci") %>% + dplyr::mutate(language = ifelse(is.na(language), "(None)", language)) %>% + dplyr::mutate(project = "Wikipedia") %>% + dplyr::filter(!is.na(action), !is.na(events)) %>% + dplyr::distinct(date, platform, language, project, action, .keep_all = TRUE) + ios_langproj <- app_langproj %>% + dplyr::filter(platform == "iOS") %>% + dplyr::select(-platform) %>% + tidyr::spread(action, events, fill = 0) + android_langproj <- app_langproj %>% + dplyr::filter(platform == "Android") %>% + dplyr::select(-platform) %>% + tidyr::spread(action, events, fill = 0) + # Augmented clickthroughs + augmented_clickthroughs_langproj <<- list( + desktop = dplyr::select(desktop_langproj_dygraph_set, c(date, language, project, clickthroughs, `Result pages opened`)), + mobile = dplyr::select(mobile_langproj, c(date, language, project, clickthroughs, `Result pages opened`)), + ios = dplyr::select(ios_langproj, c(date, language, project, clickthroughs, `Result pages opened`)), + android = dplyr::select(android_langproj, c(date, language, project, clickthroughs, `Result pages opened`)) + ) %>% + dplyr::bind_rows(.id = "platform") %>% + dplyr::group_by(date, language, project) %>% + dplyr::summarize(clickthroughs = sum(clickthroughs), serps = sum(`Result pages opened`)) %>% + dplyr::right_join(threshold_data, by = c("date", "language", "project")) %>% + dplyr::ungroup() %>% + dplyr::transmute( + date = date, + language = language, + project = project, + `Result pages opened` = serps, + search_sessions_threshold = search_sessions, + `Threshold-passing %` = round(threshold_pass, 2), + `Clickthrough rate` = round(100 * clickthroughs/serps, 2), + `User engagement` = round((threshold_pass + `Clickthrough rate`)/2, 2) + ) + # Summaries for sorting (SERP) + available_languages_ctr <<- augmented_clickthroughs_langproj %>% + dplyr::group_by(language) %>% + dplyr::summarize(volume = sum(as.numeric(`Result pages opened`), na.rm = TRUE)) %>% + dplyr::filter(volume > 0) %>% + dplyr::arrange(desc(volume)) %>% + dplyr::mutate(prop = volume/sum(volume), + label = sprintf("%s (%.3f%%)", language, 100*prop)) + available_projects_ctr <<- augmented_clickthroughs_langproj %>% + dplyr::group_by(project) %>% + dplyr::summarize(volume = sum(as.numeric(`Result pages opened`), na.rm = TRUE)) %>% + dplyr::filter(volume > 0) %>% + dplyr::arrange(desc(volume)) %>% + dplyr::mutate(prop = volume/sum(volume), + label = sprintf("%s (%.3f%%)", project, 100*prop)) +} + read_lethal_dose <- function() { user_page_visit_dataset <<- polloi::read_dataset("discovery/search/sample_page_visit_ld.tsv", col_types = "Dddddddd") %>% dplyr::filter(!is.na(LD10)) %>% @@ -205,29 +291,73 @@ dplyr::select(c(date, event_source, `F = 0.1` = pow_1, `F = 0.5` = pow_5, `F = 0.9` = pow_9)) paulscore_autocomplete <<- dplyr::filter(paulscore, event_source == "autocomplete") %>% dplyr::select(-event_source) paulscore_fulltext <<- dplyr::filter(paulscore, event_source == "fulltext") %>% dplyr::select(-event_source) + # Broken down by language-project pair + paulscore_fulltext_langproj <<- polloi::read_dataset("discovery/search/paulscore_approximations_fulltext_langproj_breakdown.tsv", col_types = "Dcciddddddddd") %>% + dplyr::mutate(language = ifelse(is.na(language), "(None)", language)) %>% + dplyr::filter(!is.na(project)) %>% + dplyr::select(c(date, language, project, `search sessions` = search_sessions, `F = 0.1` = pow_1, `F = 0.5` = pow_5, `F = 0.9` = pow_9)) + ## Summaries for sorting (search sessions) + available_languages_paulscore <<- paulscore_fulltext_langproj %>% + dplyr::group_by(language) %>% + dplyr::summarize(volume = sum(as.numeric(`search sessions`), na.rm = TRUE)) %>% + dplyr::filter(volume > 0) %>% + dplyr::arrange(desc(volume)) %>% + dplyr::mutate(prop = volume/sum(volume), + label = sprintf("%s (%.3f%%)", language, 100*prop)) + available_projects_paulscore <<- paulscore_fulltext_langproj %>% + dplyr::group_by(project) %>% + dplyr::summarize(volume = sum(as.numeric(`search sessions`), na.rm = TRUE)) %>% + dplyr::filter(volume > 0) %>% + dplyr::arrange(desc(volume)) %>% + dplyr::mutate(prop = volume/sum(volume), + label = sprintf("%s (%.3f%%)", project, 100*prop)) } -aggregate_wikis <- function(data, languages, projects) { +aggregate_wikis <- function(data, languages, projects, input_metric) { languages <- sub(" \\([0-9]{1,2}\\.[0-9]{1,3}%\\)", "", languages) projects <- sub(" \\([0-9]{1,2}\\.[0-9]{1,3}%\\)", "", projects) if (length(languages) == 1 && languages[1] == "(None)") { temp <- data %>% dplyr::filter_(~project %in% projects) %>% dplyr::rename(wiki = project) %>% - dplyr::group_by(date, wiki) %>% - dplyr::summarize(zero_results = sum(as.numeric(zero_results)), - total = sum(as.numeric(total))) %>% - dplyr::ungroup() + dplyr::group_by(date, wiki) + if (input_metric %in% c("User engagement", "Threshold-passing %", "Clickthrough rate")){ + temp %<>% dplyr::summarize( + `Threshold-passing %` = round(sum(`Threshold-passing %`*search_sessions_threshold)/sum(search_sessions_threshold), 2), + `Clickthrough rate` = round(sum(`Clickthrough rate`*`Result pages opened`)/sum(`Result pages opened`), 2), + `User engagement` = round((`Threshold-passing %` + `Clickthrough rate`)/2, 2)) + } else if (input_metric %in% c("clickthroughs", "Result pages opened", "search sessions")){ + temp %<>% dplyr::summarize( + clickthroughs = round(sum(as.numeric(clickthroughs)), 2), + `Result pages opened` = round(sum(as.numeric(`Result pages opened`)), 2), + `search sessions` = round(sum(as.numeric(`search sessions`)), 2)) + } else if (input_metric %in% c("F = 0.1", "F = 0.5", "F = 0.9")){ + temp %<>% dplyr::summarize( + `F = 0.1` = round(sum(`F = 0.1`*`search sessions`)/sum(`search sessions`), 2), + `F = 0.5` = round(sum(`F = 0.5`*`search sessions`)/sum(`search sessions`), 2), + `F = 0.9` = round(sum(`F = 0.9`*`search sessions`)/sum(`search sessions`), 2)) + } else{ + temp %<>% dplyr::summarize( + zero_results = sum(as.numeric(zero_results)), + total = sum(as.numeric(total))) + } + temp %<>% dplyr::ungroup() } else { temp <- data %>% dplyr::filter_(~language %in% languages & project %in% projects) %>% tidyr::unite(wiki, language, project, sep = " ") %>% dplyr::mutate(wiki = sub("(None) ", "", wiki, fixed = TRUE)) } - temp %<>% - dplyr::mutate(zrr = round(100 * as.numeric(zero_results) / as.numeric(total), 2)) %>% - dplyr::select(-c(total, zero_results)) %>% - tidyr::spread(wiki, zrr) + if (input_metric == "Zero result rate"){ + temp %<>% + dplyr::mutate(zrr = round(100 * as.numeric(zero_results) / as.numeric(total), 2)) %>% + dplyr::select(-c(total, zero_results)) %>% + tidyr::spread(wiki, zrr) + } else { + temp %<>% + dplyr::select_(.dots=c("date", "wiki", paste0("`",input_metric,"`"))) %>% + tidyr::spread_(., key_col="wiki", value_col=input_metric, fill=0) + } return(temp) } -- To view, visit https://gerrit.wikimedia.org/r/346461 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: Ie04762d747a9dcbec1564d8945f8949ed8c52adc Gerrit-PatchSet: 8 Gerrit-Project: wikimedia/discovery/rainbow Gerrit-Branch: master Gerrit-Owner: Chelsyx <[email protected]> Gerrit-Reviewer: Bearloga <[email protected]> Gerrit-Reviewer: Chelsyx <[email protected]> _______________________________________________ MediaWiki-commits mailing list [email protected] https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
