Bearloga has submitted this change and it was merged.
Change subject: Update to use new format
......................................................................
Update to use new format
In I05984ad713ed18554afaa98dcbbe3dfade766fda, we updated the data
collection scripts to employ a new version of the UDFs, which
resulted in a new data format for the referer traffic data. This
patch updates the Traffic dashboard to be compatible with the new
data format, which breaks PVs down by referer class (external vs. internally-referred).
Bug: T130083, T129137
Change-Id: Ia5c3bf012acbda4120e12c4df14ed167ff3dd4f3
---
M server.R
M tab_documentation/traffic_summary.md
M utils.R
3 files changed, 23 insertions(+), 16 deletions(-)
Approvals:
Bearloga: Verified; Looks good to me, approved
diff --git a/server.R b/server.R
index bd18fbe..09cf7d4 100644
--- a/server.R
+++ b/server.R
@@ -19,8 +19,10 @@
polloi::smoother(smooth_level =
polloi::smooth_switch(input$smoothing_global, input$smoothing_traffic_summary))
%>%
polloi::subset_by_date_range(time_frame_range(input$traffic_summary_timeframe,
input$traffic_summary_timeframe_daterange)) %>%
polloi::make_dygraph(xlab = "Date", ylab = "Pageviews",
- title = "Pageviews from external search engines")
%>%
- dyLegend(labelsDiv = "traffic_summary_legend", show = "always")
+ title = "Sources of page views (e.g. search engines
and internal referers)") %>%
+ dyLegend(labelsDiv = "traffic_summary_legend", show = "always") %>%
+ dyAnnotation(x = as.Date("2016-03-07"), text = "A",
+ tooltip = "Switched to a new UDF")
})
output$traffic_bysearch_dygraph <- renderDygraph({
diff --git a/tab_documentation/traffic_summary.md
b/tab_documentation/traffic_summary.md
index 3954f81..ffbad33 100644
--- a/tab_documentation/traffic_summary.md
+++ b/tab_documentation/traffic_summary.md
@@ -9,9 +9,9 @@
General trends
------
-Outages and inaccuracies
+Outages and notes
------
-None so far!
+- **A**: We switched to a finalized version of the UDF that extracts internal
traffic (see [T130083](https://phabricator.wikimedia.org/T130083))
Questions, bug reports, and feature suggestions
------
diff --git a/utils.R b/utils.R
index 3a801de..f4c0bce 100644
--- a/utils.R
+++ b/utils.R
@@ -1,38 +1,43 @@
library(polloi)
library(data.table)
-library(dplyr)
# Read in the traffic data
read_traffic <- function() {
# Read in the initial data.
- data <- polloi::read_dataset(path = "external_traffic/referer_data.tsv") %>%
- dplyr::rename(date = timestamp) %>%
- as.data.table
+ data <- polloi::read_dataset(path = "external_traffic/referer_data.tsv")
# Deduplicate
- data <- data[!duplicated(data[,1:(ncol(data) - 1), with=FALSE], fromLast =
TRUE)]
+ # data <- data[!duplicated(data[,1:(ncol(data) - 1), with=FALSE], fromLast =
TRUE)]
+ # Not sure what happened between 2016-02-04 and 2016-03-06 that caused the
pageviews to
+ # come out split.
# Format
data$is_search <- ifelse(data$is_search, "Referred by search", "Not referred
by search")
- data$search_engine[data$search_engine %in% c("none","None")] <- "Not
referred by search"
+ data$search_engine[data$search_engine == "none"] <- "Not referred by search"
+ data$referer_class[data$referer_class == "none"] <- "none (direct)"
+ data$referer_class[data$referer_class == "external (search engine)"] <-
"search engine"
+ data$referer_class[data$referer_class == "external"] <- "external but not
search engine"
+ data <- as.data.table(data)
# Write out the overall values for traffic
holding <- data[, j = list(pageviews = sum(pageviews)),
- by = c("date", "is_search", "access_method")]
+ by = c("date", "referer_class", "access_method")]
holding <- split(holding, f = holding$access_method)
- holding$all <- data[,j = list(pageviews = sum(pageviews)),
- by = c("date", "is_search")]
+ holding$total <- data[,j = list(pageviews = sum(pageviews)),
+ by = c("date", "referer_class")]
names(holding) <- c("Desktop", "Mobile Web", "All")
summary_traffic_data <<- lapply(holding, function(x){
- return(reshape2::dcast(x, formula = date ~ is_search, fun.aggregate = sum))
+ return(reshape2::dcast(x, formula = date ~ referer_class, fun.aggregate =
sum))
})
# Generate per-engine values
- holding <- data[, j = list(pageviews = sum(pageviews)),
+ holding <- data[which(data$referer_class == "search engine"),
+ j = list(pageviews = sum(pageviews)),
by = c("date", "search_engine", "access_method")]
holding <- split(holding, f = holding$access_method)
- holding$all <- data[, j = list(pageviews = sum(pageviews)),
+ holding$all <- data[which(data$referer_class == "search engine"),
+ j = list(pageviews = sum(pageviews)),
by = c("date", "search_engine")]
names(holding) <- c("Desktop", "Mobile Web", "All")
bysearch_traffic_data <<- lapply(holding, function(x){
--
To view, visit https://gerrit.wikimedia.org/r/281488
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: Ia5c3bf012acbda4120e12c4df14ed167ff3dd4f3
Gerrit-PatchSet: 3
Gerrit-Project: wikimedia/discovery/wonderbolt
Gerrit-Branch: master
Gerrit-Owner: Bearloga <[email protected]>
Gerrit-Reviewer: Bearloga <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits