Bearloga has submitted this change and it was merged.
Change subject: Include external traffic and fix remaining bug with dashboard
automation
......................................................................
Include external traffic and fix remaining bug with dashboard automation
This includes the scripts used for calculating external referer traffic and
fixes a small (non-fatal) bug with the formatting of run.R
Bug: T116295
Change-Id: I303742abe9c45c56383f95b14523a6a179574893
---
M config.R
A external/search_referers.R
M run.R
3 files changed, 52 insertions(+), 4 deletions(-)
Approvals:
Bearloga: Verified; Looks good to me, approved
diff --git a/config.R b/config.R
index 1b50863..79764f2 100644
--- a/config.R
+++ b/config.R
@@ -3,7 +3,7 @@
# Core paths
write_root <- "/a/aggregate-datasets/"
-dirs <- c("maps","wdqs", "search")
+# Output subdirectories under write_root; one per dataset family.
+dirs <- c("maps", "wdqs", "search", "external_traffic")
write_dirs <- paste0(write_root, dirs)
# Dependencies
# Dependencies
diff --git a/external/search_referers.R b/external/search_referers.R
new file mode 100644
index 0000000..f3d30f5
--- /dev/null
+++ b/external/search_referers.R
@@ -0,0 +1,47 @@
+# Per-file config: all output for this script lives under external_traffic/.
+base_path <- paste0(write_root, "external_traffic/")
+check_dir(base_path)
+
+# Aggregate one day of webrequest pageviews by external-search referer class.
+#
+# date: a Date giving the day to aggregate; defaults to yesterday (the most
+#       recent complete day) when NULL.
+# Side effect: writes/appends the day's rows to referer_data.tsv via
+# conditional_write(); returns that call's value invisibly.
+main <- function(date = NULL){
+
+  # Default to yesterday's (complete) day of data
+  if(is.null(date)){
+    date <- Sys.Date() - 1
+  }
+
+  # WHERE-clause fragment restricting the query to that day's partitions
+  subquery <- date_clause(date)
+
+  # Build and run the Hive query; the refinery jar provides the referer UDFs
+  query <- paste0("ADD JAR /home/ironholds/refinery-hive-0.0.21-SNAPSHOT.jar;
+  CREATE TEMPORARY FUNCTION is_external_search AS
+  'org.wikimedia.analytics.refinery.hive.IsExternalSearchUDF';
+  CREATE TEMPORARY FUNCTION classify_referer AS
+  'org.wikimedia.analytics.refinery.hive.RefererClassifyUDF';
+  CREATE TEMPORARY FUNCTION get_engine AS
+  'org.wikimedia.analytics.refinery.hive.IdentifySearchEngineUDF';
+  USE wmf;
+  SELECT year, month, day,
+         is_external_search(referer) AS is_search,
+         classify_referer(referer) AS referer_class,
+         get_engine(referer) as search_engine,
+         access_method,
+         COUNT(*) AS pageviews
+  FROM webrequest
+  ", subquery,
+  "AND webrequest_source IN('text','mobile') AND is_pageview = true
+   AND access_method IN('desktop','mobile web')
+   GROUP BY year, month, day, is_external_search(referer), classify_referer(referer),
+            get_engine(referer), access_method;")
+  results <- query_hive(query)
+
+  # Sanitise the resulting data: drop the non-data rows Hive emits (they have
+  # NA in the partition columns), then assemble a proper date column
+  results <- results[!is.na(results$month), ]
+  results$timestamp <- as.Date(paste(results$year, results$month, results$day,
+                                     sep = "-"))
+  results <- results[, c("timestamp", "is_search", "referer_class",
+                         "search_engine", "access_method", "pageviews")]
+  # Hive returns booleans as the strings "true"/"false"; a direct comparison
+  # replaces the redundant ifelse(x == "true", TRUE, FALSE) and behaves
+  # identically (including NA propagation)
+  results$is_search <- results$is_search == "true"
+
+  # Write out
+  conditional_write(results, file.path(base_path, "referer_data.tsv"))
+}
diff --git a/run.R b/run.R
index 5d8e930..7318b37 100644
--- a/run.R
+++ b/run.R
@@ -10,7 +10,7 @@
# If the user has not provided dates, just run each file.
if(!length(dates)){
- lapply(source_files, function(x){
+ file_status <- unlist(lapply(source_files, function(x){
tryCatch({
source(x)
check_dir(base_path)
@@ -18,9 +18,10 @@
}, error = function(e){
print(x)
print(e$message)
+    return(FALSE)
})
-
- })
+ # NOTE(review): return(FALSE) above only returns from the error *handler*,
+ # so tryCatch's FALSE result is discarded and the return(TRUE) below still
+ # executes -- file_status ends up TRUE even when source(x) failed. To record
+ # failures, return the value of tryCatch itself, e.g.
+ #   tryCatch({ source(x); check_dir(base_path); TRUE },
+ #            error = function(e){ print(x); print(e$message); FALSE })
+ return(TRUE)
+ }))
} else {
# If the user has provided dates, we need to do more clever stuff.
data_files <- list.files(write_dirs, full.names = TRUE, pattern =
"\\.tsv$")
--
To view, visit https://gerrit.wikimedia.org/r/248869
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I303742abe9c45c56383f95b14523a6a179574893
Gerrit-PatchSet: 1
Gerrit-Project: wikimedia/discovery/golden
Gerrit-Branch: master
Gerrit-Owner: OliverKeyes <[email protected]>
Gerrit-Reviewer: Bearloga <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits