Bearloga has submitted this change and it was merged.

Change subject: Include external traffic and fix remaining bug with dashboard automation
......................................................................


Include external traffic and fix remaining bug with dashboard automation

This includes the scripts used for calculating external referer traffic and
fixes a small (non-fatal) bug with the formatting of run.R

Bug: T116295
Change-Id: I303742abe9c45c56383f95b14523a6a179574893
---
M config.R
A external/search_referers.R
M run.R
3 files changed, 52 insertions(+), 4 deletions(-)

Approvals:
  Bearloga: Verified; Looks good to me, approved



diff --git a/config.R b/config.R
index 1b50863..79764f2 100644
--- a/config.R
+++ b/config.R
@@ -3,7 +3,7 @@
 
 # Core paths
 write_root <- "/a/aggregate-datasets/"
-dirs <- c("maps","wdqs", "search")
+dirs <- c("maps","wdqs", "search", "external_traffic")
 write_dirs <- paste0(write_root, dirs)
 
 # Dependencies
diff --git a/external/search_referers.R b/external/search_referers.R
new file mode 100644
index 0000000..f3d30f5
--- /dev/null
+++ b/external/search_referers.R
@@ -0,0 +1,47 @@
+# Per-file config:
+base_path <- paste0(write_root, "external_traffic/")
+check_dir(base_path)
+
+main <- function(date = NULL){
+  
+  # Date handling
+  if(is.null(date)){
+    date <- Sys.Date() - 1
+  }
+  
+  # Date subquery
+  subquery <- date_clause(date)
+  
+  # Write query and run it
+  query <- paste0("ADD JAR /home/ironholds/refinery-hive-0.0.21-SNAPSHOT.jar;
+                   CREATE TEMPORARY FUNCTION is_external_search AS
+                  'org.wikimedia.analytics.refinery.hive.IsExternalSearchUDF';
+                   CREATE TEMPORARY FUNCTION classify_referer AS
+                  'org.wikimedia.analytics.refinery.hive.RefererClassifyUDF';
+                   CREATE TEMPORARY FUNCTION get_engine AS
+                  'org.wikimedia.analytics.refinery.hive.IdentifySearchEngineUDF';
+                   USE wmf;
+                   SELECT year, month, day,
+                   is_external_search(referer) AS is_search,
+                   classify_referer(referer) AS referer_class,
+                   get_engine(referer) as search_engine,
+                   access_method,
+                   COUNT(*) AS pageviews
+                   FROM webrequest
+                  ", subquery,
+                  "AND webrequest_source IN('text','mobile') AND is_pageview = true
+                  AND access_method IN('desktop','mobile web')
+                  GROUP BY year, month, day, is_external_search(referer), classify_referer(referer),
+                  get_engine(referer), access_method;")
+  results <- query_hive(query)
+  
+  # Sanitise the resulting data
+  results <- results[!is.na(results$month),]
+  results$timestamp <- as.Date(paste(results$year, results$month, results$day, sep = "-"))
+  results <- results[, c("timestamp", "is_search", "referer_class", "search_engine", "access_method","pageviews")]
+  results$is_search <- ifelse(results$is_search == "true", TRUE, FALSE)
+  
+  # Write out
+  conditional_write(results, file.path(base_path, "referer_data.tsv"))
+}
+
diff --git a/run.R b/run.R
index 5d8e930..7318b37 100644
--- a/run.R
+++ b/run.R
@@ -10,7 +10,7 @@
   
   # If the user has not provided dates, just run each file.
   if(!length(dates)){
-    lapply(source_files, function(x){
+    file_status <- unlist(lapply(source_files, function(x){
       tryCatch({
         source(x)
         check_dir(base_path)
@@ -18,9 +18,10 @@
       }, error = function(e){
         print(x)
         print(e$message)
+        return(FALSE)
       })
-
-    })
+      return(TRUE)
+    }))
   } else {
     # If the user has provided dates, we need to do more clever stuff.
     data_files <- list.files(write_dirs, full.names = TRUE, pattern = "\\.tsv$")

-- 
To view, visit https://gerrit.wikimedia.org/r/248869
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I303742abe9c45c56383f95b14523a6a179574893
Gerrit-PatchSet: 1
Gerrit-Project: wikimedia/discovery/golden
Gerrit-Branch: master
Gerrit-Owner: OliverKeyes <[email protected]>
Gerrit-Reviewer: Bearloga <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to