to an R script. - change (wikimedia...golden)

Bearloga (Code Review) Fri, 16 Oct 2015 14:08:01 -0700

Bearloga has submitted this change and it was merged.

Change subject: Switch the running of the R scripts /over/ to an R script.
......................................................................



Switch the running of the R scripts /over/ to an R script.

This switches over running the R scripts used for data collection
so that the shell script just needs to call one .R file (run.R). It means
we won't have to append to the .sh when adding new retrieval scripts -
and, more importantly, the internal logic is a prerequisite for automated
backfilling.

Bug: T114919

Change-Id: If7b29f7e2fd5ca9112d3901ac8d71ae57b34e96e
---
M common.R
M config.R
M main.sh
M maps/tiles.R
M maps/usage.R
A run.R
M search/LDN.R
M search/api.R
M search/app.R
M search/desktop.R
M search/dwelltime.R
M search/mobile.R
M wdqs/basic_usage.R
13 files changed, 71 insertions(+), 92 deletions(-)

Approvals:
  Bearloga: Verified; Looks good to me, approved



diff --git a/common.R b/common.R
index bdbdfcb..202ec46 100644
--- a/common.R
+++ b/common.R
@@ -1,9 +1,12 @@
 source("config.R")
 
-# Dependencies
-library(lubridate)
-library(olivr)
-suppressPackageStartupMessages(library(data.table))
+# Directory creation function
+check_dir <- function(dir){
+  if(!file.exists(dir)) {
+    dir.create(path = dir)
+  }
+  return(invisible())
+}
 
 # Query building function
 query_func <- function(fields, table, ts_field, date = NULL, conditionals = 
NULL){
diff --git a/config.R b/config.R
index f1b524e..1b50863 100644
--- a/config.R
+++ b/config.R
@@ -1,9 +1,17 @@
-# Config variables and setup:
-options(scipen = 500, q = "no")
+# Config variables and setup
+options(scipen = 500, save = "no")
 
-# base_path : This is set on a per-script level
-#               (before sourcing common.R)
+# Core paths
+write_root <- "/a/aggregate-datasets/"
+dirs <- c("maps","wdqs", "search")
+write_dirs <- paste0(write_root, dirs)
 
-if(!file.exists(base_path)) {
-  dir.create(path = base_path)
-}
+# Dependencies
+library(lubridate)
+library(olivr)
+suppressPackageStartupMessages(library(data.table))
+library(readr)
+library(ortiz)
+library(plyr)
+library(magrittr)
+library(survival)
diff --git a/main.sh b/main.sh
index 54ec90b..b1511e9 100644
--- a/main.sh
+++ b/main.sh
@@ -1,11 +1,3 @@
-R CMD BATCH ./search/desktop.R
-R CMD BATCH ./search/mobile.R
-R CMD BATCH ./search/app.R
-R CMD BATCH ./search/api.R
-R CMD BATCH ./search/dwelltime.R
-R CMD BATCH ./search/LDN.R
-R CMD BATCH ./wdqs/basic_usage.R
-R CMD BATCH ./maps/usage.R
-R CMD BATCH ./maps/tiles.R
+R CMD BATCH run.R
 python ./search/core.py
 rm -rf .RData
diff --git a/maps/tiles.R b/maps/tiles.R
index c07081d..71542c4 100644
--- a/maps/tiles.R
+++ b/maps/tiles.R
@@ -1,7 +1,5 @@
 # Per-file config:
-base_path <- "/a/aggregate-datasets/maps/"
-
-source("common.R")
+base_path <- paste0(write_root, "maps/")
 
 ## This script extracts Vagrant logs and processes them to summarize 
server-side maps usage.
 # Specifically, it generates a dataset containing summaries (avg, median, 
percentiles) of:
@@ -10,7 +8,7 @@
 # - tile requests per style per zoom, e.g. "osm-z10", "osm-z11", ...
 
 main <- function(date = NULL) {
-
+  
   # Date handling
   if(is.null(date)) {
     date <- Sys.Date() - 1
@@ -69,7 +67,3 @@
 #                              AND uri_path RLIKE 
'^/([^/]+)/([0-9]{1,2})/(-?[0-9]+)/(-?[0-9]+)(@([0-9]\\.?[0-9]?)x)?\\.([a-z]+)$'")
 # lapply(seq(as.Date(earliest_ts[1, 1]),Sys.Date()-1, "day"), main)
 # ^ equivalent to: lapply(seq(as.Date("2015-08-28"), Sys.Date() - 1, "day"), 
main)
-
-# Run and kill
-main()
-q(save = "no")
diff --git a/maps/usage.R b/maps/usage.R
index 408f743..025f25a 100644
--- a/maps/usage.R
+++ b/maps/usage.R
@@ -1,6 +1,4 @@
-base_path <- "/a/aggregate-datasets/maps/"
-
-source("common.R")
+base_path <- paste0(write_root, "maps/")
 
 # Gathers very basic data for Maps.
 main <- function(date = NULL, table = "GeoFeatures_12914994"){
@@ -23,10 +21,6 @@
   
   return(invisible())
 }
-
-# Run and kill
-main()
-q(save = "no")
 
 # Good data starts on 20150804, so for backfilling...
 # lapply(seq(as.Date("2015-08-04"),Sys.Date()-1, "day"), main) 
diff --git a/run.R b/run.R
new file mode 100644
index 0000000..54ad785
--- /dev/null
+++ b/run.R
@@ -0,0 +1,32 @@
+source("config.R")
+source("common.R")
+
+# Central function
+run <- function(dates = NULL){
+  
+  # List out source files
+  source_files <- list.files(dirs, full.names = TRUE, pattern = "\\.R")
+  
+  # Read them in
+  # source_text <- lapply(source_files, readLines, encoding = "UTF-8")
+  
+  # If the user has not provided dates, just run each file.
+  if(!length(dates)){
+    lapply(source_files, function(x){
+      tryCatch({
+        source(x)
+        check_dir(base_path)
+        main()
+      }, error = function(e){
+        print(x)
+        stop(e)
+      })
+
+    })
+  }
+  
+  return(invisible())
+}
+
+run()
+q()
diff --git a/search/LDN.R b/search/LDN.R
index de1352d..2af7eeb 100644
--- a/search/LDN.R
+++ b/search/LDN.R
@@ -1,13 +1,7 @@
 ## Calculates the median lethal dose (LD50) and other.
 ## LD50 = the time point at which we have lost 50% of our users.
 
-base_path <- "/a/aggregate-datasets/search/"
-
-source("common.R")
-
-library(plyr)
-library(magrittr)
-library(survival)
+base_path <- paste0(write_root, "search/")
 
 main <- function(date = NULL, table = "TestSearchSatisfaction2_13223897") {
   
@@ -52,8 +46,5 @@
   conditional_write(output, file.path(base_path, "sample_page_visit_ld.tsv"))
   
 }
-
-main()
-q(save = "no")
 
 # backfill: lapply(seq(as.Date("2015-09-02"),Sys.Date() - 1, "day"), main)
diff --git a/search/api.R b/search/api.R
index 8050abd..5428f52 100644
--- a/search/api.R
+++ b/search/api.R
@@ -1,7 +1,5 @@
 # Per-file config:
-base_path <- "/a/aggregate-datasets/search/"
-
-source("common.R")
+base_path <- paste0(write_root, "search/")
 
 # Central function
 main <- function(date = NULL){
@@ -38,7 +36,3 @@
   # Write out
   conditional_write(output, file.path(base_path, "search_api_aggregates.tsv"))
 }
-
-#Run and kill
-main()
-q(save = "no")
diff --git a/search/app.R b/search/app.R
index 4a721a2..7e2790b 100644
--- a/search/app.R
+++ b/search/app.R
@@ -1,7 +1,5 @@
 # Per-file config:
-base_path <- "/a/aggregate-datasets/search/"
-
-source("common.R")
+base_path <- paste0(write_root, "search/")
 
 # Retrieves data for the mobile web stuff we care about, drops it in the 
aggregate-datasets directory. Should be run on stat1002, /not/ on the datavis 
machine.
 
@@ -43,7 +41,3 @@
   conditional_write(load_times, file.path(base_path, "app_load_times.tsv"))
 
 }
-
-# Run and kill
-main()
-q(save = "no")
diff --git a/search/desktop.R b/search/desktop.R
index b3e69ab..6c19d41 100644
--- a/search/desktop.R
+++ b/search/desktop.R
@@ -1,10 +1,8 @@
 # Per-file config:
-base_path <- "/a/aggregate-datasets/search/"
+base_path <- paste0(write_root, "search/")
 
-source("common.R")
-
-# Retrieves data for the desktop stuff we care about, drops it in the 
aggregate-datasets directory. Should be run on stat1002, /not/ on the datavis 
machine.
-
+# Retrieves data for the desktop stuff we care about, drops it in the 
aggregate-datasets directory.
+# Should be run on stat1002, /not/ on the datavis machine.
 main <- function(date = NULL, table = "Search_12057910"){
   
   # Get data and format
@@ -44,6 +42,3 @@
   conditional_write(load_times, file.path(base_path, "desktop_load_times.tsv"))
   return(invisible())
 }
-
-main()
-q(save = "no")
diff --git a/search/dwelltime.R b/search/dwelltime.R
index 65d1c91..103dd45 100644
--- a/search/dwelltime.R
+++ b/search/dwelltime.R
@@ -1,9 +1,5 @@
 # Per-file config:
-base_path <- "/a/aggregate-datasets/search/"
-library(ortiz)
-
-# Common dependencies
-source("common.R")
+base_path <- paste0(write_root, "search/")
 
 main <- function(date = NULL, table = "TestSearchSatisfaction2_13223897"){
   
@@ -29,7 +25,3 @@
                        threshold_pass = sum(dwell_data)/length(dwell_data))
   conditional_write(output, file = file.path(base_path, 
"search_threshold_pass_rate.tsv"))
 }
-
-# Run and kill
-main()
-q(save = "no")
diff --git a/search/mobile.R b/search/mobile.R
index 52bd1ce..737f2cb 100644
--- a/search/mobile.R
+++ b/search/mobile.R
@@ -1,10 +1,8 @@
 # Per-file config:
-base_path <- "/a/aggregate-datasets/search/"
+base_path <- paste0(write_root, "search/")
 
-source("common.R")
-
-# Retrieves data for the mobile web stuff we care about, drops it in the 
public-datasets directory. Should be run on stat1002, /not/ on the datavis 
machine.
-
+# Retrieves data for the mobile web stuff we care about, drops it in the 
public-datasets directory.
+# Should be run on stat1002, /not/ on the datavis machine.
 main <- function(date = NULL, table = "MobileWebSearch_12054448"){
   
   # Get data and format the timestamps
@@ -44,8 +42,5 @@
   conditional_write(load_times, file.path(base_path, "mobile_load_times.tsv"))
   return(invisible())
 }
-
-main()
-q(save = "no")
 
 # dates <- seq(as.Date("2015-06-11"), as.Date("2015-06-17"), by = "date")
diff --git a/wdqs/basic_usage.R b/wdqs/basic_usage.R
index c5af4d4..37bad31 100644
--- a/wdqs/basic_usage.R
+++ b/wdqs/basic_usage.R
@@ -1,9 +1,8 @@
 # Per-file config:
-base_path <- "/a/aggregate-datasets/wdqs/"
+base_path <- paste0(write_root, "wdqs/")
 
-source("common.R")
-
-# Retrieves data for the WDQS stuff we care about, drops it in the 
aggregate-datasets directory. Should be run on stat1002, /not/ on the datavis 
machine.
+# Retrieves data for the WDQS stuff we care about, drops it in the 
aggregate-datasets directory.
+# Should be run on stat1002, /not/ on the datavis machine.
 
 # Create a script that would produce raw data on usage of
 # - query.wikidata.org
@@ -46,7 +45,3 @@
 # backlog <- function(days) {
 #   for (i in days:1) try(main(Sys.Date() - i), silent = TRUE)
 # }; backlog(20)
-
-# Run and kill
-main()
-q(save = "no")

-- 
To view, visit https://gerrit.wikimedia.org/r/245978
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: If7b29f7e2fd5ca9112d3901ac8d71ae57b34e96e
Gerrit-PatchSet: 5
Gerrit-Project: wikimedia/discovery/golden
Gerrit-Branch: master
Gerrit-Owner: OliverKeyes <[email protected]>
Gerrit-Reviewer: Bearloga <[email protected]>
Gerrit-Reviewer: OliverKeyes <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

[MediaWiki-commits] [Gerrit] Switch the running of the R scripts /over/ to an R script. - change (wikimedia...golden)

Reply via email to