to an R script. - change (wikimedia...golden)

OliverKeyes (Code Review) Tue, 13 Oct 2015 12:38:53 -0700

OliverKeyes has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/245978


Change subject: Switch the running of the R scripts /over/ to an R script.
......................................................................

Switch the running of the R scripts /over/ to an R script.

This switches the running of the R scripts used for data collection
over to a single R script (run.R), so that the shell script just needs
to call one .R file. It means we won't have to append to the .sh when
adding new retrieval scripts - and, more importantly, the internal logic
is a prerequisite for automated backfilling.

Bug: T114919

Change-Id: If7b29f7e2fd5ca9112d3901ac8d71ae57b34e96e
---
M common.R
M config.R
M main.sh
M maps/tiles.R
M maps/usage.R
A run.R
M search/LDN.R
M search/api.R
M search/app.R
M search/desktop.R
M search/dwelltime.R
M search/mobile.R
M wdqs/basic_usage.R
13 files changed, 52 insertions(+), 81 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/wikimedia/discovery/golden 
refs/changes/78/245978/1

diff --git a/common.R b/common.R
index bdbdfcb..202ec46 100644
--- a/common.R
+++ b/common.R
@@ -1,9 +1,12 @@
 source("config.R")
 
-# Dependencies
-library(lubridate)
-library(olivr)
-suppressPackageStartupMessages(library(data.table))
+# Directory creation function
+check_dir <- function(dir){
+  if(!file.exists(dir)) {
+    dir.create(path = dir)
+  }
+  return(invisible())
+}
 
 # Query building function
 query_func <- function(fields, table, ts_field, date = NULL, conditionals = 
NULL){
diff --git a/config.R b/config.R
index f1b524e..abd7df5 100644
--- a/config.R
+++ b/config.R
@@ -1,9 +1,16 @@
-# Config variables and setup:
+# Config variables and setup
 options(scipen = 500, q = "no")
 
-# base_path : This is set on a per-script level
-#               (before sourcing common.R)
+# Base paths
+dirs <- c("maps","wdqs", "search")
+write_dirs <- paste0("/a/aggregate-datasets/", dirs)
 
-if(!file.exists(base_path)) {
-  dir.create(path = base_path)
-}
+# Dependencies
+library(lubridate)
+library(olivr)
+suppressPackageStartupMessages(library(data.table))
+library(readr)
+library(ortiz)
+library(plyr)
+library(magrittr)
+library(survival)
diff --git a/main.sh b/main.sh
index 54ec90b..b1511e9 100644
--- a/main.sh
+++ b/main.sh
@@ -1,11 +1,3 @@
-R CMD BATCH ./search/desktop.R
-R CMD BATCH ./search/mobile.R
-R CMD BATCH ./search/app.R
-R CMD BATCH ./search/api.R
-R CMD BATCH ./search/dwelltime.R
-R CMD BATCH ./search/LDN.R
-R CMD BATCH ./wdqs/basic_usage.R
-R CMD BATCH ./maps/usage.R
-R CMD BATCH ./maps/tiles.R
+R CMD BATCH run.R
 python ./search/core.py
 rm -rf .RData
diff --git a/maps/tiles.R b/maps/tiles.R
index c07081d..1b6c6aa 100644
--- a/maps/tiles.R
+++ b/maps/tiles.R
@@ -1,8 +1,6 @@
 # Per-file config:
 base_path <- "/a/aggregate-datasets/maps/"
 
-source("common.R")
-
 ## This script extracts Vagrant logs and processes them to summarize 
server-side maps usage.
 # Specifically, it generates a dataset containing summaries (avg, median, 
percentiles) of:
 # - total tile requests
@@ -69,7 +67,3 @@
 #                              AND uri_path RLIKE 
'^/([^/]+)/([0-9]{1,2})/(-?[0-9]+)/(-?[0-9]+)(@([0-9]\\.?[0-9]?)x)?\\.([a-z]+)$'")
 # lapply(seq(as.Date(earliest_ts[1, 1]),Sys.Date()-1, "day"), main)
 # ^ equivalent to: lapply(seq(as.Date("2015-08-28"), Sys.Date() - 1, "day"), 
main)
-
-# Run and kill
-main()
-q(save = "no")
diff --git a/maps/usage.R b/maps/usage.R
index 408f743..da0056e 100644
--- a/maps/usage.R
+++ b/maps/usage.R
@@ -1,7 +1,5 @@
 base_path <- "/a/aggregate-datasets/maps/"
 
-source("common.R")
-
 # Gathers very basic data for Maps.
 main <- function(date = NULL, table = "GeoFeatures_12914994"){
   
@@ -23,10 +21,6 @@
   
   return(invisible())
 }
-
-# Run and kill
-main()
-q(save = "no")
 
 # Good data starts on 20150804, so for backfilling...
 # lapply(seq(as.Date("2015-08-04"),Sys.Date()-1, "day"), main) 
diff --git a/run.R b/run.R
new file mode 100644
index 0000000..742da9f
--- /dev/null
+++ b/run.R
@@ -0,0 +1,25 @@
+source("config.R")
+source("common.R")
+
+# Central function
+run <- function(dates = NULL){
+  
+  # List out source files
+  source_files <- list.files(dirs, full.names = TRUE, pattern = "\\.R")
+  
+  # Read them in
+  # source_text <- lapply(source_files, readLines, encoding = "UTF-8")
+  
+  # If the user has not provided dates, just run each file.
+  if(!length(dates)){
+    lapply(source_files, function(x){
+      source(x)
+      main()
+    })
+  }
+  
+  return(invisible())
+}
+
+run()
+q()
diff --git a/search/LDN.R b/search/LDN.R
index de1352d..7f1b4f8 100644
--- a/search/LDN.R
+++ b/search/LDN.R
@@ -3,12 +3,6 @@
 
 base_path <- "/a/aggregate-datasets/search/"
 
-source("common.R")
-
-library(plyr)
-library(magrittr)
-library(survival)
-
 main <- function(date = NULL, table = "TestSearchSatisfaction2_13223897") {
   
   checkins <- c(0, 10, 20, 30, 40, 50, 60, 90, 120, 150, 180, 210, 240, 300, 
360, 420)
@@ -52,8 +46,5 @@
   conditional_write(output, file.path(base_path, "sample_page_visit_ld.tsv"))
   
 }
-
-main()
-q(save = "no")
 
 # backfill: lapply(seq(as.Date("2015-09-02"),Sys.Date() - 1, "day"), main)
diff --git a/search/api.R b/search/api.R
index 8050abd..c16671e 100644
--- a/search/api.R
+++ b/search/api.R
@@ -1,8 +1,6 @@
 # Per-file config:
 base_path <- "/a/aggregate-datasets/search/"
 
-source("common.R")
-
 # Central function
 main <- function(date = NULL){
 
@@ -38,7 +36,3 @@
   # Write out
   conditional_write(output, file.path(base_path, "search_api_aggregates.tsv"))
 }
-
-#Run and kill
-main()
-q(save = "no")
diff --git a/search/app.R b/search/app.R
index 4a721a2..d617b3c 100644
--- a/search/app.R
+++ b/search/app.R
@@ -1,8 +1,6 @@
 # Per-file config:
 base_path <- "/a/aggregate-datasets/search/"
 
-source("common.R")
-
 # Retrieves data for the mobile web stuff we care about, drops it in the 
aggregate-datasets directory. Should be run on stat1002, /not/ on the datavis 
machine.
 
 main <- function(date = NULL, table = "MobileWikiAppSearch_10641988"){
@@ -43,7 +41,3 @@
   conditional_write(load_times, file.path(base_path, "app_load_times.tsv"))
 
 }
-
-# Run and kill
-main()
-q(save = "no")
diff --git a/search/desktop.R b/search/desktop.R
index b3e69ab..b06c918 100644
--- a/search/desktop.R
+++ b/search/desktop.R
@@ -1,10 +1,8 @@
 # Per-file config:
 base_path <- "/a/aggregate-datasets/search/"
 
-source("common.R")
-
-# Retrieves data for the desktop stuff we care about, drops it in the 
aggregate-datasets directory. Should be run on stat1002, /not/ on the datavis 
machine.
-
+# Retrieves data for the desktop stuff we care about, drops it in the 
aggregate-datasets directory.
+# Should be run on stat1002, /not/ on the datavis machine.
 main <- function(date = NULL, table = "Search_12057910"){
   
   # Get data and format
@@ -44,6 +42,3 @@
   conditional_write(load_times, file.path(base_path, "desktop_load_times.tsv"))
   return(invisible())
 }
-
-main()
-q(save = "no")
diff --git a/search/dwelltime.R b/search/dwelltime.R
index 65d1c91..dbb3023 100644
--- a/search/dwelltime.R
+++ b/search/dwelltime.R
@@ -1,9 +1,5 @@
 # Per-file config:
 base_path <- "/a/aggregate-datasets/search/"
-library(ortiz)
-
-# Common dependencies
-source("common.R")
 
 main <- function(date = NULL, table = "TestSearchSatisfaction2_13223897"){
   
@@ -29,7 +25,3 @@
                        threshold_pass = sum(dwell_data)/length(dwell_data))
   conditional_write(output, file = file.path(base_path, 
"search_threshold_pass_rate.tsv"))
 }
-
-# Run and kill
-main()
-q(save = "no")
diff --git a/search/mobile.R b/search/mobile.R
index 52bd1ce..edecf87 100644
--- a/search/mobile.R
+++ b/search/mobile.R
@@ -1,10 +1,8 @@
 # Per-file config:
 base_path <- "/a/aggregate-datasets/search/"
 
-source("common.R")
-
-# Retrieves data for the mobile web stuff we care about, drops it in the 
public-datasets directory. Should be run on stat1002, /not/ on the datavis 
machine.
-
+# Retrieves data for the mobile web stuff we care about, drops it in the 
public-datasets directory.
+# Should be run on stat1002, /not/ on the datavis machine.
 main <- function(date = NULL, table = "MobileWebSearch_12054448"){
   
   # Get data and format the timestamps
@@ -44,8 +42,5 @@
   conditional_write(load_times, file.path(base_path, "mobile_load_times.tsv"))
   return(invisible())
 }
-
-main()
-q(save = "no")
 
 # dates <- seq(as.Date("2015-06-11"), as.Date("2015-06-17"), by = "date")
diff --git a/wdqs/basic_usage.R b/wdqs/basic_usage.R
index c5af4d4..fccf6ec 100644
--- a/wdqs/basic_usage.R
+++ b/wdqs/basic_usage.R
@@ -1,9 +1,8 @@
 # Per-file config:
 base_path <- "/a/aggregate-datasets/wdqs/"
 
-source("common.R")
-
-# Retrieves data for the WDQS stuff we care about, drops it in the 
aggregate-datasets directory. Should be run on stat1002, /not/ on the datavis 
machine.
+# Retrieves data for the WDQS stuff we care about, drops it in the 
aggregate-datasets directory.
+# Should be run on stat1002, /not/ on the datavis machine.
 
 # Create a script that would produce raw data on usage of
 # - query.wikidata.org
@@ -46,7 +45,3 @@
 # backlog <- function(days) {
 #   for (i in days:1) try(main(Sys.Date() - i), silent = TRUE)
 # }; backlog(20)
-
-# Run and kill
-main()
-q(save = "no")

-- 
To view, visit https://gerrit.wikimedia.org/r/245978
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: If7b29f7e2fd5ca9112d3901ac8d71ae57b34e96e
Gerrit-PatchSet: 1
Gerrit-Project: wikimedia/discovery/golden
Gerrit-Branch: master
Gerrit-Owner: OliverKeyes <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

[MediaWiki-commits] [Gerrit] Switch the running of the R scripts /over/ to an R script. - change (wikimedia...golden)

Reply via email to