Bearloga has submitted this change and it was merged.
Change subject: Switch the running of the R scripts /over/ to an R script.
......................................................................
Switch the running of the R scripts /over/ to an R script.
This switches over running the R scripts used for data collection
so that the shell script just needs to call one .R file. It means
we won't have to append to the .sh when adding new retrieval scripts -
and, more importantly, the internal logic is a prerequisite for automated
backfilling.
Bug: T114919
Change-Id: If7b29f7e2fd5ca9112d3901ac8d71ae57b34e96e
---
M common.R
M config.R
M main.sh
M maps/tiles.R
M maps/usage.R
A run.R
M search/LDN.R
M search/api.R
M search/app.R
M search/desktop.R
M search/dwelltime.R
M search/mobile.R
M wdqs/basic_usage.R
13 files changed, 71 insertions(+), 92 deletions(-)
Approvals:
Bearloga: Verified; Looks good to me, approved
diff --git a/common.R b/common.R
index bdbdfcb..202ec46 100644
--- a/common.R
+++ b/common.R
@@ -1,9 +1,12 @@
source("config.R")
-# Dependencies
-library(lubridate)
-library(olivr)
-suppressPackageStartupMessages(library(data.table))
+# Directory creation function
+check_dir <- function(dir){
+ if(!file.exists(dir)) {
+ dir.create(path = dir)
+ }
+ return(invisible())
+}
# Query building function
query_func <- function(fields, table, ts_field, date = NULL, conditionals =
NULL){
diff --git a/config.R b/config.R
index f1b524e..1b50863 100644
--- a/config.R
+++ b/config.R
@@ -1,9 +1,17 @@
-# Config variables and setup:
-options(scipen = 500, q = "no")
+# Config variables and setup
+options(scipen = 500, save = "no")
-# base_path : This is set on a per-script level
-# (before sourcing common.R)
+# Core paths
+write_root <- "/a/aggregate-datasets/"
+dirs <- c("maps","wdqs", "search")
+write_dirs <- paste0(write_root, dirs)
-if(!file.exists(base_path)) {
- dir.create(path = base_path)
-}
+# Dependencies
+library(lubridate)
+library(olivr)
+suppressPackageStartupMessages(library(data.table))
+library(readr)
+library(ortiz)
+library(plyr)
+library(magrittr)
+library(survival)
diff --git a/main.sh b/main.sh
index 54ec90b..b1511e9 100644
--- a/main.sh
+++ b/main.sh
@@ -1,11 +1,3 @@
-R CMD BATCH ./search/desktop.R
-R CMD BATCH ./search/mobile.R
-R CMD BATCH ./search/app.R
-R CMD BATCH ./search/api.R
-R CMD BATCH ./search/dwelltime.R
-R CMD BATCH ./search/LDN.R
-R CMD BATCH ./wdqs/basic_usage.R
-R CMD BATCH ./maps/usage.R
-R CMD BATCH ./maps/tiles.R
+R CMD BATCH run.R
python ./search/core.py
rm -rf .RData
diff --git a/maps/tiles.R b/maps/tiles.R
index c07081d..71542c4 100644
--- a/maps/tiles.R
+++ b/maps/tiles.R
@@ -1,7 +1,5 @@
# Per-file config:
-base_path <- "/a/aggregate-datasets/maps/"
-
-source("common.R")
+base_path <- paste0(write_root, "maps/")
## This script extracts Vagrant logs and processes them to summarize
server-side maps usage.
# Specifically, it generates a dataset containing summaries (avg, median,
percentiles) of:
@@ -10,7 +8,7 @@
# - tile requests per style per zoom, e.g. "osm-z10", "osm-z11", ...
main <- function(date = NULL) {
-
+
# Date handling
if(is.null(date)) {
date <- Sys.Date() - 1
@@ -69,7 +67,3 @@
# AND uri_path RLIKE
'^/([^/]+)/([0-9]{1,2})/(-?[0-9]+)/(-?[0-9]+)(@([0-9]\\.?[0-9]?)x)?\\.([a-z]+)$'")
# lapply(seq(as.Date(earliest_ts[1, 1]),Sys.Date()-1, "day"), main)
# ^ equivalent to: lapply(seq(as.Date("2015-08-28"), Sys.Date() - 1, "day"),
main)
-
-# Run and kill
-main()
-q(save = "no")
diff --git a/maps/usage.R b/maps/usage.R
index 408f743..025f25a 100644
--- a/maps/usage.R
+++ b/maps/usage.R
@@ -1,6 +1,4 @@
-base_path <- "/a/aggregate-datasets/maps/"
-
-source("common.R")
+base_path <- paste0(write_root, "maps/")
# Gathers very basic data for Maps.
main <- function(date = NULL, table = "GeoFeatures_12914994"){
@@ -23,10 +21,6 @@
return(invisible())
}
-
-# Run and kill
-main()
-q(save = "no")
# Good data starts on 20150804, so for backfilling...
# lapply(seq(as.Date("2015-08-04"),Sys.Date()-1, "day"), main)
diff --git a/run.R b/run.R
new file mode 100644
index 0000000..54ad785
--- /dev/null
+++ b/run.R
@@ -0,0 +1,32 @@
+source("config.R")
+source("common.R")
+
+# Central function
+run <- function(dates = NULL){
+
+ # List out source files
+ source_files <- list.files(dirs, full.names = TRUE, pattern = "\\.R")
+
+ # Read them in
+ # source_text <- lapply(source_files, readLines, encoding = "UTF-8")
+
+ # If the user has not provided dates, just run each file.
+ if(!length(dates)){
+ lapply(source_files, function(x){
+ tryCatch({
+ source(x)
+ check_dir(base_path)
+ main()
+ }, error = function(e){
+ print(x)
+ stop(e)
+ })
+
+ })
+ }
+
+ return(invisible())
+}
+
+run()
+q()
diff --git a/search/LDN.R b/search/LDN.R
index de1352d..2af7eeb 100644
--- a/search/LDN.R
+++ b/search/LDN.R
@@ -1,13 +1,7 @@
## Calculates the median lethal dose (LD50) and other.
## LD50 = the time point at which we have lost 50% of our users.
-base_path <- "/a/aggregate-datasets/search/"
-
-source("common.R")
-
-library(plyr)
-library(magrittr)
-library(survival)
+base_path <- paste0(write_root, "search/")
main <- function(date = NULL, table = "TestSearchSatisfaction2_13223897") {
@@ -52,8 +46,5 @@
conditional_write(output, file.path(base_path, "sample_page_visit_ld.tsv"))
}
-
-main()
-q(save = "no")
# backfill: lapply(seq(as.Date("2015-09-02"),Sys.Date() - 1, "day"), main)
diff --git a/search/api.R b/search/api.R
index 8050abd..5428f52 100644
--- a/search/api.R
+++ b/search/api.R
@@ -1,7 +1,5 @@
# Per-file config:
-base_path <- "/a/aggregate-datasets/search/"
-
-source("common.R")
+base_path <- paste0(write_root, "search/")
# Central function
main <- function(date = NULL){
@@ -38,7 +36,3 @@
# Write out
conditional_write(output, file.path(base_path, "search_api_aggregates.tsv"))
}
-
-#Run and kill
-main()
-q(save = "no")
diff --git a/search/app.R b/search/app.R
index 4a721a2..7e2790b 100644
--- a/search/app.R
+++ b/search/app.R
@@ -1,7 +1,5 @@
# Per-file config:
-base_path <- "/a/aggregate-datasets/search/"
-
-source("common.R")
+base_path <- paste0(write_root, "search/")
# Retrieves data for the mobile web stuff we care about, drops it in the
aggregate-datasets directory. Should be run on stat1002, /not/ on the datavis
machine.
@@ -43,7 +41,3 @@
conditional_write(load_times, file.path(base_path, "app_load_times.tsv"))
}
-
-# Run and kill
-main()
-q(save = "no")
diff --git a/search/desktop.R b/search/desktop.R
index b3e69ab..6c19d41 100644
--- a/search/desktop.R
+++ b/search/desktop.R
@@ -1,10 +1,8 @@
# Per-file config:
-base_path <- "/a/aggregate-datasets/search/"
+base_path <- paste0(write_root, "search/")
-source("common.R")
-
-# Retrieves data for the desktop stuff we care about, drops it in the
aggregate-datasets directory. Should be run on stat1002, /not/ on the datavis
machine.
-
+# Retrieves data for the desktop stuff we care about, drops it in the
aggregate-datasets directory.
+# Should be run on stat1002, /not/ on the datavis machine.
main <- function(date = NULL, table = "Search_12057910"){
# Get data and format
@@ -44,6 +42,3 @@
conditional_write(load_times, file.path(base_path, "desktop_load_times.tsv"))
return(invisible())
}
-
-main()
-q(save = "no")
diff --git a/search/dwelltime.R b/search/dwelltime.R
index 65d1c91..103dd45 100644
--- a/search/dwelltime.R
+++ b/search/dwelltime.R
@@ -1,9 +1,5 @@
# Per-file config:
-base_path <- "/a/aggregate-datasets/search/"
-library(ortiz)
-
-# Common dependencies
-source("common.R")
+base_path <- paste0(write_root, "search/")
main <- function(date = NULL, table = "TestSearchSatisfaction2_13223897"){
@@ -29,7 +25,3 @@
threshold_pass = sum(dwell_data)/length(dwell_data))
conditional_write(output, file = file.path(base_path,
"search_threshold_pass_rate.tsv"))
}
-
-# Run and kill
-main()
-q(save = "no")
diff --git a/search/mobile.R b/search/mobile.R
index 52bd1ce..737f2cb 100644
--- a/search/mobile.R
+++ b/search/mobile.R
@@ -1,10 +1,8 @@
# Per-file config:
-base_path <- "/a/aggregate-datasets/search/"
+base_path <- paste0(write_root, "search/")
-source("common.R")
-
-# Retrieves data for the mobile web stuff we care about, drops it in the
public-datasets directory. Should be run on stat1002, /not/ on the datavis
machine.
-
+# Retrieves data for the mobile web stuff we care about, drops it in the
public-datasets directory.
+# Should be run on stat1002, /not/ on the datavis machine.
main <- function(date = NULL, table = "MobileWebSearch_12054448"){
# Get data and format the timestamps
@@ -44,8 +42,5 @@
conditional_write(load_times, file.path(base_path, "mobile_load_times.tsv"))
return(invisible())
}
-
-main()
-q(save = "no")
# dates <- seq(as.Date("2015-06-11"), as.Date("2015-06-17"), by = "date")
diff --git a/wdqs/basic_usage.R b/wdqs/basic_usage.R
index c5af4d4..37bad31 100644
--- a/wdqs/basic_usage.R
+++ b/wdqs/basic_usage.R
@@ -1,9 +1,8 @@
# Per-file config:
-base_path <- "/a/aggregate-datasets/wdqs/"
+base_path <- paste0(write_root, "wdqs/")
-source("common.R")
-
-# Retrieves data for the WDQS stuff we care about, drops it in the
aggregate-datasets directory. Should be run on stat1002, /not/ on the datavis
machine.
+# Retrieves data for the WDQS stuff we care about, drops it in the
aggregate-datasets directory.
+# Should be run on stat1002, /not/ on the datavis machine.
# Create a script that would produce raw data on usage of
# - query.wikidata.org
@@ -46,7 +45,3 @@
# backlog <- function(days) {
# for (i in days:1) try(main(Sys.Date() - i), silent = TRUE)
# }; backlog(20)
-
-# Run and kill
-main()
-q(save = "no")
--
To view, visit https://gerrit.wikimedia.org/r/245978
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: If7b29f7e2fd5ca9112d3901ac8d71ae57b34e96e
Gerrit-PatchSet: 5
Gerrit-Project: wikimedia/discovery/golden
Gerrit-Branch: master
Gerrit-Owner: OliverKeyes <[email protected]>
Gerrit-Reviewer: Bearloga <[email protected]>
Gerrit-Reviewer: OliverKeyes <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits