OliverKeyes has uploaded a new change for review.
https://gerrit.wikimedia.org/r/245978
Change subject: Switch the running of the R scripts /over/ to an R script.
......................................................................
Switch the running of the R scripts /over/ to an R script.
This switches the running of the R scripts used for data collection
over to a single R script, so that the shell script only needs to call
one .R file. It means we won't have to append to the .sh when adding new
retrieval scripts - and, more importantly, the internal logic is a
prerequisite for automated backfilling.
Bug: T114919
Change-Id: If7b29f7e2fd5ca9112d3901ac8d71ae57b34e96e
---
M common.R
M config.R
M main.sh
M maps/tiles.R
M maps/usage.R
A run.R
M search/LDN.R
M search/api.R
M search/app.R
M search/desktop.R
M search/dwelltime.R
M search/mobile.R
M wdqs/basic_usage.R
13 files changed, 52 insertions(+), 81 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/wikimedia/discovery/golden
refs/changes/78/245978/1
diff --git a/common.R b/common.R
index bdbdfcb..202ec46 100644
--- a/common.R
+++ b/common.R
@@ -1,9 +1,12 @@
source("config.R")
-# Dependencies
-library(lubridate)
-library(olivr)
-suppressPackageStartupMessages(library(data.table))
+# Directory creation function
+check_dir <- function(dir){
+ if(!file.exists(dir)) {
+ dir.create(path = dir)
+ }
+ return(invisible())
+}
# Query building function
query_func <- function(fields, table, ts_field, date = NULL, conditionals =
NULL){
diff --git a/config.R b/config.R
index f1b524e..abd7df5 100644
--- a/config.R
+++ b/config.R
@@ -1,9 +1,16 @@
-# Config variables and setup:
+# Config variables and setup
options(scipen = 500, q = "no")
-# base_path : This is set on a per-script level
-# (before sourcing common.R)
+# Base paths
+dirs <- c("maps","wdqs", "search")
+write_dirs <- paste0("/a/aggregate-datasets/", dirs)
-if(!file.exists(base_path)) {
- dir.create(path = base_path)
-}
+# Dependencies
+library(lubridate)
+library(olivr)
+suppressPackageStartupMessages(library(data.table))
+library(readr)
+library(ortiz)
+library(plyr)
+library(magrittr)
+library(survival)
diff --git a/main.sh b/main.sh
index 54ec90b..b1511e9 100644
--- a/main.sh
+++ b/main.sh
@@ -1,11 +1,3 @@
-R CMD BATCH ./search/desktop.R
-R CMD BATCH ./search/mobile.R
-R CMD BATCH ./search/app.R
-R CMD BATCH ./search/api.R
-R CMD BATCH ./search/dwelltime.R
-R CMD BATCH ./search/LDN.R
-R CMD BATCH ./wdqs/basic_usage.R
-R CMD BATCH ./maps/usage.R
-R CMD BATCH ./maps/tiles.R
+R CMD BATCH run.R
python ./search/core.py
rm -rf .RData
diff --git a/maps/tiles.R b/maps/tiles.R
index c07081d..1b6c6aa 100644
--- a/maps/tiles.R
+++ b/maps/tiles.R
@@ -1,8 +1,6 @@
# Per-file config:
base_path <- "/a/aggregate-datasets/maps/"
-source("common.R")
-
## This script extracts Vagrant logs and processes them to summarize
server-side maps usage.
# Specifically, it generates a dataset containing summaries (avg, median,
percentiles) of:
# - total tile requests
@@ -69,7 +67,3 @@
# AND uri_path RLIKE
'^/([^/]+)/([0-9]{1,2})/(-?[0-9]+)/(-?[0-9]+)(@([0-9]\\.?[0-9]?)x)?\\.([a-z]+)$'")
# lapply(seq(as.Date(earliest_ts[1, 1]),Sys.Date()-1, "day"), main)
# ^ equivalent to: lapply(seq(as.Date("2015-08-28"), Sys.Date() - 1, "day"),
main)
-
-# Run and kill
-main()
-q(save = "no")
diff --git a/maps/usage.R b/maps/usage.R
index 408f743..da0056e 100644
--- a/maps/usage.R
+++ b/maps/usage.R
@@ -1,7 +1,5 @@
base_path <- "/a/aggregate-datasets/maps/"
-source("common.R")
-
# Gathers very basic data for Maps.
main <- function(date = NULL, table = "GeoFeatures_12914994"){
@@ -23,10 +21,6 @@
return(invisible())
}
-
-# Run and kill
-main()
-q(save = "no")
# Good data starts on 20150804, so for backfilling...
# lapply(seq(as.Date("2015-08-04"),Sys.Date()-1, "day"), main)
diff --git a/run.R b/run.R
new file mode 100644
index 0000000..742da9f
--- /dev/null
+++ b/run.R
@@ -0,0 +1,25 @@
+source("config.R")
+source("common.R")
+
+# Central function
+run <- function(dates = NULL){
+
+ # List out source files
+ source_files <- list.files(dirs, full.names = TRUE, pattern = "\\.R")
+
+ # Read them in
+ # source_text <- lapply(source_files, readLines, encoding = "UTF-8")
+
+ # If the user has not provided dates, just run each file.
+ if(!length(dates)){
+ lapply(source_files, function(x){
+ source(x)
+ main()
+ })
+ }
+
+ return(invisible())
+}
+
+run()
+q()
diff --git a/search/LDN.R b/search/LDN.R
index de1352d..7f1b4f8 100644
--- a/search/LDN.R
+++ b/search/LDN.R
@@ -3,12 +3,6 @@
base_path <- "/a/aggregate-datasets/search/"
-source("common.R")
-
-library(plyr)
-library(magrittr)
-library(survival)
-
main <- function(date = NULL, table = "TestSearchSatisfaction2_13223897") {
checkins <- c(0, 10, 20, 30, 40, 50, 60, 90, 120, 150, 180, 210, 240, 300,
360, 420)
@@ -52,8 +46,5 @@
conditional_write(output, file.path(base_path, "sample_page_visit_ld.tsv"))
}
-
-main()
-q(save = "no")
# backfill: lapply(seq(as.Date("2015-09-02"),Sys.Date() - 1, "day"), main)
diff --git a/search/api.R b/search/api.R
index 8050abd..c16671e 100644
--- a/search/api.R
+++ b/search/api.R
@@ -1,8 +1,6 @@
# Per-file config:
base_path <- "/a/aggregate-datasets/search/"
-source("common.R")
-
# Central function
main <- function(date = NULL){
@@ -38,7 +36,3 @@
# Write out
conditional_write(output, file.path(base_path, "search_api_aggregates.tsv"))
}
-
-#Run and kill
-main()
-q(save = "no")
diff --git a/search/app.R b/search/app.R
index 4a721a2..d617b3c 100644
--- a/search/app.R
+++ b/search/app.R
@@ -1,8 +1,6 @@
# Per-file config:
base_path <- "/a/aggregate-datasets/search/"
-source("common.R")
-
# Retrieves data for the mobile web stuff we care about, drops it in the
aggregate-datasets directory. Should be run on stat1002, /not/ on the datavis
machine.
main <- function(date = NULL, table = "MobileWikiAppSearch_10641988"){
@@ -43,7 +41,3 @@
conditional_write(load_times, file.path(base_path, "app_load_times.tsv"))
}
-
-# Run and kill
-main()
-q(save = "no")
diff --git a/search/desktop.R b/search/desktop.R
index b3e69ab..b06c918 100644
--- a/search/desktop.R
+++ b/search/desktop.R
@@ -1,10 +1,8 @@
# Per-file config:
base_path <- "/a/aggregate-datasets/search/"
-source("common.R")
-
-# Retrieves data for the desktop stuff we care about, drops it in the
aggregate-datasets directory. Should be run on stat1002, /not/ on the datavis
machine.
-
+# Retrieves data for the desktop stuff we care about, drops it in the
aggregate-datasets directory.
+# Should be run on stat1002, /not/ on the datavis machine.
main <- function(date = NULL, table = "Search_12057910"){
# Get data and format
@@ -44,6 +42,3 @@
conditional_write(load_times, file.path(base_path, "desktop_load_times.tsv"))
return(invisible())
}
-
-main()
-q(save = "no")
diff --git a/search/dwelltime.R b/search/dwelltime.R
index 65d1c91..dbb3023 100644
--- a/search/dwelltime.R
+++ b/search/dwelltime.R
@@ -1,9 +1,5 @@
# Per-file config:
base_path <- "/a/aggregate-datasets/search/"
-library(ortiz)
-
-# Common dependencies
-source("common.R")
main <- function(date = NULL, table = "TestSearchSatisfaction2_13223897"){
@@ -29,7 +25,3 @@
threshold_pass = sum(dwell_data)/length(dwell_data))
conditional_write(output, file = file.path(base_path,
"search_threshold_pass_rate.tsv"))
}
-
-# Run and kill
-main()
-q(save = "no")
diff --git a/search/mobile.R b/search/mobile.R
index 52bd1ce..edecf87 100644
--- a/search/mobile.R
+++ b/search/mobile.R
@@ -1,10 +1,8 @@
# Per-file config:
base_path <- "/a/aggregate-datasets/search/"
-source("common.R")
-
-# Retrieves data for the mobile web stuff we care about, drops it in the
public-datasets directory. Should be run on stat1002, /not/ on the datavis
machine.
-
+# Retrieves data for the mobile web stuff we care about, drops it in the
public-datasets directory.
+# Should be run on stat1002, /not/ on the datavis machine.
main <- function(date = NULL, table = "MobileWebSearch_12054448"){
# Get data and format the timestamps
@@ -44,8 +42,5 @@
conditional_write(load_times, file.path(base_path, "mobile_load_times.tsv"))
return(invisible())
}
-
-main()
-q(save = "no")
# dates <- seq(as.Date("2015-06-11"), as.Date("2015-06-17"), by = "date")
diff --git a/wdqs/basic_usage.R b/wdqs/basic_usage.R
index c5af4d4..fccf6ec 100644
--- a/wdqs/basic_usage.R
+++ b/wdqs/basic_usage.R
@@ -1,9 +1,8 @@
# Per-file config:
base_path <- "/a/aggregate-datasets/wdqs/"
-source("common.R")
-
-# Retrieves data for the WDQS stuff we care about, drops it in the
aggregate-datasets directory. Should be run on stat1002, /not/ on the datavis
machine.
+# Retrieves data for the WDQS stuff we care about, drops it in the
aggregate-datasets directory.
+# Should be run on stat1002, /not/ on the datavis machine.
# Create a script that would produce raw data on usage of
# - query.wikidata.org
@@ -46,7 +45,3 @@
# backlog <- function(days) {
# for (i in days:1) try(main(Sys.Date() - i), silent = TRUE)
# }; backlog(20)
-
-# Run and kill
-main()
-q(save = "no")
--
To view, visit https://gerrit.wikimedia.org/r/245978
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: If7b29f7e2fd5ca9112d3901ac8d71ae57b34e96e
Gerrit-PatchSet: 1
Gerrit-Project: wikimedia/discovery/golden
Gerrit-Branch: master
Gerrit-Owner: OliverKeyes <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits