Yurik has uploaded a new change for review.
https://gerrit.wikimedia.org/r/247779
Change subject: Added graphs varnish query
......................................................................
Added graphs varnish query
Implemented graph varnish hive query
to allow graph usage visualization.
Change-Id: I8f096dd778f3feddc939969cb2e7f9531eef7590
---
A graphs/graphs.R
1 file changed, 65 insertions(+), 0 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/wikimedia/discovery/golden
refs/changes/79/247779/1
diff --git a/graphs/graphs.R b/graphs/graphs.R
new file mode 100644
index 0000000..e1aba90
--- /dev/null
+++ b/graphs/graphs.R
@@ -0,0 +1,65 @@
+# Per-file config:
+base_path <- "/a/aggregate-datasets/graphs/"
+
+source("common.R")
+
+## This script extracts Vagrant logs and processes them to summarize
server-side graphs usage.
+# Specifically, it generates a dataset containing summaries (avg, median,
percentiles) of:
+# - total graph requests
+# - number of unique pages with graphs that have been viewed
+
+main <- function(date = NULL) {
+
+ # Date handling
+ if(is.null(date)) {
+ date <- Sys.Date() - 1
+ }
+ subquery <- date_clause(date)
+
+ # Get the per-hash graph usage:
+ query <- paste0("SELECT project_class, project, hash, COUNT(1) AS n
+ FROM (
+ SELECT
+ normalized_host.project_class AS project_class,
+ normalized_host.project AS project,
+ REGEXP_EXTRACT(uri_path,
'^/api/rest_v1/page/graph/([^/]+)/([^/]+)/[^/]+/([^/]+)$', 3) AS hash
+ FROM wmf.webrequest", subquery, "
+ AND webrequest_source = 'text'
+ AND http_status IN('200','304')
+ AND uri_path RLIKE
'^/api/rest_v1/page/graph/([^/]+)/([^/]+)/[^/]+/([^/]+)$'
+ ) prepared
+ GROUP BY project_class, project, hash;")
+
+ results <- query_hive(query)
+
+ # Summarise the per-hash results by aggregating across project_class and
project:
+ results <- plyr::ddply(results, plyr::.(project_class, project), function(x)
{
+ cbind(x[1, c('project_class' ,'project'), drop = FALSE],
+ graphs = length(x$n),
+ total = sum(x$n),
+ average = round(mean(x$n), 2),
+ median = ceiling(median(x$n)),
+ percentile95 = ceiling(quantile(x$n, 0.95)),
+ percentile99 = ceiling(quantile(x$n, 0.99)))
+ })
+ # Clean up those results:
+ results <- results[order(results$project_class, results$project), ]
+ results$date <- date
+ output <- results[, union('date', names(results))]
+ # Write out
+ conditional_write(output, file.path(base_path, "graph_aggregates.tsv"))
+
+}
+
+## Backfill:
+# earliest_ts <- query_hive("SELECT min(ts) AS earliest_timestamp
+# FROM wmf.webrequest
+# WHERE webrequest_source = 'text' AND year = 2015
AND month = 10
+# AND http_status IN('200','304')
+# AND uri_path RLIKE
'^/api/rest_v1/page/graph/([^/]+)/([^/]+)/[^/]+/([^/]+)$'")
+# lapply(seq(as.Date(earliest_ts[1, 1]),Sys.Date()-1, "day"), main)
+# ^ equivalent to: lapply(seq(as.Date("2015-08-11"), Sys.Date() - 1, "day"),
main)
+
+# Run and kill
+main()
+q(save = "no")
--
To view, visit https://gerrit.wikimedia.org/r/247779
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I8f096dd778f3feddc939969cb2e7f9531eef7590
Gerrit-PatchSet: 1
Gerrit-Project: wikimedia/discovery/golden
Gerrit-Branch: master
Gerrit-Owner: Yurik <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits