Yurik has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/247779

Change subject: Added graphs varnish query
......................................................................

Added graphs varnish query

Implemented graph varnish hive query
to allow graph usage visualization.

Change-Id: I8f096dd778f3feddc939969cb2e7f9531eef7590
---
A graphs/graphs.R
1 file changed, 65 insertions(+), 0 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/wikimedia/discovery/golden 
refs/changes/79/247779/1

diff --git a/graphs/graphs.R b/graphs/graphs.R
new file mode 100644
index 0000000..e1aba90
--- /dev/null
+++ b/graphs/graphs.R
@@ -0,0 +1,65 @@
+# Per-file config:
+base_path <- "/a/aggregate-datasets/graphs/"
+
+source("common.R")
+
+## This script extracts Vagrant logs and processes them to summarize 
server-side graphs usage.
+# Specifically, it generates a dataset containing summaries (avg, median, 
percentiles) of:
+# - total graph requests
+# - number of unique pages with graphs that have been viewed
+
+main <- function(date = NULL) {
+
+  # Date handling
+  if(is.null(date)) {
+    date <- Sys.Date() - 1
+  }
+  subquery <- date_clause(date)
+
+  # Get the per-hash graph usage:
+  query <- paste0("SELECT project_class, project, hash, COUNT(1) AS n
+                   FROM (
+                     SELECT
+                       normalized_host.project_class AS project_class,
+                       normalized_host.project AS project,
+                       REGEXP_EXTRACT(uri_path, 
'^/api/rest_v1/page/graph/([^/]+)/([^/]+)/[^/]+/([^/]+)$', 3) AS hash
+                     FROM wmf.webrequest", subquery, "
+                       AND webrequest_source = 'text'
+                       AND http_status IN('200','304')
+                       AND uri_path RLIKE 
'^/api/rest_v1/page/graph/([^/]+)/([^/]+)/[^/]+/([^/]+)$'
+                   ) prepared
+                   GROUP BY project_class, project, hash;")
+
+  results <- query_hive(query)
+
+  # Summarise the per-hash results by aggregating across project_class and 
project:
+  results <- plyr::ddply(results, plyr::.(project_class, project), function(x) 
{
+    cbind(x[1, c('project_class' ,'project'), drop = FALSE],
+          graphs = length(x$n),
+          total = sum(x$n),
+          average = round(mean(x$n), 2),
+          median = ceiling(median(x$n)),
+          percentile95 = ceiling(quantile(x$n, 0.95)),
+          percentile99 = ceiling(quantile(x$n, 0.99)))
+  })
+  # Clean up those results:
+  results <- results[order(results$project_class, results$project), ]
+  results$date <- date
+  output <- results[, union('date', names(results))]
+  # Write out
+  conditional_write(output, file.path(base_path, "graph_aggregates.tsv"))
+
+}
+
+## Backfill:
+# earliest_ts <- query_hive("SELECT min(ts) AS earliest_timestamp
+#                            FROM wmf.webrequest
+#                            WHERE webrequest_source = 'text' AND year = 2015 
AND month = 10
+#                              AND http_status IN('200','304')
+#                              AND uri_path RLIKE 
'^/api/rest_v1/page/graph/([^/]+)/([^/]+)/[^/]+/([^/]+)$'")
+# lapply(seq(as.Date(earliest_ts[1, 1]),Sys.Date()-1, "day"), main)
+# ^ equivalent to: lapply(seq(as.Date("2015-08-11"), Sys.Date() - 1, "day"), 
main)
+
+# Run and kill
+main()
+q(save = "no")

-- 
To view, visit https://gerrit.wikimedia.org/r/247779
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I8f096dd778f3feddc939969cb2e7f9531eef7590
Gerrit-PatchSet: 1
Gerrit-Project: wikimedia/discovery/golden
Gerrit-Branch: master
Gerrit-Owner: Yurik <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to