OliverKeyes has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/260347

Change subject: Switch Maps data collection over to detect automata
......................................................................

Switch Maps data collection over to detect automata

What it says on the tin. Also moves some filtering to Hive
and makes some style corrections.

Bug: T121758
Change-Id: I878bbe2a78d2e144154a1a654cee795cb89f0f12
---
M maps/tiles.R
1 file changed, 9 insertions(+), 11 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/wikimedia/discovery/golden 
refs/changes/47/260347/1

diff --git a/maps/tiles.R b/maps/tiles.R
index b64fbc9..be12dba 100644
--- a/maps/tiles.R
+++ b/maps/tiles.R
@@ -16,7 +16,7 @@
   subquery <- date_clause(date)
 
   # Get the per-user tile usage:
-  query <- paste0("SELECT style, zoom, scale, format, user_id, cache, COUNT(1) 
AS n
+  query <- paste0("SELECT style, zoom, scale, format, user_id, cache, 
is_automata, COUNT(1) AS n
                    FROM (
                      SELECT
                        REGEXP_EXTRACT(uri_path, 
'^/([^/]+)/([0-9]{1,2})/(-?[0-9]+)/(-?[0-9]+)(@([0-9]\\.?[0-9]?)x)?\\.([a-z]+)$',
 1) AS style,
@@ -24,26 +24,24 @@
                        COALESCE(REGEXP_EXTRACT(uri_path, 
'^/([^/]+)/([0-9]{1,2})/(-?[0-9]+)/(-?[0-9]+)(@([0-9]\\.?[0-9]?)x)?\\.([a-z]+)$',
 6), '1') AS scale,
                        REGEXP_EXTRACT(uri_path, 
'^/([^/]+)/([0-9]{1,2})/(-?[0-9]+)/(-?[0-9]+)(@([0-9]\\.?[0-9]?)x)?\\.([a-z]+)$',
 7) AS format,
                        CONCAT(user_agent, client_ip) AS user_id,
-                       cache_status AS cache
+                       cache_status AS cache,
+                       CASE WHEN agent_type = 'spider' THEN 'TRUE' ELSE 
'FALSE' END AS is_automata
                      FROM wmf.webrequest", subquery, "
                        AND webrequest_source = 'maps'
                        AND http_status IN('200','304')
                        AND uri_path RLIKE 
'^/([^/]+)/([0-9]{1,2})/(-?[0-9]+)/(-?[0-9]+)(@([0-9]\\.?[0-9]?)x)?\\.([a-z]+)$'
                        AND uri_query <> '?loadtesting'
                    ) prepared
-                   GROUP BY style, zoom, scale, format, user_id, cache;")
+                   WHERE zoom != '' AND style != ''
+                   GROUP BY style, zoom, scale, format, user_id, cache, 
is_automata;")
   results <- query_hive(query)
-
-  # In my tests, I've gotten rows with blank style and NA zoom. I have no way 
of finding out whether
-  #   that's because of the hive problems I've been experiencing or what. In 
either case, it's worth
-  #   including this line for data sanitation:
-  results <- results[!is.na(results$zoom) & results$style != "", ]
-  # P.S. The zoom sometimes exceeds what we actually allow (18). Yuri said 
that's acceptable but we
+  
+  # The zoom sometimes exceeds what we actually allow (18). Yuri said that's 
acceptable but we
   # enlarge the images, so they're not actually getting zoom level 21-26 tiles.
 
   # Summarise the per-user results  by aggregating across styles and zooms:
   results <- plyr::ddply(results, plyr::.(style, zoom, scale, format, cache), 
function(x) {
-    cbind(x[1, c('style' ,'zoom', 'scale', 'format', 'cache'), drop = FALSE],
+    cbind(x[1, c("style" ,"zoom", "scale", "format", "cache", "is_automata"), 
drop = FALSE],
           users = length(x$n),
           total = sum(x$n),
           average = round(mean(x$n), 2),
@@ -58,6 +56,6 @@
   output <- results[, union('date', names(results))]
 
   # Write out
-  conditional_write(output, file.path(base_path, "tile_aggregates.tsv"))
+  conditional_write(output, file.path(base_path, "tile_aggregates_new.tsv"))
 
 }

-- 
To view, visit https://gerrit.wikimedia.org/r/260347
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I878bbe2a78d2e144154a1a654cee795cb89f0f12
Gerrit-PatchSet: 1
Gerrit-Project: wikimedia/discovery/golden
Gerrit-Branch: master
Gerrit-Owner: OliverKeyes <oke...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to