Bearloga has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/295739

Change subject: Filter out search-redirect.php requests
......................................................................

Filter out search-redirect.php requests

Bug: T138411
Change-Id: I5db02d91c2403f7aef4b446572ce91358857e99e
---
M portal/pageviews.R
M portal/referers.R
2 files changed, 20 insertions(+), 11 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/wikimedia/discovery/golden refs/changes/39/295739/1

diff --git a/portal/pageviews.R b/portal/pageviews.R
index 2ed4a0f..05268dc 100644
--- a/portal/pageviews.R
+++ b/portal/pageviews.R
@@ -6,13 +6,15 @@
   clause_data <- wmf::date_clause(date)
 
   # Query
-  data <- wmf::query_hive(paste0("USE wmf;
-                                  SELECT COUNT(*) AS pageviews
-                                  FROM webrequest
-                                 ", clause_data$date_clause, 
-                                 "AND uri_host IN('www.wikipedia.org', 'wikipedia.org')
-                                  AND content_type RLIKE('^text/html')
-                                  AND webrequest_source = 'text'"))
+  data <- wmf::query_hive(paste("USE wmf;
+                                 SELECT COUNT(*) AS pageviews
+                                 FROM webrequest",
+                                 clause_data$date_clause, 
+                                "AND uri_host RLIKE('^(www\\.)?wikipedia.org/*$')
+                                 AND INSTR(uri_path, 'search-redirect.php') = 0
+                                 AND content_type RLIKE('^text/html')
+                                 AND webrequest_source = 'text'
+                                 AND NOT (referer RLIKE('^http://localhost'));"))
       
   output <- data.frame(date = clause_data$date, pageviews = data$pageviews)
   
diff --git a/portal/referers.R b/portal/referers.R
index 4f76059..81f583f 100644
--- a/portal/referers.R
+++ b/portal/referers.R
@@ -8,7 +8,7 @@
   clause_data <- wmf::date_clause(date)
 
   # Write query and run it
-  query <- paste0("ADD JAR /home/bearloga/Code/analytics-refinery-jars/refinery-hive.jar;
+  query <- paste("ADD JAR /home/bearloga/Code/analytics-refinery-jars/refinery-hive.jar;
                   CREATE TEMPORARY FUNCTION is_external_search AS
                   'org.wikimedia.analytics.refinery.hive.IsExternalSearchUDF';
                   CREATE TEMPORARY FUNCTION classify_referer AS
@@ -21,10 +21,17 @@
                     classify_referer(referer) AS referer_class,
                     get_engine(referer) as search_engine,
                     COUNT(*) AS pageviews
-                  FROM webrequest ", clause_data$date_clause, "
-                    AND webrequest_source = 'text'
+                  FROM webrequest",
+                  clause_data$date_clause,
+                 "  AND webrequest_source = 'text'
                     AND content_type RLIKE('^text/html')
-                    AND uri_host IN('www.wikipedia.org','wikipedia.org')
+                    AND uri_host RLIKE('^(www\\.)?wikipedia.org/*$')
+                    AND (
+                      INSTR(uri_path, 'search-redirect.php') = 0
+                      OR
+                      NOT referer RLIKE('^(https?://www\\.)?wikipedia.org/+search-redirect.php')
+                    )
+                    AND NOT referer RLIKE('^http://localhost')
                   GROUP BY
                     is_external_search(referer),
                     classify_referer(referer),

-- 
To view, visit https://gerrit.wikimedia.org/r/295739
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I5db02d91c2403f7aef4b446572ce91358857e99e
Gerrit-PatchSet: 1
Gerrit-Project: wikimedia/discovery/golden
Gerrit-Branch: master
Gerrit-Owner: Bearloga <mpo...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to