Bearloga has uploaded a new change for review. https://gerrit.wikimedia.org/r/295739
Change subject: Filter out search-redirect.php requests ...................................................................... Filter out search-redirect.php requests Bug: T138411 Change-Id: I5db02d91c2403f7aef4b446572ce91358857e99e --- M portal/pageviews.R M portal/referers.R 2 files changed, 20 insertions(+), 11 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/wikimedia/discovery/golden refs/changes/39/295739/1 diff --git a/portal/pageviews.R b/portal/pageviews.R index 2ed4a0f..05268dc 100644 --- a/portal/pageviews.R +++ b/portal/pageviews.R @@ -6,13 +6,15 @@ clause_data <- wmf::date_clause(date) # Query - data <- wmf::query_hive(paste0("USE wmf; - SELECT COUNT(*) AS pageviews - FROM webrequest - ", clause_data$date_clause, - "AND uri_host IN('www.wikipedia.org', 'wikipedia.org') - AND content_type RLIKE('^text/html') - AND webrequest_source = 'text'")) + data <- wmf::query_hive(paste("USE wmf; + SELECT COUNT(*) AS pageviews + FROM webrequest", + clause_data$date_clause, + "AND uri_host RLIKE('^(www\\.)?wikipedia.org/*$') + AND INSTR(uri_path, 'search-redirect.php') = 0 + AND content_type RLIKE('^text/html') + AND webrequest_source = 'text' + AND NOT (referer RLIKE('^http://localhost'));")) output <- data.frame(date = clause_data$date, pageviews = data$pageviews) diff --git a/portal/referers.R b/portal/referers.R index 4f76059..81f583f 100644 --- a/portal/referers.R +++ b/portal/referers.R @@ -8,7 +8,7 @@ clause_data <- wmf::date_clause(date) # Write query and run it - query <- paste0("ADD JAR /home/bearloga/Code/analytics-refinery-jars/refinery-hive.jar; + query <- paste("ADD JAR /home/bearloga/Code/analytics-refinery-jars/refinery-hive.jar; CREATE TEMPORARY FUNCTION is_external_search AS 'org.wikimedia.analytics.refinery.hive.IsExternalSearchUDF'; CREATE TEMPORARY FUNCTION classify_referer AS @@ -21,10 +21,17 @@ classify_referer(referer) AS referer_class, get_engine(referer) as search_engine, COUNT(*) AS pageviews - FROM webrequest ", clause_data$date_clause, " - AND webrequest_source = 'text' + FROM webrequest", + clause_data$date_clause, + " AND webrequest_source = 'text' AND content_type RLIKE('^text/html') - AND uri_host IN('www.wikipedia.org','wikipedia.org') + AND uri_host RLIKE('^(www\\.)?wikipedia.org/*$') + AND ( + INSTR(uri_path, 'search-redirect.php') = 0 + OR + NOT referer RLIKE('^(https?://www\\.)?wikipedia.org/+search-redirect.php') + ) + AND NOT referer RLIKE('^http://localhost') GROUP BY is_external_search(referer), classify_referer(referer), -- To view, visit https://gerrit.wikimedia.org/r/295739 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I5db02d91c2403f7aef4b446572ce91358857e99e Gerrit-PatchSet: 1 Gerrit-Project: wikimedia/discovery/golden Gerrit-Branch: master Gerrit-Owner: Bearloga <mpo...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits