OliverKeyes has uploaded a new change for review. https://gerrit.wikimedia.org/r/182971
Change subject: [WIP] Legacy pageviews definition UDF ...................................................................... [WIP] Legacy pageviews definition UDF Adds a UDF for seeing if requests match the legacy pageviews definition. This is very much a WIP; don't even bother CRing it at this stage, I just need to commit SOMEthing so that Nuria can try and debug a weird error. Change-Id: Ifd8c9d73da4724d2dccd4ec6d49cc0c2ebd2b63b --- A refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/LegacyPageview.java A refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestLegacyPageview.java M refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestPageview.java M refinery-core/src/test/resources/pageview_test_data.csv M refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsPageviewUDF.java 5 files changed, 177 insertions(+), 19 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/analytics/refinery/source refs/changes/71/182971/1 diff --git a/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/LegacyPageview.java b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/LegacyPageview.java new file mode 100644 index 0000000..31d0640 --- /dev/null +++ b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/LegacyPageview.java @@ -0,0 +1,93 @@ +// Copyright 2014 Wikimedia Foundation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package org.wikimedia.analytics.refinery.core; + +import java.util.regex.Pattern; +import java.util.HashSet; +import java.util.Arrays; + +/** + * Static functions to work with Wikimedia webrequest data. + * Created based on the HQL implementation at + * https://github.com/wikimedia/analytics-refinery/blob/master/oozie/pagecounts-all-sites/load/insert_hourly_pagecounts.hql + */ +public class LegacyPageview { + + private static final Pattern acceptedUriHostsPattern = Pattern.compile( + "\\.(mediawiki|wik(ibooks|idata|imediafoundation|inews|ipedia|iquote|isource|tionary|iversity|ivoyage))\\.org$" + ); + + private static final Pattern acceptedMetaUriHostsPattern = Pattern.compile( + "(commons|incubator|meta|outreach|quality|species|strategy|usability)(\\.m)?\\.wikimedia)\\.org$" + ); + + private static final String acceptedUriPaths = "/wiki/"; + + private static final String rejectedUriPaths = "/wiki/Special:CentralAutoLogin/"; + + private static final HashSet<String> rejectedUriPathPages = new HashSet<String>(Arrays.asList( + "/wiki/undefined", + "/wiki/Undefined" + )); + + private static final HashSet<String> rejectedStatusCodes = new HashSet<String>(Arrays.asList( + "301", + "302", + "303" + )); + + private static final Pattern rejectedIPPattern = Pattern.compile( + "^(10\\.20\\.0|10\\.64\\.0|10\\.128\\.0|10\\.64\\.32|208\\.80\\.15[2-5]|91\\.198\\.174)\\..+" + ); + + /** + * Given a webrequest ip, x_forwarded_for, uri_host, uri_path, and http_status, returns + * True if we consider this a 'pageview', False otherwise. + * + * See: https://meta.wikimedia.org/wiki/Research:Page_view/Generalised_filters + * for information on how to classify a pageview. 
+ * @param ipAddress Requesting IP address + * @param xForwarded the x_forwarded_for field + * @param uriHost Hostname portion of the URI + * @param uriPath Path portion of the URI + * @param uriQuery Query portion of the URI + * @param httpStatus HTTP request status code + */ + public static boolean isLegacyPageview(String ipAddress, String xForwarded, String uriHost, String uriPath, String httpStatus) { + + return ( + //The status code is not 301, 302 or 303 + !rejectedStatusCodes.contains(httpStatus) + + //The host is a "recognised" project + && ( + acceptedUriHostsPattern.matcher(uriHost).find() + || acceptedMetaUriHostsPattern.matcher(uriHost).find() + ) + //The URI path starts with /wiki/, and + //isn't to undefined, Undefined or Special:CentralAutoLogin + && uriPath.substring(0,6).equals(acceptedUriPaths) + && !uriPath.substring(0,30).equals(rejectedUriPaths) + && !rejectedUriPathPages.contains(uriPath) + + //The source IP isn't in a specified range (or, + //is, but the XFF field is not empty) + && ( + !rejectedIPPattern.matcher(ipAddress).find() + || !xForwarded.equals("-") + ) + ); + } +} \ No newline at end of file diff --git a/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestLegacyPageview.java b/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestLegacyPageview.java new file mode 100644 index 0000000..4027cf1 --- /dev/null +++ b/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestLegacyPageview.java @@ -0,0 +1,59 @@ +// Copyright 2014 Wikimedia Foundation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package org.wikimedia.analytics.refinery.core; + +import org.junit.Test; +import org.junit.runner.RunWith; +import static org.junit.Assert.assertEquals; + +import junitparams.FileParameters; +import junitparams.JUnitParamsRunner; +import junitparams.mappers.CsvWithHeaderMapper; + +@RunWith(JUnitParamsRunner.class) +public class TestLegacyPageview { + + + @Test + @FileParameters( + value = "src/test/resources/pageview_test_data.csv", + mapper = CsvWithHeaderMapper.class + ) + public void testIsLegacyPageview( + String test_description, + boolean is_pageview, + boolean is_legacy_pageview, + String ip_address, + String x_forwarded_for, + String uri_host, + String uri_path, + String uri_query, + String http_status, + String content_type, + String user_agent + ) { + assertEquals( + test_description, + is_legacy_pageview, + LegacyPageview.isLegacyPageview( + ip_address, + x_forwarded_for, + uri_host, + uri_path, + http_status + ) + ); + } +} \ No newline at end of file diff --git a/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestPageview.java b/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestPageview.java index f8e2781..8fef768 100644 --- a/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestPageview.java +++ b/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestPageview.java @@ -34,6 +34,9 @@ public void testIsPageview( String test_description, boolean is_pageview, + boolean is_legacy_pageview, + String ip_address, + String x_forwarded_for, String uri_host, String 
uri_path, String uri_query, diff --git a/refinery-core/src/test/resources/pageview_test_data.csv b/refinery-core/src/test/resources/pageview_test_data.csv index 3637293..ff7ae7f 100644 --- a/refinery-core/src/test/resources/pageview_test_data.csv +++ b/refinery-core/src/test/resources/pageview_test_data.csv @@ -1,19 +1,19 @@ -test_description, is_pageview, uri_host, uri_path, uri_query, http_status, content_type, user_agent -Is Pageview - Desktop, true, en.wikipedia.org, /wiki/Horseshoe_crab,-,200,text/html, turnip -Is Pageview - App, true, en.wikipedia.org, /w/api.php, ?action=mobileview&sections=0,200, application/json, WikipediaApp/1.2.3 -Is Pageview – Mobile Web, true,en.m.wikipedia.org,/wiki/Bernard_Manning,-,200,text/html,rutabaga -Is Pageview – Desktop - Serbian sr-ec, true,sr.wikipedia.org,/sr-ec/Историја_Срба_пре_Немањића,-,200,text/html,Three-finger salute -Is Pageview – Desktop - Serbian sr-el, true,sr.wikipedia.org,/sr-el/Историја_Срба_пре_Немањића,-,200,text/html,Three-finger salute -Is Pageview – Desktop - Chinese zh-cn, true,zh.wikipedia.org,/zh-cn/Wikipedia:首页,-,200,text/html,Five-test plan -Is Pageview – Desktop - Chinese zh-hans, true,zh.wikipedia.org,/zh-hans/Wikipedia:首页,-,200,text/html,Five-test plan -Is Pageview – Desktop - Chinese zh-hant, true,zh.wikipedia.org,/zh-hant/Wikipedia:首页,-,200,text/html,Five-test plan -Is Pageview – Desktop - Chinese zh-hk, true,zh.wikipedia.org,/zh-hk/Wikipedia:foo,-,200,text/html,Five-test plan -Is Pageview – Desktop - Chinese zh-mo, true,zh.wikipedia.org,/zh-mo/Wikipedia:首页,-,200,text/html,Five-test plan -Is Pageview – Desktop - Chinese zh-my, true,zh.wikipedia.org,/zh-my/Wikipedia:首页,-,200,text/html,Five-test plan -Is Pageview – Desktop - Chinese zh-sg, true,zh.wikipedia.org,/zh-sg/Wikipedia:首页,-,200,text/html,Five-test plan -Is Pageview – Desktop - Chinese zh-tw, true,zh.wikipedia.org,/zh-tw/Wikipedia:首页,-,200,text/html,Five-test plan -Is Not Pageview - http_status != 200, false, en.wikipedia.org, 
/wiki/Noppperrrrs,-,400,text/html ,turnip -Is Not Pageview - content_type does not match, false, en.wikipedia.org, /wiki/Noppperrrrs,-,200, image/png, turnip -Is Not Pageview - API stupidity: it outputs a 200 status code and text/html as a MIME type on certain classes of error., false, en.wikipedia.org, /w/api.php,-,200, text/html, turnip -Is Not Pageview – App request for non-page content, false, en.wikipedia.org, /w/api.php,?action=query&format=json&titles=Foo&prop=pageimages&piprop=thumbnail&pithumbsize=96&pilimit=1 ,200, application/json, WikipediaApp/1.2.3 -Is Not Pageview – Non-App request for page content, false, en.wikipedia.org, /w/api.php,?action=mobileview&sections=0,200, application/json, TributeApp/1.2.3 +test_description, is_pageview,is_legacy_pageview,ip_address,x_forwarded_for, uri_host, uri_path, uri_query, http_status, content_type, user_agent +Is Pageview - Desktop, true,true,174.62.175.82,-,en.wikipedia.org, /wiki/Horseshoe_crab,-,200,text/html, turnip +Is Pageview - App, true,false,174.62.175.83,-,en.wikipedia.org, /w/api.php, ?action=mobileview&sections=0,200, application/json, WikipediaApp/1.2.3 +Is Pageview – Mobile Web, true,true,174.62.175.84,-,en.m.wikipedia.org,/wiki/Bernard_Manning,-,200,text/html,rutabaga +Is Pageview – Desktop - Serbian sr-ec, true,false,174.62.175.85,-,sr.wikipedia.org,/sr-ec/Историја_Срба_пре_Немањића,-,200,text/html,Three-finger salute +Is Pageview – Desktop - Serbian sr-el, true,false,174.62.175.86,-,sr.wikipedia.org,/sr-el/Историја_Срба_пре_Немањића,-,200,text/html,Three-finger salute +Is Pageview – Desktop - Chinese zh-cn, true,false,174.62.175.87,-,zh.wikipedia.org,/zh-cn/Wikipedia:首页,-,200,text/html,Five-test plan +Is Pageview – Desktop - Chinese zh-hans, true,false,174.62.175.88,-,zh.wikipedia.org,/zh-hans/Wikipedia:首页,-,200,text/html,Five-test plan +Is Pageview – Desktop - Chinese zh-hant, true,false,174.62.175.89,-,zh.wikipedia.org,/zh-hant/Wikipedia:首页,-,200,text/html,Five-test plan +Is Pageview – Desktop - 
Chinese zh-hk, true,false,174.62.175.90,-,zh.wikipedia.org,/zh-hk/Wikipedia:foo,-,200,text/html,Five-test plan +Is Pageview – Desktop - Chinese zh-mo, true,false,174.62.175.91,-,zh.wikipedia.org,/zh-mo/Wikipedia:首页,-,200,text/html,Five-test plan +Is Pageview – Desktop - Chinese zh-my, true,false,174.62.175.92,-,zh.wikipedia.org,/zh-my/Wikipedia:首页,-,200,text/html,Five-test plan +Is Pageview – Desktop - Chinese zh-sg, true,false,174.62.175.93,-,zh.wikipedia.org,/zh-sg/Wikipedia:首页,-,200,text/html,Five-test plan +Is Pageview – Desktop - Chinese zh-tw, true,false,174.62.175.94,-,zh.wikipedia.org,/zh-tw/Wikipedia:首页,-,200,text/html,Five-test plan +Is Not Pageview - http_status != 200, false,true,174.62.175.95,-, en.wikipedia.org, /wiki/Noppperrrrs,-,400,text/html ,turnip +Is Not Pageview - content_type does not match, false,true,174.62.175.96,-, en.wikipedia.org, /wiki/Noppperrrrs,-,200, image/png, turnip +Is Not Pageview - API stupidity: it outputs a 200 status code and text/html as a MIME type on certain classes of error., false, false,174.62.175.97,-, en.wikipedia.org, /w/api.php,-,200, text/html, turnip +Is Not Pageview – App request for non-page content, false, false,174.62.175.98,-, en.wikipedia.org, /w/api.php,?action=query&format=json&titles=Foo&prop=pageimages&piprop=thumbnail&pithumbsize=96&pilimit=1 ,200, application/json, WikipediaApp/1.2.3 +Is Not Pageview – Non-App request for page content, false, false,174.62.175.99,-, en.wikipedia.org, /w/api.php,?action=mobileview&sections=0,200, application/json, TributeApp/1.2.3 diff --git a/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsPageviewUDF.java b/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsPageviewUDF.java index e192056..00a42fa 100644 --- a/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsPageviewUDF.java +++ b/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsPageviewUDF.java @@ -35,6 +35,9 @@ public void 
testIsPageview( String test_description, boolean is_pageview, + boolean is_legacy_pageview, + String ip_address, + String x_forwarded_for, String uri_host, String uri_path, String uri_query, -- To view, visit https://gerrit.wikimedia.org/r/182971 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: Ifd8c9d73da4724d2dccd4ec6d49cc0c2ebd2b63b Gerrit-PatchSet: 1 Gerrit-Project: analytics/refinery/source Gerrit-Branch: master Gerrit-Owner: OliverKeyes <[email protected]> _______________________________________________ MediaWiki-commits mailing list [email protected] https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
