Ottomata has submitted this change and it was merged. Change subject: Legacy pageviews definition UDF ......................................................................
Legacy pageviews definition UDF Adds a UDF for seeing if requests match the legacy pageviews definition. Change-Id: Ifd8c9d73da4724d2dccd4ec6d49cc0c2ebd2b63b --- A refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/LegacyPageview.java M refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/Pageview.java A refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestLegacyPageview.java M refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestPageview.java M refinery-core/src/test/resources/pageview_test_data.csv A refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsLegacyPageviewUDF.java A refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsLegacyPageviewUDF.java M refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsPageviewUDF.java 8 files changed, 322 insertions(+), 20 deletions(-) Approvals: Ottomata: Verified; Looks good to me, approved diff --git a/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/LegacyPageview.java b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/LegacyPageview.java new file mode 100644 index 0000000..b6edb2d --- /dev/null +++ b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/LegacyPageview.java @@ -0,0 +1,104 @@ +// Copyright 2014 Wikimedia Foundation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package org.wikimedia.analytics.refinery.core; + +import java.util.regex.Pattern; +import java.util.HashSet; +import java.util.Arrays; + +/** + * Static functions to identify what requests constitute "pageviews", + * according to the definition at + * https://github.com/wikimedia/analytics-refinery/blob/master/oozie/pagecounts-all-sites/load/insert_hourly_pagecounts.hql + * This is the "legacy" definition, in use by WebStatsCollector and the + * pageviews dumps at http://dumps.wikimedia.org/other/pagecounts-ez/ + * from 2007 to early 2015, and is to be superseded by the "Pageview" class + * and isPageview method. + */ +public class LegacyPageview { + + private static final Pattern acceptedUriHostsPattern = Pattern.compile( + "\\.(mediawiki|wik(ibooks|idata|imediafoundation|inews|ipedia|iquote|isource|tionary|iversity|ivoyage))\\.org$" + ); + + private static final Pattern acceptedMetaUriHostsPattern = Pattern.compile( + "(commons|incubator|meta|outreach|quality|species|strategy|usability)(\\.m)?\\.wikimedia\\.org$" + ); + + private static final Pattern acceptedUriPattern = Pattern.compile( + "^/wiki/" + ); + + private static final Pattern rejectedUriPattern = Pattern.compile( + "^/wiki/Special\\:CentralAutoLogin/" + ); + private static final HashSet<String> rejectedUriPathPages = new HashSet<String>(Arrays.asList( + "/wiki/undefined", + "/wiki/Undefined" + )); + + private static final HashSet<String> rejectedStatusCodes = new HashSet<String>(Arrays.asList( + "301", + "302", + "303" + )); + + private static final Pattern rejectedIPPattern = Pattern.compile( + "^(10\\.20\\.0|10\\.64\\.0|10\\.128\\.0|10\\.64\\.32|208\\.80\\.15[2-5]|91\\.198\\.174)\\..+" + ); + + /** + * Given a webrequest ip, x_forwarded_for, uri_host, uri_path, and http_status, returns + * True if we consider this a 'legacy pageview', False otherwise. 
+ * + * @param ip Requesting IP address + * @param xForwardedFor the x_forwarded_for field + * @param uriHost Hostname portion of the URI + * @param uriPath Path portion of the URI + * @param uriQuery Query portion of the URI + * @param httpStatus HTTP request status code + */ + public static boolean isLegacyPageview( + String ip, + String xForwardedFor, + String uriHost, + String uriPath, + String httpStatus + ) { + + return ( + //The status code is not 301, 302 or 303 + !rejectedStatusCodes.contains(httpStatus) + + //The host is a "recognised" project + && ( + Pageview.patternIsFound(acceptedUriHostsPattern, uriHost) + || Pageview.patternIsFound(acceptedMetaUriHostsPattern, uriHost) + ) + //The URI path starts with /wiki/, and + //isn't to undefined, Undefined or Special:CentralAutoLogin + && Pageview.patternIsFound(acceptedUriPattern, uriPath) + && !Pageview.patternIsFound(rejectedUriPattern, uriPath) + && !rejectedUriPathPages.contains(uriPath) + + //The source IP isn't in a specified range (or, + //is, but the XFF field is not empty) + && ( + !Pageview.patternIsFound(rejectedIPPattern, ip) + || !xForwardedFor.equals("-") + ) + ); + } +} \ No newline at end of file diff --git a/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/Pageview.java b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/Pageview.java index 81e96f5..e52777d 100644 --- a/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/Pageview.java +++ b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/Pageview.java @@ -85,13 +85,14 @@ /** * Convenience method for Using Matcher.find() to check if * the given regex Pattern matches the target String. + * Also called in the LegacyPageview class. 
* * @param Pattern pattern * @param String target * * @return boolean */ - private static boolean patternIsFound(Pattern pattern, String target) { + public static boolean patternIsFound(Pattern pattern, String target) { return pattern.matcher(target).find(); } diff --git a/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestLegacyPageview.java b/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestLegacyPageview.java new file mode 100644 index 0000000..4027cf1 --- /dev/null +++ b/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestLegacyPageview.java @@ -0,0 +1,59 @@ +// Copyright 2014 Wikimedia Foundation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package org.wikimedia.analytics.refinery.core; + +import org.junit.Test; +import org.junit.runner.RunWith; +import static org.junit.Assert.assertEquals; + +import junitparams.FileParameters; +import junitparams.JUnitParamsRunner; +import junitparams.mappers.CsvWithHeaderMapper; + +@RunWith(JUnitParamsRunner.class) +public class TestLegacyPageview { + + + @Test + @FileParameters( + value = "src/test/resources/pageview_test_data.csv", + mapper = CsvWithHeaderMapper.class + ) + public void testIsLegacyPageview( + String test_description, + boolean is_pageview, + boolean is_legacy_pageview, + String ip_address, + String x_forwarded_for, + String uri_host, + String uri_path, + String uri_query, + String http_status, + String content_type, + String user_agent + ) { + assertEquals( + test_description, + is_legacy_pageview, + LegacyPageview.isLegacyPageview( + ip_address, + x_forwarded_for, + uri_host, + uri_path, + http_status + ) + ); + } +} \ No newline at end of file diff --git a/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestPageview.java b/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestPageview.java index f8e2781..8fef768 100644 --- a/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestPageview.java +++ b/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestPageview.java @@ -34,6 +34,9 @@ public void testIsPageview( String test_description, boolean is_pageview, + boolean is_legacy_pageview, + String ip_address, + String x_forwarded_for, String uri_host, String uri_path, String uri_query, diff --git a/refinery-core/src/test/resources/pageview_test_data.csv b/refinery-core/src/test/resources/pageview_test_data.csv index 3637293..ff7ae7f 100644 --- a/refinery-core/src/test/resources/pageview_test_data.csv +++ b/refinery-core/src/test/resources/pageview_test_data.csv @@ -1,19 +1,19 @@ -test_description, is_pageview, uri_host, uri_path, uri_query, http_status, content_type, user_agent -Is 
Pageview - Desktop, true, en.wikipedia.org, /wiki/Horseshoe_crab,-,200,text/html, turnip -Is Pageview - App, true, en.wikipedia.org, /w/api.php, ?action=mobileview&sections=0,200, application/json, WikipediaApp/1.2.3 -Is Pageview – Mobile Web, true,en.m.wikipedia.org,/wiki/Bernard_Manning,-,200,text/html,rutabaga -Is Pageview – Desktop - Serbian sr-ec, true,sr.wikipedia.org,/sr-ec/Историја_Срба_пре_Немањића,-,200,text/html,Three-finger salute -Is Pageview – Desktop - Serbian sr-el, true,sr.wikipedia.org,/sr-el/Историја_Срба_пре_Немањића,-,200,text/html,Three-finger salute -Is Pageview – Desktop - Chinese zh-cn, true,zh.wikipedia.org,/zh-cn/Wikipedia:首页,-,200,text/html,Five-test plan -Is Pageview – Desktop - Chinese zh-hans, true,zh.wikipedia.org,/zh-hans/Wikipedia:首页,-,200,text/html,Five-test plan -Is Pageview – Desktop - Chinese zh-hant, true,zh.wikipedia.org,/zh-hant/Wikipedia:首页,-,200,text/html,Five-test plan -Is Pageview – Desktop - Chinese zh-hk, true,zh.wikipedia.org,/zh-hk/Wikipedia:foo,-,200,text/html,Five-test plan -Is Pageview – Desktop - Chinese zh-mo, true,zh.wikipedia.org,/zh-mo/Wikipedia:首页,-,200,text/html,Five-test plan -Is Pageview – Desktop - Chinese zh-my, true,zh.wikipedia.org,/zh-my/Wikipedia:首页,-,200,text/html,Five-test plan -Is Pageview – Desktop - Chinese zh-sg, true,zh.wikipedia.org,/zh-sg/Wikipedia:首页,-,200,text/html,Five-test plan -Is Pageview – Desktop - Chinese zh-tw, true,zh.wikipedia.org,/zh-tw/Wikipedia:首页,-,200,text/html,Five-test plan -Is Not Pageview - http_status != 200, false, en.wikipedia.org, /wiki/Noppperrrrs,-,400,text/html ,turnip -Is Not Pageview - content_type does not match, false, en.wikipedia.org, /wiki/Noppperrrrs,-,200, image/png, turnip -Is Not Pageview - API stupidity: it outputs a 200 status code and text/html as a MIME type on certain classes of error., false, en.wikipedia.org, /w/api.php,-,200, text/html, turnip -Is Not Pageview – App request for non-page content, false, en.wikipedia.org, 
/w/api.php,?action=query&format=json&titles=Foo&prop=pageimages&piprop=thumbnail&pithumbsize=96&pilimit=1 ,200, application/json, WikipediaApp/1.2.3 -Is Not Pageview – Non-App request for page content, false, en.wikipedia.org, /w/api.php,?action=mobileview&sections=0,200, application/json, TributeApp/1.2.3 +test_description, is_pageview,is_legacy_pageview,ip_address,x_forwarded_for, uri_host, uri_path, uri_query, http_status, content_type, user_agent +Is Pageview - Desktop, true,true,174.62.175.82,-,en.wikipedia.org, /wiki/Horseshoe_crab,-,200,text/html, turnip +Is Pageview - App, true,false,174.62.175.83,-,en.wikipedia.org, /w/api.php, ?action=mobileview&sections=0,200, application/json, WikipediaApp/1.2.3 +Is Pageview – Mobile Web, true,true,174.62.175.84,-,en.m.wikipedia.org,/wiki/Bernard_Manning,-,200,text/html,rutabaga +Is Pageview – Desktop - Serbian sr-ec, true,false,174.62.175.85,-,sr.wikipedia.org,/sr-ec/Историја_Срба_пре_Немањића,-,200,text/html,Three-finger salute +Is Pageview – Desktop - Serbian sr-el, true,false,174.62.175.86,-,sr.wikipedia.org,/sr-el/Историја_Срба_пре_Немањића,-,200,text/html,Three-finger salute +Is Pageview – Desktop - Chinese zh-cn, true,false,174.62.175.87,-,zh.wikipedia.org,/zh-cn/Wikipedia:首页,-,200,text/html,Five-test plan +Is Pageview – Desktop - Chinese zh-hans, true,false,174.62.175.88,-,zh.wikipedia.org,/zh-hans/Wikipedia:首页,-,200,text/html,Five-test plan +Is Pageview – Desktop - Chinese zh-hant, true,false,174.62.175.89,-,zh.wikipedia.org,/zh-hant/Wikipedia:首页,-,200,text/html,Five-test plan +Is Pageview – Desktop - Chinese zh-hk, true,false,174.62.175.90,-,zh.wikipedia.org,/zh-hk/Wikipedia:foo,-,200,text/html,Five-test plan +Is Pageview – Desktop - Chinese zh-mo, true,false,174.62.175.91,-,zh.wikipedia.org,/zh-mo/Wikipedia:首页,-,200,text/html,Five-test plan +Is Pageview – Desktop - Chinese zh-my, true,false,174.62.175.92,-,zh.wikipedia.org,/zh-my/Wikipedia:首页,-,200,text/html,Five-test plan +Is Pageview – Desktop - Chinese zh-sg, 
true,false,174.62.175.93,-,zh.wikipedia.org,/zh-sg/Wikipedia:首页,-,200,text/html,Five-test plan +Is Pageview – Desktop - Chinese zh-tw, true,false,174.62.175.94,-,zh.wikipedia.org,/zh-tw/Wikipedia:首页,-,200,text/html,Five-test plan +Is Not Pageview - http_status != 200, false,true,174.62.175.95,-, en.wikipedia.org, /wiki/Noppperrrrs,-,400,text/html ,turnip +Is Not Pageview - content_type does not match, false,true,174.62.175.96,-, en.wikipedia.org, /wiki/Noppperrrrs,-,200, image/png, turnip +Is Not Pageview - API stupidity: it outputs a 200 status code and text/html as a MIME type on certain classes of error., false, false,174.62.175.97,-, en.wikipedia.org, /w/api.php,-,200, text/html, turnip +Is Not Pageview – App request for non-page content, false, false,174.62.175.98,-, en.wikipedia.org, /w/api.php,?action=query&format=json&titles=Foo&prop=pageimages&piprop=thumbnail&pithumbsize=96&pilimit=1 ,200, application/json, WikipediaApp/1.2.3 +Is Not Pageview – Non-App request for page content, false, false,174.62.175.99,-, en.wikipedia.org, /w/api.php,?action=mobileview&sections=0,200, application/json, TributeApp/1.2.3 diff --git a/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsLegacyPageviewUDF.java b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsLegacyPageviewUDF.java new file mode 100644 index 0000000..733cb20 --- /dev/null +++ b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsLegacyPageviewUDF.java @@ -0,0 +1,70 @@ +/** + * Copyright (C) 2014 Wikimedia Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.wikimedia.analytics.refinery.hive; + +import org.apache.hadoop.hive.ql.exec.UDF; +import org.wikimedia.analytics.refinery.core.LegacyPageview; + + +/** + * A Hive UDF to identify what requests constitute "pageviews", + * according to the definition at + * https://github.com/wikimedia/analytics-refinery/blob/master/oozie/pagecounts-all-sites/load/insert_hourly_pagecounts.hql + * This is the "legacy" definition, in use by WebStatsCollector and the + * pageviews dumps at http://dumps.wikimedia.org/other/pagecounts-ez/ + * from 2007 to early 2015, and is to be superseded by the "Pageview" class + * and isPageview method. 
+ * <p> + * Hive Usage: + * ADD JAR /path/to/refinery-hive.jar; + * CREATE TEMPORARY FUNCTION is_legacy_pageview AS + * 'org.wikimedia.analytics.refinery.hive.IsLegacyPageviewUDF'; + * SELECT + * LOWER(uri_host) as uri_host, + * count(*) as cnt + * FROM + * wmf_raw.webrequest + * WHERE + * webrequest_source = 'mobile' + * AND year=2014 + * AND month=12 + * AND day=7 + * AND hour=12 + * AND is_legacy_pageview(ip, x_forwarded_for, uri_host, uri_path, http_status) + * GROUP BY + * LOWER(uri_host) + * ORDER BY cnt desc + * LIMIT 10 + * ; + */ +public class IsLegacyPageviewUDF extends UDF { + public boolean evaluate( + String ip, + String xForwardedFor, + String uriHost, + String uriPath, + String httpStatus + ) { + return LegacyPageview.isLegacyPageview( + ip, + xForwardedFor, + uriHost, + uriPath, + httpStatus + ); + } +} \ No newline at end of file diff --git a/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsLegacyPageviewUDF.java b/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsLegacyPageviewUDF.java new file mode 100644 index 0000000..1a11387 --- /dev/null +++ b/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsLegacyPageviewUDF.java @@ -0,0 +1,62 @@ +/** + * Copyright (C) 2014 Wikimedia Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.wikimedia.analytics.refinery.hive; + +import org.junit.Test; +import org.junit.runner.RunWith; +import static org.junit.Assert.assertEquals; + +import junitparams.FileParameters; +import junitparams.JUnitParamsRunner; +import junitparams.mappers.CsvWithHeaderMapper; + +@RunWith(JUnitParamsRunner.class) +public class TestIsLegacyPageviewUDF { + + + @Test + @FileParameters( + value = "../refinery-core/src/test/resources/pageview_test_data.csv", + mapper = CsvWithHeaderMapper.class + ) + public void testIsPageview( + String test_description, + boolean is_pageview, + boolean is_legacy_pageview, + String ip_address, + String x_forwarded_for, + String uri_host, + String uri_path, + String uri_query, + String http_status, + String content_type, + String user_agent + ) { + IsLegacyPageviewUDF udf = new IsLegacyPageviewUDF(); + + assertEquals( + test_description, + is_legacy_pageview, + udf.evaluate( + ip_address, + x_forwarded_for, + uri_host, + uri_path, + http_status + ) + ); + } +} \ No newline at end of file diff --git a/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsPageviewUDF.java b/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsPageviewUDF.java index e192056..00a42fa 100644 --- a/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsPageviewUDF.java +++ b/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsPageviewUDF.java @@ -35,6 +35,9 @@ public void testIsPageview( String test_description, boolean is_pageview, + boolean is_legacy_pageview, + String ip_address, + String x_forwarded_for, String uri_host, String uri_path, String uri_query, -- To view, visit https://gerrit.wikimedia.org/r/182971 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: Ifd8c9d73da4724d2dccd4ec6d49cc0c2ebd2b63b Gerrit-PatchSet: 6 Gerrit-Project: analytics/refinery/source Gerrit-Branch: master Gerrit-Owner: OliverKeyes <[email 
protected]> Gerrit-Reviewer: Nuria <[email protected]> Gerrit-Reviewer: OliverKeyes <[email protected]> Gerrit-Reviewer: Ottomata <[email protected]> _______________________________________________ MediaWiki-commits mailing list [email protected] https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
