OliverKeyes has uploaded a new change for review. https://gerrit.wikimedia.org/r/188588
Change subject: (WIP) project class/variant extraction and zero-rated request UDFs ...................................................................... (WIP) project class/variant extraction and zero-rated request UDFs *UDF for identifying, to the best of our ability to detect, whether requests are zero-rated or not. This is complete. *UDF for identifying the project_variant and project_class of a request. This is incomplete and causes a runtime exception - it also lacks tests (Nuria, your thoughts on what I'm doing wrong with the UDF proper would be most welcome). *Various tweaks to test files and comments to improve consistency and explanations. Change-Id: I674c030a36f6d1cb480edcd146405e51235d1d22 --- M refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/Pageview.java M refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/Webrequest.java M refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestWebrequest.java M refinery-core/src/test/resources/isCrawler_test_data.csv A refinery-core/src/test/resources/isZero_test_data.csv A refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/ExtractProjectUDF.java A refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsZeroUDF.java A refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsZeroUDF.java 8 files changed, 270 insertions(+), 8 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/analytics/refinery/source refs/changes/88/188588/1 diff --git a/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/Pageview.java b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/Pageview.java index c3cd54a..f811e5e 100644 --- a/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/Pageview.java +++ b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/Pageview.java @@ -81,7 +81,7 @@ * @param target String to search for * @return boolean */ - private static boolean stringContains(String string, String target){ + public static boolean stringContains(String string, String target){ return (target != null && string != null && string.contains(target)); } diff --git a/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/Webrequest.java b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/Webrequest.java index a5282f1..5884172 100644 --- a/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/Webrequest.java +++ b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/Webrequest.java @@ -15,14 +15,18 @@ package org.wikimedia.analytics.refinery.core; import java.util.regex.Pattern; +import java.util.HashMap; +import java.util.Map; +import java.util.HashSet; +import java.util.Arrays; /** - * Static functions to work withh Wikimedia webrequest data. + * Static functions to work with Wikimedia webrequest data. */ public class Webrequest { /** - * Wikimedia-specific crawlers + * Wikimedia-specific crawlers. Used in isCrawler. */ private static final Pattern crawlerPattern = Pattern.compile( "(goo wikipedia|MediaWikiCrawler-Google|wikiwix-bot).*" @@ -37,6 +41,21 @@ ); /** + * Pattern for identifying Wikimedia projects; used in extractProject + * to protect against attempts to parse IPs or spoofed hostnames. + */ + private static final Pattern wikimediaProjectPattern = Pattern.compile( + "\\.wik.*\\.org$" + ); + + private static final HashSet<String> subdomainSet = new HashSet<String>(Arrays.asList( + "mobile", + "m", + "wap", + "zero" + )); + + /** * Consistent fragment of the user agent used by the Wikimedia * official mobile apps: used to identify app requests in * getAccessMethod. @@ -46,7 +65,13 @@ ); /** - * Identify Wikimedia-specific crawlers; returns TRUE + * The fragment used to identify Zero requests in their URL + * or x_analytics field. Used by isZero. + */ + private static final String zeroIdentifier = "zero"; + + /** + * Identify Wikimedia-specific crawlers; returns true * if the user agent matches a known crawler. * @param userAgent the user agent associated with the request. * @return boolean @@ -95,7 +120,10 @@ * * @return String */ - public static String getAccessMethod(String uriHost, String userAgent) { + public static String getAccessMethod( + String uriHost, + String userAgent + ) { String accessMethod = ""; if(appAgentPattern.matcher(userAgent).find()){ @@ -108,4 +136,89 @@ return accessMethod; } + + /** + * Determines whether a request, to the best of our ability + * to detect, came zero-rated or not. + * This is done by checking the host for a "zero" subdomain + * and checking the x_analytics field for a zero MCC code. + * If either are present, the request is zero-rated; else, + * it is not. + * + * @param uriHost the value in the uri_host field. + * @param xAnalytics the value in the x_analytics field. + * + * @return Boolean + */ + public static boolean isZero( + String uriHost, + String xAnalytics + ) { + return ( + Pageview.stringContains(uriHost, zeroIdentifier) + || Pageview.stringContains(xAnalytics, zeroIdentifier) + ); + } + + /** + * Extracts a hashmap containing the project_variant and + * project_class from a uri_host. These are the language code + * or project in the cases where (respectively) the host represents + * a language version or a language-neutral project. An example of + * the former would be uri_host "en.wikipedia.org", where "en" + * is the project_variant and "wikipedia" the project_class; + * an example of the latter would be "commons.wikimedia.org", + * where "commons" is the variant and "wikimedia" the class. + * + * The method first identifies whether the uri_host refers + * to a Wikimedia project at all (this is not guaranteed, + * since some internal requests point directly to IPs and + * some [redacted] users who clearly [redacted] [redacted] a + * horse think it's fun to spoof their URLs). It then identifies + * whether the uri_host is intact. If both of these conditions + * are not met, a hashmap of "Unknown"s is returned. If both + * are correct, the method treats the first period-delimited + * token as project_variant, and the second as project_class, + * except in cases where the second token refers to one of the + * mobile-related subdomains included in uriHostSubdomains, + * in which case the first and third token, respectively, + * are taken. + * + * @param uriHost the uri_host of a request. + * + * @return a hashMap containing project_variant and project_class. + */ + public static HashMap<String, String> extractProject( + String uriHost + ) { + HashMap <String,String> output = new HashMap <String, String>(){{ + put("project_variant","Unknown"); + put("project_class","Unknown"); + }}; + + if(!Pageview.patternIsFound(wikimediaProjectPattern, uriHost)) + { + return output; + } + uriHost.replaceAll("www\\.",""); + String[] hostTokens = uriHost.split("\\."); + if(hostTokens.length < 3) + { + if(hostTokens.length == 2 && hostTokens[1].equals("org")) + { + output.put("project_class",hostTokens[1]); + } + return output; + } + + output.put("project_variant",hostTokens[0]); + if(subdomainSet.contains(hostTokens[1])) + { + output.put("project_class",hostTokens[2]); + } else { + output.put("project_class",hostTokens[1]); + } + + return output; + } } \ No newline at end of file diff --git a/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestWebrequest.java b/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestWebrequest.java index 2ac9dcf..1007124 100644 --- a/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestWebrequest.java +++ b/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestWebrequest.java @@ -72,4 +72,26 @@ ) ); } + + @Test + @FileParameters( + value = "../refinery-core/src/test/resources/isZero_test_data.csv", + mapper = CsvWithHeaderMapper.class + ) + public void testIsZero( + String test_description, + Boolean is_zero, + String uri_host, + String x_analytics + ) { + + assertEquals( + test_description, + is_zero, + Webrequest.isZero( + uri_host, + x_analytics + ) + ); + } } \ No newline at end of file diff --git a/refinery-core/src/test/resources/isCrawler_test_data.csv b/refinery-core/src/test/resources/isCrawler_test_data.csv index d0cb88a..d22e653 100644 --- a/refinery-core/src/test/resources/isCrawler_test_data.csv +++ b/refinery-core/src/test/resources/isCrawler_test_data.csv @@ -2,6 +2,6 @@ Is crawler - Google, true,MediaWikiCrawler-Google/2.0 (+wikidata-exter...@google.com) Is crawler – goo.ne.jp, true,goo wikipedia (http://help.goo.ne.jp/contact/) Is crawler – wikiwix, true,wikiwix-bot-3.0 -Is Not Pageview - http_status != 200, false,Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko -Is Not Pageview - content_type does not match, false,Mozilla/5.0 (iPhone; CPU iPhone OS 7_1_2 like Mac OS X) AppleWebKit/537.51.2 (KHTML like Gecko) Version/7.0 Mobile/11D257 Safari/9537.53 -Is Not Pageview - API stupidity: it outputs a 200 status code and text/html as a MIME type on certain class, false,Opera/9.80 (Android; Opera Mini/7.6.35843/35.5858; U; en) Presto/2.8.119 Version/11.10 +Is not crawler – Windows 8, false,Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko +Is not crawler – iphone, false,Mozilla/5.0 (iPhone; CPU iPhone OS 7_1_2 like Mac OS X) AppleWebKit/537.51.2 (KHTML like Gecko) Version/7.0 Mobile/11D257 Safari/9537.53 +Is not crawler – Opera Mini, false,Opera/9.80 (Android; Opera Mini/7.6.35843/35.5858; U; en) Presto/2.8.119 Version/11.10 diff --git a/refinery-core/src/test/resources/isZero_test_data.csv b/refinery-core/src/test/resources/isZero_test_data.csv new file mode 100644 index 0000000..a37128f --- /dev/null +++ b/refinery-core/src/test/resources/isZero_test_data.csv @@ -0,0 +1,5 @@ +test_description,is_zero,uri_host,x_analytics +Is zero – URL matches, true,en.zero.wikipedia.org,proxy=Opera;https=1 +Is zero – x_analytics matches, true,en.wikipedia.org,zero=515-03;proxy=Opera +Is zero – both match, true,en.zero.wikipedia.org,zero=429-02;https=1 +Is not zero – neither match, false,en.wikipedia.org,mf-m=a;https=1 diff --git a/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/ExtractProjectUDF.java b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/ExtractProjectUDF.java new file mode 100644 index 0000000..537663c --- /dev/null +++ b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/ExtractProjectUDF.java @@ -0,0 +1,34 @@ +/** + * Copyright (C) 2014 Wikimedia Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.wikimedia.analytics.refinery.hive; + +import org.apache.hadoop.hive.ql.exec.UDF; +import org.wikimedia.analytics.refinery.core.Webrequest; + +/** + * A hive UDF to extract the project variant + * and class from uri_hosts. + */ +public class ExtractProjectUDF extends UDF { + public Object evaluate( + String x_analytics + ) { + return Webrequest.extractProject( + x_analytics + ); + } +} \ No newline at end of file diff --git a/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsZeroUDF.java b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsZeroUDF.java new file mode 100644 index 0000000..e4bc9f4 --- /dev/null +++ b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsZeroUDF.java @@ -0,0 +1,37 @@ +/** + * Copyright (C) 2014 Wikimedia Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.wikimedia.analytics.refinery.hive; + +import org.apache.hadoop.hive.ql.exec.UDF; +import org.wikimedia.analytics.refinery.core.Webrequest; + +/** + * A hive UDF to identify whether requests are + * zero-rated or not (to the best of our ability + * to detect) + */ +public class IsZeroUDF extends UDF { + public boolean evaluate( + String uri_host, + String x_analytics + ) { + return Webrequest.isZero( + uri_host, + x_analytics + ); + } +} \ No newline at end of file diff --git a/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsZeroUDF.java b/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsZeroUDF.java new file mode 100644 index 0000000..3b62f62 --- /dev/null +++ b/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsZeroUDF.java @@ -0,0 +1,51 @@ +/** + * Copyright (C) 2014 Wikimedia Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.wikimedia.analytics.refinery.hive; + +import org.junit.Test; +import org.junit.runner.RunWith; +import static org.junit.Assert.assertEquals; + +import junitparams.FileParameters; +import junitparams.JUnitParamsRunner; +import junitparams.mappers.CsvWithHeaderMapper; + +@RunWith(JUnitParamsRunner.class) +public class TestIsZeroUDF { + + @Test + @FileParameters( + value = "../refinery-core/src/test/resources/isZero_test_data.csv", + mapper = CsvWithHeaderMapper.class + ) + public void testIsZero( + String test_description, + Boolean is_zero, + String uri_host, + String x_analytics + ) { + IsZeroUDF udf = new IsZeroUDF(); + + assertEquals( + test_description, + is_zero, + udf.evaluate( + uri_host, + x_analytics + ) + ); + } +} \ No newline at end of file -- To view, visit https://gerrit.wikimedia.org/r/188588 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I674c030a36f6d1cb480edcd146405e51235d1d22 Gerrit-PatchSet: 1 Gerrit-Project: analytics/refinery/source Gerrit-Branch: master Gerrit-Owner: OliverKeyes <oke...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits