OliverKeyes has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/188588

Change subject: (WIP) project class/variant extraction and zero-rated request 
UDFs
......................................................................

(WIP) project class/variant extraction and zero-rated request UDFs

*UDF for identifying, to the best of our ability to detect,
whether requests are zero-rated or not. This is complete.
*UDF for identifying the project_variant and project_class
of a request. This is incomplete and causes a runtime
exception - it also lacks tests (Nuria, your thoughts
on what I'm doing wrong with the UDF proper would be
most welcome).
*Various tweaks to test files and comments to improve
consistency and explanations.

Change-Id: I674c030a36f6d1cb480edcd146405e51235d1d22
---
M 
refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/Pageview.java
M 
refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/Webrequest.java
M 
refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestWebrequest.java
M refinery-core/src/test/resources/isCrawler_test_data.csv
A refinery-core/src/test/resources/isZero_test_data.csv
A 
refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/ExtractProjectUDF.java
A 
refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsZeroUDF.java
A 
refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsZeroUDF.java
8 files changed, 270 insertions(+), 8 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/analytics/refinery/source 
refs/changes/88/188588/1

diff --git 
a/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/Pageview.java
 
b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/Pageview.java
index c3cd54a..f811e5e 100644
--- 
a/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/Pageview.java
+++ 
b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/Pageview.java
@@ -81,7 +81,7 @@
      * @param   target    String to search for
      * @return  boolean
      */
-    private static boolean stringContains(String string, String target){
+    public static boolean stringContains(String string, String target){
         return (target != null && string != null && string.contains(target));
     }
 
diff --git 
a/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/Webrequest.java
 
b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/Webrequest.java
index a5282f1..5884172 100644
--- 
a/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/Webrequest.java
+++ 
b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/Webrequest.java
@@ -15,14 +15,18 @@
 package org.wikimedia.analytics.refinery.core;
 
 import java.util.regex.Pattern;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.HashSet;
+import java.util.Arrays;
 
 /**
- * Static functions to work withh Wikimedia webrequest data.
+ * Static functions to work with Wikimedia webrequest data.
  */
 public class Webrequest {
 
     /**
-     * Wikimedia-specific crawlers
+     * Wikimedia-specific crawlers. Used in isCrawler.
      */
     private static final Pattern crawlerPattern = Pattern.compile(
         "(goo wikipedia|MediaWikiCrawler-Google|wikiwix-bot).*"
@@ -37,6 +41,21 @@
     );
 
    /**
     * Pattern for identifying Wikimedia projects; used in extractProject
     * to protect against attempts to parse IPs or spoofed hostnames.
     * Matches hosts that contain ".wik" and end in ".org".
     * NOTE(review): second-level hosts with no subdomain, such as
     * "wikipedia.org" or "mediawiki.org", have no dot before "wik" and
     * will NOT match — confirm that is intended, since extractProject
     * has a branch for two-token hosts that this guard makes unreachable.
     */
    private static final Pattern wikimediaProjectPattern = Pattern.compile(
        "\\.wik.*\\.org$"
    );

    /**
     * Subdomains that denote an access method rather than a project
     * class. extractProject skips these when choosing the project_class
     * token (e.g. "en.m.wikipedia.org" -> class "wikipedia").
     */
    private static final HashSet<String> subdomainSet = new HashSet<String>(Arrays.asList(
        "mobile",
        "m",
        "wap",
        "zero"
    ));
+
+    /**
      * Consistent fragment of the user agent used by the Wikimedia
      * official mobile apps: used to identify app requests in
      * getAccessMethod.
@@ -46,7 +65,13 @@
     );
 
     /**
-     * Identify Wikimedia-specific crawlers; returns TRUE
+     * The fragment used to identify Zero requests in their URL
+     * or x_analytics field. Used by isZero.
+     */
+    private static final String zeroIdentifier = "zero";
+
+    /**
+     * Identify Wikimedia-specific crawlers; returns true
      * if the user agent matches a known crawler.
      * @param    userAgent    the user agent associated with the request.
      * @return   boolean
@@ -95,7 +120,10 @@
      *
      * @return String
      */
-    public static String getAccessMethod(String uriHost, String userAgent) {
+    public static String getAccessMethod(
+        String uriHost,
+        String userAgent
+    ) {
         String accessMethod = "";
 
         if(appAgentPattern.matcher(userAgent).find()){
@@ -108,4 +136,89 @@
 
         return accessMethod;
     }
+
+    /**
+     * Determines whether a request, to the best of our ability
+     * to detect, came zero-rated or not.
+     * This is done by checking the host for a "zero" subdomain
+     * and checking the x_analytics field for a zero MCC code.
+     * If either are present, the request is zero-rated; else,
+     * it is not.
+     *
+     * @param   uriHost    the value in the uri_host field.
+     * @param   xAnalytics the value in the x_analytics field.
+     *
+     * @return  Boolean
+     */
+    public static boolean isZero(
+        String uriHost,
+        String xAnalytics
+    ) {
+        return (
+            Pageview.stringContains(uriHost, zeroIdentifier)
+            || Pageview.stringContains(xAnalytics, zeroIdentifier)
+        );
+    }
+
+    /**
+     * Extracts a hashmap containing the project_variant and
+     * project_class from a uri_host. These are the language code
+     * or project in the cases where (respectively) the host represents
+     * a language version or a language-neutral project. An example of
+     * the former would be uri_host "en.wikipedia.org", where "en"
+     * is the project_variant and "wikipedia" the project_class;
+     * an example of the latter would be "commons.wikimedia.org",
+     * where "commons" is the variant and "wikimedia" the class.
+     * 
+     * The method first identifies whether the uri_host refers
+     * to a Wikimedia project at all (this is not guaranteed,
+     * since some internal requests point directly to IPs and
+     * some [redacted] users who clearly [redacted] [redacted] a
+     * horse think it's fun to spoof their URLs). It then identifies
+     * whether the uri_host is intact. If both of these conditions
+     * are not met, a hashmap of "Unknown"s is returned. If both
+     * are correct, the method treats the first period-delimited
+     * token as project_variant, and the second as project_class,
+     * except in cases where the second token refers to one of the
+     * mobile-related subdomains included in uriHostSubdomains,
+     * in which case the first and third token, respectively,
+     * are taken.
+     * 
+     * @param uriHost the uri_host of a request.
+     * 
+     * @return a hashMap containing project_variant and project_class.
+     */
+    public static HashMap<String, String> extractProject(
+        String uriHost
+    ) {
+        HashMap <String,String> output = new HashMap <String, String>(){{
+            put("project_variant","Unknown"); 
+            put("project_class","Unknown");
+        }};
+
+        if(!Pageview.patternIsFound(wikimediaProjectPattern, uriHost))
+        {
+            return output;
+        }
+        uriHost.replaceAll("www\\.","");
+        String[] hostTokens = uriHost.split("\\.");
+        if(hostTokens.length < 3)
+        {
+            if(hostTokens.length == 2 && hostTokens[1].equals("org"))
+            {
+                output.put("project_class",hostTokens[1]);
+            }
+            return output;
+        }
+
+        output.put("project_variant",hostTokens[0]);
+        if(subdomainSet.contains(hostTokens[1]))
+        {
+          output.put("project_class",hostTokens[2]);
+        } else {
+          output.put("project_class",hostTokens[1]);
+        }
+        
+        return output;
+    }
 }
\ No newline at end of file
diff --git 
a/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestWebrequest.java
 
b/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestWebrequest.java
index 2ac9dcf..1007124 100644
--- 
a/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestWebrequest.java
+++ 
b/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestWebrequest.java
@@ -72,4 +72,26 @@
             )
         );
     }
+
    /**
     * Checks Webrequest.isZero against the shared fixtures in
     * isZero_test_data.csv; each row supplies a description, the
     * expected flag, a uri_host and an x_analytics value.
     */
    @Test
    @FileParameters(
        value = "../refinery-core/src/test/resources/isZero_test_data.csv",
        mapper = CsvWithHeaderMapper.class
    )
    public void testIsZero(
        String test_description,
        Boolean is_zero,
        String uri_host,
        String x_analytics
    ) {

        assertEquals(
            test_description,
            is_zero,
            Webrequest.isZero(
                uri_host,
                x_analytics
            )
        );
    }
 }
\ No newline at end of file
diff --git a/refinery-core/src/test/resources/isCrawler_test_data.csv 
b/refinery-core/src/test/resources/isCrawler_test_data.csv
index d0cb88a..d22e653 100644
--- a/refinery-core/src/test/resources/isCrawler_test_data.csv
+++ b/refinery-core/src/test/resources/isCrawler_test_data.csv
@@ -2,6 +2,6 @@
 Is crawler - Google, true,MediaWikiCrawler-Google/2.0 
(+wikidata-exter...@google.com)
 Is crawler – goo.ne.jp, true,goo wikipedia (http://help.goo.ne.jp/contact/)
 Is crawler – wikiwix, true,wikiwix-bot-3.0
-Is Not Pageview - http_status != 200, false,Mozilla/5.0 (Windows NT 6.1; 
Trident/7.0; rv:11.0) like Gecko
-Is Not Pageview - content_type does not match, false,Mozilla/5.0 (iPhone; CPU 
iPhone OS 7_1_2 like Mac OS X) AppleWebKit/537.51.2 (KHTML like Gecko) 
Version/7.0 Mobile/11D257 Safari/9537.53
-Is Not Pageview - API stupidity: it outputs a 200 status code and text/html as 
a MIME type on certain class, false,Opera/9.80 (Android; Opera 
Mini/7.6.35843/35.5858; U; en) Presto/2.8.119 Version/11.10
+Is not crawler – Windows 8, false,Mozilla/5.0 (Windows NT 6.1; Trident/7.0; 
rv:11.0) like Gecko
+Is not crawler – iphone, false,Mozilla/5.0 (iPhone; CPU iPhone OS 7_1_2 like 
Mac OS X) AppleWebKit/537.51.2 (KHTML like Gecko) Version/7.0 Mobile/11D257 
Safari/9537.53
+Is not crawler – Opera Mini, false,Opera/9.80 (Android; Opera 
Mini/7.6.35843/35.5858; U; en) Presto/2.8.119 Version/11.10
diff --git a/refinery-core/src/test/resources/isZero_test_data.csv 
b/refinery-core/src/test/resources/isZero_test_data.csv
new file mode 100644
index 0000000..a37128f
--- /dev/null
+++ b/refinery-core/src/test/resources/isZero_test_data.csv
@@ -0,0 +1,5 @@
+test_description,is_zero,uri_host,x_analytics
+Is zero – URL matches, true,en.zero.wikipedia.org,proxy=Opera;https=1
+Is zero – x_analytics matches, true,en.wikipedia.org,zero=515-03;proxy=Opera
+Is zero – both match, true,en.zero.wikipedia.org,zero=429-02;https=1
+Is not zero – neither match, false,en.wikipedia.org,mf-m=a;https=1
diff --git 
a/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/ExtractProjectUDF.java
 
b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/ExtractProjectUDF.java
new file mode 100644
index 0000000..537663c
--- /dev/null
+++ 
b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/ExtractProjectUDF.java
@@ -0,0 +1,34 @@
+/**
+ * Copyright (C) 2014  Wikimedia Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.wikimedia.analytics.refinery.hive;
+
import java.util.HashMap;

import org.apache.hadoop.hive.ql.exec.UDF;
import org.wikimedia.analytics.refinery.core.Webrequest;
+
+/**
+ * A hive UDF to extract the project variant
+ * and class from uri_hosts.
+ */
+public class ExtractProjectUDF extends UDF {
+    public Object evaluate(
+        String x_analytics
+    ) {
+        return Webrequest.extractProject(
+             x_analytics
+        );
+    }
+}
\ No newline at end of file
diff --git 
a/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsZeroUDF.java
 
b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsZeroUDF.java
new file mode 100644
index 0000000..e4bc9f4
--- /dev/null
+++ 
b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsZeroUDF.java
@@ -0,0 +1,37 @@
+/**
+ * Copyright (C) 2014  Wikimedia Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.wikimedia.analytics.refinery.hive;
+
+import org.apache.hadoop.hive.ql.exec.UDF;
+import org.wikimedia.analytics.refinery.core.Webrequest;
+
/**
 * A hive UDF to identify whether requests are
 * zero-rated or not (to the best of our ability
 * to detect). Thin wrapper around Webrequest.isZero.
 */
public class IsZeroUDF extends UDF {
    /**
     * @param uri_host    the uri_host field of the request
     * @param x_analytics the x_analytics field of the request
     * @return true if either field marks the request as zero-rated
     */
    public boolean evaluate(
        String uri_host,
        String x_analytics
    ) {
        return Webrequest.isZero(
            uri_host,
            x_analytics
        );
    }
}
\ No newline at end of file
diff --git 
a/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsZeroUDF.java
 
b/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsZeroUDF.java
new file mode 100644
index 0000000..3b62f62
--- /dev/null
+++ 
b/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsZeroUDF.java
@@ -0,0 +1,51 @@
+/**
+ * Copyright (C) 2014  Wikimedia Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.wikimedia.analytics.refinery.hive;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import static org.junit.Assert.assertEquals;
+
+import junitparams.FileParameters;
+import junitparams.JUnitParamsRunner;
+import junitparams.mappers.CsvWithHeaderMapper;
+
@RunWith(JUnitParamsRunner.class)
public class TestIsZeroUDF {

    /**
     * Runs IsZeroUDF.evaluate against the same CSV fixtures used by
     * the core TestWebrequest.testIsZero, confirming the UDF wrapper
     * agrees with Webrequest.isZero row for row.
     */
    @Test
    @FileParameters(
        value = "../refinery-core/src/test/resources/isZero_test_data.csv",
        mapper = CsvWithHeaderMapper.class
    )
    public void testIsZero(
        String test_description,
        Boolean is_zero,
        String uri_host,
        String x_analytics
    ) {
        IsZeroUDF udf = new IsZeroUDF();

        assertEquals(
            test_description,
            is_zero,
            udf.evaluate(
                uri_host,
                x_analytics
            )
        );
    }
}
\ No newline at end of file

-- 
To view, visit https://gerrit.wikimedia.org/r/188588
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I674c030a36f6d1cb480edcd146405e51235d1d22
Gerrit-PatchSet: 1
Gerrit-Project: analytics/refinery/source
Gerrit-Branch: master
Gerrit-Owner: OliverKeyes <oke...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to