Ottomata has submitted this change and it was merged.

Change subject: Patch pageview definition
......................................................................


Patch pageview definition

Modify maven pom.xml files not to build uber jars and to take advantage of 
properties for versions.
Include xx.mobile.xxx.org and xx.wap.xxx.org.
Update referer classification to output a string instead of a map.
Add getProject function and UDF to identify pageview requests.
Correct small bugs in tests.

Change-Id: Id3b14d954d1396a8e8667d6865a854ad1167d830
---
M .gitignore
M changelog.md
M pom.xml
M refinery-core/pom.xml
M 
refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/PageviewDefinition.java
M 
refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/Webrequest.java
M 
refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestLegacyPageviewDefinition.java
M 
refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestPageview.java
M 
refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestUAParserUserAgentMostPopular.java
M 
refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestWebrequestRefererClassifier.java
M refinery-core/src/test/resources/access_method_test_data.csv
M refinery-core/src/test/resources/pageview_test_data.csv
M refinery-hive/pom.xml
A 
refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GetProjectUDF.java
M 
refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsPageviewUDF.java
M 
refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/RefererClassifierUDF.java
A 
refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestGetProjectUDF.java
M 
refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsAppPageviewUDF.java
M 
refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsLegacyPageviewUDF.java
M 
refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsPageviewUDF.java
M 
refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestRefererClassifierUDF.java
M refinery-job/pom.xml
M refinery-tools/pom.xml
23 files changed, 361 insertions(+), 292 deletions(-)

Approvals:
  Ottomata: Verified; Looks good to me, approved



diff --git a/.gitignore b/.gitignore
index b9d47ea..ad51ad7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -13,3 +13,4 @@
 *.iml
 *.ipr
 *.iws
+out/
diff --git a/changelog.md b/changelog.md
index 77dec81..78498ab 100644
--- a/changelog.md
+++ b/changelog.md
@@ -1,4 +1,11 @@
-## v0.0.9-SNAPSHOT
+## v0.0.10-SNAPSHOT
+* Maven now builds non-uber jars by having hadoop and hive in provided scope.
+  It also takes advantage of properties to propagate version numbers.
+* PageView Class has a function to extract project from uri.
+  Bugs have been corrected on how to handle mobile uri.
+* Referer classification now outputs a string instead of a map.
+
+## v0.0.9
 * Generic functions used in multiple classes now live in a single "utilities" 
class.
 * Pageview and LegacyPageview have been renamed to PageviewDefinition and
   LegacyPageviewDefinition, respectively.  These also should now use the
diff --git a/pom.xml b/pom.xml
index c8b829c..6d70477 100644
--- a/pom.xml
+++ b/pom.xml
@@ -122,24 +122,6 @@
       </dependency>
 
       <dependency>
-        <groupId>org.apache.hadoop</groupId>
-        <artifactId>hadoop-common</artifactId>
-        <version>2.5.0-cdh5.3.1</version>
-      </dependency>
-
-      <dependency>
-          <groupId>org.apache.hadoop</groupId>
-          <artifactId>hadoop-client</artifactId>
-          <version>2.5.0-cdh5.3.1</version>
-      </dependency>
-
-      <dependency>
-          <groupId>org.apache.hive</groupId>
-          <artifactId>hive-exec</artifactId>
-          <version>0.13.1-cdh5.3.1</version>
-      </dependency>
-
-      <dependency>
           <groupId>ua_parser</groupId>
           <artifactId>ua-parser</artifactId>
           <version>1.3.0-wmf1</version>
@@ -175,32 +157,10 @@
           <version>2.0.29</version>
       </dependency>
 
-
       <dependency>
-          <groupId>org.scala-lang</groupId>
-          <artifactId>scala-library</artifactId>
-          <version>2.10.0</version>
-      </dependency>
-
-      <dependency>
-          <groupId>org.apache.spark</groupId>
-          <artifactId>spark-core_2.10</artifactId>
-          <version>1.2.0-cdh5.3.1</version>
-          <scope>provided</scope>
-      </dependency>
-
-      <dependency>
-          <groupId>org.apache.spark</groupId>
-          <artifactId>spark-hive_2.10</artifactId>
-          <version>1.2.0-cdh5.3.1</version>
-          <scope>provided</scope>
-      </dependency>
-
-      <dependency>
-          <groupId>org.apache.spark</groupId>
-          <artifactId>spark-sql_2.10</artifactId>
-          <version>1.2.0-cdh5.3.1</version>
-          <scope>provided</scope>
+        <groupId>com.fasterxml.jackson.core</groupId>
+        <artifactId>jackson-databind</artifactId>
+        <version>2.5.2</version>
       </dependency>
 
     </dependencies>
@@ -310,6 +270,10 @@
       <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
       <skip.tests>false</skip.tests>
       <java.version>1.7</java.version>
+      <hadoop.version>2.5.0-cdh5.3.1</hadoop.version>
+      <hive.version>0.13.1-cdh5.3.1</hive.version>
+      <scala.version>2.10.4</scala.version>
+      <spark.version>1.2.0-cdh5.3.1</spark.version>
     </properties>
 
   </project>
diff --git a/refinery-core/pom.xml b/refinery-core/pom.xml
index e42355b..7ac0d8f 100644
--- a/refinery-core/pom.xml
+++ b/refinery-core/pom.xml
@@ -13,14 +13,18 @@
     <packaging>jar</packaging>
 
     <dependencies>
-        <dependency>
+        <!--<dependency>
             <groupId>org.apache.hadoop</groupId>
-            <artifactId>hadoop-common</artifactId>
-        </dependency>
+            <artifactId>hadoop-client</artifactId>
+            <version>${hadoop.version}</version>
+            <scope>provided</scope>
+        </dependency>-->
 
         <dependency>
             <groupId>org.apache.hadoop</groupId>
-            <artifactId>hadoop-client</artifactId>
+            <artifactId>hadoop-common</artifactId>
+            <version>${hadoop.version}</version>
+            <scope>provided</scope>
         </dependency>
 
         <dependency>
@@ -66,6 +70,10 @@
             <artifactId>json-simple</artifactId>
         </dependency>
 
+        <dependency>
+            <groupId>com.fasterxml.jackson.core</groupId>
+            <artifactId>jackson-databind</artifactId>
+        </dependency>
     </dependencies>
 
     <build>
diff --git 
a/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/PageviewDefinition.java
 
b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/PageviewDefinition.java
index eb3f77c..808c857 100644
--- 
a/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/PageviewDefinition.java
+++ 
b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/PageviewDefinition.java
@@ -48,19 +48,20 @@
      * Now back to the good part.
      */
     private final Pattern uriHostWikimediaDomainPattern = Pattern.compile(
-        "(commons|meta|incubator|species)\\."   // any of these domain names
-        + "((m|mobile|wap|zero)\\.)?"           // followed by an optional 
mobile or zero qualifier
-        + "wikimedia\\.org$"                    // ending with wikimedia.org
+        "(commons|meta|incubator|species|outreach)\\."   // any of these 
domain names
+        + "((m|mobile|wap|zero)\\.)?"                    // followed by an 
optional mobile or zero qualifier
+        + "wikimedia\\.org$"                             // ending with 
wikimedia.org
     );
 
     private final Pattern uriHostProjectDomainPattern = Pattern.compile(
-        "(?<!www)\\."                           // not starting with "www"
+        "(?<!(www\\.|test))"              // not starting with "www." or "test"
         + "(wik(ibooks|"                  // match project domains ending in 
.org
         + "inews|ipedia|iquote|isource|tionary|iversity|ivoyage))\\.org$"
     );
 
     private final Pattern uriHostOtherProjectsPattern = Pattern.compile(
-        "(wikidata|mediawiki)\\.org$"
+        "(?<!test)"                                          // not starting 
with "test"
+        + "(wikidata|mediawiki|wikimediafoundation)\\.org$"  // match project 
domains ending in .org
     );
 
     private final Pattern uriPathPattern = Pattern.compile(
@@ -94,6 +95,15 @@
     private final HashSet<String> httpStatusesSet = new 
HashSet<String>(Arrays.asList(
         "200",
         "304"
+    ));
+
+    private final HashSet<String> uriPortionsToRemove = new 
HashSet<String>(Arrays.asList(
+            "m",
+            "mobile",
+            "wap",
+            "zero",
+            "www",
+            "download"
     ));
 
     /**
@@ -193,4 +203,41 @@
             && !Utilities.patternIsFound(uriQueryUnwantedActions, uriQuery)
         );
     }
-}
\ No newline at end of file
+
+    /**
+     * Identifies a project from a pageview uriHost
+     * NOTE: Provides correct result only if used with is_pageview = true
+     *
+     * @param uriHost The url's host
+     * @return The project identifier in format [xxx.]xxxx (en.wikipedia or 
wikisource for instance)
+     */
+    public String getProjectFromHost(String uriHost) {
+        if (uriHost == null) return "-";
+        String[] uri_parts = uriHost.toLowerCase().split("\\.");
+        switch (uri_parts.length) {
+            // case wikixxx.org
+            case 2:
+                return uri_parts[0];
+            //case xx.wikixxx.org - Remove unwanted parts
+            case 3:
+                if (uriPortionsToRemove.contains(uri_parts[0]))
+                    return uri_parts[1];
+                else
+                    return uri_parts[0] + "." + uri_parts[1];
+            //xx.[m|mobile|wap|zero].wikixxx.org - Remove unwanted parts
+            case 4:
+                if (uriPortionsToRemove.contains(uri_parts[0]))
+                    return uri_parts[2];
+                else
+                    return uri_parts[0] + "." + uri_parts[2];
+            //xx.[m|mobile|wap|zero].[m|mobile|wap|zero].wikixxx.org - Remove 
unwanted parts
+            case 5:
+                if (uriPortionsToRemove.contains(uri_parts[0]))
+                    return uri_parts[3];
+                else
+                    return uri_parts[0] + "." + uri_parts[3];
+            default:
+                return "-";
+        }
+    }
+}
diff --git 
a/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/Webrequest.java
 
b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/Webrequest.java
index 800d5f2..2251918 100644
--- 
a/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/Webrequest.java
+++ 
b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/Webrequest.java
@@ -42,6 +42,13 @@
     }
 
     /*
+     * Constant string results for referer classification
+     */
+    public static final String REFERER_UNKNOWN = "unknown";
+    public static final String REFERER_INTERNAL = "internal";
+    public static final String REFERER_EXTERNAL = "external";
+
+    /*
      * Now back to the good part.
      * Wikimedia-specific crawlers
      */
@@ -54,7 +61,7 @@
      * or some similar portal-based interface to MW.
      */
     private static final Pattern uriHostPattern = Pattern.compile(
-        "\\.(m|zero)\\."
+        "(^(m|zero|wap|mobile)\\.)|(\\.(m|zero|wap|mobile)\\.)"
     );
 
     /**
@@ -131,54 +138,39 @@
     }
 
     /**
-     * Classification for referers
-     * <p>
-     * <ul>
-     * <li>A referer from a WMF domain translates into “internal”.</li>
-     * <li>A referer from a non-WMF domain translates into “external".</li>
-     * <li>An empty or invalid refer translates into “unknown".</li>
-     * </ul>
-     */
-    public enum RefererClassification {
-        UNKNOWN,
-        INTERNAL,
-        EXTERNAL
-    }
-
-    /**
      * Classifies a referer
      *
      * @param url The referer url to classify
      * @return RefererClassification
      */
-    public static RefererClassification classify(String url) {
+    public String classifyReferer(String url) {
         if (url == null || url.isEmpty() || url.equals("-")) {
-            return RefererClassification.UNKNOWN;
+            return REFERER_UNKNOWN;
         }
 
         String[] urlParts = StringUtils.splitPreserveAllTokens(url, '/');
         if (urlParts == null || urlParts.length <3) {
-            return RefererClassification.UNKNOWN;
+            return REFERER_UNKNOWN;
         }
 
         if (!urlParts[0].equals("http:") && !urlParts[0].equals("https:")) {
-            return RefererClassification.UNKNOWN;
+            return REFERER_UNKNOWN;
         }
 
         if (!urlParts[1].isEmpty()) {
-            return RefererClassification.UNKNOWN;
+            return REFERER_UNKNOWN;
         }
 
         String[] domainParts = StringUtils.splitPreserveAllTokens(urlParts[2], 
'.');
 
         if (domainParts == null || domainParts.length <2) {
-            return RefererClassification.UNKNOWN;
+            return REFERER_UNKNOWN;
         }
 
         if (domainParts[domainParts.length-1].equals("org")) {
             switch (domainParts[domainParts.length-2]) {
             case "":
-                return RefererClassification.UNKNOWN;
+                return REFERER_UNKNOWN;
             case "mediawiki":
             case "wikibooks":
             case "wikidata":
@@ -191,9 +183,12 @@
             case "wikiversity":
             case "wikivoyage":
             case "wiktionary":
-                return RefererClassification.INTERNAL;
+                return REFERER_INTERNAL;
             }
         }
-        return RefererClassification.EXTERNAL;
+        return REFERER_EXTERNAL;
     }
+
+
+
 }
\ No newline at end of file
diff --git 
a/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestLegacyPageviewDefinition.java
 
b/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestLegacyPageviewDefinition.java
index 1460c7f..ca5d6c7 100644
--- 
a/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestLegacyPageviewDefinition.java
+++ 
b/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestLegacyPageviewDefinition.java
@@ -33,6 +33,7 @@
     )
     public void testIsLegacyPageview(
         String test_description,
+        String project,
         boolean is_pageview,
         boolean is_legacy_pageview,
         boolean is_app_pageview,
diff --git 
a/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestPageview.java
 
b/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestPageview.java
index 904e136..684fead 100644
--- 
a/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestPageview.java
+++ 
b/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestPageview.java
@@ -32,6 +32,7 @@
     )
     public void testIsPageview(
         String test_description,
+        String project,
         boolean is_pageview,
         boolean is_legacy_pageview,
         boolean is_app_pageview,
@@ -66,6 +67,7 @@
     )
     public void testIsAppPageview(
         String test_description,
+        String project,
         boolean is_pageview,
         boolean is_legacy_pageview,
         boolean is_app_pageview,
@@ -83,11 +85,39 @@
             test_description,
             is_app_pageview,
             PageviewDefinitionInstance.isAppPageview(
-                uri_path,
-                uri_query,
-                content_type,
-                user_agent
+                    uri_path,
+                    uri_query,
+                    content_type,
+                    user_agent
             )
         );
     }
+
+    @Test
+    @FileParameters(
+            value = "src/test/resources/pageview_test_data.csv",
+            mapper = CsvWithHeaderMapper.class
+    )
+    public void testGetProjectFromHost(
+            String test_description,
+            String project,
+            boolean is_pageview,
+            boolean is_legacy_pageview,
+            boolean is_app_pageview,
+            String ip_address,
+            String x_forwarded_for,
+            String uri_host,
+            String uri_path,
+            String uri_query,
+            String http_status,
+            String content_type,
+            String user_agent
+    ) {
+        PageviewDefinition PageviewDefinitionInstance = 
PageviewDefinition.getInstance();
+        assertEquals(
+                test_description,
+                project,
+                PageviewDefinitionInstance.getProjectFromHost(uri_host)
+        );
+    }
 }
\ No newline at end of file
diff --git 
a/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestUAParserUserAgentMostPopular.java
 
b/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestUAParserUserAgentMostPopular.java
index 9f9cf43..d6e8d3d 100644
--- 
a/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestUAParserUserAgentMostPopular.java
+++ 
b/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestUAParserUserAgentMostPopular.java
@@ -57,7 +57,6 @@
 
 
         // decode expected output and turn it into an object
-        System.out.println(jsonMapResult);
         Object obj = jsonParser.parse(jsonMapResult);
         JSONObject expected_ua = (JSONObject) obj;
 
diff --git 
a/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestWebrequestRefererClassifier.java
 
b/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestWebrequestRefererClassifier.java
index 38e4f8c..3af0a96 100644
--- 
a/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestWebrequestRefererClassifier.java
+++ 
b/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestWebrequestRefererClassifier.java
@@ -14,7 +14,6 @@
 
 package org.wikimedia.analytics.refinery.core;
 
-import org.wikimedia.analytics.refinery.core.Webrequest.RefererClassification;
 
 import junit.framework.TestCase;
 
@@ -22,23 +21,23 @@
 
     // Helper methods ---------------------------------------------------------
 
-    private void assertKind(final String url, final RefererClassification 
expected) {
-        RefererClassification actual = Webrequest.classify(url);
+    private void assertKind(final String url, final String expected) {
+        String actual = Webrequest.getInstance().classifyReferer(url);
 
         assertEquals("Identification output does not match expected",
                 expected, actual);
     }
 
     private void assertUnknown(final String url) {
-        assertKind(url, RefererClassification.UNKNOWN);
+        assertKind(url, Webrequest.REFERER_UNKNOWN);
     }
 
     private void assertInternal(final String url) {
-        assertKind(url, RefererClassification.INTERNAL);
+        assertKind(url, Webrequest.REFERER_INTERNAL);
     }
 
     private void assertExternal(final String url) {
-        assertKind(url, RefererClassification.EXTERNAL);
+        assertKind(url, Webrequest.REFERER_EXTERNAL);
     }
 
     // Test degernerate settings ----------------------------------------------
diff --git a/refinery-core/src/test/resources/access_method_test_data.csv 
b/refinery-core/src/test/resources/access_method_test_data.csv
index c2e9d48..37b108e 100644
--- a/refinery-core/src/test/resources/access_method_test_data.csv
+++ b/refinery-core/src/test/resources/access_method_test_data.csv
@@ -1,6 +1,12 @@
 test_description,expected_method, uri_host, user_agent
 Desktop request,desktop, en.wikipedia.org, turnip
-Mobile web request through m.wikipedia,mobile web,en.m.wikipedia.org,turnip
+Mobile web request through en.m.wikipedia,mobile web,en.m.wikipedia.org,turnip
+Mobile web request through en.mobile.wikipedia,mobile 
web,en.mobile.wikipedia.org,turnip
+Mobile web request through en.wap.wikipedia,mobile 
web,en.wap.wikipedia.org,turnip
 Mobile web request through zero,mobile web,en.zero.wikipedia.org,rutabaga
+Mobile web request through m.wikipedia,mobile web,m.wikipedia.org,rutabaga
+Mobile web request through mobile.wikipedia,mobile 
web,mobile.wikipedia.org,rutabaga
+Mobile web request through wap.wikipedia,mobile web,wap.wikipedia.org,rutabaga
+Mobile web request through zero,mobile web,zero.wikipedia.org,rutabaga
 Mobile apps request through mobile API,mobile app,en.m.wikipedia.org, 
WikipediaApp/1.2.3
 Mobile apps request through desktop API,mobile app, en.wikipedia.org, 
WikipediaApp/1.2.3
diff --git a/refinery-core/src/test/resources/pageview_test_data.csv 
b/refinery-core/src/test/resources/pageview_test_data.csv
index 985eee1..49f9124 100644
--- a/refinery-core/src/test/resources/pageview_test_data.csv
+++ b/refinery-core/src/test/resources/pageview_test_data.csv
@@ -1,28 +1,32 @@
-test_description, 
is_pageview,is_legacy_pageview,is_app_pageview,ip_address,x_forwarded_for, 
uri_host, uri_path, uri_query, http_status, content_type, user_agent
-Is Pageview - Desktop, true,true,false,174.62.175.82,-,en.wikipedia.org, 
/wiki/Horseshoe_crab,-,200,text/html, turnip
-Is Pageview – Desktop – locally cached content, 
true,true,false,174.62.175.82,-,en.wikipedia.org, 
/wiki/Horseshoe_crab,-,304,text/html, turnip
-Is Pageview – App - Android, true,false,true,174.62.175.83,-,en.wikipedia.org, 
/w/api.php,action=mobileview&format=json&page=Hachiko_–_Eine_wunderbare_Freundschaft&prop=text&sections=0,200,
 application/json,WikipediaApp/2.0-r-2015-01-15 (Android 4.4.2; Phone) Google 
Play
-is Pageview – App – iOS – old version, 
true,false,true,174.62.175.83,-,en.wikipedia.org, 
/w/api.php,action=mobileview&format=json&page=Hachiko_–_Eine_wunderbare_Freundschaft&prop=text&sections=0,200,
 application/json,WikipediaApp/4.0.6 (iPhone OS 8.2; Phone)
-Is Pageview – App -  iOS – new version, 
true,false,true,174.62.175.83,-,en.wikipedia.org, 
/w/api.php,action=mobileview&format=json&page=Hachiko_–_Eine_wunderbare_Freundschaft&prop=text&sections=all,200,
 application/json,WikipediaApp/4.0.6 (iPhone OS 8.2; Phone)
-Is Pageview – Mobile Web, 
true,true,false,174.62.175.84,-,en.m.wikipedia.org,/wiki/Bernard_Manning,-,200,text/html,rutabaga
-Is Pageview – Desktop - Serbian sr-ec, 
true,false,false,174.62.175.85,-,sr.wikipedia.org,/sr-ec/Историја_Срба_пре_Немањића,-,200,text/html,Three-finger
 salute
-Is Pageview – Desktop - Serbian sr-el, 
true,false,false,174.62.175.86,-,sr.wikipedia.org,/sr-el/Историја_Срба_пре_Немањића,-,200,text/html,Three-finger
 salute
-Is Pageview – Desktop - Chinese zh-cn, 
true,false,false,174.62.175.87,-,zh.wikipedia.org,/zh-cn/Wikipedia:首页,-,200,text/html,Five-test
 plan
-Is Pageview – Desktop - Chinese zh-hans, 
true,false,false,174.62.175.88,-,zh.wikipedia.org,/zh-hans/Wikipedia:首页,-,200,text/html,Five-test
 plan
-Is Pageview – Desktop - Chinese zh-hant, 
true,false,false,174.62.175.89,-,zh.wikipedia.org,/zh-hant/Wikipedia:首页,-,200,text/html,Five-test
 plan
-Is Pageview – Desktop - Chinese zh-hk, 
true,false,false,174.62.175.90,-,zh.wikipedia.org,/zh-hk/Wikipedia:foo,-,200,text/html,Five-test
 plan
-Is Pageview – Desktop - Chinese zh-mo, 
true,false,false,174.62.175.91,-,zh.wikipedia.org,/zh-mo/Wikipedia:首页,-,200,text/html,Five-test
 plan
-Is Pageview – Desktop - Chinese zh-my, 
true,false,false,174.62.175.92,-,zh.wikipedia.org,/zh-my/Wikipedia:首页,-,200,text/html,Five-test
 plan
-Is Pageview – Desktop - Chinese zh-sg, 
true,false,false,174.62.175.93,-,zh.wikipedia.org,/zh-sg/Wikipedia:首页,-,200,text/html,Five-test
 plan
-Is Pageview – Desktop - Chinese zh-tw, 
true,false,false,174.62.175.94,-,zh.wikipedia.org,/zh-tw/Wikipedia:首页,-,200,text/html,Five-test
 plan
-Is Pageview – Wikidata, true, 
true,false,174.62.175.94,-,www.wikidata.org,/wiki/Q5651758,-,200,text/html,Five-test
 plan
-Is Pageview – MediaWiki, true, 
true,false,174.62.175.94,-,www.mediawiki.org,/wiki/Gerrit/git-review,-,200,text/html,Five-test
 plan
-Is Pageview – iOS search, 
true,false,false,174.62.175.94,-,en.wikipedia.org,/,?search=afdfsdfsd,200,text/html,Five-test
 plan
-Is Not Pageview - http_status != 200, false,true,false,174.62.175.95,-, 
en.wikipedia.org, /wiki/Noppperrrrs,-,400,text/html ,turnip
-Is Not Pageview - content_type does not match, 
false,true,false,174.62.175.96,-, en.wikipedia.org, /wiki/Noppperrrrs,-,200, 
image/png, turnip
-Is Not Pageview - API stupidity: it outputs a 200 status code and text/html as 
a MIME type on certain classes of error., false, false,false,174.62.175.97,-, 
en.wikipedia.org, /w/api.php,-,200, text/html, turnip
-Is Not Pageview – App request for non-page content, false, 
false,false,174.62.175.98,-, en.wikipedia.org, 
/w/api.php,?action=query&format=json&titles=Foo&prop=pageimages&piprop=thumbnail&pithumbsize=96&pilimit=1
    ,200, application/json, WikipediaApp/1.2.3
-Is Not Pageview – Non-App request for page content, false, 
false,false,174.62.175.99,-, en.wikipedia.org, 
/w/api.php,?action=mobileview&sections=0,200, application/json, TributeApp/1.2.3
-Is Not Pageview – edit 
attempt,false,true,false,174.62.175.82,-,en.wikipedia.org, 
/wiki/Horseshoe_crab,?action=edit,200,text/html, turnip
-Is not pageview – non-wikidata raw domain,false, 
false,false,174.62.175.82,-,www.wikipedia.org,-,-,200,text/html, turnip
-Is Not Pageview – App – Android – Refresh,false, 
false,false,174.62.175.82,-,en.wikipedia.org, 
/w/api.php,action=mobileview&format=json&page=Hachiko_–_Eine_wunderbare_Freundschaft&prop=text&sections=all,200,
 application/json,WikipediaApp/2.0-r-2015-01-15 (Android 4.4.2; Phone) Google 
Play
+test_description,project,is_pageview,is_legacy_pageview,is_app_pageview,ip_address,x_forwarded_for,uri_host,uri_path,uri_query,http_status,content_type,user_agent
+Is Pageview - 
Desktop,en.wikipedia,true,true,false,174.62.175.82,-,en.wikipedia.org, 
/wiki/Horseshoe_crab,-,200,text/html, turnip
+Is Pageview – Desktop – locally cached 
content,en.wikipedia,true,true,false,174.62.175.82,-,en.wikipedia.org, 
/wiki/Horseshoe_crab,-,304,text/html, turnip
+Is Pageview – App - 
Android,en.wikipedia,true,false,true,174.62.175.83,-,en.wikipedia.org, 
/w/api.php,action=mobileview&format=json&page=Hachiko_–_Eine_wunderbare_Freundschaft&prop=text&sections=0,200,
 application/json,WikipediaApp/2.0-r-2015-01-15 (Android 4.4.2; Phone) Google 
Play
+is Pageview – App – iOS – old 
version,en.wikipedia,true,false,true,174.62.175.83,-,en.wikipedia.org, 
/w/api.php,action=mobileview&format=json&page=Hachiko_–_Eine_wunderbare_Freundschaft&prop=text&sections=0,200,
 application/json,WikipediaApp/4.0.6 (iPhone OS 8.2; Phone)
+Is Pageview – App -  iOS – new 
version,en.wikipedia,true,false,true,174.62.175.83,-,en.wikipedia.org, 
/w/api.php,action=mobileview&format=json&page=Hachiko_–_Eine_wunderbare_Freundschaft&prop=text&sections=all,200,
 application/json,WikipediaApp/4.0.6 (iPhone OS 8.2; Phone)
+Is Pageview – Mobile 
Web,en.wikipedia,true,true,false,174.62.175.84,-,en.m.wikipedia.org,/wiki/Bernard_Manning,-,200,text/html,rutabaga
+Is Pageview – Desktop - Serbian 
sr-ec,sr.wikipedia,true,false,false,174.62.175.85,-,sr.wikipedia.org,/sr-ec/Историја_Срба_пре_Немањића,-,200,text/html,Three-finger
 salute
+Is Pageview – Desktop - Serbian 
sr-el,sr.wikipedia,true,false,false,174.62.175.86,-,sr.wikipedia.org,/sr-el/Историја_Срба_пре_Немањића,-,200,text/html,Three-finger
 salute
+Is Pageview – Desktop - Chinese 
zh-cn,zh.wikipedia,true,false,false,174.62.175.87,-,zh.wikipedia.org,/zh-cn/Wikipedia:首页,-,200,text/html,Five-test
 plan
+Is Pageview – Desktop - Chinese 
zh-hans,zh.wikipedia,true,false,false,174.62.175.88,-,zh.wikipedia.org,/zh-hans/Wikipedia:首页,-,200,text/html,Five-test
 plan
+Is Pageview – Desktop - Chinese 
zh-hant,zh.wikipedia,true,false,false,174.62.175.89,-,zh.wikipedia.org,/zh-hant/Wikipedia:首页,-,200,text/html,Five-test
 plan
+Is Pageview – Desktop - Chinese 
zh-hk,zh.wikipedia,true,false,false,174.62.175.90,-,zh.wikipedia.org,/zh-hk/Wikipedia:foo,-,200,text/html,Five-test
 plan
+Is Pageview – Desktop - Chinese 
zh-mo,zh.wikipedia,true,false,false,174.62.175.91,-,zh.wikipedia.org,/zh-mo/Wikipedia:首页,-,200,text/html,Five-test
 plan
+Is Pageview – Desktop - Chinese 
zh-my,zh.wikipedia,true,false,false,174.62.175.92,-,zh.wikipedia.org,/zh-my/Wikipedia:首页,-,200,text/html,Five-test
 plan
+Is Pageview – Desktop - Chinese 
zh-sg,zh.wikipedia,true,false,false,174.62.175.93,-,zh.wikipedia.org,/zh-sg/Wikipedia:首页,-,200,text/html,Five-test
 plan
+Is Pageview – Desktop - Chinese 
zh-tw,zh.wikipedia,true,false,false,174.62.175.94,-,zh.wikipedia.org,/zh-tw/Wikipedia:首页,-,200,text/html,Five-test
 plan
+Is Pageview – 
Wikidata,wikidata,true,true,false,174.62.175.94,-,www.wikidata.org,/wiki/Q5651758,-,200,text/html,Five-test
 plan
+Is Pageview – 
MediaWiki,mediawiki,true,true,false,174.62.175.94,-,www.mediawiki.org,/wiki/Gerrit/git-review,-,200,text/html,Five-test
 plan
+Is Pageview – iOS 
search,en.wikipedia,true,false,false,174.62.175.94,-,en.wikipedia.org,/,?search=afdfsdfsd,200,text/html,Five-test
 plan
+Is Not Pageview - http_status != 
200,en.wikipedia,false,true,false,174.62.175.95,-, en.wikipedia.org, 
/wiki/Noppperrrrs,-,400,text/html ,turnip
+Is Not Pageview - content_type does not 
match,en.wikipedia,false,true,false,174.62.175.96,-, en.wikipedia.org, 
/wiki/Noppperrrrs,-,200, image/png, turnip
+Is Not Pageview - API stupidity: it outputs a 200 status code and text/html as 
a MIME type on certain classes of 
error,en.wikipedia,false,false,false,174.62.175.97,-, en.wikipedia.org, 
/w/api.php,-,200, text/html, turnip
+Is Not Pageview – App request for non-page 
content,en.wikipedia,false,false,false,174.62.175.98,-,en.wikipedia.org,/w/api.php,?action=query&format=json&titles=Foo&prop=pageimages&piprop=thumbnail&pithumbsize=96&pilimit=1
    ,200, application/json, WikipediaApp/1.2.3
+Is Not Pageview – Non-App request for page content,en.wikipedia,false, 
false,false,174.62.175.99,-, en.wikipedia.org, 
/w/api.php,?action=mobileview&sections=0,200, application/json, TributeApp/1.2.3
+Is Not Pageview – edit 
attempt,en.wikipedia,false,true,false,174.62.175.82,-,en.wikipedia.org, 
/wiki/Horseshoe_crab,?action=edit,200,text/html, turnip
+Is not pageview – non-wikidata raw 
domain,wikipedia,false,false,false,174.62.175.82,-,www.wikipedia.org,-,-,200,text/html,
 turnip
+Is Not Pageview – App – Android – Refresh,en.wikipedia,false, 
false,false,174.62.175.82,-,en.wikipedia.org, 
/w/api.php,action=mobileview&format=json&page=Hachiko_–_Eine_wunderbare_Freundschaft&prop=text&sections=all,200,
 application/json,WikipediaApp/2.0-r-2015-01-15 (Android 4.4.2; Phone) Google 
Play
+"Is Pageview - mobile - 
wikimediafoundation",wikimediafoundation,true,true,false,174.62.175.82,-,m.wikimediafoundation.org,/wiki/Horseshoe_crab,-,200,text/html,turnip
+"Is Pageview - Desktop - 
wikimediafoundation",wikimediafoundation,true,false,false,174.62.175.82,-,wikimediafoundation.org,/wiki/Horseshoe_crab,-,200,text/html,turnip
+Is Not Pageview - Desktop,test2.wikipedia, 
true,true,false,174.62.175.82,-,test2.wikipedia.org, 
/wiki/Horseshoe_crab,-,200,text/html, turnip
+Is Not Pageview - 
Desktop,test.wikimediafoundation,true,true,false,174.62.175.82,-,test.wikimediafoundation.org,
 /wiki/Horseshoe_crab,-,200,text/html, turnip
\ No newline at end of file
diff --git a/refinery-hive/pom.xml b/refinery-hive/pom.xml
index 01e2e8f..b957024 100644
--- a/refinery-hive/pom.xml
+++ b/refinery-hive/pom.xml
@@ -31,15 +31,26 @@
             <artifactId>refinery-core</artifactId>
         </dependency>
 
-        <dependency>
-            <groupId>org.apache.hadoop</groupId>
-            <artifactId>hadoop-common</artifactId>
-        </dependency>
+       <dependency>
+           <groupId>org.apache.hadoop</groupId>
+           <artifactId>hadoop-client</artifactId>
+           <version>${hadoop.version}</version>
+           <scope>provided</scope>
+       </dependency>
 
-        <dependency>
-            <groupId>org.apache.hive</groupId>
-            <artifactId>hive-exec</artifactId>
-        </dependency>
+       <dependency>
+           <groupId>org.apache.hadoop</groupId>
+           <artifactId>hadoop-common</artifactId>
+           <version>${hadoop.version}</version>
+           <scope>provided</scope>
+       </dependency>
+
+       <dependency>
+           <groupId>org.apache.hive</groupId>
+           <artifactId>hive-exec</artifactId>
+           <version>${hive.version}</version>
+           <scope>provided</scope>
+       </dependency>
 
         <dependency>
             <groupId>com.googlecode.json-simple</groupId>
diff --git 
a/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GetProjectUDF.java
 
b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GetProjectUDF.java
new file mode 100644
index 0000000..0552a84
--- /dev/null
+++ 
b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GetProjectUDF.java
@@ -0,0 +1,60 @@
+/**
+ * Copyright (C) 2014  Wikimedia Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.wikimedia.analytics.refinery.hive;
+
+import org.apache.hadoop.hive.ql.exec.Description;
+import org.apache.hadoop.hive.ql.exec.UDF;
+import org.wikimedia.analytics.refinery.core.PageviewDefinition;
+
+
+/**
+ * A Hive UDF to identify a Wikimedia webrequest pageview project.
+ * NOTE: this udf only works well if the uri_host comes from
+ * a webrequest having is_pageview = true
+ *
+ * <p>
+ * Hive Usage:
+ *   ADD JAR /path/to/refinery-hive.jar;
+ *   CREATE TEMPORARY FUNCTION get_project AS
+ *     'org.wikimedia.analytics.refinery.hive.GetProjectUDF';
+ *   SELECT
+ *     get_project(uri_host) as project_qualifier,
+ *     count(*) as cnt
+ *   FROM
+ *     wmf_raw.webrequest
+ *   WHERE
+ *    webrequest_source = 'mobile'
+ *     AND year=2014
+ *     AND month=12
+ *     AND day=7
+ *     AND hour=12
+ *     AND is_pageview(uri_host, uri_path, uri_query, http_status, 
content_type, user_agent)
+ *   GROUP BY
+ *     get_project(uri_host)
+ *   ORDER BY cnt desc
+ *   LIMIT 10
+ *   ;
+ */
+@Description(name = "get_project",
+        value = "_FUNC_(uri_host) - Returns the project identifier for the 
pageview request.",
+        extended = "")
+public class GetProjectUDF extends UDF {
+    public String evaluate(String uriHost) {
+        PageviewDefinition pageviewDefinitionInstance = 
PageviewDefinition.getInstance();
+        return pageviewDefinitionInstance.getProjectFromHost(uriHost);
+    }
+}
\ No newline at end of file
diff --git 
a/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsPageviewUDF.java
 
b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsPageviewUDF.java
index 229d1ce..5314729 100644
--- 
a/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsPageviewUDF.java
+++ 
b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsPageviewUDF.java
@@ -16,6 +16,7 @@
 
 package org.wikimedia.analytics.refinery.hive;
 
+import org.apache.hadoop.hive.ql.exec.Description;
 import org.apache.hadoop.hive.ql.exec.UDF;
 import org.wikimedia.analytics.refinery.core.PageviewDefinition;
 
@@ -48,6 +49,9 @@
  *   LIMIT 10
  *   ;
  */
+@Description(name = "is_pageview",
+        value = "_FUNC_(uri_host, uri_path, uri_query, http_status, 
content_type, user_agent) - Returns true if the request is a pageview",
+        extended = "")
 public class IsPageviewUDF extends UDF {
     public boolean evaluate(
         String uriHost,
diff --git 
a/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/RefererClassifierUDF.java
 
b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/RefererClassifierUDF.java
index 05df0a2..2b9f204 100644
--- 
a/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/RefererClassifierUDF.java
+++ 
b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/RefererClassifierUDF.java
@@ -14,108 +14,18 @@
 
 package org.wikimedia.analytics.refinery.hive;
 
-import org.apache.hadoop.hive.ql.exec.Description;
-import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
-import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
-import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
-import org.apache.hadoop.hive.ql.metadata.HiveException;
-import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
-import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
-import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
-import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
-import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;
-import org.wikimedia.analytics.refinery.core.Webrequest;
-import org.wikimedia.analytics.refinery.core.Webrequest.RefererClassification;
 
-import java.util.LinkedList;
-import java.util.List;
+import org.apache.hadoop.hive.ql.exec.Description;
+import org.apache.hadoop.hive.ql.exec.UDF;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.wikimedia.analytics.refinery.core.Webrequest;
 
 @Description(name = "referer_classifier",
-    value = "_FUNC_(url) - Returns a map with a classification of a referer",
+    value = "_FUNC_(url) - Returns a string with a classification of a referer 
(unknown, internal, external)",
     extended = "argument 0 is the url to analyze")
-public class RefererClassifierUDF extends GenericUDF {
-    private Object[] result;
+public class RefererClassifierUDF extends UDF {
 
-    private StringObjectInspector inputOI;
-
-    private int IDX_IS_UNKNOWN;
-    private int IDX_IS_INTERNAL;
-    private int IDX_IS_EXTERNAL;
-
-    @Override
-    public ObjectInspector initialize(ObjectInspector[] arguments)
-            throws UDFArgumentException {
-        // We need exactly 1 parameter
-        if (arguments == null || arguments.length != 1) {
-            throw new UDFArgumentLengthException("The function "
-                    + "RefererClassifierUDF expects exactly 1 parameter");
-        }
-
-        // ... and the parameter has to be a string
-        if (!(arguments[0] instanceof StringObjectInspector)) {
-            throw new UDFArgumentTypeException(0, "The parameter to "
-                    + "RefererClassifierUDF has to be a string");
-        }
-
-        inputOI = (StringObjectInspector) arguments[0];
-
-        List<String> fieldNames = new LinkedList<String>();
-        List<ObjectInspector> fieldOIs= new LinkedList<ObjectInspector>();
-        int idx = 0;
-
-        fieldNames.add("is_unknown");
-        
fieldOIs.add(PrimitiveObjectInspectorFactory.javaBooleanObjectInspector);
-        IDX_IS_UNKNOWN=idx++;
-
-        fieldNames.add("is_internal");
-        
fieldOIs.add(PrimitiveObjectInspectorFactory.javaBooleanObjectInspector);
-        IDX_IS_INTERNAL=idx++;
-
-        fieldNames.add("is_external");
-        
fieldOIs.add(PrimitiveObjectInspectorFactory.javaBooleanObjectInspector);
-        IDX_IS_EXTERNAL=idx++;
-
-        result = new Object[idx];
-
-        return 
ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs);
-    }
-
-    @Override
-    public Object evaluate(DeferredObject[] arguments) throws HiveException {
-        assert arguments != null : "Method 'evaluate' of RefererClassifierUDF "
-                + "called with null arguments array";
-        assert arguments.length == 1 : "Method 'evaluate' of "
-                + "RefererClassifierUDF called arguments of length "
-                + arguments.length + " (instead of 1)";
-        // arguments is an array with exactly 1 entry.
-
-        assert result != null : "Result object has not yet been initialized, "
-                + "but evaluate called";
-        // result object has been initialized. So it's an array of objects of
-        // the right length.
-
-        String url = inputOI.getPrimitiveJavaObject(arguments[0].get());
-
-        RefererClassification kind = Webrequest.classify(url);
-
-        result[IDX_IS_UNKNOWN] = kind == RefererClassification.UNKNOWN;
-        result[IDX_IS_INTERNAL] = kind == RefererClassification.INTERNAL;
-        result[IDX_IS_EXTERNAL] = kind == RefererClassification.EXTERNAL;
-
-        return result;
-    }
-
-    @Override
-    public String getDisplayString(String[] arguments) {
-        String argument;
-        if (arguments == null) {
-            argument = "<arguments == null>";
-        } else if (arguments.length == 1) {
-            argument = arguments[0];
-        } else {
-            argument = "<arguments of length " + arguments.length + ">";
-        }
-        return "referer_classifier(" + argument +")";
-
+    public String evaluate(String url) throws HiveException {
+        return Webrequest.getInstance().classifyReferer(url);
     }
 }
diff --git 
a/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestGetProjectUDF.java
 
b/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestGetProjectUDF.java
new file mode 100644
index 0000000..951921a
--- /dev/null
+++ 
b/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestGetProjectUDF.java
@@ -0,0 +1,70 @@
+/**
+ * Copyright (C) 2014  Wikimedia Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.wikimedia.analytics.refinery.hive;
+
+import junitparams.FileParameters;
+import junitparams.JUnitParamsRunner;
+import junitparams.mappers.CsvWithHeaderMapper;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+
+import static org.junit.Assert.assertEquals;
+
+@RunWith(JUnitParamsRunner.class)
+public class TestGetProjectUDF {
+
+
+    @Test
+    @FileParameters(
+        value = "../refinery-core/src/test/resources/pageview_test_data.csv",
+        mapper = CsvWithHeaderMapper.class
+    )
+    public void testGetProject(
+        String test_description,
+        String project,
+        boolean is_pageview,
+        boolean is_legacy_pageview,
+        boolean is_app_pageview,
+        String ip_address,
+        String x_forwarded_for,
+        String uri_host,
+        String uri_path,
+        String uri_query,
+        String http_status,
+        String content_type,
+        String user_agent
+    ) {
+        GetProjectUDF udf = new GetProjectUDF();
+
+        assertEquals(
+            test_description,
+            project,
+            udf.evaluate(uri_host)
+        );
+    }
+
+    @Test
+    public void testGetProjectNull() {
+        GetProjectUDF udf = new GetProjectUDF();
+
+        assertEquals(
+                "Test null input to getProject",
+                "-",
+                udf.evaluate(null)
+        );
+    }
+
+}
\ No newline at end of file
diff --git 
a/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsAppPageviewUDF.java
 
b/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsAppPageviewUDF.java
index 8e80101..f36ead9 100644
--- 
a/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsAppPageviewUDF.java
+++ 
b/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsAppPageviewUDF.java
@@ -33,6 +33,7 @@
     )
     public void testIsAppPageview(
         String test_description,
+        String project,
         boolean is_pageview,
         boolean is_legacy_pageview,
         boolean is_app_pageview,
diff --git 
a/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsLegacyPageviewUDF.java
 
b/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsLegacyPageviewUDF.java
index ff8f9ba..5dbdb8a 100644
--- 
a/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsLegacyPageviewUDF.java
+++ 
b/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsLegacyPageviewUDF.java
@@ -34,6 +34,7 @@
     )
     public void testIsPageview(
         String test_description,
+        String project,
         boolean is_pageview,
         boolean is_legacy_pageview,
         boolean is_app_pageview,
diff --git 
a/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsPageviewUDF.java
 
b/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsPageviewUDF.java
index 868b505..172b85f 100644
--- 
a/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsPageviewUDF.java
+++ 
b/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsPageviewUDF.java
@@ -34,6 +34,7 @@
     )
     public void testIsPageview(
         String test_description,
+        String project,
         boolean is_pageview,
         boolean is_legacy_pageview,
         boolean is_app_pageview,
diff --git 
a/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestRefererClassifierUDF.java
 
b/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestRefererClassifierUDF.java
index 6f81d3f..96ac37d 100644
--- 
a/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestRefererClassifierUDF.java
+++ 
b/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestRefererClassifierUDF.java
@@ -17,84 +17,24 @@
 import java.io.IOException;
 
 import org.apache.hadoop.hive.ql.metadata.HiveException;
-import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredJavaObject;
-import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredObject;
-import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
-import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
-import org.wikimedia.analytics.refinery.core.Webrequest.RefererClassification;
 
 import junit.framework.TestCase;
+import org.wikimedia.analytics.refinery.core.Webrequest;
 
 public class TestRefererClassifierUDF extends TestCase {
-    ObjectInspector StringOI = 
PrimitiveObjectInspectorFactory.javaStringObjectInspector;
-    ObjectInspector LongOI = 
PrimitiveObjectInspectorFactory.javaLongObjectInspector;
-
-    private Object callUDF(String url) throws HiveException, IOException {
-        DeferredObject urlDO = new DeferredJavaObject(url);
-        DeferredObject[] arguments = new DeferredObject[] {urlDO};
-        Object res = null;
-
-        RefererClassifierUDF udf = new RefererClassifierUDF();
-        try {
-            udf.initialize(new ObjectInspector[]{StringOI});
-            res = udf.evaluate(arguments);
-        } finally {
-            udf.close();
-        }
-        return res;
-    }
-
-    private void assertKind(String url, RefererClassification kind)
-            throws HiveException, IOException {
-        Object[] res = (Object[]) callUDF(url);
-
-        assertEquals("Result array has wrong length", 3, res.length);
-
-        assertEquals("is_unknown does not match", kind == 
RefererClassification.UNKNOWN, res[0]);
-        assertEquals("is_internal does not match", kind == 
RefererClassification.INTERNAL, res[1]);
-        assertEquals("is_external does not match", kind == 
RefererClassification.EXTERNAL, res[2]);
-    }
-
-    public void testInitialize() throws HiveException, IOException {
-        RefererClassifierUDF udf = new RefererClassifierUDF();
-        try {
-            udf.initialize(new ObjectInspector[]{StringOI});
-        } finally {
-            udf.close();
-        }
-    }
-
-    public void testInitializeEmpty() throws HiveException, IOException {
-        RefererClassifierUDF udf = new RefererClassifierUDF();
-        try {
-            udf.initialize(new ObjectInspector[]{});
-            fail("Initialize did not throw HiveException");
-        } catch (HiveException e) {
-        } finally {
-            udf.close();
-        }
-    }
-
-    public void testInitializeWrongType() throws HiveException, IOException {
-        RefererClassifierUDF udf = new RefererClassifierUDF();
-        try {
-            udf.initialize(new ObjectInspector[]{LongOI});
-            fail("Initialize did not throw HiveException");
-        } catch (HiveException e) {
-        } finally {
-            udf.close();
-        }
-    }
 
     public void testEvaluateUnknown() throws HiveException, IOException {
-        assertKind("foo", RefererClassification.UNKNOWN);
+        RefererClassifierUDF udf = new RefererClassifierUDF();
+        assertEquals("Unknown referer", udf.evaluate("foo"), 
Webrequest.REFERER_UNKNOWN);
     }
 
     public void testEvaluateInternal() throws HiveException, IOException {
-        assertKind("http://en.wikipedia.org/foo";, 
RefererClassification.INTERNAL);
+        RefererClassifierUDF udf = new RefererClassifierUDF();
+        assertEquals("Unknown referer", 
udf.evaluate("http://en.wikipedia.org/foo";), Webrequest.REFERER_INTERNAL);
     }
 
     public void testEvaluateExternal() throws HiveException, IOException {
-        assertKind("http://www.google.com/";, RefererClassification.EXTERNAL);
+        RefererClassifierUDF udf = new RefererClassifierUDF();
+        assertEquals("Unknown referer", 
udf.evaluate("http://www.google.com/";), Webrequest.REFERER_EXTERNAL);
     }
 }
\ No newline at end of file
diff --git a/refinery-job/pom.xml b/refinery-job/pom.xml
index b1940e7..f9d4534 100644
--- a/refinery-job/pom.xml
+++ b/refinery-job/pom.xml
@@ -23,21 +23,29 @@
         <dependency>
             <groupId>org.scala-lang</groupId>
             <artifactId>scala-library</artifactId>
+            <version>${scala.version}</version>
+            <scope>provided</scope>
         </dependency>
 
         <dependency>
             <groupId>org.apache.spark</groupId>
             <artifactId>spark-core_2.10</artifactId>
+            <version>${spark.version}</version>
+            <scope>provided</scope>
         </dependency>
 
         <dependency>
             <groupId>org.apache.spark</groupId>
             <artifactId>spark-hive_2.10</artifactId>
+            <version>${spark.version}</version>
+            <scope>provided</scope>
         </dependency>
 
         <dependency>
             <groupId>org.apache.spark</groupId>
             <artifactId>spark-sql_2.10</artifactId>
+            <version>${spark.version}</version>
+            <scope>provided</scope>
         </dependency>
 
     </dependencies>
diff --git a/refinery-tools/pom.xml b/refinery-tools/pom.xml
index 47fe769..818cf5d 100644
--- a/refinery-tools/pom.xml
+++ b/refinery-tools/pom.xml
@@ -15,6 +15,8 @@
         <dependency>
             <groupId>org.apache.hadoop</groupId>
             <artifactId>hadoop-common</artifactId>
+            <version>${hadoop.version}</version>
+            <scope>provided</scope>
         </dependency>
 
         <dependency>

-- 
To view, visit https://gerrit.wikimedia.org/r/205833
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: Id3b14d954d1396a8e8667d6865a854ad1167d830
Gerrit-PatchSet: 5
Gerrit-Project: analytics/refinery/source
Gerrit-Branch: master
Gerrit-Owner: Joal <[email protected]>
Gerrit-Reviewer: Joal <[email protected]>
Gerrit-Reviewer: Nuria <[email protected]>
Gerrit-Reviewer: Ottomata <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to