Joal has uploaded a new change for review.
https://gerrit.wikimedia.org/r/205833
Change subject: Patch pageview definition
......................................................................
Patch pageview definition
Modify Maven pom.xml files so that they no longer build uber jars and so that
dependency versions are defined through properties.
Treat xx.mobile.xxx.org and xx.wap.xxx.org hosts as mobile web hosts.
Update referer classification to output a string instead of a map.
Add IdentifyProject function to identify the project of pageview requests.
Fix minor bugs in tests.
Change-Id: Id3b14d954d1396a8e8667d6865a854ad1167d830
---
M .gitignore
M pom.xml
M refinery-core/pom.xml
M
refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/PageviewDefinition.java
M
refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/Webrequest.java
M
refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestLegacyPageviewDefinition.java
M
refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestPageview.java
M
refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestUAParserUserAgentMostPopular.java
M
refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestWebrequestRefererClassifier.java
M refinery-core/src/test/resources/access_method_test_data.csv
M refinery-core/src/test/resources/pageview_test_data.csv
M refinery-hive/pom.xml
A
refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IdentifyProjectUDF.java
M
refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsPageviewUDF.java
M
refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/RefererClassifierUDF.java
A
refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIdentifyProjectUDF.java
M
refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsAppPageviewUDF.java
M
refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsLegacyPageviewUDF.java
M
refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsPageviewUDF.java
M
refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestRefererClassifierUDF.java
M refinery-job/pom.xml
M refinery-tools/pom.xml
22 files changed, 333 insertions(+), 258 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/analytics/refinery/source
refs/changes/33/205833/1
diff --git a/.gitignore b/.gitignore
index b9d47ea..ad51ad7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -13,3 +13,4 @@
*.iml
*.ipr
*.iws
+out/
diff --git a/pom.xml b/pom.xml
index c8b829c..6d70477 100644
--- a/pom.xml
+++ b/pom.xml
@@ -122,24 +122,6 @@
</dependency>
<dependency>
- <groupId>org.apache.hadoop</groupId>
- <artifactId>hadoop-common</artifactId>
- <version>2.5.0-cdh5.3.1</version>
- </dependency>
-
- <dependency>
- <groupId>org.apache.hadoop</groupId>
- <artifactId>hadoop-client</artifactId>
- <version>2.5.0-cdh5.3.1</version>
- </dependency>
-
- <dependency>
- <groupId>org.apache.hive</groupId>
- <artifactId>hive-exec</artifactId>
- <version>0.13.1-cdh5.3.1</version>
- </dependency>
-
- <dependency>
<groupId>ua_parser</groupId>
<artifactId>ua-parser</artifactId>
<version>1.3.0-wmf1</version>
@@ -175,32 +157,10 @@
<version>2.0.29</version>
</dependency>
-
<dependency>
- <groupId>org.scala-lang</groupId>
- <artifactId>scala-library</artifactId>
- <version>2.10.0</version>
- </dependency>
-
- <dependency>
- <groupId>org.apache.spark</groupId>
- <artifactId>spark-core_2.10</artifactId>
- <version>1.2.0-cdh5.3.1</version>
- <scope>provided</scope>
- </dependency>
-
- <dependency>
- <groupId>org.apache.spark</groupId>
- <artifactId>spark-hive_2.10</artifactId>
- <version>1.2.0-cdh5.3.1</version>
- <scope>provided</scope>
- </dependency>
-
- <dependency>
- <groupId>org.apache.spark</groupId>
- <artifactId>spark-sql_2.10</artifactId>
- <version>1.2.0-cdh5.3.1</version>
- <scope>provided</scope>
+ <groupId>com.fasterxml.jackson.core</groupId>
+ <artifactId>jackson-databind</artifactId>
+ <version>2.5.2</version>
</dependency>
</dependencies>
@@ -310,6 +270,10 @@
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<skip.tests>false</skip.tests>
<java.version>1.7</java.version>
+ <hadoop.version>2.5.0-cdh5.3.1</hadoop.version>
+ <hive.version>0.13.1-cdh5.3.1</hive.version>
+ <scala.version>2.10.4</scala.version>
+ <spark.version>1.2.0-cdh5.3.1</spark.version>
</properties>
</project>
diff --git a/refinery-core/pom.xml b/refinery-core/pom.xml
index e42355b..7ac0d8f 100644
--- a/refinery-core/pom.xml
+++ b/refinery-core/pom.xml
@@ -13,14 +13,18 @@
<packaging>jar</packaging>
<dependencies>
- <dependency>
+ <!--<dependency>
<groupId>org.apache.hadoop</groupId>
- <artifactId>hadoop-common</artifactId>
- </dependency>
+ <artifactId>hadoop-client</artifactId>
+ <version>${hadoop.version}</version>
+ <scope>provided</scope>
+ </dependency>-->
<dependency>
<groupId>org.apache.hadoop</groupId>
- <artifactId>hadoop-client</artifactId>
+ <artifactId>hadoop-common</artifactId>
+ <version>${hadoop.version}</version>
+ <scope>provided</scope>
</dependency>
<dependency>
@@ -66,6 +70,10 @@
<artifactId>json-simple</artifactId>
</dependency>
+ <dependency>
+ <groupId>com.fasterxml.jackson.core</groupId>
+ <artifactId>jackson-databind</artifactId>
+ </dependency>
</dependencies>
<build>
diff --git
a/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/PageviewDefinition.java
b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/PageviewDefinition.java
index eb3f77c..65433a8 100644
---
a/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/PageviewDefinition.java
+++
b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/PageviewDefinition.java
@@ -48,19 +48,19 @@
* Now back to the good part.
*/
private final Pattern uriHostWikimediaDomainPattern = Pattern.compile(
- "(commons|meta|incubator|species)\\." // any of these domain names
+ "(commons|meta|incubator|species|outreach)\\." // any of these
domain names
+ "((m|mobile|wap|zero)\\.)?" // followed by an optional
mobile or zero qualifier
+ "wikimedia\\.org$" // ending with wikimedia.org
);
private final Pattern uriHostProjectDomainPattern = Pattern.compile(
- "(?<!www)\\." // not starting with "www"
+ "(?<!(www\\.|test))" // not starting with
"www"
+ "(wik(ibooks|" // match project domains ending in
.org
+ "inews|ipedia|iquote|isource|tionary|iversity|ivoyage))\\.org$"
);
private final Pattern uriHostOtherProjectsPattern = Pattern.compile(
- "(wikidata|mediawiki)\\.org$"
+ "(?<!test)(wikidata|mediawiki|wikimediafoundation)\\.org$"
);
private final Pattern uriPathPattern = Pattern.compile(
@@ -94,6 +94,15 @@
private final HashSet<String> httpStatusesSet = new
HashSet<String>(Arrays.asList(
"200",
"304"
+ ));
+
+ private final HashSet<String> uriPortionsToRemove = new
HashSet<String>(Arrays.asList(
+ "m",
+ "mobile",
+ "wap",
+ "zero",
+ "www",
+ "download"
));
/**
@@ -193,4 +202,41 @@
&& !Utilities.patternIsFound(uriQueryUnwantedActions, uriQuery)
);
}
-}
\ No newline at end of file
+
+ /**
+ * Identifies a project from a pageview uriHost
+ * NOTE: Provides correct result only if used with is_pageview = true
+ *
+ * @param uriHost The url's host to identify
+ * @return The project identifier in format [xxx.]xxxx (en.wikipedia or
wikisource for instance)
+ */
+ public String identifyProject(String uriHost) {
+ if (uriHost == null) return "-";
+ String[] uri_parts = uriHost.toLowerCase().split("\\.");
+ switch (uri_parts.length) {
+ // case wikixxx.org
+ case 2:
+ return uri_parts[0];
+ //case xx.wikixxx.org - Remove unwanted parts
+ case 3:
+ if (uriPortionsToRemove.contains(uri_parts[0]))
+ return uri_parts[1];
+ else
+ return uri_parts[0] + "." + uri_parts[1];
+ //xx.[m|mobile|wap|zero].wikixxx.org - Remove unwanted parts
+ case 4:
+ if (uriPortionsToRemove.contains(uri_parts[0]))
+ return uri_parts[2];
+ else
+ return uri_parts[0] + "." + uri_parts[2];
+ //xx.[m|mobile|wap|zero].[m|mobile|wap|zero].wikixxx.org - Remove
unwanted parts
+ case 5:
+ if (uriPortionsToRemove.contains(uri_parts[0]))
+ return uri_parts[3];
+ else
+ return uri_parts[0] + "." + uri_parts[3];
+ default:
+ return "-";
+ }
+ }
+}
diff --git
a/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/Webrequest.java
b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/Webrequest.java
index 800d5f2..bf9d88b 100644
---
a/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/Webrequest.java
+++
b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/Webrequest.java
@@ -54,7 +54,7 @@
* or some similar portal-based interface to MW.
*/
private static final Pattern uriHostPattern = Pattern.compile(
- "\\.(m|zero)\\."
+ "(^(m|zero|wap|mobile)\\.)|(\\.(m|zero|wap|mobile)\\.)"
);
/**
@@ -151,7 +151,7 @@
* @param url The referer url to classify
* @return RefererClassification
*/
- public static RefererClassification classify(String url) {
+ public static RefererClassification classifyReferer(String url) {
if (url == null || url.isEmpty() || url.equals("-")) {
return RefererClassification.UNKNOWN;
}
@@ -196,4 +196,7 @@
}
return RefererClassification.EXTERNAL;
}
+
+
+
}
\ No newline at end of file
diff --git
a/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestLegacyPageviewDefinition.java
b/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestLegacyPageviewDefinition.java
index 1460c7f..20c5ba8 100644
---
a/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestLegacyPageviewDefinition.java
+++
b/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestLegacyPageviewDefinition.java
@@ -33,6 +33,7 @@
)
public void testIsLegacyPageview(
String test_description,
+ String projectIdentifier,
boolean is_pageview,
boolean is_legacy_pageview,
boolean is_app_pageview,
diff --git
a/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestPageview.java
b/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestPageview.java
index 904e136..6409fa9 100644
---
a/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestPageview.java
+++
b/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestPageview.java
@@ -32,6 +32,7 @@
)
public void testIsPageview(
String test_description,
+ String projectIdentifier,
boolean is_pageview,
boolean is_legacy_pageview,
boolean is_app_pageview,
@@ -66,6 +67,7 @@
)
public void testIsAppPageview(
String test_description,
+ String projectIdentifier,
boolean is_pageview,
boolean is_legacy_pageview,
boolean is_app_pageview,
@@ -83,11 +85,39 @@
test_description,
is_app_pageview,
PageviewDefinitionInstance.isAppPageview(
- uri_path,
- uri_query,
- content_type,
- user_agent
+ uri_path,
+ uri_query,
+ content_type,
+ user_agent
)
);
}
+
+ @Test
+ @FileParameters(
+ value = "src/test/resources/pageview_test_data.csv",
+ mapper = CsvWithHeaderMapper.class
+ )
+ public void testProjectIdentification(
+ String test_description,
+ String projectIdentifier,
+ boolean is_pageview,
+ boolean is_legacy_pageview,
+ boolean is_app_pageview,
+ String ip_address,
+ String x_forwarded_for,
+ String uri_host,
+ String uri_path,
+ String uri_query,
+ String http_status,
+ String content_type,
+ String user_agent
+ ) {
+ PageviewDefinition PageviewDefinitionInstance =
PageviewDefinition.getInstance();
+ assertEquals(
+ test_description,
+ projectIdentifier,
+ PageviewDefinitionInstance.identifyProject(uri_host)
+ );
+ }
}
\ No newline at end of file
diff --git
a/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestUAParserUserAgentMostPopular.java
b/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestUAParserUserAgentMostPopular.java
index 9f9cf43..d6e8d3d 100644
---
a/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestUAParserUserAgentMostPopular.java
+++
b/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestUAParserUserAgentMostPopular.java
@@ -57,7 +57,6 @@
// decode expected output and turn it into an object
- System.out.println(jsonMapResult);
Object obj = jsonParser.parse(jsonMapResult);
JSONObject expected_ua = (JSONObject) obj;
diff --git
a/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestWebrequestRefererClassifier.java
b/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestWebrequestRefererClassifier.java
index 38e4f8c..dc42f1f 100644
---
a/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestWebrequestRefererClassifier.java
+++
b/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestWebrequestRefererClassifier.java
@@ -23,7 +23,7 @@
// Helper methods ---------------------------------------------------------
private void assertKind(final String url, final RefererClassification
expected) {
- RefererClassification actual = Webrequest.classify(url);
+ RefererClassification actual = Webrequest.classifyReferer(url);
assertEquals("Identification output does not match expected",
expected, actual);
diff --git a/refinery-core/src/test/resources/access_method_test_data.csv
b/refinery-core/src/test/resources/access_method_test_data.csv
index c2e9d48..37b108e 100644
--- a/refinery-core/src/test/resources/access_method_test_data.csv
+++ b/refinery-core/src/test/resources/access_method_test_data.csv
@@ -1,6 +1,12 @@
test_description,expected_method, uri_host, user_agent
Desktop request,desktop, en.wikipedia.org, turnip
-Mobile web request through m.wikipedia,mobile web,en.m.wikipedia.org,turnip
+Mobile web request through en.m.wikipedia,mobile web,en.m.wikipedia.org,turnip
+Mobile web request through en.mobile.wikipedia,mobile
web,en.mobile.wikipedia.org,turnip
+Mobile web request through en.wap.wikipedia,mobile
web,en.wap.wikipedia.org,turnip
Mobile web request through zero,mobile web,en.zero.wikipedia.org,rutabaga
+Mobile web request through m.wikipedia,mobile web,m.wikipedia.org,rutabaga
+Mobile web request through mobile.wikipedia,mobile
web,mobile.wikipedia.org,rutabaga
+Mobile web request through wap.wikipedia,mobile web,wap.wikipedia.org,rutabaga
+Mobile web request through zero,mobile web,zero.wikipedia.org,rutabaga
Mobile apps request through mobile API,mobile app,en.m.wikipedia.org,
WikipediaApp/1.2.3
Mobile apps request through desktop API,mobile app, en.wikipedia.org,
WikipediaApp/1.2.3
diff --git a/refinery-core/src/test/resources/pageview_test_data.csv
b/refinery-core/src/test/resources/pageview_test_data.csv
index 985eee1..7153006 100644
--- a/refinery-core/src/test/resources/pageview_test_data.csv
+++ b/refinery-core/src/test/resources/pageview_test_data.csv
@@ -1,28 +1,32 @@
-test_description,
is_pageview,is_legacy_pageview,is_app_pageview,ip_address,x_forwarded_for,
uri_host, uri_path, uri_query, http_status, content_type, user_agent
-Is Pageview - Desktop, true,true,false,174.62.175.82,-,en.wikipedia.org,
/wiki/Horseshoe_crab,-,200,text/html, turnip
-Is Pageview – Desktop – locally cached content,
true,true,false,174.62.175.82,-,en.wikipedia.org,
/wiki/Horseshoe_crab,-,304,text/html, turnip
-Is Pageview – App - Android, true,false,true,174.62.175.83,-,en.wikipedia.org,
/w/api.php,action=mobileview&format=json&page=Hachiko_–_Eine_wunderbare_Freundschaft&prop=text§ions=0,200,
application/json,WikipediaApp/2.0-r-2015-01-15 (Android 4.4.2; Phone) Google
Play
-is Pageview – App – iOS – old version,
true,false,true,174.62.175.83,-,en.wikipedia.org,
/w/api.php,action=mobileview&format=json&page=Hachiko_–_Eine_wunderbare_Freundschaft&prop=text§ions=0,200,
application/json,WikipediaApp/4.0.6 (iPhone OS 8.2; Phone)
-Is Pageview – App - iOS – new version,
true,false,true,174.62.175.83,-,en.wikipedia.org,
/w/api.php,action=mobileview&format=json&page=Hachiko_–_Eine_wunderbare_Freundschaft&prop=text§ions=all,200,
application/json,WikipediaApp/4.0.6 (iPhone OS 8.2; Phone)
-Is Pageview – Mobile Web,
true,true,false,174.62.175.84,-,en.m.wikipedia.org,/wiki/Bernard_Manning,-,200,text/html,rutabaga
-Is Pageview – Desktop - Serbian sr-ec,
true,false,false,174.62.175.85,-,sr.wikipedia.org,/sr-ec/Историја_Срба_пре_Немањића,-,200,text/html,Three-finger
salute
-Is Pageview – Desktop - Serbian sr-el,
true,false,false,174.62.175.86,-,sr.wikipedia.org,/sr-el/Историја_Срба_пре_Немањића,-,200,text/html,Three-finger
salute
-Is Pageview – Desktop - Chinese zh-cn,
true,false,false,174.62.175.87,-,zh.wikipedia.org,/zh-cn/Wikipedia:首页,-,200,text/html,Five-test
plan
-Is Pageview – Desktop - Chinese zh-hans,
true,false,false,174.62.175.88,-,zh.wikipedia.org,/zh-hans/Wikipedia:首页,-,200,text/html,Five-test
plan
-Is Pageview – Desktop - Chinese zh-hant,
true,false,false,174.62.175.89,-,zh.wikipedia.org,/zh-hant/Wikipedia:首页,-,200,text/html,Five-test
plan
-Is Pageview – Desktop - Chinese zh-hk,
true,false,false,174.62.175.90,-,zh.wikipedia.org,/zh-hk/Wikipedia:foo,-,200,text/html,Five-test
plan
-Is Pageview – Desktop - Chinese zh-mo,
true,false,false,174.62.175.91,-,zh.wikipedia.org,/zh-mo/Wikipedia:首页,-,200,text/html,Five-test
plan
-Is Pageview – Desktop - Chinese zh-my,
true,false,false,174.62.175.92,-,zh.wikipedia.org,/zh-my/Wikipedia:首页,-,200,text/html,Five-test
plan
-Is Pageview – Desktop - Chinese zh-sg,
true,false,false,174.62.175.93,-,zh.wikipedia.org,/zh-sg/Wikipedia:首页,-,200,text/html,Five-test
plan
-Is Pageview – Desktop - Chinese zh-tw,
true,false,false,174.62.175.94,-,zh.wikipedia.org,/zh-tw/Wikipedia:首页,-,200,text/html,Five-test
plan
-Is Pageview – Wikidata, true,
true,false,174.62.175.94,-,www.wikidata.org,/wiki/Q5651758,-,200,text/html,Five-test
plan
-Is Pageview – MediaWiki, true,
true,false,174.62.175.94,-,www.mediawiki.org,/wiki/Gerrit/git-review,-,200,text/html,Five-test
plan
-Is Pageview – iOS search,
true,false,false,174.62.175.94,-,en.wikipedia.org,/,?search=afdfsdfsd,200,text/html,Five-test
plan
-Is Not Pageview - http_status != 200, false,true,false,174.62.175.95,-,
en.wikipedia.org, /wiki/Noppperrrrs,-,400,text/html ,turnip
-Is Not Pageview - content_type does not match,
false,true,false,174.62.175.96,-, en.wikipedia.org, /wiki/Noppperrrrs,-,200,
image/png, turnip
-Is Not Pageview - API stupidity: it outputs a 200 status code and text/html as
a MIME type on certain classes of error., false, false,false,174.62.175.97,-,
en.wikipedia.org, /w/api.php,-,200, text/html, turnip
-Is Not Pageview – App request for non-page content, false,
false,false,174.62.175.98,-, en.wikipedia.org,
/w/api.php,?action=query&format=json&titles=Foo&prop=pageimages&piprop=thumbnail&pithumbsize=96&pilimit=1
,200, application/json, WikipediaApp/1.2.3
-Is Not Pageview – Non-App request for page content, false,
false,false,174.62.175.99,-, en.wikipedia.org,
/w/api.php,?action=mobileview§ions=0,200, application/json, TributeApp/1.2.3
-Is Not Pageview – edit
attempt,false,true,false,174.62.175.82,-,en.wikipedia.org,
/wiki/Horseshoe_crab,?action=edit,200,text/html, turnip
-Is not pageview – non-wikidata raw domain,false,
false,false,174.62.175.82,-,www.wikipedia.org,-,-,200,text/html, turnip
-Is Not Pageview – App – Android – Refresh,false,
false,false,174.62.175.82,-,en.wikipedia.org,
/w/api.php,action=mobileview&format=json&page=Hachiko_–_Eine_wunderbare_Freundschaft&prop=text§ions=all,200,
application/json,WikipediaApp/2.0-r-2015-01-15 (Android 4.4.2; Phone) Google
Play
+test_description, project
identifier,is_pageview,is_legacy_pageview,is_app_pageview,ip_address,x_forwarded_for,
uri_host, uri_path, uri_query, http_status, content_type, user_agent
+Is Pageview -
Desktop,en.wikipedia,true,true,false,174.62.175.82,-,en.wikipedia.org,
/wiki/Horseshoe_crab,-,200,text/html, turnip
+Is Pageview – Desktop – locally cached
content,en.wikipedia,true,true,false,174.62.175.82,-,en.wikipedia.org,
/wiki/Horseshoe_crab,-,304,text/html, turnip
+Is Pageview – App -
Android,en.wikipedia,true,false,true,174.62.175.83,-,en.wikipedia.org,
/w/api.php,action=mobileview&format=json&page=Hachiko_–_Eine_wunderbare_Freundschaft&prop=text§ions=0,200,
application/json,WikipediaApp/2.0-r-2015-01-15 (Android 4.4.2; Phone) Google
Play
+is Pageview – App – iOS – old
version,en.wikipedia,true,false,true,174.62.175.83,-,en.wikipedia.org,
/w/api.php,action=mobileview&format=json&page=Hachiko_–_Eine_wunderbare_Freundschaft&prop=text§ions=0,200,
application/json,WikipediaApp/4.0.6 (iPhone OS 8.2; Phone)
+Is Pageview – App - iOS – new
version,en.wikipedia,true,false,true,174.62.175.83,-,en.wikipedia.org,
/w/api.php,action=mobileview&format=json&page=Hachiko_–_Eine_wunderbare_Freundschaft&prop=text§ions=all,200,
application/json,WikipediaApp/4.0.6 (iPhone OS 8.2; Phone)
+Is Pageview – Mobile
Web,en.wikipedia,true,true,false,174.62.175.84,-,en.m.wikipedia.org,/wiki/Bernard_Manning,-,200,text/html,rutabaga
+Is Pageview – Desktop - Serbian
sr-ec,sr.wikipedia,true,false,false,174.62.175.85,-,sr.wikipedia.org,/sr-ec/Историја_Срба_пре_Немањића,-,200,text/html,Three-finger
salute
+Is Pageview – Desktop - Serbian
sr-el,sr.wikipedia,true,false,false,174.62.175.86,-,sr.wikipedia.org,/sr-el/Историја_Срба_пре_Немањића,-,200,text/html,Three-finger
salute
+Is Pageview – Desktop - Chinese
zh-cn,zh.wikipedia,true,false,false,174.62.175.87,-,zh.wikipedia.org,/zh-cn/Wikipedia:首页,-,200,text/html,Five-test
plan
+Is Pageview – Desktop - Chinese
zh-hans,zh.wikipedia,true,false,false,174.62.175.88,-,zh.wikipedia.org,/zh-hans/Wikipedia:首页,-,200,text/html,Five-test
plan
+Is Pageview – Desktop - Chinese
zh-hant,zh.wikipedia,true,false,false,174.62.175.89,-,zh.wikipedia.org,/zh-hant/Wikipedia:首页,-,200,text/html,Five-test
plan
+Is Pageview – Desktop - Chinese
zh-hk,zh.wikipedia,true,false,false,174.62.175.90,-,zh.wikipedia.org,/zh-hk/Wikipedia:foo,-,200,text/html,Five-test
plan
+Is Pageview – Desktop - Chinese
zh-mo,zh.wikipedia,true,false,false,174.62.175.91,-,zh.wikipedia.org,/zh-mo/Wikipedia:首页,-,200,text/html,Five-test
plan
+Is Pageview – Desktop - Chinese
zh-my,zh.wikipedia,true,false,false,174.62.175.92,-,zh.wikipedia.org,/zh-my/Wikipedia:首页,-,200,text/html,Five-test
plan
+Is Pageview – Desktop - Chinese
zh-sg,zh.wikipedia,true,false,false,174.62.175.93,-,zh.wikipedia.org,/zh-sg/Wikipedia:首页,-,200,text/html,Five-test
plan
+Is Pageview – Desktop - Chinese
zh-tw,zh.wikipedia,true,false,false,174.62.175.94,-,zh.wikipedia.org,/zh-tw/Wikipedia:首页,-,200,text/html,Five-test
plan
+Is Pageview –
Wikidata,wikidata,true,true,false,174.62.175.94,-,www.wikidata.org,/wiki/Q5651758,-,200,text/html,Five-test
plan
+Is Pageview –
MediaWiki,mediawiki,true,true,false,174.62.175.94,-,www.mediawiki.org,/wiki/Gerrit/git-review,-,200,text/html,Five-test
plan
+Is Pageview – iOS
search,en.wikipedia,true,false,false,174.62.175.94,-,en.wikipedia.org,/,?search=afdfsdfsd,200,text/html,Five-test
plan
+Is Not Pageview - http_status !=
200,en.wikipedia,false,true,false,174.62.175.95,-, en.wikipedia.org,
/wiki/Noppperrrrs,-,400,text/html ,turnip
+Is Not Pageview - content_type does not
match,en.wikipedia,false,true,false,174.62.175.96,-, en.wikipedia.org,
/wiki/Noppperrrrs,-,200, image/png, turnip
+Is Not Pageview - API stupidity: it outputs a 200 status code and text/html as
a MIME type on certain classes of
error,en.wikipedia,false,false,false,174.62.175.97,-, en.wikipedia.org,
/w/api.php,-,200, text/html, turnip
+Is Not Pageview – App request for non-page
content,en.wikipedia,false,false,false,174.62.175.98,-,en.wikipedia.org,/w/api.php,?action=query&format=json&titles=Foo&prop=pageimages&piprop=thumbnail&pithumbsize=96&pilimit=1
,200, application/json, WikipediaApp/1.2.3
+Is Not Pageview – Non-App request for page content,en.wikipedia,false,
false,false,174.62.175.99,-, en.wikipedia.org,
/w/api.php,?action=mobileview§ions=0,200, application/json, TributeApp/1.2.3
+Is Not Pageview – edit
attempt,en.wikipedia,false,true,false,174.62.175.82,-,en.wikipedia.org,
/wiki/Horseshoe_crab,?action=edit,200,text/html, turnip
+Is not pageview – non-wikidata raw
domain,wikipedia,false,false,false,174.62.175.82,-,www.wikipedia.org,-,-,200,text/html,
turnip
+Is Not Pageview – App – Android – Refresh,en.wikipedia,false,
false,false,174.62.175.82,-,en.wikipedia.org,
/w/api.php,action=mobileview&format=json&page=Hachiko_–_Eine_wunderbare_Freundschaft&prop=text§ions=all,200,
application/json,WikipediaApp/2.0-r-2015-01-15 (Android 4.4.2; Phone) Google
Play
+"Is Pageview - mobile -
wikimediafoundation",wikimediafoundation,true,true,false,174.62.175.82,-,m.wikimediafoundation.org,/wiki/Horseshoe_crab,-,200,text/html,turnip
+"Is Pageview - Desktop -
wikimediafoundation",wikimediafoundation,true,false,false,174.62.175.82,-,wikimediafoundation.org,/wiki/Horseshoe_crab,-,200,text/html,turnip
+Is Not Pageview - Desktop,test2.wikipedia,
true,true,false,174.62.175.82,-,test2.wikipedia.org,
/wiki/Horseshoe_crab,-,200,text/html, turnip
+Is Not Pageview -
Desktop,test.wikimediafoundation,true,true,false,174.62.175.82,-,test.wikimediafoundation.org,
/wiki/Horseshoe_crab,-,200,text/html, turnip
\ No newline at end of file
diff --git a/refinery-hive/pom.xml b/refinery-hive/pom.xml
index 01e2e8f..b957024 100644
--- a/refinery-hive/pom.xml
+++ b/refinery-hive/pom.xml
@@ -31,15 +31,26 @@
<artifactId>refinery-core</artifactId>
</dependency>
- <dependency>
- <groupId>org.apache.hadoop</groupId>
- <artifactId>hadoop-common</artifactId>
- </dependency>
+ <dependency>
+ <groupId>org.apache.hadoop</groupId>
+ <artifactId>hadoop-client</artifactId>
+ <version>${hadoop.version}</version>
+ <scope>provided</scope>
+ </dependency>
- <dependency>
- <groupId>org.apache.hive</groupId>
- <artifactId>hive-exec</artifactId>
- </dependency>
+ <dependency>
+ <groupId>org.apache.hadoop</groupId>
+ <artifactId>hadoop-common</artifactId>
+ <version>${hadoop.version}</version>
+ <scope>provided</scope>
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.hive</groupId>
+ <artifactId>hive-exec</artifactId>
+ <version>${hive.version}</version>
+ <scope>provided</scope>
+ </dependency>
<dependency>
<groupId>com.googlecode.json-simple</groupId>
diff --git
a/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IdentifyProjectUDF.java
b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IdentifyProjectUDF.java
new file mode 100644
index 0000000..b16cf75
--- /dev/null
+++
b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IdentifyProjectUDF.java
@@ -0,0 +1,60 @@
+/**
+ * Copyright (C) 2014 Wikimedia Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.wikimedia.analytics.refinery.hive;
+
+import org.apache.hadoop.hive.ql.exec.Description;
+import org.apache.hadoop.hive.ql.exec.UDF;
+import org.wikimedia.analytics.refinery.core.PageviewDefinition;
+
+
+/**
+ * A Hive UDF to identify a Wikimedia webrequest pageview project.
+ * NOTE: this udf only works well if the uri_host comes from
+ * a webrequest having is_pageview = true
+ *
+ * <p>
+ * Hive Usage:
+ * ADD JAR /path/to/refinery-hive.jar;
+ * CREATE TEMPORARY FUNCTION identify_project AS
+ * 'org.wikimedia.analytics.refinery.hive.IdentifyProjectUDF';
+ * SELECT
+ * identify_project(uri_host) as project_qualifier,
+ * count(*) as cnt
+ * FROM
+ * wmf_raw.webrequest
+ * WHERE
+ * webrequest_source = 'mobile'
+ * AND year=2014
+ * AND month=12
+ * AND day=7
+ * AND hour=12
+ * AND is_pageview(uri_host, uri_path, uri_query, http_status,
content_type, user_agent)
+ * GROUP BY
+ * identify_project(uri_host)
+ * ORDER BY cnt desc
+ * LIMIT 10
+ * ;
+ */
+@Description(name = "identify_project",
+ value = "_FUNC_(uri_host) - Returns the project identifier for the
pageview request.",
+ extended = "")
+public class IdentifyProjectUDF extends UDF {
+ public String evaluate(String uriHost) {
+ PageviewDefinition pageviewDefinitionInstance =
PageviewDefinition.getInstance();
+ return pageviewDefinitionInstance.identifyProject(uriHost);
+ }
+}
\ No newline at end of file
diff --git
a/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsPageviewUDF.java
b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsPageviewUDF.java
index 229d1ce..5314729 100644
---
a/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsPageviewUDF.java
+++
b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsPageviewUDF.java
@@ -16,6 +16,7 @@
package org.wikimedia.analytics.refinery.hive;
+import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.wikimedia.analytics.refinery.core.PageviewDefinition;
@@ -48,6 +49,9 @@
* LIMIT 10
* ;
*/
+@Description(name = "is_pageview",
+ value = "_FUNC_(uri_host, uri_path, uri_query, http_status,
content_type, user_agent) - Returns true if the request is a pageview",
+ extended = "")
public class IsPageviewUDF extends UDF {
public boolean evaluate(
String uriHost,
diff --git
a/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/RefererClassifierUDF.java
b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/RefererClassifierUDF.java
index 05df0a2..f855fd3 100644
---
a/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/RefererClassifierUDF.java
+++
b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/RefererClassifierUDF.java
@@ -14,108 +14,23 @@
package org.wikimedia.analytics.refinery.hive;
-import org.apache.hadoop.hive.ql.exec.Description;
-import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
-import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
-import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
+import org.apache.hadoop.hive.ql.exec.*;
import org.apache.hadoop.hive.ql.metadata.HiveException;
-import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
-import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
-import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
-import
org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
-import
org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;
import org.wikimedia.analytics.refinery.core.Webrequest;
import org.wikimedia.analytics.refinery.core.Webrequest.RefererClassification;
-import java.util.LinkedList;
-import java.util.List;
-
@Description(name = "referer_classifier",
- value = "_FUNC_(url) - Returns a map with a classification of a referer",
+ value = "_FUNC_(url) - Returns a string with a classification of a referer (UNK, INT, EXT for unknown, internal, external)",
extended = "argument 0 is the url to analyze")
-public class RefererClassifierUDF extends GenericUDF {
- private Object[] result;
+public class RefererClassifierUDF extends UDF {
- private StringObjectInspector inputOI;
-
- private int IDX_IS_UNKNOWN;
- private int IDX_IS_INTERNAL;
- private int IDX_IS_EXTERNAL;
-
- @Override
- public ObjectInspector initialize(ObjectInspector[] arguments)
- throws UDFArgumentException {
- // We need exactly 1 parameter
- if (arguments == null || arguments.length != 1) {
- throw new UDFArgumentLengthException("The function "
- + "RefererClassifierUDF expects exactly 1 parameter");
+ public String evaluate(String url) throws HiveException {
+ RefererClassification kind = Webrequest.classifyReferer(url);
+ switch (kind) {
+ case UNKNOWN: return "UNK";
+ case INTERNAL: return "INT";
+ case EXTERNAL: return "EXT";
}
-
- // ... and the parameter has to be a string
- if (!(arguments[0] instanceof StringObjectInspector)) {
- throw new UDFArgumentTypeException(0, "The parameter to "
- + "RefererClassifierUDF has to be a string");
- }
-
- inputOI = (StringObjectInspector) arguments[0];
-
- List<String> fieldNames = new LinkedList<String>();
- List<ObjectInspector> fieldOIs= new LinkedList<ObjectInspector>();
- int idx = 0;
-
- fieldNames.add("is_unknown");
-
fieldOIs.add(PrimitiveObjectInspectorFactory.javaBooleanObjectInspector);
- IDX_IS_UNKNOWN=idx++;
-
- fieldNames.add("is_internal");
-
fieldOIs.add(PrimitiveObjectInspectorFactory.javaBooleanObjectInspector);
- IDX_IS_INTERNAL=idx++;
-
- fieldNames.add("is_external");
-
fieldOIs.add(PrimitiveObjectInspectorFactory.javaBooleanObjectInspector);
- IDX_IS_EXTERNAL=idx++;
-
- result = new Object[idx];
-
- return
ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs);
- }
-
- @Override
- public Object evaluate(DeferredObject[] arguments) throws HiveException {
- assert arguments != null : "Method 'evaluate' of RefererClassifierUDF "
- + "called with null arguments array";
- assert arguments.length == 1 : "Method 'evaluate' of "
- + "RefererClassifierUDF called arguments of length "
- + arguments.length + " (instead of 1)";
- // arguments is an array with exactly 1 entry.
-
- assert result != null : "Result object has not yet been initialized, "
- + "but evaluate called";
- // result object has been initialized. So it's an array of objects of
- // the right length.
-
- String url = inputOI.getPrimitiveJavaObject(arguments[0].get());
-
- RefererClassification kind = Webrequest.classify(url);
-
- result[IDX_IS_UNKNOWN] = kind == RefererClassification.UNKNOWN;
- result[IDX_IS_INTERNAL] = kind == RefererClassification.INTERNAL;
- result[IDX_IS_EXTERNAL] = kind == RefererClassification.EXTERNAL;
-
- return result;
- }
-
- @Override
- public String getDisplayString(String[] arguments) {
- String argument;
- if (arguments == null) {
- argument = "<arguments == null>";
- } else if (arguments.length == 1) {
- argument = arguments[0];
- } else {
- argument = "<arguments of length " + arguments.length + ">";
- }
- return "referer_classifier(" + argument +")";
-
+ return "UNK";
}
}
diff --git
a/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIdentifyProjectUDF.java
b/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIdentifyProjectUDF.java
new file mode 100644
index 0000000..bb4065f
--- /dev/null
+++
b/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIdentifyProjectUDF.java
@@ -0,0 +1,71 @@
+/**
+ * Copyright (C) 2014 Wikimedia Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.wikimedia.analytics.refinery.hive;
+
+import junitparams.FileParameters;
+import junitparams.JUnitParamsRunner;
+import junitparams.mappers.CsvWithHeaderMapper;
+import org.apache.hadoop.io.Text;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+
+import static org.junit.Assert.assertEquals;
+
+@RunWith(JUnitParamsRunner.class)
+public class TestIdentifyProjectUDF {
+
+
+ @Test
+ @FileParameters(
+ value = "../refinery-core/src/test/resources/pageview_test_data.csv",
+ mapper = CsvWithHeaderMapper.class
+ )
+ public void testIdentifyProject(
+ String test_description,
+ String projectIdentifier,
+ boolean is_pageview,
+ boolean is_legacy_pageview,
+ boolean is_app_pageview,
+ String ip_address,
+ String x_forwarded_for,
+ String uri_host,
+ String uri_path,
+ String uri_query,
+ String http_status,
+ String content_type,
+ String user_agent
+ ) {
+ IdentifyProjectUDF udf = new IdentifyProjectUDF();
+
+ assertEquals(
+ test_description,
+ projectIdentifier,
+ udf.evaluate(uri_host)
+ );
+ }
+
+ @Test
+ public void testIdentifyProjectNull() {
+ IdentifyProjectUDF udf = new IdentifyProjectUDF();
+
+ assertEquals(
+ "Test null input to identifyProject",
+ "-",
+ udf.evaluate(null)
+ );
+ }
+
+}
\ No newline at end of file
diff --git
a/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsAppPageviewUDF.java
b/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsAppPageviewUDF.java
index 8e80101..88e5673 100644
---
a/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsAppPageviewUDF.java
+++
b/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsAppPageviewUDF.java
@@ -33,6 +33,7 @@
)
public void testIsAppPageview(
String test_description,
+ String projectIdentifier,
boolean is_pageview,
boolean is_legacy_pageview,
boolean is_app_pageview,
diff --git
a/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsLegacyPageviewUDF.java
b/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsLegacyPageviewUDF.java
index ff8f9ba..5373544 100644
---
a/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsLegacyPageviewUDF.java
+++
b/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsLegacyPageviewUDF.java
@@ -34,6 +34,7 @@
)
public void testIsPageview(
String test_description,
+ String projectIdentifier,
boolean is_pageview,
boolean is_legacy_pageview,
boolean is_app_pageview,
diff --git
a/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsPageviewUDF.java
b/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsPageviewUDF.java
index 868b505..82d030e 100644
---
a/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsPageviewUDF.java
+++
b/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsPageviewUDF.java
@@ -34,6 +34,7 @@
)
public void testIsPageview(
String test_description,
+ String projectIdentifier,
boolean is_pageview,
boolean is_legacy_pageview,
boolean is_app_pageview,
diff --git
a/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestRefererClassifierUDF.java
b/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestRefererClassifierUDF.java
index 6f81d3f..1e6657e 100644
---
a/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestRefererClassifierUDF.java
+++
b/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestRefererClassifierUDF.java
@@ -17,84 +17,23 @@
import java.io.IOException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
-import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredJavaObject;
-import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredObject;
-import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
-import
org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
-import org.wikimedia.analytics.refinery.core.Webrequest.RefererClassification;
import junit.framework.TestCase;
public class TestRefererClassifierUDF extends TestCase {
- ObjectInspector StringOI =
PrimitiveObjectInspectorFactory.javaStringObjectInspector;
- ObjectInspector LongOI =
PrimitiveObjectInspectorFactory.javaLongObjectInspector;
-
- private Object callUDF(String url) throws HiveException, IOException {
- DeferredObject urlDO = new DeferredJavaObject(url);
- DeferredObject[] arguments = new DeferredObject[] {urlDO};
- Object res = null;
-
- RefererClassifierUDF udf = new RefererClassifierUDF();
- try {
- udf.initialize(new ObjectInspector[]{StringOI});
- res = udf.evaluate(arguments);
- } finally {
- udf.close();
- }
- return res;
- }
-
- private void assertKind(String url, RefererClassification kind)
- throws HiveException, IOException {
- Object[] res = (Object[]) callUDF(url);
-
- assertEquals("Result array has wrong length", 3, res.length);
-
- assertEquals("is_unknown does not match", kind ==
RefererClassification.UNKNOWN, res[0]);
- assertEquals("is_internal does not match", kind ==
RefererClassification.INTERNAL, res[1]);
- assertEquals("is_external does not match", kind ==
RefererClassification.EXTERNAL, res[2]);
- }
-
- public void testInitialize() throws HiveException, IOException {
- RefererClassifierUDF udf = new RefererClassifierUDF();
- try {
- udf.initialize(new ObjectInspector[]{StringOI});
- } finally {
- udf.close();
- }
- }
-
- public void testInitializeEmpty() throws HiveException, IOException {
- RefererClassifierUDF udf = new RefererClassifierUDF();
- try {
- udf.initialize(new ObjectInspector[]{});
- fail("Initialize did not throw HiveException");
- } catch (HiveException e) {
- } finally {
- udf.close();
- }
- }
-
- public void testInitializeWrongType() throws HiveException, IOException {
- RefererClassifierUDF udf = new RefererClassifierUDF();
- try {
- udf.initialize(new ObjectInspector[]{LongOI});
- fail("Initialize did not throw HiveException");
- } catch (HiveException e) {
- } finally {
- udf.close();
- }
- }
public void testEvaluateUnknown() throws HiveException, IOException {
- assertKind("foo", RefererClassification.UNKNOWN);
+ RefererClassifierUDF udf = new RefererClassifierUDF();
+ assertEquals("Unknown referer", udf.evaluate("foo"), "UNK");
}
public void testEvaluateInternal() throws HiveException, IOException {
- assertKind("http://en.wikipedia.org/foo",
RefererClassification.INTERNAL);
+ RefererClassifierUDF udf = new RefererClassifierUDF();
+ assertEquals("Unknown referer",
udf.evaluate("http://en.wikipedia.org/foo"), "INT");
}
public void testEvaluateExternal() throws HiveException, IOException {
- assertKind("http://www.google.com/", RefererClassification.EXTERNAL);
+ RefererClassifierUDF udf = new RefererClassifierUDF();
+ assertEquals("Unknown referer",
udf.evaluate("http://www.google.com/"), "EXT");
}
}
\ No newline at end of file
diff --git a/refinery-job/pom.xml b/refinery-job/pom.xml
index b1940e7..f9d4534 100644
--- a/refinery-job/pom.xml
+++ b/refinery-job/pom.xml
@@ -23,21 +23,29 @@
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
+ <version>${scala.version}</version>
+ <scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.10</artifactId>
+ <version>${spark.version}</version>
+ <scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-hive_2.10</artifactId>
+ <version>${spark.version}</version>
+ <scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.10</artifactId>
+ <version>${spark.version}</version>
+ <scope>provided</scope>
</dependency>
</dependencies>
diff --git a/refinery-tools/pom.xml b/refinery-tools/pom.xml
index 47fe769..818cf5d 100644
--- a/refinery-tools/pom.xml
+++ b/refinery-tools/pom.xml
@@ -15,6 +15,8 @@
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
+ <version>${hadoop.version}</version>
+ <scope>provided</scope>
</dependency>
<dependency>
--
To view, visit https://gerrit.wikimedia.org/r/205833
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: Id3b14d954d1396a8e8667d6865a854ad1167d830
Gerrit-PatchSet: 1
Gerrit-Project: analytics/refinery/source
Gerrit-Branch: master
Gerrit-Owner: Joal <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits