Joal has submitted this change and it was merged.
Change subject: Make pageview definition aware of preview parameter
......................................................................
Make pageview definition aware of preview parameter
If the X-Analytics header includes the 'preview' tag,
the request should not be counted as a pageview.
To make this change backwards compatible
and allow for a variable number of arguments,
IsPageviewUDF and IsAppPageviewUDF need to extend GenericUDF.
Both UDFs should have the deterministic annotation.
Change-Id: I54190f72755810bd2287a05ce7bfc8cfe40f6a42
TT: T109383
---
M
refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/PageviewDefinition.java
M
refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestPageview.java
A
refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GenericUDFHelper.java
M
refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsAppPageviewUDF.java
M
refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsPageviewUDF.java
D
refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsAppPageviewUDF.java
M
refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsPageviewUDF.java
7 files changed, 524 insertions(+), 176 deletions(-)
Approvals:
Joal: Verified; Looks good to me, approved
diff --git
a/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/PageviewDefinition.java
b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/PageviewDefinition.java
index 63f6bb9..7bb8fbb 100644
---
a/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/PageviewDefinition.java
+++
b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/PageviewDefinition.java
@@ -129,6 +129,7 @@
* to see if the request is an app pageview, but not
* (for example) whether it actually completed.
*
+ *
* @param uriPath Path portion of the URI
* @param uriQuery Query portion of the URI
* @param userAgent User-Agent of the requestor
@@ -142,11 +143,56 @@
String userAgent
) {
+ return this.isAppPageview(
+ uriPath,
+ uriQuery,
+ contentType,
+ userAgent,
+ "");
+ }
+ /**
+ * Given a webrequest URI path, query and user agent,
+ * returns true if we consider this an app (API) pageview.
+ * Note that the logic here is /NOT COMPLETE/. It checks
+ * to see if the request is an app pageview, but not
+ * (for example) whether it actually completed.
+ *
+ * See: https://wikitech.wikimedia.org/wiki/X-Analytics#Keys
+ * for x-analytics info.
+ *
+ * Please note that requests tagged as 'preview' are not counted
+ * as pageviews.
+ *
+ * We use the raw xAnalytics header rather than x_analytics_map
+ * to make sure this function can be applied
+ * to raw data, where the parsing of x-Analytics header into
+ * a map has not yet happened.
+ *
+ * @param uriPath Path portion of the URI
+ * @param uriQuery Query portion of the URI
+ * @param userAgent User-Agent of the requestor
+ * @param rawXAnalyticsHeader String that represents the x-analytics
header
+ *
+ * @return boolean
+ */
+ public boolean isAppPageview(
+ String uriPath,
+ String uriQuery,
+ String contentType,
+ String userAgent,
+ String rawXAnalyticsHeader
+ ) {
+
final String appContentType = "application/json";
final String appUserAgent = "WikipediaApp";
final String appPageURIQuery = "sections=0";
final String iosAppPageURIQuery = "sections=all";
final String iosUserAgent = "iPhone";
+
+ Webrequest wr = Webrequest.getInstance();
+
+ if (!wr.getXAnalyticsValue(rawXAnalyticsHeader,"preview").isEmpty())
+ return false;
return (
Utilities.stringContains(uriPath, uriPathAPI)
@@ -159,9 +205,13 @@
);
}
+
+
+
/**
* Given a webrequest URI host, path, query user agent http status and
content type,
* returns true if we consider this a 'pageview', false otherwise.
+ *
* <p>
* See:
https://meta.wikimedia.org/wiki/Research:Page_view/Generalised_filters
* for information on how to classify a pageview.
@@ -183,35 +233,88 @@
String contentType,
String userAgent
) {
+ return this.isPageview(
+ uriHost,
+ uriPath,
+ uriQuery,
+ httpStatus,
+ contentType,
+ userAgent,
+ ""
+ );
+ }
+
+ /**
+ * Given a webrequest URI host, path, query, user agent, HTTP status and
content type,
+ * returns true if we consider this a 'pageview', false otherwise.
+ * <p>
+ * See:
https://meta.wikimedia.org/wiki/Research:Page_view/Generalised_filters
+ * for information on how to classify a pageview.
+ *
+ * See: https://wikitech.wikimedia.org/wiki/X-Analytics#Keys
+ * for x-analytics info.
+ *
+ * Please note that requests tagged as 'preview' are not counted
+ * as pageviews.
+ *
+ * We use the raw xAnalytics header rather than x_analytics_map
+ * to make sure this function can be applied
+ * to raw data, where the parsing of x-Analytics header into
+ * a map has not yet happened.
+ *
+ * @param uriHost Hostname portion of the URI
+ * @param uriPath Path portion of the URI
+ * @param uriQuery Query portion of the URI
+ * @param httpStatus HTTP request status code
+ * @param contentType Content-Type of the request
+ * @param userAgent User-Agent of the requestor
+ * @param rawXAnalyticsHeader string for xAnalytics header
+ *
+ * @return boolean
+ */
+ public boolean isPageview(
+ String uriHost,
+ String uriPath,
+ String uriQuery,
+ String httpStatus,
+ String contentType,
+ String userAgent,
+ String rawXAnalyticsHeader
+ ) {
uriHost = uriHost.toLowerCase();
+
+ Webrequest wr = Webrequest.getInstance();
+
+ if (!wr.getXAnalyticsValue(rawXAnalyticsHeader,"preview").isEmpty())
+ return false;
return (
// All pageviews have a 200 or 304 HTTP status
httpStatusesSet.contains(httpStatus)
- // check for a regular pageview contentType, or a an API
contentType
- && (
- (contentTypesSet.contains(contentType) &&
!Utilities.stringContains(uriPath, uriPathAPI))
- || isAppPageview(uriPath, uriQuery, contentType, userAgent)
- )
- // A pageview must be from either a wikimedia.org domain,
- // or a 'project' domain, e.g. en.wikipedia.org
- && (
- Utilities.patternIsFound(uriHostWikimediaDomainPattern,
uriHost)
+ // check for a regular pageview contentType, or an API
contentType
+ && (
+ (contentTypesSet.contains(contentType) &&
!Utilities.stringContains(uriPath, uriPathAPI))
+ || isAppPageview(uriPath, uriQuery, contentType,
userAgent, rawXAnalyticsHeader)
+ )
+ // A pageview must be from either a wikimedia.org domain,
+ // or a 'project' domain, e.g. en.wikipedia.org
+ && (
+ Utilities.patternIsFound(uriHostWikimediaDomainPattern,
uriHost)
|| Utilities.patternIsFound(uriHostOtherProjectsPattern,
uriHost)
|| Utilities.patternIsFound(uriHostProjectDomainPattern,
uriHost)
- )
- // Either a pageview's uriPath will match the first pattern,
- // or its uriQuery will match the second
- && (
- Utilities.patternIsFound(uriPathPattern, uriPath)
+ )
+ // Either a pageview's uriPath will match the first pattern,
+ // or its uriQuery will match the second
+ && (
+ Utilities.patternIsFound(uriPathPattern, uriPath)
|| Utilities.patternIsFound(uriQueryPattern, uriQuery)
- )
- // A pageview will not have these Special: pages in the uriPath or
uriQuery
- && !Utilities.patternIsFound(uriPathUnwantedSpecialPagesPattern,
uriPath)
- && !Utilities.patternIsFound(uriQueryUnwantedSpecialPagesPattern,
uriQuery)
- // Edits now come through as text/html. They should not be
included.
- // Luckily the query parameter does not seem to be localised.
- && !Utilities.patternIsFound(uriQueryUnwantedActions, uriQuery)
+ )
+ // A pageview will not have these Special: pages in the
uriPath or uriQuery
+ &&
!Utilities.patternIsFound(uriPathUnwantedSpecialPagesPattern, uriPath)
+ &&
!Utilities.patternIsFound(uriQueryUnwantedSpecialPagesPattern, uriQuery)
+ // Edits now come through as text/html. They should not be
included.
+ // Luckily the query parameter does not seem to be localised.
+ && !Utilities.patternIsFound(uriQueryUnwantedActions, uriQuery)
);
}
diff --git
a/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestPageview.java
b/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestPageview.java
index 6365b20..c567dac 100644
---
a/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestPageview.java
+++
b/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestPageview.java
@@ -14,13 +14,14 @@
package org.wikimedia.analytics.refinery.core;
-import org.junit.Test;
-import org.junit.runner.RunWith;
-import static org.junit.Assert.assertEquals;
-
import junitparams.FileParameters;
import junitparams.JUnitParamsRunner;
import junitparams.mappers.CsvWithHeaderMapper;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
@RunWith(JUnitParamsRunner.class)
public class TestPageview {
@@ -48,6 +49,7 @@
String user_agent
) {
PageviewDefinition PageviewDefinitionInstance =
PageviewDefinition.getInstance();
+
assertEquals(
test_description,
is_pageview,
@@ -85,14 +87,15 @@
String user_agent
) {
PageviewDefinition PageviewDefinitionInstance =
PageviewDefinition.getInstance();
+
assertEquals(
test_description,
is_app_pageview,
PageviewDefinitionInstance.isAppPageview(
- uri_path,
- uri_query,
- content_type,
- user_agent
+ uri_path,
+ uri_query,
+ content_type,
+ user_agent
)
);
}
@@ -190,5 +193,27 @@
);
}
}
+ @Test
+ public void testIsPageviewXAnalyticsPreview(
+ ){
+ String uri_host = "en.wikipedia";
+ String uri_path = "/wiki/Horseshoe%20crab#anchor"; ;
+ String uri_query = "-";
+ String http_status = "200";
+ String content_type = "text/html";
+ String user_agent = "turnip/";
+
+ assertTrue("Preview requests are not pageviews",
PageviewDefinition.getInstance().isPageview(
+ uri_host,
+ uri_path,
+ uri_query,
+ http_status,
+ content_type,
+ user_agent,
+ "{'blah':'1','preview':'1'}"
+
+ ) == false);
+
+ }
}
\ No newline at end of file
diff --git
a/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GenericUDFHelper.java
b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GenericUDFHelper.java
new file mode 100644
index 0000000..b8bc6b3
--- /dev/null
+++
b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GenericUDFHelper.java
@@ -0,0 +1,64 @@
+package org.wikimedia.analytics.refinery.hive;
+
+import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+
+/**
+ * Created by nuria on 9/8/15.
+ *
+ * Encapsulates methods to check arguments for UDFs
+ *
+ * For some reason these are all protected in GenericUDF.java
+ * See:
https://github.com/apache/hive/blob/master/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDF.java
+ */
+public class GenericUDFHelper {
+
+ /**
+ * @return String
+ */
+ protected String getFuncName(){
+ return getClass().getSimpleName().substring(10).toLowerCase();
+ }
+
+ /**
+ * Checks variable argument list, throws exception if not within bounds
+ *
+ * @param arguments
+ * @param min
+ * @param max
+ * @throws UDFArgumentLengthException
+ */
+ protected void checkArgsSize(ObjectInspector[] arguments, int min, int max)
+ throws UDFArgumentLengthException{
+ if (arguments.length < min || arguments.length > max) {
+ StringBuilder sb = new StringBuilder();
+ sb.append(getFuncName());
+ sb.append(" requires ");
+ if (min == max) {
+ sb.append(min);
+ } else {
+ sb.append(min).append("..").append(max);
+ }
+ sb.append(" argument(s), got ");
+ sb.append(arguments.length);
+ throw new UDFArgumentLengthException(sb.toString());
+ }
+ }
+
+ /**
+ * Checks argument type
+ *
+ * @param arguments
+ * @param i
+ *
+ * @throws UDFArgumentTypeException
+ */
+ protected void checkArgPrimitive(ObjectInspector[] arguments, int i)
+ throws UDFArgumentTypeException{
+ ObjectInspector.Category oiCat = arguments[i].getCategory();
+ if (oiCat != ObjectInspector.Category.PRIMITIVE) {
+ throw new UDFArgumentTypeException(i, getFuncName() + " Argument
should be of primitive type");
+ }
+ }
+}
diff --git
a/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsAppPageviewUDF.java
b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsAppPageviewUDF.java
index d97b021..572273f 100644
---
a/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsAppPageviewUDF.java
+++
b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsAppPageviewUDF.java
@@ -1,12 +1,12 @@
/**
* Copyright (C) 2014 Wikimedia Foundation
- *
+ * <p/>
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
- *
+ * <p/>
* http://www.apache.org/licenses/LICENSE-2.0
- *
+ * <p/>
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -16,9 +16,15 @@
package org.wikimedia.analytics.refinery.hive;
-import org.apache.hadoop.hive.ql.exec.UDF;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.udf.UDFType;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
+import
org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+import
org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils;
import org.wikimedia.analytics.refinery.core.PageviewDefinition;
-
/**
* A Hive UDF to classify a Wikimedia webrequest as an app pageview.
@@ -28,6 +34,9 @@
* This is not a /complete/ definition - it was initially a private method. As
a
* result it does not do, for example, HTTP status filtering. See the example
* query below for how to solve for that.
+ *
+ * Note: the last argument (x_analytics) is optional.
+
*
* <p>
* Hive Usage:
@@ -44,26 +53,79 @@
* AND day=7
* AND hour=12
* AND http_status IN ('200','304')
- * AND is_app_pageview(uri_path, uri_query, http_status, content_type,
user_agent)
+ * AND is_app_pageview(uri_path, uri_query, content_type, user_agent,
x_analytics)
* GROUP BY
* LOWER(uri_host)
* ORDER BY cnt desc
* LIMIT 10
* ;
*/
-public class IsAppPageviewUDF extends UDF {
- public boolean evaluate(
- String uriPath,
- String uriQuery,
- String contentType,
- String userAgent
- ) {
- PageviewDefinition pageviewDefinitionInstance =
PageviewDefinition.getInstance();
- return pageviewDefinitionInstance.isAppPageview(
- uriPath,
- uriQuery,
- contentType,
- userAgent
- );
+@UDFType(deterministic = true)
+public class IsAppPageviewUDF extends GenericUDF {
+
+ private int maxArguments = 5;
+ private int minArguments = 4;
+
+ private boolean checkForXAnalytics = false;
+
+ private ObjectInspector[] argumentsOI;
+
+ /**
+ * Executed once per job, checks arguments size.
+ *
+ * Accepts variable number of arguments, last argument being the
+ * raw string that represents the xAnalytics map
+ *
+ * @param arguments
+ * @return
+ * @throws UDFArgumentException
+ */
+ @Override
+ public ObjectInspector initialize(ObjectInspector[] arguments) throws
UDFArgumentException {
+
+ GenericUDFHelper argsHelper = new GenericUDFHelper();
+ // we should have at least 4 arguments (x_analytics is optional)
+ argsHelper.checkArgsSize(arguments, minArguments, maxArguments);
+
+ if (arguments.length > minArguments){
+ checkForXAnalytics = true;
+ }
+
+ for (int i = 0; i < arguments.length; i++) {
+ argsHelper.checkArgPrimitive(arguments, i);
+ }
+
+ argumentsOI = arguments;
+
+ return PrimitiveObjectInspectorFactory.javaBooleanObjectInspector;
+ }
+
+ @Override
+ public Object evaluate(GenericUDF.DeferredObject[] arguments) throws
HiveException{
+
+ String uriPath = PrimitiveObjectInspectorUtils.getString(
+ arguments[0].get(), (PrimitiveObjectInspector) argumentsOI[0]);
+ String uriQuery = PrimitiveObjectInspectorUtils.getString(
+ arguments[1].get(), (PrimitiveObjectInspector) argumentsOI[1]);
+
+ String contentType = PrimitiveObjectInspectorUtils.getString(
+ arguments[2].get(), (PrimitiveObjectInspector) argumentsOI[2]);
+ String userAgent = PrimitiveObjectInspectorUtils.getString(
+ arguments[3].get(), (PrimitiveObjectInspector) argumentsOI[3]);
+
+ String rawXAnalyticsHeader = "";
+
+ if (checkForXAnalytics) {
+ rawXAnalyticsHeader = PrimitiveObjectInspectorUtils.getString(
+ arguments[4].get(), (PrimitiveObjectInspector) argumentsOI[4]);
+ }
+ return PageviewDefinition.getInstance().isAppPageview(uriPath,
uriQuery, contentType, userAgent, rawXAnalyticsHeader);
+
+ }
+
+
+ @Override
+ public String getDisplayString(String[] arguments) {
+ return "isAppPageView(" + arguments.toString() + ")";
}
}
diff --git
a/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsPageviewUDF.java
b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsPageviewUDF.java
index 5314729..d704a4f 100644
---
a/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsPageviewUDF.java
+++
b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsPageviewUDF.java
@@ -1,12 +1,12 @@
/**
* Copyright (C) 2014 Wikimedia Foundation
- *
+ * <p/>
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
- *
+ * <p/>
* http://www.apache.org/licenses/LICENSE-2.0
- *
+ * <p/>
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -17,58 +17,125 @@
package org.wikimedia.analytics.refinery.hive;
import org.apache.hadoop.hive.ql.exec.Description;
-import org.apache.hadoop.hive.ql.exec.UDF;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.udf.UDFType;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
+import
org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+import
org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils;
import org.wikimedia.analytics.refinery.core.PageviewDefinition;
/**
* A Hive UDF to classify a Wikimedia webrequest as a 'pageview'.
* See: https://meta.wikimedia.org/wiki/Research:Page_view/Generalised_filters
- * for information on how to classify a pageview.
+ * for information on how to classify a pageview.
*
- * <p>
+ * Note: the last argument (x_analytics) is optional.
+ *
+ * <p/>
+ * <p/>
* Hive Usage:
- * ADD JAR /path/to/refinery-hive.jar;
- * CREATE TEMPORARY FUNCTION is_pageview AS
- * 'org.wikimedia.analytics.refinery.hive.IsPageviewUDF';
- * SELECT
- * LOWER(uri_host) as uri_host,
- * count(*) as cnt
- * FROM
- * wmf_raw.webrequest
- * WHERE
- * webrequest_source = 'mobile'
- * AND year=2014
- * AND month=12
- * AND day=7
- * AND hour=12
- * AND is_pageview(uri_host, uri_path, uri_query, http_status,
content_type, user_agent)
- * GROUP BY
- * LOWER(uri_host)
- * ORDER BY cnt desc
- * LIMIT 10
- * ;
+ * ADD JAR /path/to/refinery-hive.jar;
+ * CREATE TEMPORARY FUNCTION is_pageview AS
+ * 'org.wikimedia.analytics.refinery.hive.IsPageviewUDF';
+ * SELECT
+ * LOWER(uri_host) as uri_host,
+ * count(*) as cnt
+ * FROM
+ * wmf_raw.webrequest
+ * WHERE
+ * webrequest_source = 'mobile'
+ * AND year=2014
+ * AND month=12
+ * AND day=7
+ * AND hour=12
+ * AND is_pageview(uri_host, uri_path, uri_query, http_status, content_type,
user_agent, [x_analytics_header])
+ * GROUP BY
+ * LOWER(uri_host)
+ * ORDER BY cnt desc
+ * LIMIT 10
+ * ;
*/
@Description(name = "is_pageview",
- value = "_FUNC_(uri_host, uri_path, uri_query, http_status,
content_type, user_agent) - Returns true if the request is a pageview",
+ value = "_FUNC_(uri_host, uri_path, uri_query, http_status,
content_type, user_agent, x_analytics) " +
+ "- Returns true if the request is a pageview",
extended = "")
-public class IsPageviewUDF extends UDF {
- public boolean evaluate(
- String uriHost,
- String uriPath,
- String uriQuery,
- String httpStatus,
- String contentType,
- String userAgent
- ) {
- PageviewDefinition pageviewDefinitionInstance =
PageviewDefinition.getInstance();
- return pageviewDefinitionInstance.isPageview(
- uriHost,
- uriPath,
- uriQuery,
- httpStatus,
- contentType,
- userAgent
- );
+@UDFType(deterministic = true)
+public class IsPageviewUDF extends GenericUDF {
+
+ private ObjectInspector[] argumentsOI;
+
+ private boolean checkForXAnalytics = false;
+ private int maxArguments = 7;
+ private int minArguments = 6;
+
+
+
+ /**
+ * Executed once per job, checks arguments size.
+ *
+ * Accepts variable number of arguments, last argument being the
+ * raw string that represents the xAnalytics map
+ *
+ * @param arguments
+ * @return
+ * @throws UDFArgumentException
+ */
+ @Override
+ public ObjectInspector initialize(ObjectInspector[] arguments) throws
UDFArgumentException {
+
+ GenericUDFHelper argsHelper = new GenericUDFHelper();
+ //at least we should have 6 arguments
+ argsHelper.checkArgsSize(arguments, minArguments, maxArguments);
+
+ if (arguments.length > minArguments){
+ checkForXAnalytics = true;
+ }
+
+ for (int i = 0; i < arguments.length; i++) {
+ argsHelper.checkArgPrimitive(arguments, i);
+ }
+
+ argumentsOI = arguments;
+
+ return PrimitiveObjectInspectorFactory.javaBooleanObjectInspector;
}
+
+ @Override
+ public Object evaluate(DeferredObject[] arguments) throws HiveException{
+
+ String uriHost = PrimitiveObjectInspectorUtils.getString(
+ arguments[0].get(), (PrimitiveObjectInspector) argumentsOI[0]);
+ String uriPath = PrimitiveObjectInspectorUtils.getString(
+ arguments[1].get(), (PrimitiveObjectInspector) argumentsOI[1]);
+ String uriQuery = PrimitiveObjectInspectorUtils.getString(
+ arguments[2].get(), (PrimitiveObjectInspector) argumentsOI[2]);
+ ;
+ String httpStatus = PrimitiveObjectInspectorUtils.getString(
+ arguments[3].get(), (PrimitiveObjectInspector) argumentsOI[3]);
+ String contentType = PrimitiveObjectInspectorUtils.getString(
+ arguments[4].get(), (PrimitiveObjectInspector) argumentsOI[4]);
+ String userAgent = PrimitiveObjectInspectorUtils.getString(
+ arguments[5].get(), (PrimitiveObjectInspector) argumentsOI[5]);
+
+ String rawXAnalyticsHeader = "";
+
+ if (checkForXAnalytics) {
+ rawXAnalyticsHeader = PrimitiveObjectInspectorUtils.getString(
+ arguments[6].get(), (PrimitiveObjectInspector) argumentsOI[6]);
+ }
+ return PageviewDefinition.getInstance().isPageview(uriHost, uriPath,
uriQuery, httpStatus, contentType, userAgent, rawXAnalyticsHeader);
+
+ }
+
+ @Override
+ public String getDisplayString(String[] arguments) {
+ return "isPageView(" + arguments.toString() + ")";
+ }
+
+
+
}
\ No newline at end of file
diff --git
a/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsAppPageviewUDF.java
b/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsAppPageviewUDF.java
deleted file mode 100644
index 3b93b0b..0000000
---
a/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsAppPageviewUDF.java
+++ /dev/null
@@ -1,64 +0,0 @@
-/**
- * Copyright (C) 2014 Wikimedia Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.wikimedia.analytics.refinery.hive;
-
-import org.junit.Test;
-import org.junit.runner.RunWith;
-import static org.junit.Assert.assertEquals;
-
-import junitparams.FileParameters;
-import junitparams.JUnitParamsRunner;
-import junitparams.mappers.CsvWithHeaderMapper;
-
-@RunWith(JUnitParamsRunner.class)
-public class TestIsAppPageviewUDF {
-
- @Test
- @FileParameters(
- value = "../refinery-core/src/test/resources/pageview_test_data.csv",
- mapper = CsvWithHeaderMapper.class
- )
- public void testIsAppPageview(
- String test_description,
- String project,
- String dialect,
- String pageTitle,
- boolean is_pageview,
- boolean is_legacy_pageview,
- boolean is_app_pageview,
- String ip_address,
- String x_forwarded_for,
- String uri_host,
- String uri_path,
- String uri_query,
- String http_status,
- String content_type,
- String user_agent
- ) {
- IsAppPageviewUDF udf = new IsAppPageviewUDF();
-
- assertEquals(
- test_description,
- is_app_pageview,
- udf.evaluate(
- uri_path,
- uri_query,
- content_type,
- user_agent
- )
- );
- }
-}
diff --git
a/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsPageviewUDF.java
b/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsPageviewUDF.java
index bcefdcc..60547db 100644
---
a/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsPageviewUDF.java
+++
b/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsPageviewUDF.java
@@ -15,23 +15,44 @@
*/
package org.wikimedia.analytics.refinery.hive;
-import org.junit.Test;
-import org.junit.runner.RunWith;
-import static org.junit.Assert.assertEquals;
-
import junitparams.FileParameters;
import junitparams.JUnitParamsRunner;
import junitparams.mappers.CsvWithHeaderMapper;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
+import
org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+import org.junit.Before;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
@RunWith(JUnitParamsRunner.class)
public class TestIsPageviewUDF {
+ IsPageviewUDF udf = null;
+ ObjectInspector[] initArguments = null;
+
+ @Before
+ public void setUp() throws HiveException{
+ udf = new IsPageviewUDF();
+
+ ObjectInspector valueOI =
PrimitiveObjectInspectorFactory.javaStringObjectInspector;
+ initArguments = new ObjectInspector[]{valueOI, valueOI, valueOI,
valueOI, valueOI, valueOI, valueOI};
+ udf.initialize(initArguments);
+ }
@Test
@FileParameters(
value = "../refinery-core/src/test/resources/pageview_test_data.csv",
mapper = CsvWithHeaderMapper.class
)
+ // this mapper cannot deal with reading strings formed like
+ // the x analytics map: "{"WMF-Last-Access":"12-Aug-2015","https":"1"}
public void testIsPageview(
String test_description,
String project,
@@ -48,20 +69,90 @@
String http_status,
String content_type,
String user_agent
- ) {
- IsPageviewUDF udf = new IsPageviewUDF();
+ ){
- assertEquals(
- test_description,
- is_pageview,
- udf.evaluate(
- uri_host,
- uri_path,
- uri_query,
- http_status,
- content_type,
- user_agent
- )
- );
+ GenericUDF.DeferredJavaObject uri_host_udf = new
GenericUDF.DeferredJavaObject(uri_host);
+ GenericUDF.DeferredJavaObject uri_path_udf = new
GenericUDF.DeferredJavaObject(uri_path);
+ GenericUDF.DeferredJavaObject uri_query_udf = new
GenericUDF.DeferredJavaObject(uri_query);
+ GenericUDF.DeferredJavaObject http_status_udf = new
GenericUDF.DeferredJavaObject(http_status);
+ GenericUDF.DeferredJavaObject content_type_udf = new
GenericUDF.DeferredJavaObject(content_type);
+ GenericUDF.DeferredJavaObject user_agent_udf = new
GenericUDF.DeferredJavaObject(user_agent);
+ GenericUDF.DeferredJavaObject x_analytics_udf = new
GenericUDF.DeferredJavaObject("");
+
+ GenericUDF.DeferredObject[] args = {uri_host_udf, uri_path_udf,
uri_query_udf,
+ http_status_udf, content_type_udf, user_agent_udf,
x_analytics_udf};
+
+
+ try {
+ assertEquals(test_description, is_pageview, udf.evaluate(args));
+ } catch (HiveException e) {
+ e.printStackTrace();
+ }
+
+
}
+
+ @Test
+ public void testIsPageviewXAnalyticsPreview() throws HiveException{
+
+ GenericUDF.DeferredJavaObject uri_host = new
GenericUDF.DeferredJavaObject("en.wikipedia");
+ GenericUDF.DeferredJavaObject uri_path = new
GenericUDF.DeferredJavaObject("/wiki/Horseshoe%20crab#anchor");
+ GenericUDF.DeferredJavaObject uri_query = new
GenericUDF.DeferredJavaObject("-");
+ GenericUDF.DeferredJavaObject http_status = new
GenericUDF.DeferredJavaObject("200");
+ GenericUDF.DeferredJavaObject content_type = new
GenericUDF.DeferredJavaObject("text/html");
+ GenericUDF.DeferredJavaObject user_agent = new
GenericUDF.DeferredJavaObject("turnip");
+ GenericUDF.DeferredJavaObject x_analytics = new
GenericUDF.DeferredJavaObject("{'blah':1,'preview':1}");
+
+ GenericUDF.DeferredObject[] args = {uri_host, uri_path, uri_query,
http_status, content_type, user_agent, x_analytics};
+
+ boolean isPageview = (boolean) udf.evaluate(args);
+
+ assertFalse("Preview requests should not be consider pageviews",
isPageview);
+ }
+
+
+
+ @Test(expected = UDFArgumentLengthException.class)
+ public void testBadNumberOfArgumentsTooFew() throws HiveException{
+
+ ObjectInspector value1 =
PrimitiveObjectInspectorFactory.javaStringObjectInspector;
+ ObjectInspector value2 =
PrimitiveObjectInspectorFactory.javaStringObjectInspector;
+ ObjectInspector[] initArgumentsFew = new ObjectInspector[]{value1,
value2};
+ udf.initialize(initArgumentsFew);
+ }
+
+
+ @Test(expected = UDFArgumentLengthException.class)
+ public void testBadNumberOfArgumentsTooMany() throws HiveException{
+
+ ObjectInspector value1 =
PrimitiveObjectInspectorFactory.javaStringObjectInspector;
+ ObjectInspector value2 =
PrimitiveObjectInspectorFactory.javaStringObjectInspector;
+ ObjectInspector[] initArgumentsTooMany = new ObjectInspector[]{value1,
value2, value1, value2, value1, value2, value1, value2};
+ udf.initialize(initArgumentsTooMany);
+ }
+
+ // UDF should work with variable arguments
+ public void testMinMaxNumberOfArguments() throws HiveException{
+
+ ObjectInspector value1 =
PrimitiveObjectInspectorFactory.javaStringObjectInspector;
+ ObjectInspector value2 =
PrimitiveObjectInspectorFactory.javaStringObjectInspector;
+ ObjectInspector value3 =
PrimitiveObjectInspectorFactory.javaStringObjectInspector;
+ ObjectInspector value4 =
PrimitiveObjectInspectorFactory.javaStringObjectInspector;
+ ObjectInspector value5 =
PrimitiveObjectInspectorFactory.javaStringObjectInspector;
+ ObjectInspector value6 =
PrimitiveObjectInspectorFactory.javaStringObjectInspector;
+
+ ObjectInspector[] initArguments = new ObjectInspector[]{value1,
value2, value3, value4, value5, value6};
+ udf.initialize(initArguments);
+
+ ObjectInspector value7 =
ObjectInspectorFactory.getStandardMapObjectInspector(
+ PrimitiveObjectInspectorFactory.writableStringObjectInspector,
+ PrimitiveObjectInspectorFactory.writableIntObjectInspector
+ );
+
+ ObjectInspector[] initArgumentsMax = new ObjectInspector[]{value1,
value2, value3, value4, value5, value6, value7};
+ udf.initialize(initArgumentsMax);
+
+
+ }
+
}
\ No newline at end of file
--
To view, visit https://gerrit.wikimedia.org/r/237274
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I54190f72755810bd2287a05ce7bfc8cfe40f6a42
Gerrit-PatchSet: 8
Gerrit-Project: analytics/refinery/source
Gerrit-Branch: master
Gerrit-Owner: Nuria <[email protected]>
Gerrit-Reviewer: Joal <[email protected]>
Gerrit-Reviewer: Nuria <[email protected]>
Gerrit-Reviewer: OliverKeyes <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits