EBernhardson has uploaded a new change for review. (
https://gerrit.wikimedia.org/r/327855 )
Change subject: UDF for extracting primary full text search request
......................................................................
UDF for extracting primary full text search request
CirrusSearch logs contain each individual request made from MediaWiki
to Elasticsearch. Oftentimes we need to extract only the primary
request to do analysis on. It's quite a pain to do in HQL, so this
UDF handles it for us.
Adds a helper class for dealing with the structs, as I have the feeling
there will be more use cases for UDFs that directly handle the list of
structs.
Bug: T149047
Change-Id: I67d5f0e7674f970b353ab5992fec1431f4592256
---
A
refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/CirrusRequestDeser.java
M
refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GenericUDFHelper.java
A
refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GetMainSearchRequestUDF.java
D
refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsSearchRequestUDF.java
A
refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestGetMainSearchRequestUDF.java
5 files changed, 319 insertions(+), 48 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/analytics/refinery/source
refs/changes/55/327855/1
diff --git
a/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/CirrusRequestDeser.java
b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/CirrusRequestDeser.java
new file mode 100644
index 0000000..86f3089
--- /dev/null
+++
b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/CirrusRequestDeser.java
@@ -0,0 +1,86 @@
+package org.wikimedia.analytics.refinery.hive;
+
+import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
+import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.StructField;
+import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
+import
org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory;
+import
org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;
+
+/**
+ * Helper class for dealing with the array<struct<...>> in
cirrussearchrequestset.
+ *
+ * Currently only extracts querytype and indices, but can be expanded as
necessary
+ * to meet our needs in various UDFs.
+ */
+class CirrusRequestDeser {
+ private final StructObjectInspector oi;
+
+ private final StructField queryTypeField;
+ private final StringObjectInspector queryTypeOI;
+
+ private final StructField indicesField;
+ private final ListObjectInspector indicesOI;
+ private final StringObjectInspector indicesElemOI;
+
+ public CirrusRequestDeser(StructObjectInspector oi)
+ throws UDFArgumentException {
+ this.oi = oi;
+ // verify the 'querytype' field is a string
+ queryTypeField = oi.getStructFieldRef("querytype");
+ if (queryTypeField == null) {
+ throw new UDFArgumentException("...");
+ }
+ ObjectInspector queryTypeOI = queryTypeField.getFieldObjectInspector();
+ if
(!queryTypeOI.getCategory().equals(ObjectInspector.Category.PRIMITIVE)) {
+ throw new UDFArgumentException("...");
+ }
+ PrimitiveCategory queryTypeCategory = ((PrimitiveObjectInspector)
queryTypeOI).getPrimitiveCategory();
+ if (queryTypeCategory != PrimitiveCategory.STRING) {
+ throw new UDFArgumentException("...");
+ }
+ this.queryTypeOI = (StringObjectInspector)queryTypeOI;
+
+ // verify the 'indices' field is a list of strings
+ indicesField = oi.getStructFieldRef("indices");
+ if (indicesField == null) {
+ throw new UDFArgumentException("...");
+ }
+ if
(!indicesField.getFieldObjectInspector().getCategory().equals(ObjectInspector.Category.LIST))
{
+ throw new UDFArgumentException("...");
+ }
+ indicesOI =
(ListObjectInspector)indicesField.getFieldObjectInspector();
+ if
(!indicesOI.getListElementObjectInspector().getCategory().equals(ObjectInspector.Category.PRIMITIVE))
{
+ throw new UDFArgumentException("...");
+ }
+ PrimitiveObjectInspector indicesElemOI =
(PrimitiveObjectInspector)indicesOI.getListElementObjectInspector();
+ if (indicesElemOI.getPrimitiveCategory() != PrimitiveCategory.STRING) {
+ throw new UDFArgumentException("...");
+ }
+ this.indicesElemOI = (StringObjectInspector)indicesElemOI;
+ }
+
+ public String getQueryType(Object struct) {
+ Object type = oi.getStructFieldData(struct, queryTypeField);
+ return queryTypeOI.getPrimitiveJavaObject(type);
+ }
+
+ public String[] getIndices(Object struct) {
+ Object list = oi.getStructFieldData(struct, indicesField);
+ if (list == null) {
+ return null;
+ }
+
+ int len = indicesOI.getListLength(list);
+ String[] indices = new String[len];
+ int i = 0;
+ for (Object inner: indicesOI.getList(list)) {
+ indices[i] = indicesElemOI.getPrimitiveJavaObject(inner);
+ i++;
+ }
+
+ return indices;
+ }
+}
\ No newline at end of file
diff --git
a/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GenericUDFHelper.java
b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GenericUDFHelper.java
index ac6fcce..db223b9 100644
---
a/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GenericUDFHelper.java
+++
b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GenericUDFHelper.java
@@ -3,6 +3,8 @@
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
+import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import
org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory;
@@ -58,9 +60,29 @@
*/
protected void checkArgPrimitive(ObjectInspector[] arguments, int i)
throws UDFArgumentTypeException{
- ObjectInspector.Category oiCat = arguments[i].getCategory();
- if (oiCat != ObjectInspector.Category.PRIMITIVE) {
- throw new UDFArgumentTypeException(i, getFuncName() + " Argument
should be of primitive type");
+ checkArgType(arguments, i, Category.PRIMITIVE);
+ }
+
+ /**
+ * Checks argument type
+ *
+ * @param arguments
+ * @param i
+ * @param type
+ *
+ * @throws UDFArgumentTypeException
+ */
+ protected void checkArgType(ObjectInspector[] arguments, int i,
PrimitiveCategory type)
+ throws UDFArgumentTypeException{
+
+ PrimitiveCategory primitiveCategory = ((PrimitiveObjectInspector)
arguments[i]).getPrimitiveCategory();
+
+ if (primitiveCategory != type) {
+ throw new UDFArgumentTypeException(0,
+ "A " + type.getClass().getSimpleName() + " argument was
expected for "
+ + getFuncName() + " but an argument of type " +
arguments[i].getTypeName()
+ + " was given."
+ );
}
}
@@ -69,18 +91,38 @@
*
* @param arguments
* @param i
+ * @param Category
*
* @throws UDFArgumentTypeException
*/
- protected void checkArgType(ObjectInspector[] arguments, int i,
PrimitiveCategory type)
- throws UDFArgumentTypeException{
+ protected void checkArgType(ObjectInspector[] arguments, int i, Category
type)
+ throws UDFArgumentTypeException{
+ Category oiCat = arguments[i].getCategory();
+ if (oiCat != type) {
+ throw new UDFArgumentTypeException(i, getFuncName() + " Argument
should be of "
+ + type.getClass().getSimpleName() + " type");
+ }
+ }
+
+ /**
+ * Checks argument type of list and it's elements
+ *
+ * @param arguments
+ * @param i
+ * @param Category Type of list elements
+ *
+ * @throws UDFArgumentTypeException
+ */
+ protected void checkListArgType(ObjectInspector[] arguments, int i,
Category type)
+ throws UDFArgumentTypeException{
+ checkArgType(arguments, i, Category.LIST);
- PrimitiveCategory primitiveCategory = ((PrimitiveObjectInspector)
arguments[i]).getPrimitiveCategory();
-
- if (primitiveCategory != type) {
- throw new UDFArgumentTypeException(0,
- "A string argument was expected for " + getFuncName() + " but
an argument of type " +
- arguments[i].getTypeName() + " was given."
+ ListObjectInspector listOI = (ListObjectInspector) arguments[i];
+ if (listOI.getListElementObjectInspector().getCategory() != type) {
+ throw new UDFArgumentTypeException(i,
+ "An array<" + type.getClass().getSimpleName() + " argument was
expected "
+ + "for " + getFuncName() + " but an argument of type array<"
+ + listOI.getListElementObjectInspector().getTypeName() + ">
was given."
);
}
}
diff --git
a/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GetMainSearchRequestUDF.java
b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GetMainSearchRequestUDF.java
new file mode 100644
index 0000000..f6c7dab
--- /dev/null
+++
b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GetMainSearchRequestUDF.java
@@ -0,0 +1,79 @@
+package org.wikimedia.analytics.refinery.hive;
+
+import java.util.List;
+
+import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
+import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
+import
org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory;
+import
org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;
+
+public class GetMainSearchRequestUDF extends GenericUDF {
+ final private static String FULL_TEXT = "full_text";
+
+ private StringObjectInspector wikiOI;
+ private ListObjectInspector listOI;
+ private CirrusRequestDeser deser;
+
+ public ObjectInspector initialize(ObjectInspector[] arguments)
+ throws UDFArgumentException {
+ GenericUDFHelper argsHelper = new GenericUDFHelper();
+ argsHelper.checkArgsSize(arguments, 2, 2);
+
+ // first argument must be a string
+ argsHelper.checkArgType(arguments, 0, PrimitiveCategory.STRING);
+ wikiOI = (StringObjectInspector) arguments[0];
+
+ // second argument must be an array of structs
+ argsHelper.checkListArgType(arguments, 1,
ObjectInspector.Category.STRUCT);
+ listOI = (ListObjectInspector) arguments[1];
+ StructObjectInspector elemOI = (StructObjectInspector)
listOI.getListElementObjectInspector();
+
+ // deser will do reset of the validation
+ deser = new CirrusRequestDeser(elemOI);
+
+ // Return value is a struct from the list
+ return elemOI;
+ }
+
+ @Override
+ public String getDisplayString(String[] arguments) {
+ return "GetMainSearchRequest(" + arguments[0] + ", " +
arguments[1] + ")";
+ }
+
+ public Object evaluate(DeferredObject[] dos) throws HiveException {
+ if (dos[0].get() == null || dos[1].get() == null) {
+ return null;
+ }
+ int len = listOI.getListLength(dos[1].get());
+ if (len == 0) {
+ return null;
+ }
+
+ List<?> requests = listOI.getList(dos[1].get());
+ String wiki = wikiOI.getPrimitiveJavaObject(dos[0].get());
+ String prefix = wiki + "_";
+ for (Object request: requests) {
+ String[] indices = deser.getIndices(request);
+ if (indices == null) {
+ continue;
+ }
+ for (String index: indices) {
+ // If the request was made against a different wiki
+ // ignore it, it's likely part of interwiki search.
+ if (index != null
+ && (index.equals(wiki) || index.startsWith(prefix))
+ // At least one request must be a full_text query
+ && FULL_TEXT.equals(deser.getQueryType(request))
+ ) {
+ return request;
+ }
+ }
+ }
+
+ return null;
+ }
+}
diff --git
a/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsSearchRequestUDF.java
b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsSearchRequestUDF.java
deleted file mode 100644
index c25b1db..0000000
---
a/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsSearchRequestUDF.java
+++ /dev/null
@@ -1,37 +0,0 @@
-/**
- * Copyright (C) 2014 Wikimedia Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.wikimedia.analytics.refinery.hive;
-
-import org.apache.hadoop.hive.ql.exec.UDF;
-import org.wikimedia.analytics.refinery.core.SearchRequest;
-
-/**
- * A hive UDF to identify in a boolean fashion whether a request
- * is a "search" request or not.
- */
-public class IsSearchRequestUDF extends UDF {
- public boolean evaluate(
- String uriPath,
- String uriQuery
- ) {
- SearchRequest search_inst = SearchRequest.getInstance();
- return search_inst.isSearchRequest(
- uriPath,
- uriQuery
- );
- }
-}
diff --git
a/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestGetMainSearchRequestUDF.java
b/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestGetMainSearchRequestUDF.java
new file mode 100644
index 0000000..14bd90f
--- /dev/null
+++
b/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestGetMainSearchRequestUDF.java
@@ -0,0 +1,101 @@
+package org.wikimedia.analytics.refinery.hive;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
+import
org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+import org.junit.Before;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+
+import static org.junit.Assert.assertEquals;
+import junitparams.Parameters;
+import junitparams.JUnitParamsRunner;
+
+@RunWith(JUnitParamsRunner.class)
+public class TestGetMainSearchRequestUDF {
+ private GetMainSearchRequestUDF udf = null;
+
+ @Before
+ public void setUp() throws HiveException {
+ udf = new GetMainSearchRequestUDF();
+
+ List<String> fields = new ArrayList<>();
+ List<ObjectInspector> fieldOIs = new ArrayList<>();
+
+ fields.add("querytype");
+
fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
+
+ fields.add("indices");
+ fieldOIs.add(ObjectInspectorFactory.getStandardListObjectInspector(
+
PrimitiveObjectInspectorFactory.javaStringObjectInspector));
+
+ ObjectInspector wikiOI =
PrimitiveObjectInspectorFactory.javaStringObjectInspector;
+ ObjectInspector listOI =
ObjectInspectorFactory.getStandardListObjectInspector(
+
ObjectInspectorFactory.getStandardStructObjectInspector(fields, fieldOIs));
+
+ udf.initialize(new ObjectInspector[]{wikiOI, listOI});
+ }
+
+ @Test
+ @Parameters(method="paramsForTest")
+ public void testEvaluate(String message, int expectIndex, String wiki,
Object[] requests) throws HiveException {
+ GenericUDF.DeferredObject[] dois = {
+ new GenericUDF.DeferredJavaObject(wiki),
+ new GenericUDF.DeferredJavaObject(requests)
+ };
+ Object expect = null;
+ if (expectIndex >= 0) {
+ expect = requests[expectIndex];
+ }
+ assertEquals(message, expect, udf.evaluate(dois));
+ }
+
+ private Object[] paramsForTest() {
+ return new Object[] {
+ new Object[]{"simplest passing example", 0, "enwiki", new
Object[] {
+ new Object[]{"full_text", new
String[]{"enwiki_content"}}
+ }},
+ new Object[]{"including interwiki", 1, "enwiki", new Object[] {
+ new Object[]{"near_match", new
String[]{"enwiki_content"}},
+ new Object[]{"full_text", new
String[]{"enwiki_content"}},
+ new Object[]{"full_text", new
String[]{"enwikibooks_content"}}
+ }},
+ new Object[]{"interwiki alternate order", 3, "zhwiki", new
Object[] {
+ new Object[]{"near_match", new
String[]{"zhwiki_content"}},
+ new Object[]{"full_text", new
String[]{"zhwikisource_content"}},
+ new Object[]{"full_text", new
String[]{"zhwiktionary_content"}},
+ new Object[]{"full_text", new
String[]{"zhwiki_content"}}
+ }},
+ new Object[]{"multimedia search", 1, "frwiki", new Object[] {
+ new Object[]{"near_match", new
String[]{"frwiki_general"}},
+ new Object[]{"full_text", new
String[]{"frwiki_general", "commonswiki_file"}}
+ }},
+ new Object[]{"search everything", 1, "enwiki", new Object[] {
+ new Object[]{"near_match", new String[]{"enwiki"}},
+ new Object[]{"full_text", new String[]{"enwiki",
"commonswiki_file"}}
+ }},
+ new Object[]{"completion suggester", -1, "itwiki", new Object[]
{
+ new Object[]{"comp_suggest", new
String[]{"itwiki_titlesuggest"}}
+ }},
+ new Object[]{"successfull 'go'", -1, "arwiki", new Object[] {
+ new Object[]{"near_match", new
String[]{"arwiki_content"}}
+ }},
+ new Object[]{"invalid record with null querytype", -1,
"zhwiki", new Object[] {
+ new Object[]{null, new String[]{"zhwiki_content"}}
+ }},
+ new Object[]{"invalid record null querytype later", -1,
"thwiki", new Object[] {
+ new Object[]{"near_match", new
String[]{"thwiki_content"}},
+ new Object[]{null, new String[]{"thwiki_content"}}
+ }},
+ new Object[]{"invalid record with null indices", -1, "thwiki",
new Object[] {
+ new Object[]{"near_match", new
String[]{"thwiki_content"}},
+ new Object[]{"full_text", null}
+ }}
+ };
+ }
+}
--
To view, visit https://gerrit.wikimedia.org/r/327855
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I67d5f0e7674f970b353ab5992fec1431f4592256
Gerrit-PatchSet: 1
Gerrit-Project: analytics/refinery/source
Gerrit-Branch: master
Gerrit-Owner: EBernhardson <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits