Nuria has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/327855 )
Change subject: UDF for extracting primary full text search request ...................................................................... UDF for extracting primary full text search request CirrusSearch logs contain each individual request made from mediawiki to elasticsearch. Often times we need to extract only the primary request to do analysis on. It's quite a pain to do in HQL, so this UDF handles it for us. Adds a helper class for dealing with the structs, as I have the feeling there will be more use cases for UDF's that directly handle the list of structs. Bug: T162054 Change-Id: I67d5f0e7674f970b353ab5992fec1431f4592256 --- A refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/CirrusRequestDeser.java M refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GenericUDFHelper.java A refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GetMainSearchRequestUDF.java A refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestGetMainSearchRequestUDF.java 4 files changed, 506 insertions(+), 10 deletions(-) Approvals: jenkins-bot: Verified DCausse: Looks good to me, but someone else must approve Gehel: Looks good to me, but someone else must approve Nuria: Looks good to me, approved diff --git a/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/CirrusRequestDeser.java b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/CirrusRequestDeser.java new file mode 100644 index 0000000..1f39e5e --- /dev/null +++ b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/CirrusRequestDeser.java @@ -0,0 +1,104 @@ +/* + * Copyright (C) 2014 Wikimedia Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.wikimedia.analytics.refinery.hive; + +import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; +import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory; +import org.apache.hadoop.hive.serde2.objectinspector.StructField; +import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector; + +/** + * Helper class for dealing with the array<struct<...>> in + * cirrussearchrequestset. + * + * Currently only extracts querytype and indices, but can be expanded as + * necessary to meet our needs in various UDFs. + */ +class CirrusRequestDeser { + private final StructObjectInspector oi; + + private final StructField queryTypeField; + private final StringObjectInspector queryTypeOI; + + private final StructField indicesField; + private final ListObjectInspector indicesOI; + private final StringObjectInspector indicesElemOI; + + /** + * @param argsHelper + * Helper object to verify hive structures match the expected formats. + * @param i + * Argument index of oi (what position was oi in the argument list + * of the UDF) + * @param oi + * Object inspector for the UDF arguments. This must match the + * CirrusSearchRequest structure in the CirrusSearchRequestSet + * Avro schema. 
+ * + * @throws UDFArgumentTypeException + * Thrown when oi does not match the CirrusSearchRequest structure + * in the CirrusSearchRequestSet Avro schema. + * Thrown when + */ + public CirrusRequestDeser(GenericUDFHelper argsHelper, int i, StructObjectInspector oi) + throws UDFArgumentTypeException { + this.oi = oi; + // verify the 'querytype' field is a string + queryTypeField = oi.getStructFieldRef("querytype"); + if (queryTypeField == null) { + throw new UDFArgumentTypeException(i, + argsHelper.getFuncName() + " Argument should contain a struct with a 'querytype' field"); + } + ObjectInspector queryTypeOI = queryTypeField.getFieldObjectInspector(); + argsHelper.checkArgType(queryTypeOI, i, PrimitiveCategory.STRING); + + this.queryTypeOI = (StringObjectInspector) queryTypeOI; + + // verify the 'indices' field is a list of strings + indicesField = oi.getStructFieldRef("indices"); + if (indicesField == null) { + throw new UDFArgumentTypeException(i, + argsHelper.getFuncName() + " Argument should contain a struct with an 'indices' field"); + } + argsHelper.checkListArgType(indicesField.getFieldObjectInspector(), i, PrimitiveCategory.STRING); + indicesOI = (ListObjectInspector) indicesField.getFieldObjectInspector(); + this.indicesElemOI = (StringObjectInspector) indicesOI.getListElementObjectInspector(); + } + + public String getQueryType(Object struct) { + Object type = oi.getStructFieldData(struct, queryTypeField); + return queryTypeOI.getPrimitiveJavaObject(type); + } + + public String[] getIndices(Object struct) { + Object list = oi.getStructFieldData(struct, indicesField); + if (list == null) { + return null; + } + + int len = indicesOI.getListLength(list); + String[] indices = new String[len]; + int i = 0; + for (Object inner : indicesOI.getList(list)) { + indices[i++] = indicesElemOI.getPrimitiveJavaObject(inner); + } + + return indices; + } +} diff --git a/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GenericUDFHelper.java 
b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GenericUDFHelper.java index ac6fcce..a8c97f1 100644 --- a/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GenericUDFHelper.java +++ b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GenericUDFHelper.java @@ -2,7 +2,9 @@ import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException; import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; +import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory; @@ -58,9 +60,52 @@ */ protected void checkArgPrimitive(ObjectInspector[] arguments, int i) throws UDFArgumentTypeException{ - ObjectInspector.Category oiCat = arguments[i].getCategory(); - if (oiCat != ObjectInspector.Category.PRIMITIVE) { - throw new UDFArgumentTypeException(i, getFuncName() + " Argument should be of primitive type"); + checkArgType(arguments, i, Category.PRIMITIVE); + } + + /** + * Checks argument category type is primitive + * + * @param oi + * @param i + * + * @throws UDFArgumentTypeException + */ + protected void checkArgPrimitive(ObjectInspector oi, int i) throws UDFArgumentTypeException { + checkArgType(oi, i, Category.PRIMITIVE); + } + + /** + * Checks argument type + * + * @param arguments + * @param i + * @param type + * + * @throws UDFArgumentTypeException + */ + protected void checkArgType(ObjectInspector[] arguments, int i, PrimitiveCategory type) + throws UDFArgumentTypeException{ + checkArgType(arguments[i], i, type); + } + + /** + * Checks argument type + * + * @param oi + * @param i + * @param type + * + * @throws UDFArgumentTypeException + */ + protected void checkArgType(ObjectInspector oi, 
int i, PrimitiveCategory type) throws UDFArgumentTypeException { + checkArgPrimitive(oi, i); + PrimitiveCategory primitiveCategory = ((PrimitiveObjectInspector) oi).getPrimitiveCategory(); + + if (primitiveCategory != type) { + throw new UDFArgumentTypeException(0, + "A " + type.getClass().getSimpleName() + " argument was expected for " + + getFuncName() + " but an argument of type " + oi.getTypeName() + " was given."); } } @@ -69,18 +114,83 @@ * * @param arguments * @param i + * @param Category * * @throws UDFArgumentTypeException */ - protected void checkArgType(ObjectInspector[] arguments, int i, PrimitiveCategory type) - throws UDFArgumentTypeException{ + protected void checkArgType(ObjectInspector[] arguments, int i, Category type) + throws UDFArgumentTypeException{ + checkArgType(arguments[i], i, type); + } - PrimitiveCategory primitiveCategory = ((PrimitiveObjectInspector) arguments[i]).getPrimitiveCategory(); + protected void checkArgType(ObjectInspector oi, int i, Category type) + throws UDFArgumentTypeException{ + Category oiCat = oi.getCategory(); + if (oiCat != type) { + throw new UDFArgumentTypeException(i, getFuncName() + " Argument should be of " + + type.getClass().getSimpleName() + " type"); + } + } - if (primitiveCategory != type) { - throw new UDFArgumentTypeException(0, - "A string argument was expected for " + getFuncName() + " but an argument of type " + - arguments[i].getTypeName() + " was given." 
+ /** + * Checks argument type of list and it's elements + * + * @param arguments + * @param i + * @param Category + * Type of list elements + * + * @throws UDFArgumentTypeException + */ + protected void checkListArgType(ObjectInspector[] arguments, int i, Category type) + throws UDFArgumentTypeException { + checkListArgType(arguments[i], i, type); + } + + /** + * Checks argument type of list and it's elements + * + * @param oi + * @param i + * @param type + * Type of list elements + * + * @throws UDFArgumentTypeException + */ + protected void checkListArgType(ObjectInspector oi, int i, Category type) + throws UDFArgumentTypeException { + checkArgType(oi, i, Category.LIST); + + ListObjectInspector listOI = (ListObjectInspector) oi; + if (listOI.getListElementObjectInspector().getCategory() != type) { + throw new UDFArgumentTypeException(i, + "An array<" + type.getClass().getSimpleName() + " argument was expected " + + "for " + getFuncName() + " but an argument of type array<" + + listOI.getListElementObjectInspector().getTypeName() + "> was given." + ); + } + } + + /** + * Checks argument type of list and it's elements + * + * @param oi + * @param i + * @param type + * Type of list elements + * + * @throws UDFArgumentTypeException + */ + protected void checkListArgType(ObjectInspector oi, int i, PrimitiveCategory type) + throws UDFArgumentTypeException { + checkListArgType(oi, i, Category.PRIMITIVE); + ListObjectInspector listOI = (ListObjectInspector) oi; + PrimitiveObjectInspector listElemOI = (PrimitiveObjectInspector) listOI.getListElementObjectInspector(); + if (listElemOI.getPrimitiveCategory() != type) { + throw new UDFArgumentTypeException(i, + "An array<" + type.getClass().getSimpleName() + " argument was expected " + + "for " + getFuncName() + " but an argument of type array<" + + listElemOI.getTypeName() + "> was given." 
); } } diff --git a/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GetMainSearchRequestUDF.java b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GetMainSearchRequestUDF.java new file mode 100644 index 0000000..d96bd00 --- /dev/null +++ b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GetMainSearchRequestUDF.java @@ -0,0 +1,116 @@ +/* + * Copyright (C) 2014 Wikimedia Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.wikimedia.analytics.refinery.hive; + +import java.util.List; + +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDFArgumentException; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.udf.UDFType; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; +import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory; +import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector; + +/** + * Returns the primary full text search request from provided list of requests. 
+ * + * Operates on the specialized format of the `requests` field in the + * wmf_raw.CirrusSearchRequestSets table to locate the primary full text search + * request from a set of requests. When doing a full text search in mediawiki + * there are a number of auxiliary requests performed: An initial near match + * search for similar titles to be redirected to directly, the full text + * searches for the current wiki and a number of its sister wikis. There are + * additionally second try searches that are performed if the first full text + * search performs poorly. These may be a new full text search for a suggested + * term, or a search against a wiki in the same project but in another + * language. The search log contains an entry for every one of these individual + * requests. + * + * Some analytics tasks only need to know about the primary request. This UDF + * takes in the name of the current wiki and the list of requests made, and + * returns a single request from that list, which is the primary full text + * search request. 
+ */ +@Description(name = "get_main_search_request", + value = "_FUNC_(wiki, requests) - Returns the primary full text search request from requests") +@UDFType(deterministic = true) +public class GetMainSearchRequestUDF extends GenericUDF { + final private static String FULL_TEXT = "full_text"; + + private StringObjectInspector wikiOI; + private ListObjectInspector listOI; + private CirrusRequestDeser deser; + + @Override + public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException { + GenericUDFHelper argsHelper = new GenericUDFHelper(); + argsHelper.checkArgsSize(arguments, 2, 2); + + // first argument must be a string + argsHelper.checkArgType(arguments, 0, PrimitiveCategory.STRING); + wikiOI = (StringObjectInspector) arguments[0]; + + // second argument must be an array of structs + argsHelper.checkListArgType(arguments, 1, ObjectInspector.Category.STRUCT); + listOI = (ListObjectInspector) arguments[1]; + StructObjectInspector elemOI = (StructObjectInspector) listOI.getListElementObjectInspector(); + // deser will do reset of the validation + deser = new CirrusRequestDeser(argsHelper, 1, elemOI); + + // Return value is a struct from the list + return elemOI; + } + + @Override + public String getDisplayString(String[] arguments) { + return "GetMainSearchRequest(" + arguments[0] + ", " + arguments[1] + ")"; + } + + @Override + public Object evaluate(DeferredObject[] dos) throws HiveException { + if (dos[0].get() == null || dos[1].get() == null) { + return null; + } + if (listOI.getListLength(dos[1].get()) == 0) { + return null; + } + + List<?> requests = listOI.getList(dos[1].get()); + String wiki = wikiOI.getPrimitiveJavaObject(dos[0].get()); + String prefix = wiki + "_"; + for (Object request : requests) { + String[] indices = deser.getIndices(request); + if (indices == null) { + continue; + } + for (String index : indices) { + // If the request was made against a different wiki + // ignore it, it's likely part of interwiki 
search. + if (index != null && (index.equals(wiki) || index.startsWith(prefix)) + // At least one request must be a full_text query + && FULL_TEXT.equals(deser.getQueryType(request))) { + return request; + } + } + } + + return null; + } +} diff --git a/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestGetMainSearchRequestUDF.java b/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestGetMainSearchRequestUDF.java new file mode 100644 index 0000000..0cb441f --- /dev/null +++ b/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestGetMainSearchRequestUDF.java @@ -0,0 +1,166 @@ +/* + * Copyright (C) 2014 Wikimedia Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.wikimedia.analytics.refinery.hive; + +import static org.junit.Assert.assertEquals; + +import java.util.ArrayList; +import java.util.List; + +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; + +import junitparams.JUnitParamsRunner; +import junitparams.Parameters; + +@RunWith(JUnitParamsRunner.class) +public class TestGetMainSearchRequestUDF { + private GetMainSearchRequestUDF udf; + + @Before + public void setUp() throws HiveException { + udf = new GetMainSearchRequestUDF(); + + List<String> fields = new ArrayList<>(); + List<ObjectInspector> fieldOIs = new ArrayList<>(); + + fields.add("querytype"); + fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector); + + fields.add("indices"); + fieldOIs.add(ObjectInspectorFactory + .getStandardListObjectInspector(PrimitiveObjectInspectorFactory.javaStringObjectInspector)); + + ObjectInspector wikiOI = PrimitiveObjectInspectorFactory.javaStringObjectInspector; + ObjectInspector listOI = ObjectInspectorFactory + .getStandardListObjectInspector(ObjectInspectorFactory.getStandardStructObjectInspector(fields, fieldOIs)); + + udf.initialize(new ObjectInspector[] { wikiOI, listOI }); + } + + @Test + @Parameters(method = "paramsForTest") + public void testEvaluate(String message, int expectIndex, String wiki, Object[] requests) throws HiveException { + GenericUDF.DeferredObject[] dois = { + new GenericUDF.DeferredJavaObject(wiki), new GenericUDF.DeferredJavaObject(requests) }; + Object expect = null; + if (expectIndex >= 0) { + expect = requests[expectIndex]; + } + assertEquals(message, expect, 
udf.evaluate(dois)); + } + + private Object[] paramsForTest() { + return new Object[] { + // Each 'new Object[]' at this level is a single test case + // representing a plausible set of search requests that could be + // logged to the `cirrussearchrequestset` table and the desired + // index of the primary full text search request in that set of + // requests. + new Object[] { + "simplest passing example", 0, "enwiki", + new Object[] { + // This level is a list of requests made between mediawiki + // and elasticsearch in the process of answering a search + // request made of mediawiki. Multiple individual requests + // are issued between mediawiki and elasticsearch, with the + // goal of this UDF being to decide which one of those is + // the primary full text search. + new Object[] { "full_text", new String[] { "enwiki_content" } } + } }, + new Object[] { + // Example of search request to a mediawiki instance configured with + // sister search. With sistersearch a full text request is made against + // one or more wikis with the same language but in a different project. + "including interwiki", 1, "enwiki", + new Object[] { + new Object[] { "near_match", new String[] { "enwiki_content" } }, + new Object[] { "full_text", new String[] { "enwiki_content" } }, + new Object[] { "full_text", new String[] { "enwikibooks_content" } } } }, + new Object[] { + // The order of sister searches doesn't have to be with the local wiki coming + // first. Make sure the UDF appropriately finds the primary request even + // if it comes after the sister search. 
+ "interwiki alternate order", 3, "zhwiki", + new Object[] { + new Object[] { "near_match", new String[] { "zhwiki_content" } }, + new Object[] { "full_text", new String[] { "zhwikisource_content" } }, + new Object[] { "full_text", new String[] { "zhwiktionary_content" } }, + new Object[] { "full_text", new String[] { "zhwiki_content" } } } }, + new Object[] { + // Example of a search request that was run against multimedia + // search. In this case the full text search was performed + // against multiple indices, merging the results from frwiki + // and commonswiki. + "multimedia search", 1, "frwiki", + new Object[] { + new Object[] { "near_match", new String[] { "frwiki_general" } }, + new Object[] { "full_text", new String[] { "frwiki_general", "commonswiki_file" } } } }, + new Object[] { + // Search requests that ask for the 'everything' profile don't + // perform their search against a _content or _general index, + // instead using a top level alias in elasticsearch that refers + // to both. Ensure the UDF can still find the appropriate + // search request. + "search everything", 1, "enwiki", + new Object[] { + new Object[] { "near_match", new String[] { "enwiki" } }, + new Object[] { "full_text", new String[] { "enwiki", "commonswiki_file" } } } }, + new Object[] { + // Autocomplete search requests are not considered full text. + // As such the UDF should return NULL rather than the + // autocomplete request. + "completion suggester", -1, "itwiki", + new Object[] { new Object[] { "comp_suggest", new String[] { "itwiki_titlesuggest" } } } }, + new Object[] { + // Mediawiki's 'Go' feature takes full text searches submitted + // in a particular way and looks for 'close enough' titles. If + // there is a title or redirect very close to the submitted + // string no full text search is performed and the user is + // redirected. 
These requests must be detected as not having a + // primary full text search + "successfull 'go'", -1, "arwiki", + new Object[] { new Object[] { "near_match", new String[] { "arwiki_content" } } } }, + new Object[] { + // This type of request should generally not be recorded, but + // test the UDF with some odd inputs to ensure it is robust to + // the oddities that may occasionally arise. + "invalid record with null querytype", -1, "zhwiki", + new Object[] { new Object[] { null, new String[] { "zhwiki_content" } } } }, + new Object[] { + // This type of request should generally not be recorded, but + // test the UDF with some odd inputs to ensure it is robust to + // the oddities that may occasionally arise. + "invalid record null querytype later", -1, "thwiki", + new Object[] { + new Object[] { "near_match", new String[] { "thwiki_content" } }, + new Object[] { null, new String[] { "thwiki_content" } } } }, + new Object[] { + // This type of request should generally not be recorded, but + // test the UDF with some odd inputs to ensure it is robust to + // the oddities that may occasionally arise. 
+ "invalid record with null indices", -1, "thwiki", + new Object[] { + new Object[] { "near_match", new String[] { "thwiki_content" } }, + new Object[] { "full_text", null } } } }; + } +} -- To view, visit https://gerrit.wikimedia.org/r/327855 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: I67d5f0e7674f970b353ab5992fec1431f4592256 Gerrit-PatchSet: 6 Gerrit-Project: analytics/refinery/source Gerrit-Branch: master Gerrit-Owner: EBernhardson <[email protected]> Gerrit-Reviewer: Bearloga <[email protected]> Gerrit-Reviewer: DCausse <[email protected]> Gerrit-Reviewer: EBernhardson <[email protected]> Gerrit-Reviewer: Gehel <[email protected]> Gerrit-Reviewer: Joal <[email protected]> Gerrit-Reviewer: Nuria <[email protected]> Gerrit-Reviewer: jenkins-bot <> _______________________________________________ MediaWiki-commits mailing list [email protected] https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
