This is an automated email from the ASF dual-hosted git repository. gparai pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/drill.git
commit bd6d7b11ff3f687f7257b86b17b8fe9e2e134b43 Author: Charles Givre <[email protected]> AuthorDate: Thu Sep 5 10:29:01 2019 -0400 DRILL-7343: Add User-Agent UDFs to Drill closes #1840 --- contrib/udfs/README.md | 56 +++++++ contrib/udfs/pom.xml | 5 + .../apache/drill/exec/udfs/UserAgentFunctions.java | 172 +++++++++++++++++++++ .../drill/exec/udfs/TestUserAgentFunctions.java | 171 ++++++++++++++++++++ 4 files changed, 404 insertions(+) diff --git a/contrib/udfs/README.md b/contrib/udfs/README.md new file mode 100644 index 0000000..c0950e7 --- /dev/null +++ b/contrib/udfs/README.md @@ -0,0 +1,56 @@ +# Drill User Defined Functions + +This `README` documents functions which users have submitted to Apache Drill. + +## User Agent Functions +Drill UDF for parsing User Agent Strings. +This function is based on Niels Basjes Java library for parsing user agent strings which is available here: <https://github.com/nielsbasjes/yauaa>. + +### Usage +The function `parse_user_agent()` takes a user agent string as an argument and returns a map of the available fields. Note that not every field will be present in every user agent string. +``` +SELECT parse_user_agent( columns[0] ) as ua +FROM dfs.`/tmp/data/drill-httpd/ua.csv`; +``` +The query above returns: +``` +{ + "DeviceClass":"Desktop", + "DeviceName":"Macintosh", + "DeviceBrand":"Apple", + "OperatingSystemClass":"Desktop", + "OperatingSystemName":"Mac OS X", + "OperatingSystemVersion":"10.10.1", + "OperatingSystemNameVersion":"Mac OS X 10.10.1", + "LayoutEngineClass":"Browser", + "LayoutEngineName":"Blink", + "LayoutEngineVersion":"39.0", + "LayoutEngineVersionMajor":"39", + "LayoutEngineNameVersion":"Blink 39.0", + "LayoutEngineNameVersionMajor":"Blink 39", + "AgentClass":"Browser", + "AgentName":"Chrome", + "AgentVersion":"39.0.2171.99", + "AgentVersionMajor":"39", + "AgentNameVersion":"Chrome 39.0.2171.99", + "AgentNameVersionMajor":"Chrome 39", + "DeviceCpu":"Intel" +} +``` +The function returns a Drill map, so you can access any of the fields using Drill's table.map.key notation. For example, the query below illustrates how to extract a field from this map and summarize it: + +``` +SELECT uadata.ua.AgentNameVersion AS Browser, +COUNT( * ) AS BrowserCount +FROM ( + SELECT parse_user_agent( columns[0] ) AS ua + FROM dfs.drillworkshop.`user-agents.csv` +) AS uadata +GROUP BY uadata.ua.AgentNameVersion +ORDER BY BrowserCount DESC +``` +The function can also be called with an optional field as an argument. IE: +``` +SELECT parse_user_agent( `user_agent`, 'AgentName` ) as AgentName ... +``` +which will just return the requested field. If the user agent string is empty, all fields will have the value of `Hacker`. diff --git a/contrib/udfs/pom.xml b/contrib/udfs/pom.xml index 38f7dfa..0c0f775 100644 --- a/contrib/udfs/pom.xml +++ b/contrib/udfs/pom.xml @@ -63,6 +63,11 @@ <artifactId>proj4j</artifactId> <version>0.1.0</version> </dependency> + <dependency> + <groupId>nl.basjes.parse.useragent</groupId> + <artifactId>yauaa</artifactId> + <version>5.11</version> + </dependency> <!-- Test dependencies --> <dependency> diff --git a/contrib/udfs/src/main/java/org/apache/drill/exec/udfs/UserAgentFunctions.java b/contrib/udfs/src/main/java/org/apache/drill/exec/udfs/UserAgentFunctions.java new file mode 100644 index 0000000..f684a2d --- /dev/null +++ b/contrib/udfs/src/main/java/org/apache/drill/exec/udfs/UserAgentFunctions.java @@ -0,0 +1,172 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.drill.exec.udfs; + +import io.netty.buffer.DrillBuf; +import org.apache.drill.exec.expr.DrillSimpleFunc; +import org.apache.drill.exec.expr.annotations.FunctionTemplate; +import org.apache.drill.exec.expr.annotations.Output; +import org.apache.drill.exec.expr.annotations.Param; +import org.apache.drill.exec.expr.annotations.Workspace; +import org.apache.drill.exec.expr.holders.NullableVarCharHolder; +import org.apache.drill.exec.expr.holders.VarCharHolder; +import org.apache.drill.exec.vector.complex.writer.BaseWriter; + +import javax.inject.Inject; + +public class UserAgentFunctions { + + @FunctionTemplate(name = "parse_user_agent", + scope = FunctionTemplate.FunctionScope.SIMPLE + ) + public static class UserAgentFunction implements DrillSimpleFunc { + @Param + VarCharHolder input; + + @Output + BaseWriter.ComplexWriter outWriter; + + @Inject + DrillBuf outBuffer; + + @Workspace + nl.basjes.parse.useragent.UserAgentAnalyzerDirect uaa; + + public void setup() { + uaa = nl.basjes.parse.useragent.UserAgentAnalyzerDirect.newBuilder().dropTests().hideMatcherLoadStats().build(); + uaa.getAllPossibleFieldNamesSorted(); + } + + public void eval() { + org.apache.drill.exec.vector.complex.writer.BaseWriter.MapWriter queryMapWriter = outWriter.rootAsMap(); + + String userAgentString = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.getStringFromVarCharHolder(input); + + nl.basjes.parse.useragent.UserAgent agent = uaa.parse(userAgentString); + + for (String fieldName : agent.getAvailableFieldNamesSorted()) { + + org.apache.drill.exec.expr.holders.VarCharHolder rowHolder = new org.apache.drill.exec.expr.holders.VarCharHolder(); + String field = agent.getValue(fieldName); + + byte[] rowStringBytes = field.getBytes(); + outBuffer.reallocIfNeeded(rowStringBytes.length); + outBuffer.setBytes(0, rowStringBytes); + + rowHolder.start = 0; + rowHolder.end = rowStringBytes.length; + rowHolder.buffer = outBuffer; + + queryMapWriter.varChar(fieldName).write(rowHolder); + } + } + } + + @FunctionTemplate(name = "parse_user_agent", + scope = FunctionTemplate.FunctionScope.SIMPLE + ) + public static class NullableUserAgentFunction implements DrillSimpleFunc { + @Param + NullableVarCharHolder input; + + @Output + BaseWriter.ComplexWriter outWriter; + + @Inject + DrillBuf outBuffer; + + @Workspace + nl.basjes.parse.useragent.UserAgentAnalyzerDirect uaa; + + public void setup() { + uaa = nl.basjes.parse.useragent.UserAgentAnalyzerDirect.newBuilder().dropTests().hideMatcherLoadStats().build(); + uaa.getAllPossibleFieldNamesSorted(); + } + + public void eval() { + org.apache.drill.exec.vector.complex.writer.BaseWriter.MapWriter queryMapWriter = outWriter.rootAsMap(); + if (input.isSet == 0) { + // Return empty map + queryMapWriter.start(); + queryMapWriter.end(); + return; + } + String userAgentString = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.getStringFromVarCharHolder(input); + + nl.basjes.parse.useragent.UserAgent agent = uaa.parse(userAgentString); + + for (String fieldName : agent.getAvailableFieldNamesSorted()) { + + org.apache.drill.exec.expr.holders.VarCharHolder rowHolder = new org.apache.drill.exec.expr.holders.VarCharHolder(); + String field = agent.getValue(fieldName); + + byte[] rowStringBytes = field.getBytes(); + outBuffer.reallocIfNeeded(rowStringBytes.length); + outBuffer.setBytes(0, rowStringBytes); + + rowHolder.start = 0; + rowHolder.end = rowStringBytes.length; + rowHolder.buffer = outBuffer; + + queryMapWriter.varChar(fieldName).write(rowHolder); + } + } + } + + @FunctionTemplate(name = "parse_user_agent", + scope = FunctionTemplate.FunctionScope.SIMPLE, nulls = FunctionTemplate.NullHandling.NULL_IF_NULL) + + public static class UserAgentFieldFunction implements DrillSimpleFunc { + @Param + VarCharHolder input; + + @Param + VarCharHolder desiredField; + + @Output + VarCharHolder out; + + @Inject + DrillBuf outBuffer; + + @Workspace + nl.basjes.parse.useragent.UserAgentAnalyzerDirect uaa; + + public void setup() { + uaa = nl.basjes.parse.useragent.UserAgentAnalyzerDirect.newBuilder().dropTests().hideMatcherLoadStats().build(); + uaa.getAllPossibleFieldNamesSorted(); + } + + public void eval() { + String userAgentString = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.getStringFromVarCharHolder(input); + String requestedField = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.getStringFromVarCharHolder(desiredField); + + nl.basjes.parse.useragent.UserAgent agent = uaa.parse(userAgentString); + String field = agent.getValue(requestedField); + + byte[] rowStringBytes = field.getBytes(java.nio.charset.StandardCharsets.UTF_8); + outBuffer.reallocIfNeeded(rowStringBytes.length); + outBuffer.setBytes(0, rowStringBytes); + + out.start = 0; + out.end = rowStringBytes.length; + out.buffer = outBuffer; + } + } +} diff --git a/contrib/udfs/src/test/java/org/apache/drill/exec/udfs/TestUserAgentFunctions.java b/contrib/udfs/src/test/java/org/apache/drill/exec/udfs/TestUserAgentFunctions.java new file mode 100644 index 0000000..efa6708 --- /dev/null +++ b/contrib/udfs/src/test/java/org/apache/drill/exec/udfs/TestUserAgentFunctions.java @@ -0,0 +1,171 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.drill.exec.udfs; + +import org.apache.drill.categories.SqlFunctionTest; +import org.apache.drill.categories.UnlikelyTest; +import org.apache.drill.test.ClusterFixture; +import org.apache.drill.test.ClusterFixtureBuilder; +import org.apache.drill.test.ClusterTest; +import org.junit.BeforeClass; +import org.junit.Test; +import org.junit.experimental.categories.Category; + +import java.util.HashMap; +import java.util.Map; + +@Category({UnlikelyTest.class, SqlFunctionTest.class}) +public class TestUserAgentFunctions extends ClusterTest { + + @BeforeClass + public static void setup() throws Exception { + ClusterFixtureBuilder builder = ClusterFixture.builder(dirTestWatcher); + startCluster(builder); + } + + @Test + public void testParseUserAgentString() throws Exception { + String query = "SELECT t1.ua.DeviceClass AS DeviceClass,\n" + + "t1.ua.DeviceName AS DeviceName,\n" + + "t1.ua.DeviceBrand AS DeviceBrand,\n" + + "t1.ua.DeviceCpuBits AS DeviceCpuBits,\n" + + "t1.ua.OperatingSystemClass AS OperatingSystemClass,\n" + + "t1.ua.OperatingSystemName AS OperatingSystemName,\n" + + "t1.ua.OperatingSystemVersion AS OperatingSystemVersion,\n" + + "t1.ua.OperatingSystemVersionMajor AS OperatingSystemVersionMajor,\n" + + "t1.ua.OperatingSystemNameVersion AS OperatingSystemNameVersion,\n" + + "t1.ua.OperatingSystemNameVersionMajor AS OperatingSystemNameVersionMajor,\n" + + "t1.ua.LayoutEngineClass AS LayoutEngineClass,\n" + + "t1.ua.LayoutEngineName AS LayoutEngineName,\n" + + "t1.ua.LayoutEngineVersion AS LayoutEngineVersion,\n" + + "t1.ua.LayoutEngineVersionMajor AS LayoutEngineVersionMajor,\n" + + "t1.ua.LayoutEngineNameVersion AS LayoutEngineNameVersion,\n" + + "t1.ua.LayoutEngineBuild AS LayoutEngineBuild,\n" + + "t1.ua.AgentClass AS AgentClass,\n" + + "t1.ua.AgentName AS AgentName,\n" + + "t1.ua.AgentVersion AS AgentVersion,\n" + + "t1.ua.AgentVersionMajor AS AgentVersionMajor,\n" + + "t1.ua.AgentNameVersionMajor AS AgentNameVersionMajor,\n" + + "t1.ua.AgentLanguage AS AgentLanguage,\n" + + "t1.ua.AgentLanguageCode AS AgentLanguageCode,\n" + + "t1.ua.AgentSecurity AS AgentSecurity\n" + + "FROM (SELECT parse_user_agent('Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11') AS ua FROM (values(1))) AS t1"; + + testBuilder() + .sqlQuery(query) + .unOrdered() + .baselineColumns("DeviceClass", "DeviceName", "DeviceBrand", "DeviceCpuBits", "OperatingSystemClass", "OperatingSystemName", "OperatingSystemVersion", "OperatingSystemVersionMajor", "OperatingSystemNameVersion", "OperatingSystemNameVersionMajor", "LayoutEngineClass", "LayoutEngineName", "LayoutEngineVersion", "LayoutEngineVersionMajor", "LayoutEngineNameVersion", "LayoutEngineBuild", "AgentClass", "AgentName", "AgentVersion", "AgentVersionMajor", "AgentNameVersionMajor", "AgentLang [...] + .baselineValues("Desktop", "Desktop", "Unknown", "32", "Desktop", "Windows NT", "XP", "XP", "Windows XP", "Windows XP", "Browser", "Gecko", "1.8.1.11", "1", "Gecko 1.8.1.11", "20071127", "Browser", "Firefox", "2.0.0.11", "2", "Firefox 2", "English (United States)", "en-us", "Strong security") + .go(); + } + + @Test + public void testGetHostName() throws Exception { + String query = "SELECT parse_user_agent('Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11', 'AgentSecurity') AS agent FROM " + + "(values(1))"; + testBuilder() + .sqlQuery(query) + .ordered() + .baselineColumns("agent") + .baselineValues("Strong security") + .go(); + } + + @Test + public void testEmptyFieldName() throws Exception { + String query = "SELECT parse_user_agent('Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11', '') AS agent FROM " + "(values" + + "(1))"; + testBuilder() + .sqlQuery(query) + .ordered() + .baselineColumns("agent") + .baselineValues("Unknown") + .go(); + } + + @Test + public void testNullUserAgent() throws Exception { + String query = "SELECT parse_user_agent(CAST(null as VARCHAR)) AS agent FROM (values(1))"; + Map emptyMap = new HashMap(); + testBuilder() + .sqlQuery(query) + .ordered() + .baselineColumns("agent") + .baselineValues(emptyMap) + .go(); + } + + + @Test + public void testEmptyUAStringAndFieldName() throws Exception { + String query = "SELECT parse_user_agent('', '') AS agent FROM (values(1))"; + testBuilder() + .sqlQuery(query) + .ordered() + .baselineColumns("agent") + .baselineValues("Unknown") + .go(); + } + + @Test + public void testNullUAStringAndEmptyFieldName() throws Exception { + String query = "SELECT parse_user_agent(CAST(null as VARCHAR), '') AS agent FROM (values(1))"; + testBuilder() + .sqlQuery(query) + .ordered() + .baselineColumns("agent") + .baselineValues((String) null) + .go(); + } + + @Test + public void testNullUAStringAndNullFieldName() throws Exception { + String query = "SELECT parse_user_agent(CAST(null as VARCHAR), CAST(null as VARCHAR)) AS agent FROM (values(1))"; + testBuilder() + .sqlQuery(query) + .ordered() + .baselineColumns("agent") + .baselineValues((String) null) + .go(); + } + + @Test + public void testNullUAStringAndFieldName() throws Exception { + String query = "SELECT parse_user_agent(CAST(null as VARCHAR), 'AgentSecurity') AS agent FROM (values(1))"; + testBuilder() + .sqlQuery(query) + .ordered() + .baselineColumns("agent") + .baselineValues((String) null) + .go(); + } + + @Test + public void testEmptyUAString() throws Exception { + String query = "SELECT t1.ua.AgentName AS AgentName FROM (SELECT parse_user_agent('') AS ua FROM (values(1))) as t1"; + + // If the UA string is empty, all returned fields default to "Hacker" + testBuilder() + .sqlQuery(query) + .ordered() + .baselineColumns("AgentName") + .baselineValues("Hacker") + .go(); + } +}
