DRILL-743: New String Functions
Project: http://git-wip-us.apache.org/repos/asf/incubator-drill/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-drill/commit/53a89d69 Tree: http://git-wip-us.apache.org/repos/asf/incubator-drill/tree/53a89d69 Diff: http://git-wip-us.apache.org/repos/asf/incubator-drill/diff/53a89d69 Branch: refs/heads/master Commit: 53a89d69dad990be4b64998bfc89d46fadfc8a0a Parents: fed331b Author: Yash Sharma <yash...@gmail.com> Authored: Sat Jul 5 17:50:50 2014 +0530 Committer: Jacques Nadeau <jacq...@apache.org> Committed: Mon Jul 7 15:52:35 2014 -0700 ---------------------------------------------------------------------- .../exec/expr/fn/impl/StringFunctionUtil.java | 15 +++ .../exec/expr/fn/impl/StringFunctions.java | 134 +++++++++++++++++++ .../exec/physical/impl/TestStringFunctions.java | 8 +- .../functions/string/testStringFuncs.json | 44 ++++++ 4 files changed, 198 insertions(+), 3 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-drill/blob/53a89d69/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/StringFunctionUtil.java ---------------------------------------------------------------------- diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/StringFunctionUtil.java b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/StringFunctionUtil.java index 16ff8f2..6825309 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/StringFunctionUtil.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/StringFunctionUtil.java @@ -87,4 +87,19 @@ public class StringFunctionUtil { + " at position " + idx + " encountered while decoding UTF8 string."); } + public static int utf8CharLen(byte currentByte) { + if (currentByte >= 0){ // 1-byte char. First byte is 0xxxxxxx. + return 1; + } + else if ((currentByte & 0xE0) == 0xC0 ){ // 2-byte char. 
First byte is 110xxxxx + return 2; + } + else if ((currentByte & 0xF0) == 0xE0 ){ // 3-byte char. First byte is 1110xxxx + return 3; + } + else if ((currentByte & 0xF8) == 0xF0){ //4-byte char. First byte is 11110xxx + return 4; + } + throw new DrillRuntimeException("Unexpected byte 0x" + Integer.toString((int)currentByte & 0xff, 16) + " encountered while decoding UTF8 string."); + } } http://git-wip-us.apache.org/repos/asf/incubator-drill/blob/53a89d69/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/StringFunctions.java ---------------------------------------------------------------------- diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/StringFunctions.java b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/StringFunctions.java index 92ac56f..fcb3b77 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/StringFunctions.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/StringFunctions.java @@ -33,6 +33,8 @@ import org.apache.drill.exec.expr.holders.VarBinaryHolder; import org.apache.drill.exec.expr.holders.VarCharHolder; import org.apache.drill.exec.expr.holders.NullableVarCharHolder; import org.apache.drill.exec.record.RecordBatch; +import java.nio.charset.Charset; +import org.apache.drill.exec.expr.holders.IntHolder; public class StringFunctions{ static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(StringFunctions.class); @@ -942,4 +944,136 @@ public class StringFunctions{ } } + + /** + * Returns the ASCII code of the first character of input string + */ + @FunctionTemplate(name = "ascii", scope = FunctionScope.SIMPLE, nulls = NullHandling.NULL_IF_NULL) + public static class AsciiString implements DrillSimpleFunc { + + @Param VarCharHolder in; + @Output IntHolder out; + + public void setup(RecordBatch incoming) { } + + public void eval() { + out.value = in.buffer.getByte(in.start); + } + } + + /** + * Returns the char corresponding to 
ASCII code input. + */ + @FunctionTemplate(name = "chr", scope = FunctionScope.SIMPLE, nulls = NullHandling.NULL_IF_NULL) + public static class AsciiToChar implements DrillSimpleFunc { + + @Param IntHolder in; + @Output VarCharHolder out; + @Workspace ByteBuf buffer; + + public void setup(RecordBatch incoming) { + buffer = io.netty.buffer.Unpooled.wrappedBuffer(new byte [1]); + } + + public void eval() { + out.buffer = buffer; + out.start = out.end = 0; + out.buffer.setByte(0, in.value); + ++out.end; + } + } + + /** + * Returns the input char sequences repeated nTimes. + */ + @FunctionTemplate(names = {"repeat", "repeatstr"}, scope = FunctionScope.SIMPLE, nulls = NullHandling.NULL_IF_NULL) + public static class RepeatString implements DrillSimpleFunc { + + @Param VarCharHolder in; + @Param IntHolder nTimes; + @Output VarCharHolder out; + @Workspace ByteBuf buffer; + + public void setup(RecordBatch incoming) { + } + + public void eval() { + int num = nTimes.value; + byte[] bytea = new byte [(in.end - in.start)*num]; + int index = 0; + while(num > 0){ + for (int id = in.start; id < in.end; id++){ + bytea[index++] = in.buffer.getByte(id); + } + num--; + } + out.buffer = io.netty.buffer.Unpooled.wrappedBuffer(bytea); + out.start = 0; + out.end = bytea.length; + } + } + + /** + * Convert string to ASCII from another encoding input. 
+ */ + @FunctionTemplate(name = "toascii", scope = FunctionScope.SIMPLE, nulls = NullHandling.NULL_IF_NULL) + public static class AsciiEndode implements DrillSimpleFunc { + + @Param VarCharHolder in; + @Param VarCharHolder enc; + @Output VarCharHolder out; + @Workspace Charset inCharset; + + public void setup(RecordBatch incoming) { + inCharset = java.nio.charset.Charset.forName(enc.toString()); + } + + public void eval() { + byte[] bytea = new byte[in.end - in.start]; + int index =0; + for(int i = in.start; i<in.end; i++, index++){ + bytea[index]=in.buffer.getByte(i); + } + byte[] outBytea = new String(bytea, inCharset).getBytes(com.google.common.base.Charsets.UTF_8); + out.buffer = io.netty.buffer.Unpooled.wrappedBuffer(outBytea); + out.start = 0; + out.end = outBytea.length; + } + } + + /** + * Returns the reverse string for given input. + */ + @FunctionTemplate(name = "reverse", scope = FunctionScope.SIMPLE, nulls = NullHandling.NULL_IF_NULL) + public static class ReverseString implements DrillSimpleFunc { + + @Param VarCharHolder in; + @Output VarCharHolder out; + @Workspace ByteBuf buffer; + + public void setup(RecordBatch incoming) { + } + + public void eval() { + int charlen = 0; + + byte[] bytea = new byte [in.end - in.start]; + int index = in.end; + int innerindex = 0; + + for (int id = in.start; id < in.end; id+=charlen){ + innerindex = charlen = org.apache.drill.exec.expr.fn.impl.StringFunctionUtil.utf8CharLen(in.buffer, id); + + while(innerindex > 0){ + bytea[index - innerindex] = in.buffer.getByte(id + (charlen - innerindex)); + innerindex-- ; + } + + index -= charlen; + } + out.buffer = io.netty.buffer.Unpooled.wrappedBuffer(bytea); + out.start = 0; + out.end = bytea.length; + } + } } http://git-wip-us.apache.org/repos/asf/incubator-drill/blob/53a89d69/exec/java-exec/src/test/java/org/apache/drill/exec/physical/impl/TestStringFunctions.java ---------------------------------------------------------------------- diff --git 
a/exec/java-exec/src/test/java/org/apache/drill/exec/physical/impl/TestStringFunctions.java b/exec/java-exec/src/test/java/org/apache/drill/exec/physical/impl/TestStringFunctions.java index 3f9ba5e..cb9e749 100644 --- a/exec/java-exec/src/test/java/org/apache/drill/exec/physical/impl/TestStringFunctions.java +++ b/exec/java-exec/src/test/java/org/apache/drill/exec/physical/impl/TestStringFunctions.java @@ -23,7 +23,6 @@ import mockit.Injectable; import mockit.NonStrictExpectations; import org.apache.drill.common.config.DrillConfig; -import org.apache.drill.common.util.TestTools; import org.apache.drill.exec.ExecTest; import org.apache.drill.exec.expr.fn.FunctionImplementationRegistry; import org.apache.drill.exec.memory.TopLevelAllocator; @@ -37,9 +36,7 @@ import org.apache.drill.exec.rpc.user.UserServer; import org.apache.drill.exec.server.DrillbitContext; import org.apache.drill.exec.vector.ValueVector; import org.apache.drill.exec.vector.VarCharVector; -import org.junit.Rule; import org.junit.Test; -import org.junit.rules.TestRule; import com.codahale.metrics.MetricRegistry; import com.google.common.base.Charsets; @@ -243,4 +240,9 @@ public class TestStringFunctions extends ExecTest { runTest(bitContext, connection, expected, "functions/string/testUpper.json"); } + @Test + public void testNewStringFuncs(@Injectable final DrillbitContext bitContext, @Injectable UserServer.UserClientConnection connection) throws Throwable{ + Object [] expected = new Object[] {97, 65, -32, "A", "btrim", "Peace Peace Peace ", "हकुना मताता हकुना मताता ", "katcit", "\u00C3\u00A2pple", "नदम"}; + runTest(bitContext, connection, expected, "functions/string/testStringFuncs.json"); + } } http://git-wip-us.apache.org/repos/asf/incubator-drill/blob/53a89d69/exec/java-exec/src/test/resources/functions/string/testStringFuncs.json ---------------------------------------------------------------------- diff --git 
a/exec/java-exec/src/test/resources/functions/string/testStringFuncs.json b/exec/java-exec/src/test/resources/functions/string/testStringFuncs.json new file mode 100644 index 0000000..4beea3d --- /dev/null +++ b/exec/java-exec/src/test/resources/functions/string/testStringFuncs.json @@ -0,0 +1,44 @@ +{ + head:{ + type:"APACHE_DRILL_PHYSICAL", + version:"1", + generator:{ + type:"manual" + } + }, + graph:[ + { + @id:1, + pop:"mock-sub-scan", + url: "http://apache.org", + entries:[ + {records: 1, types: [ + {name: "varcharcol", type: "VARCHAR", mode: "REQUIRED"}, + {name: "nullvarcharcol", type: "VARCHAR", mode: "OPTIONAL"} + ]} + ] + }, + { + @id:2, + child: 1, + pop:"project", + exprs: [ + { ref : "ref1", expr : " ascii('apache') "}, + { ref : "ref2", expr : " ascii('Apache') "}, + { ref : "ref3", expr : " ascii('अपाचे') "}, + { ref : "ref4", expr : " chr(65) "}, + { ref : "ref5", expr : " btrim('xyxbtrimyyx', 'xy') "}, + { ref : "ref6", expr : " repeatstr('Peace ', 3) "}, + { ref : "ref7", expr : " repeatstr('हकुना मताता ', 2) "}, + { ref : "ref8", expr : " reverse('tictak') "}, + { ref : "ref9", expr : " toascii('âpple','ISO-8859-1') "}, + { ref : "ref10", expr : " reverse('मदन') "} + ] + }, + { + @id: 3, + child: 2, + pop: "screen" + } + ] +}