DRILL-561: Implement btrim/trim. Make use of the existing UTF8 function.
Added test for btrim Project: http://git-wip-us.apache.org/repos/asf/incubator-drill/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-drill/commit/37eb6566 Tree: http://git-wip-us.apache.org/repos/asf/incubator-drill/tree/37eb6566 Diff: http://git-wip-us.apache.org/repos/asf/incubator-drill/diff/37eb6566 Branch: refs/heads/master Commit: 37eb6566e69f853ec18032eee5ec45a5c9caedfe Parents: 3e6ff2c Author: Cliff Buchanan <cbucha...@maprtech.com> Authored: Thu Jun 26 15:45:14 2014 -0700 Committer: Jacques Nadeau <jacq...@apache.org> Committed: Fri Jun 27 10:51:56 2014 -0700 ---------------------------------------------------------------------- .../exec/expr/fn/impl/StringFunctionUtil.java | 2 +- .../exec/expr/fn/impl/StringFunctions.java | 88 +++++++++++--------- .../drill/exec/planner/logical/DrillOptiq.java | 23 +++++ .../exec/physical/impl/TestStringFunctions.java | 8 ++ .../resources/functions/string/testTrim.json | 39 +++++++++ 5 files changed, 119 insertions(+), 41 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-drill/blob/37eb6566/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/StringFunctionUtil.java ---------------------------------------------------------------------- diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/StringFunctionUtil.java b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/StringFunctionUtil.java index 844a3e8..16ff8f2 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/StringFunctionUtil.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/StringFunctionUtil.java @@ -72,7 +72,7 @@ public class StringFunctionUtil { return -1; } - private static int utf8CharLen(ByteBuf buffer, int idx) { + public static int utf8CharLen(ByteBuf buffer, int idx) { byte firstByte = buffer.getByte(idx); if (firstByte >= 0) { // 1-byte char. First byte is 0xxxxxxx. 
return 1; http://git-wip-us.apache.org/repos/asf/incubator-drill/blob/37eb6566/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/StringFunctions.java ---------------------------------------------------------------------- diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/StringFunctions.java b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/StringFunctions.java index 33f2c94..92ac56f 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/StringFunctions.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/StringFunctions.java @@ -716,32 +716,16 @@ public class StringFunctions{ out.buffer = text.buffer; out.start = out.end = text.end; - byte currentByte = 0; - int id = 0; int bytePerChar = 0; //Scan from left of "text", stop until find a char not in "from" - for (id = text.start; id < text.end; ) { - currentByte = text.buffer.getByte(id); - - bytePerChar = 0; - - if (currentByte < 0x128) // 1-byte char. First byte is 0xxxxxxx. - bytePerChar = 1; - else if ((currentByte & 0xE0) == 0xC0 ) // 2-byte char. First byte is 110xxxxx - bytePerChar = 2; - else if ((currentByte & 0xF0) == 0xE0 ) // 3-byte char. First byte is 1110xxxx - bytePerChar = 3; - else if ((currentByte & 0xF8) == 0xF0) //4-byte char. First byte is 11110xxx - bytePerChar = 4; - - //Scan to check if "from" contains the character of "byterPerChar" bytes. + for (int id = text.start; id < text.end; id += bytePerChar) { + bytePerChar = org.apache.drill.exec.expr.fn.impl.StringFunctionUtil.utf8CharLen(text.buffer, id); int pos = org.apache.drill.exec.expr.fn.impl.StringFunctionUtil.stringLeftMatchUTF8(from.buffer, from.start, from.end, text.buffer, id, id + bytePerChar); if (pos < 0) { // Found the 1st char not in "from", stop out.start = id; break; } - id += bytePerChar; //Advance to next character. 
} } // end of eval @@ -765,37 +749,61 @@ public class StringFunctions{ out.buffer = text.buffer; out.start = out.end = text.start; - byte currentByte = 0; - int id = 0; int bytePerChar = 0; //Scan from right of "text", stop until find a char not in "from" - for (id = text.end-1; id>= text.start; ) { - currentByte = text.buffer.getByte(id); - - bytePerChar = 0; - //In UTF-8 encoding, the continuation byte for a multi-byte char is 10xxxxxx. - //Continue back-off to prior byte if it's continuation byte - if ( (currentByte & 0xC0) == 0x80) { - id --; - continue; - } else if (currentByte < 0x128) // 1-byte char. First byte is 0xxxxxxx. - bytePerChar = 1; - else if ((currentByte & 0xE0) == 0xC0 ) // 2-byte char. First byte is 110xxxxx - bytePerChar = 2; - else if ((currentByte & 0xF0) == 0xE0 ) // 3-byte char. First byte is 1110xxxx - bytePerChar = 3; - else if ((currentByte & 0xF8) == 0xF0) //4-byte char. First byte is 11110xxx - bytePerChar = 4; - - //Scan to check if "from" contains the character of "byterPerChar" bytes. The lead byte starts at id. 
+ for (int id = text.end - 1; id >= text.start; id -= bytePerChar) { + while ((text.buffer.getByte(id) & 0xC0) == 0x80 && id >= text.start) id--; + bytePerChar = org.apache.drill.exec.expr.fn.impl.StringFunctionUtil.utf8CharLen(text.buffer, id); int pos = org.apache.drill.exec.expr.fn.impl.StringFunctionUtil.stringLeftMatchUTF8(from.buffer, from.start, from.end, text.buffer, id, id + bytePerChar); if (pos < 0) { // Found the 1st char not in "from", stop out.end = id+ bytePerChar; break; } + } + } // end of eval + } + + /** + * Remove the longest string containing only characters from "from" from the start of "text" + */ + @FunctionTemplate(name = "btrim", scope = FunctionScope.SIMPLE, nulls = NullHandling.NULL_IF_NULL) + public static class Btrim implements DrillSimpleFunc{ + + @Param VarCharHolder text; + @Param VarCharHolder from; + + @Output VarCharHolder out; + + public void setup(RecordBatch incoming){ + } + + public void eval() { + out.buffer = text.buffer; + out.start = out.end = text.start; + int bytePerChar = 0; + + //Scan from left of "text", stop until find a char not in "from" + for (int id = text.start; id < text.end; id += bytePerChar) { + bytePerChar = org.apache.drill.exec.expr.fn.impl.StringFunctionUtil.utf8CharLen(text.buffer, id); + int pos = org.apache.drill.exec.expr.fn.impl.StringFunctionUtil.stringLeftMatchUTF8(from.buffer, from.start, from.end, + text.buffer, id, id + bytePerChar); + if (pos < 0) { // Found the 1st char not in "from", stop + out.start = id; + break; + } + } - id --; // back-off to prior character. 
+ //Scan from right of "text", stop until find a char not in "from" + for (int id = text.end - 1; id >= text.start; id -= bytePerChar) { + while ((text.buffer.getByte(id) & 0xC0) == 0x80 && id >= text.start) id--; + bytePerChar = org.apache.drill.exec.expr.fn.impl.StringFunctionUtil.utf8CharLen(text.buffer, id); + int pos = org.apache.drill.exec.expr.fn.impl.StringFunctionUtil.stringLeftMatchUTF8(from.buffer, from.start, from.end, + text.buffer, id, id + bytePerChar); + if (pos < 0) { // Found the 1st char not in "from", stop + out.end = id+ bytePerChar; + break; + } } } // end of eval } http://git-wip-us.apache.org/repos/asf/incubator-drill/blob/37eb6566/exec/java-exec/src/main/java/org/apache/drill/exec/planner/logical/DrillOptiq.java ---------------------------------------------------------------------- diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/planner/logical/DrillOptiq.java b/exec/java-exec/src/main/java/org/apache/drill/exec/planner/logical/DrillOptiq.java index 21ff421..ff4a7d3 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/planner/logical/DrillOptiq.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/planner/logical/DrillOptiq.java @@ -289,6 +289,29 @@ public class DrillOptiq { default: throw new UnsupportedOperationException("extract function supports the following time units: YEAR, MONTH, DAY, HOUR, MINUTE, SECOND"); } + } else if (functionName.equals("trim")) { + String trimFunc = null; + List<LogicalExpression> trimArgs = Lists.newArrayList(); + + assert args.get(0) instanceof ValueExpressions.QuotedString; + switch (((ValueExpressions.QuotedString)args.get(0)).value.toUpperCase()) { + case "LEADING": + trimFunc = "ltrim"; + break; + case "TRAILING": + trimFunc = "rtrim"; + break; + case "BOTH": + trimFunc = "btrim"; + break; + default: + assert 1 == 0; + } + + trimArgs.add(args.get(2)); + trimArgs.add(args.get(1)); + + return FunctionCallFactory.createExpression(trimFunc, trimArgs); } else if 
(functionName.equals("date_part")) { // Rewrite DATE_PART functions as extract functions // assert that the function has exactly two arguments http://git-wip-us.apache.org/repos/asf/incubator-drill/blob/37eb6566/exec/java-exec/src/test/java/org/apache/drill/exec/physical/impl/TestStringFunctions.java ---------------------------------------------------------------------- diff --git a/exec/java-exec/src/test/java/org/apache/drill/exec/physical/impl/TestStringFunctions.java b/exec/java-exec/src/test/java/org/apache/drill/exec/physical/impl/TestStringFunctions.java index af741a5..3f9ba5e 100644 --- a/exec/java-exec/src/test/java/org/apache/drill/exec/physical/impl/TestStringFunctions.java +++ b/exec/java-exec/src/test/java/org/apache/drill/exec/physical/impl/TestStringFunctions.java @@ -138,6 +138,14 @@ public class TestStringFunctions extends ExecTest { } @Test + public void testTrim(@Injectable final DrillbitContext bitContext, + @Injectable UserServer.UserClientConnection connection) throws Throwable{ + Object [] expected = new Object[] {"fghI", "", "", "!", " aaa "}; + + runTest(bitContext, connection, expected, "functions/string/testTrim.json"); + } + + @Test public void testReplace(@Injectable final DrillbitContext bitContext, @Injectable UserServer.UserClientConnection connection) throws Throwable{ Object [] expected = new Object[] {"aABABcdf", "ABABbABbcdf", "aababcdf", "acdf", "ABCD", "abc"}; http://git-wip-us.apache.org/repos/asf/incubator-drill/blob/37eb6566/exec/java-exec/src/test/resources/functions/string/testTrim.json ---------------------------------------------------------------------- diff --git a/exec/java-exec/src/test/resources/functions/string/testTrim.json b/exec/java-exec/src/test/resources/functions/string/testTrim.json new file mode 100644 index 0000000..6c81f78 --- /dev/null +++ b/exec/java-exec/src/test/resources/functions/string/testTrim.json @@ -0,0 +1,39 @@ +{ + head:{ + type:"APACHE_DRILL_PHYSICAL", + version:"1", + generator:{ + 
type:"manual" + } + }, + graph:[ + { + @id:1, + pop:"mock-sub-scan", + url: "http://apache.org", + entries:[ + {records: 1, types: [ + {name: "varcharcol", type: "VARCHAR", mode: "REQUIRED"}, + {name: "nullvarcharcol", type: "VARCHAR", mode: "OPTIONAL"} + ]} + ] + }, + { + @id:2, + child: 1, + pop:"project", + exprs: [ + { ref: "col1", expr: "btrim(' efghI e', 'e ')"}, + { ref: "col2", expr: "btrim('a', 'a')"}, + { ref: "col3", expr: "btrim('', '')"}, + { ref: "col4", expr: "btrim('aAa!aAa', 'aA')"}, + { ref: "col5", expr: "btrim(' aaa ', '')"} + ] + }, + { + @id: 3, + child: 2, + pop: "screen" + } + ] +}