DRILL-561: Implement btrim/trim

Make use of existing UTF8 function

Added test for btrim


Project: http://git-wip-us.apache.org/repos/asf/incubator-drill/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-drill/commit/37eb6566
Tree: http://git-wip-us.apache.org/repos/asf/incubator-drill/tree/37eb6566
Diff: http://git-wip-us.apache.org/repos/asf/incubator-drill/diff/37eb6566

Branch: refs/heads/master
Commit: 37eb6566e69f853ec18032eee5ec45a5c9caedfe
Parents: 3e6ff2c
Author: Cliff Buchanan <cbucha...@maprtech.com>
Authored: Thu Jun 26 15:45:14 2014 -0700
Committer: Jacques Nadeau <jacq...@apache.org>
Committed: Fri Jun 27 10:51:56 2014 -0700

----------------------------------------------------------------------
 .../exec/expr/fn/impl/StringFunctionUtil.java   |  2 +-
 .../exec/expr/fn/impl/StringFunctions.java      | 88 +++++++++++---------
 .../drill/exec/planner/logical/DrillOptiq.java  | 23 +++++
 .../exec/physical/impl/TestStringFunctions.java |  8 ++
 .../resources/functions/string/testTrim.json    | 39 +++++++++
 5 files changed, 119 insertions(+), 41 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-drill/blob/37eb6566/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/StringFunctionUtil.java
----------------------------------------------------------------------
diff --git 
a/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/StringFunctionUtil.java
 
b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/StringFunctionUtil.java
index 844a3e8..16ff8f2 100644
--- 
a/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/StringFunctionUtil.java
+++ 
b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/StringFunctionUtil.java
@@ -72,7 +72,7 @@ public class StringFunctionUtil {
     return -1;
   }
 
-  private static int utf8CharLen(ByteBuf buffer, int idx) {
+  public static int utf8CharLen(ByteBuf buffer, int idx) {
     byte firstByte = buffer.getByte(idx);
     if (firstByte >= 0) { // 1-byte char. First byte is 0xxxxxxx.
       return 1;

http://git-wip-us.apache.org/repos/asf/incubator-drill/blob/37eb6566/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/StringFunctions.java
----------------------------------------------------------------------
diff --git 
a/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/StringFunctions.java
 
b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/StringFunctions.java
index 33f2c94..92ac56f 100644
--- 
a/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/StringFunctions.java
+++ 
b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/StringFunctions.java
@@ -716,32 +716,16 @@ public class StringFunctions{
       out.buffer = text.buffer;
       out.start = out.end = text.end;
 
-      byte currentByte = 0;
-      int id = 0;
       int bytePerChar = 0;
       //Scan from left of "text", stop until find a char not in "from"
-      for (id = text.start; id < text.end; ) {
-        currentByte = text.buffer.getByte(id);
-
-        bytePerChar = 0;
-
-        if (currentByte < 0x128)                 // 1-byte char. First byte is 
0xxxxxxx.
-          bytePerChar = 1;
-        else if ((currentByte & 0xE0) == 0xC0 )   // 2-byte char. First byte 
is 110xxxxx
-          bytePerChar = 2;
-        else if ((currentByte & 0xF0) == 0xE0 )   // 3-byte char. First byte 
is 1110xxxx
-          bytePerChar = 3;
-        else if ((currentByte & 0xF8) == 0xF0)    //4-byte char. First byte is 
11110xxx
-          bytePerChar = 4;
-
-        //Scan to check if "from" contains the character of "byterPerChar" 
bytes.
+      for (int id = text.start; id < text.end; id += bytePerChar) {
+        bytePerChar = 
org.apache.drill.exec.expr.fn.impl.StringFunctionUtil.utf8CharLen(text.buffer, 
id);
         int pos = 
org.apache.drill.exec.expr.fn.impl.StringFunctionUtil.stringLeftMatchUTF8(from.buffer,
 from.start, from.end,
                                                                                
             text.buffer, id, id + bytePerChar);
         if (pos < 0) { // Found the 1st char not in "from", stop
           out.start = id;
           break;
         }
-        id += bytePerChar; //Advance to next character.
       }
     } // end of eval
 
@@ -765,37 +749,61 @@ public class StringFunctions{
       out.buffer = text.buffer;
       out.start = out.end = text.start;
 
-      byte currentByte = 0;
-      int id = 0;
       int bytePerChar = 0;
       //Scan from right of "text", stop until find a char not in "from"
-      for (id = text.end-1; id>=  text.start; ) {
-        currentByte = text.buffer.getByte(id);
-
-        bytePerChar = 0;
-        //In UTF-8 encoding, the continuation byte for a multi-byte char is 
10xxxxxx.
-        //Continue back-off to prior byte if it's continuation byte
-        if ( (currentByte & 0xC0) == 0x80) {
-          id --;
-          continue;
-        } else if (currentByte < 0x128)                 // 1-byte char. First 
byte is 0xxxxxxx.
-          bytePerChar = 1;
-        else if ((currentByte & 0xE0) == 0xC0 )   // 2-byte char. First byte 
is 110xxxxx
-          bytePerChar = 2;
-        else if ((currentByte & 0xF0) == 0xE0 )   // 3-byte char. First byte 
is 1110xxxx
-          bytePerChar = 3;
-        else if ((currentByte & 0xF8) == 0xF0)    //4-byte char. First byte is 
11110xxx
-          bytePerChar = 4;
-
-        //Scan to check if "from" contains the character of "byterPerChar" 
bytes. The lead byte starts at id.
+      for (int id = text.end - 1; id >= text.start; id -= bytePerChar) {
+        while ((text.buffer.getByte(id) & 0xC0) == 0x80 && id >= text.start) 
id--;
+        bytePerChar = 
org.apache.drill.exec.expr.fn.impl.StringFunctionUtil.utf8CharLen(text.buffer, 
id);
         int pos = 
org.apache.drill.exec.expr.fn.impl.StringFunctionUtil.stringLeftMatchUTF8(from.buffer,
 from.start, from.end,
                                                                                
             text.buffer, id, id + bytePerChar);
         if (pos < 0) { // Found the 1st char not in "from", stop
           out.end = id+ bytePerChar;
           break;
         }
+      }
+    } // end of eval
+  }
+
+  /**
+   * Remove the longest string containing only characters from "from" from 
both the start and the end of "text"
+   */
+  @FunctionTemplate(name = "btrim", scope = FunctionScope.SIMPLE, nulls = 
NullHandling.NULL_IF_NULL)
+  public static class Btrim implements DrillSimpleFunc{
+    
+    @Param  VarCharHolder text;
+    @Param  VarCharHolder from;
+    
+    @Output VarCharHolder out;
+
+    public void setup(RecordBatch incoming){
+    }
+
+    public void eval() {
+      out.buffer = text.buffer;
+      out.start = out.end = text.start;
+      int bytePerChar = 0;
+      
+      //Scan from left of "text", stop until find a char not in "from"
+      for (int id = text.start; id < text.end; id += bytePerChar) {
+        bytePerChar = 
org.apache.drill.exec.expr.fn.impl.StringFunctionUtil.utf8CharLen(text.buffer, 
id);
+        int pos = 
org.apache.drill.exec.expr.fn.impl.StringFunctionUtil.stringLeftMatchUTF8(from.buffer,
 from.start, from.end,
+                                                                               
             text.buffer, id, id + bytePerChar);
+        if (pos < 0) { // Found the 1st char not in "from", stop
+          out.start = id; 
+          break;
+        }
+      }
 
-        id --; // back-off to prior character.
+      //Scan from right of "text", stop until find a char not in "from"
+      for (int id = text.end - 1; id >= text.start; id -= bytePerChar) {
+        while ((text.buffer.getByte(id) & 0xC0) == 0x80 && id >= text.start) 
id--;
+        bytePerChar = 
org.apache.drill.exec.expr.fn.impl.StringFunctionUtil.utf8CharLen(text.buffer, 
id);
+        int pos = 
org.apache.drill.exec.expr.fn.impl.StringFunctionUtil.stringLeftMatchUTF8(from.buffer,
 from.start, from.end,
+                                                                               
             text.buffer, id, id + bytePerChar);
+        if (pos < 0) { // Found the 1st char not in "from", stop
+          out.end = id+ bytePerChar; 
+          break;
+        }
       }
     } // end of eval
   }

http://git-wip-us.apache.org/repos/asf/incubator-drill/blob/37eb6566/exec/java-exec/src/main/java/org/apache/drill/exec/planner/logical/DrillOptiq.java
----------------------------------------------------------------------
diff --git 
a/exec/java-exec/src/main/java/org/apache/drill/exec/planner/logical/DrillOptiq.java
 
b/exec/java-exec/src/main/java/org/apache/drill/exec/planner/logical/DrillOptiq.java
index 21ff421..ff4a7d3 100644
--- 
a/exec/java-exec/src/main/java/org/apache/drill/exec/planner/logical/DrillOptiq.java
+++ 
b/exec/java-exec/src/main/java/org/apache/drill/exec/planner/logical/DrillOptiq.java
@@ -289,6 +289,29 @@ public class DrillOptiq {
           default:
             throw new UnsupportedOperationException("extract function supports 
the following time units: YEAR, MONTH, DAY, HOUR, MINUTE, SECOND");
         }
+      } else if (functionName.equals("trim")) {
+        String trimFunc = null;
+        List<LogicalExpression> trimArgs = Lists.newArrayList();
+
+        assert args.get(0) instanceof ValueExpressions.QuotedString;
+        switch 
(((ValueExpressions.QuotedString)args.get(0)).value.toUpperCase()) {
+        case "LEADING":
+          trimFunc = "ltrim";
+          break;
+        case "TRAILING":
+          trimFunc = "rtrim";
+          break;
+        case "BOTH":
+          trimFunc = "btrim";
+          break;
+        default:
+          assert 1 == 0;
+        }
+
+        trimArgs.add(args.get(2));
+        trimArgs.add(args.get(1));
+
+        return FunctionCallFactory.createExpression(trimFunc, trimArgs);
       } else if (functionName.equals("date_part")) {
         // Rewrite DATE_PART functions as extract functions
         // assert that the function has exactly two arguments

http://git-wip-us.apache.org/repos/asf/incubator-drill/blob/37eb6566/exec/java-exec/src/test/java/org/apache/drill/exec/physical/impl/TestStringFunctions.java
----------------------------------------------------------------------
diff --git 
a/exec/java-exec/src/test/java/org/apache/drill/exec/physical/impl/TestStringFunctions.java
 
b/exec/java-exec/src/test/java/org/apache/drill/exec/physical/impl/TestStringFunctions.java
index af741a5..3f9ba5e 100644
--- 
a/exec/java-exec/src/test/java/org/apache/drill/exec/physical/impl/TestStringFunctions.java
+++ 
b/exec/java-exec/src/test/java/org/apache/drill/exec/physical/impl/TestStringFunctions.java
@@ -138,6 +138,14 @@ public class TestStringFunctions extends ExecTest {
   }
 
   @Test
+  public void testTrim(@Injectable final DrillbitContext bitContext,
+                           @Injectable UserServer.UserClientConnection 
connection) throws Throwable{
+    Object [] expected = new Object[] {"fghI", "", "", "!", " aaa "};
+
+    runTest(bitContext, connection, expected, 
"functions/string/testTrim.json");
+  }
+
+  @Test
   public void testReplace(@Injectable final DrillbitContext bitContext,
                            @Injectable UserServer.UserClientConnection 
connection) throws Throwable{
     Object [] expected = new Object[] {"aABABcdf", "ABABbABbcdf", "aababcdf", 
"acdf", "ABCD", "abc"};

http://git-wip-us.apache.org/repos/asf/incubator-drill/blob/37eb6566/exec/java-exec/src/test/resources/functions/string/testTrim.json
----------------------------------------------------------------------
diff --git a/exec/java-exec/src/test/resources/functions/string/testTrim.json 
b/exec/java-exec/src/test/resources/functions/string/testTrim.json
new file mode 100644
index 0000000..6c81f78
--- /dev/null
+++ b/exec/java-exec/src/test/resources/functions/string/testTrim.json
@@ -0,0 +1,39 @@
+{
+   head:{
+        type:"APACHE_DRILL_PHYSICAL",
+        version:"1",
+        generator:{
+            type:"manual"
+        }
+    },
+    graph:[
+        {
+            @id:1,
+            pop:"mock-sub-scan",
+            url: "http://apache.org",
+            entries:[
+               {records: 1, types: [
+                 {name: "varcharcol", type: "VARCHAR", mode: "REQUIRED"},
+                 {name: "nullvarcharcol", type: "VARCHAR", mode: "OPTIONAL"}
+               ]}
+            ]
+        },
+        {
+            @id:2,
+            child: 1,
+            pop:"project",
+            exprs: [
+              { ref: "col1", expr: "btrim('     efghI e', 'e ')"},
+              { ref: "col2", expr: "btrim('a', 'a')"},
+              { ref: "col3", expr: "btrim('', '')"},
+              { ref: "col4", expr: "btrim('aAa!aAa', 'aA')"},
+              { ref: "col5", expr: "btrim(' aaa ', '')"}
+        ]
+        },
+        {
+            @id: 3,
+            child: 2,
+            pop: "screen"
+        }
+    ]
+}

Reply via email to