Updated Branches: refs/heads/master ce7cf0d26 -> 31f196497
Add UTF-8 support for substring. Add VARBINARY as a valid type for substring. Project: http://git-wip-us.apache.org/repos/asf/incubator-drill/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-drill/commit/71e8ffeb Tree: http://git-wip-us.apache.org/repos/asf/incubator-drill/tree/71e8ffeb Diff: http://git-wip-us.apache.org/repos/asf/incubator-drill/diff/71e8ffeb Branch: refs/heads/master Commit: 71e8ffebd9c6197ec6bb4eab7ab6bcf503fa0984 Parents: 068a1df Author: Ben Becker <[email protected]> Authored: Sat Aug 3 17:13:48 2013 -0700 Committer: Jacques Nadeau <[email protected]> Committed: Mon Aug 5 16:44:57 2013 -0700 ---------------------------------------------------------------------- .../drill/exec/expr/EvaluationVisitor.java | 16 +- .../drill/exec/expr/fn/impl/BinSubstring.java | 104 ++++++++++++ .../drill/exec/expr/fn/impl/CharSubstring.java | 157 +++++++++++++++++++ .../drill/exec/expr/fn/impl/Substring.java | 99 ------------ .../exec/physical/impl/TestSimpleFunctions.java | 41 +++++ .../functions/testSubstringBinary.json | 37 +++++ 6 files changed, 346 insertions(+), 108 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-drill/blob/71e8ffeb/sandbox/prototype/exec/java-exec/src/main/java/org/apache/drill/exec/expr/EvaluationVisitor.java ---------------------------------------------------------------------- diff --git a/sandbox/prototype/exec/java-exec/src/main/java/org/apache/drill/exec/expr/EvaluationVisitor.java b/sandbox/prototype/exec/java-exec/src/main/java/org/apache/drill/exec/expr/EvaluationVisitor.java index fc1068b..03a2a6b 100644 --- a/sandbox/prototype/exec/java-exec/src/main/java/org/apache/drill/exec/expr/EvaluationVisitor.java +++ b/sandbox/prototype/exec/java-exec/src/main/java/org/apache/drill/exec/expr/EvaluationVisitor.java @@ -1,5 +1,7 @@ package org.apache.drill.exec.expr; +import com.google.common.base.Charsets; +import com.sun.codemodel.*; import org.apache.drill.common.expression.FunctionCall; import org.apache.drill.common.expression.IfExpression; import org.apache.drill.common.expression.IfExpression.IfCondition; @@ -20,12 +22,6 @@ import org.apache.drill.exec.record.selection.SelectionVector2; import org.apache.drill.exec.vector.TypeHelper; import com.google.common.base.Preconditions; -import com.sun.codemodel.JBlock; -import com.sun.codemodel.JClass; -import com.sun.codemodel.JConditional; -import com.sun.codemodel.JExpr; -import com.sun.codemodel.JType; -import com.sun.codemodel.JVar; public class EvaluationVisitor extends AbstractExprVisitor<HoldingContainer, CodeGenerator<?>, RuntimeException> { @@ -158,7 +154,7 @@ public class EvaluationVisitor extends AbstractExprVisitor<HoldingContainer, Cod JConditional jc = block._if(hc.getIsSet().eq(JExpr.lit(0)).not()); block = jc._then(); } - if (hc.getMinorType() == TypeProtos.MinorType.VARCHAR) { + if (hc.getMinorType() == TypeProtos.MinorType.VARCHAR || hc.getMinorType() == TypeProtos.MinorType.VARBINARY) { block.add(vv.invoke("getMutator").invoke("set").arg(JExpr.direct("outIndex")).arg(hc.getHolder())); } else { block.add(vv.invoke("getMutator").invoke("set").arg(JExpr.direct("outIndex")).arg(hc.getValue())); @@ -188,7 +184,8 @@ public class EvaluationVisitor extends AbstractExprVisitor<HoldingContainer, Cod JBlock blk = generator.getBlock(); blk.assign(out.getIsSet(), vv1.invoke("getAccessor").invoke("isSet").arg(JExpr.direct("inIndex"))); JConditional jc = blk._if(out.getIsSet().eq(JExpr.lit(1))); - if (e.getMajorType().getMinorType() == TypeProtos.MinorType.VARCHAR) { + if (e.getMajorType().getMinorType() == TypeProtos.MinorType.VARCHAR || + e.getMajorType().getMinorType() == TypeProtos.MinorType.VARBINARY) { jc._then() .add(vv1.invoke("getAccessor").invoke("get").arg(JExpr.direct("inIndex")).arg(out.getHolder())); } else { @@ -196,7 +193,8 @@ public class EvaluationVisitor extends AbstractExprVisitor<HoldingContainer, Cod .assign(out.getValue(), vv1.invoke("getAccessor").invoke("get").arg(JExpr.direct("inIndex"))); } }else{ - if (e.getMajorType().getMinorType() == TypeProtos.MinorType.VARCHAR) { + if (e.getMajorType().getMinorType() == TypeProtos.MinorType.VARCHAR || + e.getMajorType().getMinorType() == TypeProtos.MinorType.VARBINARY) { generator.getBlock().add(vv1.invoke("getAccessor").invoke("get").arg(JExpr.direct("inIndex")).arg(out.getHolder())); } else { generator.getBlock().assign(out.getValue(), vv1.invoke("getAccessor").invoke("get").arg(JExpr.direct("inIndex"))); http://git-wip-us.apache.org/repos/asf/incubator-drill/blob/71e8ffeb/sandbox/prototype/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/BinSubstring.java ---------------------------------------------------------------------- diff --git a/sandbox/prototype/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/BinSubstring.java b/sandbox/prototype/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/BinSubstring.java new file mode 100644 index 0000000..e608a34 --- /dev/null +++ b/sandbox/prototype/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/BinSubstring.java @@ -0,0 +1,104 @@ +/******************************************************************************* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + ******************************************************************************/ + +package org.apache.drill.exec.expr.fn.impl; + +import org.apache.drill.common.expression.*; +import org.apache.drill.common.types.TypeProtos; +import org.apache.drill.common.types.TypeProtos.MajorType; +import org.apache.drill.common.types.Types; +import org.apache.drill.exec.expr.DrillFunc; +import org.apache.drill.exec.expr.annotations.FunctionTemplate; +import org.apache.drill.exec.expr.annotations.Output; +import org.apache.drill.exec.expr.annotations.Param; +import org.apache.drill.exec.record.RecordBatch; +import org.apache.drill.exec.vector.*; + +// TODO: implement optional length parameter + +/** + * Evaluate a substring expression for a given value; specifying the start + * position, and optionally the end position. + * + * - If the start position is negative, start from abs(start) characters from + * the end of the buffer. + * + * - If no length is specified, continue to the end of the string. + * + * - If the substring expression's length exceeds the value's upward bound, the + * value's length will be used. + * + * - If the substring is invalid, return an empty string. + */ +@FunctionTemplate(name = "binsubstring", + scope = FunctionTemplate.FunctionScope.SIMPLE, + nulls = FunctionTemplate.NullHandling.NULL_IF_NULL) +public class BinSubstring implements DrillFunc { + + @Param VarBinaryHolder string; + @Param BigIntHolder offset; + @Param BigIntHolder length; + @Output VarBinaryHolder out; + + @Override + public void setup(RecordBatch incoming) { } + + @Override + public void eval() { + out.buffer = string.buffer; + + // handle invalid values; e.g. SUBSTRING(value, 0, x) or SUBSTRING(value, x, 0) + if (offset.value == 0 || length.value <= 0) { + + out.start = 0; + out.end = 0; + + } else { + + // handle negative and positive offset values + if (offset.value < 0) + out.start = string.end + (int)offset.value; + else + out.start = (int)offset.value - 1; + + // calculate end position from length and truncate to upper value bounds + if (out.start + length.value > string.end) + out.end = string.end; + else + out.end = out.start + (int)length.value; + + } + } + + public static class Provider implements CallProvider { + + @Override + public FunctionDefinition[] getFunctionDefintions() { + return new FunctionDefinition[] { + FunctionDefinition.simple("binsubstring", + new BasicArgumentValidator(new Arg(Types.required(TypeProtos.MinorType.VARBINARY), + Types.optional(TypeProtos.MinorType.VARBINARY)), + new Arg(false, false, "offset", TypeProtos.MinorType.BIGINT), + new Arg(false, false, "length", TypeProtos.MinorType.BIGINT)), + new OutputTypeDeterminer.SameAsFirstInput(), + "bin_substring", + "bin_substr") + }; + } + } +} http://git-wip-us.apache.org/repos/asf/incubator-drill/blob/71e8ffeb/sandbox/prototype/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/CharSubstring.java ---------------------------------------------------------------------- diff --git a/sandbox/prototype/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/CharSubstring.java b/sandbox/prototype/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/CharSubstring.java new file mode 100644 index 0000000..0fd9648 --- /dev/null +++ b/sandbox/prototype/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/CharSubstring.java @@ -0,0 +1,157 @@ +/******************************************************************************* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + ******************************************************************************/ + +package org.apache.drill.exec.expr.fn.impl; + +import org.apache.drill.common.expression.*; +import org.apache.drill.common.types.TypeProtos; +import org.apache.drill.common.types.Types; +import org.apache.drill.exec.expr.DrillFunc; +import org.apache.drill.exec.expr.annotations.FunctionTemplate; +import org.apache.drill.exec.expr.annotations.Output; +import org.apache.drill.exec.expr.annotations.Param; +import org.apache.drill.exec.record.RecordBatch; +import org.apache.drill.exec.vector.*; + +/** + * Evaluate a substring expression for a given UTF-8 value; specifying the start + * position, and optionally the end position. + * + * - If the start position is negative, start from abs(start) characters from + * the end of the buffer. + * + * - If no length is specified, continue to the end of the string. + * + * - If the substring expression's length exceeds the value's upward bound, the + * value's length will be used. + * + * - If the substring is invalid, return an empty string. + * + * - NOTE: UTF-8 values range from 1 to 4 bytes per character, thus searching for the + * start, length, and negative length may result in 3 partial scans of the + * UTF-8 string. + * + * - TODO: implement optional length parameter + */ +@FunctionTemplate(name = "charsubstring", + scope = FunctionTemplate.FunctionScope.SIMPLE, + nulls = FunctionTemplate.NullHandling.NULL_IF_NULL) +public class CharSubstring implements DrillFunc { + + @Param VarCharHolder string; + @Param BigIntHolder offset; + @Param BigIntHolder length; + @Output VarCharHolder out; + + @Override + public void setup(RecordBatch incoming) { } + + @Override + public void eval() { + out.buffer = string.buffer; + + // handle invalid values; e.g. SUBSTRING(value, 0, x) or SUBSTRING(value, x, 0) + if (offset.value == 0 || length.value <= 0) { + out.start = 0; + out.end = 0; + } else { + + // start iterating over the UTF-8 buffer to find the first character of the substring + int byteCount = 0; + int charCount = 0; + int byteStart = 0; + int charStart = 0; + int byteEnd = 0; + byte currentByte; + while (byteCount < string.end - string.start) { + currentByte = string.buffer.getByte(string.start + byteCount); + // check current position matches the (positive) start position + if (offset.value > 0 && charCount == (int)offset.value - 1) { + byteStart = byteCount; + charStart = charCount; + } + + // check current position matches the supplied length + if (offset.value > 0 && charCount - charStart == (int)length.value) { + byteEnd = byteCount; + break; + } + + if (currentByte < 128) + ++charCount; + ++byteCount; + } + + out.start = string.start + byteStart; + out.end = string.start + byteEnd; + + // search backwards for negative offsets + if (offset.value < 0) { + int endBytePos = --byteCount; + int endCharPos = --charCount; + while (byteCount >= 0) { + currentByte = string.buffer.getByte(byteCount); + + if (endCharPos - charCount == -(int)offset.value) { + // matched the negative start offset + out.start = byteCount; + charCount = 0; + + // search forward until we find <length> characters + while (byteCount <= endBytePos) { + currentByte = string.buffer.getByte(byteCount); + if (currentByte < 128) + ++charCount; + ++byteCount; + if (charCount == (int)length.value) { + out.end = byteCount; + break; + } + } + break; + } + if (currentByte < 128) + --charCount; + --byteCount; + } + } + + // if length exceeds value, stop at end of value + if (out.end == 0) { + out.end = string.end; + } + } + } + + public static class Provider implements CallProvider { + + @Override + public FunctionDefinition[] getFunctionDefintions() { + return new FunctionDefinition[] { + FunctionDefinition.simple("charsubstring", + new BasicArgumentValidator(new Arg(Types.required(TypeProtos.MinorType.VARCHAR), + Types.optional(TypeProtos.MinorType.VARCHAR)), + new Arg(false, false, "offset", TypeProtos.MinorType.BIGINT), + new Arg(false, false, "length", TypeProtos.MinorType.BIGINT)), + new OutputTypeDeterminer.SameAsFirstInput(), + "substring", + "substr") + }; + } + } +} http://git-wip-us.apache.org/repos/asf/incubator-drill/blob/71e8ffeb/sandbox/prototype/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/Substring.java ---------------------------------------------------------------------- diff --git a/sandbox/prototype/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/Substring.java b/sandbox/prototype/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/Substring.java deleted file mode 100644 index 5466159..0000000 --- a/sandbox/prototype/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/Substring.java +++ /dev/null @@ -1,99 +0,0 @@ -/******************************************************************************* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - ******************************************************************************/ - -package org.apache.drill.exec.expr.fn.impl; - -import org.apache.drill.common.expression.ArgumentValidators; -import org.apache.drill.common.expression.CallProvider; -import org.apache.drill.common.expression.FunctionDefinition; -import org.apache.drill.common.expression.OutputTypeDeterminer; -import org.apache.drill.exec.expr.DrillFunc; -import org.apache.drill.exec.expr.annotations.FunctionTemplate; -import org.apache.drill.exec.expr.annotations.Output; -import org.apache.drill.exec.expr.annotations.Param; -import org.apache.drill.exec.record.RecordBatch; -import org.apache.drill.exec.vector.*; - -// TODO: implement optional length parameter -// implement UTF-8 and UTF-16 support - -/** - * Evaluate a substring expression for a given value; specifying the start - * position, and optionally the end position. - * - * - If the start position is negative, start from abs(start) characters from - * the end of the buffer. - * - * - If no length is specified, continue to the end of the string. - * - * - If the substring expression's length exceeds the value's upward bound, the - * value's length will be used. - * - * - If the substring is invalid, return an empty string. - */ -@FunctionTemplate(name = "substring", - scope = FunctionTemplate.FunctionScope.SIMPLE, - nulls = FunctionTemplate.NullHandling.NULL_IF_NULL) -public class Substring implements DrillFunc { - - @Param VarCharHolder string; - @Param BigIntHolder offset; - @Param BigIntHolder length; - @Output VarCharHolder out; - - @Override - public void setup(RecordBatch incoming) { } - - @Override - public void eval() { - out.buffer = string.buffer; - - // handle invalid values; e.g. SUBSTRING(value, 0, x) or SUBSTRING(value, x, 0) - if (offset.value == 0 || length.value <= 0) { - out.start = 0; - out.end = 0; - return; - } - - // handle negative and positive offset values - if (offset.value < 0) - out.start = string.end + (int)offset.value; - else - out.start = (int)offset.value - 1; - - // calculate end position from length and truncate to upper value bounds - if (out.start + length.value > string.end) - out.end = string.end; - else - out.end = out.start + (int)length.value; - } - - public static class Provider implements CallProvider { - - @Override - public FunctionDefinition[] getFunctionDefintions() { - return new FunctionDefinition[] { - FunctionDefinition.simple("substring", - new ArgumentValidators.AnyTypeAllowed(3), - new OutputTypeDeterminer.SameAsFirstInput(), - "substring", - "substr") - }; - } - } -} http://git-wip-us.apache.org/repos/asf/incubator-drill/blob/71e8ffeb/sandbox/prototype/exec/java-exec/src/test/java/org/apache/drill/exec/physical/impl/TestSimpleFunctions.java ---------------------------------------------------------------------- diff --git a/sandbox/prototype/exec/java-exec/src/test/java/org/apache/drill/exec/physical/impl/TestSimpleFunctions.java b/sandbox/prototype/exec/java-exec/src/test/java/org/apache/drill/exec/physical/impl/TestSimpleFunctions.java index 17932e7..4211db1 100644 --- a/sandbox/prototype/exec/java-exec/src/test/java/org/apache/drill/exec/physical/impl/TestSimpleFunctions.java +++ b/sandbox/prototype/exec/java-exec/src/test/java/org/apache/drill/exec/physical/impl/TestSimpleFunctions.java @@ -37,6 +37,8 @@ import org.apache.drill.exec.proto.CoordinationProtos; import org.apache.drill.exec.proto.ExecProtos; import org.apache.drill.exec.rpc.user.UserServer; import org.apache.drill.exec.server.DrillbitContext; +import org.apache.drill.exec.vector.NullableVarBinaryHolder; +import org.apache.drill.exec.vector.NullableVarBinaryVector; import org.apache.drill.exec.vector.NullableVarCharHolder; import org.apache.drill.exec.vector.NullableVarCharVector; import org.junit.After; @@ -182,6 +184,45 @@ public class TestSimpleFunctions { } + @Test + public void testSubstringBinary(@Injectable final DrillbitContext bitContext, + @Injectable UserServer.UserClientConnection connection) throws Throwable{ + + new NonStrictExpectations(){{ + bitContext.getMetrics(); result = new MetricRegistry("test"); + bitContext.getAllocator(); result = BufferAllocator.getAllocator(c); + }}; + + PhysicalPlanReader reader = new PhysicalPlanReader(c, c.getMapper(), CoordinationProtos.DrillbitEndpoint.getDefaultInstance()); + PhysicalPlan plan = reader.readPhysicalPlan(Files.toString(FileUtils.getResourceAsFile("/functions/testSubstringBinary.json"), Charsets.UTF_8)); + FunctionImplementationRegistry registry = new FunctionImplementationRegistry(c); + FragmentContext context = new FragmentContext(bitContext, ExecProtos.FragmentHandle.getDefaultInstance(), connection, null, registry); + SimpleRootExec exec = new SimpleRootExec(ImplCreator.getExec(context, (FragmentRoot) plan.getSortedOperators(false).iterator().next())); + + while(exec.next()){ + NullableVarBinaryVector c1 = exec.getValueVectorById(new SchemaPath("col3", ExpressionPosition.UNKNOWN), NullableVarBinaryVector.class); + NullableVarBinaryVector.Accessor a1; + a1 = c1.getAccessor(); + + int count = 0; + for(int i = 0; i < c1.getAccessor().getValueCount(); i++){ + if (!a1.isNull(i)) { + NullableVarBinaryHolder holder = new NullableVarBinaryHolder(); + a1.get(i, holder); + assertEquals("aa", holder.toString()); + ++count; + } + } + assertEquals(50, count); + } + + if(context.getFailureCause() != null){ + throw context.getFailureCause(); + } + assertTrue(!context.isFailed()); + + } + @After public void tearDown() throws Exception{ // pause to get logger to catch up. http://git-wip-us.apache.org/repos/asf/incubator-drill/blob/71e8ffeb/sandbox/prototype/exec/java-exec/src/test/resources/functions/testSubstringBinary.json ---------------------------------------------------------------------- diff --git a/sandbox/prototype/exec/java-exec/src/test/resources/functions/testSubstringBinary.json b/sandbox/prototype/exec/java-exec/src/test/resources/functions/testSubstringBinary.json new file mode 100644 index 0000000..f68c179 --- /dev/null +++ b/sandbox/prototype/exec/java-exec/src/test/resources/functions/testSubstringBinary.json @@ -0,0 +1,37 @@ +{ + head:{ + type:"APACHE_DRILL_PHYSICAL", + version:"1", + generator:{ + type:"manual" + } + }, + graph:[ + { + @id:1, + pop:"mock-scan", + url: "http://apache.org", + entries:[ + {records: 100, types: [ + {name: "blue", type: "INT", mode: "REQUIRED"}, + {name: "red", type: "BIGINT", mode: "REQUIRED"}, + {name: "yellow", type: "VARBINARY", mode: "OPTIONAL"}, + {name: "green", type: "INT", mode: "REQUIRED"} + ]} + ] + }, + { + @id:2, + child: 1, + pop:"project", + exprs: [ + { ref: "col3", expr:"bin_substring(yellow, -3, 2)" } + ] + }, + { + @id: 3, + child: 2, + pop: "screen" + } + ] +} \ No newline at end of file
