Repository: incubator-drill Updated Branches: refs/heads/master 66cf6e274 -> c305c794a
DRILL-1470 : cast into varchar should recognize the length parameter in varchar. Fix casting function implementation: the length parameter should mean # of chars, not # of bytes. New unit test case to verify the result from cast function. Fix bug in cast into varchar. When target length = 0, it means we want to keep the input . code clean up. Include change for varbinary cast as well. Project: http://git-wip-us.apache.org/repos/asf/incubator-drill/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-drill/commit/73f9827b Tree: http://git-wip-us.apache.org/repos/asf/incubator-drill/tree/73f9827b Diff: http://git-wip-us.apache.org/repos/asf/incubator-drill/diff/73f9827b Branch: refs/heads/master Commit: 73f9827bff061912d4cc81169488eda4dfa17234 Parents: 66cf6e2 Author: Jinfeng Ni <j...@maprtech.com> Authored: Tue Nov 4 07:22:43 2014 -0800 Committer: Jinfeng Ni <j...@maprtech.com> Committed: Tue Nov 11 13:14:59 2014 -0800 ---------------------------------------------------------------------- exec/java-exec/src/main/codegen/data/Casts.tdd | 2 ++ .../CastFunctionsSrcVarLenTargetVarLen.java | 27 ++++++++++++++++-- .../exec/expr/ExpressionTreeMaterializer.java | 14 ++++++---- .../org/apache/drill/TestExampleQueries.java | 29 ++++++++++++++++++++ 4 files changed, 64 insertions(+), 8 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-drill/blob/73f9827b/exec/java-exec/src/main/codegen/data/Casts.tdd ---------------------------------------------------------------------- diff --git a/exec/java-exec/src/main/codegen/data/Casts.tdd b/exec/java-exec/src/main/codegen/data/Casts.tdd index a4f5fb1..cdd6218 100644 --- a/exec/java-exec/src/main/codegen/data/Casts.tdd +++ b/exec/java-exec/src/main/codegen/data/Casts.tdd @@ -50,7 +50,9 @@ {from: "Float8", to: "VarBinary", major: "TargetVarlen", javaType: "Double", bufferLength:"100"}, {from: "VarBinary", to: "VarChar", major: "SrcVarlenTargetVarlen"}, + {from: "VarChar", to: "VarChar", major: "SrcVarlenTargetVarlen"}, {from: "VarChar", to: "VarBinary", major: "SrcVarlenTargetVarlen"}, + {from: "VarBinary", to: "VarBinary", major: "SrcVarlenTargetVarlen"}, {from: "Date", to: "TimeStamp", major: "Date"}, {from: "Date", to: "TimeStampTZ", major: "Date"}, http://git-wip-us.apache.org/repos/asf/incubator-drill/blob/73f9827b/exec/java-exec/src/main/codegen/templates/CastFunctionsSrcVarLenTargetVarLen.java ---------------------------------------------------------------------- diff --git a/exec/java-exec/src/main/codegen/templates/CastFunctionsSrcVarLenTargetVarLen.java b/exec/java-exec/src/main/codegen/templates/CastFunctionsSrcVarLenTargetVarLen.java index 0096be8..cd8f7bd 100644 --- a/exec/java-exec/src/main/codegen/templates/CastFunctionsSrcVarLenTargetVarLen.java +++ b/exec/java-exec/src/main/codegen/templates/CastFunctionsSrcVarLenTargetVarLen.java @@ -47,22 +47,43 @@ public class Cast${type.from}${type.to} implements DrillSimpleFunc{ @Param ${type.from}Holder in; @Param BigIntHolder length; - @Workspace ByteBuf buffer; @Output ${type.to}Holder out; public void setup(RecordBatch incoming) { } public void eval() { + + <#if type.to == 'VarChar'> + + //Do 1st scan to counter # of character in string. + int charCount = org.apache.drill.exec.expr.fn.impl.StringFunctionUtil.getUTF8CharLength(in.buffer, in.start, in.end); + + //if input length <= target_type length, do nothing + //else if target length = 0, it means cast wants all the characters in the input. Do nothing. + //else truncate based on target_type length. + out.buffer = in.buffer; + out.start = in.start; + if (charCount <= length.value || length.value == 0 ) { + out.end = in.end; + } else { + out.end = org.apache.drill.exec.expr.fn.impl.StringFunctionUtil.getUTF8CharPosition(in.buffer, in.start, in.end, (int)length.value); + } + + <#elseif type.to == 'VarBinary'> + //if input length <= target_type length, do nothing - //else truncate based on target_type length. + //else if target length = 0, it means cast wants all the bytes in the input. Do nothing. + //else truncate based on target_type length. out.buffer = in.buffer; out.start = in.start; - if (in.end - in.start <= length.value) { + if (in.end - in.start <= length.value || length.value == 0 ) { out.end = in.end; } else { out.end = out.start + (int) length.value; } + </#if> + } } http://git-wip-us.apache.org/repos/asf/incubator-drill/blob/73f9827b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/ExpressionTreeMaterializer.java ---------------------------------------------------------------------- diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/expr/ExpressionTreeMaterializer.java b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/ExpressionTreeMaterializer.java index 9cec3a4..a2d5e10 100644 --- a/exec/java-exec/src/main/java/org/apache/drill/exec/expr/ExpressionTreeMaterializer.java +++ b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/ExpressionTreeMaterializer.java @@ -523,7 +523,7 @@ public class ExpressionTreeMaterializer { MajorType newMajor = e.getMajorType(); MinorType newMinor = input.getMajorType().getMinorType(); - if (castEqual(e.getPosition(), newMajor, input.getMajorType())) { + if (castEqual(e.getPosition(), input.getMajorType(), newMajor)) { return input; // don't do pointless cast. } @@ -601,11 +601,15 @@ public class ExpressionTreeMaterializer { case VAR16CHAR: case VARBINARY: case VARCHAR: - if (to.getWidth() < from.getWidth() && to.getWidth() > 0) { - this.errorCollector.addGeneralError(pos, "Casting from a longer variable length type to a shorter variable length type is not currently supported."); - return false; - } else { + // We could avoid redundant cast: + // 1) when "to" length is no smaller than "from" length and "from" length is known (>0), + // 2) or "to" length is unknown (0 means unknown length?). + // Case 1 and case 2 mean that cast will do nothing. + // In other cases, cast is required to trim the "from" according to "to" length. + if ( (to.getWidth() >= from.getWidth() && from.getWidth() > 0) || to.getWidth() == 0) { return true; + } else { + return false; } default: http://git-wip-us.apache.org/repos/asf/incubator-drill/blob/73f9827b/exec/java-exec/src/test/java/org/apache/drill/TestExampleQueries.java ---------------------------------------------------------------------- diff --git a/exec/java-exec/src/test/java/org/apache/drill/TestExampleQueries.java b/exec/java-exec/src/test/java/org/apache/drill/TestExampleQueries.java index cd70b3c..e134a73 100644 --- a/exec/java-exec/src/test/java/org/apache/drill/TestExampleQueries.java +++ b/exec/java-exec/src/test/java/org/apache/drill/TestExampleQueries.java @@ -542,4 +542,33 @@ public class TestExampleQueries extends BaseTestQuery{ assertEquals(String.format("Received unexepcted number of rows in output: expected=%d, received=%s", expectedRecordCount, actualRecordCount), expectedRecordCount, actualRecordCount); } + + @Test // DRILL-1470 + public void testCastToVarcharWithLength() throws Exception { + // cast from varchar with unknown length to a fixed length. + int actualRecordCount = testSql("select first_name from cp.`employee.json` where cast(first_name as varchar(2)) = 'Sh'"); + int expectedRecordCount = 27; + assertEquals(String.format("Received unexepcted number of rows in output: expected=%d, received=%s", + expectedRecordCount, actualRecordCount), expectedRecordCount, actualRecordCount); + + // cast from varchar with unknown length to varchar(5), then to varchar(10), then to varchar(2). Should produce the same result as the first query. + actualRecordCount = testSql("select first_name from cp.`employee.json` where cast(cast(cast(first_name as varchar(5)) as varchar(10)) as varchar(2)) = 'Sh'"); + expectedRecordCount = 27; + assertEquals(String.format("Received unexepcted number of rows in output: expected=%d, received=%s", + expectedRecordCount, actualRecordCount), expectedRecordCount, actualRecordCount); + + + // this long nested cast expression should be essentially equal to substr(), meaning the query should return every row in the table. + actualRecordCount = testSql("select first_name from cp.`employee.json` where cast(cast(cast(first_name as varchar(5)) as varchar(10)) as varchar(2)) = substr(first_name, 1, 2)"); + expectedRecordCount = 1155; + assertEquals(String.format("Received unexepcted number of rows in output: expected=%d, received=%s", + expectedRecordCount, actualRecordCount), expectedRecordCount, actualRecordCount); + + // cast is applied to a column from parquet file. + actualRecordCount = testSql("select n_name from cp.`tpch/nation.parquet` where cast(n_name as varchar(2)) = 'UN'"); + expectedRecordCount = 2; + assertEquals(String.format("Received unexepcted number of rows in output: expected=%d, received=%s", + expectedRecordCount, actualRecordCount), expectedRecordCount, actualRecordCount); + } + }