IMPALA-4594: WriteSlot and CodegenWriteSlot handle escaped NULL slots differently
CodegenWriteSlot() receives negative length values for the lengths of the slots passed to it if the slots contain escape characters. (This is currently only for non-string types, as we do not codegen string types with escaped characters). The DelimitedTextParser is responsible for identifying escape characters and assigning the negative lengths appropriately. CodegenWriteCompleteTuple() passes this length to CodegenWriteSlot() as it is. This differs from the behavior of WriteSlot() where the length passed to it is always positive, as all the callers of WriteSlot() make sure of that (including WriteCompleteTuple()). The IrIsNullString() and IrGenericIsNullString() functions are responsibe for checking if the given data contains a NULL pattern. They are called by CodegenWriteSlot(). A NULL pattern usually contains an escaped character which means that the length of that slot will be a negative length. However, the IrIsNullString() and IrGenericIsNullString() that take the same length argument from CodegenWriteSlot() always expect a positive length argument. So, no slots were ever marked as NULL by these NULL-checking functions when codegen was enabled. NULL slots were still detected accidentally because of some incorrect code in CodegenWriteSlot() that marked invalid slots and NULL slots as NULL. Therefore, due to this code, even invalid slots were not marked as invalid and did not return an error. Instead they were just sliently marked as NULL. This patch makes sure that only positive lengths are passed to CodegenWriteSlot() so that NULL checking is correct and it also makes sure that invalid slots are not silently marked as NULL. Testing: Re-enabled an older hdfs-scan-node-errors test. Formatted it to fit new error message format after IMPALA-3859 and IMPALA-3895. Change-Id: I858e427ad7c2b2da8c2bb657be06b7443655781f Reviewed-on: http://gerrit.cloudera.org:8080/5377 Reviewed-by: Sailesh Mukil <[email protected]> Tested-by: Internal Jenkins Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/b00310ca Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/b00310ca Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/b00310ca Branch: refs/heads/hadoop-next Commit: b00310ca897de325c8c6bd67f430885cddd322da Parents: 88448d1 Author: Sailesh Mukil <[email protected]> Authored: Mon Dec 5 17:40:30 2016 -0800 Committer: Internal Jenkins <[email protected]> Committed: Thu Dec 8 08:30:51 2016 +0000 ---------------------------------------------------------------------- be/src/exec/hdfs-scanner.cc | 15 +++ be/src/exec/text-converter.cc | 6 +- .../DataErrorsTest/hdfs-scan-node-errors.test | 122 +++++++++---------- 3 files changed, 75 insertions(+), 68 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b00310ca/be/src/exec/hdfs-scanner.cc ---------------------------------------------------------------------- diff --git a/be/src/exec/hdfs-scanner.cc b/be/src/exec/hdfs-scanner.cc index c6a5f37..281faa6 100644 --- a/be/src/exec/hdfs-scanner.cc +++ b/be/src/exec/hdfs-scanner.cc @@ -457,6 +457,21 @@ Status HdfsScanner::CodegenWriteCompleteTuple(HdfsScanNodeBase* node, Value* data = builder.CreateLoad(data_ptr, "data"); Value* len = builder.CreateLoad(len_ptr, "len"); + // Convert length to positive if it is negative. Negative lengths are assigned to + // slots that contain escape characters. + // TODO: CodegenWriteSlot() currently does not handle text that requres unescaping. + // However, if it is modified to handle that case, we need to detect it here and + // send a 'need_escape' bool to CodegenWriteSlot(), since we are making the length + // positive here. + Value* len_lt_zero = builder.CreateICmpSLT(len, + codegen->GetIntConstant(TYPE_INT, 0), "len_lt_zero"); + Value* ones_compliment_len = builder.CreateNot(len, "ones_compliment_len"); + Value* positive_len = builder.CreateAdd( + ones_compliment_len, codegen->GetIntConstant(TYPE_INT, 1), + "positive_len"); + len = builder.CreateSelect(len_lt_zero, positive_len, len, + "select_positive_len"); + // Call slot parse function Function* slot_fn = slot_fns[slot_idx]; Value* slot_parsed = builder.CreateCall(slot_fn, http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b00310ca/be/src/exec/text-converter.cc ---------------------------------------------------------------------- diff --git a/be/src/exec/text-converter.cc b/be/src/exec/text-converter.cc index cb5d70c..e12be85 100644 --- a/be/src/exec/text-converter.cc +++ b/be/src/exec/text-converter.cc @@ -167,10 +167,8 @@ Function* TextConverter::CodegenWriteSlot(LlvmCodeGen* codegen, if (!slot_desc->type().IsVarLenStringType()) { builder.SetInsertPoint(check_zero_block); - // If len <= 0 and it is not a string col, set slot to NULL - // The len can be less than 0 if the field contained an escape character which - // is only valid for string cols. - Value* null_len = builder.CreateICmpSLE( + // If len == 0 and it is not a string col, set slot to NULL + Value* null_len = builder.CreateICmpEQ( args[2], codegen->GetIntConstant(TYPE_INT, 0)); builder.CreateCondBr(null_len, set_null_block, parse_slot_block); } http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b00310ca/testdata/workloads/functional-query/queries/DataErrorsTest/hdfs-scan-node-errors.test ---------------------------------------------------------------------- diff --git a/testdata/workloads/functional-query/queries/DataErrorsTest/hdfs-scan-node-errors.test b/testdata/workloads/functional-query/queries/DataErrorsTest/hdfs-scan-node-errors.test index 2357f9a..ad63772 100644 --- a/testdata/workloads/functional-query/queries/DataErrorsTest/hdfs-scan-node-errors.test +++ b/testdata/workloads/functional-query/queries/DataErrorsTest/hdfs-scan-node-errors.test @@ -1,69 +1,63 @@ ==== ---- QUERY -## TODO: IMPALA-1862: Invalid bool value not reported as a scanner error -## -## TODO: the error info should be sufficient to pin point the data location: filename and -## offset -## TODO: printing the entire record will break column level security (when it is -## implemented). -#select id, bool_col, tinyint_col, smallint_col from alltypeserror -#---- ERRORS -#Error converting column: 3 to SMALLINT -#file: hdfs://regex:.$ -#Error converting column: 2 to TINYINT -#file: hdfs://regex:.$ -#Error converting column: 2 to TINYINT -#Error converting column: 3 to SMALLINT -#file: hdfs://regex:.$ -#Error converting column: 2 to TINYINT -#file: hdfs://regex:.$ -#Error converting column: 1 to BOOLEAN -#file: hdfs://regex:.$ -#Error converting column: 2 to TINYINT -#file: hdfs://regex:.$ -#Error converting column: 3 to SMALLINT -#file: hdfs://regex:.$ -#Error converting column: 1 to BOOLEAN -#Error converting column: 2 to TINYINT -#Error converting column: 3 to SMALLINT -#file: hdfs://regex:.$ -# -#---- RESULTS -#0,NULL,NULL,0 -#1,NULL,NULL,1 -#10,NULL,NULL,NULL -#11,false,NULL,NULL -#12,true,2,NULL -#13,false,3,3 -#14,true,4,4 -#15,false,NULL,5 -#16,NULL,NULL,NULL -#17,false,7,7 -#18,true,8,8 -#19,false,9,9 -#2,true,NULL,NULL -#20,true,0,0 -#21,false,1,1 -#22,true,2,2 -#23,false,3,NULL -#24,true,4,4 -#25,false,5,5 -#26,true,6,6 -#27,false,NULL,7 -#28,true,8,8 -#29,false,9,9 -#3,false,3,NULL -#30,NULL,NULL,NULL -#4,true,4,4 -#5,false,5,5 -#6,true,6,6 -#7,NULL,NULL,7 -#8,false,NULL,NULL -#9,NULL,NULL,NULL -#---- TYPES -#int, boolean, tinyint, smallint -#==== -#---- QUERY +select id, bool_col, tinyint_col, smallint_col from alltypeserror order by id +---- ERRORS +Error converting column: 3 to SMALLINT +row_regex: .*Error parsing row: file: $NAMENODE/.* before offset: \d+ +Error converting column: 2 to TINYINT +row_regex: .*Error parsing row: file: $NAMENODE/.* before offset: \d+ +Error converting column: 1 to BOOLEAN +Error converting column: 2 to TINYINT +Error converting column: 3 to SMALLINT +row_regex: .*Error parsing row: file: $NAMENODE/.* before offset: \d+ +Error converting column: 2 to TINYINT +row_regex: .*Error parsing row: file: $NAMENODE/.* before offset: \d+ +Error converting column: 1 to BOOLEAN +row_regex: .*Error parsing row: file: $NAMENODE/.* before offset: \d+ +Error converting column: 2 to TINYINT +row_regex: .*Error parsing row: file: $NAMENODE/.* before offset: \d+ +Error converting column: 3 to SMALLINT +row_regex: .*Error parsing row: file: $NAMENODE/.* before offset: \d+ +Error converting column: 1 to BOOLEAN +Error converting column: 2 to TINYINT +Error converting column: 3 to SMALLINT +row_regex: .*Error parsing row: file: $NAMENODE/.* before offset: \d+ +---- RESULTS +0,NULL,NULL,0 +1,NULL,NULL,1 +2,true,NULL,NULL +3,false,3,NULL +4,true,4,4 +5,false,5,5 +6,true,6,6 +7,NULL,NULL,7 +8,false,NULL,NULL +9,NULL,NULL,NULL +10,NULL,NULL,NULL +11,false,NULL,NULL +12,true,2,NULL +13,false,3,3 +14,true,4,4 +15,false,NULL,5 +16,NULL,NULL,NULL +17,false,7,7 +18,true,8,8 +19,false,9,9 +20,true,0,0 +21,false,1,1 +22,true,2,2 +23,false,3,NULL +24,true,4,4 +25,false,5,5 +26,true,6,6 +27,false,NULL,7 +28,true,8,8 +29,false,9,9 +30,NULL,NULL,NULL +---- TYPES +int, boolean, tinyint, smallint +==== +---- QUERY select count(*) from functional_text_lzo.bad_text_lzo ---- ERRORS Blocksize: 536870911 is greater than LZO_MAX_BLOCK_SIZE: 67108864
