Enhance the fixes for JIRA 1720
Project: http://git-wip-us.apache.org/repos/asf/incubator-trafodion/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-trafodion/commit/6abfa2a6 Tree: http://git-wip-us.apache.org/repos/asf/incubator-trafodion/tree/6abfa2a6 Diff: http://git-wip-us.apache.org/repos/asf/incubator-trafodion/diff/6abfa2a6 Branch: refs/heads/master Commit: 6abfa2a6522376db7d3d5abbc15efdc01334b97d Parents: aecc2db Author: Liu Ming <[email protected]> Authored: Thu Jan 28 09:41:16 2016 +0000 Committer: Liu Ming <[email protected]> Committed: Thu Jan 28 09:41:16 2016 +0000 ---------------------------------------------------------------------- core/sql/common/csconvert.cpp | 2 +- core/sql/exp/exp_conv.cpp | 8 ++++-- core/sql/generator/GenRelScan.cpp | 26 +++++++++++++++---- core/sql/optimizer/BindItemExpr.cpp | 36 +++++++++++++++++++++----- core/sql/optimizer/SynthType.cpp | 8 +++--- core/sql/regress/hive/EXPECTED005 | 26 +++++++++++++++++++ core/sql/regress/hive/TEST005 | 7 +++++ core/sql/regress/hive/TEST005_a.hive.sql | 9 +++++++ core/sql/regress/hive/tbl_gbk.data | Bin 0 -> 129 bytes 9 files changed, 105 insertions(+), 17 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-trafodion/blob/6abfa2a6/core/sql/common/csconvert.cpp ---------------------------------------------------------------------- diff --git a/core/sql/common/csconvert.cpp b/core/sql/common/csconvert.cpp index 2423976..fc4263d 100644 --- a/core/sql/common/csconvert.cpp +++ b/core/sql/common/csconvert.cpp @@ -1311,7 +1311,7 @@ int gbkToUtf8(char* gbkString, size_t gbklen, int finalLength = charsetConvert( "gbk","utf-8", gbkString, gbklen, result, outlen); if (finalLength == -1 ) - return 0; + return -1; if ( addNullAtEnd ) { http://git-wip-us.apache.org/repos/asf/incubator-trafodion/blob/6abfa2a6/core/sql/exp/exp_conv.cpp ---------------------------------------------------------------------- diff --git a/core/sql/exp/exp_conv.cpp 
b/core/sql/exp/exp_conv.cpp index b2a11cd..36f9a30 100644 --- a/core/sql/exp/exp_conv.cpp +++ b/core/sql/exp/exp_conv.cpp @@ -9329,16 +9329,20 @@ convDoIt(char * source, int convLen = gbkToUtf8( source, sourceLen, target, targetLen); if (convLen > 0) { copyLen = convLen; + if ( varCharLen ) + setVCLength(varCharLen, varCharLenSize, copyLen); //if the target length is not enough, instead of truncate, raise a SQL Error if (convLen > targetLen) ExRaiseSqlError(heap, diagsArea, EXE_STRING_OVERFLOW); - if ( varCharLen ) - setVCLength(varCharLen, varCharLenSize, copyLen); } else { // LCOV_EXCL_START convLen = 0; copyLen = 0; + if ( varCharLen ) + setVCLength(varCharLen, varCharLenSize, copyLen); + ExRaiseSqlError(heap, diagsArea, EXE_CONVERT_STRING_ERROR); + return ex_expr::EXPR_ERROR; // LCOV_EXCL_STOP } }; http://git-wip-us.apache.org/repos/asf/incubator-trafodion/blob/6abfa2a6/core/sql/generator/GenRelScan.cpp ---------------------------------------------------------------------- diff --git a/core/sql/generator/GenRelScan.cpp b/core/sql/generator/GenRelScan.cpp index 573873f..7729b13 100644 --- a/core/sql/generator/GenRelScan.cpp +++ b/core/sql/generator/GenRelScan.cpp @@ -202,6 +202,7 @@ int HbaseAccess::createAsciiColAndCastExpr(Generator * generator, asciiValue = NULL; castValue = NULL; CollHeap * h = generator->wHeap(); + bool needTranslate = FALSE; // if this is an upshifted datatype, remove the upshift attr. // We dont want to upshift data during retrievals or while building keys. @@ -214,22 +215,32 @@ int HbaseAccess::createAsciiColAndCastExpr(Generator * generator, ((CharType*)newGivenType)->setUpshifted(FALSE); } + if (newGivenType->getTypeQualifier() == NA_CHARACTER_TYPE && + CmpCommon::getDefaultString(HIVE_FILE_CHARSET) == "GBK") + needTranslate = TRUE; + // source ascii row is a varchar where the data is a pointer to the source data // in the hdfs buffer. 
NAType *asciiType = NULL; if (DFS2REC::isDoubleCharacter(newGivenType->getFSDatatype())) - asciiType = new (h) SQLVarChar(sizeof(Int64)/2, newGivenType->supportsSQLnull(), + { + asciiType = new (h) SQLVarChar(sizeof(Int64)/2, newGivenType->supportsSQLnull(), FALSE, FALSE, newGivenType->getCharSet()); + } + // set the source charset to GBK if HIVE_FILE_CHARSET is set + // HIVE_FILE_CHARSET can only be empty or GBK + else if ( needTranslate == TRUE ) + { + asciiType = new (h) SQLVarChar(sizeof(Int64)/2, newGivenType->supportsSQLnull(), + FALSE, FALSE, CharInfo::GBK); + } else asciiType = new (h) SQLVarChar(sizeof(Int64), newGivenType->supportsSQLnull()); - if (asciiType) { asciiValue = new (h) NATypeToItem(asciiType->newCopy(h)); - - castValue = new(h) Cast(asciiValue, newGivenType); - + castValue = new(h) Cast(asciiValue, newGivenType); if (castValue) { ((Cast*)castValue)->setSrcIsVarcharPtr(TRUE); @@ -787,6 +798,7 @@ short FileScan::codeGenForHive(Generator * generator) const Int32 executorPredTuppIndex = 3; const Int32 asciiTuppIndex = 4; ULng32 asciiRowLen; + ULng32 translateRowLen; ExpTupleDesc * asciiTupleDesc = 0; ex_cri_desc * work_cri_desc = NULL; @@ -796,6 +808,7 @@ short FileScan::codeGenForHive(Generator * generator) ExpTupleDesc::TupleDataFormat asciiRowFormat = ExpTupleDesc::SQLARK_EXPLODED_FORMAT; ExpTupleDesc::TupleDataFormat hdfsRowFormat = ExpTupleDesc::SQLMX_ALIGNED_FORMAT; ValueIdList asciiVids; + ValueIdList transVids; ValueIdList executorPredCastVids; ValueIdList projectExprOnlyCastVids; @@ -840,6 +853,7 @@ short FileScan::codeGenForHive(Generator * generator) asciiVids.insert(asciiValue->getValueId()); castValue->bindNode(generator->getBindWA()); + if (convertSkipList[ii] == 1 || convertSkipList[ii] == 2) executorPredCastVids.insert(castValue->getValueId()); else @@ -1501,6 +1515,7 @@ short HbaseAccess::genRowIdExpr(Generator * generator, int res; ItemExpr * castVal = NULL; ItemExpr * asciiVal = NULL; + ItemExpr * transVal = NULL; res = 
createAsciiColAndCastExpr(generator, givenType, asciiVal, castVal); @@ -1596,6 +1611,7 @@ short HbaseAccess::genRowIdExprForNonSQ(Generator * generator, int res; ItemExpr * castVal = NULL; ItemExpr * asciiVal = NULL; + ItemExpr * transVal = NULL; res = createAsciiColAndCastExpr(generator, givenType, asciiVal, castVal); http://git-wip-us.apache.org/repos/asf/incubator-trafodion/blob/6abfa2a6/core/sql/optimizer/BindItemExpr.cpp ---------------------------------------------------------------------- diff --git a/core/sql/optimizer/BindItemExpr.cpp b/core/sql/optimizer/BindItemExpr.cpp index da39397..3fd854e 100644 --- a/core/sql/optimizer/BindItemExpr.cpp +++ b/core/sql/optimizer/BindItemExpr.cpp @@ -1497,10 +1497,10 @@ ItemExpr* Assign::tryToRelaxCharTypeMatchRules(BindWA *bindWA) ItemExpr* ItemExpr::tryToDoImplicitCasting(BindWA *bindWA) { ItemExpr *result = this; - enum {iUCS2 = 0, iISO = 1, iUTF8 = 2, iSJIS = 3, iUNK = 4}; - Int32 Literals_involved[5] = { 0, 0, 0, 0, 0 }; - Int32 nonLiterals_involved[5] = { 0, 0, 0, 0, 0 }; - Int32 charsets_involved[5] = { 0, 0, 0, 0, 0 }; + enum {iUCS2 = 0, iISO = 1, iUTF8 = 2, iSJIS = 3, iGBK = 4, iUNK = 5}; + Int32 Literals_involved[6] = { 0, 0, 0, 0, 0, 0}; + Int32 nonLiterals_involved[6] = { 0, 0, 0, 0, 0, 0 }; + Int32 charsets_involved[6] = { 0, 0, 0, 0, 0, 0 }; Int32 charsetsCount = 0; CharInfo::CharSet cs = CharInfo::UnknownCharSet; CharInfo::CharSet curr_chld_cs= CharInfo::UnknownCharSet; @@ -1545,6 +1545,10 @@ ItemExpr* ItemExpr::tryToDoImplicitCasting(BindWA *bindWA) cur_chld_cs_ndx = iSJIS; break; + case CharInfo::GBK: + cur_chld_cs_ndx = iGBK; + break; + //case CharInfo::KANJI_MP: //case CharInfo::KSC5601_MP: default: @@ -1593,6 +1597,8 @@ ItemExpr* ItemExpr::tryToDoImplicitCasting(BindWA *bindWA) cs = CharInfo::UTF8; else if ( Literals_involved[iSJIS] > 0 ) cs = CharInfo::SJIS; + else if ( Literals_involved[iGBK] > 0 ) + cs = CharInfo::GBK; // // Now, we may be able to optimize by translating the 1st child @@ -1601,7 
+1607,7 @@ ItemExpr* ItemExpr::tryToDoImplicitCasting(BindWA *bindWA) // if ( ( cs == chld0_cs ) && ( arity == 2 ) && ( curr_chld_opType != ITM_TRANSLATE ) && - ( charsetsCount == (charsets_involved[iUCS2] + charsets_involved[iUTF8]) ) ) + ( charsetsCount == (charsets_involved[iUCS2] + charsets_involved[iUTF8] + charsets_involved[iGBK]) ) ) { if ( chld0_opType == ITM_TRANSLATE ) cs = curr_chld_cs; //...because we will eliminate a translate op @@ -1631,7 +1637,22 @@ ItemExpr* ItemExpr::tryToDoImplicitCasting(BindWA *bindWA) if ( desiredType->getTypeQualifier() == NA_CHARACTER_TYPE ) { CharInfo::CharSet Desired_cs = ((const CharType*)desiredType)->getCharSet(); - if ( (chld_cs != Desired_cs) && ( ! ((Cast *)this)->tgtCharSetSpecified() ) ) + /* + * this is a special handling for jira 1720, only used in a bulkload scenario + * that is, when user set the HIVE_FILE_CHARSET to 'gbk', it means the data saved in hive + * table is encoded as GBK. Trafodion default all Hive data charset as 'UTF8', so + * this will allow the auto charset converting to happen during bulk load + * the reason is: + * hive scan will mark the source column as GBK when HIVE_FILE_CHARSET is set to GBK + * which is the only value it can be + * So the bind will invoke this implicit casting method to check if an auto charset + * converting is needed. + * In the hive scan, it does not set the tgtCharSetSpecified field, so in order to + * force it to perform a translate, add a checking here + */ + if( (chld_cs != Desired_cs) && CmpCommon::getDefaultString(HIVE_FILE_CHARSET) == "GBK" ) + result = performImplicitCasting( Desired_cs, bindWA ); + else if ( (chld_cs != Desired_cs) && ( ! ((Cast *)this)->tgtCharSetSpecified() ) ) { // // Looks like user said CAST( ... 
as [var]char(NNN) ) @@ -1696,6 +1717,9 @@ ItemExpr* ItemExpr::tryToDoImplicitCasting(BindWA *bindWA) case Translate::UCS2_TO_UTF8: Required_cs = CharInfo::UNICODE; break; + case Translate::GBK_TO_UTF8: + Required_cs = CharInfo::GBK; + break; default: break; } http://git-wip-us.apache.org/repos/asf/incubator-trafodion/blob/6abfa2a6/core/sql/optimizer/SynthType.cpp ---------------------------------------------------------------------- diff --git a/core/sql/optimizer/SynthType.cpp b/core/sql/optimizer/SynthType.cpp index 7d1c8a9..5736422 100644 --- a/core/sql/optimizer/SynthType.cpp +++ b/core/sql/optimizer/SynthType.cpp @@ -5217,11 +5217,13 @@ const NAType *Translate::synthesizeType() * the logic here is: * when HIVE_FILE_CHARSET is not empty, it means the real charset in Hive table is not same as HIVE_DEFAULT_CHARSET * in this case, allow the converting , ignoring the source charset checking above - */ - if( CmpCommon::getDefaultString(HIVE_FILE_CHARSET) == "" ) //CmpCommon::getDefaultString(HIVE_DEFAULT_CHARSET) ) - err4106arg = SQLCHARSETCODE_GB2312; + if( CmpCommon::getDefaultString(HIVE_FILE_CHARSET) == "" ) + err4106arg = SQLCHARSETCODE_GBK; else charsetTarget = CharInfo::UTF8; + */ + err4106arg = SQLCHARSETSTRING_GBK; + } break; http://git-wip-us.apache.org/repos/asf/incubator-trafodion/blob/6abfa2a6/core/sql/regress/hive/EXPECTED005 ---------------------------------------------------------------------- diff --git a/core/sql/regress/hive/EXPECTED005 b/core/sql/regress/hive/EXPECTED005 index 49d8a62..9299d25 100644 --- a/core/sql/regress/hive/EXPECTED005 +++ b/core/sql/regress/hive/EXPECTED005 @@ -447,4 +447,30 @@ TINT SM I BIG STR F --- 1 row(s) selected. >> +>>cqd HIVE_FILE_CHARSET 'GBK'; + +--- SQL operation complete. 
+>>select c1, CONVERTTOHEX(c2) from tbl_gbk; + +C1 (EXPR) +----------- -------------------------------------------------- + + 3 EC8B90EC978E + 5 EC8B90EC978E + 2 EC8B90EC978E + 4 EC8B90EC978E + 6 EC8B90EC978E + 7 EC8B90EC978E + 8 EC8B90EC978E + 3 ECBB93EB9F8FECAB97EB9B91 + 2 ECBB93EB9F8FECAB97EB9B91 + 6 ECBB93EB9F8FECAB97EB9B91 + 19 ECBB93EB9F8FECAB97EB9B91 + 8 ECBB93EB9F8FECAB97EB9B91 + +--- 12 row(s) selected. +>>cqd HIVE_FILE_CHARSET reset; + +--- SQL operation complete. +>> >>log; http://git-wip-us.apache.org/repos/asf/incubator-trafodion/blob/6abfa2a6/core/sql/regress/hive/TEST005 ---------------------------------------------------------------------- diff --git a/core/sql/regress/hive/TEST005 b/core/sql/regress/hive/TEST005 index b8f7518..5bc2a21 100644 --- a/core/sql/regress/hive/TEST005 +++ b/core/sql/regress/hive/TEST005 @@ -30,16 +30,19 @@ sh regrhadoop.ksh fs -mkdir /user/hive/exttables/customer_ddl; sh regrhadoop.ksh fs -mkdir /user/hive/exttables/customer_temp; sh regrhadoop.ksh fs -mkdir /user/hive/exttables/tbl_utf8; sh regrhadoop.ksh fs -mkdir /user/hive/exttables/tbl_type; +sh regrhadoop.ksh fs -mkdir /user/hive/exttables/tbl_gbk; --empty folders sh regrhadoop.ksh fs -rm /user/hive/exttables/customer_ddl/*; sh regrhadoop.ksh fs -rm /user/hive/exttables/customer_temp/*; sh regrhadoop.ksh fs -rm /user/hive/exttables/tbl_utf8/*; sh regrhadoop.ksh fs -rm /user/hive/exttables/tbl_type/*; +sh regrhadoop.ksh fs -rm /user/hive/exttables/tbl_gbk/*; --- setup Hive tables sh regrhive.ksh -v -f $REGRTSTDIR/TEST005_a.hive.sql; sh regrhadoop.ksh fs -put $REGRTSTDIR/tbl_utf8.data /user/hive/exttables/tbl_utf8; sh regrhadoop.ksh fs -put $REGRTSTDIR/tbl_type.data /user/hive/exttables/tbl_type; +sh regrhadoop.ksh fs -put $REGRTSTDIR/tbl_gbk.data /user/hive/exttables/tbl_gbk; log LOG005 clear; @@ -219,4 +222,8 @@ select * from tbl_type; insert into tbl_type_temp select * from tbl_type; select * from tbl_type_temp; +cqd HIVE_FILE_CHARSET 'GBK'; +select c1, 
CONVERTTOHEX(c2) from tbl_gbk; +cqd HIVE_FILE_CHARSET reset; + log; http://git-wip-us.apache.org/repos/asf/incubator-trafodion/blob/6abfa2a6/core/sql/regress/hive/TEST005_a.hive.sql ---------------------------------------------------------------------- diff --git a/core/sql/regress/hive/TEST005_a.hive.sql b/core/sql/regress/hive/TEST005_a.hive.sql index ab4098f..1b5c580 100644 --- a/core/sql/regress/hive/TEST005_a.hive.sql +++ b/core/sql/regress/hive/TEST005_a.hive.sql @@ -154,3 +154,12 @@ create table tbl_type_temp t timestamp ) row format delimited fields terminated by '|'; + +drop table tbl_gbk; +create external table tbl_gbk +( + c1 int, + c2 string +) +row format delimited fields terminated by '\t' +location '/user/hive/exttables/tbl_gbk'; http://git-wip-us.apache.org/repos/asf/incubator-trafodion/blob/6abfa2a6/core/sql/regress/hive/tbl_gbk.data ---------------------------------------------------------------------- diff --git a/core/sql/regress/hive/tbl_gbk.data b/core/sql/regress/hive/tbl_gbk.data new file mode 100644 index 0000000..2fa331b Binary files /dev/null and b/core/sql/regress/hive/tbl_gbk.data differ
