Enhance the fixes for JIRA 1720

Project: http://git-wip-us.apache.org/repos/asf/incubator-trafodion/repo
Commit: 
http://git-wip-us.apache.org/repos/asf/incubator-trafodion/commit/6abfa2a6
Tree: http://git-wip-us.apache.org/repos/asf/incubator-trafodion/tree/6abfa2a6
Diff: http://git-wip-us.apache.org/repos/asf/incubator-trafodion/diff/6abfa2a6

Branch: refs/heads/master
Commit: 6abfa2a6522376db7d3d5abbc15efdc01334b97d
Parents: aecc2db
Author: Liu Ming <[email protected]>
Authored: Thu Jan 28 09:41:16 2016 +0000
Committer: Liu Ming <[email protected]>
Committed: Thu Jan 28 09:41:16 2016 +0000

----------------------------------------------------------------------
 core/sql/common/csconvert.cpp            |   2 +-
 core/sql/exp/exp_conv.cpp                |   8 ++++--
 core/sql/generator/GenRelScan.cpp        |  26 +++++++++++++++----
 core/sql/optimizer/BindItemExpr.cpp      |  36 +++++++++++++++++++++-----
 core/sql/optimizer/SynthType.cpp         |   8 +++---
 core/sql/regress/hive/EXPECTED005        |  26 +++++++++++++++++++
 core/sql/regress/hive/TEST005            |   7 +++++
 core/sql/regress/hive/TEST005_a.hive.sql |   9 +++++++
 core/sql/regress/hive/tbl_gbk.data       | Bin 0 -> 129 bytes
 9 files changed, 105 insertions(+), 17 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-trafodion/blob/6abfa2a6/core/sql/common/csconvert.cpp
----------------------------------------------------------------------
diff --git a/core/sql/common/csconvert.cpp b/core/sql/common/csconvert.cpp
index 2423976..fc4263d 100644
--- a/core/sql/common/csconvert.cpp
+++ b/core/sql/common/csconvert.cpp
@@ -1311,7 +1311,7 @@ int gbkToUtf8(char* gbkString, size_t gbklen,
    int finalLength = charsetConvert( "gbk","utf-8", gbkString, gbklen,  
result, outlen);
    
    if (finalLength == -1 ) 
-     return 0;
+     return -1;
    
    if ( addNullAtEnd )
    {

http://git-wip-us.apache.org/repos/asf/incubator-trafodion/blob/6abfa2a6/core/sql/exp/exp_conv.cpp
----------------------------------------------------------------------
diff --git a/core/sql/exp/exp_conv.cpp b/core/sql/exp/exp_conv.cpp
index b2a11cd..36f9a30 100644
--- a/core/sql/exp/exp_conv.cpp
+++ b/core/sql/exp/exp_conv.cpp
@@ -9329,16 +9329,20 @@ convDoIt(char * source,
     int convLen = gbkToUtf8( source, sourceLen, target, targetLen);
     if (convLen > 0) {
       copyLen = convLen; 
+      if ( varCharLen )
+        setVCLength(varCharLen, varCharLenSize, copyLen);
       //if the target length is not enough, instead of truncate, raise a SQL 
Error
       if (convLen > targetLen)
         ExRaiseSqlError(heap, diagsArea, EXE_STRING_OVERFLOW);
-      if ( varCharLen )
-        setVCLength(varCharLen, varCharLenSize, copyLen);
     }
     else {
       // LCOV_EXCL_START
       convLen = 0;
       copyLen = 0;
+      if ( varCharLen )
+        setVCLength(varCharLen, varCharLenSize, copyLen);
+      ExRaiseSqlError(heap, diagsArea, EXE_CONVERT_STRING_ERROR);
+      return ex_expr::EXPR_ERROR;
       // LCOV_EXCL_STOP
     }
   };

http://git-wip-us.apache.org/repos/asf/incubator-trafodion/blob/6abfa2a6/core/sql/generator/GenRelScan.cpp
----------------------------------------------------------------------
diff --git a/core/sql/generator/GenRelScan.cpp 
b/core/sql/generator/GenRelScan.cpp
index 573873f..7729b13 100644
--- a/core/sql/generator/GenRelScan.cpp
+++ b/core/sql/generator/GenRelScan.cpp
@@ -202,6 +202,7 @@ int HbaseAccess::createAsciiColAndCastExpr(Generator * 
generator,
   asciiValue = NULL;
   castValue = NULL;
   CollHeap * h = generator->wHeap();
+  bool needTranslate = FALSE;
 
   // if this is an upshifted datatype, remove the upshift attr.
   // We dont want to upshift data during retrievals or while building keys.
@@ -214,22 +215,32 @@ int HbaseAccess::createAsciiColAndCastExpr(Generator * 
generator,
       ((CharType*)newGivenType)->setUpshifted(FALSE);
     }
 
+  if (newGivenType->getTypeQualifier() == NA_CHARACTER_TYPE &&
+      CmpCommon::getDefaultString(HIVE_FILE_CHARSET) == "GBK")
+        needTranslate = TRUE;
+
   // source ascii row is a varchar where the data is a pointer to the source 
data
   // in the hdfs buffer.
   NAType *asciiType = NULL;
   
   if (DFS2REC::isDoubleCharacter(newGivenType->getFSDatatype()))
-    asciiType =  new (h) SQLVarChar(sizeof(Int64)/2, 
newGivenType->supportsSQLnull(),
+  {
+      asciiType =  new (h) SQLVarChar(sizeof(Int64)/2, 
newGivenType->supportsSQLnull(),
                                    FALSE, FALSE, newGivenType->getCharSet());
+  }
+  // set the source charset to GBK if HIVE_FILE_CHARSET is set
+  // HIVE_FILE_CHARSET can only be empty or GBK
+  else if (  needTranslate == TRUE )
+  {
+      asciiType =  new (h) SQLVarChar(sizeof(Int64)/2, 
newGivenType->supportsSQLnull(),
+                                      FALSE, FALSE, CharInfo::GBK);
+  }
   else
     asciiType = new (h) SQLVarChar(sizeof(Int64), 
newGivenType->supportsSQLnull());
-
   if (asciiType)
     {
       asciiValue = new (h) NATypeToItem(asciiType->newCopy(h));
-
-      castValue = new(h) Cast(asciiValue, newGivenType); 
-
+      castValue = new(h) Cast(asciiValue, newGivenType);
       if (castValue)
        {
          ((Cast*)castValue)->setSrcIsVarcharPtr(TRUE);
@@ -787,6 +798,7 @@ short FileScan::codeGenForHive(Generator * generator)
   const Int32 executorPredTuppIndex = 3;
   const Int32 asciiTuppIndex = 4;
   ULng32 asciiRowLen; 
+  ULng32 translateRowLen; 
   ExpTupleDesc * asciiTupleDesc = 0;
 
   ex_cri_desc * work_cri_desc = NULL;
@@ -796,6 +808,7 @@ short FileScan::codeGenForHive(Generator * generator)
   ExpTupleDesc::TupleDataFormat asciiRowFormat = 
ExpTupleDesc::SQLARK_EXPLODED_FORMAT;
   ExpTupleDesc::TupleDataFormat hdfsRowFormat = 
ExpTupleDesc::SQLMX_ALIGNED_FORMAT;
   ValueIdList asciiVids;
+  ValueIdList transVids;
   ValueIdList executorPredCastVids;
   ValueIdList projectExprOnlyCastVids;
 
@@ -840,6 +853,7 @@ short FileScan::codeGenForHive(Generator * generator)
     asciiVids.insert(asciiValue->getValueId());
       
     castValue->bindNode(generator->getBindWA());
+
     if (convertSkipList[ii] == 1 || convertSkipList[ii] == 2)
       executorPredCastVids.insert(castValue->getValueId());
     else
@@ -1501,6 +1515,7 @@ short HbaseAccess::genRowIdExpr(Generator * generator,
          int res;
          ItemExpr * castVal = NULL;
          ItemExpr * asciiVal = NULL;
+         ItemExpr * transVal = NULL;
          res = createAsciiColAndCastExpr(generator,
                                          givenType,
                                          asciiVal, castVal);
@@ -1596,6 +1611,7 @@ short HbaseAccess::genRowIdExprForNonSQ(Generator * 
generator,
          int res;
          ItemExpr * castVal = NULL;
          ItemExpr * asciiVal = NULL;
+         ItemExpr * transVal = NULL;
          res = createAsciiColAndCastExpr(generator,
                                          givenType,
                                          asciiVal, castVal);

http://git-wip-us.apache.org/repos/asf/incubator-trafodion/blob/6abfa2a6/core/sql/optimizer/BindItemExpr.cpp
----------------------------------------------------------------------
diff --git a/core/sql/optimizer/BindItemExpr.cpp 
b/core/sql/optimizer/BindItemExpr.cpp
index da39397..3fd854e 100644
--- a/core/sql/optimizer/BindItemExpr.cpp
+++ b/core/sql/optimizer/BindItemExpr.cpp
@@ -1497,10 +1497,10 @@ ItemExpr* Assign::tryToRelaxCharTypeMatchRules(BindWA 
*bindWA)
 ItemExpr* ItemExpr::tryToDoImplicitCasting(BindWA *bindWA)
 {
   ItemExpr *result = this;
-  enum {iUCS2 = 0, iISO = 1, iUTF8 = 2, iSJIS = 3, iUNK = 4};
-  Int32 Literals_involved[5] = { 0, 0, 0, 0, 0 };
-  Int32 nonLiterals_involved[5] = { 0, 0, 0, 0, 0 };
-  Int32 charsets_involved[5] = { 0, 0, 0, 0, 0 };
+  enum {iUCS2 = 0, iISO = 1, iUTF8 = 2, iSJIS = 3, iGBK = 4, iUNK = 5};
+  Int32 Literals_involved[6] = { 0, 0, 0, 0, 0, 0};
+  Int32 nonLiterals_involved[6] = { 0, 0, 0, 0, 0, 0 };
+  Int32 charsets_involved[6] = { 0, 0, 0, 0, 0, 0 };
   Int32 charsetsCount = 0;
   CharInfo::CharSet cs          = CharInfo::UnknownCharSet;
   CharInfo::CharSet curr_chld_cs= CharInfo::UnknownCharSet;
@@ -1545,6 +1545,10 @@ ItemExpr* ItemExpr::tryToDoImplicitCasting(BindWA 
*bindWA)
           cur_chld_cs_ndx = iSJIS;
           break;
 
+        case CharInfo::GBK:
+          cur_chld_cs_ndx = iGBK;
+          break;
+
         //case CharInfo::KANJI_MP:
         //case CharInfo::KSC5601_MP:
         default:
@@ -1593,6 +1597,8 @@ ItemExpr* ItemExpr::tryToDoImplicitCasting(BindWA *bindWA)
           cs = CharInfo::UTF8;
        else if ( Literals_involved[iSJIS] > 0 )
           cs = CharInfo::SJIS;
+       else if ( Literals_involved[iGBK] > 0 )
+          cs = CharInfo::GBK;
 
        //
        // Now, we may be able to optimize by translating the 1st child
@@ -1601,7 +1607,7 @@ ItemExpr* ItemExpr::tryToDoImplicitCasting(BindWA *bindWA)
        //
        if ( ( cs == chld0_cs ) &&  ( arity == 2 ) &&
                ( curr_chld_opType != ITM_TRANSLATE ) &&
-               ( charsetsCount == (charsets_involved[iUCS2] + 
charsets_involved[iUTF8]) ) )
+               ( charsetsCount == (charsets_involved[iUCS2] + 
charsets_involved[iUTF8] + charsets_involved[iGBK]) ) )
        {
           if ( chld0_opType == ITM_TRANSLATE )
              cs = curr_chld_cs;  //...because we will eliminate a translate op
@@ -1631,7 +1637,22 @@ ItemExpr* ItemExpr::tryToDoImplicitCasting(BindWA 
*bindWA)
         if ( desiredType->getTypeQualifier() == NA_CHARACTER_TYPE )
         {
            CharInfo::CharSet Desired_cs = ((const 
CharType*)desiredType)->getCharSet();
-           if ( (chld_cs != Desired_cs) && ( ! ((Cast 
*)this)->tgtCharSetSpecified() ) )
+           /*
+           * Special handling for JIRA 1720, used only in the bulk-load
+           * scenario. When the user sets HIVE_FILE_CHARSET to 'GBK', it means
+           * the data saved in the Hive table is encoded as GBK. Trafodion
+           * treats all Hive data as UTF8 by default, so this check allows the
+           * automatic charset conversion to happen during bulk load.
+           * The reason is:
+           * the Hive scan marks the source column as GBK when
+           * HIVE_FILE_CHARSET is set to GBK, which is the only value it can
+           * be set to. The binder then invokes this implicit-casting method
+           * to check whether an automatic charset conversion is needed.
+           * The Hive scan does not set the tgtCharSetSpecified field, so a
+           * check is added here to force it to perform a translate.
+           */
+           if( (chld_cs != Desired_cs) && 
CmpCommon::getDefaultString(HIVE_FILE_CHARSET) == "GBK" )
+              result = performImplicitCasting( Desired_cs, bindWA );
+           else if ( (chld_cs != Desired_cs) && ( ! ((Cast 
*)this)->tgtCharSetSpecified() ) )
            {
               //
               // Looks like user said CAST( ... as [var]char(NNN) ) 
@@ -1696,6 +1717,9 @@ ItemExpr* ItemExpr::tryToDoImplicitCasting(BindWA *bindWA)
         case Translate::UCS2_TO_UTF8:
              Required_cs = CharInfo::UNICODE;
              break;
+       case Translate::GBK_TO_UTF8:
+            Required_cs = CharInfo::GBK;
+             break;
         default:
              break;
      }

http://git-wip-us.apache.org/repos/asf/incubator-trafodion/blob/6abfa2a6/core/sql/optimizer/SynthType.cpp
----------------------------------------------------------------------
diff --git a/core/sql/optimizer/SynthType.cpp b/core/sql/optimizer/SynthType.cpp
index 7d1c8a9..5736422 100644
--- a/core/sql/optimizer/SynthType.cpp
+++ b/core/sql/optimizer/SynthType.cpp
@@ -5217,11 +5217,13 @@ const NAType *Translate::synthesizeType()
             * the logic here is:
             * when HIVE_FILE_CHARSET is not empty, it means the real charset 
in Hive table is not same as HIVE_DEFAULT_CHARSET
             * in this case, allow the converting , ignoring the source charset 
checking above
-            */
-            if( CmpCommon::getDefaultString(HIVE_FILE_CHARSET) ==  "" ) 
//CmpCommon::getDefaultString(HIVE_DEFAULT_CHARSET) )
-              err4106arg = SQLCHARSETCODE_GB2312;
+            if( CmpCommon::getDefaultString(HIVE_FILE_CHARSET) ==  "" ) 
+              err4106arg = SQLCHARSETCODE_GBK;
             else
              charsetTarget = CharInfo::UTF8;
+            */
+               err4106arg = SQLCHARSETSTRING_GBK;
+
        }
        break;
 

http://git-wip-us.apache.org/repos/asf/incubator-trafodion/blob/6abfa2a6/core/sql/regress/hive/EXPECTED005
----------------------------------------------------------------------
diff --git a/core/sql/regress/hive/EXPECTED005 
b/core/sql/regress/hive/EXPECTED005
index 49d8a62..9299d25 100644
--- a/core/sql/regress/hive/EXPECTED005
+++ b/core/sql/regress/hive/EXPECTED005
@@ -447,4 +447,30 @@ TINT    SM      I            BIG                   STR     
                   F
 
 --- 1 row(s) selected.
 >>
+>>cqd HIVE_FILE_CHARSET 'GBK';
+
+--- SQL operation complete.
+>>select c1, CONVERTTOHEX(c2) from tbl_gbk;
+
+C1           (EXPR)
+-----------  --------------------------------------------------
+
+          3  EC8B90EC978E
+          5  EC8B90EC978E
+          2  EC8B90EC978E
+          4  EC8B90EC978E
+          6  EC8B90EC978E
+          7  EC8B90EC978E
+          8  EC8B90EC978E
+          3  ECBB93EB9F8FECAB97EB9B91
+          2  ECBB93EB9F8FECAB97EB9B91
+          6  ECBB93EB9F8FECAB97EB9B91
+         19  ECBB93EB9F8FECAB97EB9B91
+          8  ECBB93EB9F8FECAB97EB9B91
+
+--- 12 row(s) selected.
+>>cqd HIVE_FILE_CHARSET reset;
+
+--- SQL operation complete.
+>>
 >>log;

http://git-wip-us.apache.org/repos/asf/incubator-trafodion/blob/6abfa2a6/core/sql/regress/hive/TEST005
----------------------------------------------------------------------
diff --git a/core/sql/regress/hive/TEST005 b/core/sql/regress/hive/TEST005
index b8f7518..5bc2a21 100644
--- a/core/sql/regress/hive/TEST005
+++ b/core/sql/regress/hive/TEST005
@@ -30,16 +30,19 @@ sh regrhadoop.ksh fs -mkdir  
/user/hive/exttables/customer_ddl;
 sh regrhadoop.ksh fs -mkdir  /user/hive/exttables/customer_temp;
 sh regrhadoop.ksh fs -mkdir  /user/hive/exttables/tbl_utf8;
 sh regrhadoop.ksh fs -mkdir  /user/hive/exttables/tbl_type;
+sh regrhadoop.ksh fs -mkdir  /user/hive/exttables/tbl_gbk;
 --empty folders
 sh regrhadoop.ksh fs -rm   /user/hive/exttables/customer_ddl/*;
 sh regrhadoop.ksh fs -rm   /user/hive/exttables/customer_temp/*;
 sh regrhadoop.ksh fs -rm   /user/hive/exttables/tbl_utf8/*;
 sh regrhadoop.ksh fs -rm   /user/hive/exttables/tbl_type/*;
+sh regrhadoop.ksh fs -rm   /user/hive/exttables/tbl_gbk/*;
 
 --- setup Hive tables
 sh regrhive.ksh -v -f $REGRTSTDIR/TEST005_a.hive.sql;
 sh regrhadoop.ksh fs -put $REGRTSTDIR/tbl_utf8.data 
/user/hive/exttables/tbl_utf8;
 sh regrhadoop.ksh fs -put $REGRTSTDIR/tbl_type.data 
/user/hive/exttables/tbl_type;
+sh regrhadoop.ksh fs -put $REGRTSTDIR/tbl_gbk.data 
/user/hive/exttables/tbl_gbk;
 
 log LOG005 clear;
 
@@ -219,4 +222,8 @@ select * from tbl_type;
 insert into tbl_type_temp select * from tbl_type;
 select * from tbl_type_temp;
 
+cqd HIVE_FILE_CHARSET 'GBK';
+select c1, CONVERTTOHEX(c2) from tbl_gbk;
+cqd HIVE_FILE_CHARSET reset;
+
 log;

http://git-wip-us.apache.org/repos/asf/incubator-trafodion/blob/6abfa2a6/core/sql/regress/hive/TEST005_a.hive.sql
----------------------------------------------------------------------
diff --git a/core/sql/regress/hive/TEST005_a.hive.sql 
b/core/sql/regress/hive/TEST005_a.hive.sql
index ab4098f..1b5c580 100644
--- a/core/sql/regress/hive/TEST005_a.hive.sql
+++ b/core/sql/regress/hive/TEST005_a.hive.sql
@@ -154,3 +154,12 @@ create table tbl_type_temp
      t           timestamp
 )
 row format delimited fields terminated by '|';
+
+drop table tbl_gbk;
+create external table tbl_gbk
+(
+    c1           int,
+    c2           string
+)
+row format delimited fields terminated by '\t'
+location '/user/hive/exttables/tbl_gbk';

http://git-wip-us.apache.org/repos/asf/incubator-trafodion/blob/6abfa2a6/core/sql/regress/hive/tbl_gbk.data
----------------------------------------------------------------------
diff --git a/core/sql/regress/hive/tbl_gbk.data 
b/core/sql/regress/hive/tbl_gbk.data
new file mode 100644
index 0000000..2fa331b
Binary files /dev/null and b/core/sql/regress/hive/tbl_gbk.data differ

Reply via email to