Repository: incubator-trafodion
Updated Branches:
  refs/heads/master f0bc795b7 -> 464a0e0ab


[TRAFODION-1912] Support automatic conversion of DOS-format files during bulk load


Project: http://git-wip-us.apache.org/repos/asf/incubator-trafodion/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-trafodion/commit/f48cc2b6
Tree: http://git-wip-us.apache.org/repos/asf/incubator-trafodion/tree/f48cc2b6
Diff: http://git-wip-us.apache.org/repos/asf/incubator-trafodion/diff/f48cc2b6

Branch: refs/heads/master
Commit: f48cc2b69043b33c1abd2b9cec9e3bf1e2299d11
Parents: a9eae1a
Author: Liu Ming <ovis_p...@sina.com>
Authored: Tue Apr 5 02:05:48 2016 +0000
Committer: Liu Ming <ovis_p...@sina.com>
Committed: Tue Apr 5 02:05:48 2016 +0000

----------------------------------------------------------------------
 core/sql/executor/ExHdfsScan.cpp         | 15 +++++++------
 core/sql/executor/ExHdfsScan.h           | 31 ++++++++++++++++++++++-----
 core/sql/regress/hive/EXPECTED005        | 27 +++++++++++++++++++++++
 core/sql/regress/hive/TEST005            |  8 +++++++
 core/sql/regress/hive/TEST005_a.hive.sql | 10 +++++++++
 core/sql/regress/hive/tbl_dos.data       | 10 +++++++++
 core/sql/sqlcomp/DefaultConstants.h      |  4 ++++
 core/sql/sqlcomp/nadefaults.cpp          |  5 +++++
 8 files changed, 99 insertions(+), 11 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-trafodion/blob/f48cc2b6/core/sql/executor/ExHdfsScan.cpp
----------------------------------------------------------------------
diff --git a/core/sql/executor/ExHdfsScan.cpp b/core/sql/executor/ExHdfsScan.cpp
index 9016073..6c38ecc 100644
--- a/core/sql/executor/ExHdfsScan.cpp
+++ b/core/sql/executor/ExHdfsScan.cpp
@@ -367,6 +367,9 @@ ExWorkProcRetcode ExHdfsScanTcb::work()
   char cursorId[8];
   HdfsFileInfo *hdfo = NULL;
   Lng32 openType = 0;
+  int hiveScanMode = 0;
+
+  hiveScanMode = CmpCommon::getDefaultLong(HIVE_SCAN_SPECIAL_MODE);
   
   while (!qparent_.down->isEmpty())
     {
@@ -720,7 +723,7 @@ ExWorkProcRetcode ExHdfsScanTcb::work()
                // Position in the hdfsScanBuffer_ to the
                // first record delimiter.  
                hdfsBufNextRow_ = hdfs_strchr(hdfsScanBuffer_,
-                                         hdfsScanTdb().recordDelimiter_, 
hdfsScanBuffer_+trailingPrevRead_+ bytesRead_, checkRangeDelimiter_);
+                                         hdfsScanTdb().recordDelimiter_, 
hdfsScanBuffer_+trailingPrevRead_+ bytesRead_, checkRangeDelimiter_, 
hiveScanMode);
                // May be that the record is too long? Or data isn't ascii?
                // Or delimiter is incorrect.
                if (! hdfsBufNextRow_)
@@ -768,7 +771,7 @@ ExWorkProcRetcode ExHdfsScanTcb::work()
          ComDiagsArea *transformDiags = NULL;
          int err = 0;
          char *startOfNextRow =
-             extractAndTransformAsciiSourceToSqlRow(err, transformDiags);
+             extractAndTransformAsciiSourceToSqlRow(err, transformDiags, 
hiveScanMode);
 
          bool rowWillBeSelected = true;
          lastErrorCnd_ = NULL;
@@ -1378,7 +1381,7 @@ ExWorkProcRetcode ExHdfsScanTcb::work()
 }
 
 char * ExHdfsScanTcb::extractAndTransformAsciiSourceToSqlRow(int &err,
-                                                            ComDiagsArea* 
&diagsArea)
+                                                            ComDiagsArea* 
&diagsArea, int mode)
 {
   err = 0;
   char *sourceData = hdfsBufNextRow_;
@@ -1395,7 +1398,7 @@ char * 
ExHdfsScanTcb::extractAndTransformAsciiSourceToSqlRow(int &err,
   hdfsLoggingRow_ = hdfsBufNextRow_;
   if (asciiSourceTD->numAttrs() == 0)
   {
-     sourceRowEnd = hdfs_strchr(sourceData, rd, sourceDataEnd, 
checkRangeDelimiter_);
+     sourceRowEnd = hdfs_strchr(sourceData, rd, sourceDataEnd, 
checkRangeDelimiter_, mode);
      hdfsLoggingRowEnd_  = sourceRowEnd;
 
      if (!sourceRowEnd)
@@ -1430,7 +1433,7 @@ char * 
ExHdfsScanTcb::extractAndTransformAsciiSourceToSqlRow(int &err,
         attr = NULL;
  
       if (!isTrailingMissingColumn) {
-         sourceColEnd = hdfs_strchr(sourceData, rd, cd, sourceDataEnd, 
checkRangeDelimiter_, &rdSeen);
+         sourceColEnd = hdfs_strchr(sourceData, rd, cd, sourceDataEnd, 
checkRangeDelimiter_, &rdSeen,mode);
          if (sourceColEnd == NULL) {
             if (rdSeen || (sourceRowEnd == NULL))
                return NULL;
@@ -1493,7 +1496,7 @@ char * 
ExHdfsScanTcb::extractAndTransformAsciiSourceToSqlRow(int &err,
   // rowDelimiter is encountered
   // So try to find the record delimiter
   if (sourceRowEnd == NULL) {
-     sourceRowEnd = hdfs_strchr(sourceData, rd, sourceDataEnd, 
checkRangeDelimiter_);
+     sourceRowEnd = hdfs_strchr(sourceData, rd, sourceDataEnd, 
checkRangeDelimiter_,mode);
      if (sourceRowEnd) {
         hdfsLoggingRowEnd_  = sourceRowEnd;
         if ((endOfRequestedRange_) &&

http://git-wip-us.apache.org/repos/asf/incubator-trafodion/blob/f48cc2b6/core/sql/executor/ExHdfsScan.h
----------------------------------------------------------------------
diff --git a/core/sql/executor/ExHdfsScan.h b/core/sql/executor/ExHdfsScan.h
index 8ac16d3..2b3c785 100644
--- a/core/sql/executor/ExHdfsScan.h
+++ b/core/sql/executor/ExHdfsScan.h
@@ -37,6 +37,9 @@
 #include <time.h>
 #include "ExHbaseAccess.h"
 #include "ExpHbaseInterface.h"
+
+#define HIVE_MODE_DOSFORMAT    1
+
 // -----------------------------------------------------------------------
 // Classes defined in this file
 // -----------------------------------------------------------------------
@@ -200,7 +203,7 @@ protected:
   // hdfsRead. Or it could be the eof (in which case there is a good
   // row still waiting to be processed).
   char * extractAndTransformAsciiSourceToSqlRow(int &err,
-                                               ComDiagsArea * &diagsArea);
+                                               ComDiagsArea * &diagsArea, int 
mode);
 
   short moveRowToUpQueue(const char * row, Lng32 len, 
                          short * rc, NABoolean isVarchar);
@@ -429,27 +432,44 @@ protected:
 
 #define RANGE_DELIMITER '\002'
 
-inline char *hdfs_strchr(const char *s, int c, const char *end, NABoolean 
checkRangeDelimiter)
+inline char *hdfs_strchr(char *s, int c, const char *end, NABoolean 
checkRangeDelimiter, int mode = 0)
 {
   char *curr = (char *)s;
-
+  int count=0;
   while (curr < end) {
     if (*curr == c)
+    {
+       if((mode & HIVE_MODE_DOSFORMAT) > 0 ) // The line terminator and we 
want to remove the \r before it
+       {
+         if(count>0 && c == '\n')
+         {
+           if(s[count-1] == '\r') s[count-1] = ' '; 
+         }
+       }
        return curr;
+    }
     if (checkRangeDelimiter &&*curr == RANGE_DELIMITER)
        return NULL;
     curr++;
+    count++;
   }
   return NULL;
 }
 
 
-inline char *hdfs_strchr(const char *s, int rd, int cd, const char *end, 
NABoolean checkRangeDelimiter, NABoolean *rdSeen)
+inline char *hdfs_strchr(char *s, int rd, int cd, const char *end, NABoolean 
checkRangeDelimiter, NABoolean *rdSeen, int mode = 0)
 {
   char *curr = (char *)s;
-
+  int count = 0;
   while (curr < end) {
     if (*curr == rd) {
+       if( (mode & HIVE_MODE_DOSFORMAT)>0 ) //convert DOS format by replace 
the \r with space if it is \r\n here
+       {
+         if(count>0 && rd == '\n')
+         {
+           if(s[count-1] == '\r') s[count-1] = ' ';
+         }
+       }
        *rdSeen = TRUE;
        return curr;
     }
@@ -464,6 +484,7 @@ inline char *hdfs_strchr(const char *s, int rd, int cd, 
const char *end, NABoole
        return NULL;
     }
     curr++;
+    count++;
   }
   *rdSeen = FALSE;
   return NULL;

http://git-wip-us.apache.org/repos/asf/incubator-trafodion/blob/f48cc2b6/core/sql/regress/hive/EXPECTED005
----------------------------------------------------------------------
diff --git a/core/sql/regress/hive/EXPECTED005 
b/core/sql/regress/hive/EXPECTED005
index 9299d25..7d7b108 100644
--- a/core/sql/regress/hive/EXPECTED005
+++ b/core/sql/regress/hive/EXPECTED005
@@ -447,6 +447,7 @@ TINT    SM      I            BIG                   STR      
                  F
 
 --- 1 row(s) selected.
 >>
+>>
 >>cqd HIVE_FILE_CHARSET 'GBK';
 
 --- SQL operation complete.
@@ -473,4 +474,30 @@ C1           (EXPR)
 
 --- SQL operation complete.
 >>
+>>cqd CALL_EMBEDDED_ARKCMP 'ON';
+
+--- SQL operation complete.
+>>cqd HIVE_SCAN_SPECIAL_MODE '1';
+
+--- SQL operation complete.
+>>select * from tbl_dos;
+
+C1           C2           C3           C4                       
+-----------  -----------  -----------  -------------------------
+
+          0        39478         8147  2008-07-17               
+          1        21944         8327  2005-05-12               
+          2        32730         9999  2000-11-05               
+          3        19653         5727  2005-06-24               
+          4        67794         6012  2008-07-01               
+          5        93265         5823  2012-06-26               
+          6        28219          909  2009-04-26               
+          7        23967         8290  2006-02-21               
+          8        24265         8663  2006-10-06               
+          9        70273         3363  2001-03-17               
+
+--- 10 row(s) selected.
+>>cqd HIVE_SCAN_SPECIAL_MODE reset;
+
+--- SQL operation complete.
 >>log;

http://git-wip-us.apache.org/repos/asf/incubator-trafodion/blob/f48cc2b6/core/sql/regress/hive/TEST005
----------------------------------------------------------------------
diff --git a/core/sql/regress/hive/TEST005 b/core/sql/regress/hive/TEST005
index 5bc2a21..e2c824f 100644
--- a/core/sql/regress/hive/TEST005
+++ b/core/sql/regress/hive/TEST005
@@ -31,18 +31,21 @@ sh regrhadoop.ksh fs -mkdir  
/user/hive/exttables/customer_temp;
 sh regrhadoop.ksh fs -mkdir  /user/hive/exttables/tbl_utf8;
 sh regrhadoop.ksh fs -mkdir  /user/hive/exttables/tbl_type;
 sh regrhadoop.ksh fs -mkdir  /user/hive/exttables/tbl_gbk;
+sh regrhadoop.ksh fs -mkdir  /user/hive/exttables/tbl_dos;
 --empty folders
 sh regrhadoop.ksh fs -rm   /user/hive/exttables/customer_ddl/*;
 sh regrhadoop.ksh fs -rm   /user/hive/exttables/customer_temp/*;
 sh regrhadoop.ksh fs -rm   /user/hive/exttables/tbl_utf8/*;
 sh regrhadoop.ksh fs -rm   /user/hive/exttables/tbl_type/*;
 sh regrhadoop.ksh fs -rm   /user/hive/exttables/tbl_gbk/*;
+sh regrhadoop.ksh fs -rm   /user/hive/exttables/tbl_dos/*;
 
 --- setup Hive tables
 sh regrhive.ksh -v -f $REGRTSTDIR/TEST005_a.hive.sql;
 sh regrhadoop.ksh fs -put $REGRTSTDIR/tbl_utf8.data 
/user/hive/exttables/tbl_utf8;
 sh regrhadoop.ksh fs -put $REGRTSTDIR/tbl_type.data 
/user/hive/exttables/tbl_type;
 sh regrhadoop.ksh fs -put $REGRTSTDIR/tbl_gbk.data 
/user/hive/exttables/tbl_gbk;
+sh regrhadoop.ksh fs -put $REGRTSTDIR/tbl_dos.data 
/user/hive/exttables/tbl_dos;
 
 log LOG005 clear;
 
@@ -222,8 +225,13 @@ select * from tbl_type;
 insert into tbl_type_temp select * from tbl_type;
 select * from tbl_type_temp;
 
+
 cqd HIVE_FILE_CHARSET 'GBK';
 select c1, CONVERTTOHEX(c2) from tbl_gbk;
 cqd HIVE_FILE_CHARSET reset;
 
+cqd CALL_EMBEDDED_ARKCMP 'ON';
+cqd HIVE_SCAN_SPECIAL_MODE '1';
+select * from tbl_dos;
+cqd HIVE_SCAN_SPECIAL_MODE reset;
 log;

http://git-wip-us.apache.org/repos/asf/incubator-trafodion/blob/f48cc2b6/core/sql/regress/hive/TEST005_a.hive.sql
----------------------------------------------------------------------
diff --git a/core/sql/regress/hive/TEST005_a.hive.sql 
b/core/sql/regress/hive/TEST005_a.hive.sql
index 1b5c580..cf1fbb4 100644
--- a/core/sql/regress/hive/TEST005_a.hive.sql
+++ b/core/sql/regress/hive/TEST005_a.hive.sql
@@ -163,3 +163,13 @@ create external table tbl_gbk
 )
 row format delimited fields terminated by '\t'
 location '/user/hive/exttables/tbl_gbk';
+
+drop table tbl_dos;
+CREATE  external TABLE  tbl_dos(
+   c1  int,
+   c2  int,
+   c3  int,
+   c4  string)
+row format delimited fields terminated by '|' 
+location '/user/hive/exttables/tbl_dos'
+;

http://git-wip-us.apache.org/repos/asf/incubator-trafodion/blob/f48cc2b6/core/sql/regress/hive/tbl_dos.data
----------------------------------------------------------------------
diff --git a/core/sql/regress/hive/tbl_dos.data 
b/core/sql/regress/hive/tbl_dos.data
new file mode 100644
index 0000000..167db79
--- /dev/null
+++ b/core/sql/regress/hive/tbl_dos.data
@@ -0,0 +1,10 @@
+0|39478|8147|2008-07-17
+1|21944|8327|2005-05-12
+2|32730|9999|2000-11-05
+3|19653|5727|2005-06-24
+4|67794|6012|2008-07-01
+5|93265|5823|2012-06-26
+6|28219|909|2009-04-26
+7|23967|8290|2006-02-21
+8|24265|8663|2006-10-06
+9|70273|3363|2001-03-17

http://git-wip-us.apache.org/repos/asf/incubator-trafodion/blob/f48cc2b6/core/sql/sqlcomp/DefaultConstants.h
----------------------------------------------------------------------
diff --git a/core/sql/sqlcomp/DefaultConstants.h 
b/core/sql/sqlcomp/DefaultConstants.h
index 479c2a5..a941dad 100644
--- a/core/sql/sqlcomp/DefaultConstants.h
+++ b/core/sql/sqlcomp/DefaultConstants.h
@@ -3809,6 +3809,10 @@ enum DefaultConstants
   // Currently syskey, _salt_, _division_.
   TRAF_ALLOW_RESERVED_COLNAMES,
 
+  // Bitmap controlling special behaviors of the Hive scan:
+  //   1 : DOS-format conversion on ('\r' before a '\n' record delimiter becomes a space)
+  //   2 : reserved (TODO)
+  HIVE_SCAN_SPECIAL_MODE,
   // This enum constant must be the LAST one in the list; it's a count,
   // not an Attribute (it's not IN DefaultDefaults; it's the SIZE of it)!
   __NUM_DEFAULT_ATTRIBUTES

http://git-wip-us.apache.org/repos/asf/incubator-trafodion/blob/f48cc2b6/core/sql/sqlcomp/nadefaults.cpp
----------------------------------------------------------------------
diff --git a/core/sql/sqlcomp/nadefaults.cpp b/core/sql/sqlcomp/nadefaults.cpp
index 42400e1..f5db22b 100644
--- a/core/sql/sqlcomp/nadefaults.cpp
+++ b/core/sql/sqlcomp/nadefaults.cpp
@@ -1971,6 +1971,7 @@ SDDkwd__(EXE_DIAGNOSTIC_EVENTS,           "OFF"),
   DDflt0_(HIVE_MIN_BYTES_PER_ESP_PARTITION,     "67108864"),
   DDui___(HIVE_NUM_ESPS_PER_DATANODE,           "2"),
   DDpct__(HIVE_NUM_ESPS_ROUND_DEVIATION,        "34"),
+  DDint__(HIVE_SCAN_SPECIAL_MODE,                "0"),
   DDkwd__(HIVE_SORT_HDFS_HOSTS,                 "ON"),
   DD_____(HIVE_USE_FAKE_SQ_NODE_NAMES,          "" ),
   DDkwd__(HIVE_USE_FAKE_TABLE_DESC,             "OFF"),
@@ -6666,6 +6667,10 @@ DefaultToken NADefaults::token(Int32 attrEnum,
        isValid = TRUE;
       break;
 
+    case HIVE_SCAN_SPECIAL_MODE:
+       isValid = TRUE;
+       break;
+
     case IS_SQLCI:
       // for primary mxcmp that is invoked for user queries, the only valid
       // value for mxci_process cqd is TRUE. This cqd is set once by mxci

Reply via email to