Repository: incubator-trafodion Updated Branches: refs/heads/master f0bc795b7 -> 464a0e0ab
[TRAFODION-1912]support automatic convert DOS format during bulkload Project: http://git-wip-us.apache.org/repos/asf/incubator-trafodion/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-trafodion/commit/f48cc2b6 Tree: http://git-wip-us.apache.org/repos/asf/incubator-trafodion/tree/f48cc2b6 Diff: http://git-wip-us.apache.org/repos/asf/incubator-trafodion/diff/f48cc2b6 Branch: refs/heads/master Commit: f48cc2b69043b33c1abd2b9cec9e3bf1e2299d11 Parents: a9eae1a Author: Liu Ming <ovis_p...@sina.com> Authored: Tue Apr 5 02:05:48 2016 +0000 Committer: Liu Ming <ovis_p...@sina.com> Committed: Tue Apr 5 02:05:48 2016 +0000 ---------------------------------------------------------------------- core/sql/executor/ExHdfsScan.cpp | 15 +++++++------ core/sql/executor/ExHdfsScan.h | 31 ++++++++++++++++++++++----- core/sql/regress/hive/EXPECTED005 | 27 +++++++++++++++++++++++ core/sql/regress/hive/TEST005 | 8 +++++++ core/sql/regress/hive/TEST005_a.hive.sql | 10 +++++++++ core/sql/regress/hive/tbl_dos.data | 10 +++++++++ core/sql/sqlcomp/DefaultConstants.h | 4 ++++ core/sql/sqlcomp/nadefaults.cpp | 5 +++++ 8 files changed, 99 insertions(+), 11 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-trafodion/blob/f48cc2b6/core/sql/executor/ExHdfsScan.cpp ---------------------------------------------------------------------- diff --git a/core/sql/executor/ExHdfsScan.cpp b/core/sql/executor/ExHdfsScan.cpp index 9016073..6c38ecc 100644 --- a/core/sql/executor/ExHdfsScan.cpp +++ b/core/sql/executor/ExHdfsScan.cpp @@ -367,6 +367,9 @@ ExWorkProcRetcode ExHdfsScanTcb::work() char cursorId[8]; HdfsFileInfo *hdfo = NULL; Lng32 openType = 0; + int hiveScanMode = 0; + + hiveScanMode = CmpCommon::getDefaultLong(HIVE_SCAN_SPECIAL_MODE); while (!qparent_.down->isEmpty()) { @@ -720,7 +723,7 @@ ExWorkProcRetcode ExHdfsScanTcb::work() // Position in the hdfsScanBuffer_ to the // first record delimiter. hdfsBufNextRow_ = hdfs_strchr(hdfsScanBuffer_, - hdfsScanTdb().recordDelimiter_, hdfsScanBuffer_+trailingPrevRead_+ bytesRead_, checkRangeDelimiter_); + hdfsScanTdb().recordDelimiter_, hdfsScanBuffer_+trailingPrevRead_+ bytesRead_, checkRangeDelimiter_, hiveScanMode); // May be that the record is too long? Or data isn't ascii? // Or delimiter is incorrect. if (! hdfsBufNextRow_) @@ -768,7 +771,7 @@ ExWorkProcRetcode ExHdfsScanTcb::work() ComDiagsArea *transformDiags = NULL; int err = 0; char *startOfNextRow = - extractAndTransformAsciiSourceToSqlRow(err, transformDiags); + extractAndTransformAsciiSourceToSqlRow(err, transformDiags, hiveScanMode); bool rowWillBeSelected = true; lastErrorCnd_ = NULL; @@ -1378,7 +1381,7 @@ ExWorkProcRetcode ExHdfsScanTcb::work() } char * ExHdfsScanTcb::extractAndTransformAsciiSourceToSqlRow(int &err, - ComDiagsArea* &diagsArea) + ComDiagsArea* &diagsArea, int mode) { err = 0; char *sourceData = hdfsBufNextRow_; @@ -1395,7 +1398,7 @@ char * ExHdfsScanTcb::extractAndTransformAsciiSourceToSqlRow(int &err, hdfsLoggingRow_ = hdfsBufNextRow_; if (asciiSourceTD->numAttrs() == 0) { - sourceRowEnd = hdfs_strchr(sourceData, rd, sourceDataEnd, checkRangeDelimiter_); + sourceRowEnd = hdfs_strchr(sourceData, rd, sourceDataEnd, checkRangeDelimiter_, mode); hdfsLoggingRowEnd_ = sourceRowEnd; if (!sourceRowEnd) @@ -1430,7 +1433,7 @@ char * ExHdfsScanTcb::extractAndTransformAsciiSourceToSqlRow(int &err, attr = NULL; if (!isTrailingMissingColumn) { - sourceColEnd = hdfs_strchr(sourceData, rd, cd, sourceDataEnd, checkRangeDelimiter_, &rdSeen); + sourceColEnd = hdfs_strchr(sourceData, rd, cd, sourceDataEnd, checkRangeDelimiter_, &rdSeen,mode); if (sourceColEnd == NULL) { if (rdSeen || (sourceRowEnd == NULL)) return NULL; @@ -1493,7 +1496,7 @@ char * ExHdfsScanTcb::extractAndTransformAsciiSourceToSqlRow(int &err, // rowDelimiter is encountered // So try to find the record delimiter if (sourceRowEnd == NULL) { - sourceRowEnd = hdfs_strchr(sourceData, rd, sourceDataEnd, checkRangeDelimiter_); + sourceRowEnd = hdfs_strchr(sourceData, rd, sourceDataEnd, checkRangeDelimiter_,mode); if (sourceRowEnd) { hdfsLoggingRowEnd_ = sourceRowEnd; if ((endOfRequestedRange_) && http://git-wip-us.apache.org/repos/asf/incubator-trafodion/blob/f48cc2b6/core/sql/executor/ExHdfsScan.h ---------------------------------------------------------------------- diff --git a/core/sql/executor/ExHdfsScan.h b/core/sql/executor/ExHdfsScan.h index 8ac16d3..2b3c785 100644 --- a/core/sql/executor/ExHdfsScan.h +++ b/core/sql/executor/ExHdfsScan.h @@ -37,6 +37,9 @@ #include <time.h> #include "ExHbaseAccess.h" #include "ExpHbaseInterface.h" + +#define HIVE_MODE_DOSFORMAT 1 + // ----------------------------------------------------------------------- // Classes defined in this file // ----------------------------------------------------------------------- @@ -200,7 +203,7 @@ protected: // hdfsRead. Or it could be the eof (in which case there is a good // row still waiting to be processed). char * extractAndTransformAsciiSourceToSqlRow(int &err, - ComDiagsArea * &diagsArea); + ComDiagsArea * &diagsArea, int mode); short moveRowToUpQueue(const char * row, Lng32 len, short * rc, NABoolean isVarchar); @@ -429,27 +432,44 @@ protected: #define RANGE_DELIMITER '\002' -inline char *hdfs_strchr(const char *s, int c, const char *end, NABoolean checkRangeDelimiter) +inline char *hdfs_strchr(char *s, int c, const char *end, NABoolean checkRangeDelimiter, int mode = 0) { char *curr = (char *)s; - + int count=0; while (curr < end) { if (*curr == c) + { + if((mode & HIVE_MODE_DOSFORMAT) > 0 ) // The line terminator and we want to remove the \r before it + { + if(count>0 && c == '\n') + { + if(s[count-1] == '\r') s[count-1] = ' '; + } + } return curr; + } if (checkRangeDelimiter &&*curr == RANGE_DELIMITER) return NULL; curr++; + count++; } return NULL; } -inline char *hdfs_strchr(const char *s, int rd, int cd, const char *end, NABoolean checkRangeDelimiter, NABoolean *rdSeen) +inline char *hdfs_strchr(char *s, int rd, int cd, const char *end, NABoolean checkRangeDelimiter, NABoolean *rdSeen, int mode = 0) { char *curr = (char *)s; - + int count = 0; while (curr < end) { if (*curr == rd) { + if( (mode & HIVE_MODE_DOSFORMAT)>0 ) //convert DOS format by replace the \r with space if it is \r\n here + { + if(count>0 && rd == '\n') + { + if(s[count-1] == '\r') s[count-1] = ' '; + } + } *rdSeen = TRUE; return curr; } @@ -464,6 +484,7 @@ inline char *hdfs_strchr(const char *s, int rd, int cd, const char *end, NABoole return NULL; } curr++; + count++; } *rdSeen = FALSE; return NULL; http://git-wip-us.apache.org/repos/asf/incubator-trafodion/blob/f48cc2b6/core/sql/regress/hive/EXPECTED005 ---------------------------------------------------------------------- diff --git a/core/sql/regress/hive/EXPECTED005 b/core/sql/regress/hive/EXPECTED005 index 9299d25..7d7b108 100644 --- a/core/sql/regress/hive/EXPECTED005 +++ b/core/sql/regress/hive/EXPECTED005 @@ -447,6 +447,7 @@ TINT SM I BIG STR F --- 1 row(s) selected. >> +>> >>cqd HIVE_FILE_CHARSET 'GBK'; --- SQL operation complete. @@ -473,4 +474,30 @@ C1 (EXPR) --- SQL operation complete. >> +>>cqd CALL_EMBEDDED_ARKCMP 'ON'; + +--- SQL operation complete. +>>cqd HIVE_SCAN_SPECIAL_MODE '1'; + +--- SQL operation complete. +>>select * from tbl_dos; + +C1 C2 C3 C4 +----------- ----------- ----------- ------------------------- + + 0 39478 8147 2008-07-17 + 1 21944 8327 2005-05-12 + 2 32730 9999 2000-11-05 + 3 19653 5727 2005-06-24 + 4 67794 6012 2008-07-01 + 5 93265 5823 2012-06-26 + 6 28219 909 2009-04-26 + 7 23967 8290 2006-02-21 + 8 24265 8663 2006-10-06 + 9 70273 3363 2001-03-17 + +--- 10 row(s) selected. +>>cqd HIVE_SCAN_SPECIAL_MODE reset; + +--- SQL operation complete. >>log; http://git-wip-us.apache.org/repos/asf/incubator-trafodion/blob/f48cc2b6/core/sql/regress/hive/TEST005 ---------------------------------------------------------------------- diff --git a/core/sql/regress/hive/TEST005 b/core/sql/regress/hive/TEST005 index 5bc2a21..e2c824f 100644 --- a/core/sql/regress/hive/TEST005 +++ b/core/sql/regress/hive/TEST005 @@ -31,18 +31,21 @@ sh regrhadoop.ksh fs -mkdir /user/hive/exttables/customer_temp; sh regrhadoop.ksh fs -mkdir /user/hive/exttables/tbl_utf8; sh regrhadoop.ksh fs -mkdir /user/hive/exttables/tbl_type; sh regrhadoop.ksh fs -mkdir /user/hive/exttables/tbl_gbk; +sh regrhadoop.ksh fs -mkdir /user/hive/exttables/tbl_dos; --empty folders sh regrhadoop.ksh fs -rm /user/hive/exttables/customer_ddl/*; sh regrhadoop.ksh fs -rm /user/hive/exttables/customer_temp/*; sh regrhadoop.ksh fs -rm /user/hive/exttables/tbl_utf8/*; sh regrhadoop.ksh fs -rm /user/hive/exttables/tbl_type/*; sh regrhadoop.ksh fs -rm /user/hive/exttables/tbl_gbk/*; +sh regrhadoop.ksh fs -rm /user/hive/exttables/tbl_dos/*; --- setup Hive tables sh regrhive.ksh -v -f $REGRTSTDIR/TEST005_a.hive.sql; sh regrhadoop.ksh fs -put $REGRTSTDIR/tbl_utf8.data /user/hive/exttables/tbl_utf8; sh regrhadoop.ksh fs -put $REGRTSTDIR/tbl_type.data /user/hive/exttables/tbl_type; sh regrhadoop.ksh fs -put $REGRTSTDIR/tbl_gbk.data /user/hive/exttables/tbl_gbk; +sh regrhadoop.ksh fs -put $REGRTSTDIR/tbl_dos.data /user/hive/exttables/tbl_dos; log LOG005 clear; @@ -222,8 +225,13 @@ select * from tbl_type; insert into tbl_type_temp select * from tbl_type; select * from tbl_type_temp; + cqd HIVE_FILE_CHARSET 'GBK'; select c1, CONVERTTOHEX(c2) from tbl_gbk; cqd HIVE_FILE_CHARSET reset; +cqd CALL_EMBEDDED_ARKCMP 'ON'; +cqd HIVE_SCAN_SPECIAL_MODE '1'; +select * from tbl_dos; +cqd HIVE_SCAN_SPECIAL_MODE reset; log; http://git-wip-us.apache.org/repos/asf/incubator-trafodion/blob/f48cc2b6/core/sql/regress/hive/TEST005_a.hive.sql ---------------------------------------------------------------------- diff --git a/core/sql/regress/hive/TEST005_a.hive.sql b/core/sql/regress/hive/TEST005_a.hive.sql index 1b5c580..cf1fbb4 100644 --- a/core/sql/regress/hive/TEST005_a.hive.sql +++ b/core/sql/regress/hive/TEST005_a.hive.sql @@ -163,3 +163,13 @@ create external table tbl_gbk ) row format delimited fields terminated by '\t' location '/user/hive/exttables/tbl_gbk'; + +drop table tbl_dos; +CREATE external TABLE tbl_dos( + c1 int, + c2 int, + c3 int, + c4 string) +row format delimited fields terminated by '|' +location '/user/hive/exttables/tbl_dos' +; http://git-wip-us.apache.org/repos/asf/incubator-trafodion/blob/f48cc2b6/core/sql/regress/hive/tbl_dos.data ---------------------------------------------------------------------- diff --git a/core/sql/regress/hive/tbl_dos.data b/core/sql/regress/hive/tbl_dos.data new file mode 100644 index 0000000..167db79 --- /dev/null +++ b/core/sql/regress/hive/tbl_dos.data @@ -0,0 +1,10 @@ +0|39478|8147|2008-07-17 +1|21944|8327|2005-05-12 +2|32730|9999|2000-11-05 +3|19653|5727|2005-06-24 +4|67794|6012|2008-07-01 +5|93265|5823|2012-06-26 +6|28219|909|2009-04-26 +7|23967|8290|2006-02-21 +8|24265|8663|2006-10-06 +9|70273|3363|2001-03-17 http://git-wip-us.apache.org/repos/asf/incubator-trafodion/blob/f48cc2b6/core/sql/sqlcomp/DefaultConstants.h ---------------------------------------------------------------------- diff --git a/core/sql/sqlcomp/DefaultConstants.h b/core/sql/sqlcomp/DefaultConstants.h index 479c2a5..a941dad 100644 --- a/core/sql/sqlcomp/DefaultConstants.h +++ b/core/sql/sqlcomp/DefaultConstants.h @@ -3809,6 +3809,10 @@ enum DefaultConstants // Currently syskey, _salt_, _division_. TRAF_ALLOW_RESERVED_COLNAMES, + // bitmap to control various special behavior of HIVE_SCAN + // // 1 : DOS FORMAT conversion on + // // 2 : todo + HIVE_SCAN_SPECIAL_MODE, // This enum constant must be the LAST one in the list; it's a count, // not an Attribute (it's not IN DefaultDefaults; it's the SIZE of it)! __NUM_DEFAULT_ATTRIBUTES http://git-wip-us.apache.org/repos/asf/incubator-trafodion/blob/f48cc2b6/core/sql/sqlcomp/nadefaults.cpp ---------------------------------------------------------------------- diff --git a/core/sql/sqlcomp/nadefaults.cpp b/core/sql/sqlcomp/nadefaults.cpp index 42400e1..f5db22b 100644 --- a/core/sql/sqlcomp/nadefaults.cpp +++ b/core/sql/sqlcomp/nadefaults.cpp @@ -1971,6 +1971,7 @@ SDDkwd__(EXE_DIAGNOSTIC_EVENTS, "OFF"), DDflt0_(HIVE_MIN_BYTES_PER_ESP_PARTITION, "67108864"), DDui___(HIVE_NUM_ESPS_PER_DATANODE, "2"), DDpct__(HIVE_NUM_ESPS_ROUND_DEVIATION, "34"), + DDint__(HIVE_SCAN_SPECIAL_MODE, "0"), DDkwd__(HIVE_SORT_HDFS_HOSTS, "ON"), DD_____(HIVE_USE_FAKE_SQ_NODE_NAMES, "" ), DDkwd__(HIVE_USE_FAKE_TABLE_DESC, "OFF"), @@ -6666,6 +6667,10 @@ DefaultToken NADefaults::token(Int32 attrEnum, isValid = TRUE; break; + case HIVE_SCAN_SPECIAL_MODE: + isValid = TRUE; + break; + case IS_SQLCI: // for primary mxcmp that is invoked for user queries, the only valid // value for mxci_process cqd is TRUE. This cqd is set once by mxci