HIVE-18517: Vectorization: Fix VectorMapOperator to accept VRBs and check vectorized flag correctly to support LLAP Caching (Matt McCline, review by Gopal Vijayaraghavan)
Project: http://git-wip-us.apache.org/repos/asf/hive/repo Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/255cf4ab Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/255cf4ab Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/255cf4ab Branch: refs/heads/master Commit: 255cf4ab22295996262ac39af703c2747b98dbf3 Parents: 8de3334 Author: Matt McCline <[email protected]> Authored: Fri Jan 26 13:12:40 2018 -0600 Committer: Matt McCline <[email protected]> Committed: Fri Jan 26 13:12:40 2018 -0600 ---------------------------------------------------------------------- .../test/resources/testconfiguration.properties | 29 + .../hive/llap/io/api/impl/LlapInputFormat.java | 2 +- .../apache/hadoop/hive/ql/exec/Utilities.java | 14 +- .../hive/ql/exec/vector/VectorMapOperator.java | 112 +- .../hadoop/hive/ql/io/HiveInputFormat.java | 2 +- .../hadoop/hive/ql/io/NullRowsInputFormat.java | 2 +- .../hadoop/hive/ql/io/orc/OrcInputFormat.java | 4 +- .../ql/io/parquet/MapredParquetInputFormat.java | 2 +- .../hive/ql/optimizer/physical/Vectorizer.java | 74 +- .../org/apache/hadoop/hive/ql/plan/MapWork.java | 19 +- .../hive/ql/plan/VectorPartitionDesc.java | 9 +- .../schema_evol_orc_acid_part_llap_io.q | 320 ++++ .../schema_evol_orc_acid_part_update_llap_io.q | 161 ++ .../schema_evol_orc_acid_table_llap_io.q | 318 +++ .../schema_evol_orc_acid_table_update_llap_io.q | 113 ++ .../schema_evol_orc_acidvec_part_llap_io.q | 359 ++++ ...chema_evol_orc_acidvec_part_update_llap_io.q | 119 ++ .../schema_evol_orc_acidvec_table_llap_io.q | 357 ++++ ...hema_evol_orc_acidvec_table_update_llap_io.q | 113 ++ ...a_evol_orc_nonvec_part_all_complex_llap_io.q | 164 ++ ...evol_orc_nonvec_part_all_primitive_llap_io.q | 207 ++ .../schema_evol_orc_nonvec_part_llap_io.q | 338 ++++ .../schema_evol_orc_nonvec_table_llap_io.q | 325 ++++ ...hema_evol_orc_vec_part_all_complex_llap_io.q | 164 ++ ...ma_evol_orc_vec_part_all_primitive_llap_io.q | 207 ++ .../schema_evol_orc_vec_part_llap_io.q | 339 ++++ .../schema_evol_orc_vec_table_llap_io.q | 324 ++++ ..._evol_text_nonvec_part_all_complex_llap_io.q | 164 ++ ...vol_text_nonvec_part_all_primitive_llap_io.q | 222 +++ .../schema_evol_text_nonvec_part_llap_io.q | 338 ++++ .../schema_evol_text_nonvec_table_llap_io.q | 324 ++++ ...ema_evol_text_vec_part_all_complex_llap_io.q | 174 ++ ...a_evol_text_vec_part_all_primitive_llap_io.q | 223 +++ .../schema_evol_text_vec_part_llap_io.q | 374 ++++ .../schema_evol_text_vec_table_llap_io.q | 360 ++++ ..._evol_text_vecrow_part_all_complex_llap_io.q | 175 ++ ...vol_text_vecrow_part_all_primitive_llap_io.q | 223 +++ .../schema_evol_text_vecrow_part_llap_io.q | 374 ++++ .../schema_evol_text_vecrow_table_llap_io.q | 363 ++++ .../queries/clientpositive/vector_llap_text_1.q | 45 + .../schema_evol_orc_acid_part_llap_io.q.out | 1080 +++++++++++ ...hema_evol_orc_acid_part_update_llap_io.q.out | 1681 ++++++++++++++++ .../schema_evol_orc_acid_table_llap_io.q.out | 1077 +++++++++++ ...ema_evol_orc_acid_table_update_llap_io.q.out | 449 +++++ .../schema_evol_orc_acidvec_part_llap_io.q.out | 1811 ++++++++++++++++++ ...a_evol_orc_acidvec_part_update_llap_io.q.out | 537 ++++++ .../schema_evol_orc_acidvec_table_llap_io.q.out | 1794 +++++++++++++++++ ..._evol_orc_acidvec_table_update_llap_io.q.out | 449 +++++ ...ol_orc_nonvec_part_all_complex_llap_io.q.out | 687 +++++++ ..._orc_nonvec_part_all_primitive_llap_io.q.out | 1042 ++++++++++ .../schema_evol_orc_nonvec_part_llap_io.q.out | 1474 ++++++++++++++ .../schema_evol_orc_nonvec_table_llap_io.q.out | 1291 +++++++++++++ ..._evol_orc_vec_part_all_complex_llap_io.q.out | 765 ++++++++ ...vol_orc_vec_part_all_primitive_llap_io.q.out | 1172 ++++++++++++ .../llap/schema_evol_orc_vec_part_llap_io.q.out | 1708 +++++++++++++++++ .../schema_evol_orc_vec_table_llap_io.q.out | 1416 ++++++++++++++ ...l_text_nonvec_part_all_complex_llap_io.q.out | 687 +++++++ ...text_nonvec_part_all_primitive_llap_io.q.out | 644 +++++++ .../schema_evol_text_nonvec_part_llap_io.q.out | 1474 ++++++++++++++ .../schema_evol_text_nonvec_table_llap_io.q.out | 1291 +++++++++++++ ...evol_text_vec_part_all_complex_llap_io.q.out | 388 ++++ ...ol_text_vec_part_all_primitive_llap_io.q.out | 779 ++++++++ .../schema_evol_text_vec_part_llap_io.q.out | 1204 ++++++++++++ .../schema_evol_text_vec_table_llap_io.q.out | 1185 ++++++++++++ ...l_text_vecrow_part_all_complex_llap_io.q.out | 385 ++++ ...text_vecrow_part_all_primitive_llap_io.q.out | 774 ++++++++ .../schema_evol_text_vecrow_part_llap_io.q.out | 1195 ++++++++++++ .../schema_evol_text_vecrow_table_llap_io.q.out | 1176 ++++++++++++ .../llap/vector_llap_text_1.q.out | 351 ++++ .../clientpositive/parquet_no_row_serde.q.out | 2 +- .../vectorization_input_format_excludes.q.out | 24 +- 71 files changed, 37466 insertions(+), 82 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hive/blob/255cf4ab/itests/src/test/resources/testconfiguration.properties ---------------------------------------------------------------------- diff --git a/itests/src/test/resources/testconfiguration.properties b/itests/src/test/resources/testconfiguration.properties index 1362079..80e6aee 100644 --- a/itests/src/test/resources/testconfiguration.properties +++ b/itests/src/test/resources/testconfiguration.properties @@ -598,34 +598,62 @@ minillaplocal.query.files=\ quotedid_smb.q,\ resourceplan.q,\ sample10.q,\ + schema_evol_orc_acid_part_llap_io.q,\ schema_evol_orc_acid_part.q,\ + schema_evol_orc_acid_part_update_llap_io.q,\ schema_evol_orc_acid_part_update.q,\ + schema_evol_orc_acid_table_llap_io.q,\ schema_evol_orc_acid_table.q,\ + schema_evol_orc_acid_table_update_llap_io.q,\ schema_evol_orc_acid_table_update.q,\ + schema_evol_orc_acidvec_part_llap_io.q,\ schema_evol_orc_acidvec_part.q,\ + schema_evol_orc_acidvec_part_update_llap_io.q,\ schema_evol_orc_acidvec_part_update.q,\ + schema_evol_orc_acidvec_table_llap_io.q,\ schema_evol_orc_acidvec_table.q,\ + schema_evol_orc_acidvec_table_update_llap_io.q,\ schema_evol_orc_acidvec_table_update.q,\ + schema_evol_orc_nonvec_part_llap_io.q,\ schema_evol_orc_nonvec_part.q,\ + schema_evol_orc_nonvec_part_all_complex_llap_io.q,\ schema_evol_orc_nonvec_part_all_complex.q,\ + schema_evol_orc_nonvec_part_all_primitive_llap_io.q,\ schema_evol_orc_nonvec_part_all_primitive.q,\ + schema_evol_orc_nonvec_table_llap_io.q,\ schema_evol_orc_nonvec_table.q,\ + schema_evol_orc_vec_part_llap_io.q,\ schema_evol_orc_vec_part.q,\ + schema_evol_orc_vec_part_all_complex_llap_io.q,\ schema_evol_orc_vec_part_all_complex.q,\ + schema_evol_orc_vec_part_all_primitive_llap_io.q,\ schema_evol_orc_vec_part_all_primitive.q,\ + schema_evol_orc_vec_table_llap_io.q,\ schema_evol_orc_vec_table.q,\ schema_evol_stats.q,\ + schema_evol_text_nonvec_part_llap_io.q,\ schema_evol_text_nonvec_part.q,\ + schema_evol_text_nonvec_part_all_complex_llap_io.q,\ schema_evol_text_nonvec_part_all_complex.q,\ + schema_evol_text_nonvec_part_all_primitive_llap_io.q,\ schema_evol_text_nonvec_part_all_primitive.q,\ + schema_evol_text_nonvec_table_llap_io.q,\ schema_evol_text_nonvec_table.q,\ + schema_evol_text_vec_part_llap_io.q,\ schema_evol_text_vec_part.q,\ + schema_evol_text_vec_part_all_complex_llap_io.q,\ schema_evol_text_vec_part_all_complex.q,\ + schema_evol_text_vec_part_all_primitive_llap_io.q,\ schema_evol_text_vec_part_all_primitive.q,\ + schema_evol_text_vec_table_llap_io.q,\ schema_evol_text_vec_table.q,\ + schema_evol_text_vecrow_part_llap_io.q,\ schema_evol_text_vecrow_part.q,\ + schema_evol_text_vecrow_part_all_complex_llap_io.q,\ schema_evol_text_vecrow_part_all_complex.q,\ + schema_evol_text_vecrow_part_all_primitive_llap_io.q,\ schema_evol_text_vecrow_part_all_primitive.q,\ + schema_evol_text_vecrow_table_llap_io.q,\ schema_evol_text_vecrow_table.q,\ selectDistinctStar.q,\ semijoin.q,\ @@ -698,6 +726,7 @@ minillaplocal.query.files=\ vector_join30.q,\ vector_join_filters.q,\ vector_leftsemi_mapjoin.q,\ + vector_llap_text_1.q,\ vector_mapjoin_reduce.q,\ vector_number_compare_projection.q,\ vector_partitioned_date_time.q,\ http://git-wip-us.apache.org/repos/asf/hive/blob/255cf4ab/llap-server/src/java/org/apache/hadoop/hive/llap/io/api/impl/LlapInputFormat.java ---------------------------------------------------------------------- diff --git a/llap-server/src/java/org/apache/hadoop/hive/llap/io/api/impl/LlapInputFormat.java b/llap-server/src/java/org/apache/hadoop/hive/llap/io/api/impl/LlapInputFormat.java index 4336220..bb319f0 100644 --- a/llap-server/src/java/org/apache/hadoop/hive/llap/io/api/impl/LlapInputFormat.java +++ b/llap-server/src/java/org/apache/hadoop/hive/llap/io/api/impl/LlapInputFormat.java @@ -110,7 +110,7 @@ public class LlapInputFormat implements InputFormat<NullWritable, VectorizedRowB } // For non-vectorized operator case, wrap the reader if possible. RecordReader<NullWritable, VectorizedRowBatch> result = rr; - if (!Utilities.getUseVectorizedInputFileFormat(job)) { + if (!Utilities.getIsVectorized(job)) { result = wrapLlapReader(includedCols, rr, split); if (result == null) { // Cannot wrap a reader for non-vectorized pipeline. http://git-wip-us.apache.org/repos/asf/hive/blob/255cf4ab/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java index 2e1fd37..d7b3e4b 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java @@ -3684,12 +3684,12 @@ public final class Utilities { /** * Returns true if a plan is both configured for vectorized execution - * and the node is vectorized and the Input File Format is marked VectorizedInputFileFormat. + * and the node is vectorized. * * The plan may be configured for vectorization * but vectorization disallowed eg. for FetchOperator execution. */ - public static boolean getUseVectorizedInputFileFormat(Configuration conf) { + public static boolean getIsVectorized(Configuration conf) { if (conf.get(VECTOR_MODE) != null) { // this code path is necessary, because with HS2 and client // side split generation we end up not finding the map work. @@ -3697,13 +3697,12 @@ public final class Utilities { // generation is multi-threaded - HS2 plan cache uses thread // locals). return - conf.getBoolean(VECTOR_MODE, false) && - conf.getBoolean(USE_VECTORIZED_INPUT_FILE_FORMAT, false); + conf.getBoolean(VECTOR_MODE, false); } else { if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED) && Utilities.getPlanPath(conf) != null) { MapWork mapWork = Utilities.getMapWork(conf); - return (mapWork.getVectorMode() && mapWork.getUseVectorizedInputFileFormat()); + return mapWork.getVectorMode(); } else { return false; } @@ -3711,10 +3710,9 @@ public final class Utilities { } - public static boolean getUseVectorizedInputFileFormat(Configuration conf, MapWork mapWork) { + public static boolean getIsVectorized(Configuration conf, MapWork mapWork) { return HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED) && - mapWork.getVectorMode() && - mapWork.getUseVectorizedInputFileFormat(); + mapWork.getVectorMode(); } /** http://git-wip-us.apache.org/repos/asf/hive/blob/255cf4ab/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorMapOperator.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorMapOperator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorMapOperator.java index cd12a0b..6f1346d 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorMapOperator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorMapOperator.java @@ -620,6 +620,25 @@ public class VectorMapOperator extends AbstractMapOperator { } /* + * Flush a partially full deserializerBatch. + * @return Return true if the operator tree is not done yet. + */ + private boolean flushDeserializerBatch() throws HiveException { + if (deserializerBatch.size > 0) { + + batchCounter++; + oneRootOperator.process(deserializerBatch, 0); + deserializerBatch.reset(); + if (oneRootOperator.getDone()) { + setDone(true); + return false; + } + + } + return true; + } + + /* * Setup the context for reading from the next partition file. */ private void setupPartitionContextVars(String nominalPath) throws HiveException { @@ -672,20 +691,14 @@ public class VectorMapOperator extends AbstractMapOperator { currentReadType == VectorMapOperatorReadType.VECTOR_DESERIALIZE || currentReadType == VectorMapOperatorReadType.ROW_DESERIALIZE); - if (deserializerBatch.size > 0) { - - /* - * Clear out any rows in the batch from previous partition since we are going to change - * the repeating partition column values. - */ - batchCounter++; - oneRootOperator.process(deserializerBatch, 0); - deserializerBatch.reset(); - if (oneRootOperator.getDone()) { - setDone(true); - return; - } + /* + * Clear out any rows in the batch from previous partition since we are going to change + * the repeating partition column values. + */ + if (!flushDeserializerBatch()) { + // Operator tree is now done. + return; } /* @@ -773,6 +786,38 @@ public class VectorMapOperator extends AbstractMapOperator { return null; } + /* + * Deliver a vector batch to the operator tree. + * + * The Vectorized Input File Format reader has already set the partition column + * values, reset and filled in the batch, etc. + * + * We pass the VectorizedRowBatch through here. + * + * @return Return true if the operator tree is not done yet. + */ + private boolean deliverVectorizedRowBatch(Writable value) throws HiveException { + + batchCounter++; + if (value != null) { + VectorizedRowBatch batch = (VectorizedRowBatch) value; + numRows += batch.size; + if (hasRowIdentifier) { + if (batchContext.getRecordIdColumnVector() == null) { + setRowIdentiferToNull(batch); + } else { + batch.cols[rowIdentifierColumnNum] = batchContext.getRecordIdColumnVector(); + } + } + } + oneRootOperator.process(value, 0); + if (oneRootOperator.getDone()) { + setDone(true); + return false; + } + return true; + } + @Override public void process(Writable value) throws HiveException { @@ -798,30 +843,33 @@ public class VectorMapOperator extends AbstractMapOperator { try { if (currentReadType == VectorMapOperatorReadType.VECTORIZED_INPUT_FILE_FORMAT) { + if (!deliverVectorizedRowBatch(value)) { + + // Operator tree is now done. + return; + } + + } else if (value instanceof VectorizedRowBatch) { + /* - * The Vectorized Input File Format reader has already set the partition column - * values, reset and filled in the batch, etc. - * - * We pass the VectorizedRowBatch through here. + * This case can happen with LLAP. If it is able to deserialize and cache data from the + * input format, it will deliver that cached data to us as VRBs. */ - batchCounter++; - if (value != null) { - VectorizedRowBatch batch = (VectorizedRowBatch) value; - numRows += batch.size; - if (hasRowIdentifier) { - if (batchContext.getRecordIdColumnVector() == null) { - setRowIdentiferToNull(batch); - } else { - batch.cols[rowIdentifierColumnNum] = batchContext.getRecordIdColumnVector(); - } - } - } - oneRootOperator.process(value, 0); - if (oneRootOperator.getDone()) { - setDone(true); + + /* + * Clear out any rows we may have processed in row-mode for the current partition.. + */ + if (!flushDeserializerBatch()) { + + // Operator tree is now done. return; } + if (!deliverVectorizedRowBatch(value)) { + + // Operator tree is now done. + return; + } } else { /* http://git-wip-us.apache.org/repos/asf/hive/blob/255cf4ab/ql/src/java/org/apache/hadoop/hive/ql/io/HiveInputFormat.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/HiveInputFormat.java b/ql/src/java/org/apache/hadoop/hive/ql/io/HiveInputFormat.java index 2687d33..7d3ff36 100755 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/HiveInputFormat.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/HiveInputFormat.java @@ -222,7 +222,7 @@ public class HiveInputFormat<K extends WritableComparable, V extends Writable> String ifName = inputFormat.getClass().getCanonicalName(); boolean isSupported = inputFormat instanceof LlapWrappableInputFormatInterface; boolean isCacheOnly = inputFormat instanceof LlapCacheOnlyInputFormatInterface; - boolean isVectorized = Utilities.getUseVectorizedInputFileFormat(conf); + boolean isVectorized = Utilities.getIsVectorized(conf); if (!isVectorized) { // Pretend it's vectorized if the non-vector wrapped is enabled. isVectorized = HiveConf.getBoolVar(conf, ConfVars.LLAP_IO_NONVECTOR_WRAPPER_ENABLED) http://git-wip-us.apache.org/repos/asf/hive/blob/255cf4ab/ql/src/java/org/apache/hadoop/hive/ql/io/NullRowsInputFormat.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/NullRowsInputFormat.java b/ql/src/java/org/apache/hadoop/hive/ql/io/NullRowsInputFormat.java index ce5dae0..6a372a3 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/NullRowsInputFormat.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/NullRowsInputFormat.java @@ -71,7 +71,7 @@ public class NullRowsInputFormat implements InputFormat<NullWritable, NullWritab private boolean addPartitionCols = true; public NullRowsRecordReader(Configuration conf, InputSplit split) throws IOException { - boolean isVectorMode = Utilities.getUseVectorizedInputFileFormat(conf); + boolean isVectorMode = Utilities.getIsVectorized(conf); if (LOG.isDebugEnabled()) { LOG.debug(getClass().getSimpleName() + " in " + (isVectorMode ? "" : "non-") + "vector mode"); http://git-wip-us.apache.org/repos/asf/hive/blob/255cf4ab/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java index e92bcb4..ff2cc04 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java @@ -506,7 +506,7 @@ public class OrcInputFormat implements InputFormat<NullWritable, OrcStruct>, List<FileStatus> files ) throws IOException { - if (Utilities.getUseVectorizedInputFileFormat(conf)) { + if (Utilities.getIsVectorized(conf)) { return new VectorizedOrcInputFormat().validateInput(fs, conf, files); } @@ -1890,7 +1890,7 @@ public class OrcInputFormat implements InputFormat<NullWritable, OrcStruct>, getRecordReader(InputSplit inputSplit, JobConf conf, Reporter reporter) throws IOException { //CombineHiveInputFormat may produce FileSplit that is not OrcSplit - boolean vectorMode = Utilities.getUseVectorizedInputFileFormat(conf); + boolean vectorMode = Utilities.getIsVectorized(conf); boolean isAcidRead = isAcidRead(conf, inputSplit); if (!isAcidRead) { if (vectorMode) { http://git-wip-us.apache.org/repos/asf/hive/blob/255cf4ab/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/MapredParquetInputFormat.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/MapredParquetInputFormat.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/MapredParquetInputFormat.java index ab8c0ca..10e6a18 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/MapredParquetInputFormat.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/MapredParquetInputFormat.java @@ -74,7 +74,7 @@ public class MapredParquetInputFormat extends FileInputFormat<NullWritable, Arra final org.apache.hadoop.mapred.Reporter reporter ) throws IOException { try { - if (Utilities.getUseVectorizedInputFileFormat(job)) { + if (Utilities.getIsVectorized(job)) { if (LOG.isDebugEnabled()) { LOG.debug("Using vectorized record reader"); } http://git-wip-us.apache.org/repos/asf/hive/blob/255cf4ab/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java index 215aaad..99618c6 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java @@ -35,6 +35,7 @@ import java.util.Map.Entry; import java.util.Properties; import java.util.Set; import java.util.Stack; +import java.util.TreeMap; import java.util.TreeSet; import java.util.regex.Pattern; @@ -1111,6 +1112,25 @@ public class Vectorizer implements PhysicalPlanResolver { } /* + * Add a vector partition descriptor to partition descriptor, removing duplicate object. + * + * If the same vector partition descriptor has already been allocated, share that object. + */ + private void addVectorPartitionDesc(PartitionDesc pd, VectorPartitionDesc vpd, + Map<VectorPartitionDesc, VectorPartitionDesc> vectorPartitionDescMap) { + + VectorPartitionDesc existingEle = vectorPartitionDescMap.get(vpd); + if (existingEle != null) { + + // Use the object we already have. + vpd = existingEle; + } else { + vectorPartitionDescMap.put(vpd, vpd); + } + pd.setVectorPartitionDesc(vpd); + } + + /* * There are 3 modes of reading for vectorization: * * 1) One for the Vectorized Input File Format which returns VectorizedRowBatch as the row. @@ -1127,6 +1147,7 @@ public class Vectorizer implements PhysicalPlanResolver { private boolean verifyAndSetVectorPartDesc( PartitionDesc pd, boolean isAcidTable, Set<String> inputFileFormatClassNameSet, + Map<VectorPartitionDesc, VectorPartitionDesc> vectorPartitionDescMap, Set<String> enabledConditionsMetSet, ArrayList<String> enabledConditionsNotMetList, Set<Support> newSupportSet) { @@ -1154,9 +1175,11 @@ public class Vectorizer implements PhysicalPlanResolver { addVectorizedInputFileFormatSupport( newSupportSet, isInputFileFormatVectorized, inputFileFormatClass); - pd.setVectorPartitionDesc( + addVectorPartitionDesc( + pd, VectorPartitionDesc.createVectorizedInputFileFormat( - inputFileFormatClassName, Utilities.isInputFileFormatSelfDescribing(pd))); + inputFileFormatClassName, Utilities.isInputFileFormatSelfDescribing(pd)), + vectorPartitionDescMap); enabledConditionsMetSet.add(HiveConf.ConfVars.HIVE_VECTORIZATION_USE_VECTORIZED_INPUT_FILE_FORMAT.varname); return true; @@ -1173,9 +1196,11 @@ public class Vectorizer implements PhysicalPlanResolver { addVectorizedInputFileFormatSupport( newSupportSet, isInputFileFormatVectorized, inputFileFormatClass); - pd.setVectorPartitionDesc( + addVectorPartitionDesc( + pd, VectorPartitionDesc.createVectorizedInputFileFormat( - inputFileFormatClassName, Utilities.isInputFileFormatSelfDescribing(pd))); + inputFileFormatClassName, Utilities.isInputFileFormatSelfDescribing(pd)), + vectorPartitionDescMap); enabledConditionsMetSet.add( HiveConf.ConfVars.HIVE_VECTORIZATION_USE_VECTORIZED_INPUT_FILE_FORMAT.varname); @@ -1239,18 +1264,22 @@ public class Vectorizer implements PhysicalPlanResolver { // Add the support for read variations in Vectorized Text. newSupportSet.addAll(vectorDeserializeTextSupportSet); - pd.setVectorPartitionDesc( + addVectorPartitionDesc( + pd, VectorPartitionDesc.createVectorDeserialize( - inputFileFormatClassName, VectorDeserializeType.LAZY_SIMPLE)); + inputFileFormatClassName, VectorDeserializeType.LAZY_SIMPLE), + vectorPartitionDescMap); enabledConditionsMetSet.add(HiveConf.ConfVars.HIVE_VECTORIZATION_USE_VECTOR_DESERIALIZE.varname); return true; } } else if (isSequenceFormat) { - pd.setVectorPartitionDesc( + addVectorPartitionDesc( + pd, VectorPartitionDesc.createVectorDeserialize( - inputFileFormatClassName, VectorDeserializeType.LAZY_BINARY)); + inputFileFormatClassName, VectorDeserializeType.LAZY_BINARY), + vectorPartitionDescMap); enabledConditionsMetSet.add(HiveConf.ConfVars.HIVE_VECTORIZATION_USE_VECTOR_DESERIALIZE.varname); return true; @@ -1262,15 +1291,27 @@ public class Vectorizer implements PhysicalPlanResolver { // inspect-able Object[] row to a VectorizedRowBatch in the VectorMapOperator. if (useRowDeserialize) { - if (!isInputFormatExcluded(inputFileFormatClassName, rowDeserializeInputFormatExcludes)) { - pd.setVectorPartitionDesc( + boolean isRowDeserializeExcluded = + isInputFormatExcluded(inputFileFormatClassName, rowDeserializeInputFormatExcludes); + if (!isRowDeserializeExcluded && !isInputFileFormatVectorized) { + addVectorPartitionDesc( + pd, VectorPartitionDesc.createRowDeserialize( inputFileFormatClassName, Utilities.isInputFileFormatSelfDescribing(pd), - deserializerClassName)); - + deserializerClassName), + vectorPartitionDescMap); + enabledConditionsMetSet.add(HiveConf.ConfVars.HIVE_VECTORIZATION_USE_ROW_DESERIALIZE.varname); return true; + } else if (isInputFileFormatVectorized) { + + /* + * Vectorizer does not vectorize in row deserialize mode if the input format has + * VectorizedInputFormat so input formats will be clear if the isVectorized flag + * is on, they are doing VRB work. + */ + enabledConditionsNotMetList.add("Row deserialization of vectorized input format not supported"); } else { enabledConditionsNotMetList.add(ConfVars.HIVE_VECTORIZATION_USE_ROW_DESERIALIZE.varname + " IS true AND " + ConfVars.HIVE_VECTORIZATION_ROW_DESERIALIZE_INPUTFORMAT_EXCLUDES.varname @@ -1369,6 +1410,8 @@ public class Vectorizer implements PhysicalPlanResolver { // Remember the input file formats we validated and why. Set<String> inputFileFormatClassNameSet = new HashSet<String>(); + Map<VectorPartitionDesc, VectorPartitionDesc> vectorPartitionDescMap = + new LinkedHashMap<VectorPartitionDesc, VectorPartitionDesc>(); Set<String> enabledConditionsMetSet = new HashSet<String>(); ArrayList<String> enabledConditionsNotMetList = new ArrayList<String>(); Set<Support> inputFormatSupportSet = new TreeSet<Support>(); @@ -1395,11 +1438,15 @@ public class Vectorizer implements PhysicalPlanResolver { if (!verifyAndSetVectorPartDesc( partDesc, isAcidTable, inputFileFormatClassNameSet, + vectorPartitionDescMap, enabledConditionsMetSet, enabledConditionsNotMetList, newSupportSet)) { // Always set these so EXPLAIN can see. mapWork.setVectorizationInputFileFormatClassNameSet(inputFileFormatClassNameSet); + ArrayList<VectorPartitionDesc> vectorPartitionDescList = new ArrayList<VectorPartitionDesc>(); + vectorPartitionDescList.addAll(vectorPartitionDescMap.keySet()); + mapWork.setVectorPartitionDescList(vectorPartitionDescList); mapWork.setVectorizationEnabledConditionsMet(new ArrayList(enabledConditionsMetSet)); mapWork.setVectorizationEnabledConditionsNotMet(enabledConditionsNotMetList); @@ -1517,6 +1564,9 @@ public class Vectorizer implements PhysicalPlanResolver { // Always set these so EXPLAIN can see. mapWork.setVectorizationInputFileFormatClassNameSet(inputFileFormatClassNameSet); + ArrayList<VectorPartitionDesc> vectorPartitionDescList = new ArrayList<VectorPartitionDesc>(); + vectorPartitionDescList.addAll(vectorPartitionDescMap.keySet()); + mapWork.setVectorPartitionDescList(vectorPartitionDescList); mapWork.setVectorizationEnabledConditionsMet(new ArrayList(enabledConditionsMetSet)); mapWork.setVectorizationEnabledConditionsNotMet(enabledConditionsNotMetList); http://git-wip-us.apache.org/repos/asf/hive/blob/255cf4ab/ql/src/java/org/apache/hadoop/hive/ql/plan/MapWork.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/MapWork.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/MapWork.java index 92946f5..f2b2fc5 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/plan/MapWork.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/MapWork.java @@ -149,6 +149,7 @@ public class MapWork extends BaseWork { private VectorizerReason notEnabledInputFileFormatReason; private Set<String> vectorizationInputFileFormatClassNameSet; + private List<VectorPartitionDesc> vectorPartitionDescList; private List<String> vectorizationEnabledConditionsMet; private List<String> vectorizationEnabledConditionsNotMet; @@ -268,7 +269,7 @@ public class MapWork extends BaseWork { boolean canWrapAny = false, doCheckIfs = false; if (isLlapOn) { // We can wrap inputs if the execution is vectorized, or if we use a wrapper. - canWrapAny = Utilities.getUseVectorizedInputFileFormat(conf, this); + canWrapAny = Utilities.getIsVectorized(conf, this); // ExecDriver has no plan path, so we cannot derive VRB stuff for the wrapper. if (!canWrapAny && !isExecDriver) { canWrapAny = HiveConf.getBoolVar(conf, ConfVars.LLAP_IO_NONVECTOR_WRAPPER_ENABLED); @@ -792,6 +793,14 @@ public class MapWork extends BaseWork { return vectorizationInputFileFormatClassNameSet; } + public void setVectorPartitionDescList(List<VectorPartitionDesc> vectorPartitionDescList) { + this.vectorPartitionDescList = vectorPartitionDescList; + } + + public List<VectorPartitionDesc> getVectorPartitionDescList() { + return vectorPartitionDescList; + } + public void setVectorizationEnabledConditionsMet(ArrayList<String> vectorizationEnabledConditionsMet) { this.vectorizationEnabledConditionsMet = VectorizationCondition.addBooleans(vectorizationEnabledConditionsMet, true); } @@ -822,6 +831,14 @@ public class MapWork extends BaseWork { return mapWork.getVectorizationInputFileFormatClassNameSet(); } + /* + // Too many Q out file changes for the moment... + @Explain(vectorization = Vectorization.DETAIL, displayName = "vectorPartitionDescs", explainLevels = { Level.DEFAULT, Level.EXTENDED }) + public String vectorPartitionDescs() { + return mapWork.getVectorPartitionDescList().toString(); + } + */ + @Explain(vectorization = Vectorization.SUMMARY, displayName = "inputFormatFeatureSupport", explainLevels = { Level.DEFAULT, Level.EXTENDED }) public String getInputFormatSupport() { Set<Support> inputFormatSupportSet = mapWork.getInputFormatSupportSet(); http://git-wip-us.apache.org/repos/asf/hive/blob/255cf4ab/ql/src/java/org/apache/hadoop/hive/ql/plan/VectorPartitionDesc.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/VectorPartitionDesc.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/VectorPartitionDesc.java index 787d0c2..2c8904d 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/plan/VectorPartitionDesc.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/VectorPartitionDesc.java @@ -227,24 +227,25 @@ public class VectorPartitionDesc { @Override public String toString() { StringBuilder sb = new StringBuilder(); - sb.append("vector map operator read type "); + sb.append("("); sb.append(vectorMapOperatorReadType.name()); - sb.append(", input file format class name "); + sb.append(", "); sb.append(inputFileFormatClassName); switch (vectorMapOperatorReadType) { case VECTORIZED_INPUT_FILE_FORMAT: break; case VECTOR_DESERIALIZE: - sb.append(", deserialize type "); + sb.append(", "); sb.append(vectorDeserializeType.name()); break; case ROW_DESERIALIZE: - sb.append(", deserializer class name "); + sb.append(", "); sb.append(rowDeserializerClassName); break; default: throw new RuntimeException("Unexpected vector map operator read type " + vectorMapOperatorReadType.name()); } + sb.append(")"); return sb.toString(); } http://git-wip-us.apache.org/repos/asf/hive/blob/255cf4ab/ql/src/test/queries/clientpositive/schema_evol_orc_acid_part_llap_io.q ---------------------------------------------------------------------- diff --git a/ql/src/test/queries/clientpositive/schema_evol_orc_acid_part_llap_io.q b/ql/src/test/queries/clientpositive/schema_evol_orc_acid_part_llap_io.q new file mode 100644 index 0000000..a526f57 --- /dev/null +++ b/ql/src/test/queries/clientpositive/schema_evol_orc_acid_part_llap_io.q @@ -0,0 +1,320 @@ +set hive.explain.user=false; +set hive.fetch.task.conversion=none; +set hive.mapred.mode=nonstrict; +set hive.cli.print.header=true; +set hive.support.concurrency=true; +set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager; +set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat; +SET hive.exec.schema.evolution=false; +SET hive.vectorized.use.vectorized.input.format=true; +SET hive.vectorized.use.vector.serde.deserialize=false; +SET hive.vectorized.use.row.serde.deserialize=false; +SET hive.vectorized.execution.enabled=false; +set hive.exec.dynamic.partition.mode=nonstrict; +set hive.metastore.disallow.incompatible.col.type.changes=true; +set hive.default.fileformat=orc; +set hive.llap.io.enabled=true; +set hive.llap.io.encode.enabled=true; + +-- SORT_QUERY_RESULTS +-- +-- FILE VARIATION: ORC, ACID Non-Vectorized, MapWork, Partitioned +-- *IMPORTANT NOTE* We set hive.exec.schema.evolution=false above since schema evolution is always used for ACID. +-- Also, we don't do regular EXPLAINs on ACID files because the transaction id causes Q file statistics differences... +-- Instead just one explain vectorization only detail +-- + +CREATE TABLE schema_evolution_data(insert_num int, boolean1 boolean, tinyint1 tinyint, smallint1 smallint, int1 int, bigint1 bigint, decimal1 decimal(38,18), float1 float, double1 double, string1 string, string2 string, date1 date, timestamp1 timestamp, boolean_str string, tinyint_str string, smallint_str string, int_str string, bigint_str string, decimal_str string, float_str string, double_str string, date_str string, timestamp_str string, filler string) +row format delimited fields terminated by '|' stored as textfile; +load data local inpath '../../data/files/schema_evolution/schema_evolution_data.txt' overwrite into table schema_evolution_data; + +------------------------------------------------------------------------------------------ +-- SECTION: ALTER TABLE ADD COLUMNS +-- +-- +-- SUBSECTION: ALTER TABLE ADD COLUMNS: INT PERMUTE SELECT +-- +-- +CREATE TABLE part_add_int_permute_select(insert_num int, a INT, b STRING) PARTITIONED BY(part INT) clustered by (a) into 2 buckets STORED AS ORC TBLPROPERTIES ('transactional'='true'); + +insert into table part_add_int_permute_select partition(part=1) VALUES (1, 1111, 'new'); + +-- Table-Non-Cascade ADD COLUMNS ... +alter table part_add_int_permute_select add columns(c int); + +insert into table part_add_int_permute_select partition(part=1) VALUES (2, 2222, 'new', 3333); + +-- SELECT permutation columns to make sure NULL defaulting works right +select insert_num,part,a,b from part_add_int_permute_select; +select insert_num,part,a,b,c from part_add_int_permute_select; +select insert_num,part,c from part_add_int_permute_select; + +drop table part_add_int_permute_select; + + +-- SUBSECTION: ALTER TABLE ADD COLUMNS: INT, STRING, PERMUTE SELECT +-- +-- +CREATE TABLE part_add_int_string_permute_select(insert_num int, a INT, b STRING) PARTITIONED BY(part INT) clustered by (a) into 2 buckets STORED AS ORC TBLPROPERTIES ('transactional'='true'); + +insert into table part_add_int_string_permute_select partition(part=1) VALUES (1, 1111, 'new'); + +-- Table-Non-Cascade ADD COLUMNS ... +alter table part_add_int_string_permute_select add columns(c int, d string); + +insert into table part_add_int_string_permute_select partition(part=1) VALUES (2, 2222, 'new', 3333, '4444'); + +explain vectorization only detail +select insert_num,part,a,b,c,d from part_add_int_string_permute_select; + +-- SELECT permutation columns to make sure NULL defaulting works right +select insert_num,part,a,b from part_add_int_string_permute_select; +select insert_num,part,a,b,c from part_add_int_string_permute_select; +select insert_num,part,a,b,c,d from part_add_int_string_permute_select; +select insert_num,part,a,c,d from part_add_int_string_permute_select; +select insert_num,part,a,d from part_add_int_string_permute_select; +select insert_num,part,c from part_add_int_string_permute_select; +select insert_num,part,d from part_add_int_string_permute_select; + +drop table part_add_int_string_permute_select; + + + +------------------------------------------------------------------------------------------ +-- SECTION: ALTER TABLE CHANGE COLUMNS for STRING_GROUP -> DOUBLE +-- +-- +-- +-- SUBSECTION: ALTER TABLE CHANGE COLUMNS for STRING_GROUP -> DOUBLE: (STRING, CHAR, VARCHAR) +-- +CREATE TABLE part_change_string_group_double(insert_num int, c1 STRING, c2 CHAR(50), c3 VARCHAR(50), b STRING) PARTITIONED BY(part INT) clustered by (c1) into 2 buckets STORED AS ORC TBLPROPERTIES ('transactional'='true'); + +insert into table part_change_string_group_double partition(part=1) SELECT insert_num, double_str, double_str, double_str, 'original' FROM schema_evolution_data; + +-- Table-Non-Cascade CHANGE COLUMNS ... +alter table part_change_string_group_double replace columns (insert_num int, c1 DOUBLE, c2 DOUBLE, c3 DOUBLE, b STRING); + +insert into table part_change_string_group_double partition(part=1) SELECT insert_num, double1, double1, double1, 'new' FROM schema_evolution_data WHERE insert_num = 111; + +select insert_num,part,c1,c2,c3,b from part_change_string_group_double; + +drop table part_change_string_group_double; + +------------------------------------------------------------------------------------------ +-- SECTION: ALTER TABLE CHANGE COLUMNS for DATE_GROUP -> STRING_GROUP +-- +-- +-- +-- SUBSECTION: ALTER TABLE CHANGE COLUMNS for DATE_GROUP -> STRING_GROUP: DATE,TIMESTAMP, (STRING, CHAR, CHAR trunc, VARCHAR, VARCHAR trunc) +-- +CREATE TABLE part_change_date_group_string_group_date_timestamp(insert_num int, c1 DATE, c2 DATE, c3 DATE, c4 DATE, c5 DATE, c6 TIMESTAMP, c7 TIMESTAMP, c8 TIMESTAMP, c9 TIMESTAMP, c10 TIMESTAMP, b STRING) PARTITIONED BY(part INT) clustered by (c1) into 2 buckets STORED AS ORC TBLPROPERTIES ('transactional'='true'); + +insert into table part_change_date_group_string_group_date_timestamp partition(part=1) SELECT insert_num, date1, date1, date1, date1, date1, timestamp1, timestamp1, timestamp1, timestamp1, timestamp1, 'original' FROM schema_evolution_data; + +-- Table-Non-Cascade CHANGE COLUMNS ... +alter table part_change_date_group_string_group_date_timestamp replace columns(insert_num int, c1 STRING, c2 CHAR(50), c3 CHAR(15), c4 VARCHAR(50), c5 VARCHAR(15), c6 STRING, c7 CHAR(50), c8 CHAR(15), c9 VARCHAR(50), c10 VARCHAR(15), b STRING); + +insert into table part_change_date_group_string_group_date_timestamp partition(part=1) VALUES (111, 'filler', 'filler', 'filler', 'filler', 'filler', 'filler', 'filler', 'filler', 'filler', 'filler', 'new'); + +select insert_num,part,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,b from part_change_date_group_string_group_date_timestamp; + +drop table part_change_date_group_string_group_date_timestamp; + + + + +------------------------------------------------------------------------------------------ +-- SECTION: ALTER TABLE CHANGE COLUMNS for NUMERIC_GROUP -> STRING_GROUP +-- +-- +-- +-- SUBSECTION: ALTER TABLE CHANGE COLUMNS for NUMERIC_GROUP -> STRING_GROUP: +-- (TINYINT, SMALLINT, INT, BIGINT), STRING and +-- (TINYINT, SMALLINT, INT, BIGINT), CHAR and CHAR trunc and +-- (TINYINT, SMALLINT, INT, BIGINT), VARCHAR and VARCHAR trunc +-- +-- +CREATE TABLE part_change_numeric_group_string_group_multi_ints_string_group(insert_num int, + c1 tinyint, c2 smallint, c3 int, c4 bigint, + c5 tinyint, c6 smallint, c7 int, c8 bigint, c9 tinyint, c10 smallint, c11 int, c12 bigint, + c13 tinyint, c14 smallint, c15 int, c16 bigint, c17 tinyint, c18 smallint, c19 int, c20 bigint, + b STRING) PARTITIONED BY(part INT); + +insert into table part_change_numeric_group_string_group_multi_ints_string_group partition(part=1) SELECT insert_num, + tinyint1, smallint1, int1, bigint1, + tinyint1, smallint1, int1, bigint1, tinyint1, smallint1, int1, bigint1, + tinyint1, smallint1, int1, bigint1, tinyint1, smallint1, int1, bigint1, + 'original' FROM schema_evolution_data; + +select insert_num,part,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15,c16,c17,c18,c19,c20,b from part_change_numeric_group_string_group_multi_ints_string_group; + +-- Table-Non-Cascade CHANGE COLUMNS ... +alter table part_change_numeric_group_string_group_multi_ints_string_group replace columns (insert_num int, + c1 STRING, c2 STRING, c3 STRING, c4 STRING, + c5 CHAR(50), c6 CHAR(50), c7 CHAR(50), c8 CHAR(50), c9 CHAR(5), c10 CHAR(5), c11 CHAR(5), c12 CHAR(5), + c13 VARCHAR(50), c14 VARCHAR(50), c15 VARCHAR(50), c16 VARCHAR(50), c17 VARCHAR(5), c18 VARCHAR(5), c19 VARCHAR(5), c20 VARCHAR(5), + b STRING) ; + +insert into table part_change_numeric_group_string_group_multi_ints_string_group partition(part=1) VALUES (111, + 'filler', 'filler', 'filler', 'filler', + 'filler', 'filler', 'filler', 'filler', 'filler', 'filler', 'filler', 'filler', + 'filler', 'filler', 'filler', 'filler', 'filler', 'filler', 'filler', 'filler', + 'new'); + +select insert_num,part,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15,c16,c17,c18,c19,c20,b from part_change_numeric_group_string_group_multi_ints_string_group; + +drop table part_change_numeric_group_string_group_multi_ints_string_group; + + + +-- +-- SUBSECTION: ALTER TABLE CHANGE COLUMNS for NUMERIC_GROUP -> STRING_GROUP: +-- (DECIMAL, FLOAT, DOUBLE), STRING and +-- (DECIMAL, FLOAT, DOUBLE), CHAR and CHAR trunc and +-- (DECIMAL, FLOAT, DOUBLE), VARCHAR and VARCHAR trunc +-- +-- +CREATE TABLE part_change_numeric_group_string_group_floating_string_group(insert_num int, + c1 decimal(38,18), c2 float, c3 double, + c4 decimal(38,18), c5 float, c6 double, c7 decimal(38,18), c8 float, c9 double, + c10 decimal(38,18), c11 float, c12 double, c13 decimal(38,18), c14 float, c15 double, + b STRING) PARTITIONED BY(part INT); + +insert into table part_change_numeric_group_string_group_floating_string_group partition(part=1) SELECT insert_num, + decimal1, float1, double1, + decimal1, float1, double1, decimal1, float1, double1, + decimal1, float1, double1, decimal1, float1, double1, + 'original' FROM schema_evolution_data; + +select insert_num,part,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15,b from part_change_numeric_group_string_group_floating_string_group; + +-- Table-Non-Cascade CHANGE COLUMNS ... +alter table part_change_numeric_group_string_group_floating_string_group replace columns (insert_num int, + c1 STRING, c2 STRING, c3 STRING, + c4 CHAR(50), c5 CHAR(50), c6 CHAR(50), c7 CHAR(7), c8 CHAR(7), c9 CHAR(7), + c10 VARCHAR(50), c11 VARCHAR(50), c12 VARCHAR(50), c13 VARCHAR(7), c14 VARCHAR(7), c15 VARCHAR(7), + b STRING); + +insert into table part_change_numeric_group_string_group_floating_string_group partition(part=1) VALUES (111, + 'filler', 'filler', 'filler', + 'filler', 'filler', 'filler', 'filler', 'filler', 'filler', + 'filler', 'filler', 'filler', 'filler', 'filler', 'filler', + 'new'); + +select insert_num,part,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15,b from part_change_numeric_group_string_group_floating_string_group; + +drop table part_change_numeric_group_string_group_floating_string_group; + + + +------------------------------------------------------------------------------------------ +-- SECTION: ALTER TABLE CHANGE COLUMNS for STRING_GROUP -> STRING_GROUP +-- +-- +-- +-- SUBSECTION: ALTER TABLE CHANGE COLUMNS for STRING_GROUP -> STRING_GROUP: STRING, (CHAR, CHAR trunc, VARCHAR, VARCHAR trunc) and +-- CHAR, (VARCHAR, VARCHAR trunc, STRING) and VARCHAR, (CHAR, CHAR trunc, STRING) +-- +CREATE TABLE part_change_string_group_string_group_string(insert_num int, + c1 string, c2 string, c3 string, c4 string, + c5 CHAR(50), c6 CHAR(50), c7 CHAR(50), + c8 VARCHAR(50), c9 VARCHAR(50), c10 VARCHAR(50), b STRING) PARTITIONED BY(part INT) clustered by (c1) into 2 buckets STORED AS ORC TBLPROPERTIES ('transactional'='true'); + +insert into table part_change_string_group_string_group_string partition(part=1) SELECT insert_num, + string2, string2, string2, string2, + string2, string2, string2, + string2, string2, string2, + 'original' FROM schema_evolution_data; + +select insert_num,part,c1,c2,c3,c4,b from part_change_string_group_string_group_string; + +-- Table-Non-Cascade CHANGE COLUMNS ... +alter table part_change_string_group_string_group_string replace columns (insert_num int, + c1 CHAR(50), c2 CHAR(9), c3 VARCHAR(50), c4 CHAR(9), + c5 VARCHAR(50), c6 VARCHAR(9), c7 STRING, + c8 CHAR(50), c9 CHAR(9), c10 STRING, b STRING) ; + +insert into table part_change_string_group_string_group_string partition(part=1) VALUES (111, + 'filler', 'filler', 'filler', 'filler', + 'filler', 'filler', 'filler', + 'filler', 'filler', 'filler', + 'new'); + +select insert_num,part,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,b from part_change_string_group_string_group_string; + +drop table part_change_string_group_string_group_string; + + +------------------------------------------------------------------------------------------ +-- SECTION: ALTER TABLE CHANGE COLUMNS for "lower" type to "higher" NUMERIC_GROUP +-- +-- +-- +-- SUBSECTION: ALTER TABLE CHANGE COLUMNS for "lower" type to "higher" NUMERIC_GROUP: +-- TINYINT, (SMALLINT, INT, BIGINT, DECIMAL, FLOAT, DOUBLE) and +-- SMALLINT, (INT, BIGINT, DECIMAL, FLOAT, DOUBLE) and +-- INT, (BIGINT, DECIMAL, FLOAT, DOUBLE) and +-- BIGINT, (DECIMAL, FLOAT, DOUBLE) +-- +CREATE TABLE part_change_lower_to_higher_numeric_group_tinyint_to_bigint(insert_num int, + c1 tinyint, c2 tinyint, c3 tinyint, c4 tinyint, c5 tinyint, c6 tinyint, + c7 smallint, c8 smallint, c9 smallint, c10 smallint, c11 smallint, + c12 int, c13 int, c14 int, c15 int, + c16 bigint, c17 bigint, c18 bigint, + b STRING) PARTITIONED BY(part INT) clustered by (c1) into 2 buckets STORED AS ORC TBLPROPERTIES ('transactional'='true'); + +insert into table part_change_lower_to_higher_numeric_group_tinyint_to_bigint partition(part=1) SELECT insert_num, + tinyint1, tinyint1, tinyint1, tinyint1, tinyint1, tinyint1, + smallint1, smallint1, smallint1, smallint1, smallint1, + int1, int1, int1, int1, + bigint1, bigint1, bigint1, + 'original' FROM schema_evolution_data; + +select insert_num,part,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15,c16,c17,c18,b from part_change_lower_to_higher_numeric_group_tinyint_to_bigint; + +-- Table-Non-Cascade CHANGE COLUMNS ... +alter table part_change_lower_to_higher_numeric_group_tinyint_to_bigint replace columns (insert_num int, + c1 SMALLINT, c2 INT, c3 BIGINT, c4 decimal(38,18), c5 FLOAT, c6 DOUBLE, + c7 INT, c8 BIGINT, c9 decimal(38,18), c10 FLOAT, c11 DOUBLE, + c12 BIGINT, c13 decimal(38,18), c14 FLOAT, c15 DOUBLE, + c16 decimal(38,18), c17 FLOAT, c18 DOUBLE, + b STRING) ; + +insert into table part_change_lower_to_higher_numeric_group_tinyint_to_bigint partition(part=1) VALUES (111, + 7000, 80000, 90000000, 1234.5678, 9876.543, 789.321, + 80000, 90000000, 1234.5678, 9876.543, 789.321, + 90000000, 1234.5678, 9876.543, 789.321, + 1234.5678, 9876.543, 789.321, + 'new'); + +select insert_num,part,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15,c16,c17,c18,b from part_change_lower_to_higher_numeric_group_tinyint_to_bigint; + +drop table part_change_lower_to_higher_numeric_group_tinyint_to_bigint; + + + +-- +-- SUBSECTION: ALTER TABLE CHANGE COLUMNS for "lower" type to "higher" NUMERIC_GROUP: +-- DECIMAL, (FLOAT, DOUBLE) and +-- FLOAT, (DOUBLE) +-- +CREATE TABLE part_change_lower_to_higher_numeric_group_decimal_to_float(insert_num int, + c1 decimal(38,18), c2 decimal(38,18), + c3 float, + b STRING) PARTITIONED BY(part INT) clustered by (c1) into 2 buckets STORED AS ORC TBLPROPERTIES ('transactional'='true'); + +insert into table part_change_lower_to_higher_numeric_group_decimal_to_float partition(part=1) SELECT insert_num, + decimal1, decimal1, + float1, + 'original' FROM schema_evolution_data; + +select insert_num,part,c1,c2,c3,b from part_change_lower_to_higher_numeric_group_decimal_to_float; + +-- Table-Non-Cascade CHANGE COLUMNS ... +alter table part_change_lower_to_higher_numeric_group_decimal_to_float replace columns (insert_num int, c1 float, c2 double, c3 DOUBLE, b STRING) ; + +insert into table part_change_lower_to_higher_numeric_group_decimal_to_float partition(part=1) VALUES (111, 1234.5678, 9876.543, 1234.5678, 'new'); + +select insert_num,part,c1,c2,c3,b from part_change_lower_to_higher_numeric_group_decimal_to_float; + +drop table part_change_lower_to_higher_numeric_group_decimal_to_float; \ No newline at end of file http://git-wip-us.apache.org/repos/asf/hive/blob/255cf4ab/ql/src/test/queries/clientpositive/schema_evol_orc_acid_part_update_llap_io.q ---------------------------------------------------------------------- diff --git a/ql/src/test/queries/clientpositive/schema_evol_orc_acid_part_update_llap_io.q b/ql/src/test/queries/clientpositive/schema_evol_orc_acid_part_update_llap_io.q new file mode 100644 index 0000000..1866fc8 --- /dev/null +++ b/ql/src/test/queries/clientpositive/schema_evol_orc_acid_part_update_llap_io.q @@ -0,0 +1,161 @@ +set hive.explain.user=false; +set hive.fetch.task.conversion=none; +set hive.mapred.mode=nonstrict; +set hive.cli.print.header=true; +set hive.support.concurrency=true; +set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager; +set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat; +SET hive.exec.schema.evolution=false; +SET hive.vectorized.use.vectorized.input.format=true; +SET hive.vectorized.use.vector.serde.deserialize=false; +SET hive.vectorized.use.row.serde.deserialize=false; +SET hive.vectorized.execution.enabled=false; +set hive.exec.dynamic.partition.mode=nonstrict; +set hive.metastore.disallow.incompatible.col.type.changes=true; +set hive.default.fileformat=orc; +set hive.llap.io.enabled=true; +set hive.llap.io.encode.enabled=true; + +-- SORT_QUERY_RESULTS +-- +-- FILE VARIATION: ORC, ACID Non-Vectorized, MapWork, Partitioned +-- *IMPORTANT NOTE* We set hive.exec.schema.evolution=false above since schema evolution is always used for ACID. +-- Also, we don't do EXPLAINs on ACID files because the transaction id causes Q file statistics differences... +-- + +CREATE TABLE schema_evolution_data(insert_num int, boolean1 boolean, tinyint1 tinyint, smallint1 smallint, int1 int, bigint1 bigint, decimal1 decimal(38,18), float1 float, double1 double, string1 string, string2 string, date1 date, timestamp1 timestamp, boolean_str string, tinyint_str string, smallint_str string, int_str string, bigint_str string, decimal_str string, float_str string, double_str string, date_str string, timestamp_str string, filler string) +row format delimited fields terminated by '|' stored as textfile; +load data local inpath '../../data/files/schema_evolution/schema_evolution_data.txt' overwrite into table schema_evolution_data; + +CREATE TABLE schema_evolution_data_2(insert_num int, boolean1 boolean, tinyint1 tinyint, smallint1 smallint, int1 int, bigint1 bigint, decimal1 decimal(38,18), float1 float, double1 double, string1 string, string2 string, date1 date, timestamp1 timestamp, boolean_str string, tinyint_str string, smallint_str string, int_str string, bigint_str string, decimal_str string, float_str string, double_str string, date_str string, timestamp_str string, filler string) +row format delimited fields terminated by '|' stored as textfile; +load data local inpath '../../data/files/schema_evolution/schema_evolution_data_2.txt' overwrite into table schema_evolution_data_2; + +-- +-- +-- SECTION VARIATION: ALTER TABLE ADD COLUMNS ... UPDATE New Columns +--- +CREATE TABLE partitioned_update_1(insert_num int, a INT, b STRING) PARTITIONED BY(part INT) clustered by (a) into 2 buckets STORED AS ORC TBLPROPERTIES ('transactional'='true'); + +insert into table partitioned_update_1 partition(part=1) SELECT insert_num, int1, 'original' FROM schema_evolution_data; + +-- Table-Non-Cascade ADD COLUMNS ... +alter table partitioned_update_1 add columns(c int, d string); + +insert into table partitioned_update_1 partition(part=2) SELECT insert_num, int1, 'new', int1, string1 FROM schema_evolution_data_2 WHERE insert_num <=110; + +insert into table partitioned_update_1 partition(part=1) SELECT insert_num, int1, 'new', int1, string1 FROM schema_evolution_data_2 WHERE insert_num > 110; + +select insert_num,part,a,b,c,d from partitioned_update_1; + +-- UPDATE New Columns +update partitioned_update_1 set c=99; + +select insert_num,part,a,b,c,d from partitioned_update_1; + +alter table partitioned_update_1 partition(part=1) compact 'major'; +alter table partitioned_update_1 partition(part=2) compact 'major'; + +select insert_num,part,a,b,c,d from partitioned_update_1; + +DROP TABLE partitioned_update_1; + +-- +-- +-- SECTION VARIATION: ALTER TABLE ADD COLUMNS ... DELETE where old column +--- +CREATE TABLE partitioned_delete_1(insert_num int, a INT, b STRING) PARTITIONED BY(part INT) clustered by (a) into 2 buckets STORED AS ORC TBLPROPERTIES ('transactional'='true'); + +insert into table partitioned_delete_1 partition(part=1) SELECT insert_num, int1, 'original' FROM schema_evolution_data; + +-- Table-Non-Cascade ADD COLUMNS ... +alter table partitioned_delete_1 add columns(c int, d string); + +insert into table partitioned_delete_1 partition(part=2) SELECT insert_num, int1, 'new', int1, string1 FROM schema_evolution_data_2 WHERE insert_num <=110; + +insert into table partitioned_delete_1 partition(part=1) SELECT insert_num, int1, 'new', int1, string1 FROM schema_evolution_data_2 WHERE insert_num > 110; + +select part,a,b,c,d from partitioned_delete_1; + +-- DELETE where old column +delete from partitioned_delete_1 where insert_num = 102 or insert_num = 104 or insert_num = 106; + +select insert_num,part,a,b,c,d from partitioned_delete_1; + +alter table partitioned_delete_1 partition(part=1) compact 'major'; +alter table partitioned_delete_1 partition(part=2) compact 'major'; + +select insert_num,part,a,b,c,d from partitioned_delete_1; + +DROP TABLE partitioned_delete_1; + +-- +-- +-- SECTION VARIATION: ALTER TABLE ADD COLUMNS ... DELETE where new column +--- +CREATE TABLE partitioned_delete_2(insert_num int, a INT, b STRING) PARTITIONED BY(part INT) clustered by (a) into 2 buckets STORED AS ORC TBLPROPERTIES ('transactional'='true'); + +insert into table partitioned_delete_2 partition(part=1) SELECT insert_num, int1, 'original' FROM schema_evolution_data; + +-- Table-Non-Cascade ADD COLUMNS ... +alter table partitioned_delete_2 add columns(c int, d string); + +insert into table partitioned_delete_2 partition(part=2) SELECT insert_num, int1, 'new', int1, string1 FROM schema_evolution_data_2 WHERE insert_num <=110; + +insert into table partitioned_delete_2 partition(part=1) SELECT insert_num, int1, 'new', int1, string1 FROM schema_evolution_data_2 WHERE insert_num > 110; + +select insert_num,part,a,b,c,d from partitioned_delete_2; + +-- DELETE where new column +delete from partitioned_delete_2 where insert_num = 108 or insert_num > 113; + +select insert_num,part,a,b,c,d from partitioned_delete_2; + +alter table partitioned_delete_2 partition(part=1) compact 'major'; +alter table partitioned_delete_2 partition(part=2) compact 'major'; + +select insert_num,part,a,b,c,d from partitioned_delete_2; + +DROP TABLE partitioned_delete_2; + +--following tests is moved from system tests +drop table if exists missing_ddl_2; +create table missing_ddl_2(name string, age int); +insert overwrite table missing_ddl_2 select value, key from srcbucket; +alter table missing_ddl_2 add columns (gps double); + +set hive.exec.dynamic.partition.mode=nonstrict; +set hive.optimize.sort.dynamic.partition=true; + +DROP TABLE IF EXISTS all100kjson_textfile_orc; +CREATE TABLE all100kjson_textfile_orc ( + si smallint, + i int, + b bigint, + f float, + d double, + s string, + bo boolean, + ts timestamp) + PARTITIONED BY (t tinyint) + ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe' + WITH SERDEPROPERTIES ('timestamp.formats'='yyyy-MM-dd\'T\'HH:mm:ss') + STORED AS TEXTFILE; + +INSERT INTO TABLE all100kjson_textfile_orc PARTITION (t) SELECT csmallint, cint, cbigint, cfloat, cdouble, cstring1, cboolean1, ctimestamp1, ctinyint FROM alltypesorc WHERE ctinyint > 0; + +ALTER TABLE all100kjson_textfile_orc + SET FILEFORMAT + INPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat' + OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat' + SERDE 'org.apache.hadoop.hive.ql.io.orc.OrcSerde'; + +INSERT INTO TABLE all100kjson_textfile_orc PARTITION (t) SELECT csmallint, cint, cbigint, cfloat, cdouble, cstring1, cboolean1, ctimestamp1, ctinyint FROM alltypesorc WHERE ctinyint < 1 and ctinyint > -50 ; + +-- HIVE-11977: Hive should handle an external avro table with zero length files present +DROP TABLE IF EXISTS emptyavro; +CREATE TABLE emptyavro (i int) + PARTITIONED BY (s string) + STORED AS AVRO; +load data local inpath '../../data/files/empty1.txt' into table emptyavro PARTITION (s='something'); +SELECT COUNT(*) from emptyavro; \ No newline at end of file http://git-wip-us.apache.org/repos/asf/hive/blob/255cf4ab/ql/src/test/queries/clientpositive/schema_evol_orc_acid_table_llap_io.q ---------------------------------------------------------------------- diff --git a/ql/src/test/queries/clientpositive/schema_evol_orc_acid_table_llap_io.q b/ql/src/test/queries/clientpositive/schema_evol_orc_acid_table_llap_io.q new file mode 100644 index 0000000..66e6da4 --- /dev/null +++ b/ql/src/test/queries/clientpositive/schema_evol_orc_acid_table_llap_io.q @@ -0,0 +1,318 @@ +set hive.explain.user=false; +set hive.fetch.task.conversion=none; +set hive.cli.print.header=true; +set hive.support.concurrency=true; +set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager; +set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat; +SET hive.exec.schema.evolution=false; +SET hive.vectorized.use.vectorized.input.format=true; +SET hive.vectorized.use.vector.serde.deserialize=false; +SET hive.vectorized.use.row.serde.deserialize=false; +SET hive.vectorized.execution.enabled=false; +set hive.exec.dynamic.partition.mode=nonstrict; +set hive.metastore.disallow.incompatible.col.type.changes=true; +set hive.default.fileformat=orc; +set hive.llap.io.enabled=true; +set hive.llap.io.encode.enabled=true; + +-- SORT_QUERY_RESULTS +-- +-- FILE VARIATION: ORC, ACID Non-Vectorized, MapWork, Table +-- *IMPORTANT NOTE* We set hive.exec.schema.evolution=false above since schema evolution is always used for ACID. +-- Also, we don't do regular EXPLAINs on ACID files because the transaction id causes Q file statistics differences... +-- Instead just one explain vectorization only detail +-- + +CREATE TABLE schema_evolution_data(insert_num int, boolean1 boolean, tinyint1 tinyint, smallint1 smallint, int1 int, bigint1 bigint, decimal1 decimal(38,18), float1 float, double1 double, string1 string, string2 string, date1 date, timestamp1 timestamp, boolean_str string, tinyint_str string, smallint_str string, int_str string, bigint_str string, decimal_str string, float_str string, double_str string, date_str string, timestamp_str string, filler string) +row format delimited fields terminated by '|' stored as textfile; +load data local inpath '../../data/files/schema_evolution/schema_evolution_data.txt' overwrite into table schema_evolution_data; + +------------------------------------------------------------------------------------------ +-- SECTION: ALTER TABLE ADD COLUMNS +-- +-- +-- SUBSECTION: ALTER TABLE ADD COLUMNS: INT PERMUTE SELECT +-- +-- +CREATE TABLE table_add_int_permute_select(insert_num int, a INT, b STRING) clustered by (a) into 2 buckets STORED AS ORC TBLPROPERTIES ('transactional'='true'); + +insert into table table_add_int_permute_select SELECT insert_num, int1, 'original' FROM schema_evolution_data; + +-- Table-Non-Cascade ADD COLUMNS ... +alter table table_add_int_permute_select add columns(c int); + +insert into table table_add_int_permute_select VALUES (111, 80000, 'new', 80000); + +explain vectorization only detail +select insert_num,a,b,c from table_add_int_permute_select; + +-- SELECT permutation columns to make sure NULL defaulting works right +select insert_num,a,b from table_add_int_permute_select; +select insert_num,a,b,c from table_add_int_permute_select; +select insert_num,c from table_add_int_permute_select; + +drop table table_add_int_permute_select; + + +-- SUBSECTION: ALTER TABLE ADD COLUMNS: INT, STRING, PERMUTE SELECT +-- +-- +CREATE TABLE table_add_int_string_permute_select(insert_num int, a INT, b STRING) clustered by (a) into 2 buckets STORED AS ORC TBLPROPERTIES ('transactional'='true'); + +insert into table table_add_int_string_permute_select SELECT insert_num, int1, 'original' FROM schema_evolution_data; + +-- Table-Non-Cascade ADD COLUMNS ... +alter table table_add_int_string_permute_select add columns(c int, d string); + +insert into table table_add_int_string_permute_select VALUES (111, 80000, 'new', 80000, 'filler'); + +-- SELECT permutation columns to make sure NULL defaulting works right +select insert_num,a,b from table_add_int_string_permute_select; +select insert_num,a,b,c from table_add_int_string_permute_select; +select insert_num,a,b,c,d from table_add_int_string_permute_select; +select insert_num,a,c,d from table_add_int_string_permute_select; +select insert_num,a,d from table_add_int_string_permute_select; +select insert_num,c from table_add_int_string_permute_select; +select insert_num,d from table_add_int_string_permute_select; + +drop table table_add_int_string_permute_select; + + + +------------------------------------------------------------------------------------------ +-- SECTION: ALTER TABLE CHANGE COLUMNS for STRING_GROUP -> DOUBLE +-- +-- +-- +-- SUBSECTION: ALTER TABLE CHANGE COLUMNS for STRING_GROUP -> DOUBLE: (STRING, CHAR, VARCHAR) +-- +CREATE TABLE table_change_string_group_double(insert_num int, c1 STRING, c2 CHAR(50), c3 VARCHAR(50), b STRING) clustered by (c1) into 2 buckets STORED AS ORC TBLPROPERTIES ('transactional'='true'); + +insert into table table_change_string_group_double SELECT insert_num, double_str, double_str, double_str, 'original' FROM schema_evolution_data; + +-- Table-Non-Cascade CHANGE COLUMNS ... +alter table table_change_string_group_double replace columns (insert_num int, c1 DOUBLE, c2 DOUBLE, c3 DOUBLE, b STRING); + +insert into table table_change_string_group_double VALUES (111, 789.321, 789.321, 789.321, 'new'); + +select insert_num,c1,c2,c3,b from table_change_string_group_double; + +drop table table_change_string_group_double; + +------------------------------------------------------------------------------------------ +-- SECTION: ALTER TABLE CHANGE COLUMNS for DATE_GROUP -> STRING_GROUP +-- +-- +-- +-- SUBSECTION: ALTER TABLE CHANGE COLUMNS for DATE_GROUP -> STRING_GROUP: DATE,TIMESTAMP, (STRING, CHAR, CHAR trunc, VARCHAR, VARCHAR trunc) +-- +CREATE TABLE table_change_date_group_string_group_date_group(insert_num int, c1 DATE, c2 DATE, c3 DATE, c4 DATE, c5 DATE, c6 TIMESTAMP, c7 TIMESTAMP, c8 TIMESTAMP, c9 TIMESTAMP, c10 TIMESTAMP, b STRING) clustered by (c1) into 2 buckets STORED AS ORC TBLPROPERTIES ('transactional'='true'); + +insert into table table_change_date_group_string_group_date_group SELECT insert_num, date1, date1, date1, date1, date1, timestamp1, timestamp1, timestamp1, timestamp1, timestamp1, 'original' FROM schema_evolution_data; + +-- Table-Non-Cascade CHANGE COLUMNS ... +alter table table_change_date_group_string_group_date_group replace columns(insert_num int, c1 STRING, c2 CHAR(50), c3 CHAR(15), c4 VARCHAR(50), c5 VARCHAR(15), c6 STRING, c7 CHAR(50), c8 CHAR(15), c9 VARCHAR(50), c10 VARCHAR(15), b STRING); + +insert into table table_change_date_group_string_group_date_group VALUES (111, 'filler', 'filler', 'filler', 'filler', 'filler', 'filler', 'filler', 'filler', 'filler', 'filler', 'new'); + +select insert_num,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,b from table_change_date_group_string_group_date_group; + +drop table table_change_date_group_string_group_date_group; + + + +------------------------------------------------------------------------------------------ +-- SECTION: ALTER TABLE CHANGE COLUMNS for NUMERIC_GROUP -> STRING_GROUP +-- +-- +-- +-- SUBSECTION: ALTER TABLE CHANGE COLUMNS for NUMERIC_GROUP -> STRING_GROUP: +-- (TINYINT, SMALLINT, INT, BIGINT), STRING and +-- (TINYINT, SMALLINT, INT, BIGINT), CHAR and CHAR trunc and +-- (TINYINT, SMALLINT, INT, BIGINT), VARCHAR and VARCHAR trunc +-- +-- +CREATE TABLE table_change_numeric_group_string_group_multi_ints_string_group(insert_num int, + c1 tinyint, c2 smallint, c3 int, c4 bigint, + c5 tinyint, c6 smallint, c7 int, c8 bigint, c9 tinyint, c10 smallint, c11 int, c12 bigint, + c13 tinyint, c14 smallint, c15 int, c16 bigint, c17 tinyint, c18 smallint, c19 int, c20 bigint, + b STRING) clustered by (c1) into 2 buckets STORED AS ORC TBLPROPERTIES ('transactional'='true'); + +insert into table table_change_numeric_group_string_group_multi_ints_string_group SELECT insert_num, + tinyint1, smallint1, int1, bigint1, + tinyint1, smallint1, int1, bigint1, tinyint1, smallint1, int1, bigint1, + tinyint1, smallint1, int1, bigint1, tinyint1, smallint1, int1, bigint1, + 'original' FROM schema_evolution_data; + +select insert_num,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15,c16,c17,c18,c19,c20,b from table_change_numeric_group_string_group_multi_ints_string_group; + +-- Table-Non-Cascade CHANGE COLUMNS ... +alter table table_change_numeric_group_string_group_multi_ints_string_group replace columns (insert_num int, + c1 STRING, c2 STRING, c3 STRING, c4 STRING, + c5 CHAR(50), c6 CHAR(50), c7 CHAR(50), c8 CHAR(50), c9 CHAR(5), c10 CHAR(5), c11 CHAR(5), c12 CHAR(5), + c13 VARCHAR(50), c14 VARCHAR(50), c15 VARCHAR(50), c16 VARCHAR(50), c17 VARCHAR(5), c18 VARCHAR(5), c19 VARCHAR(5), c20 VARCHAR(5), + b STRING) ; + +insert into table table_change_numeric_group_string_group_multi_ints_string_group VALUES (111, + 'filler', 'filler', 'filler', 'filler', + 'filler', 'filler', 'filler', 'filler', 'filler', 'filler', 'filler', 'filler', + 'filler', 'filler', 'filler', 'filler', 'filler', 'filler', 'filler', 'filler', + 'new'); + +select insert_num,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15,c16,c17,c18,c19,c20,b from table_change_numeric_group_string_group_multi_ints_string_group; + +drop table table_change_numeric_group_string_group_multi_ints_string_group; + + + +-- +-- SUBSECTION: ALTER TABLE CHANGE COLUMNS for NUMERIC_GROUP -> STRING_GROUP: +-- (DECIMAL, FLOAT, DOUBLE), STRING and +-- (DECIMAL, FLOAT, DOUBLE), CHAR and CHAR trunc and +-- (DECIMAL, FLOAT, DOUBLE), VARCHAR and VARCHAR trunc +-- +-- +CREATE TABLE table_change_numeric_group_string_group_floating_string_group(insert_num int, + c1 decimal(38,18), c2 float, c3 double, + c4 decimal(38,18), c5 float, c6 double, c7 decimal(38,18), c8 float, c9 double, + c10 decimal(38,18), c11 float, c12 double, c13 decimal(38,18), c14 float, c15 double, + b STRING) clustered by (c1) into 2 buckets STORED AS ORC TBLPROPERTIES ('transactional'='true'); + +insert into table table_change_numeric_group_string_group_floating_string_group SELECT insert_num, + decimal1, float1, double1, + decimal1, float1, double1, decimal1, float1, double1, + decimal1, float1, double1, decimal1, float1, double1, + 'original' FROM schema_evolution_data; + +select insert_num,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15,b from table_change_numeric_group_string_group_floating_string_group; + +-- Table-Non-Cascade CHANGE COLUMNS ... +alter table table_change_numeric_group_string_group_floating_string_group replace columns (insert_num int, + c1 STRING, c2 STRING, c3 STRING, + c4 CHAR(50), c5 CHAR(50), c6 CHAR(50), c7 CHAR(7), c8 CHAR(7), c9 CHAR(7), + c10 VARCHAR(50), c11 VARCHAR(50), c12 VARCHAR(50), c13 VARCHAR(7), c14 VARCHAR(7), c15 VARCHAR(7), + b STRING); + +insert into table table_change_numeric_group_string_group_floating_string_group VALUES (111, + 'filler', 'filler', 'filler', + 'filler', 'filler', 'filler', 'filler', 'filler', 'filler', + 'filler', 'filler', 'filler', 'filler', 'filler', 'filler', + 'new'); + +select insert_num,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15,b from table_change_numeric_group_string_group_floating_string_group; + +drop table table_change_numeric_group_string_group_floating_string_group; + + +------------------------------------------------------------------------------------------ +-- SECTION: ALTER TABLE CHANGE COLUMNS for STRING_GROUP -> STRING_GROUP +-- +-- +-- +-- SUBSECTION: ALTER TABLE CHANGE COLUMNS for STRING_GROUP -> STRING_GROUP: STRING, (CHAR, CHAR trunc, VARCHAR, VARCHAR trunc) and +-- CHAR, (VARCHAR, VARCHAR trunc, STRING) and VARCHAR, (CHAR, CHAR trunc, STRING) +-- +CREATE TABLE table_change_string_group_string_group_string(insert_num int, + c1 string, c2 string, c3 string, c4 string, + c5 CHAR(50), c6 CHAR(50), c7 CHAR(50), + c8 VARCHAR(50), c9 VARCHAR(50), c10 VARCHAR(50), b STRING) clustered by (c1) into 2 buckets STORED AS ORC TBLPROPERTIES ('transactional'='true'); + +insert into table table_change_string_group_string_group_string SELECT insert_num, + string2, string2, string2, string2, + string2, string2, string2, + string2, string2, string2, + 'original' FROM schema_evolution_data; + +select insert_num,c1,c2,c3,c4,b from table_change_string_group_string_group_string; + +-- Table-Non-Cascade CHANGE COLUMNS ... +alter table table_change_string_group_string_group_string replace columns (insert_num int, + c1 CHAR(50), c2 CHAR(9), c3 VARCHAR(50), c4 CHAR(9), + c5 VARCHAR(50), c6 VARCHAR(9), c7 STRING, + c8 CHAR(50), c9 CHAR(9), c10 STRING, b STRING) ; + +insert into table table_change_string_group_string_group_string VALUES (111, + 'filler', 'filler', 'filler', 'filler', + 'filler', 'filler', 'filler', + 'filler', 'filler', 'filler', + 'new'); + +select insert_num,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,b from table_change_string_group_string_group_string; + +drop table table_change_string_group_string_group_string; + + + +------------------------------------------------------------------------------------------ +-- SECTION: ALTER TABLE CHANGE COLUMNS for "lower" type to "higher" NUMERIC_GROUP +-- +-- +-- +-- SUBSECTION: ALTER TABLE CHANGE COLUMNS for "lower" type to "higher" NUMERIC_GROUP: +-- TINYINT, (SMALLINT, INT, BIGINT, DECIMAL, FLOAT, DOUBLE) and +-- SMALLINT, (INT, BIGINT, DECIMAL, FLOAT, DOUBLE) and +-- INT, (BIGINT, DECIMAL, FLOAT, DOUBLE) and +-- BIGINT, (DECIMAL, FLOAT, DOUBLE) +-- +CREATE TABLE table_change_lower_to_higher_numeric_group_tinyint_to_bigint(insert_num int, + c1 tinyint, c2 tinyint, c3 tinyint, c4 tinyint, c5 tinyint, c6 tinyint, + c7 smallint, c8 smallint, c9 smallint, c10 smallint, c11 smallint, + c12 int, c13 int, c14 int, c15 int, + c16 bigint, c17 bigint, c18 bigint, + b STRING) clustered by (c1) into 2 buckets STORED AS ORC TBLPROPERTIES ('transactional'='true'); + +insert into table table_change_lower_to_higher_numeric_group_tinyint_to_bigint SELECT insert_num, + tinyint1, tinyint1, tinyint1, tinyint1, tinyint1, tinyint1, + smallint1, smallint1, smallint1, smallint1, smallint1, + int1, int1, int1, int1, + bigint1, bigint1, bigint1, + 'original' FROM schema_evolution_data; + +select insert_num,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15,c16,c17,c18,b from table_change_lower_to_higher_numeric_group_tinyint_to_bigint; + +-- Table-Non-Cascade CHANGE COLUMNS ... +alter table table_change_lower_to_higher_numeric_group_tinyint_to_bigint replace columns (insert_num int, + c1 SMALLINT, c2 INT, c3 BIGINT, c4 decimal(38,18), c5 FLOAT, c6 DOUBLE, + c7 INT, c8 BIGINT, c9 decimal(38,18), c10 FLOAT, c11 DOUBLE, + c12 BIGINT, c13 decimal(38,18), c14 FLOAT, c15 DOUBLE, + c16 decimal(38,18), c17 FLOAT, c18 DOUBLE, + b STRING) ; + +insert into table table_change_lower_to_higher_numeric_group_tinyint_to_bigint VALUES (111, + 7000, 80000, 90000000, 1234.5678, 9876.543, 789.321, + 80000, 90000000, 1234.5678, 9876.543, 789.321, + 90000000, 1234.5678, 9876.543, 789.321, + 1234.5678, 9876.543, 789.321, + 'new'); + +select insert_num,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15,c16,c17,c18,b from table_change_lower_to_higher_numeric_group_tinyint_to_bigint; + +drop table table_change_lower_to_higher_numeric_group_tinyint_to_bigint; + + + +-- +-- SUBSECTION: ALTER TABLE CHANGE COLUMNS for "lower" type to "higher" NUMERIC_GROUP: +-- DECIMAL, (FLOAT, DOUBLE) and +-- FLOAT, (DOUBLE) +-- +CREATE TABLE table_change_lower_to_higher_numeric_group_decimal_to_float(insert_num int, + c1 decimal(38,18), c2 decimal(38,18), + c3 float, + b STRING) clustered by (c1) into 2 buckets STORED AS ORC TBLPROPERTIES ('transactional'='true'); + +insert into table table_change_lower_to_higher_numeric_group_decimal_to_float SELECT insert_num, + decimal1, decimal1, + float1, + 'original' FROM schema_evolution_data; + +select insert_num,c1,c2,c3,b from table_change_lower_to_higher_numeric_group_decimal_to_float; + +-- Table-Non-Cascade CHANGE COLUMNS ... +alter table table_change_lower_to_higher_numeric_group_decimal_to_float replace columns (insert_num int, c1 float, c2 double, c3 DOUBLE, b STRING) ; + +insert into table table_change_lower_to_higher_numeric_group_decimal_to_float VALUES (111, 1234.5678, 9876.543, 1234.5678, 'new'); + +select insert_num,c1,c2,c3,b from table_change_lower_to_higher_numeric_group_decimal_to_float; + +drop table table_change_lower_to_higher_numeric_group_decimal_to_float; http://git-wip-us.apache.org/repos/asf/hive/blob/255cf4ab/ql/src/test/queries/clientpositive/schema_evol_orc_acid_table_update_llap_io.q ---------------------------------------------------------------------- diff --git a/ql/src/test/queries/clientpositive/schema_evol_orc_acid_table_update_llap_io.q b/ql/src/test/queries/clientpositive/schema_evol_orc_acid_table_update_llap_io.q new file mode 100644 index 0000000..16df15a --- /dev/null +++ b/ql/src/test/queries/clientpositive/schema_evol_orc_acid_table_update_llap_io.q @@ -0,0 +1,113 @@ +set hive.explain.user=false; +set hive.fetch.task.conversion=none; +set hive.cli.print.header=true; +set hive.support.concurrency=true; +set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager; +set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat; +SET hive.exec.schema.evolution=false; +SET hive.vectorized.use.vectorized.input.format=true; +SET hive.vectorized.use.vector.serde.deserialize=false; +SET hive.vectorized.use.row.serde.deserialize=false; +SET hive.vectorized.execution.enabled=false; +set hive.exec.dynamic.partition.mode=nonstrict; +set hive.metastore.disallow.incompatible.col.type.changes=true; +set hive.default.fileformat=orc; +set hive.llap.io.enabled=true; +set hive.llap.io.encode.enabled=true; + +-- SORT_QUERY_RESULTS +-- +-- FILE VARIATION: ORC, ACID Non-Vectorized, MapWork, Table +-- *IMPORTANT NOTE* We set hive.exec.schema.evolution=false above since schema evolution is always used for ACID. +-- Also, we don't do EXPLAINs on ACID files because the transaction id causes Q file statistics differences... +-- + +CREATE TABLE schema_evolution_data(insert_num int, boolean1 boolean, tinyint1 tinyint, smallint1 smallint, int1 int, bigint1 bigint, decimal1 decimal(38,18), float1 float, double1 double, string1 string, string2 string, date1 date, timestamp1 timestamp, boolean_str string, tinyint_str string, smallint_str string, int_str string, bigint_str string, decimal_str string, float_str string, double_str string, date_str string, timestamp_str string, filler string) +row format delimited fields terminated by '|' stored as textfile; +load data local inpath '../../data/files/schema_evolution/schema_evolution_data.txt' overwrite into table schema_evolution_data; + +CREATE TABLE schema_evolution_data_2(insert_num int, boolean1 boolean, tinyint1 tinyint, smallint1 smallint, int1 int, bigint1 bigint, decimal1 decimal(38,18), float1 float, double1 double, string1 string, string2 string, date1 date, timestamp1 timestamp, boolean_str string, tinyint_str string, smallint_str string, int_str string, bigint_str string, decimal_str string, float_str string, double_str string, date_str string, timestamp_str string, filler string) +row format delimited fields terminated by '|' stored as textfile; +load data local inpath '../../data/files/schema_evolution/schema_evolution_data_2.txt' overwrite into table schema_evolution_data_2; + +-- +-- +-- SECTION VARIATION: ALTER TABLE ADD COLUMNS ... UPDATE New Columns +--- +CREATE TABLE table5(insert_num int, a INT, b STRING) clustered by (a) into 2 buckets STORED AS ORC TBLPROPERTIES ('transactional'='true'); + +insert into table table5 SELECT insert_num, int1, 'original' FROM schema_evolution_data; + +-- Table-Non-Cascade ADD COLUMNS ... +alter table table5 add columns(c int, d string); + +insert into table table5 SELECT insert_num, int1, 'new', int1, string1 FROM schema_evolution_data_2; + +select a,b,c,d from table5; + +-- UPDATE New Columns +update table5 set c=99; + +select a,b,c,d from table5; + +alter table table5 compact 'major'; + +select a,b,c,d from table5; + +DROP TABLE table5; + +-- +-- +-- SECTION VARIATION: ALTER TABLE ADD COLUMNS ... DELETE where old column +--- +CREATE TABLE table6(insert_num int, a INT, b STRING) clustered by (a) into 2 buckets STORED AS ORC TBLPROPERTIES ('transactional'='true'); + +insert into table table6 SELECT insert_num, int1, 'original' FROM schema_evolution_data; + +-- Table-Non-Cascade ADD COLUMNS ... +alter table table6 add columns(c int, d string); + +insert into table table6 SELECT insert_num, int1, 'new', int1, string1 FROM schema_evolution_data_2 WHERE insert_num <= 110; + +insert into table table6 SELECT insert_num, int1, 'new', int1, string1 FROM schema_evolution_data_2 WHERE insert_num > 110; + +select a,b,c,d from table6; + +-- DELETE where old column +delete from table6 where insert_num = 102 or insert_num = 104 or insert_num = 106; + +select a,b,c,d from table6; + +alter table table6 compact 'major'; + +select a,b,c,d from table6; + +DROP TABLE table6; + +-- +-- +-- SECTION VARIATION: ALTER TABLE ADD COLUMNS ... DELETE where new column +--- +CREATE TABLE table7(insert_num int, a INT, b STRING) clustered by (a) into 2 buckets STORED AS ORC TBLPROPERTIES ('transactional'='true'); + +insert into table table7 SELECT insert_num, int1, 'original' FROM schema_evolution_data; + +-- Table-Non-Cascade ADD COLUMNS ... +alter table table7 add columns(c int, d string); + +insert into table table7 SELECT insert_num, int1, 'new', int1, string1 FROM schema_evolution_data_2 WHERE insert_num <= 110; + +insert into table table7 SELECT insert_num, int1, 'new', int1, string1 FROM schema_evolution_data_2 WHERE insert_num > 110; + +select a,b,c,d from table7; + +-- DELETE where new column +delete from table7 where insert_num = 107 or insert_num >= 110; + +select a,b,c,d from table7; + +alter table table7 compact 'major'; + +select a,b,c,d from table7; + +DROP TABLE table7;
