hive git commit: HIVE-12887 Handle ORC schema on read with fewer columns than file schema (after Schema Evolution changes) (Matt McCline, reviewed by Sergey Shelukhin)
Repository: hive Updated Branches: refs/heads/branch-2.0 8b3e7aa51 -> a3502d05c HIVE-12887 Handle ORC schema on read with fewer columns than file schema (after Schema Evolution changes) (Matt McCline, reviewed by Sergey Shelukhin) Project: http://git-wip-us.apache.org/repos/asf/hive/repo Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/a3502d05 Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/a3502d05 Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/a3502d05 Branch: refs/heads/branch-2.0 Commit: a3502d05ce4349cfb8836c37311018d49872ca93 Parents: 8b3e7aa Author: Matt McCline Authored: Wed Jan 20 14:08:41 2016 -0800 Committer: Matt McCline Committed: Tue Apr 26 16:26:58 2016 -0700 -- .../hadoop/hive/ql/io/orc/RecordReaderImpl.java | 8 +- .../hive/ql/io/orc/TreeReaderFactory.java | 29 +++--- .../queries/clientpositive/orc_remove_cols.q | 17 .../clientpositive/orc_remove_cols.q.out | 94 4 files changed, 132 insertions(+), 16 deletions(-) -- http://git-wip-us.apache.org/repos/asf/hive/blob/a3502d05/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java -- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java index 567899a..acfe1a2 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java @@ -242,6 +242,9 @@ public class RecordReaderImpl implements RecordReader { this.types = builder.types; TreeReaderFactory.TreeReaderSchema treeReaderSchema; if (options.getSchema() == null) { + if (LOG.isInfoEnabled()) { +LOG.info("Schema on read not provided -- using file schema " + types.toString()); + } treeReaderSchema = new TreeReaderFactory.TreeReaderSchema().fileTypes(types).schemaTypes(types); } else { @@ -999,7 +1002,7 @@ public class RecordReaderImpl implements RecordReader { // since stream kind is optional, first check if it exists if (stream.hasKind() && 
(StreamName.getArea(streamKind) == StreamName.Area.DATA) && - includedColumns[column]) { + (column < includedColumns.length && includedColumns[column])) { // if we aren't filtering or it is a dictionary, load it. if (includedRowGroups == null || RecordReaderUtils.isDictionary(streamKind, encodings.get(column))) { @@ -1024,7 +1027,8 @@ public class RecordReaderImpl implements RecordReader { long streamOffset = 0; for (OrcProto.Stream streamDesc : streamDescriptions) { int column = streamDesc.getColumn(); - if ((includeColumn != null && !includeColumn[column]) || + if ((includeColumn != null && + (column < included.length && !includeColumn[column])) || streamDesc.hasKind() && (StreamName.getArea(streamDesc.getKind()) != StreamName.Area.DATA)) { streamOffset += streamDesc.getLength(); http://git-wip-us.apache.org/repos/asf/hive/blob/a3502d05/ql/src/java/org/apache/hadoop/hive/ql/io/orc/TreeReaderFactory.java -- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/TreeReaderFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/TreeReaderFactory.java index d8a134b..8bb32ea 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/TreeReaderFactory.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/TreeReaderFactory.java @@ -2050,7 +2050,7 @@ public class TreeReaderFactory { } protected static class StructTreeReader extends TreeReader { -private final int fileColumnCount; +private final int readColumnCount; private final int resultColumnCount; protected final TreeReader[] fields; private final String[] fieldNames; @@ -2063,30 +2063,31 @@ public class TreeReaderFactory { super(columnId); OrcProto.Type fileStructType = treeReaderSchema.getFileTypes().get(columnId); - fileColumnCount = fileStructType.getFieldNamesCount(); OrcProto.Type schemaStructType = treeReaderSchema.getSchemaTypes().get(columnId); + readColumnCount = Math.min(fileStructType.getFieldNamesCount(), schemaStructType.getFieldNamesCount()); + if (columnId == 
treeReaderSchema.getInnerStructSubtype()) { // If there are more result columns than reader columns, we will default those additional // columns to NULL. resultColumnCount = schemaStructType.getFieldNamesCount(); } else { -resultColumnCount = fileColumnCount; +resultColumnCount = readColumnCount; } -
hive git commit: HIVE-12887 Handle ORC schema on read with fewer columns than file schema (after Schema Evolution changes) (Matt McCline, reviewed by Sergey Shelukhin)
Repository: hive Updated Branches: refs/heads/master 43c7f3a8b -> 64ab5aba9 HIVE-12887 Handle ORC schema on read with fewer columns than file schema (after Schema Evolution changes) (Matt McCline, reviewed by Sergey Shelukhin) Project: http://git-wip-us.apache.org/repos/asf/hive/repo Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/64ab5aba Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/64ab5aba Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/64ab5aba Branch: refs/heads/master Commit: 64ab5aba9768a710efeb8d0f337c48f571d55f8b Parents: 43c7f3a Author: Matt McCline Authored: Wed Jan 20 14:08:41 2016 -0800 Committer: Matt McCline Committed: Wed Jan 20 14:08:41 2016 -0800 -- .../hadoop/hive/ql/io/orc/OrcInputFormat.java | 43 ++--- .../hadoop/hive/ql/io/orc/RecordReaderImpl.java | 8 +- .../hive/ql/io/orc/TreeReaderFactory.java | 29 +++--- .../queries/clientpositive/orc_remove_cols.q | 17 .../clientpositive/orc_remove_cols.q.out | 94 5 files changed, 164 insertions(+), 27 deletions(-) -- http://git-wip-us.apache.org/repos/asf/hive/blob/64ab5aba/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java -- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java index eae281c..8119449 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java @@ -87,6 +87,7 @@ import org.apache.hadoop.hive.ql.io.sarg.SearchArgument; import org.apache.hadoop.hive.ql.io.sarg.SearchArgument.TruthValue; import org.apache.hadoop.hive.ql.metadata.Hive; import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.metadata.VirtualColumn; import org.apache.hadoop.hive.ql.io.sarg.SearchArgumentFactory; import org.apache.hadoop.hive.serde2.ColumnProjectionUtils; import org.apache.hadoop.hive.serde2.SerDeStats; @@ -2090,12 +2091,13 @@ public class OrcInputFormat 
implements InputFormat<NullWritable, OrcStruct>, } if (haveSchemaEvolutionProperties) { - LOG.info("Using schema evolution configuration variables schema.evolution.columns " + - schemaEvolutionColumnNames.toString() + - " / schema.evolution.columns.types " + - schemaEvolutionTypeDescrs.toString() + - " (isAcid " + isAcid + ")"); - + if (LOG.isInfoEnabled()) { +LOG.info("Using schema evolution configuration variables schema.evolution.columns " + +schemaEvolutionColumnNames.toString() + +" / schema.evolution.columns.types " + +schemaEvolutionTypeDescrs.toString() + +" (isAcid " + isAcid + ")"); + } } else { // Try regular properties; @@ -2114,11 +2116,30 @@ public class OrcInputFormat implements InputFormat<NullWritable, OrcStruct>, if (schemaEvolutionTypeDescrs.size() != schemaEvolutionColumnNames.size()) { return null; } - LOG.info("Using column configuration variables columns " + - schemaEvolutionColumnNames.toString() + - " / columns.types " + - schemaEvolutionTypeDescrs.toString() + - " (isAcid " + isAcid + ")"); + + // Find first virtual column and clip them off. + int virtualColumnClipNum = -1; + int columnNum = 0; + for (String columnName : schemaEvolutionColumnNames) { +if (VirtualColumn.VIRTUAL_COLUMN_NAMES.contains(columnName)) { + virtualColumnClipNum = columnNum; + break; +} +columnNum++; + } + if (virtualColumnClipNum != -1) { +schemaEvolutionColumnNames = +Lists.newArrayList(schemaEvolutionColumnNames.subList(0, virtualColumnClipNum)); +schemaEvolutionTypeDescrs = Lists.newArrayList(schemaEvolutionTypeDescrs.subList(0, virtualColumnClipNum)); + } + + if (LOG.isInfoEnabled()) { +LOG.info("Using column configuration variables columns " + +schemaEvolutionColumnNames.toString() + +" / columns.types " + +schemaEvolutionTypeDescrs.toString() + +" (isAcid " + isAcid + ")"); + } } // Desired schema does not include virtual columns or partition columns. 
http://git-wip-us.apache.org/repos/asf/hive/blob/64ab5aba/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java -- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java index
hive git commit: HIVE-12887 Handle ORC schema on read with fewer columns than file schema (after Schema Evolution changes) (Matt McCline, reviewed by Sergey Shelukhin)
Repository: hive Updated Branches: refs/heads/branch-1 d36207ec9 -> 743585890 HIVE-12887 Handle ORC schema on read with fewer columns than file schema (after Schema Evolution changes) (Matt McCline, reviewed by Sergey Shelukhin) Project: http://git-wip-us.apache.org/repos/asf/hive/repo Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/74358589 Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/74358589 Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/74358589 Branch: refs/heads/branch-1 Commit: 743585890950d8360b84ae044d50cc1fb8151a61 Parents: d36207e Author: Matt McCline Authored: Wed Jan 20 14:25:21 2016 -0800 Committer: Matt McCline Committed: Wed Jan 20 14:25:21 2016 -0800 -- .../apache/hadoop/hive/ql/io/orc/OrcUtils.java | 49 ++ .../hadoop/hive/ql/io/orc/RecordReaderImpl.java | 8 +- .../hive/ql/io/orc/TreeReaderFactory.java | 29 +++--- .../queries/clientpositive/orc_remove_cols.q | 17 .../clientpositive/orc_remove_cols.q.out | 94 5 files changed, 164 insertions(+), 33 deletions(-) -- http://git-wip-us.apache.org/repos/asf/hive/blob/74358589/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcUtils.java -- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcUtils.java index ad4a9e8..84fd3c3 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcUtils.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcUtils.java @@ -36,6 +36,7 @@ import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.conf.HiveConf.ConfVars; import org.apache.hadoop.hive.ql.io.IOConstants; import org.apache.hadoop.hive.ql.io.orc.OrcProto.Type; +import org.apache.hadoop.hive.ql.metadata.VirtualColumn; import org.apache.hadoop.hive.serde.serdeConstants; import org.apache.hadoop.hive.serde2.typeinfo.BaseCharTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo; @@ -721,15 +722,13 @@ public class OrcUtils { } if (haveSchemaEvolutionProperties) { - LOG.info("Using 
schema evolution configuration variables " + - "schema.evolution.columns " + - schemaEvolutionColumnNames.toString() + - " / schema.evolution.columns.types " + - schemaEvolutionTypeDescrs.toString() + - " (isAcid " + - isAcid + - ")"); - + if (LOG.isInfoEnabled()) { +LOG.info("Using schema evolution configuration variables schema.evolution.columns " + +schemaEvolutionColumnNames.toString() + +" / schema.evolution.columns.types " + +schemaEvolutionTypeDescrs.toString() + +" (isAcid " + isAcid + ")"); + } } else { // Try regular properties; @@ -748,14 +747,30 @@ public class OrcUtils { if (schemaEvolutionTypeDescrs.size() != schemaEvolutionColumnNames.size()) { return null; } - LOG.info("Using column configuration variables " + - "columns " + - schemaEvolutionColumnNames.toString() + - " / columns.types " + - schemaEvolutionTypeDescrs.toString() + - " (isAcid " + - isAcid + - ")"); + + // Find first virtual column and clip them off. + int virtualColumnClipNum = -1; + int columnNum = 0; + for (String columnName : schemaEvolutionColumnNames) { +if (VirtualColumn.VIRTUAL_COLUMN_NAMES.contains(columnName)) { + virtualColumnClipNum = columnNum; + break; +} +columnNum++; + } + if (virtualColumnClipNum != -1) { +schemaEvolutionColumnNames = +Lists.newArrayList(schemaEvolutionColumnNames.subList(0, virtualColumnClipNum)); +schemaEvolutionTypeDescrs = Lists.newArrayList(schemaEvolutionTypeDescrs.subList(0, virtualColumnClipNum)); + } + + if (LOG.isInfoEnabled()) { +LOG.info("Using column configuration variables columns " + +schemaEvolutionColumnNames.toString() + +" / columns.types " + +schemaEvolutionTypeDescrs.toString() + +" (isAcid " + isAcid + ")"); + } } // Desired schema does not include virtual columns or partition columns. 
http://git-wip-us.apache.org/repos/asf/hive/blob/74358589/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java -- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java index 24834a5..44cac68 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java +++