IMPALA-3194: Allow queries materializing scalar type columns in RC/sequence files
This commit unblocks queries materializing only scalar typed columns on tables backed by RC/sequence files containing complex typed columns. Such queries worked prior to the 2.3.0 release. Change-Id: I3a89b211bdc01f7e07497e293fafd75ccf0500fe Reviewed-on: http://gerrit.cloudera.org:8080/2580 Reviewed-by: Alex Behm <[email protected]> Tested-by: Internal Jenkins Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/5cd7ada7 Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/5cd7ada7 Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/5cd7ada7 Branch: refs/heads/master Commit: 5cd7ada727d04fe56d62ced2e8bfa56f4448ea57 Parents: 2809746 Author: Bharath Vissapragada <[email protected]> Authored: Sun Mar 13 06:17:06 2016 -0700 Committer: Internal Jenkins <[email protected]> Committed: Thu Mar 31 12:06:57 2016 +0000 ---------------------------------------------------------------------- .../cloudera/impala/catalog/HdfsFileFormat.java | 28 ++++++++++++++------ .../cloudera/impala/planner/HdfsScanNode.java | 14 ++++++---- .../PlannerTest/complex-types-file-formats.test | 23 +++++++++------- 3 files changed, 43 insertions(+), 22 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/5cd7ada7/fe/src/main/java/com/cloudera/impala/catalog/HdfsFileFormat.java ---------------------------------------------------------------------- diff --git a/fe/src/main/java/com/cloudera/impala/catalog/HdfsFileFormat.java b/fe/src/main/java/com/cloudera/impala/catalog/HdfsFileFormat.java index 9c883fc..3670aa5 100644 --- a/fe/src/main/java/com/cloudera/impala/catalog/HdfsFileFormat.java +++ b/fe/src/main/java/com/cloudera/impala/catalog/HdfsFileFormat.java @@ -35,26 +35,26 @@ public enum HdfsFileFormat { RC_FILE("org.apache.hadoop.hive.ql.io.RCFileInputFormat", 
"org.apache.hadoop.hive.ql.io.RCFileOutputFormat", "org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe", - false), + false, true), TEXT("org.apache.hadoop.mapred.TextInputFormat", "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", - false), + false, false), LZO_TEXT("com.hadoop.mapred.DeprecatedLzoTextInputFormat", "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", - "", - false), + "", false, false), SEQUENCE_FILE("org.apache.hadoop.mapred.SequenceFileInputFormat", "org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat", - "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", false), + "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", false, + true), AVRO("org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat", "org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat", "org.apache.hadoop.hive.serde2.avro.AvroSerDe", - false), + false, false), PARQUET("org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat", "org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat", "org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe", - true); + true, true); private final String inputFormat_; private final String outputFormat_; @@ -63,12 +63,18 @@ public enum HdfsFileFormat { // Indicates whether we support scanning complex types for this file format. private final boolean isComplexTypesSupported_; + // Indicates whether the file format can skip complex columns in scans and just + // materialize scalar typed columns. Ignored if isComplexTypesSupported_ is true. + // TODO: Remove this once we support complex types for all file formats. 
+ private final boolean canSkipColumnTypes_; + HdfsFileFormat(String inputFormat, String outputFormat, String serializationLib, - boolean isComplexTypesSupported) { + boolean isComplexTypesSupported, boolean canSkipColumnTypes) { inputFormat_ = inputFormat; outputFormat_ = outputFormat; serializationLib_ = serializationLib; isComplexTypesSupported_ = isComplexTypesSupported; + canSkipColumnTypes_ = canSkipColumnTypes; } public String inputFormat() { return inputFormat_; } @@ -235,6 +241,12 @@ public enum HdfsFileFormat { public boolean isComplexTypesSupported() { return isComplexTypesSupported_; } /** + * Returns true if this file format can skip complex typed columns and materialize + * only scalar typed columns. + */ + public boolean canSkipComplexTypes() { return canSkipColumnTypes_; } + + /** * Returns a list with all formats for which isComplexTypesSupported() is true. */ public static List<HdfsFileFormat> complexTypesFormats() { http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/5cd7ada7/fe/src/main/java/com/cloudera/impala/planner/HdfsScanNode.java ---------------------------------------------------------------------- diff --git a/fe/src/main/java/com/cloudera/impala/planner/HdfsScanNode.java b/fe/src/main/java/com/cloudera/impala/planner/HdfsScanNode.java index c6f7722..5edc0dc 100644 --- a/fe/src/main/java/com/cloudera/impala/planner/HdfsScanNode.java +++ b/fe/src/main/java/com/cloudera/impala/planner/HdfsScanNode.java @@ -185,10 +185,11 @@ public class HdfsScanNode extends ScanNode { } if (firstComplexTypedCol == null) return; - boolean hasMaterializedSlots = false; + boolean referencesComplexTypedCol = false; for (SlotDescriptor slotDesc: desc_.getSlots()) { - if (slotDesc.isMaterialized()) { - hasMaterializedSlots = true; + if (!slotDesc.isMaterialized()) continue; + if (slotDesc.getType().isComplexType() || slotDesc.getColumn() == null) { + referencesComplexTypedCol = true; break; } } @@ -196,8 +197,11 @@ public class HdfsScanNode extends 
ScanNode { for (HdfsPartition part: partitions_) { HdfsFileFormat format = part.getInputFormatDescriptor().getFileFormat(); if (format.isComplexTypesSupported()) continue; - // Allow count(*) and similar queries on RC_FILE with complex types. - if (format == HdfsFileFormat.RC_FILE && !hasMaterializedSlots) continue; + // If the file format allows querying just scalar typed columns and the query + // doesn't materialize any complex typed columns, it is allowed. + if (format.canSkipComplexTypes() && !referencesComplexTypedCol) { + continue; + } String errSuffix = String.format( "Complex types are supported for these file formats: %s", Joiner.on(", ").join(HdfsFileFormat.complexTypesFormats())); http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/5cd7ada7/testdata/workloads/functional-planner/queries/PlannerTest/complex-types-file-formats.test ---------------------------------------------------------------------- diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/complex-types-file-formats.test b/testdata/workloads/functional-planner/queries/PlannerTest/complex-types-file-formats.test index 487bb3b..f0431a2 100644 --- a/testdata/workloads/functional-planner/queries/PlannerTest/complex-types-file-formats.test +++ b/testdata/workloads/functional-planner/queries/PlannerTest/complex-types-file-formats.test @@ -38,13 +38,18 @@ select 1 from functional_rc_snap.complextypes_fileformat t, t.a not implemented: Scan of table 't' in format 'RC_FILE' is not supported because the table has a column 's' with a complex type 'STRUCT<f1:STRING,f2:INT>'. Complex types are supported for these file formats: PARQUET. ==== -# Complex types are not supported on RC files, even if no complex-typed -# columns are selected. 
-select id from functional_rc_snap.complextypes_fileformat +select s.f1 from functional_rc_snap.complextypes_fileformat t, t.m ---- PLAN -not implemented: Scan of table 'functional_rc_snap.complextypes_fileformat' in format 'RC_FILE' is not supported because the table has a column 's' with a complex type 'STRUCT<f1:STRING,f2:INT>'. +not implemented: Scan of table 't' in format 'RC_FILE' is not supported because the table has a column 's' with a complex type 'STRUCT<f1:STRING,f2:INT>'. Complex types are supported for these file formats: PARQUET. ==== +# Complex types are not supported on RC files, however queries materializing +# only scalar type columns are allowed. +select id from functional_rc_snap.complextypes_fileformat +---- PLAN +00:SCAN HDFS [functional_rc_snap.complextypes_fileformat] + partitions=1/1 files=1 size=56B +==== # Complex types are not supported on RC files but count(*) and similar # queries should work. select count(*) from functional_rc_snap.complextypes_fileformat @@ -61,12 +66,12 @@ select s.f1 from functional_seq_snap.complextypes_fileformat t, t.a not implemented: Scan of table 't' in format 'SEQUENCE_FILE' is not supported because the table has a column 's' with a complex type 'STRUCT<f1:STRING,f2:INT>'. Complex types are supported for these file formats: PARQUET. ==== -# Complex types are not supported on sequence files, even if no complex-typed -# columns are selected. -select 1 from functional_seq_snap.complextypes_fileformat +# Queries referencing only scalar typed columns on sequence files +# are allowed. +select id from functional_seq_snap.complextypes_fileformat ---- PLAN -not implemented: Scan of table 'functional_seq_snap.complextypes_fileformat' in format 'SEQUENCE_FILE' is not supported because the table has a column 's' with a complex type 'STRUCT<f1:STRING,f2:INT>'. -Complex types are supported for these file formats: PARQUET. 
+00:SCAN HDFS [functional_seq_snap.complextypes_fileformat] + partitions=1/1 files=1 size=87B ==== # Scanning all partitions fails because there are partitions with a file format for which # complex types are not supported. The error message is abbreviated because it is
