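[22/50] incubator-impala git commit: IMPALA-3194: Allow queries materializing scalar type columns in RC/sequence files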

tarmstrong Tue, 12 Apr 2016 14:19:05 -0700

IMPALA-3194: Allow queries materializing scalar type columns in RC/sequence files


This commit unblocks queries materializing only scalar typed
columns on tables backed by RC/sequence files containing complex
typed columns. This worked prior to 2.3.0 release.

Change-Id: I3a89b211bdc01f7e07497e293fafd75ccf0500fe
Reviewed-on: http://gerrit.cloudera.org:8080/2580
Reviewed-by: Alex Behm <[email protected]>
Tested-by: Internal Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/5cd7ada7
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/5cd7ada7
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/5cd7ada7

Branch: refs/heads/master
Commit: 5cd7ada727d04fe56d62ced2e8bfa56f4448ea57
Parents: 2809746
Author: Bharath Vissapragada <[email protected]>
Authored: Sun Mar 13 06:17:06 2016 -0700
Committer: Internal Jenkins <[email protected]>
Committed: Thu Mar 31 12:06:57 2016 +0000

----------------------------------------------------------------------
 .../cloudera/impala/catalog/HdfsFileFormat.java | 28 ++++++++++++++------
 .../cloudera/impala/planner/HdfsScanNode.java   | 14 ++++++----
 .../PlannerTest/complex-types-file-formats.test | 23 +++++++++-------
 3 files changed, 43 insertions(+), 22 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/5cd7ada7/fe/src/main/java/com/cloudera/impala/catalog/HdfsFileFormat.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/com/cloudera/impala/catalog/HdfsFileFormat.java b/fe/src/main/java/com/cloudera/impala/catalog/HdfsFileFormat.java
index 9c883fc..3670aa5 100644
--- a/fe/src/main/java/com/cloudera/impala/catalog/HdfsFileFormat.java
+++ b/fe/src/main/java/com/cloudera/impala/catalog/HdfsFileFormat.java
@@ -35,26 +35,26 @@ public enum HdfsFileFormat {
   RC_FILE("org.apache.hadoop.hive.ql.io.RCFileInputFormat",
       "org.apache.hadoop.hive.ql.io.RCFileOutputFormat",
       "org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe",
-      false),
+      false, true),
   TEXT("org.apache.hadoop.mapred.TextInputFormat",
       "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat",
       "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe",
-      false),
+      false, false),
   LZO_TEXT("com.hadoop.mapred.DeprecatedLzoTextInputFormat",
       "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat",
-      "",
-      false),
+      "", false, false),
   SEQUENCE_FILE("org.apache.hadoop.mapred.SequenceFileInputFormat",
       "org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat",
-      "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", false),
+      "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", false,
+      true),
   AVRO("org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat",
       "org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat",
       "org.apache.hadoop.hive.serde2.avro.AvroSerDe",
-      false),
+      false, false),
   PARQUET("org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat",
       "org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat",
       "org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe",
-      true);
+      true, true);
 
   private final String inputFormat_;
   private final String outputFormat_;
@@ -63,12 +63,18 @@ public enum HdfsFileFormat {
   // Indicates whether we support scanning complex types for this file format.
   private final boolean isComplexTypesSupported_;
 
+  // Indicates whether the file format can skip complex columns in scans and just
+  // materialize scalar typed columns. Ignored if isComplexTypesSupported_ is true.
+  // TODO: Remove this once we support complex types for all file formats.
+  private final boolean canSkipColumnTypes_;
+
   HdfsFileFormat(String inputFormat, String outputFormat, String serializationLib,
-      boolean isComplexTypesSupported) {
+      boolean isComplexTypesSupported, boolean canSkipColumnTypes) {
     inputFormat_ = inputFormat;
     outputFormat_ = outputFormat;
     serializationLib_ = serializationLib;
     isComplexTypesSupported_ = isComplexTypesSupported;
+    canSkipColumnTypes_ = canSkipColumnTypes;
   }
 
   public String inputFormat() { return inputFormat_; }
@@ -235,6 +241,12 @@ public enum HdfsFileFormat {
   public boolean isComplexTypesSupported() { return isComplexTypesSupported_; }
 
   /**
+   * Returns true if this file format can skip complex typed columns and materialize
+   * only scalar typed columns.
+   */
+  public boolean canSkipComplexTypes() { return canSkipColumnTypes_; }
+
+  /**
    * Returns a list with all formats for which isComplexTypesSupported() is true.
    */
   public static List<HdfsFileFormat> complexTypesFormats() {

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/5cd7ada7/fe/src/main/java/com/cloudera/impala/planner/HdfsScanNode.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/com/cloudera/impala/planner/HdfsScanNode.java b/fe/src/main/java/com/cloudera/impala/planner/HdfsScanNode.java
index c6f7722..5edc0dc 100644
--- a/fe/src/main/java/com/cloudera/impala/planner/HdfsScanNode.java
+++ b/fe/src/main/java/com/cloudera/impala/planner/HdfsScanNode.java
@@ -185,10 +185,11 @@ public class HdfsScanNode extends ScanNode {
     }
     if (firstComplexTypedCol == null) return;
 
-    boolean hasMaterializedSlots = false;
+    boolean referencesComplexTypedCol = false;
     for (SlotDescriptor slotDesc: desc_.getSlots()) {
-      if (slotDesc.isMaterialized()) {
-        hasMaterializedSlots = true;
+      if (!slotDesc.isMaterialized()) continue;
+      if (slotDesc.getType().isComplexType() || slotDesc.getColumn() == null) {
+        referencesComplexTypedCol = true;
         break;
       }
     }
@@ -196,8 +197,11 @@ public class HdfsScanNode extends ScanNode {
     for (HdfsPartition part: partitions_) {
       HdfsFileFormat format = part.getInputFormatDescriptor().getFileFormat();
       if (format.isComplexTypesSupported()) continue;
-      // Allow count(*) and similar queries on RC_FILE with complex types.
-      if (format == HdfsFileFormat.RC_FILE && !hasMaterializedSlots) continue;
+      // If the file format allows querying just scalar typed columns and the query
+      // doesn't materialize any complex typed columns, it is allowed.
+      if (format.canSkipComplexTypes() && !referencesComplexTypedCol) {
+        continue;
+      }
       String errSuffix = String.format(
           "Complex types are supported for these file formats: %s",
           Joiner.on(", ").join(HdfsFileFormat.complexTypesFormats()));

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/5cd7ada7/testdata/workloads/functional-planner/queries/PlannerTest/complex-types-file-formats.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/complex-types-file-formats.test b/testdata/workloads/functional-planner/queries/PlannerTest/complex-types-file-formats.test
index 487bb3b..f0431a2 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/complex-types-file-formats.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/complex-types-file-formats.test
@@ -38,13 +38,18 @@ select 1 from functional_rc_snap.complextypes_fileformat t, t.a
 not implemented: Scan of table 't' in format 'RC_FILE' is not supported because the table has a column 's' with a complex type 'STRUCT<f1:STRING,f2:INT>'.
 Complex types are supported for these file formats: PARQUET.
 ====
-# Complex types are not supported on RC files, even if no complex-typed
-# columns are selected.
-select id from functional_rc_snap.complextypes_fileformat
+select s.f1 from functional_rc_snap.complextypes_fileformat t, t.m
 ---- PLAN
-not implemented: Scan of table 'functional_rc_snap.complextypes_fileformat' in format 'RC_FILE' is not supported because the table has a column 's' with a complex type 'STRUCT<f1:STRING,f2:INT>'.
+not implemented: Scan of table 't' in format 'RC_FILE' is not supported because the table has a column 's' with a complex type 'STRUCT<f1:STRING,f2:INT>'.
 Complex types are supported for these file formats: PARQUET.
 ====
+# Complex types are not supported on RC files, however queries materializing
+# only scalar type columns are allowed.
+select id from functional_rc_snap.complextypes_fileformat
+---- PLAN
+00:SCAN HDFS [functional_rc_snap.complextypes_fileformat]
+   partitions=1/1 files=1 size=56B
+====
 # Complex types are not supported on RC files but count(*) and similar
 # queries should work.
 select count(*) from functional_rc_snap.complextypes_fileformat
@@ -61,12 +66,12 @@ select s.f1 from functional_seq_snap.complextypes_fileformat t, t.a
 not implemented: Scan of table 't' in format 'SEQUENCE_FILE' is not supported because the table has a column 's' with a complex type 'STRUCT<f1:STRING,f2:INT>'.
 Complex types are supported for these file formats: PARQUET.
 ====
-# Complex types are not supported on sequence files, even if no complex-typed
-# columns are selected.
-select 1 from functional_seq_snap.complextypes_fileformat
+# Queries referencing only scalar typed columns on sequence files
+# are allowed.
+select id from functional_seq_snap.complextypes_fileformat
 ---- PLAN
-not implemented: Scan of table 'functional_seq_snap.complextypes_fileformat' in format 'SEQUENCE_FILE' is not supported because the table has a column 's' with a complex type 'STRUCT<f1:STRING,f2:INT>'.
-Complex types are supported for these file formats: PARQUET.
+00:SCAN HDFS [functional_seq_snap.complextypes_fileformat]
+   partitions=1/1 files=1 size=87B
 ====
 # Scanning all partitions fails because there are partitions with a file format for which
 # complex types are not supported. The error message is abbreviated because it is

[22/50] incubator-impala git commit: IMPALA-3194: Allow queries materializing scalar type columns in RC/sequence files

Reply via email to