This is an automated email from the ASF dual-hosted git repository.

csringhofer pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit d836246ab53e643407224d2340e30ef017d6b058
Author: Csaba Ringhofer <csringho...@cloudera.com>
AuthorDate: Mon Jan 13 18:12:48 2020 +0100

    IMPALA-9249: Fix ORC scanner crash when root type is not struct
    
    The root type should be struct as far as I know, and this was
    checked with a DCHECK, leading to crashes in fuzz tests. This
    change replaced the DCHECK with returning an error message.
    
    Testing:
    - added corrupt ORC file and e2e test
    
    Change-Id: I7fba8cffbcdf8f647e27e2d5ee9e6716a4492b9b
    Reviewed-on: http://gerrit.cloudera.org:8080/15021
    Reviewed-by: Impala Public Jenkins <impala-public-jenk...@cloudera.com>
    Tested-by: Impala Public Jenkins <impala-public-jenk...@cloudera.com>
---
 be/src/exec/hdfs-orc-scanner.cc     |   4 ++--
 be/src/exec/orc-metadata-utils.cc   |  14 +++++++++-----
 be/src/exec/orc-metadata-utils.h    |  19 ++++++++-----------
 testdata/data/README                |   4 ++++
 testdata/data/corrupt_root_type.orc | Bin 0 -> 1813 bytes
 tests/query_test/test_scanners.py   |  23 +++++++++++++++++------
 6 files changed, 40 insertions(+), 24 deletions(-)

diff --git a/be/src/exec/hdfs-orc-scanner.cc b/be/src/exec/hdfs-orc-scanner.cc
index 164feaa..e8e236a 100644
--- a/be/src/exec/hdfs-orc-scanner.cc
+++ b/be/src/exec/hdfs-orc-scanner.cc
@@ -184,8 +184,8 @@ Status HdfsOrcScanner::Open(ScannerContext* context) {
   // Build 'col_id_path_map_' that maps from ORC column ids to their 
corresponding
   // SchemaPath in the table. The map is used in the constructors of 
OrcColumnReaders
   // where we resolve SchemaPaths of the descriptors.
-  OrcMetadataUtils::BuildSchemaPaths(reader_->getType(),
-      scan_node_->num_partition_keys(), &col_id_path_map_);
+  
RETURN_IF_ERROR(schema_resolver_->BuildSchemaPaths(scan_node_->num_partition_keys(),
+      &col_id_path_map_));
 
   // To create OrcColumnReaders, we need the selected orc schema. It's a 
subset of the
   // file schema: a tree of selected orc types and can only be got from an 
orc::RowReader
diff --git a/be/src/exec/orc-metadata-utils.cc 
b/be/src/exec/orc-metadata-utils.cc
index 0dd55fd..27db46d 100644
--- a/be/src/exec/orc-metadata-utils.cc
+++ b/be/src/exec/orc-metadata-utils.cc
@@ -22,20 +22,24 @@
 
 namespace impala {
 
-void OrcMetadataUtils::BuildSchemaPaths(const orc::Type& root, int 
num_partition_keys,
+Status OrcSchemaResolver::BuildSchemaPaths(int num_partition_keys,
     vector<SchemaPath>* paths) {
+  if (root_->getKind() != orc::TypeKind::STRUCT) {
+    return Status(Substitute("Corrupt ORC File '$0': root type is $1 (should 
be struct)",
+        filename_, root_->toString()));
+  }
   SchemaPath path;
   paths->push_back(path);
-  DCHECK_EQ(root.getKind(), orc::TypeKind::STRUCT);
-  int num_columns = root.getSubtypeCount();
+  int num_columns = root_->getSubtypeCount();
   for (int i = 0; i < num_columns; ++i) {
     path.push_back(i + num_partition_keys);
-    BuildSchemaPath(*root.getSubtype(i), &path, paths);
+    BuildSchemaPath(*root_->getSubtype(i), &path, paths);
     path.pop_back();
   }
+  return Status::OK();
 }
 
-void OrcMetadataUtils::BuildSchemaPath(const orc::Type& node, SchemaPath* path,
+void OrcSchemaResolver::BuildSchemaPath(const orc::Type& node, SchemaPath* 
path,
     vector<SchemaPath>* paths) {
   DCHECK_EQ(paths->size(), node.getColumnId());
   paths->push_back(*path);
diff --git a/be/src/exec/orc-metadata-utils.h b/be/src/exec/orc-metadata-utils.h
index eac085a..d93ae6f 100644
--- a/be/src/exec/orc-metadata-utils.h
+++ b/be/src/exec/orc-metadata-utils.h
@@ -24,17 +24,6 @@
 
 namespace impala {
 
-/// Utils to build a map from each orc::Type id to a SchemaPath. The map will 
be used in
-/// creating OrcColumnReaders.
-class OrcMetadataUtils {
- public:
-  static void BuildSchemaPaths(const orc::Type& root, int num_partition_keys,
-      std::vector<SchemaPath>* paths);
- private:
-  static void BuildSchemaPath(const orc::Type& node, SchemaPath* path,
-      std::vector<SchemaPath>* paths);
-};
-
 /// Util class to resolve SchemaPaths of TupleDescriptors/SlotDescriptors into 
orc::Type.
 class OrcSchemaResolver {
  public:
@@ -47,6 +36,11 @@ class OrcSchemaResolver {
   /// 'missing_field' is set to true if the column is missing in the ORC file.
   Status ResolveColumn(const SchemaPath& col_path, const orc::Type** node,
       bool* pos_field, bool* missing_field) const;
+
+  /// Build a map from each orc::Type id to a SchemaPath. The map will be used 
in
+  /// creating OrcColumnReaders.
+  Status BuildSchemaPaths(int num_partition_keys, std::vector<SchemaPath>* 
paths);
+
  private:
   const HdfsTableDescriptor& tbl_desc_;
   const orc::Type* const root_;
@@ -55,5 +49,8 @@ class OrcSchemaResolver {
   /// Validate whether the ColumnType is compatible with the orc type
   Status ValidateType(const ColumnType& type, const orc::Type& orc_type) const
       WARN_UNUSED_RESULT;
+
+  static void BuildSchemaPath(const orc::Type& node, SchemaPath* path,
+      std::vector<SchemaPath>* paths);
 };
 }
diff --git a/testdata/data/README b/testdata/data/README
index a8f20c2..f46bc9f 100644
--- a/testdata/data/README
+++ b/testdata/data/README
@@ -458,3 +458,7 @@ Contains one row (1300-01-01 00:00:00) which is outside 
Impala's valid time rang
 
 corrupt_schema.orc:
 ORC file from IMPALA-9277, generated by fuzz test. The file contains malformed 
metadata.
+
+corrupt_root_type.orc:
+ORC file for IMPALA-9249, generated by fuzz test. The root type of the schema 
is not
+struct, which used to hit a DCHECK.
diff --git a/testdata/data/corrupt_root_type.orc 
b/testdata/data/corrupt_root_type.orc
new file mode 100644
index 0000000..b6fab22
Binary files /dev/null and b/testdata/data/corrupt_root_type.orc differ
diff --git a/tests/query_test/test_scanners.py 
b/tests/query_test/test_scanners.py
index 1d26463..b303a62 100644
--- a/tests/query_test/test_scanners.py
+++ b/tests/query_test/test_scanners.py
@@ -1334,15 +1334,26 @@ class TestOrc(ImpalaTestSuite):
       self.run_test_case('DataErrorsTest/orc-out-of-range-timestamp',
                          new_vector, unique_database)
 
-  def test_invalid_schema(self, vector, unique_database):
-    """Test scanning of ORC file with malformed schema."""
-    test_files = ["testdata/data/corrupt_schema.orc"]
+  def _run_invalid_schema_test(self, unique_database, test_name, 
expected_error):
+    """Copies 'test_name'.orc to a table and runs a simple query. These tests 
should
+       cause an error during the processing of the ORC schema, so the file's 
columns do
+       not have to match with the table's columns.
+    """
+    test_files = ["testdata/data/%s.orc" % test_name]
     create_table_and_copy_files(self.client,
         "CREATE TABLE {db}.{tbl} (id BIGINT) STORED AS ORC",
-        unique_database, "corrupt_schema", test_files)
+        unique_database, test_name, test_files)
     err = self.execute_query_expect_failure(self.client,
-        "select count(*) from {0}.{1}".format(unique_database, 
"corrupt_schema"))
-    assert "Encountered parse error during schema selection" in str(err)
+        "select count(*) from {0}.{1}".format(unique_database, test_name))
+    assert expected_error in str(err)
+
+  def test_invalid_schema(self, vector, unique_database):
+    """Test scanning of ORC file with malformed schema."""
+    self._run_invalid_schema_test(unique_database, "corrupt_schema",
+        "Encountered parse error during schema selection")
+    self._run_invalid_schema_test(unique_database, "corrupt_root_type",
+        "root type is boolean (should be struct)")
+
 
 class TestScannerReservation(ImpalaTestSuite):
   @classmethod

Reply via email to