[impala] 06/06: IMPALA-8369 (part 4): Hive 3: fixes for functional dataset loading

joemcdonnell Wed, 15 May 2019 10:08:59 -0700

This is an automated email from the ASF dual-hosted git repository.

joemcdonnell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git


commit 3567a2b5d4f797d0d48e37efc0126d022cb6a189
Author: Todd Lipcon <t...@apache.org>
AuthorDate: Fri May 3 17:05:52 2019 -0700

    IMPALA-8369 (part 4): Hive 3: fixes for functional dataset loading
    
    This fixes three issues for functional dataset loading:
    
    - Works around HIVE-21675, a bug in which 'CREATE VIEW IF NOT EXISTS'
      does not function correctly in our current Hive build. This has been
      fixed already, but the workaround is pretty simple, and actually the
      'drop and recreate' pattern is used more widely for data-loading than
      the 'create if not exists' one.
    
    - Moves the creation of the 'hive_index' index (and its backing table,
      'hive_index_tbl') from load-dependent-tables.sql to a new
      load-dependent-tables-hive2.sql file which is only executed on Hive 2,
      since Hive 3 no longer supports CREATE INDEX.
    
    - Moving from MR to Tez execution changed the behavior of data loading
      by disabling the auto-merging of small files. With Hive-on-MR, this
      behavior defaulted to true, but with Hive-on-Tez it defaults to false.
      The change is likely motivated by the fact that Tez automatically
      groups small splits on the _input_ side and thus is less likely to
      produce lots of small files. However, that grouping functionality
      doesn't work properly in localhost clusters (TEZ-3310) so we aren't
      seeing the benefit. So, this patch enables the post-process merging of
      small files.
    
      Prior to this change, the 'alltypesaggmultifilesnopart' test table was
      getting 40+ files inside it, which broke various planner tests. With
      the change, it gets the expected 4 files.
    
    Change-Id: Ic34930dc064da3136dde4e01a011d14db6a74ecd
    Reviewed-on: http://gerrit.cloudera.org:8080/13251
    Reviewed-by: Impala Public Jenkins <impala-public-jenk...@cloudera.com>
    Tested-by: Impala Public Jenkins <impala-public-jenk...@cloudera.com>
---
 .../catalog/CatalogObjectToFromThriftTest.java     |  1 +
 fe/src/test/resources/hive-site.xml.py             | 19 +++++++++++++-
 testdata/bin/create-load-data.sh                   |  5 ++++
 testdata/bin/load-dependent-tables-hive2.sql       | 30 ++++++++++++++++++++++
 testdata/bin/load-dependent-tables.sql             | 10 ++------
 .../functional/functional_schema_template.sql      |  6 +++--
 6 files changed, 60 insertions(+), 11 deletions(-)

diff --git 
a/fe/src/test/java/org/apache/impala/catalog/CatalogObjectToFromThriftTest.java 
b/fe/src/test/java/org/apache/impala/catalog/CatalogObjectToFromThriftTest.java
index 7c5c576..0431373 100644
--- 
a/fe/src/test/java/org/apache/impala/catalog/CatalogObjectToFromThriftTest.java
+++ 
b/fe/src/test/java/org/apache/impala/catalog/CatalogObjectToFromThriftTest.java
@@ -218,6 +218,7 @@ public class CatalogObjectToFromThriftTest {
         "Skipping this test since it is only supported when running against 
Hive-2",
         TestUtils.getHiveMajorVersion() == 2);
     Table table = catalog_.getOrLoadTable("functional", "hive_index_tbl");
+    Assert.assertNotNull(table);
     TTable thriftTable = getThriftTable(table);
     Assert.assertEquals(thriftTable.tbl_name, "hive_index_tbl");
     Assert.assertEquals(thriftTable.db_name, "functional");
diff --git a/fe/src/test/resources/hive-site.xml.py 
b/fe/src/test/resources/hive-site.xml.py
index 65d65e4..0124a56 100644
--- a/fe/src/test/resources/hive-site.xml.py
+++ b/fe/src/test/resources/hive-site.xml.py
@@ -84,11 +84,28 @@ if hive_major_version >= 3:
    # We run YARN with Tez on the classpath directly
    'tez.ignore.lib.uris': 'true',
    'tez.use.cluster.hadoop-libs': 'true',
+
    # Some of the tests change the columns in a incompatible manner
    # (eg. string to timestamp) this is disallowed by default in Hive-3 which 
causes
    # these tests to fail. We disable this behavior in minicluster to keep 
running the
    # same tests on both hms-2 and hms-3
-   'hive.metastore.disallow.incompatible.col.type.changes': 'false'
+   'hive.metastore.disallow.incompatible.col.type.changes': 'false',
+
+   # Group input splits to run in a small number of mappers, and merge small
+   # files at the end of jobs if necessary, to be more similar to the legacy
+   # MR execution defaults. This helps ensure that we produce the same
+   # dataload results with Hive2-MR vs Hive3-Tez.
+   #
+   # NOTE: This currently doesn't seem to take effect on our pseudo-distributed
+   # test cluster, because the hostname is 'localhost' and some Tez code path
+   # gets triggered which ignores the min-size parameter. See TEZ-3310.
+   'tez.grouping.min-size': 256 * 1024 * 1024,
+
+   # Instead, we use post-process merging to make sure that we merge files
+   # where possible at the end of jobs.
+   # TODO(todd) re-evaluate whether this is necessary once TEZ-3310 is fixed
+   # (see above).
+   'hive.merge.tezfiles': 'true',
   })
 else:
   CONFIG.update({
diff --git a/testdata/bin/create-load-data.sh b/testdata/bin/create-load-data.sh
index c2122d0..74f0f63 100755
--- a/testdata/bin/create-load-data.sh
+++ b/testdata/bin/create-load-data.sh
@@ -390,6 +390,11 @@ function copy-and-load-dependent-tables {
   # TODO: Find a good way to integrate this with the normal data loading 
scripts
   beeline -n $USER -u "${JDBC_URL}" -f\
     ${IMPALA_HOME}/testdata/bin/load-dependent-tables.sql
+
+  if [[ "$IMPALA_HIVE_MAJOR_VERSION" == "2" ]]; then
+    beeline -n $USER -u "${JDBC_URL}" -f\
+      ${IMPALA_HOME}/testdata/bin/load-dependent-tables-hive2.sql
+  fi
 }
 
 function create-internal-hbase-table {
diff --git a/testdata/bin/load-dependent-tables-hive2.sql 
b/testdata/bin/load-dependent-tables-hive2.sql
new file mode 100644
index 0000000..0585fc6
--- /dev/null
+++ b/testdata/bin/load-dependent-tables-hive2.sql
@@ -0,0 +1,30 @@
+-- Licensed to the Apache Software Foundation (ASF) under one
+-- or more contributor license agreements.  See the NOTICE file
+-- distributed with this work for additional information
+-- regarding copyright ownership.  The ASF licenses this file
+-- to you under the Apache License, Version 2.0 (the
+-- "License"); you may not use this file except in compliance
+-- with the License.  You may obtain a copy of the License at
+--
+--   http://www.apache.org/licenses/LICENSE-2.0
+--
+-- Unless required by applicable law or agreed to in writing,
+-- software distributed under the License is distributed on an
+-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+-- KIND, either express or implied.  See the License for the
+-- specific language governing permissions and limitations
+-- under the License.
+
+-- Create and load tables that depend upon data in the hive test-warehouse
+-- already existing.
+--
+-- The queries in this file will only be executed on Hive 2 (and not later
+-- versions).
+
+
+USE functional;
+DROP INDEX IF EXISTS hive_index ON alltypes;
+CREATE INDEX hive_index ON TABLE alltypes (int_col)
+AS 'org.apache.hadoop.hive.ql.index.compact.CompactIndexHandler'
+WITH DEFERRED REBUILD IN TABLE hive_index_tbl;
+
diff --git a/testdata/bin/load-dependent-tables.sql 
b/testdata/bin/load-dependent-tables.sql
index d4ff102..a75c4af 100644
--- a/testdata/bin/load-dependent-tables.sql
+++ b/testdata/bin/load-dependent-tables.sql
@@ -106,11 +106,5 @@ TBLPROPERTIES ('avro.schema.literal'='{"type":"record",
 
 ---- Unsupported Impala table types
 USE functional;
-CREATE VIEW IF NOT EXISTS hive_view AS SELECT 1 AS int_col FROM alltypes limit 
1;
-
-USE functional;
-DROP INDEX IF EXISTS hive_index ON alltypes;
-CREATE INDEX hive_index ON TABLE alltypes (int_col)
-AS 'org.apache.hadoop.hive.ql.index.compact.CompactIndexHandler'
-WITH DEFERRED REBUILD IN TABLE hive_index_tbl;
-
+DROP VIEW IF EXISTS hive_view;
+CREATE VIEW hive_view AS SELECT 1 AS int_col FROM alltypes limit 1;
diff --git a/testdata/datasets/functional/functional_schema_template.sql 
b/testdata/datasets/functional/functional_schema_template.sql
index f6818ff..187f478 100644
--- a/testdata/datasets/functional/functional_schema_template.sql
+++ b/testdata/datasets/functional/functional_schema_template.sql
@@ -993,15 +993,17 @@ functional
 alltypes_hive_view
 ---- CREATE_HIVE
 -- Test that Impala can handle incorrect column metadata created by Hive 
(IMPALA-994).
+DROP VIEW IF EXISTS {db_name}{db_suffix}.{table_name};
 -- Beeline cannot handle the stmt below when broken up into multiple lines.
-CREATE VIEW IF NOT EXISTS {db_name}{db_suffix}.{table_name} AS SELECT * FROM 
{db_name}{db_suffix}.alltypes;
+CREATE VIEW {db_name}{db_suffix}.{table_name} AS SELECT * FROM 
{db_name}{db_suffix}.alltypes;
 ====
 ---- DATASET
 functional
 ---- BASE_TABLE_NAME
 alltypes_view_sub
 ---- CREATE
-CREATE VIEW IF NOT EXISTS {db_name}{db_suffix}.{table_name} (x, y, z)
+DROP VIEW IF EXISTS {db_name}{db_suffix}.{table_name};
+CREATE VIEW {db_name}{db_suffix}.{table_name} (x, y, z)
 AS SELECT int_col, string_col, timestamp_col FROM 
{db_name}{db_suffix}.alltypes;
 ---- LOAD
 ====

[impala] 06/06: IMPALA-8369 (part 4): Hive 3: fixes for functional dataset loading

Reply via email to