IMPALA-3687: Prefer Avro field name during schema reconciliation

Since it is possible to create an Avro table with both column
definitions and an Avro schema, Impala attempts to reconcile
inconsistencies in the two schema definitions, generally preferring the
Avro schema. The only exception to this rule was with
CHAR/VARCHAR/STRING columns, where the column definition was preferred
in order to support tables with CHAR/VARCHAR columns although Avro only
supports STRING. This exception is confusing because the name for such a
column will be taken from the column definition (and not from the Avro
schema).

This patch prefers name, comment from Avro schema definition and
uses column type from column definition for CHAR/VARCHAR/STRING
columns.

Change-Id: Ia3e43b2885853c2b4f207a45a873c9d7f31379cd
Reviewed-on: http://gerrit.cloudera.org:8080/3331
Reviewed-by: Huaisi Xu <[email protected]>
Tested-by: Internal Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/c6ce32b3
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/c6ce32b3
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/c6ce32b3

Branch: refs/heads/master
Commit: c6ce32b3b6ae9b16f9b7921d22205c851745f410
Parents: 476f687
Author: Huaisi Xu <[email protected]>
Authored: Mon Jun 6 18:12:47 2016 -0700
Committer: Taras Bobrovytsky <[email protected]>
Committed: Thu Jul 14 19:04:43 2016 +0000

----------------------------------------------------------------------
 .../impala/util/AvroSchemaConverter.java        |   5 +
 .../cloudera/impala/util/AvroSchemaUtils.java   |  27 +++--
 .../queries/QueryTest/avro-schema-changes.test  |   2 +-
 .../QueryTest/avro-schema-resolution.test       | 110 ++++++++++++++++++-
 tests/query_test/test_avro_schema_resolution.py |   4 +-
 5 files changed, 131 insertions(+), 17 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/c6ce32b3/fe/src/main/java/com/cloudera/impala/util/AvroSchemaConverter.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/com/cloudera/impala/util/AvroSchemaConverter.java 
b/fe/src/main/java/com/cloudera/impala/util/AvroSchemaConverter.java
index 9a0077c..dce35da 100644
--- a/fe/src/main/java/com/cloudera/impala/util/AvroSchemaConverter.java
+++ b/fe/src/main/java/com/cloudera/impala/util/AvroSchemaConverter.java
@@ -40,6 +40,11 @@ import com.google.common.collect.Lists;
  * Error behavior: These functions throw an UnsupportedOperationException when 
failing
  * to generate an Impala-compatible Avro schema, e.g., because of an unknown 
type or a
  * type not supported by Impala.
+ *
+ * Behavior for TIMESTAMP:
+ * A TIMESTAMP column definition maps to an Avro STRING and is created as a 
STRING column,
+ * because Avro has no binary TIMESTAMP representation. As a result, no Avro 
table may
+ * have a TIMESTAMP column.
  */
 public class AvroSchemaConverter {
   // Arbitrarily chosen schema name and record prefix. Note that

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/c6ce32b3/fe/src/main/java/com/cloudera/impala/util/AvroSchemaUtils.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/com/cloudera/impala/util/AvroSchemaUtils.java 
b/fe/src/main/java/com/cloudera/impala/util/AvroSchemaUtils.java
index 558b569..ede6ebb 100644
--- a/fe/src/main/java/com/cloudera/impala/util/AvroSchemaUtils.java
+++ b/fe/src/main/java/com/cloudera/impala/util/AvroSchemaUtils.java
@@ -109,14 +109,10 @@ public class AvroSchemaUtils {
    * resolution policy:
    *
    * Mismatched number of columns -> Prefer Avro columns.
-   * Mismatched name/type -> Prefer Avro column, except:
-   *   A CHAR/VARCHAR column definition maps to an Avro STRING, and is 
preserved
-   *   as a CHAR/VARCHAR in the reconciled schema.
-   *
-   * Behavior for TIMESTAMP:
-   * A TIMESTAMP column definition maps to an Avro STRING and is presented as 
a STRING
-   * in the reconciled schema, because Avro has no binary TIMESTAMP 
representation.
-   * As a result, no Avro table may have a TIMESTAMP column.
+   * Always prefer Avro schema except for column type CHAR/VARCHAR/STRING:
+   *   A CHAR/VARCHAR/STRING column definition maps to an Avro STRING. The 
reconciled
+   *   column will preserve the type in the column definition but use the 
column name
+   *   and comment from the Avro schema.
    */
   public static List<ColumnDef> reconcileSchemas(
       List<ColumnDef> colDefs, List<ColumnDef> avroCols, StringBuilder 
warning) {
@@ -135,12 +131,21 @@ public class AvroSchemaUtils {
       Preconditions.checkNotNull(colDef.getType());
       Preconditions.checkNotNull(avroCol.getType());
 
-      // A CHAR/VARCHAR column definition maps to an Avro STRING, and is 
preserved
-      // as a CHAR/VARCHAR in the reconciled schema.
+      // A CHAR/VARCHAR/STRING column definition maps to an Avro STRING, and 
is preserved
+      // as a CHAR/VARCHAR/STRING in the reconciled schema. Column name and 
comment
+      // are taken from the Avro schema.
       if ((colDef.getType().isStringType() && 
avroCol.getType().isStringType())) {
         Preconditions.checkState(
             avroCol.getType().getPrimitiveType() == PrimitiveType.STRING);
-        result.add(colDef);
+        ColumnDef reconciledColDef = new ColumnDef(
+            avroCol.getColName(), colDef.getTypeDef(), avroCol.getComment());
+        try {
+          reconciledColDef.analyze();
+        } catch (AnalysisException e) {
+          Preconditions.checkNotNull(
+              null, "reconciledColDef.analyze() should never throw.");
+        }
+        result.add(reconciledColDef);
       } else {
         result.add(avroCol);
       }

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/c6ce32b3/testdata/workloads/functional-query/queries/QueryTest/avro-schema-changes.test
----------------------------------------------------------------------
diff --git 
a/testdata/workloads/functional-query/queries/QueryTest/avro-schema-changes.test
 
b/testdata/workloads/functional-query/queries/QueryTest/avro-schema-changes.test
index 7fc0bc8..e4b87c5 100644
--- 
a/testdata/workloads/functional-query/queries/QueryTest/avro-schema-changes.test
+++ 
b/testdata/workloads/functional-query/queries/QueryTest/avro-schema-changes.test
@@ -1,6 +1,6 @@
 ====
 ---- QUERY
-# Create a table with default fileformat and later change it to avro using
+# Create a table with default fileformat and later change it to Avro using
 # alter sql. The query runs with stale metadata and a warning should be raised.
 # Invalidating metadata should cause the Avro schema to be properly set upon 
the
 # next metadata load.

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/c6ce32b3/testdata/workloads/functional-query/queries/QueryTest/avro-schema-resolution.test
----------------------------------------------------------------------
diff --git 
a/testdata/workloads/functional-query/queries/QueryTest/avro-schema-resolution.test
 
b/testdata/workloads/functional-query/queries/QueryTest/avro-schema-resolution.test
index d5d8942..c640fa6 100644
--- 
a/testdata/workloads/functional-query/queries/QueryTest/avro-schema-resolution.test
+++ 
b/testdata/workloads/functional-query/queries/QueryTest/avro-schema-resolution.test
@@ -1,7 +1,7 @@
 ====
 ---- QUERY
 # see testdata/avro_schema_resolution
-select * from schema_resolution_test
+select * from functional_avro_snap.schema_resolution_test
 ---- TYPES
 boolean, int, long, float, double, string, string, string
 ---- RESULTS
@@ -15,9 +15,9 @@ false,2,2,2,2,'serialized string','','NULL'
 # IMPALA-1947: A TIMESTAMP from the column definitions results in a STRING 
column
 # backed by a stored Avro STRING during table loading.
 # See testdata/avro_schema_resolution
-select * from no_avro_schema where year = 2009 order by id limit 1
+select * from functional_avro_snap.no_avro_schema where year = 2009 order by 
id limit 1
 union all
-select * from no_avro_schema where year = 2010 order by id limit 1
+select * from functional_avro_snap.no_avro_schema where year = 2010 order by 
id limit 1
 ---- TYPES
 int, boolean, int, int, int, bigint, float, double, string, string, string, 
int, int
 ---- RESULTS: VERIFY_IS_EQUAL_SORTED
@@ -74,3 +74,107 @@ int, string
 4,'4'
 4,'4'
 ====
+---- QUERY
+# IMPALA-3687: Create a table with mismatched column definitions and Avro 
schema.
+CREATE EXTERNAL TABLE alltypesagg_mismatch_column_name_comment (
+  id INT COMMENT 'int commnet',
+  bool_col BOOLEAN COMMENT 'bool commnet',
+  tinyint_col float COMMENT 'tinyint comment',
+  smallint_col double COMMENT 'smallint comment',
+  int_col INT COMMENT 'int comment',
+  bigint_col BIGINT COMMENT 'bigint comment',
+  float_col FLOAT COMMENT 'float comment',
+  double_col DOUBLE COMMENT 'double comment',
+  date_string_col STRING COMMENT 'string comment',
+  char_col char(2) COMMENT 'char comment',
+  varchar_col varchar(5) COMMENT 'varchar comment'
+)
+STORED AS AVRO
+TBLPROPERTIES ('avro.schema.literal'= '{
+"fields": [
+{"type": ["int", "null"], "name": "id_mismatch", "doc": "int comment 
mismatch"},
+{"type": ["boolean", "null"], "name": "bool_col_mismatch", "doc": "bool 
comment mismatch"},
+{"type": ["int", "null"], "name": "tinyint_col_mismatch", "doc": "tinyint 
comment mismatch"},
+{"type": ["int", "null"], "name": "smallint_col_mismatch", "doc": "smallint 
comment mismatch"},
+{"type": ["int", "null"], "name": "int_col_mismatch", "doc": "int comment 
mismatch"},
+{"type": ["long", "null"], "name": "bigint_col_mismatch", "doc": "bigint 
comment mismatch"},
+{"type": ["float", "null"], "name": "float_col_mismatch", "doc": "float 
comment mismatch"},
+{"type": ["double", "null"], "name": "double_col_mismatch", "doc": "double 
comment mismatch"},
+{"type": ["string", "null"], "name": "date_string_col_mismatch", "doc": 
"string comment mismatch"},
+{"type": ["string", "null"], "name": "char_col_mismatch", "doc": "char comment 
mismatch"},
+{"type": ["string", "null"], "name": "varchar_col_mismatch", "doc": "varchar 
comment mismatch"}],
+"type": "record", "name": "a"}')
+====
+---- QUERY
+# We favor Avro schema definition over column definitions except for 
CHAR/VARCHAR, for
+# which we use the type from the column definition.
+DESCRIBE alltypesagg_mismatch_column_name_comment
+---- RESULTS
+'id_mismatch','int','int comment mismatch'
+'bool_col_mismatch','boolean','bool comment mismatch'
+'tinyint_col_mismatch','int','tinyint comment mismatch'
+'smallint_col_mismatch','int','smallint comment mismatch'
+'int_col_mismatch','int','int comment mismatch'
+'bigint_col_mismatch','bigint','bigint comment mismatch'
+'float_col_mismatch','float','float comment mismatch'
+'double_col_mismatch','double','double comment mismatch'
+'date_string_col_mismatch','string','string comment mismatch'
+'char_col_mismatch','char(2)','char comment mismatch'
+'varchar_col_mismatch','varchar(5)','varchar comment mismatch'
+---- TYPES
+string, string, string
+====
+---- QUERY
+# IMPALA-3687: Create a table with only Avro schema and later alter it to be a 
mismatched
+# schema.
+CREATE EXTERNAL TABLE alltypesagg_alter_avro_name_comment
+STORED AS AVRO
+TBLPROPERTIES ('avro.schema.literal'= '{
+"fields": [
+{"type": ["int", "null"], "name": "id", "doc": "int comment"},
+{"type": ["boolean", "null"], "name": "bool_col", "doc": "bool comment"},
+{"type": ["int", "null"], "name": "tinyint_col", "doc": "tinyint comment"},
+{"type": ["int", "null"], "name": "smallint_col", "doc": "smallint comment"},
+{"type": ["int", "null"], "name": "int_col", "doc": "int comment"},
+{"type": ["long", "null"], "name": "bigint_col", "doc": "bigint comment"},
+{"type": ["float", "null"], "name": "float_col", "doc": "float comment"},
+{"type": ["double", "null"], "name": "double_col", "doc": "double comment"},
+{"type": ["string", "null"], "name": "date_string_col", "doc": "string 
comment"},
+{"type": ["string", "null"], "name": "char_col", "doc": "char comment"},
+{"type": ["string", "null"], "name": "varchar_col", "doc": "varchar comment"}],
+"type": "record", "name": "a"}');
+
+ALTER TABLE alltypesagg_alter_avro_name_comment SET TBLPROPERTIES 
('avro.schema.literal'= '{
+"fields": [
+{"type": ["int", "null"], "name": "id_mismatch", "doc": "int comment 
mismatch"},
+{"type": ["boolean", "null"], "name": "bool_col_mismatch", "doc": "bool 
comment mismatch"},
+{"type": ["int", "null"], "name": "tinyint_col_mismatch", "doc": "tinyint 
comment mismatch"},
+{"type": ["int", "null"], "name": "smallint_col_mismatch", "doc": "smallint 
comment mismatch"},
+{"type": ["int", "null"], "name": "int_col_mismatch", "doc": "int comment 
mismatch"},
+{"type": ["long", "null"], "name": "bigint_col_mismatch", "doc": "bigint 
comment mismatch"},
+{"type": ["float", "null"], "name": "float_col_mismatch", "doc": "float 
comment mismatch"},
+{"type": ["double", "null"], "name": "double_col_mismatch", "doc": "double 
comment mismatch"},
+{"type": ["string", "null"], "name": "date_string_col_mismatch", "doc": 
"string comment mismatch"},
+{"type": ["string", "null"], "name": "char_col_mismatch", "doc": "char comment 
mismatch"},
+{"type": ["string", "null"], "name": "varchar_col_mismatch", "doc": "varchar 
comment mismatch"}],
+"type": "record", "name": "a"}');
+
+REFRESH alltypesagg_alter_avro_name_comment;
+====
+---- QUERY
+DESCRIBE alltypesagg_alter_avro_name_comment
+---- RESULTS
+'id_mismatch','int','int comment mismatch'
+'bool_col_mismatch','boolean','bool comment mismatch'
+'tinyint_col_mismatch','int','tinyint comment mismatch'
+'smallint_col_mismatch','int','smallint comment mismatch'
+'int_col_mismatch','int','int comment mismatch'
+'bigint_col_mismatch','bigint','bigint comment mismatch'
+'float_col_mismatch','float','float comment mismatch'
+'double_col_mismatch','double','double comment mismatch'
+'date_string_col_mismatch','string','string comment mismatch'
+'char_col_mismatch','string','char comment mismatch'
+'varchar_col_mismatch','string','varchar comment mismatch'
+---- TYPES
+string, string, string
+====

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/c6ce32b3/tests/query_test/test_avro_schema_resolution.py
----------------------------------------------------------------------
diff --git a/tests/query_test/test_avro_schema_resolution.py 
b/tests/query_test/test_avro_schema_resolution.py
index 5754c83..527181c 100644
--- a/tests/query_test/test_avro_schema_resolution.py
+++ b/tests/query_test/test_avro_schema_resolution.py
@@ -20,8 +20,8 @@ class TestAvroSchemaResolution(ImpalaTestSuite):
         v.get_value('table_format').file_format == 'avro' and\
         v.get_value('table_format').compression_codec == 'snap')
 
-  def test_avro_schema_resolution(self, vector):
-    self.run_test_case('QueryTest/avro-schema-resolution', vector)
+  def test_avro_schema_resolution(self, vector, unique_database):
+    self.run_test_case('QueryTest/avro-schema-resolution', vector, 
unique_database)
 
   def test_avro_c_lib_unicode_nulls(self, vector):
     """Test for IMPALA-1136 and IMPALA-2161 and unicode characters in the

Reply via email to