IMPALA-3092: Set default value to NULL in AvroSchemaConverter

This change ensures that Avro tables created without column definitions
remain queryable if columns are added via ALTER TABLE. The bug was that
when synthesizing an Avro schema from the column definitions we used to
not add default values.

Change-Id: Ib86e9ba1f4329b285ae14ee299365f7291a7410e
Reviewed-on: http://gerrit.cloudera.org:8080/3219
Reviewed-by: Alex Behm <[email protected]>
Tested-by: Internal Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/816735a0
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/816735a0
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/816735a0

Branch: refs/heads/master
Commit: 816735a032daff77da089735faab68904a803fd4
Parents: 98d7b8a
Author: Huaisi Xu <[email protected]>
Authored: Wed May 25 13:44:34 2016 -0700
Committer: Tim Armstrong <[email protected]>
Committed: Tue May 31 23:32:11 2016 -0700

----------------------------------------------------------------------
 .../impala/util/AvroSchemaConverter.java        |  7 +-
 .../queries/QueryTest/avro-schema-changes.test  | 69 ++++++++++++++++++++
 .../queries/QueryTest/avro-stale-schema.test    | 36 ----------
 tests/query_test/test_avro_schema_resolution.py | 12 ++--
 4 files changed, 79 insertions(+), 45 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/816735a0/fe/src/main/java/com/cloudera/impala/util/AvroSchemaConverter.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/com/cloudera/impala/util/AvroSchemaConverter.java 
b/fe/src/main/java/com/cloudera/impala/util/AvroSchemaConverter.java
index 504d263..9a0077c 100644
--- a/fe/src/main/java/com/cloudera/impala/util/AvroSchemaConverter.java
+++ b/fe/src/main/java/com/cloudera/impala/util/AvroSchemaConverter.java
@@ -18,7 +18,9 @@ import java.util.List;
 
 import org.apache.avro.Schema;
 import org.apache.hadoop.hive.metastore.api.FieldSchema;
+import org.codehaus.jackson.JsonNode;
 import org.codehaus.jackson.node.IntNode;
+import org.codehaus.jackson.node.JsonNodeFactory;
 
 import com.cloudera.impala.analysis.ColumnDef;
 import com.cloudera.impala.catalog.ArrayType;
@@ -99,6 +101,7 @@ public class AvroSchemaConverter {
   private Schema convertFieldSchemasImpl(
       List<FieldSchema> fieldSchemas, String schemaName) {
     List<Schema.Field> avroFields = Lists.newArrayList();
+    JsonNode nullDefault = JsonNodeFactory.instance.nullNode();
     for (FieldSchema fs: fieldSchemas) {
       Type impalaType = Type.parseColumnType(fs.getType());
       if (impalaType == null) {
@@ -106,7 +109,7 @@ public class AvroSchemaConverter {
             fs.getType() + " is not a suppported Impala type");
       }
       final Schema.Field avroField = new Schema.Field(fs.getName(),
-          createAvroSchema(impalaType), fs.getComment(), null);
+          createAvroSchema(impalaType), fs.getComment(), nullDefault);
       avroFields.add(avroField);
     }
     return createAvroRecord(avroFields, schemaName);
@@ -136,7 +139,7 @@ public class AvroSchemaConverter {
     }
     // Make the Avro schema nullable.
     Schema nullSchema = Schema.create(Schema.Type.NULL);
-    return Schema.createUnion(Arrays.asList(schema, nullSchema));
+    return Schema.createUnion(Arrays.asList(nullSchema, schema));
   }
 
   private Schema createScalarSchema(ScalarType impalaScalarType) {

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/816735a0/testdata/workloads/functional-query/queries/QueryTest/avro-schema-changes.test
----------------------------------------------------------------------
diff --git 
a/testdata/workloads/functional-query/queries/QueryTest/avro-schema-changes.test
 
b/testdata/workloads/functional-query/queries/QueryTest/avro-schema-changes.test
new file mode 100644
index 0000000..7fc0bc8
--- /dev/null
+++ 
b/testdata/workloads/functional-query/queries/QueryTest/avro-schema-changes.test
@@ -0,0 +1,69 @@
+====
+---- QUERY
+# Create a table with default fileformat and later change it to avro using
+# alter sql. The query runs with stale metadata and a warning should be raised.
+# Invalidating metadata should cause the Avro schema to be properly set upon 
the
+# next metadata load.
+CREATE EXTERNAL TABLE alltypesagg_staleschema (
+  id INT,
+  bool_col BOOLEAN,
+  tinyint_col INT,
+  smallint_col INT,
+  int_col INT,
+  bigint_col BIGINT,
+  float_col FLOAT,
+  double_col DOUBLE,
+  date_string_col STRING,
+  string_col STRING,
+  timestamp_col STRING
+)
+LOCATION 
'$FILESYSTEM_PREFIX/test-warehouse/alltypesaggmultifilesnopart_avro_snap'
+TBLPROPERTIES ('avro.schema.url'= 
'$FILESYSTEM_PREFIX/test-warehouse/avro_schemas/functional/alltypesaggmultifilesnopart.json')
+====
+---- QUERY
+alter table alltypesagg_staleschema set fileformat avro
+====
+---- QUERY
+select count(*) from alltypesagg_staleschema
+---- CATCH
+Missing Avro schema in scan node. This could be due to stale metadata.
+====
+---- QUERY
+invalidate metadata alltypesagg_staleschema
+====
+---- QUERY
+select count(*) from alltypesagg_staleschema
+---- RESULTS
+11000
+---- TYPES
+bigint
+====
+---- QUERY
+# IMPALA-3092. Create an Avro table without column definitions and add columns 
via ALTER
+# TABLE. Querying the table should work.
+CREATE EXTERNAL TABLE avro_alter_table_add_new_column (
+a string,
+b string)
+STORED AS AVRO
+LOCATION '$FILESYSTEM_PREFIX/test-warehouse/tinytable_avro';
+
+ALTER TABLE avro_alter_table_add_new_column ADD COLUMNS (
+bool_col boolean,
+int_col int,
+bigint_col bigint,
+float_col float,
+double_col double,
+timestamp_col timestamp,
+decimal_col decimal(2,0),
+string_col string)
+====
+---- QUERY
+# Every new column just added should have NULL filled
+select * from avro_alter_table_add_new_column
+---- RESULTS
+'aaaaaaa','bbbbbbb',NULL,NULL,NULL,NULL,NULL,'NULL',NULL,'NULL'
+'ccccc','dddd',NULL,NULL,NULL,NULL,NULL,'NULL',NULL,'NULL'
+'eeeeeeee','f',NULL,NULL,NULL,NULL,NULL,'NULL',NULL,'NULL'
+---- TYPES
+string, string, boolean, int, bigint, float, double, string, decimal, string
+====

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/816735a0/testdata/workloads/functional-query/queries/QueryTest/avro-stale-schema.test
----------------------------------------------------------------------
diff --git 
a/testdata/workloads/functional-query/queries/QueryTest/avro-stale-schema.test 
b/testdata/workloads/functional-query/queries/QueryTest/avro-stale-schema.test
deleted file mode 100644
index 6375ff6..0000000
--- 
a/testdata/workloads/functional-query/queries/QueryTest/avro-stale-schema.test
+++ /dev/null
@@ -1,36 +0,0 @@
-====
----- QUERY
-CREATE EXTERNAL TABLE alltypesagg_staleschema (
-  id INT,
-  bool_col BOOLEAN,
-  tinyint_col INT,
-  smallint_col INT,
-  int_col INT,
-  bigint_col BIGINT,
-  float_col FLOAT,
-  double_col DOUBLE,
-  date_string_col STRING,
-  string_col STRING,
-  timestamp_col STRING
-)
-LOCATION '/test-warehouse/alltypesaggmultifilesnopart_avro_snap'
-TBLPROPERTIES ('avro.schema.url'= 
'/test-warehouse/avro_schemas/functional/alltypesaggmultifilesnopart.json')
-====
----- QUERY
-alter table alltypesagg_staleschema set fileformat avro
-====
----- QUERY
-select count(*) from alltypesagg_staleschema
----- CATCH
-Missing Avro schema in scan node. This could be due to stale metadata.
-====
----- QUERY
-invalidate metadata alltypesagg_staleschema
-====
----- QUERY
-select count(*) from alltypesagg_staleschema
----- RESULTS
-11000
----- TYPES
-bigint
-====

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/816735a0/tests/query_test/test_avro_schema_resolution.py
----------------------------------------------------------------------
diff --git a/tests/query_test/test_avro_schema_resolution.py 
b/tests/query_test/test_avro_schema_resolution.py
index 21d65bd..ff955ce 100644
--- a/tests/query_test/test_avro_schema_resolution.py
+++ b/tests/query_test/test_avro_schema_resolution.py
@@ -42,12 +42,10 @@ class TestAvroSchemaResolution(ImpalaTestSuite):
     """
     self.run_test_case('QueryTest/avro-schema-resolution', vector)
 
-  def test_avro_stale_schema(self, vector, unique_database):
-    """Test for IMPALA-3314 and IMPALA-3513. Impalad shouldn't crash with 
stale avro
+  def test_avro_schema_changes(self, vector, unique_database):
+    """Test for IMPALA-3314 and IMPALA-3513: Impalad shouldn't crash with 
stale avro
     metadata. Instead, should provide a meaningful error message.
+    Test for IMPALA-3092: Impala should be able to query an avro table after 
ALTER TABLE
+    ... ADD COLUMN ...
     """
-    # Create a table with default fileformat and later change it to avro using
-    # alter sql. The query runs with stale metadata and a warning should be 
raised.
-    # Invalidating metadata should cause the Avro schema to be properly set 
upon the
-    # next metadata load.
-    self.run_test_case('QueryTest/avro-stale-schema', vector, unique_database)
+    self.run_test_case('QueryTest/avro-schema-changes', vector, 
unique_database)

Reply via email to