attilapiros commented on a change in pull request #31133:
URL: https://github.com/apache/spark/pull/31133#discussion_r558594242
##########
File path:
sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala
##########
@@ -1883,6 +1883,60 @@ class HiveDDLSuite
}
}
+ test("SPARK-26836: support Avro schema evolution") {
+ withTable("t") {
+ val originalSchema =
+ """
+ |{
+ | "namespace": "test",
+ | "name": "some_schema",
+ | "type": "record",
+ | "fields": [
+ | {
+ | "name": "col2",
+ | "type": "string"
+ | }
+ | ]
+ |}
+ """.stripMargin
+ sql(
+ s"""
+ |CREATE TABLE t PARTITIONED BY (ds string)
+ |ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
+ |WITH SERDEPROPERTIES ('avro.schema.literal'='$originalSchema')
+ |STORED AS
+ |INPUTFORMAT
'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
+ |OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
+ """.stripMargin)
+ sql("INSERT INTO t partition (ds='1981-01-07') VALUES ('col2_value')")
+ val evolvedSchema =
+ """
+ |{
+ | "namespace": "test",
+ | "name": "some_schema",
+ | "type": "record",
+ | "fields": [
+ | {
+ | "name": "col1",
+ | "type": "string",
+ | "default": "col1_default"
+ | },
+ | {
+ | "name": "col2",
+ | "type": "string"
+ | }
+ | ]
+ |}
+ """.stripMargin
+ sql(s"""ALTER TABLE t SET SERDEPROPERTIES
('avro.schema.literal'='$evolvedSchema')""")
+ sql("INSERT INTO t partition (ds='1983-04-27') VALUES ('col1_value',
'col2_value')")
+ withSQLConf(SQLConf.HIVE_AVRO_SCHEMA_EVOLUTION_ENABLED.key -> "true") {
+ checkAnswer(spark.table("t"), Row("col1_default", "col2_value",
"1981-01-07")
+ :: Row("col1_value", "col2_value", "1983-04-27") :: Nil)
+ }
Review comment:
I mean there are different use cases depending on how the schema is
changed.
Here we have an example where the value in the wrong column makes the old
behavior practically unusable.
But for cases where only the default value is changed, the old behavior sounds
at least acceptable:
all the data is read back with the value which was used during the write
(including the missing values).
What about adding such a test case with
`SQLConf.HIVE_AVRO_SCHEMA_EVOLUTION_ENABLED.key -> "false"`?
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]