[
https://issues.apache.org/jira/browse/HUDI-5835?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
Tao Meng updated HUDI-5835:
---------------------------
Description:
The Avro schema created by Spark SQL will be missing the Avro name and namespace.
This will cause the read schema and the write schema of the log file to be
incompatible.
{code:java}
// code placeholder
{code}
test("Test Add Column and Update Table") { withTempDir { tmp => val tableName
= generateTableName //spark.sql("SET
hoodie.datasource.read.extract.partition.values.from.path=true") val tablePath
= new Path(tmp.getCanonicalPath, tableName) // create table spark.sql( s"""
|create table $tableName ( | id int, | name string, | price double, | ts long,
| ff decimal(38, 10) |) using hudi | location '${tablePath.toString}' |
tblproperties ( | type = 'mor', | primaryKey = 'id', | preCombineField = 'ts' |
) """.stripMargin) // insert data to table spark.sql(s"insert into $tableName
select 1, 'a1', 10, 1000, 10.0") checkAnswer(s"select id, name, price, ts from
$tableName")( Seq(1, "a1", 10.0, 1000) ) spark.sql(s"update $tableName set
price = 22 where id = 1") checkAnswer(s"select id, name, price, ts from
$tableName")( Seq(1, "a1", 22.0, 1000) ) spark.sql(s"alter table $tableName add
column new_col1 int") checkAnswer(s"select id, name, price, ts, new_col1 from
$tableName")( Seq(1, "a1", 22.0, 1000, null) ) // update and check
spark.sql(s"update $tableName set price = price * 2 where id = 1")
checkAnswer(s"select id, name, price, ts, new_col1 from $tableName")( Seq(1,
"a1", 44.0, 1000, null) ) } }
was:
The Avro schema created by Spark SQL will be missing the Avro name and namespace.
This will cause the read schema and the write schema of the log file to be
incompatible.
test("Test Add Column and Update Table") {
withTempDir { tmp =>
val tableName = generateTableName
//spark.sql("SET
hoodie.datasource.read.extract.partition.values.from.path=true")
val tablePath = new Path(tmp.getCanonicalPath, tableName)
// create table
spark.sql(
s"""
|create table $tableName (
| id int,
| name string,
| price double,
| ts long,
| ff decimal(38, 10)
|) using hudi
| location '${tablePath.toString}'
| tblproperties (
| type = 'mor',
| primaryKey = 'id',
| preCombineField = 'ts'
| )
""".stripMargin)
// insert data to table
spark.sql(s"insert into $tableName select 1, 'a1', 10, 1000, 10.0")
checkAnswer(s"select id, name, price, ts from $tableName")(
Seq(1, "a1", 10.0, 1000)
)
spark.sql(s"update $tableName set price = 22 where id = 1")
checkAnswer(s"select id, name, price, ts from $tableName")(
Seq(1, "a1", 22.0, 1000)
)
spark.sql(s"alter table $tableName add column new_col1 int")
checkAnswer(s"select id, name, price, ts, new_col1 from $tableName")(
Seq(1, "a1", 22.0, 1000, null)
)
// update and check
spark.sql(s"update $tableName set price = price * 2 where id = 1")
checkAnswer(s"select id, name, price, ts, new_col1 from $tableName")(
Seq(1, "a1", 44.0, 1000, null)
)
}
}
> spark cannot read mor table after execute update statement
> ----------------------------------------------------------
>
> Key: HUDI-5835
> URL: https://issues.apache.org/jira/browse/HUDI-5835
> Project: Apache Hudi
> Issue Type: Bug
> Components: spark
> Affects Versions: 0.13.0
> Reporter: Tao Meng
> Priority: Blocker
>
> The Avro schema created by Spark SQL will be missing the Avro name and namespace.
> This will cause the read schema and the write schema of the log file to be
> incompatible.
>
> {code:java}
> // code placeholder
> {code}
> test("Test Add Column and Update Table") { withTempDir { tmp => val
> tableName = generateTableName //spark.sql("SET
> hoodie.datasource.read.extract.partition.values.from.path=true") val
> tablePath = new Path(tmp.getCanonicalPath, tableName) // create table
> spark.sql( s""" |create table $tableName ( | id int, | name string, | price
> double, | ts long, | ff decimal(38, 10) |) using hudi | location
> '${tablePath.toString}' | tblproperties ( | type = 'mor', | primaryKey =
> 'id', | preCombineField = 'ts' | ) """.stripMargin) // insert data to table
> spark.sql(s"insert into $tableName select 1, 'a1', 10, 1000, 10.0")
> checkAnswer(s"select id, name, price, ts from $tableName")( Seq(1, "a1",
> 10.0, 1000) ) spark.sql(s"update $tableName set price = 22 where id = 1")
> checkAnswer(s"select id, name, price, ts from $tableName")( Seq(1, "a1",
> 22.0, 1000) ) spark.sql(s"alter table $tableName add column new_col1 int")
> checkAnswer(s"select id, name, price, ts, new_col1 from $tableName")( Seq(1,
> "a1", 22.0, 1000, null) ) // update and check spark.sql(s"update $tableName
> set price = price * 2 where id = 1") checkAnswer(s"select id, name, price,
> ts, new_col1 from $tableName")( Seq(1, "a1", 44.0, 1000, null) ) } }
--
This message was sent by Atlassian Jira
(v8.20.10#820010)