[
https://issues.apache.org/jira/browse/HUDI-5835?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
Tao Meng updated HUDI-5835:
---------------------------
Description:
The Avro schema created by Spark SQL will be missing the Avro name and namespace.
This causes the read schema and the write schema of the log file to be
incompatible.
{code:java}
// code placeholdertest("Test Add Column and Update Table") { withTempDir { tmp
=> val tableName = generateTableName //spark.sql("SET
hoodie.datasource.read.extract.partition.values.from.path=true") val tablePath
= new Path(tmp.getCanonicalPath, tableName) // create table spark.sql( s"""
|create table $tableName ( | id int, | name string, | price double, | ts long,
| ff decimal(38, 10) |) using hudi | location '${tablePath.toString}' |
tblproperties ( | type = 'mor', | primaryKey = 'id', | preCombineField = 'ts' |
) """.stripMargin) // insert data to table spark.sql(s"insert into $tableName
select 1, 'a1', 10, 1000, 10.0") checkAnswer(s"select id, name, price, ts from
$tableName")( Seq(1, "a1", 10.0, 1000) ) spark.sql(s"update $tableName set
price = 22 where id = 1") checkAnswer(s"select id, name, price, ts from
$tableName")( Seq(1, "a1", 22.0, 1000) ) spark.sql(s"alter table $tableName add
column new_col1 int") checkAnswer(s"select id, name, price, ts, new_col1 from
$tableName")( Seq(1, "a1", 22.0, 1000, null) ) // update and check
spark.sql(s"update $tableName set price = price * 2 where id = 1")
checkAnswer(s"select id, name, price, ts, new_col1 from $tableName")( Seq(1,
"a1", 44.0, 1000, null) ) } }
{code}
was:
avro schema create by sparksql will miss avro name and namespace,
This will lead the read schema and write schema of the log file to be
incompatible
{code:java}
// code placeholder
{code}
test("Test Add Column and Update Table") { withTempDir { tmp => val tableName
= generateTableName //spark.sql("SET
hoodie.datasource.read.extract.partition.values.from.path=true") val tablePath
= new Path(tmp.getCanonicalPath, tableName) // create table spark.sql( s"""
|create table $tableName ( | id int, | name string, | price double, | ts long,
| ff decimal(38, 10) |) using hudi | location '${tablePath.toString}' |
tblproperties ( | type = 'mor', | primaryKey = 'id', | preCombineField = 'ts' |
) """.stripMargin) // insert data to table spark.sql(s"insert into $tableName
select 1, 'a1', 10, 1000, 10.0") checkAnswer(s"select id, name, price, ts from
$tableName")( Seq(1, "a1", 10.0, 1000) ) spark.sql(s"update $tableName set
price = 22 where id = 1") checkAnswer(s"select id, name, price, ts from
$tableName")( Seq(1, "a1", 22.0, 1000) ) spark.sql(s"alter table $tableName add
column new_col1 int") checkAnswer(s"select id, name, price, ts, new_col1 from
$tableName")( Seq(1, "a1", 22.0, 1000, null) ) // update and check
spark.sql(s"update $tableName set price = price * 2 where id = 1")
checkAnswer(s"select id, name, price, ts, new_col1 from $tableName")( Seq(1,
"a1", 44.0, 1000, null) ) } }
> spark cannot read mor table after execute update statement
> ----------------------------------------------------------
>
> Key: HUDI-5835
> URL: https://issues.apache.org/jira/browse/HUDI-5835
> Project: Apache Hudi
> Issue Type: Bug
> Components: spark
> Affects Versions: 0.13.0
> Reporter: Tao Meng
> Priority: Blocker
>
> The Avro schema created by Spark SQL will be missing the Avro name and namespace.
> This causes the read schema and the write schema of the log file to be
> incompatible.
>
> {code:java}
> // code placeholdertest("Test Add Column and Update Table") { withTempDir {
> tmp => val tableName = generateTableName //spark.sql("SET
> hoodie.datasource.read.extract.partition.values.from.path=true") val
> tablePath = new Path(tmp.getCanonicalPath, tableName) // create table
> spark.sql( s""" |create table $tableName ( | id int, | name string, | price
> double, | ts long, | ff decimal(38, 10) |) using hudi | location
> '${tablePath.toString}' | tblproperties ( | type = 'mor', | primaryKey =
> 'id', | preCombineField = 'ts' | ) """.stripMargin) // insert data to table
> spark.sql(s"insert into $tableName select 1, 'a1', 10, 1000, 10.0")
> checkAnswer(s"select id, name, price, ts from $tableName")( Seq(1, "a1",
> 10.0, 1000) ) spark.sql(s"update $tableName set price = 22 where id = 1")
> checkAnswer(s"select id, name, price, ts from $tableName")( Seq(1, "a1",
> 22.0, 1000) ) spark.sql(s"alter table $tableName add column new_col1 int")
> checkAnswer(s"select id, name, price, ts, new_col1 from $tableName")( Seq(1,
> "a1", 22.0, 1000, null) ) // update and check spark.sql(s"update $tableName
> set price = price * 2 where id = 1") checkAnswer(s"select id, name, price,
> ts, new_col1 from $tableName")( Seq(1, "a1", 44.0, 1000, null) ) } }
> {code}
--
This message was sent by Atlassian Jira
(v8.20.10#820010)