sivabalan narayanan created HUDI-3347:
-----------------------------------------

             Summary: Updating table schema fails w/ hms mode
                 Key: HUDI-3347
                 URL: https://issues.apache.org/jira/browse/HUDI-3347
             Project: Apache Hudi
          Issue Type: Task
          Components: hive-sync
            Reporter: sivabalan narayanan


When the table schema is upgraded by a new batch of writes, HMS mode sync fails.

 

Steps to reproduce using our Docker demo setup:

I used 0.10.0 to test this out.

adhoc-1
{code:java}
$SPARK_INSTALL/bin/spark-shell --jars $HUDI_SPARK_BUNDLE --master local[2] 
--driver-class-path $HADOOP_CONF_DIR --conf 
spark.sql.hive.convertMetastoreParquet=false --deploy-mode client 
--driver-memory 1G --executor-memory 3G --num-executors 1 --packages 
org.apache.spark:spark-avro_2.11:2.4.4 {code}
{code:java}
import java.sql.Timestamp
import spark.implicits._
import org.apache.hudi.QuickstartUtils._
import scala.collection.JavaConversions._
import org.apache.spark.sql.SaveMode._
import org.apache.hudi.DataSourceReadOptions._
import org.apache.hudi.DataSourceWriteOptions._
import org.apache.hudi.config.HoodieWriteConfig._
val df1 = Seq(
        ("row1", 1, "part1" ,1578283932000L ),
        ("row2", 1, "part1", 1578283942000L)
      ).toDF("row", "ppath", "preComb","eventTime")
 df1.write.format("hudi").
        options(getQuickstartWriteConfigs).
        option(PRECOMBINE_FIELD_OPT_KEY, "preComb").
        option(RECORDKEY_FIELD_OPT_KEY, "row").
        option(PARTITIONPATH_FIELD_OPT_KEY, "ppath").
        
option("hoodie.datasource.write.keygenerator.class","org.apache.hudi.keygen.TimestampBasedKeyGenerator").
        
option("hoodie.deltastreamer.keygen.timebased.timestamp.type","EPOCHMILLISECONDS").
        
option("hoodie.deltastreamer.keygen.timebased.output.dateformat","yyyy/MM/dd").
        option("hoodie.deltastreamer.keygen.timebased.timezone","GMT+8:00").
        option("hoodie.datasource.hive_sync.mode","hms").
        option("hoodie.datasource.hive_sync.database","default").
        option("hoodie.datasource.hive_sync.table","timestamp_tbl1").
        option("hoodie.datasource.hive_sync.partition_fields","year,month,day").
        option("hoodie.datasource.hive_sync.enable","true").
        option(TABLE_NAME, "timestamp_tbl1").
        mode(Overwrite).
        save("/tmp/hudi_timestamp_tbl1")
 {code}
// evolve the schema (second batch adds a new column "randomStr")
{code:java}
val df2 = Seq(
        ("row1", 1, "part1" ,1678283932000L, "abcd" ),
        ("row2", 1, "part1", 1678283942000L, "defg")
      ).toDF("row", "ppath", "preComb", "eventTime", "randomStr")
 df2.write.format("hudi").
        options(getQuickstartWriteConfigs).
        option(PRECOMBINE_FIELD_OPT_KEY, "preComb").
        option(RECORDKEY_FIELD_OPT_KEY, "row").
        option(PARTITIONPATH_FIELD_OPT_KEY, "ppath").
        
option("hoodie.datasource.write.keygenerator.class","org.apache.hudi.keygen.TimestampBasedKeyGenerator").
        
option("hoodie.deltastreamer.keygen.timebased.timestamp.type","EPOCHMILLISECONDS").
        
option("hoodie.deltastreamer.keygen.timebased.output.dateformat","yyyy/MM/dd").
        option("hoodie.deltastreamer.keygen.timebased.timezone","GMT+8:00").
        option("hoodie.datasource.hive_sync.mode","hms").
        option("hoodie.datasource.hive_sync.database","default").
        option("hoodie.datasource.hive_sync.table","timestamp_tbl1").
        option("hoodie.datasource.hive_sync.partition_fields","year,month,day").
        option("hoodie.datasource.hive_sync.enable","true").
        option(TABLE_NAME, "timestamp_tbl1").
        mode(Append).
        save("/tmp/hudi_timestamp_tbl1")
 {code}
Stack trace:
{code:java}
scala>  df2.write.format("hudi").
     |         options(getQuickstartWriteConfigs).
     |         option(PRECOMBINE_FIELD_OPT_KEY, "preComb").
     |         option(RECORDKEY_FIELD_OPT_KEY, "row").
     |         option(PARTITIONPATH_FIELD_OPT_KEY, "ppath").
     |         
option("hoodie.datasource.write.keygenerator.class","org.apache.hudi.keygen.TimestampBasedKeyGenerator").
     |         
option("hoodie.deltastreamer.keygen.timebased.timestamp.type","EPOCHMILLISECONDS").
     |         
option("hoodie.deltastreamer.keygen.timebased.output.dateformat","yyyy/MM/dd").
     |         
option("hoodie.deltastreamer.keygen.timebased.timezone","GMT+8:00").
     |         option("hoodie.datasource.hive_sync.mode","hms").
     |         option("hoodie.datasource.hive_sync.database","default").
     |         option("hoodie.datasource.hive_sync.table","timestamp_tbl1").
     |         
option("hoodie.datasource.hive_sync.partition_fields","year,month,day").
     |         option("hoodie.datasource.hive_sync.enable","true").
     |         option(TABLE_NAME, "timestamp_tbl1").
     |         mode(Append).
     |         save("/tmp/hudi_timestamp_tbl1")
warning: there was one deprecation warning; re-run with -deprecation for details
01:00  WARN: Timeline-server-based markers are not supported for HDFS: base 
path /tmp/hudi_timestamp_tbl1.  Falling back to direct markers.
01:00  WARN: Timeline-server-based markers are not supported for HDFS: base 
path /tmp/hudi_timestamp_tbl1.  Falling back to direct markers.
01:01  WARN: Timeline-server-based markers are not supported for HDFS: base 
path /tmp/hudi_timestamp_tbl1.  Falling back to direct markers.
java.lang.NoSuchMethodError: 
org.apache.hadoop.hive.metastore.IMetaStoreClient.alter_table_with_environmentContext(Ljava/lang/String;Ljava/lang/String;Lorg/apache/hadoop/hive/metastore/api/Table;Lorg/apache/hadoop/hive/metastore/api/EnvironmentContext;)V
  at 
org.apache.hudi.hive.ddl.HMSDDLExecutor.updateTableDefinition(HMSDDLExecutor.java:146)
  at 
org.apache.hudi.hive.HoodieHiveClient.updateTableDefinition(HoodieHiveClient.java:184)
  at org.apache.hudi.hive.HiveSyncTool.syncSchema(HiveSyncTool.java:250)
  at org.apache.hudi.hive.HiveSyncTool.syncHoodieTable(HiveSyncTool.java:182)
  at org.apache.hudi.hive.HiveSyncTool.doSync(HiveSyncTool.java:131)
  at org.apache.hudi.hive.HiveSyncTool.syncHoodieTable(HiveSyncTool.java:117)
  at 
org.apache.hudi.HoodieSparkSqlWriter$.org$apache$hudi$HoodieSparkSqlWriter$$syncHive(HoodieSparkSqlWriter.scala:537)
  at 
org.apache.hudi.HoodieSparkSqlWriter$$anonfun$metaSync$2.apply(HoodieSparkSqlWriter.scala:593)
  at 
org.apache.hudi.HoodieSparkSqlWriter$$anonfun$metaSync$2.apply(HoodieSparkSqlWriter.scala:589)
  at scala.collection.mutable.HashSet.foreach(HashSet.scala:78)
  at 
org.apache.hudi.HoodieSparkSqlWriter$.metaSync(HoodieSparkSqlWriter.scala:589)
  at 
org.apache.hudi.HoodieSparkSqlWriter$.commitAndPerformPostOperations(HoodieSparkSqlWriter.scala:662)
  at org.apache.hudi.HoodieSparkSqlWriter$.write(HoodieSparkSqlWriter.scala:282)
  at org.apache.hudi.DefaultSource.createRelation(DefaultSource.scala:164)
  at 
org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand.run(SaveIntoDataSourceCommand.scala:45)
  at 
org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:70)
  at 
org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:68)
  at 
org.apache.spark.sql.execution.command.ExecutedCommandExec.doExecute(commands.scala:86)
  at 
org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
  at 
org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
  at 
org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
  at 
org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
  at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
  at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
  at 
org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:80)
  at 
org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:80)
  at 
org.apache.spark.sql.DataFrameWriter$$anonfun$runCommand$1.apply(DataFrameWriter.scala:676)
  at 
org.apache.spark.sql.DataFrameWriter$$anonfun$runCommand$1.apply(DataFrameWriter.scala:676)
  at 
org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:78)
  at 
org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:125)
  at 
org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:73)
  at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:676)
  at 
org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:285)
  at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:271)
  at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:229)
  ... 79 elided


scala>  {code}
 

 



--
This message was sent by Atlassian Jira
(v8.20.1#820001)

Reply via email to