n3nash commented on a change in pull request #2927:
URL: https://github.com/apache/hudi/pull/2927#discussion_r628787350



##########
File path: 
hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala
##########
@@ -248,6 +257,39 @@ private[hudi] object HoodieSparkSqlWriter {
     }
   }
 
+  /**
+   * Checks if schema needs upgrade (if incoming records's schema is old while 
table schema got evolved).
+   * @param fs instance of FileSystem.
+   * @param basePath base path.
+   * @param sparkContext instance of spark context.
+   * @param schema incoming record's schema.
+   * @return Pair of(boolean, table schema), where first entry will be true 
only if schema conversion is required.
+   */
+  def schemaNeedsConversion(fs: FileSystem, basePath: Path, sparkContext: 
SparkContext, schema: Schema) : (Boolean, Schema) = {
+    var convertGenRecsToLatestTableSchema = false
+    var latestSchema: Schema = null
+    if(fs.exists(new Path(basePath.toString + "/" + 
HoodieTableMetaClient.METAFOLDER_NAME))) {
+      val tableMetaClient = 
HoodieTableMetaClient.builder.setConf(sparkContext.hadoopConfiguration).setBasePath(basePath.toString).build()
+      try {
+        val tableSchemaResolver = new TableSchemaResolver(tableMetaClient)
+        if(tableSchemaResolver.areCommitsAvailable()) {
+          val tableSchema = 
tableSchemaResolver.getTableAvroSchemaWithoutMetadataFields
+          if (tableSchema != null && 
TableSchemaResolver.isSchemaSubset(tableSchema, schema)) {
+            // if incoming schema is a subset (old schema) compared to table 
schema. For eg, one of the
+            // ingestion pipeline is still producing events in old schema
+            latestSchema = tableSchema;
+            convertGenRecsToLatestTableSchema = true;
+            log.warn("Using latest table schema to rewrite incoming records " 
+ tableSchema.toString)
+          }
+        }
+      } catch {
+        case e: IllegalArgumentException => log.warn("Likely first commit and 
hence could not find any schema for the table")

Review comment:
       Change this log message to "Timeline is empty, could not find any schema for table"




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


Reply via email to