n3nash commented on a change in pull request #2927:
URL: https://github.com/apache/hudi/pull/2927#discussion_r628787402
##########
File path:
hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala
##########
@@ -248,6 +257,39 @@ private[hudi] object HoodieSparkSqlWriter {
}
}
+ /**
+ * Checks if schema needs upgrade (if incoming record's schema is old while
table schema has evolved).
+ * @param fs instance of FileSystem.
+ * @param basePath base path.
+ * @param sparkContext instance of spark context.
+ * @param schema incoming record's schema.
+ * @return Pair of (boolean, table schema), where first entry will be true
only if schema conversion is required.
+ */
+ def schemaNeedsConversion(fs: FileSystem, basePath: Path, sparkContext:
SparkContext, schema: Schema) : (Boolean, Schema) = {
+ var convertGenRecsToLatestTableSchema = false
+ var latestSchema: Schema = null
+ if(fs.exists(new Path(basePath.toString + "/" +
HoodieTableMetaClient.METAFOLDER_NAME))) {
+ val tableMetaClient =
HoodieTableMetaClient.builder.setConf(sparkContext.hadoopConfiguration).setBasePath(basePath.toString).build()
+ try {
+ val tableSchemaResolver = new TableSchemaResolver(tableMetaClient)
+ if(tableSchemaResolver.areCommitsAvailable()) {
+ val tableSchema =
tableSchemaResolver.getTableAvroSchemaWithoutMetadataFields
+ if (tableSchema != null &&
TableSchemaResolver.isSchemaSubset(tableSchema, schema)) {
+ // if incoming schema is a subset (old schema) compared to table
schema. For eg, one of the
+ // ingestion pipeline is still producing events in old schema
+ latestSchema = tableSchema;
+ convertGenRecsToLatestTableSchema = true;
+ log.warn("Using latest table schema to rewrite incoming records "
+ tableSchema.toString)
+ }
+ }
+ } catch {
+ case e: IllegalArgumentException => log.warn("Likely first commit and
hence could not find any schema for the table")
+ case e: InvalidTableException => log.warn("Likely first commit and
hence could not find any schema for the table")
+ }
+ }
+ (convertGenRecsToLatestTableSchema, latestSchema)
Review comment:
If the previous comment is valid, simply return `latestSchema` here.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]