alexeykudinkin commented on code in PR #5428:
URL: https://github.com/apache/hudi/pull/5428#discussion_r925000613


##########
hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieBaseRelation.scala:
##########
@@ -227,9 +251,38 @@ abstract class HoodieBaseRelation(val sqlContext: 
SQLContext,
   /**
    * Returns true in case table supports Schema on Read (Schema Evolution)
    */
-  def hasSchemaOnRead: Boolean = !internalSchema.isEmptySchema
+  def hasSchemaOnRead: Boolean = internalSchemaOpt.isDefined
 
-  override def schema: StructType = tableStructSchema
+  /**
+   * Data schema is determined as the actual schema of the Table's Data Files 
(for ex, parquet/orc/etc);
+   *
+   * In cases when partition values are not persisted w/in the data files, 
data-schema is defined as
+   * <pre>table's schema - partition columns</pre>
+   *
+   * Check scala-doc for [[shouldExtractPartitionValuesFromPartitionPath]] for 
more details
+   */
+  def dataSchema: StructType =
+    if (shouldExtractPartitionValuesFromPartitionPath) {
+      prunePartitionColumns(tableStructSchema)
+    } else {
+      tableStructSchema
+    }
+
+  /**
+   * Determines whether relation's schema could be pruned by Spark's Optimizer
+   */
+  def canPruneRelationSchema: Boolean =
+    (fileFormat.isInstanceOf[ParquetFileFormat] || 
fileFormat.isInstanceOf[OrcFileFormat]) &&

Review Comment:
   Good catch! We can indeed clean up the flag.



##########
hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieBaseRelation.scala:
##########
@@ -127,36 +133,46 @@ abstract class HoodieBaseRelation(val sqlContext: 
SQLContext,
    * NOTE: Initialization of the following members is coupled on purpose to 
minimize amount of I/O
    *       required to fetch table's Avro and Internal schemas
    */
-  protected lazy val (tableAvroSchema: Schema, internalSchema: InternalSchema) 
= {
+  protected lazy val (tableAvroSchema: Schema, internalSchemaOpt: 
Option[InternalSchema]) = {
     val schemaResolver = new TableSchemaResolver(metaClient)
-    val avroSchema: Schema = schemaSpec.map(convertToAvroSchema).getOrElse {
-      Try(schemaResolver.getTableAvroSchema) match {
-        case Success(schema) => schema
+    val internalSchemaOpt = if (!isSchemaEvolutionEnabled) {
+      None
+    } else {
+      Try(schemaResolver.getTableInternalSchemaFromCommitMetadata) match {
+        case Success(internalSchemaOpt) => toScalaOption(internalSchemaOpt)
         case Failure(e) =>
-          logError("Failed to fetch schema from the table", e)
-          throw new HoodieSchemaException("Failed to fetch schema from the 
table")
+          logWarning("Failed to fetch internal-schema from the table", e)
+          None
       }
     }
 
-    val internalSchema: InternalSchema = if (!isSchemaEvolutionEnabled) {
-      InternalSchema.getEmptyInternalSchema
-    } else {
-      Try(schemaResolver.getTableInternalSchemaFromCommitMetadata) match {
-        case Success(internalSchemaOpt) =>
-          
toScalaOption(internalSchemaOpt).getOrElse(InternalSchema.getEmptyInternalSchema)
+    val avroSchema = internalSchemaOpt.map { is =>
+      AvroInternalSchemaConverter.convert(is, "schema")
+    } orElse {
+      schemaSpec.map(convertToAvroSchema)
+    } getOrElse {
+      Try(schemaResolver.getTableAvroSchema) match {
+        case Success(schema) => schema
         case Failure(e) =>
-          logWarning("Failed to fetch internal-schema from the table", e)
-          InternalSchema.getEmptyInternalSchema
+          logError("Failed to fetch schema from the table", e)
+          throw new HoodieSchemaException("Failed to fetch schema from the 
table")
       }
     }
 
-    (avroSchema, internalSchema)
+    (avroSchema, internalSchemaOpt)
   }
 
-  protected lazy val tableStructSchema: StructType = 
AvroConversionUtils.convertAvroSchemaToStructType(tableAvroSchema)
+  protected val tableStructSchema: StructType = 
AvroConversionUtils.convertAvroSchemaToStructType(tableAvroSchema)

Review Comment:
   Good catch. This is fixed in a later PR; I will cherry-pick it here as well.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to