Re: [PR] [HUDI-4468] Simplify TimeTravel logic for Spark [hudi]

via GitHub Tue, 19 Nov 2024 10:42:52 -0800


jonvex commented on code in PR #12270:
URL: https://github.com/apache/hudi/pull/12270#discussion_r1848854643



##########
hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/HoodieSqlCommonUtils.scala:
##########
@@ -56,9 +56,10 @@ object HoodieSqlCommonUtils extends SparkAdapterSupport {
   })
 
   def getTableSqlSchema(metaClient: HoodieTableMetaClient,

Review Comment:
   actually, getTableSqlSchema is only used in one place, and is set to true. 
So we can just use the current impl of getTableAvroSchema and add 
   ```
      if (timestamp == null) {
         return 
getTableAvroSchema(metaClient.getTableConfig().populateMetaFields());
       }
   ```
   to the beginning of it



##########
hudi-spark-datasource/hudi-spark3.3.x/src/main/scala/org/apache/spark/sql/parser/HoodieSpark3_3ExtendedSqlParser.scala:
##########
@@ -121,11 +120,7 @@ class HoodieSpark3_3ExtendedSqlParser(session: 
SparkSession, delegate: ParserInt
 
   private def isHoodieCommand(sqlText: String): Boolean = {
     val normalized = 
sqlText.toLowerCase(Locale.ROOT).trim().replaceAll("\\s+", " ")
-    normalized.contains("system_time as of") ||

Review Comment:
   we are ok getting rid of all of these aliases?



##########
hudi-common/src/main/java/org/apache/hudi/common/table/TableSchemaResolver.java:
##########
@@ -150,6 +150,23 @@ public Schema getTableAvroSchema(String timestamp) throws 
Exception {
         .orElseThrow(schemaNotFoundError());
   }
 
+  /**
+   * Fetches tables schema in Avro format as of the given instant
+   *
+   * @param timestamp as of which table's schema will be fetched
+   */
+  public Schema getTableAvroSchema(String timestamp, boolean 
includeMetadataFields) throws Exception {

Review Comment:
   I think there is some code reuse we can do with the method above



##########
hudi-spark-datasource/hudi-spark3-common/src/main/scala/org/apache/spark/sql/hudi/catalog/HoodieCatalog.scala:
##########
@@ -153,6 +155,72 @@ class HoodieCatalog extends DelegatingCatalogExtension
     }
   }
 
+  override def loadTable(ident: Identifier, timestamp: Long): Table = {
+    super.loadTable(ident) match {
+      case V1Table(catalogTable0) if sparkAdapter.isHoodieTable(catalogTable0) 
=>
+
+        // spark passes microseconds
+        val milliseconds = TimeUnit.MICROSECONDS.toMillis(timestamp)
+        val targetTimestamp = 
HoodieInstantTimeGenerator.formatMillis(milliseconds)
+
+        constructTable(ident, catalogTable0, targetTimestamp)
+
+      case t => t
+    }
+  }
+
+  override def loadTable(ident: Identifier, version: String): Table = {
+    super.loadTable(ident) match {
+      case V1Table(catalogTable0) if sparkAdapter.isHoodieTable(catalogTable0) 
=>
+
+        if (!HoodieInstantTimeGenerator.isValidInstantTime(version)) {
+          throw new AnalysisException(s"invalid snapshot id: $version")
+        }
+
+        constructTable(ident, catalogTable0, version);
+
+      case t => t
+    }
+  }
+
+  private def constructTable(ident: Identifier,
+                             catalogTable0: CatalogTable,
+                             timestampAsOf: String): Table = {
+    val catalogTable = catalogTable0.comment match {
+      case Some(v) =>
+        val newProps = catalogTable0.storage.properties + (

Review Comment:
   catalogTable0.storage.copy(properties = newProps)) is this creating a new 
instance so the original catalog table is not modified.



##########
hudi-common/src/main/java/org/apache/hudi/common/table/timeline/HoodieInstantTimeGenerator.java:
##########
@@ -88,6 +88,18 @@ public static String createNewInstantTime(boolean 
shouldLock, TimeGenerator time
     });
   }
 
+  public static String formatMillis(long milliseconds) {

Review Comment:
   This seems to be a copy of 
org.apache.hudi.common.table.timeline.HoodieInstantTimeGenerator#createNewInstantTime.
 can we try to reuse the existing code? If the HoodieInstantTimeGenerator 
implementation is changed, then we will need to update this too



##########
hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/hudi/HoodieSqlCommonUtils.scala:
##########
@@ -56,9 +56,10 @@ object HoodieSqlCommonUtils extends SparkAdapterSupport {
   })
 
   def getTableSqlSchema(metaClient: HoodieTableMetaClient,

Review Comment:
   and get rid of includeMetadataFields: Boolean = false, from getTableSqlSchema



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] [HUDI-4468] Simplify TimeTravel logic for Spark [hudi]

Reply via email to