[GitHub] [hudi] pengzhiwei2018 commented on a change in pull request #2283: [HUDI-1415] Read Hoodie Table As Spark DataSource Table

GitBox Mon, 08 Mar 2021 18:20:23 -0800


pengzhiwei2018 commented on a change in pull request #2283:
URL: https://github.com/apache/hudi/pull/2283#discussion_r589890569




##########
File path: 
hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieSparkSqlWriter.scala
##########
@@ -378,11 +389,75 @@ private[hudi] object HoodieSparkSqlWriter {
     hiveSyncConfig.autoCreateDatabase = 
parameters.get(HIVE_AUTO_CREATE_DATABASE_OPT_KEY).exists(r => r.toBoolean)
     hiveSyncConfig.decodePartition = 
parameters.getOrElse(URL_ENCODE_PARTITIONING_OPT_KEY,
       DEFAULT_URL_ENCODE_PARTITIONING_OPT_VAL).toBoolean
+    hiveSyncConfig.tableProperties = 
parameters.getOrElse(HIVE_TABLE_PROPERTIES, null)
+    hiveSyncConfig.serdeProperties = createSqlTableSerdeProperties(parameters, 
basePath.toString,
+      hiveSyncConfig.partitionFields.size())
     hiveSyncConfig
   }
 
-  private def metaSync(parameters: Map[String, String],
-                       basePath: Path,
+  /**
+    * Add Spark Sql related table properties to the HIVE_TABLE_PROPERTIES.
+    * @param sqlConf
+    * @param schema
+    * @param parameters
+    * @return A new parameters map with the HIVE_TABLE_PROPERTIES property added.
+    */
+  private def addSqlTableProperties(sqlConf: SQLConf, schema: StructType,
+                                    parameters: Map[String, String]): 
Map[String, String] = {
+    // Convert the schema and partition info used by spark sql to hive table 
properties.
+    // The following code refers to the spark code in
+    // 
https://github.com/apache/spark/blob/master/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
+
+    val partitionSet = parameters(HIVE_PARTITION_FIELDS_OPT_KEY)
+      .split(",").map(_.trim).filter(!_.isEmpty).toSet
+    val threshold = sqlConf.getConf(SCHEMA_STRING_LENGTH_THRESHOLD)
+
+    val (partitionCols, dataCols) = schema.partition(c => 
partitionSet.contains(c.name))
+    val reOrderedType = StructType(dataCols ++ partitionCols)
+    val schemaParts = reOrderedType.json.grouped(threshold).toSeq
+
+    var properties = Map(
+      "spark.sql.sources.provider" -> "hudi",
+      "spark.sql.sources.schema.numParts" -> schemaParts.size.toString

Review comment:
       Spark needs these properties when it loads the metadata from the Hive 
metastore, so we should store them there.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]

[GitHub] [hudi] pengzhiwei2018 commented on a change in pull request #2283: [HUDI-1415] Read Hoodie Table As Spark DataSource Table

Reply via email to