yihua commented on code in PR #5364:
URL: https://github.com/apache/hudi/pull/5364#discussion_r853541246
##########
hudi-spark-datasource/hudi-spark3.1.x/src/main/scala/org/apache/spark/sql/adapter/Spark3_1Adapter.scala:
##########
@@ -55,14 +54,7 @@ class Spark3_1Adapter extends BaseSpark3Adapter {
}
}
-  override def createHoodieParquetFileFormat(): Option[ParquetFileFormat] = {
-    if (SPARK_VERSION.startsWith("3.1")) {
-      val loadClassName = "org.apache.spark.sql.execution.datasources.parquet.Spark312HoodieParquetFileFormat"
-      val clazz = Class.forName(loadClassName, true, Thread.currentThread().getContextClassLoader)
-      val ctor = clazz.getConstructors.head
-      Some(ctor.newInstance().asInstanceOf[ParquetFileFormat])
-    } else {
-      None
-    }
+  override def createHoodieParquetFileFormat(appendPartitionValues: Boolean): Option[ParquetFileFormat] = {
+    Some(new Spark312HoodieParquetFileFormat(appendPartitionValues))
Review Comment:
Is there any reason why the class loader was used before, instead of directly
creating a new instance of the class? @xushiyan do you have any context here,
to make sure there is no historical workaround and we're not breaking any
logic?
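For reference, a minimal sketch of the two instantiation styles this comment contrasts. The class name is taken from the diff, `InstantiationSketch` is a hypothetical wrapper, and the stated trade-off (runtime lookup vs. compile-time dependency) is an assumption about the rationale, not confirmed project history:

```scala
import org.apache.spark.sql.execution.datasources.parquet.{ParquetFileFormat, Spark312HoodieParquetFileFormat}

object InstantiationSketch {
  // Previous style: resolve the class at runtime via the context class loader,
  // so there is no compile-time dependency on the Spark-version-specific class.
  def viaReflection(): ParquetFileFormat = {
    val clazz = Class.forName(
      "org.apache.spark.sql.execution.datasources.parquet.Spark312HoodieParquetFileFormat",
      true, Thread.currentThread().getContextClassLoader)
    // The constructor in this PR takes the append flag, hence the boxed argument.
    clazz.getConstructors.head.newInstance(java.lang.Boolean.TRUE).asInstanceOf[ParquetFileFormat]
  }

  // Style in this PR: a plain constructor call, resolved and type-checked at compile time.
  def direct(): ParquetFileFormat = new Spark312HoodieParquetFileFormat(true)
}
```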
##########
hudi-spark-datasource/hudi-spark3/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/Spark32HoodieParquetFileFormat.scala:
##########
@@ -27,233 +25,262 @@ import org.apache.hadoop.mapreduce.{JobID, TaskAttemptID, TaskID, TaskType}
import org.apache.hudi.client.utils.SparkInternalSchemaConverter
import org.apache.hudi.common.fs.FSUtils
import org.apache.hudi.common.util.InternalSchemaCache
+import org.apache.hudi.common.util.StringUtils.isNullOrEmpty
import org.apache.hudi.common.util.collection.Pair
import org.apache.hudi.internal.schema.InternalSchema
import org.apache.hudi.internal.schema.action.InternalSchemaMerger
import org.apache.hudi.internal.schema.utils.{InternalSchemaUtils, SerDeHelper}
import org.apache.parquet.filter2.compat.FilterCompat
import org.apache.parquet.filter2.predicate.FilterApi
import org.apache.parquet.format.converter.ParquetMetadataConverter.SKIP_ROW_GROUPS
-import org.apache.parquet.hadoop.{ParquetFileReader, ParquetInputFormat, ParquetRecordReader}
+import org.apache.parquet.hadoop.{ParquetInputFormat, ParquetRecordReader}
import org.apache.spark.TaskContext
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Cast, JoinedRow}
import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection
import org.apache.spark.sql.catalyst.util.DateTimeUtils
+import org.apache.spark.sql.execution.datasources.parquet.Spark32HoodieParquetFileFormat.{pruneInternalSchema, rebuildFilterFromParquet}
import org.apache.spark.sql.execution.datasources.{DataSourceUtils, PartitionedFile, RecordReaderIterator}
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.sources._
import org.apache.spark.sql.types.{AtomicType, DataType, StructField, StructType}
import org.apache.spark.util.SerializableConfiguration
-class Spark32HoodieParquetFileFormat extends ParquetFileFormat {
-
-  // reference ParquetFileFormat from spark project
-  override def buildReaderWithPartitionValues(
-      sparkSession: SparkSession,
-      dataSchema: StructType,
-      partitionSchema: StructType,
-      requiredSchema: StructType,
-      filters: Seq[Filter],
-      options: Map[String, String],
-      hadoopConf: Configuration): PartitionedFile => Iterator[InternalRow] = {
-    if (hadoopConf.get(SparkInternalSchemaConverter.HOODIE_QUERY_SCHEMA, "").isEmpty) {
-      // fallback to origin parquet File read
-      super.buildReaderWithPartitionValues(sparkSession, dataSchema, partitionSchema, requiredSchema, filters, options, hadoopConf)
Review Comment:
If `shouldAppendPartitionValues` is true and the existing if condition is
true, can we still fall back to the original parquet file read?
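To make the question concrete, a rough sketch of the guard being asked about. `shouldFallBackToVanillaReader` is a hypothetical helper (it would sit inside the file format class), not code from this PR; it spells out the reading this comment is trying to confirm, namely that the fallback should apply only when partition values are appended the standard way and no internal schema is set:

```scala
import org.apache.hadoop.conf.Configuration
import org.apache.hudi.client.utils.SparkInternalSchemaConverter
import org.apache.hudi.common.util.StringUtils.isNullOrEmpty

// Hypothetical helper expressing the guard in question: delegate to the stock
// ParquetFileFormat reader only when partition values should be appended AND
// no Hudi internal schema (schema on read) is configured.
def shouldFallBackToVanillaReader(shouldAppendPartitionValues: Boolean,
                                  hadoopConf: Configuration): Boolean = {
  shouldAppendPartitionValues &&
    isNullOrEmpty(hadoopConf.get(SparkInternalSchemaConverter.HOODIE_QUERY_SCHEMA))
}
```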
##########
hudi-spark-datasource/hudi-spark3.1.x/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/Spark312HoodieParquetFileFormat.scala:
##########
@@ -17,279 +17,301 @@
package org.apache.spark.sql.execution.datasources.parquet
-import java.net.URI
-import java.util
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl
import org.apache.hadoop.mapreduce.{JobID, TaskAttemptID, TaskID, TaskType}
+import org.apache.hudi.HoodieSparkUtils
import org.apache.hudi.client.utils.SparkInternalSchemaConverter
import org.apache.hudi.common.fs.FSUtils
-import org.apache.hudi.HoodieSparkUtils
-import org.apache.hudi.common.util.InternalSchemaCache
+import org.apache.hudi.common.util.StringUtils.isNullOrEmpty
+import org.apache.hudi.common.util.{InternalSchemaCache, StringUtils}
import org.apache.hudi.common.util.collection.Pair
import org.apache.hudi.internal.schema.InternalSchema
-import org.apache.hudi.internal.schema.utils.{InternalSchemaUtils, SerDeHelper}
import org.apache.hudi.internal.schema.action.InternalSchemaMerger
+import org.apache.hudi.internal.schema.utils.{InternalSchemaUtils, SerDeHelper}
import org.apache.parquet.filter2.compat.FilterCompat
import org.apache.parquet.filter2.predicate.FilterApi
import org.apache.parquet.format.converter.ParquetMetadataConverter.SKIP_ROW_GROUPS
import org.apache.parquet.hadoop.{ParquetFileReader, ParquetInputFormat, ParquetRecordReader}
-
import org.apache.spark.TaskContext
import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.avro.AvroDeserializer
import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.expressions.{Cast, JoinedRow}
import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection
+import org.apache.spark.sql.catalyst.expressions.{Cast, JoinedRow}
import org.apache.spark.sql.catalyst.util.DateTimeUtils
+import org.apache.spark.sql.execution.datasources.parquet.Spark312HoodieParquetFileFormat.{createParquetFilters, pruneInternalSchema, rebuildFilterFromParquet}
import org.apache.spark.sql.execution.datasources.{DataSourceUtils, PartitionedFile, RecordReaderIterator}
-import org.apache.spark.sql.execution.datasources.parquet._
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.sources._
import org.apache.spark.sql.types.{AtomicType, DataType, StructField, StructType}
import org.apache.spark.util.SerializableConfiguration
-class Spark312HoodieParquetFileFormat extends ParquetFileFormat {
-
-  // reference ParquetFileFormat from spark project
-  override def buildReaderWithPartitionValues(
-      sparkSession: SparkSession,
-      dataSchema: StructType,
-      partitionSchema: StructType,
-      requiredSchema: StructType,
-      filters: Seq[Filter],
-      options: Map[String, String],
-      hadoopConf: Configuration): PartitionedFile => Iterator[InternalRow] = {
-    if (hadoopConf.get(SparkInternalSchemaConverter.HOODIE_QUERY_SCHEMA, "").isEmpty) {
Review Comment:
Similar question here.