dongjoon-hyun commented on a change in pull request #24307: [SPARK-25407][SQL]
Ensure we pass a compatible pruned schema to ParquetRowConverter
URL: https://github.com/apache/spark/pull/24307#discussion_r272870812
##########
File path:
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadSupport.scala
##########
@@ -49,34 +49,65 @@ import org.apache.spark.sql.types._
* Due to this reason, we no longer rely on [[ReadContext]] to pass requested
schema from [[init()]]
* to [[prepareForRead()]], but use a private `var` for simplicity.
*/
-private[parquet] class ParquetReadSupport(val convertTz: Option[TimeZone])
+private[parquet] class ParquetReadSupport(val convertTz: Option[TimeZone],
isMR: Boolean)
extends ReadSupport[UnsafeRow] with Logging {
private var catalystRequestedSchema: StructType = _
def this() {
// We need a zero-arg constructor for SpecificParquetRecordReaderBase.
But that is only
// used in the vectorized reader, where we get the convertTz value
directly, and the value here
// is ignored.
- this(None)
+ this(None, isMR = false)
}
/**
* Called on executor side before [[prepareForRead()]] and instantiating
actual Parquet record
* readers. Responsible for figuring out Parquet requested schema used for
column pruning.
*/
override def init(context: InitContext): ReadContext = {
+ val conf = context.getConfiguration
catalystRequestedSchema = {
- val conf = context.getConfiguration
val schemaString =
conf.get(ParquetReadSupport.SPARK_ROW_REQUESTED_SCHEMA)
assert(schemaString != null, "Parquet requested schema not set.")
StructType.fromString(schemaString)
}
- val caseSensitive =
context.getConfiguration.getBoolean(SQLConf.CASE_SENSITIVE.key,
+ val caseSensitive = conf.getBoolean(SQLConf.CASE_SENSITIVE.key,
SQLConf.CASE_SENSITIVE.defaultValue.get)
- val parquetRequestedSchema = ParquetReadSupport.clipParquetSchema(
- context.getFileSchema, catalystRequestedSchema, caseSensitive)
-
+ val schemaPruningEnabled =
conf.getBoolean(SQLConf.NESTED_SCHEMA_PRUNING_ENABLED.key,
+ SQLConf.NESTED_SCHEMA_PRUNING_ENABLED.defaultValue.get)
+ val parquetFileSchema = context.getFileSchema
+ val parquetClippedSchema =
ParquetReadSupport.clipParquetSchema(parquetFileSchema,
+ catalystRequestedSchema, caseSensitive)
+
+ // We pass two schemas to ParquetRecordMaterializer:
+ // - parquetRequestedSchema: the schema of the file data we want to read
+ // - catalystRequestedSchema: the schema of the rows we want to return
+ // The reader is responsible for reconciling the differences between the
two.
+ val parquetRequestedSchema = if (isMR && schemaPruningEnabled) {
+ // Parquet-MR reader requires that parquetRequestedSchema
+ // include only those fields present in the underlying
parquetFileSchema. Therefore,
+ // we intersect the parquetClippedSchema with the parquetFileSchema
+ ParquetReadSupport.intersectParquetGroups(parquetClippedSchema,
parquetFileSchema)
+ .map(groupType => new MessageType(groupType.getName,
groupType.getFields))
+ .getOrElse(ParquetSchemaConverter.EMPTY_MESSAGE)
+ } else {
+ // Spark's vectorized reader only supports atomic types currently. It
also skips fields
+ // in parquetRequestedSchema which are not present in the file.
+ parquetClippedSchema
+ }
+ log.debug {
Review comment:
Sure!
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
With regards,
Apache Git Services
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]