yihua commented on code in PR #10957:
URL: https://github.com/apache/hudi/pull/10957#discussion_r1628627195
##########
hudi-client/hudi-spark-client/src/main/scala/org/apache/hudi/SparkFileFormatInternalRowReaderContext.scala:
##########
@@ -73,16 +84,27 @@ class SparkFileFormatInternalRowReaderContext(readerMaps:
mutable.Map[Long, Part
}
}).asInstanceOf[ClosableIterator[InternalRow]]
} else {
- val schemaPairHashKey = generateSchemaPairHashKey(dataSchema,
requiredSchema)
- if (!readerMaps.contains(schemaPairHashKey)) {
- throw new IllegalStateException("schemas don't hash to a known reader")
- }
- new
CloseableInternalRowIterator(readerMaps(schemaPairHashKey).apply(fileInfo))
+ // partition value is empty because the spark parquet reader will append
the partition columns to
+ // each row if they are given. That is the only usage of the partition
values in the reader.
+ val fileInfo = sparkAdapter.getSparkPartitionedFileUtils
+ .createPartitionedFile(InternalRow.empty, filePath, start, length)
+ val (readSchema, readFilters) = getSchemaAndFiltersForRead(structType)
+ new CloseableInternalRowIterator(parquetFileReader.read(fileInfo,
+ readSchema, StructType(Seq.empty), readFilters,
storage.getConf.asInstanceOf[StorageConfiguration[Configuration]]))
}
}
- private def generateSchemaPairHashKey(dataSchema: Schema, requestedSchema:
Schema): Long = {
- dataSchema.hashCode() + requestedSchema.hashCode()
+ private def getSchemaAndFiltersForRead(structType: StructType): (StructType,
Seq[Filter]) = {
+ (getHasLogFiles, getNeedsBootstrapMerge, getUseRecordPosition) match {
Review Comment:
The controlling flag looks incorrect: `shouldUseRecordPosition` controls
merging based on record positions from the log files; it does not control
whether to read record positions from the parquet file with the Spark 3.5
parquet reader (along with filter pushdown). Only in Spark 3.5, when reading
the parquet base file, should the reader fetch positions from Spark's parquet
row-index metadata column instead of counting positions inside Hudi.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]