jonvex commented on code in PR #10957:
URL: https://github.com/apache/hudi/pull/10957#discussion_r1626118377
##########
hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/HoodieFileGroupReaderBasedParquetFileFormat.scala:
##########
@@ -71,28 +65,38 @@ class
HoodieFileGroupReaderBasedParquetFileFormat(tableState: HoodieTableState,
isMOR: Boolean,
isBootstrap: Boolean,
isIncremental: Boolean,
+ isCDC: Boolean,
+ validCommits: String,
shouldUseRecordPosition:
Boolean,
requiredFilters: Seq[Filter]
- ) extends ParquetFileFormat with
SparkAdapterSupport with HoodieFormatTrait {
+ ) extends ParquetFileFormat
with SparkAdapterSupport with HoodieFormatTrait {
def getRequiredFilters: Seq[Filter] = requiredFilters
+ private val sanitizedTableName =
AvroSchemaUtils.getAvroRecordQualifiedName(tableName)
+
/**
* Support batch needs to remain consistent, even if one side of a bootstrap
merge can support
* while the other side can't
*/
private var supportBatchCalled = false
private var supportBatchResult = false
- private val sanitizedTableName =
AvroSchemaUtils.getAvroRecordQualifiedName(tableName)
override def supportBatch(sparkSession: SparkSession, schema: StructType):
Boolean = {
if (!supportBatchCalled || supportBatchResult) {
supportBatchCalled = true
- supportBatchResult = !isMOR && !isIncremental && !isBootstrap &&
super.supportBatch(sparkSession, schema)
+ supportBatchResult = !isCDC && !isIncremental && !isMOR && !isBootstrap
&& super.supportBatch(sparkSession, schema)
Review Comment:
I thought it was disabled before. I'll revert it.
I was able to support batch for MOR when there are no log files, and for
bootstrap when there is no bootstrap merging (bootstrap actually could be
supported in even more cases, but I decided that was lower priority), but I had
to disable it because of relation reuse. For example, we create a temp view
for a table, and then we run the queries:
select * from table where x = 7;
select * from table;
Both queries will use the same relation. HoodiePruneFileSourcePartitions
will not run file listing on the second query, because hasPredicatesPushedDown
will already be set to true from the first query. We need file listing to happen
there to tell whether there will be no log files, because the other file listing
calls happen too late.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]