nsivabalan commented on a change in pull request #3946:
URL: https://github.com/apache/hudi/pull/3946#discussion_r763655793
##########
File path:
hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/IncrementalRelation.scala
##########
@@ -155,27 +155,60 @@ class IncrementalRelation(val sqlContext: SQLContext,
if (filteredRegularFullPaths.isEmpty &&
filteredMetaBootstrapFullPaths.isEmpty) {
sqlContext.sparkContext.emptyRDD[Row]
} else {
- log.info("Additional Filters to be applied to incremental source are :"
+ filters)
+ log.info("Additional Filters to be applied to incremental source are :"
+ filters.mkString("Array(", ", ", ")"))
var df: DataFrame =
sqlContext.createDataFrame(sqlContext.sparkContext.emptyRDD[Row], usedSchema)
- if (metaBootstrapFileIdToFullPath.nonEmpty) {
- df = sqlContext.sparkSession.read
- .format("hudi")
- .schema(usedSchema)
- .option(DataSourceReadOptions.READ_PATHS.key,
filteredMetaBootstrapFullPaths.mkString(","))
- .load()
+ val fullTableScanFallback =
optParams.getOrElse(DataSourceReadOptions.INCREMENTAL_FALLBACK_TO_FULL_TABLE_SCAN_FOR_NON_EXISTING_FILES.key,
+
DataSourceReadOptions.INCREMENTAL_FALLBACK_TO_FULL_TABLE_SCAN_FOR_NON_EXISTING_FILES.defaultValue).toBoolean
+
+ var doFullTableScan = false
+
+ if (fullTableScanFallback) {
+ val fs = new
Path(basePath).getFileSystem(sqlContext.sparkContext.hadoopConfiguration);
+ val timer = new HoodieTimer().startTimer();
+
+ val allFilesToCheck = filteredMetaBootstrapFullPaths ++
filteredRegularFullPaths
+ val firstNotFoundPath = allFilesToCheck.find(path => !fs.exists(new
Path(path)))
+ val timeTaken = timer.endTimer()
+ log.info("Checking if paths exists took " + timeTaken + "ms")
+
+ val optStartTs = optParams(DataSourceReadOptions.BEGIN_INSTANTTIME.key)
+ val isInstantArchived =
optStartTs.compareTo(commitTimeline.firstInstant().get().getTimestamp) < 0 //
True if optStartTs < activeTimeline.first
Review comment:
My bad. After re-reading the description, I guess the fix does not sit
well. The cleaner will not touch the timeline, right? So how do we know whether a commit
has been cleaned up or not (because it could still be part of the active timeline)?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]