alexeykudinkin commented on a change in pull request #4026:
URL: https://github.com/apache/hudi/pull/4026#discussion_r756314781
##########
File path:
hudi-spark-datasource/hudi-spark/src/main/scala/org/apache/hudi/HoodieFileIndex.scala
##########
@@ -206,18 +257,22 @@ case class HoodieFileIndex(
*/
override def listFiles(partitionFilters: Seq[Expression],
dataFilters: Seq[Expression]):
Seq[PartitionDirectory] = {
- // try to load filterFiles from index
- val filterFiles: Set[String] = if (enableDataSkipping()) {
- filterFilesByDataSkippingIndex(dataFilters)
- } else {
- Set.empty
- }
+ // Look up candidate files names in the Z-index, if all of the following conditions are true
+ // - Data-skipping is enabled
+ // - Z-index is present
+ // - List of predicates (filters) is present
+ val candidateFilesNamesOpt: Option[Set[String]] = lookupCandidateFilesNamesInZIndex(dataFilters)
+
+ logDebug(s"Overlapping candidate files (from Z-index): ${candidateFilesNamesOpt.getOrElse(Set.empty)}")
+
if (queryAsNonePartitionedTable) { // Read as Non-Partitioned table.
- val candidateFiles = if (!filterFiles.isEmpty) {
- allFiles.filterNot(fileStatus => filterFiles.contains(fileStatus.getPath.getName))
- } else {
- allFiles
- }
+ // Filter in candidate files based on the Z-index lookup
+ val candidateFiles =
+ allFiles.filter(fileStatus =>
+ // NOTE: This predicate is true when {@code Option} is empty
Review comment:
Correct
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: commits-unsubscribe@hudi.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org