codope commented on code in PR #5723:
URL: https://github.com/apache/hudi/pull/5723#discussion_r891252545
##########
hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/execution/datasources/HoodieInMemoryFileIndex.scala:
##########
@@ -34,7 +37,77 @@ class HoodieInMemoryFileIndex(sparkSession: SparkSession,
parameters: Map[String, String],
userSpecifiedSchema: Option[StructType],
fileStatusCache: FileStatusCache = NoopCache)
- extends InMemoryFileIndex(sparkSession, rootPathsSpecified, parameters,
userSpecifiedSchema, fileStatusCache) {
+ extends InMemoryFileIndex(sparkSession, rootPathsSpecified, parameters,
userSpecifiedSchema, fileStatusCache)
+ with SparkAdapterSupport {
+
+ /**
+ * Returns all valid files grouped into partitions when the data is
partitioned. If the data is unpartitioned,
+ * this will return a single partition with no partition values
+ *
+ * NOTE: This method replicates the one it overrides, however it uses custom
method
+ * that accepts files starting with "."
+ */
+ override def listFiles(partitionFilters: Seq[Expression], dataFilters:
Seq[Expression]): Seq[PartitionDirectory] = {
+ val selectedPartitions = if (partitionSpec().partitionColumns.isEmpty) {
+ PartitionDirectory(InternalRow.empty, allFiles().filter(f =>
isDataPath(f.getPath))) :: Nil
+ } else {
+ prunePartitions(partitionFilters, partitionSpec()).map {
+ case PartitionPath(values, path) =>
+ val files: Seq[FileStatus] = leafDirToChildrenFiles.get(path) match {
Review Comment:
Even with partition pruning, this method is going to list files.
`FileStatus[]` is already built in `AbstractTableFileSystemView`, so the
same set of files is now being listed twice, i.e. once while building the
filesystem view and a second time here. Is there a way to avoid that?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]