boneanxs commented on code in PR #5723:
URL: https://github.com/apache/hudi/pull/5723#discussion_r902119876
##########
hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/execution/datasources/HoodieInMemoryFileIndex.scala:
##########
@@ -34,7 +37,77 @@ class HoodieInMemoryFileIndex(sparkSession: SparkSession,
parameters: Map[String, String],
userSpecifiedSchema: Option[StructType],
fileStatusCache: FileStatusCache = NoopCache)
- extends InMemoryFileIndex(sparkSession, rootPathsSpecified, parameters,
userSpecifiedSchema, fileStatusCache) {
+ extends InMemoryFileIndex(sparkSession, rootPathsSpecified, parameters,
userSpecifiedSchema, fileStatusCache)
+ with SparkAdapterSupport {
+
+ /**
+ * Returns all valid files grouped into partitions when the data is
partitioned. If the data is unpartitioned,
+ * this will return a single partition with no partition values
+ *
+ * NOTE: This method replicates the one it overrides, however it uses custom
method
+ * that accepts files starting with "."
+ */
+ override def listFiles(partitionFilters: Seq[Expression], dataFilters:
Seq[Expression]): Seq[PartitionDirectory] = {
Review Comment:
This is needed because we first get the partitions in `listLatestBaseFiles`:
```scala
protected def listLatestBaseFiles(globbedPaths: Seq[Path], partitionFilters:
Seq[Expression], dataFilters: Seq[Expression]): Map[Path, Seq[FileStatus]] = {
val partitionDirs = if (globbedPaths.isEmpty) {
fileIndex.listFiles(partitionFilters, dataFilters)
} else {
val inMemoryFileIndex = HoodieInMemoryFileIndex.create(sparkSession,
globbedPaths)
inMemoryFileIndex.listFiles(partitionFilters, dataFilters)
}
```
which calls `inMemoryFileIndex.listFiles` to get all partitionDirs. If
we don't override this method, log file paths will be filtered out:
```scala
val selectedPartitions = if (partitionSpec().partitionColumns.isEmpty) {
// Method isDataPath will filter path if it is a log file
PartitionDirectory(InternalRow.empty, allFiles().filter(f =>
isDataPath(f.getPath))) :: Nil
}
```
As `isDataPath` is a private method in `PartitioningAwareFileIndex`, we
can't override it directly, so we need to override `listFiles` instead.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]