cloud-fan commented on a change in pull request #24830: [SPARK-27990][SQL][ML] Provide a way to recursively load data from datasource URL: https://github.com/apache/spark/pull/24830#discussion_r294589696
########## File path: sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PartitioningAwareFileIndex.scala ########## @@ -96,30 +102,36 @@ abstract class PartitioningAwareFileIndex( def allFiles(): Seq[FileStatus] = { val files = if (partitionSpec().partitionColumns.isEmpty) { - // For each of the root input paths, get the list of files inside them - rootPaths.flatMap { path => - // Make the path qualified (consistent with listLeafFiles and bulkListLeafFiles). - val fs = path.getFileSystem(hadoopConf) - val qualifiedPathPre = fs.makeQualified(path) - val qualifiedPath: Path = if (qualifiedPathPre.isRoot && !qualifiedPathPre.isAbsolute) { - // SPARK-17613: Always append `Path.SEPARATOR` to the end of parent directories, - // because the `leafFile.getParent` would have returned an absolute path with the - // separator at the end. - new Path(qualifiedPathPre, Path.SEPARATOR) - } else { - qualifiedPathPre - } - - // There are three cases possible with each path - // 1. The path is a directory and has children files in it. Then it must be present in - // leafDirToChildrenFiles as those children files will have been found as leaf files. - // Find its children files from leafDirToChildrenFiles and include them. - // 2. The path is a file, then it will be present in leafFiles. Include this path. - // 3. The path is a directory, but has no children files. Do not include this path. + if (recursive) { + leafFiles.values.toSeq Review comment: look at `PartitioningAwareFileIndex#allFiles`, this is corrected. ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org