Github user cloud-fan commented on a diff in the pull request:
https://github.com/apache/spark/pull/21004#discussion_r181103480
--- Diff:
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala
---
@@ -140,41 +130,29 @@ case class DataSource(
* be any further inference in any triggers.
*
* @param format the file format object for this DataSource
- * @param fileStatusCache the shared cache for file statuses to speed up
listing
+ * @param optionalFileIndex optional [[FileIndex]] for getting partition schema and file list
* @return A pair of the data schema (excluding partition columns) and
the schema of the partition
* columns.
*/
private def getOrInferFileFormatSchema(
format: FileFormat,
- fileStatusCache: FileStatusCache = NoopCache): (StructType,
StructType) = {
- // the operations below are expensive therefore try not to do them if
we don't need to, e.g.,
- // in streaming mode, we have already inferred and registered
partition columns, we will
- // never have to materialize the lazy val below
- lazy val tempFileIndex = {
- val allPaths = caseInsensitiveOptions.get("path") ++ paths
- val hadoopConf = sparkSession.sessionState.newHadoopConf()
- val globbedPaths = allPaths.toSeq.flatMap { path =>
- val hdfsPath = new Path(path)
- val fs = hdfsPath.getFileSystem(hadoopConf)
- val qualified = hdfsPath.makeQualified(fs.getUri,
fs.getWorkingDirectory)
- SparkHadoopUtil.get.globPathIfNecessary(fs, qualified)
- }.toArray
- new InMemoryFileIndex(sparkSession, globbedPaths, options, None,
fileStatusCache)
- }
+ optionalFileIndex: Option[FileIndex] = None): (StructType, StructType) = {
--- End diff --
Suggested name for this parameter: `existingFileIndex`
---
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org