Github user steveloughran commented on a diff in the pull request: https://github.com/apache/spark/pull/22339#discussion_r221326673 --- Diff: streaming/src/main/scala/org/apache/spark/streaming/dstream/FileInputDStream.scala --- @@ -196,29 +191,29 @@ class FileInputDStream[K, V, F <: NewInputFormat[K, V]]( logDebug(s"Getting new files for time $currentTime, " + s"ignoring files older than $modTimeIgnoreThreshold") - val newFileFilter = new PathFilter { - def accept(path: Path): Boolean = isNewFile(path, currentTime, modTimeIgnoreThreshold) - } - val directoryFilter = new PathFilter { - override def accept(path: Path): Boolean = fs.getFileStatus(path).isDirectory - } - val directories = fs.globStatus(directoryPath, directoryFilter).map(_.getPath) + val directories = Option(fs.globStatus(directoryPath)).getOrElse(Array.empty[FileStatus]) + .filter(_.isDirectory) + .map(_.getPath) val newFiles = directories.flatMap(dir => - fs.listStatus(dir, newFileFilter).map(_.getPath.toString)) + fs.listStatus(dir) + .filter(isNewFile(_, currentTime, modTimeIgnoreThreshold)) + .map(_.getPath.toString)) val timeTaken = clock.getTimeMillis() - lastNewFileFindingTime - logInfo("Finding new files took " + timeTaken + " ms") - logDebug("# cached file times = " + fileToModTime.size) + logInfo(s"Finding new files took $timeTaken ms") --- End diff -- It was originally @ info, so if it it filled up logs *too much* there'd be complaints. What's important is that the time to scan is printed, either @ info or debug, so someone can see what's happening. Probably what does need logging @ warn is when the time to scan is greater than the window, or just getting close to it.
--- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org