This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 6d22ee3  [SPARK-27136][SQL] Remove data source option check_files_exist
6d22ee3 is described below

commit 6d22ee3969597ed5e38333d5b6f0f891a859594e
Author: Gengliang Wang <[email protected]>
AuthorDate: Fri Mar 15 10:19:26 2019 +0800

    [SPARK-27136][SQL] Remove data source option check_files_exist
    
    ## What changes were proposed in this pull request?
    
    The data source option check_files_exist was introduced in #23383, when
    the file source V2 framework was implemented. In that PR, FileIndex was
    created as a member of FileTable, so that partition pruning (as in
    0f9fcab) could be implemented in the future. At that time a `FileIndex`
    was always created, even for file writes, so the option was needed to
    decide whether to check file existence.
    
    After https://github.com/apache/spark/pull/23774, the option is no longer
    needed, since DataFrame writes no longer create an unnecessary FileIndex.
    This PR removes the option.
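    
    As a hypothetical sketch (simplified names, and a plain Scala Map in
    place of Spark's CaseInsensitiveStringMap), the removed pattern looked
    roughly like this: the writer injected an internal flag into the
    user-facing options map, and FileTable later had to read it back and
    trust that it was always present.
    
    ```scala
    // Before: an internal flag smuggled through the user-visible options.
    val optionsBefore: Map[String, String] =
      Map("path" -> "/tmp/data", "check_files_exist" -> "false")
    
    // FileTable had to parse the flag back out; a missing key would throw.
    val checkFilesExist: Boolean = optionsBefore("check_files_exist").toBoolean
    
    // After: the flag is gone, and file existence is always checked when a
    // FileIndex is actually built.
    val optionsAfter: Map[String, String] = Map("path" -> "/tmp/data")
    ```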
    
    ## How was this patch tested?
    
    Unit test.
    
    Closes #24069 from gengliangwang/removeOptionCheckFilesExist.
    
    Authored-by: Gengliang Wang <[email protected]>
    Signed-off-by: Wenchen Fan <[email protected]>
---
 sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala   | 5 ++---
 sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala   | 5 ++---
 .../org/apache/spark/sql/execution/datasources/v2/FileTable.scala    | 4 +---
 3 files changed, 5 insertions(+), 9 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
index 2cc9370..dfba12a 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
@@ -213,9 +213,8 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
         val objectMapper = new ObjectMapper()
         Some("paths" -> objectMapper.writeValueAsString(paths.toArray))
       }
-      // TODO SPARK-27113: remove this option.
-      val checkFilesExistsOpt = "check_files_exist" -> "true"
-      val finalOptions = sessionOptions ++ extraOptions.toMap ++ pathsOption + checkFilesExistsOpt
+
+      val finalOptions = sessionOptions ++ extraOptions.toMap ++ pathsOption
       val dsOptions = new CaseInsensitiveStringMap(finalOptions.asJava)
       val table = userSpecifiedSchema match {
         case Some(schema) => provider.getTable(dsOptions, schema)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
index e58225e..3c51edd 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
@@ -261,10 +261,9 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) {
       val provider = cls.getConstructor().newInstance().asInstanceOf[TableProvider]
       val sessionOptions = DataSourceV2Utils.extractSessionConfigs(
         provider, session.sessionState.conf)
-      // TODO SPARK-27113: remove this option.
-      val checkFilesExistsOption = "check_files_exist" -> "false"
-      val options = sessionOptions ++ extraOptions + checkFilesExistsOption
+      val options = sessionOptions ++ extraOptions
       val dsOptions = new CaseInsensitiveStringMap(options.asJava)
+
       provider.getTable(dsOptions) match {
         case table: SupportsBatchWrite =>
           lazy val relation = DataSourceV2Relation.create(table, dsOptions)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileTable.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileTable.scala
index 08873a3..21fb6fd 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileTable.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileTable.scala
@@ -36,10 +36,8 @@ abstract class FileTable(
   lazy val fileIndex: PartitioningAwareFileIndex = {
     val scalaMap = options.asScala.toMap
     val hadoopConf = sparkSession.sessionState.newHadoopConfWithOptions(scalaMap)
-    // This is an internal config so must be present.
-    val checkFilesExist = options.get("check_files_exist").toBoolean
     val rootPathsSpecified = DataSource.checkAndGlobPathIfNecessary(paths, hadoopConf,
-      checkEmptyGlobPath = true, checkFilesExist = checkFilesExist)
+      checkEmptyGlobPath = true, checkFilesExist = true)
     val fileStatusCache = FileStatusCache.getOrCreate(sparkSession)
     new InMemoryFileIndex(
       sparkSession, rootPathsSpecified, scalaMap, userSpecifiedSchema, fileStatusCache)


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]
