xushiyan commented on a change in pull request #4877:
URL: https://github.com/apache/hudi/pull/4877#discussion_r830227161



##########
File path: 
hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HoodieRealtimeInputFormatUtils.java
##########
@@ -67,41 +49,6 @@ public static boolean doesBelongToIncrementalQuery(FileSplit 
s) {
     return false;
   }
 
-  // Return parquet file with a list of log files in the same file group.
-  public static List<Pair<Option<HoodieBaseFile>, List<HoodieLogFile>>> 
groupLogsByBaseFile(Configuration conf, List<Path> partitionPaths) {

Review comment:
       Is this not used anymore, or has it been moved elsewhere?

##########
File path: 
hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/MergeOnReadIncrementalRelation.scala
##########
@@ -20,65 +20,134 @@ package org.apache.hudi
 import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.{GlobPattern, Path}
 import org.apache.hudi.HoodieBaseRelation.createBaseFileReader
-import org.apache.hudi.common.model.HoodieRecord
+import org.apache.hudi.HoodieConversionUtils.toScalaOption
+import org.apache.hudi.common.fs.FSUtils.getRelativePartitionPath
+import org.apache.hudi.common.model.{FileSlice, HoodieRecord}
 import org.apache.hudi.common.table.HoodieTableMetaClient
+import org.apache.hudi.common.table.timeline.{HoodieInstant, HoodieTimeline}
 import org.apache.hudi.common.table.view.HoodieTableFileSystemView
+import org.apache.hudi.common.util.StringUtils
 import org.apache.hudi.exception.HoodieException
 import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils.{getCommitMetadata, 
getWritePartitionPaths, listAffectedFilesForCommits}
-import 
org.apache.hudi.hadoop.utils.HoodieRealtimeRecordReaderUtils.getMaxCompactionMemoryInBytes
-import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.{Row, SQLContext}
-import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.execution.datasources.PartitionedFile
+import org.apache.spark.sql.SQLContext
+import org.apache.spark.sql.catalyst.expressions.Expression
 import org.apache.spark.sql.sources._
 import org.apache.spark.sql.types.StructType
 
-import scala.collection.JavaConversions._
+import scala.collection.JavaConverters._
+import scala.collection.immutable
 
 /**
- * Experimental.
- * Relation, that implements the Hoodie incremental view for Merge On Read 
table.
- *
+ * @Experimental
  */
 class MergeOnReadIncrementalRelation(sqlContext: SQLContext,

Review comment:
       Are the removed lines in this class mostly due to common logic being 
extracted into HoodieBaseRelation? It's hard to tell from the diff.

##########
File path: 
hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/HoodieHadoopFSUtils.scala
##########
@@ -0,0 +1,370 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark
+
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs.viewfs.ViewFileSystem
+import org.apache.hadoop.fs._
+import org.apache.hadoop.hdfs.DistributedFileSystem
+import org.apache.spark.internal.Logging
+import org.apache.spark.metrics.source.HiveCatalogMetrics
+import org.apache.spark.util.SerializableConfiguration
+
+import java.io.FileNotFoundException
+import scala.collection.mutable
+
+/**
+ * NOTE: This method class is replica of HadoopFSUtils from Spark 3.2.1, with 
the following adjustments
+ *
+ *    - Filtering out of the listed files is adjusted to include files 
starting w/ "." (to include Hoodie Delta Log
+ *    files)
+ */
+object HoodieHadoopFSUtils extends Logging {
+  /**
+   * Lists a collection of paths recursively. Picks the listing strategy 
adaptively depending
+   * on the number of paths to list.
+   *
+   * This may only be called on the driver.
+   *
+   * @param sc                   Spark context used to run parallel listing.
+   * @param paths                Input paths to list
+   * @param hadoopConf           Hadoop configuration
+   * @param filter               Path filter used to exclude leaf files from 
result
+   * @param ignoreMissingFiles   Ignore missing files that happen during 
recursive listing
+   *                             (e.g., due to race conditions)
+   * @param ignoreLocality       Whether to fetch data locality info when 
listing leaf files. If false,
+   *                             this will return `FileStatus` without 
`BlockLocation` info.
+   * @param parallelismThreshold The threshold to enable parallelism. If the 
number of input paths
+   *                             is smaller than this value, this will 
fallback to use
+   *                             sequential listing.
+   * @param parallelismMax       The maximum parallelism for listing. If the 
number of input paths is
+   *                             larger than this value, parallelism will be 
throttled to this value
+   *                             to avoid generating too many tasks.
+   * @return for each input path, the set of discovered files for the path
+   */
+  def parallelListLeafFiles(sc: SparkContext,
+                            paths: Seq[Path],
+                            hadoopConf: Configuration,
+                            filter: PathFilter,
+                            ignoreMissingFiles: Boolean,
+                            ignoreLocality: Boolean,
+                            parallelismThreshold: Int,
+                            parallelismMax: Int): Seq[(Path, Seq[FileStatus])] 
= {
+    parallelListLeafFilesInternal(sc, paths, hadoopConf, filter, isRootLevel = 
true,
+      ignoreMissingFiles, ignoreLocality, parallelismThreshold, parallelismMax)
+  }
+
+  // scalastyle:off parameter.number
+  private def parallelListLeafFilesInternal(sc: SparkContext,
+                                            paths: Seq[Path],
+                                            hadoopConf: Configuration,
+                                            filter: PathFilter,
+                                            isRootLevel: Boolean,
+                                            ignoreMissingFiles: Boolean,
+                                            ignoreLocality: Boolean,
+                                            parallelismThreshold: Int,
+                                            parallelismMax: Int): Seq[(Path, 
Seq[FileStatus])] = {
+
+    // Short-circuits parallel listing when serial listing is likely to be 
faster.
+    if (paths.size <= parallelismThreshold) {
+      // scalastyle:off return
+      return paths.map { path =>
+        val leafFiles = listLeafFiles(
+          path,
+          hadoopConf,
+          filter,
+          Some(sc),
+          ignoreMissingFiles = ignoreMissingFiles,
+          ignoreLocality = ignoreLocality,
+          isRootPath = isRootLevel,
+          parallelismThreshold = parallelismThreshold,
+          parallelismMax = parallelismMax)
+        (path, leafFiles)
+      }
+      // scalastyle:on return
+    }
+
+    logInfo(s"Listing leaf files and directories in parallel under 
${paths.length} paths." +
+      s" The first several paths are: ${paths.take(10).mkString(", ")}.")
+    HiveCatalogMetrics.incrementParallelListingJobCount(1)
+
+    val serializableConfiguration = new SerializableConfiguration(hadoopConf)
+    val serializedPaths = paths.map(_.toString)
+
+    // Set the number of parallelism to prevent following file listing from 
generating many tasks
+    // in case of large #defaultParallelism.
+    val numParallelism = Math.min(paths.size, parallelismMax)
+
+    val previousJobDescription = 
sc.getLocalProperty(SparkContext.SPARK_JOB_DESCRIPTION)
+    val statusMap = try {
+      val description = paths.size match {
+        case 0 =>
+          "Listing leaf files and directories 0 paths"
+        case 1 =>
+          s"Listing leaf files and directories for 1 path:<br/>${paths(0)}"
+        case s =>
+          s"Listing leaf files and directories for $s paths:<br/>${paths(0)}, 
..."
+      }
+      sc.setJobDescription(description)
+      sc
+        .parallelize(serializedPaths, numParallelism)
+        .mapPartitions { pathStrings =>
+          val hadoopConf = serializableConfiguration.value
+          pathStrings.map(new Path(_)).toSeq.map { path =>
+            val leafFiles = listLeafFiles(
+              path = path,
+              hadoopConf = hadoopConf,
+              filter = filter,
+              contextOpt = None, // Can't execute parallel scans on workers
+              ignoreMissingFiles = ignoreMissingFiles,
+              ignoreLocality = ignoreLocality,
+              isRootPath = isRootLevel,
+              parallelismThreshold = Int.MaxValue,
+              parallelismMax = 0)
+            (path, leafFiles)
+          }.iterator
+        }.map { case (path, statuses) =>
+        val serializableStatuses = statuses.map { status =>
+          // Turn FileStatus into SerializableFileStatus so we can send it 
back to the driver
+          val blockLocations = status match {
+            case f: LocatedFileStatus =>
+              f.getBlockLocations.map { loc =>
+                SerializableBlockLocation(
+                  loc.getNames,
+                  loc.getHosts,
+                  loc.getOffset,
+                  loc.getLength)
+              }
+
+            case _ =>
+              Array.empty[SerializableBlockLocation]
+          }
+
+          SerializableFileStatus(
+            status.getPath.toString,
+            status.getLen,
+            status.isDirectory,
+            status.getReplication,
+            status.getBlockSize,
+            status.getModificationTime,
+            status.getAccessTime,
+            blockLocations)
+        }
+        (path.toString, serializableStatuses)
+      }.collect()
+    } finally {
+      sc.setJobDescription(previousJobDescription)
+    }
+
+    // turn SerializableFileStatus back to Status
+    statusMap.map { case (path, serializableStatuses) =>
+      val statuses = serializableStatuses.map { f =>
+        val blockLocations = f.blockLocations.map { loc =>
+          new BlockLocation(loc.names, loc.hosts, loc.offset, loc.length)
+        }
+        new LocatedFileStatus(
+          new FileStatus(
+            f.length, f.isDir, f.blockReplication, f.blockSize, 
f.modificationTime,
+            new Path(f.path)),
+          blockLocations)
+      }
+      (new Path(path), statuses)
+    }
+  }
+  // scalastyle:on parameter.number
+
+  // scalastyle:off parameter.number
+  /**
+   * Lists a single filesystem path recursively. If a `SparkContext` object is 
specified, this
+   * function may launch Spark jobs to parallelize listing based on 
`parallelismThreshold`.
+   *
+   * If sessionOpt is None, this may be called on executors.
+   *
+   * @return all children of path that match the specified filter.
+   */
+  private def listLeafFiles(path: Path,
+                            hadoopConf: Configuration,
+                            filter: PathFilter,
+                            contextOpt: Option[SparkContext],
+                            ignoreMissingFiles: Boolean,
+                            ignoreLocality: Boolean,
+                            isRootPath: Boolean,
+                            parallelismThreshold: Int,
+                            parallelismMax: Int): Seq[FileStatus] = {
+
+    logTrace(s"Listing $path")
+    val fs = path.getFileSystem(hadoopConf)
+
+    // Note that statuses only include FileStatus for the files and dirs 
directly under path,
+    // and does not include anything else recursively.
+    val statuses: Array[FileStatus] = try {
+      fs match {
+        // DistributedFileSystem overrides listLocatedStatus to make 1 single 
call to namenode
+        // to retrieve the file status with the file block location. The 
reason to still fallback
+        // to listStatus is because the default implementation would 
potentially throw a
+        // FileNotFoundException which is better handled by doing the lookups 
manually below.
+        case (_: DistributedFileSystem | _: ViewFileSystem) if !ignoreLocality 
=>
+          val remoteIter = fs.listLocatedStatus(path)
+          new Iterator[LocatedFileStatus]() {
+            def next(): LocatedFileStatus = remoteIter.next
+
+            def hasNext(): Boolean = remoteIter.hasNext
+          }.toArray
+        case _ => fs.listStatus(path)
+      }
+    } catch {
+      // If we are listing a root path for SQL (e.g. a top level directory of 
a table), we need to
+      // ignore FileNotFoundExceptions during this root level of the listing 
because
+      //
+      //  (a) certain code paths might construct an InMemoryFileIndex with 
root paths that
+      //      might not exist (i.e. not all callers are guaranteed to have 
checked
+      //      path existence prior to constructing InMemoryFileIndex) and,
+      //  (b) we need to ignore deleted root paths during REFRESH TABLE, 
otherwise we break
+      //      existing behavior and break the ability drop SessionCatalog 
tables when tables'
+      //      root directories have been deleted (which breaks a number of 
Spark's own tests).
+      //
+      // If we are NOT listing a root path then a FileNotFoundException here 
means that the
+      // directory was present in a previous level of file listing but is 
absent in this
+      // listing, likely indicating a race condition (e.g. concurrent table 
overwrite or S3
+      // list inconsistency).
+      //
+      // The trade-off in supporting existing behaviors / use-cases is that we 
won't be
+      // able to detect race conditions involving root paths being deleted 
during
+      // InMemoryFileIndex construction. However, it's still a net improvement 
to detect and
+      // fail-fast on the non-root cases. For more info see the SPARK-27676 
review discussion.
+      case _: FileNotFoundException if isRootPath || ignoreMissingFiles =>
+        logWarning(s"The directory $path was not found. Was it deleted very 
recently?")
+        Array.empty[FileStatus]
+    }
+
+    val filteredStatuses =
+      statuses.filterNot(status => 
shouldFilterOutPathName(status.getPath.getName))
+
+    val allLeafStatuses = {
+      val (dirs, topLevelFiles) = filteredStatuses.partition(_.isDirectory)
+      val nestedFiles: Seq[FileStatus] = contextOpt match {
+        case Some(context) if dirs.size > parallelismThreshold =>
+          parallelListLeafFilesInternal(
+            context,
+            dirs.map(_.getPath),
+            hadoopConf = hadoopConf,
+            filter = filter,
+            isRootLevel = false,
+            ignoreMissingFiles = ignoreMissingFiles,
+            ignoreLocality = ignoreLocality,
+            parallelismThreshold = parallelismThreshold,
+            parallelismMax = parallelismMax
+          ).flatMap(_._2)
+        case _ =>
+          dirs.flatMap { dir =>
+            listLeafFiles(
+              path = dir.getPath,
+              hadoopConf = hadoopConf,
+              filter = filter,
+              contextOpt = contextOpt,
+              ignoreMissingFiles = ignoreMissingFiles,
+              ignoreLocality = ignoreLocality,
+              isRootPath = false,
+              parallelismThreshold = parallelismThreshold,
+              parallelismMax = parallelismMax)
+          }
+      }
+      val allFiles = topLevelFiles ++ nestedFiles
+      if (filter != null) allFiles.filter(f => filter.accept(f.getPath)) else 
allFiles
+    }
+
+    val missingFiles = mutable.ArrayBuffer.empty[String]
+    val resolvedLeafStatuses = allLeafStatuses.flatMap {
+      case f: LocatedFileStatus =>
+        Some(f)
+
+      // NOTE:
+      //
+      // - Although S3/S3A/S3N file system can be quite slow for remote file 
metadata
+      //   operations, calling `getFileBlockLocations` does no harm here since 
these file system
+      //   implementations don't actually issue RPC for this method.
+      //
+      // - Here we are calling `getFileBlockLocations` in a sequential manner, 
but it should not
+      //   be a big deal since we always use to `parallelListLeafFiles` when 
the number of
+      //   paths exceeds threshold.
+      case f if !ignoreLocality =>
+        // The other constructor of LocatedFileStatus will call 
FileStatus.getPermission(),
+        // which is very slow on some file system (RawLocalFileSystem, which 
is launch a
+        // subprocess and parse the stdout).
+        try {
+          val locations = fs.getFileBlockLocations(f, 0, f.getLen).map { loc =>
+            // Store BlockLocation objects to consume less memory
+            if (loc.getClass == classOf[BlockLocation]) {
+              loc
+            } else {
+              new BlockLocation(loc.getNames, loc.getHosts, loc.getOffset, 
loc.getLength)
+            }
+          }
+          val lfs = new LocatedFileStatus(f.getLen, f.isDirectory, 
f.getReplication, f.getBlockSize,
+            f.getModificationTime, 0, null, null, null, null, f.getPath, 
locations)
+          if (f.isSymlink) {
+            lfs.setSymlink(f.getSymlink)
+          }
+          Some(lfs)
+        } catch {
+          case _: FileNotFoundException if ignoreMissingFiles =>
+            missingFiles += f.getPath.toString
+            None
+        }
+
+      case f => Some(f)
+    }
+
+    if (missingFiles.nonEmpty) {
+      logWarning(
+        s"the following files were missing during file scan:\n  
${missingFiles.mkString("\n  ")}")
+    }
+
+    resolvedLeafStatuses
+  }
+  // scalastyle:on parameter.number
+
+  /** A serializable variant of HDFS's BlockLocation. This is required by 
Hadoop 2.7. */
+  private case class SerializableBlockLocation(names: Array[String],
+                                               hosts: Array[String],
+                                               offset: Long,
+                                               length: Long)
+
+  /** A serializable variant of HDFS's FileStatus. This is required by Hadoop 
2.7. */
+  private case class SerializableFileStatus(path: String,
+                                            length: Long,
+                                            isDir: Boolean,
+                                            blockReplication: Short,
+                                            blockSize: Long,
+                                            modificationTime: Long,
+                                            accessTime: Long,
+                                            blockLocations: 
Array[SerializableBlockLocation])
+
+  /** Checks if we should filter out this path name. */
+  def shouldFilterOutPathName(pathName: String): Boolean = {
+    // We filter follow paths:
+    // 1. everything that starts with _ and ., except _common_metadata and 
_metadata
+    // because Parquet needs to find those metadata files from leaf files 
returned by this method.
+    // We should refactor this logic to not mix metadata files with data files.
+    // 2. everything that ends with `._COPYING_`, because this is a 
intermediate state of file. we
+    // should skip this file in case of double reading.
+    val exclude = (pathName.startsWith("_") && !pathName.contains("=")) || 
pathName.endsWith("._COPYING_")
+    val include = pathName.startsWith("_common_metadata") || 
pathName.startsWith("_metadata")
+    exclude && !include

Review comment:
       Should some utils from MDT be the source of truth for these rules 
instead? The Spark side does not own these rules, and this would also avoid 
copying them across different Spark versions.

##########
File path: 
hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/MergeOnReadSnapshotRelation.scala
##########
@@ -41,43 +43,28 @@ case class HoodieMergeOnReadFileSplit(dataFile: 
Option[PartitionedFile],
                                       latestCommit: String,
                                       tablePath: String,
                                       maxCompactionMemoryInBytes: Long,
-                                      mergeType: String)
+                                      mergeType: String) extends 
HoodieFileSplit
 
 class MergeOnReadSnapshotRelation(sqlContext: SQLContext,

Review comment:
       Same question as above. It's hard to examine line by line what has been 
extracted out, but the overall direction looks good.

##########
File path: 
hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/spark/execution/datasources/HoodieInMemoryFileIndex.scala
##########
@@ -0,0 +1,102 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.execution.datasources
+
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs.{FileStatus, Path, PathFilter}
+import org.apache.hadoop.mapred.{FileInputFormat, JobConf}
+import org.apache.spark.HoodieHadoopFSUtils
+import org.apache.spark.metrics.source.HiveCatalogMetrics
+import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.execution.datasources._
+import org.apache.spark.sql.types.StructType
+
+import scala.collection.mutable
+import scala.collection.mutable.ArrayBuffer
+
+class HoodieInMemoryFileIndex(sparkSession: SparkSession,
+                              rootPathsSpecified: Seq[Path],
+                              parameters: Map[String, String],
+                              userSpecifiedSchema: Option[StructType],
+                              fileStatusCache: FileStatusCache = NoopCache)
+  extends InMemoryFileIndex(sparkSession, rootPathsSpecified, parameters, 
userSpecifiedSchema, fileStatusCache) {
+
+  /**
+   * List leaf files of given paths. This method will submit a Spark job to do 
parallel
+   * listing whenever there is a path having more files than the parallel 
partition discovery threshold.
+   *
+   * This is publicly visible for testing.
+   *
+   * NOTE: This method replicates the one it overrides, however it uses custom 
method to run parallel
+   *       listing that accepts files starting with "."
+   */
+  override def listLeafFiles(paths: Seq[Path]): 
mutable.LinkedHashSet[FileStatus] = {
+    val startTime = System.nanoTime()
+    val output = mutable.LinkedHashSet[FileStatus]()
+    val pathsToFetch = mutable.ArrayBuffer[Path]()
+    for (path <- paths) {
+      fileStatusCache.getLeafFiles(path) match {
+        case Some(files) =>
+          HiveCatalogMetrics.incrementFileCacheHits(files.length)
+          output ++= files
+        case None =>
+          pathsToFetch += path
+      }
+      () // for some reasons scalac 2.12 needs this; return type doesn't matter
+    }
+    val filter = FileInputFormat.getInputPathFilter(new JobConf(hadoopConf, 
this.getClass))
+    val discovered = bulkListLeafFiles(sparkSession, pathsToFetch, filter, 
hadoopConf)
+
+    discovered.foreach { case (path, leafFiles) =>
+      HiveCatalogMetrics.incrementFilesDiscovered(leafFiles.size)
+      fileStatusCache.putLeafFiles(path, leafFiles.toArray)
+      output ++= leafFiles
+    }
+
+    logInfo(s"It took ${(System.nanoTime() - startTime) / (1000 * 1000)} ms to 
list leaf files" +
+      s" for ${paths.length} paths.")
+
+    output
+  }
+
+  protected def bulkListLeafFiles(sparkSession: SparkSession, paths: 
ArrayBuffer[Path], filter: PathFilter, hadoopConf: Configuration): Seq[(Path, 
Seq[FileStatus])] = {
+    HoodieHadoopFSUtils.parallelListLeafFiles(
+      sc = sparkSession.sparkContext,
+      paths = paths,
+      hadoopConf = hadoopConf,
+      filter = new PathFilterWrapper(filter),
+      ignoreMissingFiles = sparkSession.sessionState.conf.ignoreMissingFiles,
+      // NOTE: We're disabling fetching Block Info to speed up file listing

Review comment:
       We may need a special token here to indicate the parts changed in Hudi's 
codebase, for easier maintenance. `// NOTE:` is not special enough — what about 
`// HUDI NOTE:`? This could apply to any other incoming code variation as well.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to