This is an automated email from the ASF dual-hosted git repository.
yamamuro pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 0b237bd [SPARK-31292][CORE][SQL] Replace toSet.toSeq with distinct for readability
0b237bd is described below
commit 0b237bd615da4b2c2b781e72af4ad3a4f2951444
Author: Kengo Seki <[email protected]>
AuthorDate: Sun Mar 29 08:48:08 2020 +0900
[SPARK-31292][CORE][SQL] Replace toSet.toSeq with distinct for readability
### What changes were proposed in this pull request?
This PR replaces calls of the form `toSet.toSeq` with `distinct`.
### Why are the changes needed?
`toSet.toSeq` is intended to make a sequence's elements unique, but it is a bit verbose.
Using `distinct` instead is easier to understand and improves readability.
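A minimal sketch (not part of this patch; the sample values are hypothetical) contrasting the two spellings. One behavioral note: `distinct` keeps first-occurrence order, whereas `toSet.toSeq` makes no ordering guarantee:
```scala
val hosts = Seq("host1", "host2", "host1", "host3")  // hypothetical input

// Before: deduplicate by round-tripping through a Set.
// The element order of the result depends on the Set implementation.
val viaSet: Seq[String] = hosts.toSet.toSeq

// After: a single call that also keeps first-occurrence order.
val viaDistinct: Seq[String] = hosts.distinct
// viaDistinct == Seq("host1", "host2", "host3")
```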
### Does this PR introduce any user-facing change?
No
### How was this patch tested?
Tested with the existing unit tests and found no problems.
Closes #28062 from sekikn/SPARK-31292.
Authored-by: Kengo Seki <[email protected]>
Signed-off-by: Takeshi Yamamuro <[email protected]>
---
core/src/main/scala/org/apache/spark/resource/ResourceUtils.scala | 2 +-
core/src/main/scala/org/apache/spark/scheduler/ResultTask.scala | 2 +-
core/src/main/scala/org/apache/spark/scheduler/ShuffleMapTask.scala | 2 +-
core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala | 2 +-
.../test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala | 2 +-
sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala | 2 +-
6 files changed, 6 insertions(+), 6 deletions(-)
diff --git a/core/src/main/scala/org/apache/spark/resource/ResourceUtils.scala b/core/src/main/scala/org/apache/spark/resource/ResourceUtils.scala
index 36ef906..162f090 100644
--- a/core/src/main/scala/org/apache/spark/resource/ResourceUtils.scala
+++ b/core/src/main/scala/org/apache/spark/resource/ResourceUtils.scala
@@ -150,7 +150,7 @@ private[spark] object ResourceUtils extends Logging {
def listResourceIds(sparkConf: SparkConf, componentName: String): Seq[ResourceID] = {
sparkConf.getAllWithPrefix(s"$componentName.$RESOURCE_PREFIX.").map { case (key, _) =>
key.substring(0, key.indexOf('.'))
- }.toSet.toSeq.map(name => new ResourceID(componentName, name))
+ }.distinct.map(name => new ResourceID(componentName, name))
}
def parseAllResourceRequests(
diff --git a/core/src/main/scala/org/apache/spark/scheduler/ResultTask.scala b/core/src/main/scala/org/apache/spark/scheduler/ResultTask.scala
index 857c89d..15f2161 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/ResultTask.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/ResultTask.scala
@@ -69,7 +69,7 @@ private[spark] class ResultTask[T, U](
with Serializable {
@transient private[this] val preferredLocs: Seq[TaskLocation] = {
- if (locs == null) Nil else locs.toSet.toSeq
+ if (locs == null) Nil else locs.distinct
}
override def runTask(context: TaskContext): U = {
diff --git a/core/src/main/scala/org/apache/spark/scheduler/ShuffleMapTask.scala b/core/src/main/scala/org/apache/spark/scheduler/ShuffleMapTask.scala
index 4c0c30a..a0ba920 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/ShuffleMapTask.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/ShuffleMapTask.scala
@@ -71,7 +71,7 @@ private[spark] class ShuffleMapTask(
}
@transient private val preferredLocs: Seq[TaskLocation] = {
- if (locs == null) Nil else locs.toSet.toSeq
+ if (locs == null) Nil else locs.distinct
}
override def runTask(context: TaskContext): MapStatus = {
diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala
index 7e2fbb4..f0f84fe 100644
--- a/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala
+++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala
@@ -487,7 +487,7 @@ private[spark] class TaskSchedulerImpl(
newExecAvail = true
}
}
- val hosts = offers.map(_.host).toSet.toSeq
+ val hosts = offers.map(_.host).distinct
for ((host, Some(rack)) <- hosts.zip(getRacksForHosts(hosts))) {
hostsByRack.getOrElseUpdate(rack, new HashSet[String]()) += host
}
diff --git a/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala
index 9ee84a8..b9a11e7 100644
--- a/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala
+++ b/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala
@@ -761,7 +761,7 @@ class TaskSchedulerImplSuite extends SparkFunSuite with LocalSparkContext with B
// that are explicitly blacklisted, plus those that have *any* executors blacklisted.
val nodesForBlacklistedExecutors = offers.filter { offer =>
execBlacklist.contains(offer.executorId)
- }.map(_.host).toSet.toSeq
+ }.map(_.host).distinct
val nodesWithAnyBlacklisting = (nodeBlacklist ++ nodesForBlacklistedExecutors).toSet
// Similarly, figure out which executors have any blacklisting. This means all executors
// that are explicitly blacklisted, plus all executors on nodes that are blacklisted.
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
index e3c6388..e1e3e8e 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
@@ -2455,7 +2455,7 @@ class Dataset[T] private[sql](
def dropDuplicates(colNames: Seq[String]): Dataset[T] = withTypedPlan {
val resolver = sparkSession.sessionState.analyzer.resolver
val allColumns = queryExecution.analyzed.output
- val groupCols = colNames.toSet.toSeq.flatMap { (colName: String) =>
+ val groupCols = colNames.distinct.flatMap { (colName: String) =>
// It is possibly there are more than one columns with the same name,
// so we call filter instead of find.
val cols = allColumns.filter(col => resolver(col.name, colName))
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]