GitHub user cloud-fan commented on a diff in the pull request:

    https://github.com/apache/spark/pull/20915#discussion_r193260674
  
    --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala ---
    @@ -371,14 +387,27 @@ case class FileSourceScanExec(
               val hosts = getBlockHosts(getBlockLocations(f), 0, f.getLen)
              PartitionedFile(p.values, f.getPath.toUri.toString, 0, f.getLen, hosts)
             }
    -      }.groupBy { f =>
    -        BucketingUtils
    -          .getBucketId(new Path(f.filePath).getName)
    -          .getOrElse(sys.error(s"Invalid bucket file ${f.filePath}"))
           }
     
    +    val prunedBucketed = if (optionalBucketSet.isDefined) {
    +      val bucketSet = optionalBucketSet.get
    +      bucketed.filter {
    +        f => bucketSet.get(
    +          BucketingUtils.getBucketId(new Path(f.filePath).getName)
    +            .getOrElse(sys.error(s"Invalid bucket file ${f.filePath}")))
    +      }
    +    } else {
    +      bucketed
    +    }
    +
    +    val filesGroupedToBuckets = prunedBucketed.groupBy { f =>
    +      BucketingUtils
    --- End diff ---
    
    Can we avoid calculating the bucket id from the file name twice?
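    
    For example, grouping before pruning would derive the bucket id from each
    file name exactly once, then drop whole groups by key. A minimal sketch,
    assuming the surrounding context from this diff (bucketed, BucketingUtils,
    and optionalBucketSet holding a bit set of selected bucket ids):
    
        // Compute the bucket id once per file while grouping.
        val filesGroupedToBuckets = bucketed.groupBy { f =>
          BucketingUtils
            .getBucketId(new Path(f.filePath).getName)
            .getOrElse(sys.error(s"Invalid bucket file ${f.filePath}"))
        }
    
        // Prune whole groups against the optional bucket set, keyed by the
        // bucket id that was already computed during grouping.
        val prunedFilesGroupedToBuckets = optionalBucketSet match {
          case Some(bucketSet) =>
            filesGroupedToBuckets.filter { case (bucketId, _) => bucketSet.get(bucketId) }
          case None =>
            filesGroupedToBuckets
        }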


---
