JingsongLi commented on code in PR #1454:
URL: https://github.com/apache/incubator-paimon/pull/1454#discussion_r1251655329
##########
paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/commands/WriteIntoPaimonTable.scala:
##########
@@ -77,45 +77,56 @@ case class WriteIntoPaimonTable(_table: FileStoreTable,
saveMode: SaveMode, data
val toRow = withBucketDataEncoder.createSerializer()
val fromRow = withBucketDataEncoder.createDeserializer()
- val withAssignedBucket = if (isDynamicBucketTable) {
- val partitioned = if (primaryKeyCols.nonEmpty) {
- // Make sure that the records with the same bucket values is within a
task.
- withBucketCol.repartition(primaryKeyCols: _*)
- } else {
- withBucketCol
- }
- val numSparkPartitions = partitioned.rdd.getNumPartitions
- val dynamicBucketProcessor =
- DynamicBucketProcessor(table, rowType, bucketColIdx,
numSparkPartitions, toRow, fromRow)
-
partitioned.mapPartitions(dynamicBucketProcessor.processPartition)(withBucketDataEncoder)
- } else {
+ def commonBucketProcessed = {
val commonBucketProcessor = CommonBucketProcessor(writeBuilder,
bucketColIdx, toRow, fromRow)
withBucketCol.mapPartitions(commonBucketProcessor.processPartition)(withBucketDataEncoder)
}
- val commitMessages =
- withAssignedBucket
- .toDF()
- .repartition(partitionCols ++ Seq(col(BUCKET_COL)): _*)
- .mapPartitions {
- iter =>
- val write = writeBuilder.newWrite()
- write.withIOManager(createIOManager)
- try {
- iter.foreach {
- row =>
- val bucket = row.getInt(bucketColIdx)
- val bucketColDropped = originFromRow(toRow(row))
- write.write(new DynamicBucketRow(new SparkRow(rowType,
bucketColDropped), bucket))
- }
- val serializer = new CommitMessageSerializer
-
write.prepareCommit().asScala.map(serializer.serialize).toIterator
- } finally {
- write.close()
+ def repartition(ds: Dataset[Row]) = {
Review Comment:
Suggest renaming this method to `repartitionByBucket`, since it specifically repartitions by the partition columns plus the bucket column.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]