parisni opened a new issue #3766:
URL: https://github.com/apache/hudi/issues/3766
> hudi 0.9.0
When a null value is present in the clustering sort column, this does not
work as expected.
```
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
sc.setLogLevel("WARN")
tableName = "test_hudi_pyspark_local"
basePath = f"/tmp/{tableName}"
data = [
(1, "f5c2ebfd-f57b-4ff3-ac5c-f30674037b21", "A", "BC", "C"),
(2, "f5c2ebfd-f57b-4ff3-ac5c-f30674037b22", "A", "BC", "C"),
(3, "f5c2ebfd-f57b-4ff3-ac5c-f30674037b23", "A", "BC", "C"),
(4, None, "A", "BC", "C"),
]
schema = StructType(
[
StructField("event_id", IntegerType(), True),
StructField("user_id", StringType(), True),
StructField("kafka_date", StringType(), True),
StructField("real_date", StringType(), True),
StructField("kafka_partition", StringType(), True),
]
)
df = spark.createDataFrame(data=data, schema=schema)
hudi_options = {
"hoodie.table.name": tableName,
"hoodie.datasource.write.recordkey.field": "event_id",
"hoodie.datasource.write.partitionpath.field": "real_date:simple",
"hoodie.datasource.write.table.name": tableName,
"hoodie.datasource.write.operation": "insert",
"hoodie.datasource.write.precombine.field": "kafka_partition",
"hoodie.datasource.write.keygenerator.class":
"org.apache.hudi.keygen.CustomKeyGenerator",
"hoodie.upsert.shuffle.parallelism": 2,
"hoodie.insert.shuffle.parallelism": 2,
"index.global.enabled": "true",
"hoodie.metadata.enable": "true",
"hoodie.combine.before.upsert": "false",
"hoodie.parquet.small.file.limit": "0",
"hoodie.clustering.inline": "true",
"hoodie.clustering.inline.max.commits": "1",
"hoodie.clustering.plan.strategy.target.file.max.bytes": "1073741824",
"hoodie.clustering.plan.strategy.small.file.limit": "629145600",
"hoodie.clustering.plan.strategy.sort.columns": "user_id",
}
(df.write.format("hudi").options(**hudi_options).mode("overwrite").save(basePath))
```
```
21/10/08 11:28:16 ERROR Executor: Exception in task 0.0 in stage 1822.0 (TID
3447)
java.lang.NullPointerException
at
org.sparkproject.guava.base.Preconditions.checkNotNull(Preconditions.java:191)
at
org.sparkproject.guava.collect.NaturalOrdering.compare(NaturalOrdering.java:35)
at
org.sparkproject.guava.collect.NaturalOrdering.compare(NaturalOrdering.java:28)
at
scala.math.LowPriorityOrderingImplicits$$anon$4.compare(Ordering.scala:156)
at
org.apache.spark.util.collection.WritablePartitionedPairCollection$.$anonfun$partitionKeyComparator$1(WritablePartitionedPairCollection.scala:81)
at
org.apache.spark.util.collection.TimSort.countRunAndMakeAscending(TimSort.java:275)
at org.apache.spark.util.collection.TimSort.sort(TimSort.java:128)
at org.apache.spark.util.collection.Sorter.sort(Sorter.scala:37)
at
org.apache.spark.util.collection.PartitionedPairBuffer.partitionedDestructiveSortedIterator(PartitionedPairBuffer.scala:79)
at
org.apache.spark.util.collection.ExternalSorter.partitionedIterator(ExternalSorter.scala:658)
at
org.apache.spark.util.collection.ExternalSorter.iterator(ExternalSorter.scala:672)
at
org.apache.spark.shuffle.BlockStoreShuffleReader.read(BlockStoreShuffleReader.scala:138)
at org.apache.spark.rdd.ShuffledRDD.compute(ShuffledRDD.scala:106)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
at
org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
at
org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
at
org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
at org.apache.spark.rdd.UnionRDD.compute(UnionRDD.scala:106)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
at org.apache.spark.rdd.RDD.$anonfun$getOrCompute$1(RDD.scala:386)
at
org.apache.spark.storage.BlockManager.$anonfun$doPutIterator$1(BlockManager.scala:1440)
at
org.apache.spark.storage.BlockManager.org$apache$spark$storage$BlockManager$$doPut(BlockManager.scala:1350)
at
org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:1414)
at
org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:1237)
at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:384)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:335)
at
org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:131)
at
org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
at
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
21/10/08 11:28:16 WARN TaskSetManager: Lost task 0.0 in stage 1822.0 (TID
3447) (C7D8ZW2 executor driver): java.lang.NullPointerException
at
org.sparkproject.guava.base.Preconditions.checkNotNull(Preconditions.java:191)
at
org.sparkproject.guava.collect.NaturalOrdering.compare(NaturalOrdering.java:35)
at
org.sparkproject.guava.collect.NaturalOrdering.compare(NaturalOrdering.java:28)
at
scala.math.LowPriorityOrderingImplicits$$anon$4.compare(Ordering.scala:156)
at
org.apache.spark.util.collection.WritablePartitionedPairCollection$.$anonfun$partitionKeyComparator$1(WritablePartitionedPairCollection.scala:81)
at
org.apache.spark.util.collection.TimSort.countRunAndMakeAscending(TimSort.java:275)
at org.apache.spark.util.collection.TimSort.sort(TimSort.java:128)
at org.apache.spark.util.collection.Sorter.sort(Sorter.scala:37)
at
org.apache.spark.util.collection.PartitionedPairBuffer.partitionedDestructiveSortedIterator(PartitionedPairBuffer.scala:79)
at
org.apache.spark.util.collection.ExternalSorter.partitionedIterator(ExternalSorter.scala:658)
at
org.apache.spark.util.collection.ExternalSorter.iterator(ExternalSorter.scala:672)
at
org.apache.spark.shuffle.BlockStoreShuffleReader.read(BlockStoreShuffleReader.scala:138)
at org.apache.spark.rdd.ShuffledRDD.compute(ShuffledRDD.scala:106)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
at
org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
at
org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
at
org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
at org.apache.spark.rdd.UnionRDD.compute(UnionRDD.scala:106)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
at org.apache.spark.rdd.RDD.$anonfun$getOrCompute$1(RDD.scala:386)
at
org.apache.spark.storage.BlockManager.$anonfun$doPutIterator$1(BlockManager.scala:1440)
at
org.apache.spark.storage.BlockManager.org$apache$spark$storage$BlockManager$$doPut(BlockManager.scala:1350)
at
org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:1414)
at
org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:1237)
at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:384)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:335)
at
org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:131)
at
org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
at
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
21/10/08 11:28:16 ERROR TaskSetManager: Task 0 in stage 1822.0 failed 1
times; aborting job
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]