parisni opened a new issue, #8934:
URL: https://github.com/apache/hudi/issues/8934
Hudi 0.13.1

Bulk insert with the spatial-curve sort partitioner goes well until two cases:

# When the dataframe has only one row
```python
from pyspark.sql.types import (StructType, StructField, IntegerType,
                               StringType, MapType, ArrayType, TimestampType)
tableName = 'test_hudi_spacial'
basePath = "/tmp/test/{tableName}".format(tableName=tableName)
from pyspark.sql.functions import expr

df = spark.sql(
    "select '1' as event_id, 'a' col1, 'b' col2, 'c' col3, "
    "'2' as ts, '3' as version, 'foo' as event_date"
)
##
# Create the table
##
hudi_options = {
    "hoodie.table.name": tableName,
    "hoodie.datasource.write.recordkey.field": "event_id",
    "hoodie.datasource.write.partitionpath.field": "version,event_date",
    "hoodie.datasource.write.table.name": tableName,
    "hoodie.datasource.write.operation": "bulk_insert",
    "hoodie.datasource.write.precombine.field": "ts",
    "hoodie.datasource.write.keygenerator.class": "org.apache.hudi.keygen.ComplexKeyGenerator",
    "hoodie.datasource.write.hive_style_partitioning": "true",
    "hoodie.datasource.hive_sync.enable": "false",
    "hoodie.metadata.enable": "false",
    # for the spatial curve
    "hoodie.clustering.plan.strategy.sort.columns": "col1,col2,col3",
    "hoodie.layout.optimize.build.curve.sample.size": "1",
    "hoodie.layout.optimize.curve.build.method": "sample",
    "hoodie.layout.optimize.strategy": "hilbert",
    "hoodie.bulkinsert.user.defined.partitioner.class": "org.apache.hudi.execution.bulkinsert.RowSpatialCurveSortPartitioner",
}
(df.write.format("hudi").options(**hudi_options).mode("overwrite").save(basePath))
```
This raises:
```
: java.lang.ArithmeticException: / by zero
    at org.apache.spark.sql.hudi.execution.RangeSampleSort$.$anonfun$sortDataFrameBySample$11(RangeSample.scala:337)
    at scala.collection.TraversableLike.$anonfun$map$1(TraversableLike.scala:286)
    at scala.collection.Iterator.foreach(Iterator.scala:943)
    at scala.collection.Iterator.foreach$(Iterator.scala:943)
    at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
    at scala.collection.IterableLike.foreach(IterableLike.scala:74)
    at scala.collection.IterableLike.foreach$(IterableLike.scala:73)
    at scala.collection.AbstractIterable.foreach(Iterable.scala:56)
    at scala.collection.TraversableLike.map(TraversableLike.scala:286)
    at scala.collection.TraversableLike.map$(TraversableLike.scala:279)
    at scala.collection.AbstractTraversable.map(Traversable.scala:108)
    at org.apache.spark.sql.hudi.execution.RangeSampleSort$.sortDataFrameBySample(RangeSample.scala:336)
    at org.apache.hudi.sort.SpaceCurveSortingHelper.orderDataFrameBySamplingValues(SpaceCurveSortingHelper.java:275)
    at org.apache.hudi.execution.bulkinsert.SpatialCurveSortPartitionerBase.reorder(SpatialCurveSortPartitionerBase.java:73)
    at org.apache.hudi.execution.bulkinsert.RowSpatialCurveSortPartitioner.repartitionRecords(RowSpatialCurveSortPartitioner.java:40)
    at org.apache.hudi.execution.bulkinsert.RowSpatialCurveSortPartitioner.repartitionRecords(RowSpatialCurveSortPartitioner.java:26)
    at org.apache.hudi.HoodieDatasetBulkInsertHelper$.prepareForBulkInsert(HoodieDatasetBulkInsertHelper.scala:125)
    at org.apache.hudi.HoodieSparkSqlWriter$.bulkInsertAsRow(HoodieSparkSqlWriter.scala:802)
    at org.apache.hudi.HoodieSparkSqlWriter$.write(HoodieSparkSqlWriter.scala:322)
    at org.apache.hudi.DefaultSource.createRelation(DefaultSource.scala:150)
```
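For what it's worth, the same options seem fine once the dataframe has more than one row; a sketch of that working case (the second row and its values are illustrative only):
```python
# Sketch of the working case implied above: two rows with distinct record keys,
# same hudi_options as in the repro. The second row's values are made up.
df_two_rows = spark.sql(
    "select '1' as event_id, 'a' col1, 'b' col2, 'c' col3, "
    "'2' as ts, '3' as version, 'foo' as event_date "
    "union all "
    "select '2' as event_id, 'd' col1, 'e' col2, 'f' col3, "
    "'2' as ts, '3' as version, 'foo' as event_date"
)
df_two_rows.write.format("hudi").options(**hudi_options).mode("overwrite").save(basePath)
```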
# Now, when the dataframe is empty
```python
from pyspark.sql.types import (StructType, StructField, IntegerType,
                               StringType, MapType, ArrayType, TimestampType)
tableName = 'test_hudi_spacial'
basePath = "/tmp/test/{tableName}".format(tableName=tableName)
from pyspark.sql.functions import expr

df = spark.sql(
    "select '1' as event_id, 'a' col1, 'b' col2, 'c' col3, "
    "'2' as ts, '3' as version, 'foo' as event_date"
)
##
# Create the table
##
hudi_options = {
    "hoodie.table.name": tableName,
    "hoodie.datasource.write.recordkey.field": "event_id",
    "hoodie.datasource.write.partitionpath.field": "version,event_date",
    "hoodie.datasource.write.table.name": tableName,
    "hoodie.datasource.write.operation": "bulk_insert",
    "hoodie.datasource.write.precombine.field": "ts",
    "hoodie.datasource.write.keygenerator.class": "org.apache.hudi.keygen.ComplexKeyGenerator",
    "hoodie.datasource.write.hive_style_partitioning": "true",
    "hoodie.datasource.hive_sync.enable": "false",
    "hoodie.metadata.enable": "false",
    # for the spatial curve
    "hoodie.clustering.plan.strategy.sort.columns": "col1,col2,col3",
    "hoodie.layout.optimize.build.curve.sample.size": "1",
    "hoodie.layout.optimize.curve.build.method": "sample",
    "hoodie.layout.optimize.strategy": "hilbert",
    "hoodie.bulkinsert.user.defined.partitioner.class": "org.apache.hudi.execution.bulkinsert.RowSpatialCurveSortPartitioner",
}
(df.limit(0).write.format("hudi").options(**hudi_options).mode("overwrite").save(basePath))
```
This raises:
```
: java.util.NoSuchElementException: next on empty iterator
    at scala.collection.Iterator$$anon$2.next(Iterator.scala:41)
    at scala.collection.Iterator$$anon$2.next(Iterator.scala:39)
    at scala.collection.IterableLike.head(IterableLike.scala:109)
    at scala.collection.IterableLike.head$(IterableLike.scala:108)
    at scala.collection.mutable.ArrayBuffer.scala$collection$IndexedSeqOptimized$$super$head(ArrayBuffer.scala:49)
    at scala.collection.IndexedSeqOptimized.head(IndexedSeqOptimized.scala:129)
    at scala.collection.IndexedSeqOptimized.head$(IndexedSeqOptimized.scala:129)
    at scala.collection.mutable.ArrayBuffer.head(ArrayBuffer.scala:49)
    at org.apache.spark.sql.hudi.execution.RangeSampleSort$.sortDataFrameBySample(RangeSample.scala:320)
    at org.apache.hudi.sort.SpaceCurveSortingHelper.orderDataFrameBySamplingValues(SpaceCurveSortingHelper.java:275)
    at org.apache.hudi.execution.bulkinsert.SpatialCurveSortPartitionerBase.reorder(SpatialCurveSortPartitionerBase.java:73)
    at org.apache.hudi.execution.bulkinsert.RowSpatialCurveSortPartitioner.repartitionRecords(RowSpatialCurveSortPartitioner.java:40)
    at org.apache.hudi.execution.bulkinsert.RowSpatialCurveSortPartitioner.repartitionRecords(RowSpatialCurveSortPartitioner.java:26)
    at org.apache.hudi.HoodieDatasetBulkInsertHelper$.prepareForBulkInsert(HoodieDatasetBulkInsertHelper.scala:125)
    at org.apache.hudi.HoodieSparkSqlWriter$.bulkInsertAsRow(HoodieSparkSqlWriter.scala:802)
    at org.apache.hudi.HoodieSparkSqlWriter$.write(HoodieSparkSqlWriter.scala:322)
    at org.apache.hudi.DefaultSource.createRelation(DefaultSource.scala:150)
    at org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand.run(SaveIntoDataSourceCommand.scala:45)
    at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:75)
    at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:73)
    at org.apache.spark.sql.execution.command.ExecutedCommandExec.executeCollect(commands.scala:84)
    at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:110)
    at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:103)
    at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:163)
    at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:90)
    at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
```
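As a caller-side workaround, a sketch of what I can do for now (assuming that dropping the spatial-curve options simply falls back to the plain bulk_insert layout; the `write_hudi` helper and its row threshold are hypothetical, not part of Hudi):
```python
# Hypothetical guard, not a fix in Hudi itself: skip the spatial-curve sort
# options when the dataframe has fewer than two rows, so the write falls back
# to the default bulk_insert path instead of failing.
def write_hudi(df, base_path, options, min_rows_for_curve=2):
    opts = dict(options)
    if df.limit(min_rows_for_curve).count() < min_rows_for_curve:
        # drop the spatial-curve related options (assumed to be a safe fallback)
        for key in (
            "hoodie.bulkinsert.user.defined.partitioner.class",
            "hoodie.layout.optimize.strategy",
            "hoodie.layout.optimize.curve.build.method",
            "hoodie.layout.optimize.build.curve.sample.size",
        ):
            opts.pop(key, None)
    df.write.format("hudi").options(**opts).mode("overwrite").save(base_path)

# e.g. the empty-dataframe case from above no longer hits the sampler
write_hudi(df.limit(0), basePath, hudi_options)
```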