parisni commented on issue #9022:
URL: https://github.com/apache/hudi/issues/9022#issuecomment-1623765649
I have a reproductible example. It turns out just after compaction, the MDT
becomes not readable:
```python
from pyspark.sql.types import StructType, StructField, StringType,
IntegerType
data = [
(1, "f5c2ebfd-f57b-4ff3-ac5c-f30674037b21", "A", "BC", "C"),
(2, "f5c2ebfd-f57b-4ff3-ac5c-f30674037b22", "A", "BC", "C"),
(3, "f5c2ebfd-f57b-4ff3-ac5c-f30674037b21", "A", "BC", "C"),
(4, "f5c2ebfd-f57b-4ff3-ac5c-f30674037b22", "A", "BC", "C"),
]
schema = StructType(
[
StructField("uuid", IntegerType(), True),
StructField("user_id", StringType(), True),
StructField("col1", StringType(), True),
StructField("ts", StringType(), True),
StructField("part", StringType(), True),
]
)
df = spark.createDataFrame(data=data, schema=schema)
bucket = ...
tableName = "test_hudi_mdt"
basePath = f"s3://"+bucket+"/test/" + tableName
hudi_options = {
"hoodie.table.name": tableName,
"hoodie.datasource.write.recordkey.field": "uuid",
"hoodie.datasource.write.partitionpath.field": "part",
"hoodie.datasource.write.table.name": tableName,
"hoodie.datasource.write.operation": "insert",
"hoodie.datasource.write.precombine.field": "ts",
"hoodie.upsert.shuffle.parallelism": 2,
"hoodie.insert.shuffle.parallelism": 2,
"hoodie.datasource.hive_sync.enable": "false",
}
mode = "overwrite"
# make sure one mdt compaction kicks in
for i in range(1,11):
(df.write.format("hudi").options(**hudi_options).mode(mode).save(basePath))
mode = "append"
spark.read.format("hudi").load(basePath + "/.hoodie/metadata").show()
```
@ad1happy2go can you plz re-open this ?
```
: java.lang.NoSuchMethodError:
org.apache.spark.sql.execution.datasources.PartitionedFile.<init>(Lorg/apache/spark/sql/catalyst/InternalRow;Ljava/lang/String;JJ[Ljava/lang/String;)V
at
org.apache.hudi.BaseMergeOnReadSnapshotRelation.$anonfun$buildSplits$2(MergeOnReadSnapshotRelation.scala:237)
at scala.Option.map(Option.scala:230)
at
org.apache.hudi.BaseMergeOnReadSnapshotRelation.$anonfun$buildSplits$1(MergeOnReadSnapshotRelation.scala:235)
at scala.collection.immutable.List.map(List.scala:293)
at
org.apache.hudi.BaseMergeOnReadSnapshotRelation.buildSplits(MergeOnReadSnapshotRelation.scala:231)
at
org.apache.hudi.BaseMergeOnReadSnapshotRelation.collectFileSplits(MergeOnReadSnapshotRelation.scala:223)
at
org.apache.hudi.BaseMergeOnReadSnapshotRelation.collectFileSplits(MergeOnReadSnapshotRelation.scala:64)
at
org.apache.hudi.HoodieBaseRelation.buildScan(HoodieBaseRelation.scala:353)
at
org.apache.spark.sql.execution.datasources.DataSourceStrategy$.$anonfun$apply$4(DataSourceStrategy.scala:360)
at
org.apache.spark.sql.execution.datasources.DataSourceStrategy$.$anonfun$pruneFilterProject$1(DataSourceStrategy.scala:394)
at
org.apache.spark.sql.execution.datasources.DataSourceStrategy$.pruneFilterProjectRaw(DataSourceStrategy.scala:473)
at
org.apache.spark.sql.execution.datasources.DataSourceStrategy$.pruneFilterProject(DataSourceStrategy.scala:393)
at
org.apache.spark.sql.execution.datasources.DataSourceStrategy$.apply(DataSourceStrategy.scala:360)
at
org.apache.spark.sql.catalyst.planning.QueryPlanner.$anonfun$plan$1(QueryPlanner.scala:63)
at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:486)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:492)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:491)
at
org.apache.spark.sql.catalyst.planning.QueryPlanner.plan(QueryPlanner.scala:93)
at
org.apache.spark.sql.execution.SparkStrategies.plan(SparkStrategies.scala:71)
at
org.apache.spark.sql.catalyst.planning.QueryPlanner.$anonfun$plan$3(QueryPlanner.scala:78)
at
scala.collection.TraversableOnce$folder$1.apply(TraversableOnce.scala:196)
at
scala.collection.TraversableOnce$folder$1.apply(TraversableOnce.scala:194)
at scala.collection.Iterator.foreach(Iterator.scala:943)
at scala.collection.Iterator.foreach$(Iterator.scala:943)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
at
scala.collection.TraversableOnce.foldLeft(TraversableOnce.scala:199)
at
scala.collection.TraversableOnce.foldLeft$(TraversableOnce.scala:192)
at scala.collection.AbstractIterator.foldLeft(Iterator.scala:1431)
at
org.apache.spark.sql.catalyst.planning.QueryPlanner.$anonfun$plan$2(QueryPlanner.scala:75)
at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:486)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:492)
at
org.apache.spark.sql.catalyst.planning.QueryPlanner.plan(QueryPlanner.scala:93)
at
org.apache.spark.sql.execution.SparkStrategies.plan(SparkStrategies.scala:71)
at
org.apache.spark.sql.execution.QueryExecution$.createSparkPlan(QueryExecution.scala:504)
at
org.apache.spark.sql.execution.QueryExecution.$anonfun$sparkPlan$2(QueryExecution.scala:165)
at
org.apache.spark.sql.catalyst.QueryPlanningTracker.measurePhase(QueryPlanningTracker.scala:192)
at
org.apache.spark.sql.execution.QueryExecution.$anonfun$executePhase$1(QueryExecution.scala:224)
at
org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
at
org.apache.spark.sql.execution.QueryExecution.executePhase(QueryExecution.scala:224)
at
org.apache.spark.sql.execution.QueryExecution.$anonfun$sparkPlan$1(QueryExecution.scala:165)
at
org.apache.spark.sql.execution.QueryExecution.withCteMap(QueryExecution.scala:75)
at
org.apache.spark.sql.execution.QueryExecution.sparkPlan$lzycompute(QueryExecution.scala:158)
at
org.apache.spark.sql.execution.QueryExecution.sparkPlan(QueryExecution.scala:158)
at
org.apache.spark.sql.execution.QueryExecution.$anonfun$executedPlan$2(QueryExecution.scala:178)
at
org.apache.spark.sql.catalyst.QueryPlanningTracker.measurePhase(QueryPlanningTracker.scala:192)
at
org.apache.spark.sql.execution.QueryExecution.$anonfun$executePhase$1(QueryExecution.scala:224)
at
org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
at
org.apache.spark.sql.execution.QueryExecution.executePhase(QueryExecution.scala:224)
at
org.apache.spark.sql.execution.QueryExecution.$anonfun$executedPlan$1(QueryExecution.scala:175)
at
org.apache.spark.sql.execution.QueryExecution.withCteMap(QueryExecution.scala:75)
at
org.apache.spark.sql.execution.QueryExecution.executedPlan$lzycompute(QueryExecution.scala:171)
at
org.apache.spark.sql.execution.QueryExecution.executedPlan(QueryExecution.scala:171)
at
org.apache.spark.sql.execution.QueryExecution.$anonfun$writePlans$5(QueryExecution.scala:308)
at
org.apache.spark.sql.catalyst.plans.QueryPlan$.append(QueryPlan.scala:606)
at
org.apache.spark.sql.execution.QueryExecution.writePlans(QueryExecution.scala:308)
at
org.apache.spark.sql.execution.QueryExecution.toString(QueryExecution.scala:323)
at
org.apache.spark.sql.execution.QueryExecution.org$apache$spark$sql$execution$QueryExecution$$explainString(QueryExecution.scala:277)
at
org.apache.spark.sql.execution.QueryExecution.explainString(QueryExecution.scala:256)
at
org.apache.spark.sql.execution.SQLExecution$.executeQuery$1(SQLExecution.scala:102)
at
org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:135)
at
org.apache.spark.sql.catalyst.QueryPlanningTracker$.withTracker(QueryPlanningTracker.scala:107)
at
org.apache.spark.sql.execution.SQLExecution$.withTracker(SQLExecution.scala:232)
at
org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:135)
at
org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:253)
at
org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:134)
at
org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
at
org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:68)
at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3768)
at org.apache.spark.sql.Dataset.head(Dataset.scala:2769)
at org.apache.spark.sql.Dataset.take(Dataset.scala:2976)
at org.apache.spark.sql.Dataset.getRows(Dataset.scala:289)
at org.apache.spark.sql.Dataset.showString(Dataset.scala:328)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at
sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at
py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at
py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
at java.lang.Thread.run(Thread.java:750)
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]