alberttwong opened a new issue, #12944: URL: https://github.com/apache/hudi/issues/12944
I am running Databricks on AWS with the 15.4 LTS runtime (Spark 3.5, Scala 2.12) on single-node compute.

Spark config:
```
spark.serializer org.apache.spark.serializer.KryoSerializer
spark.sql.catalog.spark_catalog org.apache.spark.sql.hudi.catalog.HoodieCatalog
spark.sql.extensions org.apache.spark.sql.hudi.HoodieSparkSessionExtension
spark.kryo.registrator org.apache.spark.HoodieSparkKryoRegistrar
```
Library: org.apache.hudi:hudi-spark3.5-bundle_2.12:1.0.0

Running the notebook from https://www.onehouse.ai/blog/how-to-use-apache-hudi-with-databricks
```
tripsDF = spark.read.format("org.apache.hudi").option("hoodie.file.index.enable", "false").load(basePath)
tripsDF.show()
```
Error:
```
: java.lang.NoSuchMethodError: org.apache.spark.sql.execution.datasources.FileStatusCache.putLeafFiles(Lorg/apache/hadoop/fs/Path;[Lorg/apache/hadoop/fs/FileStatus;)V at org.apache.hudi.SparkHoodieTableFileIndex$$anon$1.put(SparkHoodieTableFileIndex.scala:516) at org.apache.hudi.BaseHoodieTableFileIndex.lambda$listPartitionPathFiles$13(BaseHoodieTableFileIndex.java:410) at java.util.HashMap.forEach(HashMap.java:1290) at org.apache.hudi.BaseHoodieTableFileIndex.listPartitionPathFiles(BaseHoodieTableFileIndex.java:408) at org.apache.hudi.BaseHoodieTableFileIndex.loadFileSlicesForPartitions(BaseHoodieTableFileIndex.java:266) at org.apache.hudi.BaseHoodieTableFileIndex.ensurePreloadedPartitions(BaseHoodieTableFileIndex.java:254) at org.apache.hudi.BaseHoodieTableFileIndex.getInputFileSlices(BaseHoodieTableFileIndex.java:240) at org.apache.hudi.HoodieFileIndex.prunePartitionsAndGetFileSlices(HoodieFileIndex.scala:344) at org.apache.hudi.HoodieFileIndex.filterFileSlices(HoodieFileIndex.scala:230) at org.apache.hudi.HoodieFileIndex.listFiles(HoodieFileIndex.scala:167) at org.apache.spark.sql.execution.datasources.FileIndex.listPartitionDirectoriesAndFiles(FileIndex.scala:207) at org.apache.spark.sql.execution.SparkOrAetherFileSourceScanLike.listFiles(DataSourceScanExec.scala:817) at 
org.apache.spark.sql.execution.SparkOrAetherFileSourceScanLike.$anonfun$_selectedPartitions$2(DataSourceScanExec.scala:891) at scala.Option.getOrElse(Option.scala:189) at org.apache.spark.sql.execution.SparkOrAetherFileSourceScanLike._selectedPartitions(DataSourceScanExec.scala:883) at org.apache.spark.sql.execution.SparkOrAetherFileSourceScanLike._selectedPartitions$(DataSourceScanExec.scala:882) at org.apache.spark.sql.execution.FileSourceScanExec._selectedPartitions$lzycompute(DataSourceScanExec.scala:2752) at org.apache.spark.sql.execution.FileSourceScanExec._selectedPartitions(DataSourceScanExec.scala:2752) at org.apache.spark.sql.execution.SparkOrAetherFileSourceScanLike.selectedPartitions(DataSourceScanExec.scala:909) at org.apache.spark.sql.execution.SparkOrAetherFileSourceScanLike.selectedPartitions$(DataSourceScanExec.scala:908) at org.apache.spark.sql.execution.FileSourceScanExec.selectedPartitions(DataSourceScanExec.scala:2752) at org.apache.spark.sql.execution.SparkOrAetherFileSourceScanLike.$anonfun$dynamicallySelectedPartitions$3(DataSourceScanExec.scala:989) at scala.Option.getOrElse(Option.scala:189) at org.apache.spark.sql.execution.SparkOrAetherFileSourceScanLike.$anonfun$dynamicallySelectedPartitions$1(DataSourceScanExec.scala:920) at com.databricks.spark.util.FrameProfiler$.record(FrameProfiler.scala:94) at org.apache.spark.sql.execution.SparkOrAetherFileSourceScanLike.dynamicallySelectedPartitions(DataSourceScanExec.scala:920) at org.apache.spark.sql.execution.SparkOrAetherFileSourceScanLike.dynamicallySelectedPartitions$(DataSourceScanExec.scala:918) at org.apache.spark.sql.execution.FileSourceScanExec.dynamicallySelectedPartitions$lzycompute(DataSourceScanExec.scala:2752) at org.apache.spark.sql.execution.FileSourceScanExec.dynamicallySelectedPartitions(DataSourceScanExec.scala:2752) at org.apache.spark.sql.execution.SparkOrAetherFileSourceScanLike.finalSelectedPartitions(DataSourceScanExec.scala:1026) at 
org.apache.spark.sql.execution.SparkOrAetherFileSourceScanLike.finalSelectedPartitions$(DataSourceScanExec.scala:1026) at org.apache.spark.sql.execution.FileSourceScanExec.finalSelectedPartitions(DataSourceScanExec.scala:2752) at org.apache.spark.sql.execution.SparkOrAetherFileSourceScanLike.totalFinalSelectedPartitionFileSize(DataSourceScanExec.scala:1015) at org.apache.spark.sql.execution.SparkOrAetherFileSourceScanLike.totalFinalSelectedPartitionFileSize$(DataSourceScanExec.scala:1015) at org.apache.spark.sql.execution.FileSourceScanExec.totalFinalSelectedPartitionFileSize$lzycompute(DataSourceScanExec.scala:2752) at org.apache.spark.sql.execution.FileSourceScanExec.totalFinalSelectedPartitionFileSize(DataSourceScanExec.scala:2752) at com.databricks.sql.transaction.tahoe.metering.DeltaMetering$.$anonfun$reportUsage$3(DeltaMetering.scala:599) at com.databricks.sql.transaction.tahoe.metering.DeltaMetering$.$anonfun$reportUsage$3$adapted(DeltaMetering.scala:238) at scala.collection.TraversableLike$WithFilter.$anonfun$foreach$1(TraversableLike.scala:985) at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62) at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55) at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49) at scala.collection.TraversableLike$WithFilter.foreach(TraversableLike.scala:984) at com.databricks.sql.transaction.tahoe.metering.DeltaMetering$.reportUsage(DeltaMetering.scala:238) at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId0$10(SQLExecution.scala:675) at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:813) at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId0$1(SQLExecution.scala:334) at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:1210) at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId0(SQLExecution.scala:205) at 
org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:750) at org.apache.spark.sql.Dataset.withAction(Dataset.scala:4805) at org.apache.spark.sql.Dataset.head(Dataset.scala:3544) at org.apache.spark.sql.Dataset.take(Dataset.scala:3775) at org.apache.spark.sql.Dataset.getRows(Dataset.scala:397) at org.apache.spark.sql.Dataset.showString(Dataset.scala:433) at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) at java.lang.reflect.Method.invoke(Method.java:498) at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244) at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:397) at py4j.Gateway.invoke(Gateway.java:306) at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132) at py4j.commands.CallCommand.execute(CallCommand.java:79) at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:199) at py4j.ClientServerConnection.run(ClientServerConnection.java:119) at java.lang.Thread.run(Thread.java:750)
File <command-3080611123797219>, line 2
      1 tripsDF = spark.read.format("org.apache.hudi").option("hoodie.file.index.enable", "false").load(basePath)
----> 2 tripsDF.show()
```
https://github.com/sagarlakshmipathy/hudi-on-databricks/issues/2

--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at: [email protected]
