202212312312.commit is not a Parquet file. expected magic number at tail

GitBox Thu, 22 Sep 2022 13:54:36 -0700


jharringtonCoupons commented on issue #5891:
URL: https://github.com/apache/hudi/issues/5891#issuecomment-1255538815


   I can confirm I am having the same issue; see the full stack trace below. Is 
there a viable workaround?
   
   `{
       "Event": "GlueETLJobExceptionEvent",
       "Timestamp": 1663789368616,
       "Failure Reason": "Traceback (most recent call last):\n  File 
\"/tmp/some_glue_job.py\", line 29, in <module>\n    datasource_some_table_name 
= glueContext.create_dynamic_frame.from_catalog(database = 
args['ENV']+\"_some_datawarehouse\", table_name = \"some_table_name\", 
transformation_ctx = \"datasource_cdhr0\")\n  File 
\"/opt/amazon/lib/python3.6/site-packages/awsglue/dynamicframe.py\", line 642, 
in from_catalog\n    return 
self._glue_context.create_dynamic_frame_from_catalog(db, table_name, 
redshift_tmp_dir, transformation_ctx, push_down_predicate, additional_options, 
catalog_id, **kwargs)\n  File 
\"/opt/amazon/lib/python3.6/site-packages/awsglue/context.py\", line 186, in 
create_dynamic_frame_from_catalog\n    return source.getFrame(**kwargs)\n  File 
\"/opt/amazon/lib/python3.6/site-packages/awsglue/data_source.py\", line 36, in 
getFrame\n    jframe = self._jsource.getDynamicFrame()\n  File 
\"/opt/amazon/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py\", line 
1257, in __call__\n    answer, self.gateway_client, self.target_id, self.name)\n  
File \"/opt/amazon/spark/python/lib/pyspark.zip/pyspark/sql/utils.py\", line 
63, in deco\n    return f(*a, **kw)\n  File 
\"/opt/amazon/spark/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py\", line 
328, in get_return_value\n    format(target_id, \".\", name), 
value)\npy4j.protocol.Py4JJavaError: An error occurred while calling 
o82.getDynamicFrame.\n: org.apache.spark.SparkException: Job aborted due to 
stage failure: Task 0 in stage 1.0 failed 4 times, most recent failure: Lost 
task 0.3 in stage 1.0 (TID 4, 172.34.145.50, executor 1): 
org.apache.spark.SparkException: Exception thrown in awaitResult: \n\tat 
org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:226)\n\tat 
org.apache.spark.util.ThreadUtils$.parmap(ThreadUtils.scala:290)\n\tat 
org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$.readParquetFootersInParallel(ParquetFileFormat.scala:538)\n\tat
 
org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$9.apply(ParquetFileFormat.scala:611)\n\tat
 
org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$9.apply(ParquetFileFormat.scala:603)\n\tat
 
org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:801)\n\tat
 
org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:801)\n\tat
 org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)\n\tat 
org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)\n\tat 
org.apache.spark.rdd.RDD.iterator(RDD.scala:288)\n\tat 
org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)\n\tat 
org.apache.spark.scheduler.Task.run(Task.scala:121)\n\tat 
org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)\n\tat
 org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)\n\tat 
org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)\n\tat 
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)\n\tat 
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)\n\tat
 java.lang.Thread.run(Thread.java:750)\nCaused by: java.io.IOException: Could 
not read footer for file: 
FileStatus{path=s3://clx-datawarehouse-qa/dwh/cardholder/some_table_name/.hoodie/20220831153536771.commit;
 isDirectory=false; length=15280; replication=0; blocksize=0; 
modification_time=0; access_time=0; owner=; group=; permission=rw-rw-rw-; 
isSymlink=false}\n\tat 
org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$readParquetFootersInParallel$1.apply(ParquetFileFormat.scala:551)\n\tat
 
org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$readParquetFootersInParallel$1.apply(ParquetFileFormat.scala:538)\n\tat
 
org.apache.spark.util.ThreadUtils$$anonfun$3$$anonfun$apply$1.apply(ThreadUtils.scala:287)\n\tat
 scala.concurrent.impl.Future$PromiseCompletingRunnable.liftedTree1$1(Future.scala:24)\n\tat 
scala.concurrent.impl.Future$PromiseCompletingRunnable.run(Future.scala:24)\n\tat
 
scala.concurrent.impl.ExecutionContextImpl$AdaptedForkJoinTask.exec(ExecutionContextImpl.scala:121)\n\tat
 scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)\n\tat 
scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)\n\tat
 scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)\n\tat 
scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)\nCaused
 by: java.lang.RuntimeException: 
s3://clx-datawarehouse-qa/dwh/cardholder/some_table_name/.hoodie/20220831153536771.commit
 is not a Parquet file. expected magic number at tail [80, 65, 82, 49] but 
found [32, 125, 10, 125]\n\tat 
org.apache.parquet.hadoop.ParquetFileReader.readFooter(ParquetFileReader.java:524)\n\tat
 
org.apache.parquet.hadoop.ParquetFileReader.readFooter(ParquetFileReader.java:505)\n\tat
 org.apache.parquet.hadoop.ParquetFileReader.readFooter(ParquetFileReader.java:499)\n\tat 
org.apache.parquet.hadoop.ParquetFileReader.readFooter(ParquetFileReader.java:476)\n\tat
 
org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$readParquetFootersInParallel$1.apply(ParquetFileFormat.scala:544)\n\t...
 9 more\n\nDriver stacktrace:\n\tat 
org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1889)\n\tat
 
org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1877)\n\tat
 
org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1876)\n\tat
 
scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)\n\tat
 scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)\n\tat 
org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1876)\n\tat
 
org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)\n\tat
 
org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)\n\tat
 scala.Option.foreach(Option.scala:257)\n\tat 
org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:926)\n\tat
 
org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2110)\n\tat
 
org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2059)\n\tat
 
org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2048)\n\tat
 org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)\n\tat 
org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:737)\n\tat 
org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)\n\tat 
org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)\n\tat 
org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)\n\tat 
org.apache.spark.SparkContext.runJob(SparkContext.scala:2126)\n\tat 
org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:945)\n\tat 
org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)\n\tat
 
org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)\n\tat
 org.apache.spark.rdd.RDD.withScope(RDD.scala:363)\n\tat 
org.apache.spark.rdd.RDD.collect(RDD.scala:944)\n\tat 
org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$.mergeSchemasInParallel(ParquetFileFormat.scala:633)\n\tat
 
org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat.inferSchema(ParquetFileFormat.scala:241)\n\tat
 
org.apache.spark.sql.wrapper.SparkSqlDecoratorDataSource$$anonfun$2.apply(SparkSqlDecoratorDataSource.scala:33)\n\tat
 
org.apache.spark.sql.wrapper.SparkSqlDecoratorDataSource$$anonfun$2.apply(SparkSqlDecoratorDataSource.scala:33)\n\tat
 scala.Option.orElse(Option.scala:289)\n\tat 
org.apache.spark.sql.wrapper.SparkSqlDecoratorDataSource.getOrInferFileFormatSchema(SparkSqlDecoratorDataSource.scala:32)\n\tat
 org.apache.spark.sql.wrapper.SparkSqlDecoratorDataSource.resolveRelation(SparkSqlDecoratorDataSource.scala:53)\n\tat 
com.amazonaws.services.glue.SparkSQLDataSource$$anonfun$getDynamicFrame$9.apply(DataSource.scala:761)\n\tat
 
com.amazonaws.services.glue.SparkSQLDataSource$$anonfun$getDynamicFrame$9.apply(DataSource.scala:732)\n\tat
 
com.amazonaws.services.glue.util.FileSchemeWrapper$$anonfun$executeWithQualifiedScheme$1.apply(FileSchemeWrapper.scala:89)\n\tat
 
com.amazonaws.services.glue.util.FileSchemeWrapper$$anonfun$executeWithQualifiedScheme$1.apply(FileSchemeWrapper.scala:89)\n\tat
 
com.amazonaws.services.glue.util.FileSchemeWrapper.executeWith(FileSchemeWrapper.scala:82)\n\tat
 
com.amazonaws.services.glue.util.FileSchemeWrapper.executeWithQualifiedScheme(FileSchemeWrapper.scala:89)\n\tat
 
com.amazonaws.services.glue.SparkSQLDataSource.getDynamicFrame(DataSource.scala:731)\n\tat
 
com.amazonaws.services.glue.DataSource$class.getDynamicFrame(DataSource.scala:97)\n\tat
 com.amazonaws.services.glue.SparkSQLDataSource.getDynamicFrame(DataSource.scala:709)\n\tat 
sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\n\tat 
sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\n\tat
 
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n\tat
 java.lang.reflect.Method.invoke(Method.java:498)\n\tat 
py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\n\tat 
py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\n\tat 
py4j.Gateway.invoke(Gateway.java:282)\n\tat 
py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\n\tat 
py4j.commands.CallCommand.execute(CallCommand.java:79)\n\tat 
py4j.GatewayConnection.run(GatewayConnection.java:238)\n\tat 
java.lang.Thread.run(Thread.java:750)\nCaused by: 
org.apache.spark.SparkException: Exception thrown in awaitResult: \n\tat 
org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:226)\n\tat 
org.apache.spark.util.ThreadUtils$.parmap(ThreadUtils.scala:290)\n\tat 
org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$.readParquetFootersInParallel(ParquetFileFormat.scala:538)\n\tat
 
org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$9.apply(ParquetFileFormat.scala:611)\n\tat
 
org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$9.apply(ParquetFileFormat.scala:603)\n\tat
 
org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:801)\n\tat
 
org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:801)\n\tat
 org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)\n\tat 
org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)\n\tat 
org.apache.spark.rdd.RDD.iterator(RDD.scala:288)\n\tat 
org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)\n\tat 
org.apache.spark.scheduler.Task.run(Task.scala:121)\n\tat 
org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)\n\tat
 org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)\n\tat 
org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)\n\tat 
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)\n\tat
 
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)\n\t...
 1 more\nCaused by: java.io.IOException: Could not read footer for file: 
FileStatus{path=s3://clx-datawarehouse-qa/dwh/cardholder/some_table_name/.hoodie/20220831153536771.commit;
 isDirectory=false; length=15280; replication=0; blocksize=0; 
modification_time=0; access_time=0; owner=; group=; permission=rw-rw-rw-; 
isSymlink=false}\n\tat 
org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$readParquetFootersInParallel$1.apply(ParquetFileFormat.scala:551)\n\tat
 
org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$readParquetFootersInParallel$1.apply(ParquetFileFormat.scala:538)\n\tat
 org.apache.spark.util.ThreadUtils$$anonfun$3$$anonfun$apply$1.apply(ThreadUtils.scala:287)\n\tat 
scala.concurrent.impl.Future$PromiseCompletingRunnable.liftedTree1$1(Future.scala:24)\n\tat
 
scala.concurrent.impl.Future$PromiseCompletingRunnable.run(Future.scala:24)\n\tat
 
scala.concurrent.impl.ExecutionContextImpl$AdaptedForkJoinTask.exec(ExecutionContextImpl.scala:121)\n\tat
 scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)\n\tat 
scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)\n\tat
 scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)\n\tat 
scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)\nCaused
 by: java.lang.RuntimeException: 
s3://clx-datawarehouse-qa/dwh/cardholder/some_table_name/.hoodie/20220831153536771.commit
 is not a Parquet file. expected magic number at tail [80, 65, 82, 49] but 
found [32, 125, 10, 125]\n\tat 
org.apache.parquet.hadoop.ParquetFileReader.readFooter(ParquetFileReader.java:524)\n\tat
 org.apache.parquet.hadoop.ParquetFileReader.readFooter(ParquetFileReader.java:505)\n\tat 
org.apache.parquet.hadoop.ParquetFileReader.readFooter(ParquetFileReader.java:499)\n\tat
 
org.apache.parquet.hadoop.ParquetFileReader.readFooter(ParquetFileReader.java:476)\n\tat
 
org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$readParquetFootersInParallel$1.apply(ParquetFileFormat.scala:544)\n\t...
 9 more\n",
       "Stack Trace": [
           {
               "Declaring Class": "get_return_value",
               "Method Name": "format(target_id, \".\", name), value)",
               "File Name": 
"/opt/amazon/spark/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py",
               "Line Number": 328
           },
           {
               "Declaring Class": "deco",
               "Method Name": "return f(*a, **kw)",
               "File Name": 
"/opt/amazon/spark/python/lib/pyspark.zip/pyspark/sql/utils.py",
               "Line Number": 63
           },
           {
               "Declaring Class": "__call__",
               "Method Name": "answer, self.gateway_client, self.target_id, 
self.name)",
               "File Name": 
"/opt/amazon/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py",
               "Line Number": 1257
           },
           {
               "Declaring Class": "getFrame",
               "Method Name": "jframe = self._jsource.getDynamicFrame()",
               "File Name": 
"/opt/amazon/lib/python3.6/site-packages/awsglue/data_source.py",
               "Line Number": 36
           },
           {
               "Declaring Class": "create_dynamic_frame_from_catalog",
               "Method Name": "return source.getFrame(**kwargs)",
               "File Name": 
"/opt/amazon/lib/python3.6/site-packages/awsglue/context.py",
               "Line Number": 186
           },
           {
               "Declaring Class": "from_catalog",
               "Method Name": "return 
self._glue_context.create_dynamic_frame_from_catalog(db, table_name, 
redshift_tmp_dir, transformation_ctx, push_down_predicate, additional_options, 
catalog_id, **kwargs)",
               "File Name": 
"/opt/amazon/lib/python3.6/site-packages/awsglue/dynamicframe.py",
               "Line Number": 642
           },
           {
               "Declaring Class": "<module>",
               "Method Name": "datasource_some_table_name = 
glueContext.create_dynamic_frame.from_catalog(database = 
args['ENV']+\"_some_datawarehouse\", table_name = \"some_table_name\", 
transformation_ctx = \"datasource_cdhr0\")",
               "File Name": "/tmp/some_glue_job.py",
               "Line Number": 29
           }
       ],
       "Last Executed Line number": 29,
       "script": "some_glue_job.py"
   }`


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to