[
https://issues.apache.org/jira/browse/SPARK-27654?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
Gautham Rajendiran updated SPARK-27654:
---------------------------------------
Affects Version/s: (was: 2.4.3)
2.4.0
> Spark unable to read Parquet file - corrupt footer
> -------------------------------------------------
>
> Key: SPARK-27654
> URL: https://issues.apache.org/jira/browse/SPARK-27654
> Project: Spark
> Issue Type: Bug
> Components: Spark Core
> Affects Versions: 2.4.0
> Reporter: Gautham Rajendiran
> Priority: Blocker
>
> Reading a large Parquet file produces a corrupt-footer error.
>
> {code:java}
> ---------------------------------------------------------------------------
> Py4JJavaError Traceback (most recent call last) <command-2958098653513126> in
> <module>() ----> 1 df = spark.read.parquet("/mnt/valassis/data1") 2
> df.head(1) /databricks/spark/python/pyspark/sql/readwriter.py in
> parquet(self, *paths) 314 [('name', 'string'), ('year', 'int'), ('month',
> 'int'), ('day', 'int')] 315 """ --> 316 return
> self._df(self._jreader.parquet(_to_seq(self._spark._sc, paths))) 317 318
> @ignore_unicode_prefix
> /databricks/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py in
> __call__(self, *args) 1255 answer = self.gateway_client.send_command(command)
> 1256 return_value = get_return_value( -> 1257 answer, self.gateway_client,
> self.target_id, self.name) 1258 1259 for temp_arg in temp_args:
> /databricks/spark/python/pyspark/sql/utils.py in deco(*a, **kw) 61 def
> deco(*a, **kw): 62 try: ---> 63 return f(*a, **kw) 64 except
> py4j.protocol.Py4JJavaError as e: 65 s = e.java_exception.toString()
> /databricks/spark/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py in
> get_return_value(answer, gateway_client, target_id, name) 326 raise
> Py4JJavaError( 327 "An error occurred while calling {0}{1}{2}.\n". --> 328
> format(target_id, ".", name), value) 329 else: 330 raise Py4JError(
> Py4JJavaError: An error occurred while calling o1045.parquet. :
> org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in
> stage 9.0 failed 4 times, most recent failure: Lost task 0.3 in stage 9.0
> (TID 1458, 10.139.64.5, executor 0): org.apache.spark.SparkException:
> Exception thrown in awaitResult: at
> org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:355) at
> org.apache.spark.util.ThreadUtils$.parmap(ThreadUtils.scala:422) at
> org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$.readParquetFootersInParallel(ParquetFileFormat.scala:602)
> at
> org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$9.apply(ParquetFileFormat.scala:675)
> at
> org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$9.apply(ParquetFileFormat.scala:667)
> at
> org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:817)
> at
> org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:817)
> at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:60)
> at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:340) at
> org.apache.spark.rdd.RDD.iterator(RDD.scala:304) at
> org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90) at
> org.apache.spark.scheduler.Task.doRunTask(Task.scala:139) at
> org.apache.spark.scheduler.Task.run(Task.scala:112) at
> org.apache.spark.executor.Executor$TaskRunner$$anonfun$13.apply(Executor.scala:497)
> at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1432) at
> org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:503) at
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
> at
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
> at java.lang.Thread.run(Thread.java:748) Caused by: java.io.IOException:
> Could not read footer for file: FileStatus{path=dbfs:/mnt/valassis/data1;
> isDirectory=false; length=66061642673; replication=0; blocksize=0;
> modification_time=0; access_time=0; owner=; group=; permission=rw-rw-rw-;
> isSymlink=false} at
> org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$readParquetFootersInParallel$1.apply(ParquetFileFormat.scala:615)
> at
> org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$readParquetFootersInParallel$1.apply(ParquetFileFormat.scala:602)
> at
> org.apache.spark.util.ThreadUtils$$anonfun$5$$anonfun$apply$2$$anonfun$apply$3.apply(ThreadUtils.scala:419)
> at
> org.apache.spark.util.threads.SparkThreadLocalCapturingHelper$class.runWithCaptured(SparkThreadLocalForwardingThreadPoolExecutor.scala:52)
> at
> org.apache.spark.util.threads.CapturedSparkThreadLocals.runWithCaptured(SparkThreadLocalForwardingThreadPoolExecutor.scala:71)
> at
> org.apache.spark.util.ThreadUtils$$anonfun$5$$anonfun$apply$2.apply(ThreadUtils.scala:419)
> at
> scala.concurrent.impl.Future$PromiseCompletingRunnable.liftedTree1$1(Future.scala:24)
> at
> scala.concurrent.impl.Future$PromiseCompletingRunnable.run(Future.scala:24)
> at
> scala.concurrent.impl.ExecutionContextImpl$AdaptedForkJoinTask.exec(ExecutionContextImpl.scala:121)
> at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260) at
> scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
> at scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
> at
> scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)
> Caused by: java.lang.RuntimeException: dbfs:/mnt/valassis/data1 is not a
> Parquet file. expected magic number at tail [80, 65, 82, 49] but found [-1,
> -1, 0, 0] at
> org.apache.parquet.hadoop.ParquetFileReader.readFooter(ParquetFileReader.java:525)
> at
> org.apache.parquet.hadoop.ParquetFileReader.readFooter(ParquetFileReader.java:506)
> at
> org.apache.parquet.hadoop.ParquetFileReader.readFooter(ParquetFileReader.java:500)
> at
> org.apache.parquet.hadoop.ParquetFileReader.readFooter(ParquetFileReader.java:477)
> at
> org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$readParquetFootersInParallel$1.apply(ParquetFileFormat.scala:608)
> ... 12 more Driver stacktrace: at
> org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:2100)
> at
> org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:2088)
> at
> org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:2087)
> at
> scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
> at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48) at
> org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2087)
> at
> org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:1076)
> at
> org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:1076)
> at scala.Option.foreach(Option.scala:257) at
> org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1076)
> at
> org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2319)
> at
> org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2267)
> at
> org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2255)
> at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49) at
> org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:873) at
> org.apache.spark.SparkContext.runJob(SparkContext.scala:2252) at
> org.apache.spark.SparkContext.runJob(SparkContext.scala:2274) at
> org.apache.spark.SparkContext.runJob(SparkContext.scala:2293) at
> org.apache.spark.SparkContext.runJob(SparkContext.scala:2318) at
> org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:961) at
> org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
> at
> org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
> at org.apache.spark.rdd.RDD.withScope(RDD.scala:379) at
> org.apache.spark.rdd.RDD.collect(RDD.scala:960) at
> org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$.mergeSchemasInParallel(ParquetFileFormat.scala:697)
> at
> org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat.inferSchema(ParquetFileFormat.scala:250)
> at
> org.apache.spark.sql.execution.datasources.DataSource$$anonfun$6.apply(DataSource.scala:205)
> at
> org.apache.spark.sql.execution.datasources.DataSource$$anonfun$6.apply(DataSource.scala:196)
> at scala.Option.orElse(Option.scala:289) at
> org.apache.spark.sql.execution.datasources.DataSource.getOrInferFileFormatSchema(DataSource.scala:196)
> at
> org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:415)
> at
> org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:298)
> at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:279) at
> org.apache.spark.sql.DataFrameReader.parquet(DataFrameReader.scala:733) at
> sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at
> sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
> at
> sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
> at java.lang.reflect.Method.invoke(Method.java:498) at
> py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244) at
> py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:380) at
> py4j.Gateway.invoke(Gateway.java:295) at
> py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132) at
> py4j.commands.CallCommand.execute(CallCommand.java:79) at
> py4j.GatewayConnection.run(GatewayConnection.java:251) at
> java.lang.Thread.run(Thread.java:748) Caused by:
> org.apache.spark.SparkException: Exception thrown in awaitResult: at
> org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:355) at
> org.apache.spark.util.ThreadUtils$.parmap(ThreadUtils.scala:422) at
> org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$.readParquetFootersInParallel(ParquetFileFormat.scala:602)
> at
> org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$9.apply(ParquetFileFormat.scala:675)
> at
> org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$9.apply(ParquetFileFormat.scala:667)
> at
> org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:817)
> at
> org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:817)
> at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:60)
> at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:340) at
> org.apache.spark.rdd.RDD.iterator(RDD.scala:304) at
> org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90) at
> org.apache.spark.scheduler.Task.doRunTask(Task.scala:139) at
> org.apache.spark.scheduler.Task.run(Task.scala:112) at
> org.apache.spark.executor.Executor$TaskRunner$$anonfun$13.apply(Executor.scala:497)
> at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1432) at
> org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:503) at
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
> at
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
> ... 1 more Caused by: java.io.IOException: Could not read footer for file:
> FileStatus{path=dbfs:/mnt/valassis/data1; isDirectory=false;
> length=66061642673; replication=0; blocksize=0; modification_time=0;
> access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false} at
> org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$readParquetFootersInParallel$1.apply(ParquetFileFormat.scala:615)
> at
> org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$readParquetFootersInParallel$1.apply(ParquetFileFormat.scala:602)
> at
> org.apache.spark.util.ThreadUtils$$anonfun$5$$anonfun$apply$2$$anonfun$apply$3.apply(ThreadUtils.scala:419)
> at
> org.apache.spark.util.threads.SparkThreadLocalCapturingHelper$class.runWithCaptured(SparkThreadLocalForwardingThreadPoolExecutor.scala:52)
> at
> org.apache.spark.util.threads.CapturedSparkThreadLocals.runWithCaptured(SparkThreadLocalForwardingThreadPoolExecutor.scala:71)
> at
> org.apache.spark.util.ThreadUtils$$anonfun$5$$anonfun$apply$2.apply(ThreadUtils.scala:419)
> at
> scala.concurrent.impl.Future$PromiseCompletingRunnable.liftedTree1$1(Future.scala:24)
> at
> scala.concurrent.impl.Future$PromiseCompletingRunnable.run(Future.scala:24)
> at
> scala.concurrent.impl.ExecutionContextImpl$AdaptedForkJoinTask.exec(ExecutionContextImpl.scala:121)
> at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260) at
> scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
> at scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
> at
> scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)
> Caused by: java.lang.RuntimeException: dbfs:/mnt/valassis/data1 is not a
> Parquet file. expected magic number at tail [80, 65, 82, 49] but found [-1,
> -1, 0, 0] at
> org.apache.parquet.hadoop.ParquetFileReader.readFooter(ParquetFileReader.java:525)
> at
> org.apache.parquet.hadoop.ParquetFileReader.readFooter(ParquetFileReader.java:506)
> at
> org.apache.parquet.hadoop.ParquetFileReader.readFooter(ParquetFileReader.java:500)
> at
> org.apache.parquet.hadoop.ParquetFileReader.readFooter(ParquetFileReader.java:477)
> at
> org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$readParquetFootersInParallel$1.apply(ParquetFileFormat.scala:608)
> ... 12 more
> {code}
>
--
This message was sent by Atlassian JIRA
(v7.6.3#76005)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]