[ 
https://issues.apache.org/jira/browse/SPARK-27654?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Gautham Rajendiran updated SPARK-27654:
---------------------------------------
    Affects Version/s:     (was: 2.4.3)
                       2.4.0

> spark unable to read parquet file- corrupt footer
> -------------------------------------------------
>
>                 Key: SPARK-27654
>                 URL: https://issues.apache.org/jira/browse/SPARK-27654
>             Project: Spark
>          Issue Type: Bug
>          Components: Spark Core
>    Affects Versions: 2.4.0
>            Reporter: Gautham Rajendiran
>            Priority: Blocker
>
> Reading a large Parquet file produces a "corrupt footer" error
>  
> {code:java}
> --------------------------------------------------------------------------- 
> Py4JJavaError Traceback (most recent call last) <command-2958098653513126> in 
> <module>() ----> 1 df = spark.read.parquet("/mnt/valassis/data1") 2 
> df.head(1) /databricks/spark/python/pyspark/sql/readwriter.py in 
> parquet(self, *paths) 314 [('name', 'string'), ('year', 'int'), ('month', 
> 'int'), ('day', 'int')] 315 """ --> 316 return 
> self._df(self._jreader.parquet(_to_seq(self._spark._sc, paths))) 317 318 
> @ignore_unicode_prefix 
> /databricks/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py in 
> __call__(self, *args) 1255 answer = self.gateway_client.send_command(command) 
> 1256 return_value = get_return_value( -> 1257 answer, self.gateway_client, 
> self.target_id, self.name) 1258 1259 for temp_arg in temp_args: 
> /databricks/spark/python/pyspark/sql/utils.py in deco(*a, **kw) 61 def 
> deco(*a, **kw): 62 try: ---> 63 return f(*a, **kw) 64 except 
> py4j.protocol.Py4JJavaError as e: 65 s = e.java_exception.toString() 
> /databricks/spark/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py in 
> get_return_value(answer, gateway_client, target_id, name) 326 raise 
> Py4JJavaError( 327 "An error occurred while calling {0}{1}{2}.\n". --> 328 
> format(target_id, ".", name), value) 329 else: 330 raise Py4JError( 
> Py4JJavaError: An error occurred while calling o1045.parquet. : 
> org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in 
> stage 9.0 failed 4 times, most recent failure: Lost task 0.3 in stage 9.0 
> (TID 1458, 10.139.64.5, executor 0): org.apache.spark.SparkException: 
> Exception thrown in awaitResult: at 
> org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:355) at 
> org.apache.spark.util.ThreadUtils$.parmap(ThreadUtils.scala:422) at 
> org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$.readParquetFootersInParallel(ParquetFileFormat.scala:602)
>  at 
> org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$9.apply(ParquetFileFormat.scala:675)
>  at 
> org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$9.apply(ParquetFileFormat.scala:667)
>  at 
> org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:817)
>  at 
> org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:817)
>  at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:60) 
> at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:340) at 
> org.apache.spark.rdd.RDD.iterator(RDD.scala:304) at 
> org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90) at 
> org.apache.spark.scheduler.Task.doRunTask(Task.scala:139) at 
> org.apache.spark.scheduler.Task.run(Task.scala:112) at 
> org.apache.spark.executor.Executor$TaskRunner$$anonfun$13.apply(Executor.scala:497)
>  at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1432) at 
> org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:503) at 
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
>  at 
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
>  at java.lang.Thread.run(Thread.java:748) Caused by: java.io.IOException: 
> Could not read footer for file: FileStatus{path=dbfs:/mnt/valassis/data1; 
> isDirectory=false; length=66061642673; replication=0; blocksize=0; 
> modification_time=0; access_time=0; owner=; group=; permission=rw-rw-rw-; 
> isSymlink=false} at 
> org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$readParquetFootersInParallel$1.apply(ParquetFileFormat.scala:615)
>  at 
> org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$readParquetFootersInParallel$1.apply(ParquetFileFormat.scala:602)
>  at 
> org.apache.spark.util.ThreadUtils$$anonfun$5$$anonfun$apply$2$$anonfun$apply$3.apply(ThreadUtils.scala:419)
>  at 
> org.apache.spark.util.threads.SparkThreadLocalCapturingHelper$class.runWithCaptured(SparkThreadLocalForwardingThreadPoolExecutor.scala:52)
>  at 
> org.apache.spark.util.threads.CapturedSparkThreadLocals.runWithCaptured(SparkThreadLocalForwardingThreadPoolExecutor.scala:71)
>  at 
> org.apache.spark.util.ThreadUtils$$anonfun$5$$anonfun$apply$2.apply(ThreadUtils.scala:419)
>  at 
> scala.concurrent.impl.Future$PromiseCompletingRunnable.liftedTree1$1(Future.scala:24)
>  at 
> scala.concurrent.impl.Future$PromiseCompletingRunnable.run(Future.scala:24) 
> at 
> scala.concurrent.impl.ExecutionContextImpl$AdaptedForkJoinTask.exec(ExecutionContextImpl.scala:121)
>  at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260) at 
> scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
>  at scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979) 
> at 
> scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)
>  Caused by: java.lang.RuntimeException: dbfs:/mnt/valassis/data1 is not a 
> Parquet file. expected magic number at tail [80, 65, 82, 49] but found [-1, 
> -1, 0, 0] at 
> org.apache.parquet.hadoop.ParquetFileReader.readFooter(ParquetFileReader.java:525)
>  at 
> org.apache.parquet.hadoop.ParquetFileReader.readFooter(ParquetFileReader.java:506)
>  at 
> org.apache.parquet.hadoop.ParquetFileReader.readFooter(ParquetFileReader.java:500)
>  at 
> org.apache.parquet.hadoop.ParquetFileReader.readFooter(ParquetFileReader.java:477)
>  at 
> org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$readParquetFootersInParallel$1.apply(ParquetFileFormat.scala:608)
>  ... 12 more Driver stacktrace: at 
> org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:2100)
>  at 
> org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:2088)
>  at 
> org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:2087)
>  at 
> scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
>  at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48) at 
> org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2087) 
> at 
> org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:1076)
>  at 
> org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:1076)
>  at scala.Option.foreach(Option.scala:257) at 
> org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1076)
>  at 
> org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2319)
>  at 
> org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2267)
>  at 
> org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2255)
>  at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49) at 
> org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:873) at 
> org.apache.spark.SparkContext.runJob(SparkContext.scala:2252) at 
> org.apache.spark.SparkContext.runJob(SparkContext.scala:2274) at 
> org.apache.spark.SparkContext.runJob(SparkContext.scala:2293) at 
> org.apache.spark.SparkContext.runJob(SparkContext.scala:2318) at 
> org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:961) at 
> org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
>  at 
> org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
>  at org.apache.spark.rdd.RDD.withScope(RDD.scala:379) at 
> org.apache.spark.rdd.RDD.collect(RDD.scala:960) at 
> org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$.mergeSchemasInParallel(ParquetFileFormat.scala:697)
>  at 
> org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat.inferSchema(ParquetFileFormat.scala:250)
>  at 
> org.apache.spark.sql.execution.datasources.DataSource$$anonfun$6.apply(DataSource.scala:205)
>  at 
> org.apache.spark.sql.execution.datasources.DataSource$$anonfun$6.apply(DataSource.scala:196)
>  at scala.Option.orElse(Option.scala:289) at 
> org.apache.spark.sql.execution.datasources.DataSource.getOrInferFileFormatSchema(DataSource.scala:196)
>  at 
> org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:415)
>  at 
> org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:298) 
> at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:279) at 
> org.apache.spark.sql.DataFrameReader.parquet(DataFrameReader.scala:733) at 
> sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at 
> sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) 
> at 
> sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
>  at java.lang.reflect.Method.invoke(Method.java:498) at 
> py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244) at 
> py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:380) at 
> py4j.Gateway.invoke(Gateway.java:295) at 
> py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132) at 
> py4j.commands.CallCommand.execute(CallCommand.java:79) at 
> py4j.GatewayConnection.run(GatewayConnection.java:251) at 
> java.lang.Thread.run(Thread.java:748) Caused by: 
> org.apache.spark.SparkException: Exception thrown in awaitResult: at 
> org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:355) at 
> org.apache.spark.util.ThreadUtils$.parmap(ThreadUtils.scala:422) at 
> org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$.readParquetFootersInParallel(ParquetFileFormat.scala:602)
>  at 
> org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$9.apply(ParquetFileFormat.scala:675)
>  at 
> org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$9.apply(ParquetFileFormat.scala:667)
>  at 
> org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:817)
>  at 
> org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:817)
>  at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:60) 
> at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:340) at 
> org.apache.spark.rdd.RDD.iterator(RDD.scala:304) at 
> org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90) at 
> org.apache.spark.scheduler.Task.doRunTask(Task.scala:139) at 
> org.apache.spark.scheduler.Task.run(Task.scala:112) at 
> org.apache.spark.executor.Executor$TaskRunner$$anonfun$13.apply(Executor.scala:497)
>  at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1432) at 
> org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:503) at 
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
>  at 
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
>  ... 1 more Caused by: java.io.IOException: Could not read footer for file: 
> FileStatus{path=dbfs:/mnt/valassis/data1; isDirectory=false; 
> length=66061642673; replication=0; blocksize=0; modification_time=0; 
> access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false} at 
> org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$readParquetFootersInParallel$1.apply(ParquetFileFormat.scala:615)
>  at 
> org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$readParquetFootersInParallel$1.apply(ParquetFileFormat.scala:602)
>  at 
> org.apache.spark.util.ThreadUtils$$anonfun$5$$anonfun$apply$2$$anonfun$apply$3.apply(ThreadUtils.scala:419)
>  at 
> org.apache.spark.util.threads.SparkThreadLocalCapturingHelper$class.runWithCaptured(SparkThreadLocalForwardingThreadPoolExecutor.scala:52)
>  at 
> org.apache.spark.util.threads.CapturedSparkThreadLocals.runWithCaptured(SparkThreadLocalForwardingThreadPoolExecutor.scala:71)
>  at 
> org.apache.spark.util.ThreadUtils$$anonfun$5$$anonfun$apply$2.apply(ThreadUtils.scala:419)
>  at 
> scala.concurrent.impl.Future$PromiseCompletingRunnable.liftedTree1$1(Future.scala:24)
>  at 
> scala.concurrent.impl.Future$PromiseCompletingRunnable.run(Future.scala:24) 
> at 
> scala.concurrent.impl.ExecutionContextImpl$AdaptedForkJoinTask.exec(ExecutionContextImpl.scala:121)
>  at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260) at 
> scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
>  at scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979) 
> at 
> scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)
>  Caused by: java.lang.RuntimeException: dbfs:/mnt/valassis/data1 is not a 
> Parquet file. expected magic number at tail [80, 65, 82, 49] but found [-1, 
> -1, 0, 0] at 
> org.apache.parquet.hadoop.ParquetFileReader.readFooter(ParquetFileReader.java:525)
>  at 
> org.apache.parquet.hadoop.ParquetFileReader.readFooter(ParquetFileReader.java:506)
>  at 
> org.apache.parquet.hadoop.ParquetFileReader.readFooter(ParquetFileReader.java:500)
>  at 
> org.apache.parquet.hadoop.ParquetFileReader.readFooter(ParquetFileReader.java:477)
>  at 
> org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$readParquetFootersInParallel$1.apply(ParquetFileFormat.scala:608)
>  ... 12 more
> {code}
>  



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to