Gautham Rajendiran created SPARK-27654:
------------------------------------------
Summary: Spark unable to read Parquet file - corrupt footer
Key: SPARK-27654
URL: https://issues.apache.org/jira/browse/SPARK-27654
Project: Spark
Issue Type: Bug
Components: Spark Core
Affects Versions: 2.4.3
Reporter: Gautham Rajendiran
Reading a large Parquet file fails with a "could not read footer" error; the stack trace below shows the footer's magic number does not match the expected Parquet magic bytes [80, 65, 82, 49] ("PAR1").
{code:java}
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last) <command-2958098653513126> in
<module>() ----> 1 df = spark.read.parquet("/mnt/valassis/data1") 2 df.head(1)
/databricks/spark/python/pyspark/sql/readwriter.py in parquet(self, *paths) 314
[('name', 'string'), ('year', 'int'), ('month', 'int'), ('day', 'int')] 315 """
--> 316 return self._df(self._jreader.parquet(_to_seq(self._spark._sc, paths)))
317 318 @ignore_unicode_prefix
/databricks/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py in
__call__(self, *args) 1255 answer = self.gateway_client.send_command(command)
1256 return_value = get_return_value( -> 1257 answer, self.gateway_client,
self.target_id, self.name) 1258 1259 for temp_arg in temp_args:
/databricks/spark/python/pyspark/sql/utils.py in deco(*a, **kw) 61 def deco(*a,
**kw): 62 try: ---> 63 return f(*a, **kw) 64 except py4j.protocol.Py4JJavaError
as e: 65 s = e.java_exception.toString()
/databricks/spark/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py in
get_return_value(answer, gateway_client, target_id, name) 326 raise
Py4JJavaError( 327 "An error occurred while calling {0}{1}{2}.\n". --> 328
format(target_id, ".", name), value) 329 else: 330 raise Py4JError(
Py4JJavaError: An error occurred while calling o1045.parquet. :
org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in
stage 9.0 failed 4 times, most recent failure: Lost task 0.3 in stage 9.0 (TID
1458, 10.139.64.5, executor 0): org.apache.spark.SparkException: Exception
thrown in awaitResult: at
org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:355) at
org.apache.spark.util.ThreadUtils$.parmap(ThreadUtils.scala:422) at
org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$.readParquetFootersInParallel(ParquetFileFormat.scala:602)
at
org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$9.apply(ParquetFileFormat.scala:675)
at
org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$9.apply(ParquetFileFormat.scala:667)
at
org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:817)
at
org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:817)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:60) at
org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:340) at
org.apache.spark.rdd.RDD.iterator(RDD.scala:304) at
org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90) at
org.apache.spark.scheduler.Task.doRunTask(Task.scala:139) at
org.apache.spark.scheduler.Task.run(Task.scala:112) at
org.apache.spark.executor.Executor$TaskRunner$$anonfun$13.apply(Executor.scala:497)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1432) at
org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:503) at
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748) Caused by: java.io.IOException: Could
not read footer for file: FileStatus{path=dbfs:/mnt/valassis/data1;
isDirectory=false; length=66061642673; replication=0; blocksize=0;
modification_time=0; access_time=0; owner=; group=; permission=rw-rw-rw-;
isSymlink=false} at
org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$readParquetFootersInParallel$1.apply(ParquetFileFormat.scala:615)
at
org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$readParquetFootersInParallel$1.apply(ParquetFileFormat.scala:602)
at
org.apache.spark.util.ThreadUtils$$anonfun$5$$anonfun$apply$2$$anonfun$apply$3.apply(ThreadUtils.scala:419)
at
org.apache.spark.util.threads.SparkThreadLocalCapturingHelper$class.runWithCaptured(SparkThreadLocalForwardingThreadPoolExecutor.scala:52)
at
org.apache.spark.util.threads.CapturedSparkThreadLocals.runWithCaptured(SparkThreadLocalForwardingThreadPoolExecutor.scala:71)
at
org.apache.spark.util.ThreadUtils$$anonfun$5$$anonfun$apply$2.apply(ThreadUtils.scala:419)
at
scala.concurrent.impl.Future$PromiseCompletingRunnable.liftedTree1$1(Future.scala:24)
at scala.concurrent.impl.Future$PromiseCompletingRunnable.run(Future.scala:24)
at
scala.concurrent.impl.ExecutionContextImpl$AdaptedForkJoinTask.exec(ExecutionContextImpl.scala:121)
at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260) at
scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
at scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979) at
scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)
Caused by: java.lang.RuntimeException: dbfs:/mnt/valassis/data1 is not a
Parquet file. expected magic number at tail [80, 65, 82, 49] but found [-1, -1,
0, 0] at
org.apache.parquet.hadoop.ParquetFileReader.readFooter(ParquetFileReader.java:525)
at
org.apache.parquet.hadoop.ParquetFileReader.readFooter(ParquetFileReader.java:506)
at
org.apache.parquet.hadoop.ParquetFileReader.readFooter(ParquetFileReader.java:500)
at
org.apache.parquet.hadoop.ParquetFileReader.readFooter(ParquetFileReader.java:477)
at
org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$readParquetFootersInParallel$1.apply(ParquetFileFormat.scala:608)
... 12 more Driver stacktrace: at
org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:2100)
at
org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:2088)
at
org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:2087)
at
scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48) at
org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2087) at
org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:1076)
at
org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:1076)
at scala.Option.foreach(Option.scala:257) at
org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1076)
at
org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2319)
at
org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2267)
at
org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2255)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49) at
org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:873) at
org.apache.spark.SparkContext.runJob(SparkContext.scala:2252) at
org.apache.spark.SparkContext.runJob(SparkContext.scala:2274) at
org.apache.spark.SparkContext.runJob(SparkContext.scala:2293) at
org.apache.spark.SparkContext.runJob(SparkContext.scala:2318) at
org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:961) at
org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at
org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:379) at
org.apache.spark.rdd.RDD.collect(RDD.scala:960) at
org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$.mergeSchemasInParallel(ParquetFileFormat.scala:697)
at
org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat.inferSchema(ParquetFileFormat.scala:250)
at
org.apache.spark.sql.execution.datasources.DataSource$$anonfun$6.apply(DataSource.scala:205)
at
org.apache.spark.sql.execution.datasources.DataSource$$anonfun$6.apply(DataSource.scala:196)
at scala.Option.orElse(Option.scala:289) at
org.apache.spark.sql.execution.datasources.DataSource.getOrInferFileFormatSchema(DataSource.scala:196)
at
org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:415)
at
org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:298) at
org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:279) at
org.apache.spark.sql.DataFrameReader.parquet(DataFrameReader.scala:733) at
sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at
sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498) at
py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244) at
py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:380) at
py4j.Gateway.invoke(Gateway.java:295) at
py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132) at
py4j.commands.CallCommand.execute(CallCommand.java:79) at
py4j.GatewayConnection.run(GatewayConnection.java:251) at
java.lang.Thread.run(Thread.java:748) Caused by:
org.apache.spark.SparkException: Exception thrown in awaitResult: at
org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:355) at
org.apache.spark.util.ThreadUtils$.parmap(ThreadUtils.scala:422) at
org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$.readParquetFootersInParallel(ParquetFileFormat.scala:602)
at
org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$9.apply(ParquetFileFormat.scala:675)
at
org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$9.apply(ParquetFileFormat.scala:667)
at
org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:817)
at
org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:817)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:60) at
org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:340) at
org.apache.spark.rdd.RDD.iterator(RDD.scala:304) at
org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90) at
org.apache.spark.scheduler.Task.doRunTask(Task.scala:139) at
org.apache.spark.scheduler.Task.run(Task.scala:112) at
org.apache.spark.executor.Executor$TaskRunner$$anonfun$13.apply(Executor.scala:497)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1432) at
org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:503) at
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
... 1 more Caused by: java.io.IOException: Could not read footer for file:
FileStatus{path=dbfs:/mnt/valassis/data1; isDirectory=false;
length=66061642673; replication=0; blocksize=0; modification_time=0;
access_time=0; owner=; group=; permission=rw-rw-rw-; isSymlink=false} at
org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$readParquetFootersInParallel$1.apply(ParquetFileFormat.scala:615)
at
org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$readParquetFootersInParallel$1.apply(ParquetFileFormat.scala:602)
at
org.apache.spark.util.ThreadUtils$$anonfun$5$$anonfun$apply$2$$anonfun$apply$3.apply(ThreadUtils.scala:419)
at
org.apache.spark.util.threads.SparkThreadLocalCapturingHelper$class.runWithCaptured(SparkThreadLocalForwardingThreadPoolExecutor.scala:52)
at
org.apache.spark.util.threads.CapturedSparkThreadLocals.runWithCaptured(SparkThreadLocalForwardingThreadPoolExecutor.scala:71)
at
org.apache.spark.util.ThreadUtils$$anonfun$5$$anonfun$apply$2.apply(ThreadUtils.scala:419)
at
scala.concurrent.impl.Future$PromiseCompletingRunnable.liftedTree1$1(Future.scala:24)
at scala.concurrent.impl.Future$PromiseCompletingRunnable.run(Future.scala:24)
at
scala.concurrent.impl.ExecutionContextImpl$AdaptedForkJoinTask.exec(ExecutionContextImpl.scala:121)
at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260) at
scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
at scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979) at
scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)
Caused by: java.lang.RuntimeException: dbfs:/mnt/valassis/data1 is not a
Parquet file. expected magic number at tail [80, 65, 82, 49] but found [-1, -1,
0, 0] at
org.apache.parquet.hadoop.ParquetFileReader.readFooter(ParquetFileReader.java:525)
at
org.apache.parquet.hadoop.ParquetFileReader.readFooter(ParquetFileReader.java:506)
at
org.apache.parquet.hadoop.ParquetFileReader.readFooter(ParquetFileReader.java:500)
at
org.apache.parquet.hadoop.ParquetFileReader.readFooter(ParquetFileReader.java:477)
at
org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$readParquetFootersInParallel$1.apply(ParquetFileFormat.scala:608)
... 12 more
{code}
--
This message was sent by Atlassian JIRA
(v7.6.3#76005)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]