[ https://issues.apache.org/jira/browse/PARQUET-632?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17039121#comment-17039121 ]
t oo commented on PARQUET-632:
------------------------------
I am facing the same issue writing to an S3A path with Spark standalone
(v2.3.4), but not on EMR:
{noformat}
2020-02-17 21:26:35,367 [task-result-getter-3] WARN org.apache.spark.scheduler.TaskSetManager - Lost task 0.0 in stage 41.0 (TID 47, xxx, executor 1): org.apache.spark.SparkException: Task failed while writing rows.
    at org.apache.spark.sql.execution.datasources.FileFormatWriter$.org$apache$spark$sql$execution$datasources$FileFormatWriter$$executeTask(FileFormatWriter.scala:288)
    at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1.apply(FileFormatWriter.scala:198)
    at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1.apply(FileFormatWriter.scala:197)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
    at org.apache.spark.scheduler.Task.run(Task.scala:109)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
    at java.lang.Thread.run(Thread.java:748)
Caused by: java.io.IOException: No space left on device
    at java.io.FileOutputStream.writeBytes(Native Method)
    at java.io.FileOutputStream.write(FileOutputStream.java:326)
    at java.io.BufferedOutputStream.flushBuffer(BufferedOutputStream.java:82)
    at java.io.BufferedOutputStream.write(BufferedOutputStream.java:126)
    at org.apache.hadoop.fs.s3a.S3AOutputStream.write(S3AOutputStream.java:140)
    at org.apache.hadoop.fs.FSDataOutputStream$PositionCache.write(FSDataOutputStream.java:58)
    at java.io.DataOutputStream.write(DataOutputStream.java:107)
    at java.io.FilterOutputStream.write(FilterOutputStream.java:97)
    at org.apache.parquet.bytes.ConcatenatingByteArrayCollector.writeAllTo(ConcatenatingByteArrayCollector.java:46)
    at org.apache.parquet.hadoop.ParquetFileWriter.writeDataPages(ParquetFileWriter.java:443)
    at org.apache.parquet.hadoop.ColumnChunkPageWriteStore$ColumnChunkPageWriter.writeToFileWriter(ColumnChunkPageWriteStore.java:186)
    at org.apache.parquet.hadoop.ColumnChunkPageWriteStore.flushToFileWriter(ColumnChunkPageWriteStore.java:245)
    at org.apache.parquet.hadoop.InternalParquetRecordWriter.flushRowGroupToStore(InternalParquetRecordWriter.java:168)
    at org.apache.parquet.hadoop.InternalParquetRecordWriter.checkBlockSizeReached(InternalParquetRecordWriter.java:143)
    at org.apache.parquet.hadoop.InternalParquetRecordWriter.write(InternalParquetRecordWriter.java:125)
    at org.apache.parquet.hadoop.ParquetRecordWriter.write(ParquetRecordWriter.java:180)
    at org.apache.parquet.hadoop.ParquetRecordWriter.write(ParquetRecordWriter.java:46)
    at org.apache.spark.sql.execution.datasources.parquet.ParquetOutputWriter.write(ParquetOutputWriter.scala:40)
    at org.apache.spark.sql.execution.datasources.FileFormatWriter$DynamicPartitionWriteTask$$anonfun$execute$5.apply(FileFormatWriter.scala:563)
    at org.apache.spark.sql.execution.datasources.FileFormatWriter$DynamicPartitionWriteTask$$anonfun$execute$5.apply(FileFormatWriter.scala:530)
    at scala.collection.Iterator$class.foreach(Iterator.scala:893)
    at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
    at org.apache.spark.sql.execution.datasources.FileFormatWriter$DynamicPartitionWriteTask.execute(FileFormatWriter.scala:530)
    at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$org$apache$spark$sql$execution$datasources$FileFormatWriter$$executeTask$3.apply(FileFormatWriter.scala:272)
    at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$org$apache$spark$sql$execution$datasources$FileFormatWriter$$executeTask$3.apply(FileFormatWriter.scala:270)
    at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1417)
    at org.apache.spark.sql.execution.datasources.FileFormatWriter$.org$apache$spark$sql$execution$datasources$FileFormatWriter$$executeTask(FileFormatWriter.scala:275)
    ... 8 more
    Suppressed: java.io.IOException: The file being written is in an invalid state. Probably caused by an error thrown previously. Current state: COLUMN
        at org.apache.parquet.hadoop.ParquetFileWriter$STATE.error(ParquetFileWriter.java:182)
        at org.apache.parquet.hadoop.ParquetFileWriter$STATE.startBlock(ParquetFileWriter.java:174)
        at org.apache.parquet.hadoop.ParquetFileWriter.startBlock(ParquetFileWriter.java:284)
        at org.apache.parquet.hadoop.InternalParquetRecordWriter.flushRowGroupToStore(InternalParquetRecordWriter.java:166)
        at org.apache.parquet.hadoop.InternalParquetRecordWriter.close(InternalParquetRecordWriter.java:109)
        at org.apache.parquet.hadoop.ParquetRecordWriter.close(ParquetRecordWriter.java:163)
        at org.apache.spark.sql.execution.datasources.parquet.ParquetOutputWriter.close(ParquetOutputWriter.scala:42)
        at org.apache.spark.sql.execution.datasources.FileFormatWriter$DynamicPartitionWriteTask.releaseResources(FileFormatWriter.scala:577)
        at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$org$apache$spark$sql$execution$datasources$FileFormatWriter$$executeTask$1.apply$mcV$sp(FileFormatWriter.scala:278)
        at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1426)
        ... 9 more
2020-02-17 21:26:35,368 [dispatcher-event-loop-9] INFO org.apache.spark.scheduler.TaskSetManager - Starting task 0.1 in stage 41.0 (TID 48, xxx, executor 1, partition 0, PROCESS_LOCAL, 8657 bytes)
{noformat}
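
For what it's worth, the primary failure in this trace is the ENOSPC from S3AOutputStream: the classic S3A output stream spools the entire object to local disk (fs.s3a.buffer.dir) and only uploads it on close, so large Parquet writes can exhaust the executor's scratch volume. A minimal sketch of settings one might try, assuming a Hadoop build with the S3A block-upload path (property names are from the Hadoop S3A documentation; the app name and buffer path are illustrative):

{code:scala}
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder()
  .appName("s3a-parquet-write") // hypothetical app
  .getOrCreate()

val hadoopConf = spark.sparkContext.hadoopConfiguration
// Stream multipart blocks to S3 as they fill, instead of spooling the whole
// object to local disk first (Hadoop 2.7+; the default behaviour in 3.x).
hadoopConf.set("fs.s3a.fast.upload", "true")
// Hadoop 2.8+: buffer pending blocks on "disk", in heap ("array"), or
// off-heap ("bytebuffer"); "disk" still uses scratch space, but per block
// rather than per whole file.
hadoopConf.set("fs.s3a.fast.upload.buffer", "bytebuffer")
// If disk buffering is kept, point it at a volume with enough free space.
hadoopConf.set("fs.s3a.buffer.dir", "/mnt/big-volume/s3a") // hypothetical path
{code}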
> Parquet file in invalid state while writing to S3 from EMR
> ----------------------------------------------------------
>
> Key: PARQUET-632
> URL: https://issues.apache.org/jira/browse/PARQUET-632
> Project: Parquet
> Issue Type: Bug
> Affects Versions: 1.7.0
> Reporter: Peter Halliday
> Priority: Blocker
>
> I'm writing Parquet to S3 from Spark 1.6.1 on EMR. When it got to the last
> few files to write to S3, I received this stack trace in the log, with no
> other errors before or after it. It's very consistent: this particular batch
> keeps erroring the same way.
> {noformat}
> [2016-06-10 01:46:05,282] WARN org.apache.spark.scheduler.TaskSetManager [task-result-getter-2hread] - Lost task 3737.0 in stage 2.0 (TID 10585, ip-172-16-96-32.ec2.internal): org.apache.spark.SparkException: Task failed while writing rows.
>     at org.apache.spark.sql.execution.datasources.DynamicPartitionWriterContainer.writeRows(WriterContainer.scala:414)
>     at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelation$$anonfun$run$1$$anonfun$apply$mcV$sp$3.apply(InsertIntoHadoopFsRelation.scala:150)
>     at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelation$$anonfun$run$1$$anonfun$apply$mcV$sp$3.apply(InsertIntoHadoopFsRelation.scala:150)
>     at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
>     at org.apache.spark.scheduler.Task.run(Task.scala:89)
>     at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
>     at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
>     at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
>     at java.lang.Thread.run(Thread.java:745)
> Caused by: java.io.IOException: The file being written is in an invalid state. Probably caused by an error thrown previously. Current state: COLUMN
>     at org.apache.parquet.hadoop.ParquetFileWriter$STATE.error(ParquetFileWriter.java:146)
>     at org.apache.parquet.hadoop.ParquetFileWriter$STATE.startBlock(ParquetFileWriter.java:138)
>     at org.apache.parquet.hadoop.ParquetFileWriter.startBlock(ParquetFileWriter.java:195)
>     at org.apache.parquet.hadoop.InternalParquetRecordWriter.flushRowGroupToStore(InternalParquetRecordWriter.java:153)
>     at org.apache.parquet.hadoop.InternalParquetRecordWriter.close(InternalParquetRecordWriter.java:113)
>     at org.apache.parquet.hadoop.ParquetRecordWriter.close(ParquetRecordWriter.java:112)
>     at org.apache.spark.sql.execution.datasources.parquet.ParquetOutputWriter.close(ParquetRelation.scala:101)
>     at org.apache.spark.sql.execution.datasources.DynamicPartitionWriterContainer.writeRows(WriterContainer.scala:405)
>     ... 8 more
> {noformat}
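
A note on the "invalid state ... Current state: COLUMN" message in the original report: ParquetFileWriter tracks its progress with a small state machine, and once a write fails mid-column, the close() path calls startBlock() from the COLUMN state and throws this secondary error. In the 2020 trace above, the real cause (No space left on device) is the primary exception and the state error only appears as suppressed; on the older Spark/Parquet stack, the primary error can be masked entirely. A small sketch of how one might surface a masked root cause when debugging (the DataFrame and output path are hypothetical):

{code:scala}
import org.apache.spark.sql.DataFrame

// Walk both the cause chain and any suppressed exceptions, so a masked
// primary failure (e.g. "No space left on device") is not lost.
def dumpFailure(t: Throwable, indent: String = ""): Unit = {
  println(indent + t)
  t.getSuppressed.foreach(s => dumpFailure(s, indent + "  [suppressed] "))
  Option(t.getCause).foreach(c => dumpFailure(c, indent + "  [caused by] "))
}

def writeWithDiagnostics(df: DataFrame, path: String): Unit =
  try df.write.parquet(path) // hypothetical failing write
  catch { case e: Throwable => dumpFailure(e); throw e }
{code}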