[ https://issues.apache.org/jira/browse/SPARK-18965?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15767462#comment-15767462 ]
Pradeep Misra commented on SPARK-18965:
---------------------------------------
The reason I raised this as a bug is that even if more memory is allocated, the
record reader still copies the whole file into a single byte buffer, so the error
persists. If streams were used instead, we could probably get past this error.
(A simplified sketch of the failing path follows.)
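For illustration only, this is roughly what happens today. It is not the actual
Spark source; the traces show Spark's shaded copy of Guava, while plain
com.google.common is used here:

{code}
import java.io.InputStream
import com.google.common.io.ByteStreams

// WholeTextFileRecordReader effectively does this: it drains the whole
// (decompressed) stream into one byte array via ByteStreams.toByteArray.
// A Java array holds at most ~Integer.MAX_VALUE bytes, so no amount of
// executor memory lets a single record grow past ~2 GB.
def readWholeFile(in: InputStream): String = {
  val bytes = ByteStreams.toByteArray(in) // buffers everything in memory
  new String(bytes, "UTF-8")
}
{code}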
XML files with XSD validation are a use case where wholeTextFiles() is critical.
The parsing itself only takes a few seconds, so why restrict the file size to
2 GB? A possible streaming workaround is sketched below.
Thanks
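Until this is addressed, one workaround is to read the files with
sc.binaryFiles(), which hands each file to the task as a PortableDataStream,
and validate it as a stream instead of materializing the whole text. The sketch
below is untested; the input and schema paths are placeholders, it assumes an
existing SparkContext sc (as in spark-shell), and it assumes the .xsd file is
available on every executor (for example shipped with --files):

{code}
import java.util.zip.GZIPInputStream
import javax.xml.XMLConstants
import javax.xml.transform.stream.StreamSource
import javax.xml.validation.SchemaFactory

// Placeholder paths: adjust to the real data and schema locations.
val validated = sc.binaryFiles("data/xml/*.xml.gz").map { case (path, pds) =>
  // Build the validator inside the task: Schema/Validator are not serializable.
  val schema = SchemaFactory
    .newInstance(XMLConstants.W3C_XML_SCHEMA_NS_URI)
    .newSchema(new java.io.File("schema.xsd"))
  // binaryFiles does not decompress, so wrap the raw stream in GZIPInputStream.
  val in = new GZIPInputStream(pds.open()) // stream; never load the whole file
  try {
    schema.newValidator().validate(new StreamSource(in))
    (path, true)
  } catch {
    case _: org.xml.sax.SAXException => (path, false)
  } finally {
    in.close()
  }
}
validated.collect().foreach(println)
{code}

Because the content is only ever consumed as a stream, no single byte array is
built, so the ~2 GB record limit does not apply.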
> wholeTextFiles() is not able to read large files
> ------------------------------------------------
>
> Key: SPARK-18965
> URL: https://issues.apache.org/jira/browse/SPARK-18965
> Project: Spark
> Issue Type: Bug
> Components: Spark Core
> Affects Versions: 1.6.2
> Environment: All Platforms
> Reporter: Pradeep Misra
> Labels: ReadFile
> Original Estimate: 1,344h
> Remaining Estimate: 1,344h
>
> While reading a file of size 134738099 bytes (gz compressed) with
> wholeTextFiles(), Spark throws an OOM error (a minimal reproduction sketch
> follows the stack traces below).
> ERROR Executor: Exception in task 0.0 in stage 0.0 (TID 0)
> java.lang.OutOfMemoryError
>     at java.io.ByteArrayOutputStream.hugeCapacity(ByteArrayOutputStream.java:123)
>     at java.io.ByteArrayOutputStream.grow(ByteArrayOutputStream.java:117)
>     at java.io.ByteArrayOutputStream.ensureCapacity(ByteArrayOutputStream.java:93)
>     at java.io.ByteArrayOutputStream.write(ByteArrayOutputStream.java:153)
>     at org.spark-project.guava.io.ByteStreams.copy(ByteStreams.java:211)
>     at org.spark-project.guava.io.ByteStreams.toByteArray(ByteStreams.java:252)
>     at org.apache.spark.input.WholeTextFileRecordReader.nextKeyValue(WholeTextFileRecordReader.scala:81)
>     at org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReader.nextKeyValue(CombineFileRecordReader.java:65)
>     at org.apache.spark.rdd.NewHadoopRDD$$anon$1.hasNext(NewHadoopRDD.scala:168)
>     at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:39)
>     at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:327)
>     at org.apache.spark.util.Utils$.getIteratorSize(Utils.scala:1631)
>     at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1164)
>     at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1164)
>     at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1882)
>     at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1882)
>     at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
>     at org.apache.spark.scheduler.Task.run(Task.scala:89)
>     at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:227)
>     at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
>     at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
>     at java.lang.Thread.run(Thread.java:745)
> 16/11/30 14:25:36 ERROR SparkUncaughtExceptionHandler: Uncaught exception in thread Thread[Executor task launch worker-0,5,main]
> java.lang.OutOfMemoryError
>     at java.io.ByteArrayOutputStream.hugeCapacity(ByteArrayOutputStream.java:123)
>     at java.io.ByteArrayOutputStream.grow(ByteArrayOutputStream.java:117)
>     at java.io.ByteArrayOutputStream.ensureCapacity(ByteArrayOutputStream.java:93)
>     at java.io.ByteArrayOutputStream.write(ByteArrayOutputStream.java:153)
>     at org.spark-project.guava.io.ByteStreams.copy(ByteStreams.java:211)
>     at org.spark-project.guava.io.ByteStreams.toByteArray(ByteStreams.java:252)
>     at org.apache.spark.input.WholeTextFileRecordReader.nextKeyValue(WholeTextFileRecordReader.scala:81)
>     at org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReader.nextKeyValue(CombineFileRecordReader.java:65)
>     at org.apache.spark.rdd.NewHadoopRDD$$anon$1.hasNext(NewHadoopRDD.scala:168)
>     at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:39)
>     at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:327)
>     at org.apache.spark.util.Utils$.getIteratorSize(Utils.scala:1631)
>     at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1164)
>     at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1164)
>     at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1882)
>     at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1882)
>     at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
>     at org.apache.spark.scheduler.Task.run(Task.scala:89)
>     at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:227)
>     at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
>     at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
>     at java.lang.Thread.run(Thread.java:745)
> 16/11/30 14:25:36 INFO SparkContext: Invoking stop() from shutdown hook
> 16/11/30 14:25:36 WARN TaskSetManager: Lost task 0.0 in stage 0.0 (TID 0, localhost): java.lang.OutOfMemoryError
>     at java.io.ByteArrayOutputStream.hugeCapacity(ByteArrayOutputStream.java:123)
>     at java.io.ByteArrayOutputStream.grow(ByteArrayOutputStream.java:117)
>     at java.io.ByteArrayOutputStream.ensureCapacity(ByteArrayOutputStream.java:93)
>     at java.io.ByteArrayOutputStream.write(ByteArrayOutputStream.java:153)
>     at org.spark-project.guava.io.ByteStreams.copy(ByteStreams.java:211)
>     at org.spark-project.guava.io.ByteStreams.toByteArray(ByteStreams.java:252)
>     at org.apache.spark.input.WholeTextFileRecordReader.nextKeyValue(WholeTextFileRecordReader.scala:81)
>     at org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReader.nextKeyValue(CombineFileRecordReader.java:65)
>     at org.apache.spark.rdd.NewHadoopRDD$$anon$1.hasNext(NewHadoopRDD.scala:168)
>     at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:39)
>     at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:327)
>     at org.apache.spark.util.Utils$.getIteratorSize(Utils.scala:1631)
>     at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1164)
>     at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1164)
>     at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1882)
>     at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1882)
>     at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
>     at org.apache.spark.scheduler.Task.run(Task.scala:89)
>     at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:227)
>     at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
>     at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
>     at java.lang.Thread.run(Thread.java:745)
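For reference, the RDD.count / Utils.getIteratorSize frames in the traces above
correspond to a driver-side call like the following (the path is a placeholder
and sc is an existing SparkContext):

{code}
// Minimal reproduction sketch: any action that walks the records, such as
// count(), drives WholeTextFileRecordReader.nextKeyValue and hits the OOM
// once the decompressed file no longer fits in a single byte array.
val files = sc.wholeTextFiles("data/large-file.xml.gz")
files.count()
{code}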
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)