[
https://issues.apache.org/jira/browse/SPARK-18965?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
Sean Owen resolved SPARK-18965.
-------------------------------
Resolution: Invalid
wholeTextFiles reads whole files into memory. With a large enough file, you run
out of memory. None of that is surprising, or a bug.
> wholeTextFiles() is not able to read large files
> ------------------------------------------------
>
> Key: SPARK-18965
> URL: https://issues.apache.org/jira/browse/SPARK-18965
> Project: Spark
> Issue Type: Bug
> Components: Spark Core
> Affects Versions: 1.6.2
> Environment: All Platforms
> Reporter: Pradeep Misra
> Labels: ReadFile
> Original Estimate: 1,344h
> Remaining Estimate: 1,344h
>
> While reading a file of size 134738099 (gz compressed) with wholeTextFiles(),
> Spark throws an OOM error.
> ERROR Executor: Exception in task 0.0 in stage 0.0 (TID 0)
> java.lang.OutOfMemoryError
> at
> java.io.ByteArrayOutputStream.hugeCapacity(ByteArrayOutputStream.java:123)
> at java.io.ByteArrayOutputStream.grow(ByteArrayOutputStream.java:117)
> at
> java.io.ByteArrayOutputStream.ensureCapacity(ByteArrayOutputStream.java:93)
> at java.io.ByteArrayOutputStream.write(ByteArrayOutputStream.java:153)
> at org.spark-project.guava.io.ByteStreams.copy(ByteStreams.java:211)
> at
> org.spark-project.guava.io.ByteStreams.toByteArray(ByteStreams.java:252)
> at
> org.apache.spark.input.WholeTextFileRecordReader.nextKeyValue(WholeTextFileRecordReader.scala:81)
> at
> org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReader.nextKeyValue(CombineFileRecordReader.java:65)
> at
> org.apache.spark.rdd.NewHadoopRDD$$anon$1.hasNext(NewHadoopRDD.scala:168)
> at
> org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:39)
> at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:327)
> at org.apache.spark.util.Utils$.getIteratorSize(Utils.scala:1631)
> at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1164)
> at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1164)
> at
> org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1882)
> at
> org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1882)
> at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
> at org.apache.spark.scheduler.Task.run(Task.scala:89)
> at
> org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:227)
> at
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
> at
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
> at java.lang.Thread.run(Thread.java:745)
> 16/11/30 14:25:36 ERROR SparkUncaughtExceptionHandler: Uncaught exception in
> thread Thread[Executor task launch worker-0,5,main]
> java.lang.OutOfMemoryError
> at
> java.io.ByteArrayOutputStream.hugeCapacity(ByteArrayOutputStream.java:123)
> at java.io.ByteArrayOutputStream.grow(ByteArrayOutputStream.java:117)
> at
> java.io.ByteArrayOutputStream.ensureCapacity(ByteArrayOutputStream.java:93)
> at java.io.ByteArrayOutputStream.write(ByteArrayOutputStream.java:153)
> at org.spark-project.guava.io.ByteStreams.copy(ByteStreams.java:211)
> at
> org.spark-project.guava.io.ByteStreams.toByteArray(ByteStreams.java:252)
> at
> org.apache.spark.input.WholeTextFileRecordReader.nextKeyValue(WholeTextFileRecordReader.scala:81)
> at
> org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReader.nextKeyValue(CombineFileRecordReader.java:65)
> at
> org.apache.spark.rdd.NewHadoopRDD$$anon$1.hasNext(NewHadoopRDD.scala:168)
> at
> org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:39)
> at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:327)
> at org.apache.spark.util.Utils$.getIteratorSize(Utils.scala:1631)
> at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1164)
> at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1164)
> at
> org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1882)
> at
> org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1882)
> at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
> at org.apache.spark.scheduler.Task.run(Task.scala:89)
> at
> org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:227)
> at
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
> at
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
> at java.lang.Thread.run(Thread.java:745)
> 16/11/30 14:25:36 INFO SparkContext: Invoking stop() from shutdown hook
> 16/11/30 14:25:36 WARN TaskSetManager: Lost task 0.0 in stage 0.0 (TID 0,
> localhost): java.lang.OutOfMemoryError
> at
> java.io.ByteArrayOutputStream.hugeCapacity(ByteArrayOutputStream.java:123)
> at java.io.ByteArrayOutputStream.grow(ByteArrayOutputStream.java:117)
> at
> java.io.ByteArrayOutputStream.ensureCapacity(ByteArrayOutputStream.java:93)
> at java.io.ByteArrayOutputStream.write(ByteArrayOutputStream.java:153)
> at org.spark-project.guava.io.ByteStreams.copy(ByteStreams.java:211)
> at
> org.spark-project.guava.io.ByteStreams.toByteArray(ByteStreams.java:252)
> at
> org.apache.spark.input.WholeTextFileRecordReader.nextKeyValue(WholeTextFileRecordReader.scala:81)
> at
> org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReader.nextKeyValue(CombineFileRecordReader.java:65)
> at
> org.apache.spark.rdd.NewHadoopRDD$$anon$1.hasNext(NewHadoopRDD.scala:168)
> at
> org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:39)
> at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:327)
> at org.apache.spark.util.Utils$.getIteratorSize(Utils.scala:1631)
> at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1164)
> at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1164)
> at
> org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1882)
> at
> org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1882)
> at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
> at org.apache.spark.scheduler.Task.run(Task.scala:89)
> at
> org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:227)
> at
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
> at
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
> at java.lang.Thread.run(Thread.java:745)
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]