- Try doing less in each transformation
- Try using different data structures within the transformations
- Try not caching anything to free up more memory
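For the first and third points, the change can be as simple as splitting one wide chain into smaller steps and writing intermediates to disk instead of caching them. A minimal PySpark sketch, where `df`, `other`, and the column `"k"` are hypothetical stand-ins for your pipeline:

    # Before: one wide chained transformation plus a cache that pins executor memory:
    #   result = df.cache().groupBy("k").count().join(other, "k")

    # After: smaller steps, no cache(), intermediate materialized to disk, not RAM.
    aggregated = df.groupBy("k").count()       # step 1: aggregate only
    joined = aggregated.join(other, "k")       # step 2: join only
    joined.write.parquet("/tmp/intermediate")  # checkpoint to Parquet instead of caching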
On Wed, May 25, 2016 at 1:32 AM, pseudo oduesp <pseudo20...@gmail.com> wrote:
> Hi guys,
>
> - I get the errors below with PySpark 1.5.0 under Cloudera CDH 5.5 (YARN).
> - I use YARN to deploy the job on the cluster.
> - I use HiveContext and save my data as Parquet files.
> - The container limit is 16 GB; the executor memory I tested before is 12 GB.
> - I tried increasing the number of partitions (200 by default): I multiplied it by 2 and by 3 without success.
> - I also tried changing the number of SQL shuffle partitions.
> - I notice in the Spark UI that when the shuffle write is triggered there is no problem, but when the shuffle read is triggered I lose executors and get errors.
>
> I am really blocked by this error and cannot tell where it comes from:
>
> ERROR util.SparkUncaughtExceptionHandler: Uncaught exception in thread Thread[Executor task launch worker-5,5,main]
> java.lang.OutOfMemoryError: Java heap space
>         at parquet.column.values.dictionary.IntList.initSlab(IntList.java:90)
>         at parquet.column.values.dictionary.IntList.<init>(IntList.java:86)
>         at parquet.column.values.dictionary.DictionaryValuesWriter.<init>(DictionaryValuesWriter.java:93)
>         at parquet.column.values.dictionary.DictionaryValuesWriter$PlainBinaryDictionaryValuesWriter.<init>(DictionaryValuesWriter.java:229)
>         at parquet.column.ParquetProperties.dictionaryWriter(ParquetProperties.java:131)
>         at parquet.column.ParquetProperties.dictWriterWithFallBack(ParquetProperties.java:178)
>         at parquet.column.ParquetProperties.getValuesWriter(ParquetProperties.java:203)
>         at parquet.column.impl.ColumnWriterV1.<init>(ColumnWriterV1.java:84)
>         at parquet.column.impl.ColumnWriteStoreV1.newMemColumn(ColumnWriteStoreV1.java:68)
>         at parquet.column.impl.ColumnWriteStoreV1.getColumnWriter(ColumnWriteStoreV1.java:56)
>         at parquet.io.MessageColumnIO$MessageColumnIORecordConsumer.<init>(MessageColumnIO.java:207)
>         at parquet.io.MessageColumnIO.getRecordWriter(MessageColumnIO.java:405)
>         at parquet.hadoop.InternalParquetRecordWriter.initStore(InternalParquetRecordWriter.java:107)
>         at parquet.hadoop.InternalParquetRecordWriter.<init>(InternalParquetRecordWriter.java:97)
>         at parquet.hadoop.ParquetRecordWriter.<init>(ParquetRecordWriter.java:100)
>         at parquet.hadoop.ParquetOutputFormat.getRecordWriter(ParquetOutputFormat.java:326)
>         at parquet.hadoop.ParquetOutputFormat.getRecordWriter(ParquetOutputFormat.java:282)
>         at org.apache.spark.sql.execution.datasources.parquet.ParquetOutputWriter.<init>(ParquetRelation.scala:94)
>         at org.apache.spark.sql.execution.datasources.parquet.ParquetRelation$$anon$3.newInstance(ParquetRelation.scala:272)
>         at org.apache.spark.sql.execution.datasources.DefaultWriterContainer.writeRows(WriterContainer.scala:233)
>         at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelation$$anonfun$run$1$$anonfun$apply$mcV$sp$3.apply(InsertIntoHadoopFsRelation.scala:150)
>         at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelation$$anonfun$run$1$$anonfun$apply$mcV$sp$3.apply(InsertIntoHadoopFsRelation.scala:150)
>         at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
>         at org.apache.spark.scheduler.Task.run(Task.scala:88)
>         at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
>         at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
>         at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
>         at java.lang.Thread.run(Thread.java:745)
> 16/05/25 09:54:42 ERROR util.SparkUncaughtExceptionHandler: Uncaught exception in thread Thread[Executor task launch worker-6,5,main]
> java.lang.OutOfMemoryError: Java heap space
>         [same stack trace as above]
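For reference, the knobs mentioned in the quoted message can be set along these lines. This is only a sketch against the Spark 1.5 PySpark API with illustrative values, not recommendations; note that the trace shows the OOM happening while Parquet builds dictionary pages, and my assumption is that the Parquet-Hadoop option `parquet.enable.dictionary` is picked up from the Hadoop configuration during the write:

    from pyspark import SparkConf, SparkContext
    from pyspark.sql import HiveContext

    conf = (SparkConf()
            .set("spark.executor.memory", "12g")                # executor heap
            .set("spark.yarn.executor.memoryOverhead", "3072")) # off-heap headroom in MB
    sc = SparkContext(conf=conf)
    sqlContext = HiveContext(sc)

    # More shuffle partitions -> smaller partitions -> less heap used per task.
    sqlContext.setConf("spark.sql.shuffle.partitions", "400")  # default is 200

    # Trade larger files for lower writer memory by disabling dictionary encoding
    # (assumption: Spark 1.5's Parquet writer honors this Hadoop-level option).
    sc._jsc.hadoopConfiguration().set("parquet.enable.dictionary", "false")

    df = sqlContext.table("some_table")             # hypothetical input table
    df.repartition(400).write.parquet("/tmp/out")   # write smaller partitions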