My problem is quite simple — the JVM is running out of memory during model = dt.fit(train_small). My train_small dataset contains only 100 rows (I have limited the number of rows to make sure the size of the dataset doesn't cause the memory overflow). But each row has a column all_features with a long vector (300+ entries). Could this be the source of the OOM error?
Here is my code: dt = DecisionTreeRegressor(featuresCol="all_features", labelCol="rating", predictionCol="prediction") model = dt.fit(train_small) predictions = model.transform(test_small) evaluator = RegressionEvaluator(labelCol="rating", predictionCol="prediction", metricName="rmse") rmse = evaluator.evaluate(predictions) print("Root Mean Squared Error (RMSE) on test data = %g" % rmse) Stacktrace: Py4JJavaError: An error occurred while calling o5719.fit. : org.apache.spark.SparkException: Job aborted due to stage failure: Task 13 in stage 8821.0 failed 1 times, most recent failure: Lost task 13.0 in stage 8821.0 (TID 83708, localhost, executor driver): java.lang.OutOfMemoryError: Java heap space at java.nio.HeapByteBuffer.<init>(HeapByteBuffer.java:57) at java.nio.ByteBuffer.allocate(ByteBuffer.java:335) at org.apache.spark.sql.execution.columnar.NullableColumnBuilder$class.build(NullableColumnBuilder.scala:74) at org.apache.spark.sql.execution.columnar.ComplexColumnBuilder.build(ColumnBuilder.scala:91) at org.apache.spark.sql.execution.columnar.InMemoryRelation$$anonfun$1$$anon$1$$anonfun$next$2.apply(InMemoryRelation.scala:134) at org.apache.spark.sql.execution.columnar.InMemoryRelation$$anonfun$1$$anon$1$$anonfun$next$2.apply(InMemoryRelation.scala:133) at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234) at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234) at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33) at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186) at scala.collection.TraversableLike$class.map(TraversableLike.scala:234) at scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:186) at org.apache.spark.sql.execution.columnar.InMemoryRelation$$anonfun$1$$anon$1.next(InMemoryRelation.scala:133) at org.apache.spark.sql.execution.columnar.InMemoryRelation$$anonfun$1$$anon$1.next(InMemoryRelation.scala:97) at 
org.apache.spark.storage.memory.PartiallyUnrolledIterator.next(MemoryStore.scala:706) at org.apache.spark.serializer.SerializationStream.writeAll(Serializer.scala:140) at org.apache.spark.serializer.SerializerManager.dataSerializeStream(SerializerManager.scala:170) at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1$$anonfun$apply$5.apply(BlockManager.scala:964) at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1$$anonfun$apply$5.apply(BlockManager.scala:963) at org.apache.spark.storage.DiskStore.put(DiskStore.scala:57) at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:963) at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:947) at org.apache.spark.storage.BlockManager.doPut(BlockManager.scala:887) at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:947) at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:693) at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:334) at org.apache.spark.rdd.RDD.iterator(RDD.scala:285) at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38) at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323) at org.apache.spark.rdd.RDD.iterator(RDD.scala:287) at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38) at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323) Driver stacktrace: at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1456) at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1444) at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1443) at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59) at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48) at 
org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1443) at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:802) at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:802) at scala.Option.foreach(Option.scala:257) at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:802) at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1671) at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1626) at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1615) at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48) at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:628) at org.apache.spark.SparkContext.runJob(SparkContext.scala:2015) at org.apache.spark.SparkContext.runJob(SparkContext.scala:2036) at org.apache.spark.SparkContext.runJob(SparkContext.scala:2055) at org.apache.spark.rdd.RDD$$anonfun$take$1.apply(RDD.scala:1353) at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151) at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112) at org.apache.spark.rdd.RDD.withScope(RDD.scala:362) at org.apache.spark.rdd.RDD.take(RDD.scala:1326) at org.apache.spark.ml.tree.impl.DecisionTreeMetadata$.buildMetadata(DecisionTreeMetadata.scala:112) at org.apache.spark.ml.tree.impl.RandomForest$.run(RandomForest.scala:105) at org.apache.spark.ml.regression.DecisionTreeRegressor.train(DecisionTreeRegressor.scala:111) at org.apache.spark.ml.regression.DecisionTreeRegressor.train(DecisionTreeRegressor.scala:46) at org.apache.spark.ml.Predictor.fit(Predictor.scala:118) at org.apache.spark.ml.Predictor.fit(Predictor.scala:82) at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) at 
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) at java.lang.reflect.Method.invoke(Method.java:498) at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244) at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357) at py4j.Gateway.invoke(Gateway.java:280) at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132) at py4j.commands.CallCommand.execute(CallCommand.java:79) at py4j.GatewayConnection.run(GatewayConnection.java:214) at java.lang.Thread.run(Thread.java:745) Caused by: java.lang.OutOfMemoryError: Java heap space at java.nio.HeapByteBuffer.<init>(HeapByteBuffer.java:57) at java.nio.ByteBuffer.allocate(ByteBuffer.java:335) at org.apache.spark.sql.execution.columnar.NullableColumnBuilder$class.build(NullableColumnBuilder.scala:74) at org.apache.spark.sql.execution.columnar.ComplexColumnBuilder.build(ColumnBuilder.scala:91) at org.apache.spark.sql.execution.columnar.InMemoryRelation$$anonfun$1$$anon$1$$anonfun$next$2.apply(InMemoryRelation.scala:134) at org.apache.spark.sql.execution.columnar.InMemoryRelation$$anonfun$1$$anon$1$$anonfun$next$2.apply(InMemoryRelation.scala:133) at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234) at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234) at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33) at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186) at scala.collection.TraversableLike$class.map(TraversableLike.scala:234) at scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:186) at org.apache.spark.sql.execution.columnar.InMemoryRelation$$anonfun$1$$anon$1.next(InMemoryRelation.scala:133) at org.apache.spark.sql.execution.columnar.InMemoryRelation$$anonfun$1$$anon$1.next(InMemoryRelation.scala:97) at org.apache.spark.storage.memory.PartiallyUnrolledIterator.next(MemoryStore.scala:706) at 
org.apache.spark.serializer.SerializationStream.writeAll(Serializer.scala:140) at org.apache.spark.serializer.SerializerManager.dataSerializeStream(SerializerManager.scala:170) at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1$$anonfun$apply$5.apply(BlockManager.scala:964) at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1$$anonfun$apply$5.apply(BlockManager.scala:963) at org.apache.spark.storage.DiskStore.put(DiskStore.scala:57) at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:963) at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:947) at org.apache.spark.storage.BlockManager.doPut(BlockManager.scala:887) at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:947) at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:693) at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:334) at org.apache.spark.rdd.RDD.iterator(RDD.scala:285) at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38) at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323) at org.apache.spark.rdd.RDD.iterator(RDD.scala:287) at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38) at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323) -- View this message in context: http://apache-spark-user-list.1001560.n3.nabble.com/Pyspark-out-of-memory-exception-during-model-training-tp28395.html Sent from the Apache Spark User List mailing list archive at Nabble.com. --------------------------------------------------------------------- To unsubscribe e-mail: user-unsubscr...@spark.apache.org